This commit is contained in:
2026-02-09 19:46:13 +01:00
commit 9690367d70
8 changed files with 537 additions and 0 deletions
+162
View File
@@ -0,0 +1,162 @@
import json
import os.path
from enum import unique
from os import scandir, listdir
import natsort
from ebooklib import epub
from bs4 import BeautifulSoup
import aspose.words as aw
from pprint import pprint
from sympy import false
from src.functions import makeDir, readFromFile, readFromJsonFile
class EpubHandler:
    """Convert between a folder of chapter HTML files and EPUB books.

    ``htmlPath`` is the working folder holding the chapter files (and an
    ``images`` sub-folder when a book was unpacked); ``epubPath`` is where
    ``convertHtmlToEpub`` writes its result.
    """

    # Lower-case file extension (without dot) -> MIME type, used by _guess_mime.
    _MIME_TYPES = {
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "png": "image/png",
        "gif": "image/gif",
        "svg": "image/svg+xml",
        "webp": "image/webp"
    }

    def __init__(self, htmlPath, epubPath):
        """Remember both folders and create them if they are missing."""
        self.htmlPath = htmlPath
        self.epubPath = epubPath
        makeDir(self.htmlPath)
        makeDir(self.epubPath)

    def convertHtmlToEpub(self, language: str, coverImagePath=None):
        """Build one EPUB from every ``*.html`` chapter in ``htmlPath``.

        Each chapter file must have a sibling ``*.json`` metadata file with
        the keys ``chapter``, ``chapterTitle``, ``seriesTitle`` and
        ``currentLanguage``. The book title and language come from the
        metadata of the first chapter file; ``language`` is applied to the
        individual chapters. The book is written to
        ``<epubPath>/<seriesTitle>.epub``.

        Args:
            language: Language code set on every chapter document.
            coverImagePath: Optional path of an image used as ``cover.jpg``.

        Raises:
            FileNotFoundError: If ``htmlPath`` contains no ``.html`` file.
        """
        # Only real .html files count as chapters; this keeps out the .json
        # metadata and folders such as "images".
        files = natsort.os_sorted([
            entry.path
            for entry in scandir(self.htmlPath)
            if entry.is_file() and entry.path.endswith(".html")
        ])
        if not files:
            raise FileNotFoundError(f"No .html chapter files found in {self.htmlPath}")

        # The metadata path is derived with the same replace() call that is
        # used for the per-chapter lookup below.
        infosDict = readFromJsonFile(files[0].replace(".html", ".json"))

        book = epub.EpubBook()
        book.set_title(infosDict["seriesTitle"])
        book.set_language(infosDict["currentLanguage"])
        if coverImagePath:
            book.set_cover("cover.jpg", readFromFile("rb", coverImagePath))

        chapterDict = {}
        for htmlFile in files:
            infosDict = readFromJsonFile(htmlFile.replace(".html", ".json"))
            try:
                chapterNumber = int(infosDict["chapter"])
            except (TypeError, ValueError):
                # int() raises ValueError/TypeError (never IndexError) for a
                # malformed chapter number; report the value and skip it.
                pprint(infosDict['chapter'])
                continue
            chapter = epub.EpubHtml(title=infosDict["chapterTitle"],
                                    file_name=f"chapter{infosDict['chapter']}.xhtml",
                                    lang=language)
            chapter.content = readFromFile("r", htmlFile)
            book.add_item(chapter)
            chapterDict[chapterNumber] = chapter

        # Sort the chapters by number, then fill table of contents and spine.
        spine = ["nav"]
        toc = []
        for key in sorted(chapterDict):
            chapter = chapterDict[key]
            toc.append(epub.Link(chapter.file_name, chapter.title, chapter.title))
            spine.append(chapter)

        book.toc = toc
        book.spine = spine
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())
        epub.write_epub(os.path.join(self.epubPath, f"{book.title}.epub"), book)

    # ======================================================
    # EPUB → HTML
    # ======================================================
    def epub_to_html(self, epub_file):
        """Unpack ``epub_file`` into ``htmlPath``.

        Spine documents are written in reading order as
        ``NNN_<original name>``; all images of the book are written to
        ``<htmlPath>/images``.
        """
        # The ITEM_* type constants are defined in the ebooklib package root,
        # not in ebooklib.epub.
        import ebooklib

        book = epub.read_epub(epub_file)
        images_dir = os.path.join(self.htmlPath, "images")
        makeDir(images_dir)

        # Chapters: follow the spine so the numbering matches reading order.
        chapter_count = 0
        for item_id, _ in book.spine:
            item = book.get_item_with_id(item_id)
            if item is None or item.get_type() != ebooklib.ITEM_DOCUMENT:
                continue
            chapter_count += 1
            filename = f"{chapter_count:03d}_{os.path.basename(item.file_name)}"
            with open(os.path.join(self.htmlPath, filename), "wb") as f:
                f.write(item.get_content())

        # Images: they are not spine members, so query them from the manifest.
        for image in book.get_items_of_type(ebooklib.ITEM_IMAGE):
            image_path = os.path.join(images_dir, os.path.basename(image.file_name))
            with open(image_path, "wb") as f:
                f.write(image.get_content())

        print(f"✔ EPUB nach HTML exportiert ({chapter_count} Kapitel)")

    # ======================================================
    # HTML → EPUB
    # ======================================================
    def html_to_epub(self, output_epub, title="Translated Book", lang="de"):
        """Assemble the documents in ``htmlPath`` into a new EPUB.

        Every ``.html`` / ``.xhtml`` file (sorted by name) becomes one
        chapter titled after its file name; files in ``<htmlPath>/images``
        are embedded under ``images/``.

        Args:
            output_epub: Path of the EPUB file to write.
            title: Book title.
            lang: Language code for the book and its chapters.
        """
        book = epub.EpubBook()
        book.set_title(title)
        book.set_language(lang)

        # Load chapters in sorted order. ".xhtml" is accepted as well because
        # that is the extension epub_to_html writes for typical EPUB documents.
        html_files = sorted(
            name for name in os.listdir(self.htmlPath)
            if name.endswith((".html", ".xhtml"))
        )

        chapters = []
        for html_file in html_files:
            with open(os.path.join(self.htmlPath, html_file), "r", encoding="utf-8") as f:
                content = f.read()
            chapter = epub.EpubHtml(
                title=html_file,
                file_name=html_file,
                content=content,
                lang=lang
            )
            book.add_item(chapter)
            chapters.append(chapter)

        # Re-embed the images.
        images_dir = os.path.join(self.htmlPath, "images")
        if os.path.isdir(images_dir):
            for img in os.listdir(images_dir):
                img_path = os.path.join(images_dir, img)
                if not os.path.isfile(img_path):
                    continue
                with open(img_path, "rb") as f:
                    image = epub.EpubItem(
                        uid=img,
                        file_name=f"images/{img}",
                        media_type=self._guess_mime(img),
                        content=f.read()
                    )
                book.add_item(image)

        # Without a TOC the generated navigation document would be empty.
        book.toc = chapters
        book.spine = ["nav", *chapters]
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())
        epub.write_epub(output_epub, book)
        print("✔ EPUB neu erstellt")

    # ======================================================
    # MIME helper
    # ======================================================
    def _guess_mime(self, filename):
        """Return the MIME type for ``filename`` based on its extension.

        The match is case-insensitive; unknown or missing extensions yield
        ``application/octet-stream``.
        """
        ext = filename.lower().split(".")[-1]
        return self._MIME_TYPES.get(ext, "application/octet-stream")
+69
View File
@@ -0,0 +1,69 @@
from pprint import pprint
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
from bs4 import BeautifulSoup
import torch
import os
class Translator:
    """Translate the text nodes of an HTML file with a local seq2seq model.

    NOTE(review): the language codes used here ("jpn_Jpan", "eng_Latn") are
    passed to the tokenizer / ``generate`` as-is; an earlier comment mentioned
    switching to google/madlad400-3b-mt - confirm the codes match the model
    that is actually loaded from ``modalPath``.
    """

    def __init__(self, modalPath:str, inputFolder:str, outputFolder:str):
        """Load tokenizer and model from ``modalPath`` onto the best device.

        ``inputFolder`` and ``outputFolder`` are stored on the instance but
        are not read by any method of this class.
        """
        # Has to be in the environment before the first CUDA call, otherwise
        # it has no effect.
        os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        pprint(self.device)
        # Half precision only on the GPU; on CPU fall back to float32.
        dtype = torch.float16 if self.device.type == "cuda" else torch.float32
        self.tokenizer = AutoTokenizer.from_pretrained(modalPath, src_lang="jpn_Jpan")
        self.model = AutoModelForSeq2SeqLM.from_pretrained(modalPath, torch_dtype=dtype)
        self.model = self.model.to(self.device)
        self.inputFolder = inputFolder
        self.outputFolder = outputFolder

    def downloadModal(self, modalName:str, savePath:str):
        """Fetch tokenizer and model ``modalName`` and save both to ``savePath``."""
        tokenizer = T5Tokenizer.from_pretrained(modalName)
        model = T5ForConditionalGeneration.from_pretrained(modalName)

        # Store locally.
        tokenizer.save_pretrained(savePath)
        model.save_pretrained(savePath)
        print(f"Modell gespeichert unter {savePath}")

    def _is_valid_text(self, text):
        """Return True when ``text`` contains at least one non-whitespace character."""
        return bool(text and text.strip())

    def _translate(self, text):
        """Translate ``text`` and return the result, or None on failure.

        None is returned for empty/whitespace-only input and when the model
        raises during generation (the error is printed).
        """
        if not self._is_valid_text(text):
            print("Ungültiger Text, überspringe Übersetzung.")
            return None

        batch = self.tokenizer([text], return_tensors="pt", padding=True, truncation=True).to(self.device)
        with torch.no_grad():
            try:
                # NOTE(review): do_sample=True makes the output non-deterministic.
                generated = self.model.generate(
                    **batch,
                    do_sample=True,
                    forced_bos_token_id=self.tokenizer.convert_tokens_to_ids("eng_Latn"),
                    max_new_tokens=150,
                )
            except Exception as e:
                print(f"Fehler bei der Modellvorhersage: {e}")
                return None

        return self.tokenizer.decode(generated[0], skip_special_tokens=True)

    def doTranslate(self, inputFile:str=r"W:\Temp\html\第1話 「レア」.html",
                    outputFile:str=r"W:\Temp\translate test\test2.html"):
        """Translate every text node of ``inputFile`` and write ``outputFile``.

        Text inside ``<script>`` and ``<style>`` is left alone. A node whose
        translation fails keeps its original text.

        Args:
            inputFile: HTML file to read (UTF-8).
            outputFile: Path the translated HTML is written to (UTF-8).
        """
        with open(inputFile, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, "html.parser")

        # --- 2. Find the text nodes and translate them ---
        for elem in soup.find_all(string=True):
            if elem.parent.name in ('script', 'style'):  # never translate JS or CSS
                continue
            original_text = elem.strip()
            if not original_text:  # whitespace-only node
                continue
            print(original_text)
            try:
                translated_text = self._translate(original_text)
                # replace_with(None) would detach the node and lose the text,
                # so only replace when a translation was produced.
                if translated_text is not None:
                    elem.replace_with(translated_text)
                    print(translated_text)
            except Exception as e:
                print(f"Fehler beim Übersetzen von: {original_text[:30]}... => {e}")

        # --- 3. Save the translated file ---
        with open(outputFile, "w", encoding="utf-8") as f:
            f.write(str(soup))
+116
View File
@@ -0,0 +1,116 @@
import os
import re
import time
from urllib.parse import urljoin
from pprint import pprint
import requests
from bs4 import BeautifulSoup, NavigableString
from src.functions import writeToFile, makeDir, writeToJsonFile
# https://ncode.syosetu.com/n0806fu
class WebScrapper:
    """Download novel chapters from a supported site as HTML plus JSON metadata.

    The site-specific selectors are chosen by substring match on ``baseLink``
    ("fanmtl.com", "syosetu.com", "fenrirealm.com").
    """

    # Characters that are not allowed in Windows file names.
    _INVALID_FILENAME_CHARS = re.compile(r'[<>|?:*"\\/]')
    # Parenthesised remarks plus the invalid file-name characters.
    _FANMTL_TITLE_CLEANUP = re.compile(r'\(.*?\)|[<>|?:*"\\/]')
    _REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
    }
    # Seconds to wait for the server before requests raises a timeout.
    _REQUEST_TIMEOUT = 30

    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
        """Store the settings and make sure ``htmlFolderPath`` exists."""
        self.baseLink = baseLink
        self.htmlFolderPath = htmlFolderPath
        self.currentLanguage = currentLanguage
        makeDir(self.htmlFolderPath)

    def getHtml(self, uriWithFormat:str, fromChapter:int, toChapter:int, sleepTime:float=0):
        """Download chapters ``fromChapter``..``toChapter`` (inclusive).

        For each chapter ``<title>.html`` (chapter body with the title
        inserted as ``<h1>``) and ``<title>.json`` (metadata) are written to
        ``htmlFolderPath``. Chapters that answer with an HTTP error status or
        contain no recognisable content are skipped with a message.

        Args:
            uriWithFormat: URI joined onto ``baseLink``; the chapter number is
                inserted into its ``{}`` placeholder.
            fromChapter: First chapter number.
            toChapter: Last chapter number.
            sleepTime: Seconds to wait before each request.
        """
        for chapterNumber in range(fromChapter, toChapter + 1):
            time.sleep(sleepTime)  # avoid being blocked for sending too many requests in a short time

            infoDict = {
                "chapter": chapterNumber,
                "originalLanguage": self.currentLanguage,
                "currentLanguage": self.currentLanguage,
            }

            link = urljoin(self.baseLink, uriWithFormat.format(chapterNumber))
            print(link)

            response = requests.get(link, headers=self._REQUEST_HEADERS, timeout=self._REQUEST_TIMEOUT)
            # A BeautifulSoup object is always truthy, so a failed download
            # has to be detected on the response itself.
            if not response.ok:
                print(f"skipping Chapter {chapterNumber}. HTTP status {response.status_code}")
                continue
            soup = BeautifulSoup(response.content, "html.parser")

            chapterContent = self._getChapterContent(soup)
            if not chapterContent:
                print(f"skipping Chapter {chapterNumber}. No content found")
                continue

            self._removeUnwantedThinsFromHtml(chapterContent)
            chapterTitle = self._addChapterTitle(soup, chapterContent, chapterNumber, infoDict)
            infoDict["chapterTitle"] = chapterTitle

            # The title goes into the metadata unchanged, but the file name
            # must not contain characters the file system rejects.
            fileName = self._sanitizeFileName(chapterTitle) or f"Chapter {chapterNumber}"
            filePath = os.path.join(self.htmlFolderPath, f"{fileName}.html")
            writeToFile("wb", filePath, chapterContent.renderContents("utf-8", True, 4))
            writeToJsonFile(filePath.replace(".html", ".json"), infoDict)

    @classmethod
    def _sanitizeFileName(cls, name:str) -> str:
        """Return ``name`` without ``< > | ? : * " \\ /`` and surrounding whitespace."""
        return cls._INVALID_FILENAME_CHARS.sub("", name).strip()

    @staticmethod
    def _textOf(soup, selector:str, default:str="", strip:bool=False) -> str:
        """Return the text of the first match of ``selector`` or ``default`` if none matches."""
        element = soup.select_one(selector)
        if element is None:
            return default
        return element.get_text(strip=strip)

    def _getChapterContent(self, soup:"BeautifulSoup"):
        """Return the element holding the chapter body, or None if not found."""
        chapterContent = None
        if "fanmtl.com" in self.baseLink:
            chapterContent = soup.find("div", {"class": "chapter-content"})
        elif "syosetu.com" in self.baseLink:
            # Several text blocks can match; the last one with more than
            # 1000 characters is taken as the chapter body.
            for candidate in soup.select("div.p-novel__body div.js-novel-text.p-novel__text"):
                if len(candidate.text) > 1000:
                    chapterContent = candidate
        elif "fenrirealm.com" in self.baseLink:
            chapterContent = soup.select_one("div.chapter-view > div.content-area")
        return chapterContent

    def _removeUnwantedThinsFromHtml(self, content:"BeautifulSoup|NavigableString"):
        """Remove advertisement containers and all ``<script>`` tags from ``content`` in place."""
        if not content:
            return

        # FanMTL advertisements
        if "fanmtl.com" in self.baseLink:
            for div in content.find_all('div', {'align': 'center'}):
                if div.find('script'):
                    div.decompose()

        # general
        for script in content.find_all('script'):
            script.decompose()

    def _addChapterTitle(self, soup:"BeautifulSoup", content:"BeautifulSoup|NavigableString", chapterNumber, infoDict:dict):
        """Insert the chapter title at the top of ``content`` and fill ``infoDict``.

        ``infoDict`` receives ``seriesTitle`` and ``author`` (and, for fanmtl,
        ``chapterTitle``). When the title element is missing on the page the
        fallback ``"Chapter <n>"`` is used.

        Returns:
            The chapter title.
        """
        defaultTitle = f"Chapter {chapterNumber}"
        chapterTitle = defaultTitle

        if "fanmtl.com" in self.baseLink:
            infoDict["seriesTitle"] = self._FANMTL_TITLE_CLEANUP.sub("", self._textOf(soup, "div.titles h1 a"))
            chapterTitle = self._FANMTL_TITLE_CLEANUP.sub("", self._textOf(soup, "div.titles h2", defaultTitle))
            infoDict["chapterTitle"] = chapterTitle
            infoDict["author"] = ""
        elif "syosetu.com" in self.baseLink:
            chapterTitle = self._textOf(soup, "h1.p-novel__title.p-novel__title--rensai", defaultTitle, strip=True).strip()
            seriesTitle = self._textOf(soup, ".c-announce > a:first-of-type")
            infoDict["seriesTitle"] = re.sub(r"\[.*?\]", "", seriesTitle).strip()
            infoDict["author"] = self._textOf(soup, ".c-announce > a:nth-of-type(2)").strip()
        elif "fenrirealm.com" in self.baseLink:
            heading = self._textOf(soup, ".chapter-view > div > h2", defaultTitle, strip=True)
            chapterTitle = heading.split(":")[-1].strip()
            pageTitle = self._textOf(soup, "title", strip=True)
            infoDict["seriesTitle"] = pageTitle.split(" - ")[0].strip()
            infoDict["author"] = "unknown"

        # Heading followed by two line breaks at the very top of the chapter.
        titleElement = soup.new_tag("h1")
        titleElement.string = chapterTitle
        content.insert(0, titleElement)
        content.insert(1, soup.new_tag("br"))
        content.insert(2, soup.new_tag("br"))

        return chapterTitle
+30
View File
@@ -0,0 +1,30 @@
import json
import os
from mysql.connector.aio.charsets import charsets
def writeToFile(mode:str, path:str, content: str | bytes):
encoding = None if "b" in mode else "utf-8"
with open(path, mode, encoding=encoding) as file:
file.write(content)
def readFromFile(mode:str, path:str):
    """Return the full content of ``path`` opened with ``mode``.

    Text modes are decoded as UTF-8 and yield ``str``; binary modes (a ``b``
    in ``mode``) yield ``bytes``.
    """
    isBinary = "b" in mode
    textEncoding = "utf-8" if not isBinary else None
    with open(path, mode, encoding=textEncoding) as handle:
        data = handle.read()
    return data
def makeDir(directory:str):
    """Create ``directory`` including missing parents; do nothing if it exists.

    Uses ``exist_ok=True`` instead of a separate existence check, so a
    directory created concurrently between check and creation cannot make
    the call fail.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)
def writeToJsonFile(path:str, content:dict):
    """Serialise ``content`` as JSON into ``path`` (UTF-8).

    Non-ASCII characters are written as-is rather than as ``\\uXXXX``
    escapes: the file is UTF-8 anyway, and titles stay readable. The result
    loads to the same data as before.
    """
    with open(path, "w", encoding="utf-8") as file:
        json.dump(content, file, ensure_ascii=False)
def readFromJsonFile(path:str):
    """Parse the UTF-8 JSON file at ``path`` and return the decoded value."""
    with open(path, "r", encoding="utf-8") as handle:
        rawText = handle.read()
    return json.loads(rawText)
+69
View File
@@ -0,0 +1,69 @@
from pprint import pprint
from transformers import MarianMTModel, MarianTokenizer, T5Tokenizer, T5Model
from bs4 import BeautifulSoup
import torch
import os
class TranslatorOld:
    """Earlier translator variant: translate HTML text nodes with a T5 model.

    NOTE(review): ``downloadModal`` saves a Marian checkpoint while
    ``__init__`` loads a T5 one - confirm which model family ``modalPath``
    is expected to contain.
    """

    def __init__(self, modalPath:str, inputFolder:str, outputFolder:str):
        """Load tokenizer and model from ``modalPath`` onto the best device.

        ``inputFolder`` and ``outputFolder`` are stored on the instance but
        are not read by any method of this class.
        """
        # The bare T5Model has no language-model head and therefore cannot be
        # used with generate(); the conditional-generation class is required.
        from transformers import T5ForConditionalGeneration

        # Has to be in the environment before the first CUDA call, otherwise
        # it has no effect.
        os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
        self.tokenizer = T5Tokenizer.from_pretrained(modalPath)
        self.model = T5ForConditionalGeneration.from_pretrained(modalPath)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        pprint(self.device)
        self.model = self.model.to(self.device)
        self.inputFolder = inputFolder
        self.outputFolder = outputFolder

    def downloadModal(self, modalName:str, savePath:str):
        """Fetch tokenizer and model ``modalName`` and save both to ``savePath``."""
        tokenizer = MarianTokenizer.from_pretrained(modalName)
        model = MarianMTModel.from_pretrained(modalName)

        # Store locally.
        tokenizer.save_pretrained(savePath)
        model.save_pretrained(savePath)
        print(f"Modell gespeichert unter {savePath}")

    def _is_valid_text(self, text):
        """Return True when ``text`` contains at least one non-whitespace character."""
        return bool(text and text.strip())

    def _translate(self, text):
        """Translate ``text`` and return the result, or None on failure.

        None is returned for empty/whitespace-only input and when the model
        raises during generation (the error is printed).
        """
        if not self._is_valid_text(text):
            print("Ungültiger Text, überspringe Übersetzung.")
            return None

        batch = self.tokenizer([text], return_tensors="pt", padding=True, truncation=True).to(self.device)
        with torch.no_grad():
            try:
                # NOTE(review): sampling makes the output non-deterministic.
                generated = self.model.generate(**batch, do_sample=True, top_k=50, top_p=0.95, temperature=0.7)
            except Exception as e:
                print(f"Fehler bei der Modellvorhersage: {e}")
                return None

        return self.tokenizer.decode(generated[0], skip_special_tokens=True)

    def doTranslate(self, inputFile:str=r"W:\Temp\html\第1話 「レア」.html",
                    outputFile:str=r"W:\Temp\translate test\test1.html"):
        """Translate every text node of ``inputFile`` and write ``outputFile``.

        Text inside ``<script>`` and ``<style>`` is left alone. A node whose
        translation fails keeps its original text.

        Args:
            inputFile: HTML file to read (UTF-8).
            outputFile: Path the translated HTML is written to (UTF-8).
        """
        with open(inputFile, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, "html.parser")

        # --- 2. Find the text nodes and translate them ---
        for elem in soup.find_all(string=True):
            if elem.parent.name in ('script', 'style'):  # never translate JS or CSS
                continue
            original_text = elem.strip()
            if not original_text:  # whitespace-only node
                continue
            print(original_text)
            try:
                translated_text = self._translate(original_text)
                # replace_with(None) would detach the node and lose the text,
                # so only replace when a translation was produced.
                if translated_text is not None:
                    elem.replace_with(translated_text)
                    print(translated_text)
            except Exception as e:
                print(f"Fehler beim Übersetzen von: {original_text[:30]}... => {e}")

        # --- 3. Save the translated file ---
        with open(outputFile, "w", encoding="utf-8") as f:
            f.write(str(soup))