commit 9690367d704a2f1e8f85ffd79f6c2d6a2dd3fd21 Author: JohannesBOT Date: Mon Feb 9 19:46:13 2026 +0100 init diff --git a/main.py b/main.py new file mode 100644 index 0000000..6a2b743 --- /dev/null +++ b/main.py @@ -0,0 +1,36 @@ +import random + +from src.EpubHandler import EpubHandler +from src.Translator import Translator +from src.WebScrapper import WebScrapper + +# Press the green button in the gutter to run the script. +if __name__ == '__main__': + # scrapper = WebScrapper("https://www.fanmtl.com/novel/", r"E:\temp\WN\Game of the World Tree\HTML", "en") + # scrapper.getHtml("game-of-the-world-tree_{}.html", 1, 987) + + # epubHandler = EpubHandler(r"E:\temp\WN\Game of the World Tree\HTML", r"E:\temp\WN\Game of the World Tree\epub") + # epubHandler.convertHtmlToEpub("en", r"C:\Users\JohannesBOZZ\Downloads\01605-game-of-the-world-tree.jpg") + + + + #scrapper = WebScrapper("https://ncode.syosetu.com/n0806fu/", r"E:\temp\WN\Golden Experience\HTML", "jp") + #scrapper.getHtml("{}/", 334, 620) # 612 + + # scrapper = WebScrapper("https://fenrirealm.com/series/dorothys-forbidden-grimoire/", r"E:\temp\WN\Dorothy’s Forbidden Grimoire\HTML", "en") + # scrapper.getHtml("{}", 377, 828, 2) + epubHandler = EpubHandler(r"E:\temp\WN\Dorothy’s Forbidden Grimoire\HTML", r"E:\temp\WN\Dorothy’s Forbidden Grimoire\EPUB") + epubHandler.convertHtmlToEpub("en", r"C:\Users\JohannesBOZZ\Downloads\6e1de333d6af7aaa3fdf3ffa66ac6f55.png") + + # epubHandler = EpubHandler(r"E:\temp\WN\Golden Experience\501-609", r"E:\temp\WN\Golden Experience\epub") + # epubHandler = EpubHandler(r"C:\Users\JohannesBOZZ\Downloads\黄金の経験値 VI 特定災害生物「魔王」暗躍マグナメルム\html", r"C:\Users\JohannesBOZZ\Downloads\黄金の経験値 VI 特定災害生物「魔王」暗躍マグナメルム") + # epubHandler.epub_to_html(r"C:\Users\JohannesBOZZ\Downloads\黄金の経験値 VI 特定災害生物「魔王」暗躍マグナメルム\黄金の経験値 VI 特定災害生物「魔王」暗躍マグナメルム.epub") + + + + # epubHandler = EpubHandler(r"W:\Temp\html", r"W:\Temp\epub") + #epubHandler.convertHtmlToEpub("jp") + + # translator = Translator(r"facebook/nllb-200-3.3B", r"W:\Temp\html", r"W:\Temp\translate test") + # translator.doTranslate() + diff --git a/requierments.txt b/requierments.txt new file mode 100644 index 0000000..f74cd03 --- /dev/null +++ b/requierments.txt @@ -0,0 +1,12 @@ +requests +bs4 +markdownify +aspose-words +ebooklib +natsort +transformers +sentencepiece +torch +tensorflow +flax +protobuf==3.20.* diff --git a/src/EpubHandler.py b/src/EpubHandler.py new file mode 100644 index 0000000..7488afe --- /dev/null +++ b/src/EpubHandler.py @@ -0,0 +1,162 @@ +import json +import os.path +from enum import unique +from os import scandir, listdir + +import natsort +from ebooklib import epub +from bs4 import BeautifulSoup +import aspose.words as aw +from pprint import pprint + +from sympy import false + +from src.functions import makeDir, readFromFile, readFromJsonFile + + +class EpubHandler: + def __init__(self, htmlPath, epubPath): + self.htmlPath = htmlPath + self.epubPath = epubPath + makeDir(self.htmlPath) + makeDir(self.epubPath) + + def convertHtmlToEpub(self, language: str, coverImagePath=None): + jsonPath = os.path.join(self.htmlPath, listdir(self.htmlPath)[0].replace(".html", ".json")) + infosDict = readFromJsonFile(jsonPath) + + book = epub.EpubBook() + book.set_title(infosDict["seriesTitle"]) + book.set_language(infosDict["currentLanguage"]) + if coverImagePath: + book.set_cover("cover.jpg", readFromFile("rb", coverImagePath)) + spine = ["nav"] + + files = natsort.os_sorted([x.path for x in scandir(self.htmlPath) if not x.path.endswith(".json")]) + toc = [] + chapterDict = {} + for htmlFile in files: + if htmlFile.endswith(".json"): + continue + + infosDict = readFromJsonFile(htmlFile.replace(".html", ".json")) + + chapter = epub.EpubHtml(title=infosDict["chapterTitle"], file_name=f"chapter{infosDict['chapter']}.xhtml", + lang=language) + chapter.content = readFromFile("r", htmlFile) + + book.add_item(chapter) + try: + chapterDict[int(infosDict["chapter"])] = chapter + except IndexError: + pprint(infosDict['chapter']) + + # Dictionary nach Keys sortieren und Kapitel zur spine hinzufügen + for key in sorted(chapterDict.keys()): + chapter = chapterDict[key] + toc.append(epub.Link(chapter.file_name, chapter.title, chapter.title)) + spine.append(chapter) + + book.toc = toc + book.spine = spine + book.add_item(epub.EpubNcx()) + book.add_item(epub.EpubNav()) + epub.write_epub(os.path.join(self.epubPath, f"{book.title}.epub"), book) + + + # ====================================================== + # EPUB → HTML + # ====================================================== + def epub_to_html(self, epub_file): + book = epub.read_epub(epub_file) + + images_dir = os.path.join(self.htmlPath, "images") + makeDir(images_dir) + + chapter_index = 1 + + for item_id, _ in book.spine: + item = book.get_item_with_id(item_id) + + # Kapitel + if item.get_type() == epub.ITEM_DOCUMENT: + filename = f"{chapter_index:03d}_{os.path.basename(item.file_name)}" + filepath = os.path.join(self.htmlPath, filename) + + with open(filepath, "wb") as f: + f.write(item.get_content()) + + chapter_index += 1 + + # Bilder + elif item.get_type() == epub.ITEM_IMAGE: + image_path = os.path.join(images_dir, os.path.basename(item.file_name)) + with open(image_path, "wb") as f: + f.write(item.get_content()) + + print(f"✔ EPUB nach HTML exportiert ({chapter_index - 1} Kapitel)") + + # ====================================================== + # HTML → EPUB + # ====================================================== + def html_to_epub(self, output_epub, title="Translated Book", lang="de"): + book = epub.EpubBook() + book.set_title(title) + book.set_language(lang) + + # Kapitel laden (sortiert!) + html_files = sorted( + f for f in os.listdir(self.htmlPath) + if f.endswith(".html") + ) + + spine = ["nav"] + chapters = [] + + for html_file in html_files: + with open(os.path.join(self.htmlPath, html_file), "r", encoding="utf-8") as f: + content = f.read() + + chapter = epub.EpubHtml( + title=html_file, + file_name=html_file, + content=content + ) + book.add_item(chapter) + chapters.append(chapter) + spine.append(chapter) + + # Bilder wieder einbinden + images_dir = os.path.join(self.htmlPath, "images") + if os.path.exists(images_dir): + for img in os.listdir(images_dir): + img_path = os.path.join(images_dir, img) + with open(img_path, "rb") as f: + image = epub.EpubItem( + uid=img, + file_name=f"images/{img}", + media_type=self._guess_mime(img), + content=f.read() + ) + book.add_item(image) + + book.spine = spine + book.add_item(epub.EpubNcx()) + book.add_item(epub.EpubNav()) + + epub.write_epub(output_epub, book) + print("✔ EPUB neu erstellt") + + # ====================================================== + # MIME helper + # ====================================================== + def _guess_mime(self, filename): + ext = filename.lower().split(".")[-1] + return { + "jpg": "image/jpeg", + "jpeg": "image/jpeg", + "png": "image/png", + "gif": "image/gif", + "svg": "image/svg+xml", + "webp": "image/webp" + }.get(ext, "application/octet-stream") diff --git a/src/Translator.py b/src/Translator.py new file mode 100644 index 0000000..ad35652 --- /dev/null +++ b/src/Translator.py @@ -0,0 +1,69 @@ +from pprint import pprint + +from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM +from bs4 import BeautifulSoup +import torch +import os + +class Translator: + def __init__(self, modalPath:str, inputFolder:str, outputFolder:str): + # Ändere das Modell und den Tokenizer auf google/madlad400-3b-mt + self.tokenizer = AutoTokenizer.from_pretrained(modalPath, src_lang="jpn_Jpan") + self.model = AutoModelForSeq2SeqLM.from_pretrained(modalPath, torch_dtype=torch.float16) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + pprint(self.device) + self.model = self.model.to(self.device) + self.inputFolder = inputFolder + self.outputFolder = outputFolder + + os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + + def downloadModal(self, modalName:str, savePath:str): + tokenizer = T5Tokenizer.from_pretrained(modalName) + model = T5ForConditionalGeneration.from_pretrained(modalName) + + # Lokal speichern + tokenizer.save_pretrained(savePath) + model.save_pretrained(savePath) + + print(f"Modell gespeichert unter {savePath}") + + def _is_valid_text(self, text): + return text and len(text.strip()) > 0 # Sicherstellen, dass der Text nicht leer ist und nur Leerzeichen entfernt werden + + def _translate(self, text): + if not self._is_valid_text(text): + print("Ungültiger Text, überspringe Übersetzung.") + return None + + batch = self.tokenizer([text], return_tensors="pt", padding=True, truncation=True).to(self.device) + with torch.no_grad(): + try: + # Verwende generate() von T5ForConditionalGeneration + generated = self.model.generate(**batch, do_sample=True, forced_bos_token_id=self.tokenizer.convert_tokens_to_ids("eng_Latn"), max_new_tokens=150) + except Exception as e: + print(f"Fehler bei der Modellvorhersage: {e}") + return None + translated = self.tokenizer.decode(generated[0], skip_special_tokens=True) + return translated + + def doTranslate(self): + with open(r"W:\Temp\html\第1話 「レア」.html", "r", encoding="utf-8") as f: + soup = BeautifulSoup(f, "html.parser") + + # --- 2. Text finden und übersetzen --- + for elem in soup.find_all(text=True): + if elem.parent.name not in ['script', 'style']: # Kein JS oder CSS übersetzen + original_text = elem.strip() + print(original_text) + if original_text: # Nur wenn etwas da ist + try: + translated_text = self._translate(original_text) + elem.replace_with(translated_text) + print(translated_text) + except Exception as e: + print(f"Fehler beim Übersetzen von: {original_text[:30]}... => {e}") + + # --- 3. Übersetzte Datei speichern --- + with open(r"W:\Temp\translate test\test2.html", "w", encoding="utf-8") as f: + f.write(str(soup)) diff --git a/src/WebScrapper.py b/src/WebScrapper.py new file mode 100644 index 0000000..857831d --- /dev/null +++ b/src/WebScrapper.py @@ -0,0 +1,116 @@ +import os +import re +import time +from urllib.parse import urljoin +from pprint import pprint +import requests +from bs4 import BeautifulSoup, NavigableString +from src.functions import writeToFile, makeDir, writeToJsonFile + + +# https://ncode.syosetu.com/n0806fu + + + +class WebScrapper: + def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str): + self.baseLink = baseLink + self.htmlFolderPath = htmlFolderPath + self.currentLanguage = currentLanguage + makeDir(self.htmlFolderPath) + + def getHtml(self, uriWithFormat:str, fromChapter:int, toChapter:int, sleepTime:float=0): + """ + uriWithFormat: inserts the current chapter number into the {} brackets + """ + for chapterNumber in range(fromChapter, toChapter + 1): + time.sleep(sleepTime) # to avoid getting blocked by the website for sending too many requests in a short time + + infoDict = { + "chapter": chapterNumber, + "originalLanguage": self.currentLanguage, + "currentLanguage": self.currentLanguage, + } + link = urljoin(self.baseLink, uriWithFormat.format(chapterNumber)) + print(link) + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36', + #'Referer': 'https://ncode.syosetu.com/', + #'Accept-Language': 'de,en;q=0.9', + } + + soup = BeautifulSoup(requests.get(link, headers=headers).content, "html.parser") + if not soup: + print(f"skipping Chapter {chapterNumber}") + continue + + chapterContent = self._getChapterContent(soup) + if not chapterContent: + print(f"skipping Chapter {chapterNumber}. No content found") + continue + + self._removeUnwantedThinsFromHtml(chapterContent) + chapterTitle = self._addChapterTitle(soup, chapterContent, chapterNumber, infoDict) + infoDict["chapterTitle"] = chapterTitle + + filePath = os.path.join(self.htmlFolderPath, f"{chapterTitle}.html") + writeToFile("wb", filePath, chapterContent.renderContents("utf-8", True, 4)) + writeToJsonFile(filePath.replace(".html", ".json"), infoDict) + + def _getChapterContent(self, soup:BeautifulSoup): + chapterContent = None + + if "fanmtl.com" in self.baseLink: + chapterContent = soup.find("div", {"class": "chapter-content"}) + elif "syosetu.com" in self.baseLink: + body = soup.select("div.p-novel__body div.js-novel-text.p-novel__text") + for x in body: + if len(x.text) > 1000: + chapterContent = x + elif "fenrirealm.com" in self.baseLink: + chapterContent = soup.select("div.chapter-view > div.content-area")[0] + + return chapterContent + + def _removeUnwantedThinsFromHtml(self, content:BeautifulSoup|NavigableString): + if not content: + return + + # FanMTL advertisements + if "fanmtl.com" in self.baseLink: + for div in content.find_all('div', {'align': 'center'}): + if div.find('script'): + div.decompose() + + #general + for script in content.find_all('script'): + script.decompose() + + + def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, chapterNumber, infoDict:dict): + chapterTitle = f"Chapter {chapterNumber}" + if "fanmtl.com" in self.baseLink: + infoDict["seriesTitle"] = re.sub("\(.*?\)|[<>|\?:\*\"\\/]", "", soup.select("div.titles h1 a")[0].text) + chapterTitle = re.sub("\(.*?\)|[<>|\?:\*\"\\/]", "", soup.select("div.titles h2")[0].text) + infoDict["chapterTitle"] =chapterTitle + infoDict["author"] = "" + elif "syosetu.com" in self.baseLink: + chapterTitle = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True).strip() + # = soup.select("h1.p-novel__title font font") + infoDict["seriesTitle"] = re.sub("\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip() + infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip() + elif "fenrirealm.com" in self.baseLink: + chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True).split(":")[-1].strip() + infoDict["seriesTitle"] = soup.select_one("title").get_text(strip=True).split(" - ")[0].strip() + infoDict["author"] = "unknown" + + titleElement = soup.new_tag("h1") + titleElement.string = chapterTitle + content.insert(0, titleElement) + content.insert(1, soup.new_tag("br")) + content.insert(2, soup.new_tag("br")) + + return chapterTitle + + + diff --git a/src/functions.py b/src/functions.py new file mode 100644 index 0000000..ffefcb8 --- /dev/null +++ b/src/functions.py @@ -0,0 +1,30 @@ +import json +import os + +from mysql.connector.aio.charsets import charsets + + +def writeToFile(mode:str, path:str, content: str | bytes): + encoding = None if "b" in mode else "utf-8" + with open(path, mode, encoding=encoding) as file: + file.write(content) + + +def readFromFile(mode:str, path:str): + encoding = None if "b" in mode else "utf-8" + with open(path, mode, encoding=encoding) as file: + return file.read() + + +def makeDir(directory:str): + if not os.path.exists(directory): + os.makedirs(directory) + +def writeToJsonFile(path:str, content:dict): + with open(path, "w", encoding="utf-8") as file: + json.dump(content, file) + + +def readFromJsonFile(path:str): + with open(path, "r", encoding="utf-8") as file: + return json.load(file) diff --git a/src/mainold.py b/src/mainold.py new file mode 100644 index 0000000..6219b29 --- /dev/null +++ b/src/mainold.py @@ -0,0 +1,69 @@ +from pprint import pprint + +from transformers import MarianMTModel, MarianTokenizer, T5Tokenizer, T5Model +from bs4 import BeautifulSoup +import torch +import os + +class TranslatorOld: + def __init__(self, modalPath:str, inputFolder:str, outputFolder:str): + self.tokenizer = T5Tokenizer.from_pretrained(modalPath) + self.model = T5Model.from_pretrained(modalPath) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + pprint(self.device) + self.model = self.model.to(self.device) + self.inputFolder = inputFolder + self.outputFolder = outputFolder + + os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + + def downloadModal(self, modalName:str, savePath:str): + tokenizer = MarianTokenizer.from_pretrained(modalName) + model = MarianMTModel.from_pretrained(modalName) + + # Lokal speichern + tokenizer.save_pretrained(savePath) + model.save_pretrained(savePath) + + print(f"Modell gespeichert unter {savePath}") + + def _is_valid_text(self, text): + return text and len( + text.strip()) > 0 # Sicherstellen, dass der Text nicht leer ist und nur Leerzeichen entfernt werden + + def _translate(self, text): + if not self._is_valid_text(text): + print("Ungültiger Text, überspringe Übersetzung.") + return None + + batch = self.tokenizer([text], return_tensors="pt", padding=True, truncation=True).to(self.device) + with torch.no_grad(): + try: + generated = self.model.generate(**batch, do_sample=True, top_k=50, top_p=0.95, temperature=0.7) + except Exception as e: + print(f"Fehler bei der Modellvorhersage: {e}") + return None + translated = self.tokenizer.decode(generated[0], skip_special_tokens=True) + return translated + + + def doTranslate(self): + with open(r"W:\Temp\html\第1話 「レア」.html", "r", encoding="utf-8") as f: + soup = BeautifulSoup(f, "html.parser") + + # --- 2. Text finden und übersetzen --- + for elem in soup.find_all(text=True): + if elem.parent.name not in ['script', 'style']: # Kein JS oder CSS übersetzen + original_text = elem.strip() + print(original_text) + if original_text: # Nur wenn etwas da ist + try: + translated_text = self._translate(original_text) + elem.replace_with(translated_text) + print(translated_text) + except Exception as e: + print(f"Fehler beim Übersetzen von: {original_text[:30]}... => {e}") + + # --- 3. Übersetzte Datei speichern --- + with open(r"W:\Temp\translate test\test1.html", "w", encoding="utf-8") as f: + f.write(str(soup)) \ No newline at end of file diff --git a/test.py b/test.py new file mode 100644 index 0000000..39a2e5c --- /dev/null +++ b/test.py @@ -0,0 +1,43 @@ +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, M2M100Model, M2M100Tokenizer, NllbTokenizer, \ + NllbMoeModel, NllbTokenizerFast, T5Tokenizer, T5Model, T5ForConditionalGeneration +import torch +import os + + +os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + +# Modell und Tokenizer für NLLB laden +model_name = r"facebook/nllb-200-3.3B" + +# Tokenizer und Modell laden +tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="jpn_Jpan") +model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16) + +model.save_pretrained(f"E:\\4K Anime\\models\\{model_name}", safe_serialization=False) +tokenizer.save_pretrained(f"E:\\4K Anime\\models\\{model_name}") + +# Gerät wählen +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model = model.to(device) + +# Übersetzungsfunktion +def translate(text): + # Text tokenisieren und an das gleiche Gerät wie das Modell schicken + batch = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device) + + with torch.no_grad(): + # Modellvorhersage erzeugen + try: + generated = model.generate(**batch, do_sample=False, forced_bos_token_id=tokenizer.convert_tokens_to_ids("eng_Latn"), max_new_tokens=150) + except Exception as e: + print(f"Fehler bei der Modellvorhersage: {e}") + return None + + # Übersetzung dekodieren + translated = tokenizer.decode(generated[0], skip_special_tokens=True) + return translated + +# Beispieltext übersetzen +text = f"新暦12年。人類は地球の重力という枷から解き放たれる前に、肉体という枷から逃げ出すほうに注力していた。" +result = translate(text) +print(result)