This commit is contained in:
2026-02-09 19:46:13 +01:00
commit 9690367d70
8 changed files with 537 additions and 0 deletions
+36
View File
@@ -0,0 +1,36 @@
import random
from src.EpubHandler import EpubHandler
from src.Translator import Translator
from src.WebScrapper import WebScrapper
# Script entry point. Exactly one job is active (building the EPUB for
# "Dorothys Forbidden Grimoire"); the commented-out stanzas are earlier jobs
# that are kept as usage examples for the scraper, EPUB handler and translator.
if __name__ == '__main__':
    # scrapper = WebScrapper("https://www.fanmtl.com/novel/", r"E:\temp\WN\Game of the World Tree\HTML", "en")
    # scrapper.getHtml("game-of-the-world-tree_{}.html", 1, 987)
    # epubHandler = EpubHandler(r"E:\temp\WN\Game of the World Tree\HTML", r"E:\temp\WN\Game of the World Tree\epub")
    # epubHandler.convertHtmlToEpub("en", r"C:\Users\JohannesBOZZ\Downloads\01605-game-of-the-world-tree.jpg")

    # scrapper = WebScrapper("https://ncode.syosetu.com/n0806fu/", r"E:\temp\WN\Golden Experience\HTML", "jp")
    # scrapper.getHtml("{}/", 334, 620) # 612

    # scrapper = WebScrapper("https://fenrirealm.com/series/dorothys-forbidden-grimoire/", r"E:\temp\WN\Dorothys Forbidden Grimoire\HTML", "en")
    # scrapper.getHtml("{}", 377, 828, 2)

    # Active job: convert the already scraped chapters into one EPUB with a cover.
    htmlFolder = r"E:\temp\WN\Dorothys Forbidden Grimoire\HTML"
    epubFolder = r"E:\temp\WN\Dorothys Forbidden Grimoire\EPUB"
    coverImage = r"C:\Users\JohannesBOZZ\Downloads\6e1de333d6af7aaa3fdf3ffa66ac6f55.png"
    epubHandler = EpubHandler(htmlFolder, epubFolder)
    epubHandler.convertHtmlToEpub("en", coverImage)

    # epubHandler = EpubHandler(r"E:\temp\WN\Golden Experience\501-609", r"E:\temp\WN\Golden Experience\epub")
    # epubHandler = EpubHandler(r"C:\Users\JohannesBOZZ\Downloads\黄金の経験値 VI 特定災害生物「魔王」暗躍マグナメルム\html", r"C:\Users\JohannesBOZZ\Downloads\黄金の経験値 VI 特定災害生物「魔王」暗躍マグナメルム")
    # epubHandler.epub_to_html(r"C:\Users\JohannesBOZZ\Downloads\黄金の経験値 VI 特定災害生物「魔王」暗躍マグナメルム\黄金の経験値 VI 特定災害生物「魔王」暗躍マグナメルム.epub")
    # epubHandler = EpubHandler(r"W:\Temp\html", r"W:\Temp\epub")
    # epubHandler.convertHtmlToEpub("jp")

    # translator = Translator(r"facebook/nllb-200-3.3B", r"W:\Temp\html", r"W:\Temp\translate test")
    # translator.doTranslate()
+12
View File
@@ -0,0 +1,12 @@
requests
bs4
markdownify
aspose-words
ebooklib
natsort
transformers
sentencepiece
torch
tensorflow
flax
protobuf==3.20.*
+162
View File
@@ -0,0 +1,162 @@
import json
import os.path
from enum import unique
from os import scandir, listdir
import natsort
from ebooklib import epub
from bs4 import BeautifulSoup
import aspose.words as aw
from pprint import pprint
from sympy import false
from src.functions import makeDir, readFromFile, readFromJsonFile
class EpubHandler:
    """Convert between a folder of chapter HTML files and EPUB books.

    ``htmlPath`` is the folder holding the chapter ``.html`` files and
    ``epubPath`` is the folder that receives books written by
    ``convertHtmlToEpub``. Both folders are created on construction if they
    do not exist yet.
    """

    def __init__(self, htmlPath, epubPath):
        self.htmlPath = htmlPath
        self.epubPath = epubPath
        makeDir(self.htmlPath)
        makeDir(self.epubPath)

    @staticmethod
    def _cover_file_name(coverImagePath):
        """Return the in-book cover file name, keeping the image's own extension.

        Falls back to ``cover.jpg`` when the path has no extension.
        """
        extension = os.path.splitext(coverImagePath)[1].lower()
        return f"cover{extension}" if extension else "cover.jpg"

    def convertHtmlToEpub(self, language: str, coverImagePath=None):
        """Build ``<seriesTitle>.epub`` in ``epubPath`` from the scraped chapters.

        Every ``*.html`` file in ``htmlPath`` must have a JSON file with the
        same base name providing ``chapter`` and ``chapterTitle``; the first
        file (natural sort order) additionally provides ``seriesTitle`` and
        ``currentLanguage`` for the book itself.

        Args:
            language: Language code stored on each chapter document.
            coverImagePath: Optional path to a cover image.

        Raises:
            FileNotFoundError: If ``htmlPath`` contains no ``.html`` files.
        """
        # Only real chapter files: sub-folders such as "images" and the JSON
        # sidecars themselves must not be treated as chapters.
        htmlFiles = natsort.os_sorted(
            [entry.path for entry in scandir(self.htmlPath)
             if entry.is_file() and entry.name.endswith(".html")]
        )
        if not htmlFiles:
            raise FileNotFoundError(f"No .html chapter files found in {self.htmlPath}")

        seriesInfo = readFromJsonFile(htmlFiles[0].replace(".html", ".json"))
        book = epub.EpubBook()
        book.set_title(seriesInfo["seriesTitle"])
        book.set_language(seriesInfo["currentLanguage"])
        if coverImagePath:
            # Keep the real extension so a PNG cover is not stored as "cover.jpg".
            book.set_cover(self._cover_file_name(coverImagePath), readFromFile("rb", coverImagePath))

        chapterDict = {}
        for htmlFile in htmlFiles:
            infosDict = readFromJsonFile(htmlFile.replace(".html", ".json"))
            try:
                chapterNumber = int(infosDict["chapter"])
            except (TypeError, ValueError):
                # Report the unusable chapter number and leave the chapter out
                # instead of adding a document that never reaches the spine.
                pprint(infosDict["chapter"])
                continue
            chapter = epub.EpubHtml(
                title=infosDict["chapterTitle"],
                file_name=f"chapter{infosDict['chapter']}.xhtml",
                lang=language,
            )
            chapter.content = readFromFile("r", htmlFile)
            book.add_item(chapter)
            chapterDict[chapterNumber] = chapter

        # Order table of contents and spine by numeric chapter number.
        toc = []
        spine = ["nav"]
        for key in sorted(chapterDict):
            chapter = chapterDict[key]
            toc.append(epub.Link(chapter.file_name, chapter.title, chapter.title))
            spine.append(chapter)

        book.toc = toc
        book.spine = spine
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())
        epub.write_epub(os.path.join(self.epubPath, f"{book.title}.epub"), book)

    # ======================================================
    # EPUB → HTML
    # ======================================================
    def epub_to_html(self, epub_file):
        """Unpack ``epub_file`` into ``htmlPath``.

        Documents are written in spine order as ``NNN_<original name>``;
        images are written to ``htmlPath/images``.
        """
        # The ITEM_* type constants are defined on the ebooklib package itself,
        # not on the ebooklib.epub module.
        import ebooklib

        book = epub.read_epub(epub_file)
        images_dir = os.path.join(self.htmlPath, "images")
        makeDir(images_dir)

        # Chapters: follow the spine so the numbering matches reading order.
        chapter_index = 1
        for item_id, _ in book.spine:
            item = book.get_item_with_id(item_id)
            if item is None or item.get_type() != ebooklib.ITEM_DOCUMENT:
                continue
            filename = f"{chapter_index:03d}_{os.path.basename(item.file_name)}"
            with open(os.path.join(self.htmlPath, filename), "wb") as f:
                f.write(item.get_content())
            chapter_index += 1

        # Images are manifest items and never part of the spine, so they have
        # to be collected from the whole book.
        for item in book.get_items_of_type(ebooklib.ITEM_IMAGE):
            image_path = os.path.join(images_dir, os.path.basename(item.file_name))
            with open(image_path, "wb") as f:
                f.write(item.get_content())

        print(f"✔ EPUB nach HTML exportiert ({chapter_index - 1} Kapitel)")

    # ======================================================
    # HTML → EPUB
    # ======================================================
    def html_to_epub(self, output_epub, title="Translated Book", lang="de"):
        """Pack every ``*.html`` file in ``htmlPath`` (plus ``images/``) into ``output_epub``.

        Chapters are added in plain lexicographic file-name order; each file
        name is used as both chapter title and in-book file name.
        """
        book = epub.EpubBook()
        book.set_title(title)
        book.set_language(lang)

        # Load chapters (sorted!)
        html_files = sorted(f for f in os.listdir(self.htmlPath) if f.endswith(".html"))
        spine = ["nav"]
        for html_file in html_files:
            with open(os.path.join(self.htmlPath, html_file), "r", encoding="utf-8") as f:
                content = f.read()
            chapter = epub.EpubHtml(title=html_file, file_name=html_file, content=content)
            book.add_item(chapter)
            spine.append(chapter)

        # Re-embed the images
        images_dir = os.path.join(self.htmlPath, "images")
        if os.path.exists(images_dir):
            for img in os.listdir(images_dir):
                img_path = os.path.join(images_dir, img)
                if not os.path.isfile(img_path):
                    continue
                with open(img_path, "rb") as f:
                    image = epub.EpubItem(
                        uid=img,
                        file_name=f"images/{img}",
                        media_type=self._guess_mime(img),
                        content=f.read(),
                    )
                book.add_item(image)

        book.spine = spine
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())
        epub.write_epub(output_epub, book)
        print("✔ EPUB neu erstellt")

    # ======================================================
    # MIME helper
    # ======================================================
    def _guess_mime(self, filename):
        """Map an image file name to its MIME type by (case-insensitive) extension."""
        ext = filename.lower().rsplit(".", 1)[-1]
        return {
            "jpg": "image/jpeg",
            "jpeg": "image/jpeg",
            "png": "image/png",
            "gif": "image/gif",
            "svg": "image/svg+xml",
            "webp": "image/webp",
        }.get(ext, "application/octet-stream")
+69
View File
@@ -0,0 +1,69 @@
from pprint import pprint
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
from bs4 import BeautifulSoup
import torch
import os
class Translator:
    """Translate the text nodes of an HTML file with a Hugging Face seq2seq model.

    The tokenizer is loaded with ``src_lang="jpn_Jpan"`` and generation is
    forced to start with the ``eng_Latn`` token, i.e. Japanese to English.
    """

    def __init__(self, modalPath: str, inputFolder: str, outputFolder: str):
        """Load tokenizer and model from ``modalPath`` and move the model to the GPU if available."""
        # Set before anything touches CUDA; setting it after the model has been
        # moved to the device is too late to take effect.
        os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        pprint(self.device)
        # Half precision only on the GPU; on CPU fall back to float32.
        dtype = torch.float16 if self.device.type == "cuda" else torch.float32
        self.tokenizer = AutoTokenizer.from_pretrained(modalPath, src_lang="jpn_Jpan")
        self.model = AutoModelForSeq2SeqLM.from_pretrained(modalPath, torch_dtype=dtype)
        self.model = self.model.to(self.device)
        self.inputFolder = inputFolder
        self.outputFolder = outputFolder

    def downloadModal(self, modalName: str, savePath: str):
        """Download ``modalName`` and store tokenizer and model under ``savePath``."""
        # Same Auto* classes as __init__, so any model this class can load can
        # also be downloaded (the T5-specific classes only fit T5 checkpoints).
        tokenizer = AutoTokenizer.from_pretrained(modalName)
        model = AutoModelForSeq2SeqLM.from_pretrained(modalName)
        # Save locally
        tokenizer.save_pretrained(savePath)
        model.save_pretrained(savePath)
        print(f"Modell gespeichert unter {savePath}")

    def _is_valid_text(self, text):
        """Return True when ``text`` contains at least one non-whitespace character."""
        return bool(text and text.strip())

    def _translate(self, text):
        """Translate one string; return ``None`` if the text is empty or generation fails."""
        if not self._is_valid_text(text):
            print("Ungültiger Text, überspringe Übersetzung.")
            return None
        batch = self.tokenizer([text], return_tensors="pt", padding=True, truncation=True).to(self.device)
        with torch.no_grad():
            try:
                # do_sample=True: output is sampled, so repeated runs may differ.
                generated = self.model.generate(
                    **batch,
                    do_sample=True,
                    forced_bos_token_id=self.tokenizer.convert_tokens_to_ids("eng_Latn"),
                    max_new_tokens=150,
                )
            except Exception as e:
                print(f"Fehler bei der Modellvorhersage: {e}")
                return None
        return self.tokenizer.decode(generated[0], skip_special_tokens=True)

    def doTranslate(self, inputFile=None, outputFile=None):
        """Translate every visible text node of one HTML file and write the result.

        Args:
            inputFile: HTML file to translate. Defaults to the original
                hard-coded test chapter.
            outputFile: Destination file. Defaults to the original hard-coded
                test output.
        """
        # Plain text nodes only; Doctype/Comment/etc. are NavigableString
        # subclasses and must not be rewritten.
        from bs4 import NavigableString

        if inputFile is None:
            inputFile = r"W:\Temp\html\第1話 「レア」.html"
        if outputFile is None:
            outputFile = r"W:\Temp\translate test\test2.html"

        with open(inputFile, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, "html.parser")

        # --- 2. Find and translate text ---
        for elem in soup.find_all(string=True):
            if type(elem) is not NavigableString:
                continue
            if elem.parent.name in ('script', 'style'):  # do not translate JS or CSS
                continue
            original_text = elem.strip()
            if not original_text:  # nothing to translate
                continue
            print(original_text)
            try:
                translated_text = self._translate(original_text)
            except Exception as e:
                print(f"Fehler beim Übersetzen von: {original_text[:30]}... => {e}")
                continue
            if translated_text is None:
                # Translation failed: keep the original text in the document.
                continue
            elem.replace_with(translated_text)
            print(translated_text)

        # --- 3. Save the translated file ---
        with open(outputFile, "w", encoding="utf-8") as f:
            f.write(str(soup))
+116
View File
@@ -0,0 +1,116 @@
import os
import re
import time
from urllib.parse import urljoin
from pprint import pprint
import requests
from bs4 import BeautifulSoup, NavigableString
from src.functions import writeToFile, makeDir, writeToJsonFile
# https://ncode.syosetu.com/n0806fu
class WebScrapper:
    """Download web-novel chapters and store each one as HTML plus JSON metadata.

    The site-specific selectors are chosen by substring match on ``baseLink``;
    handled sites are ``fanmtl.com``, ``syosetu.com`` and ``fenrirealm.com``.
    """

    # Characters that are not allowed in Windows file names.
    _FORBIDDEN_FILENAME_CHARS = re.compile(r'[<>|?:*"\\/]')
    # fanmtl titles: additionally drop any parenthesised part.
    _FANMTL_TITLE_NOISE = re.compile(r'\(.*?\)|[<>|?:*"\\/]')
    _REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
        #'Referer': 'https://ncode.syosetu.com/',
        #'Accept-Language': 'de,en;q=0.9',
    }
    _REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server blocks the run forever

    def __init__(self, baseLink: str, htmlFolderPath: str, currentLanguage: str):
        self.baseLink = baseLink
        self.htmlFolderPath = htmlFolderPath
        self.currentLanguage = currentLanguage
        makeDir(self.htmlFolderPath)

    @classmethod
    def _sanitizeFileName(cls, name: str) -> str:
        """Remove characters that are invalid in file names and trim whitespace."""
        return cls._FORBIDDEN_FILENAME_CHARS.sub("", name).strip()

    def getHtml(self, uriWithFormat: str, fromChapter: int, toChapter: int, sleepTime: float = 0):
        """Download chapters ``fromChapter``..``toChapter`` (inclusive).

        For every chapter a ``<title>.html`` file and a ``<title>.json``
        metadata file are written to ``htmlFolderPath``. Chapters that cannot
        be fetched or contain no recognisable content are reported and skipped.

        Args:
            uriWithFormat: Relative URI; the chapter number is inserted into the ``{}`` placeholder.
            fromChapter: First chapter number.
            toChapter: Last chapter number (inclusive).
            sleepTime: Seconds to wait before each request.
        """
        for chapterNumber in range(fromChapter, toChapter + 1):
            time.sleep(sleepTime)  # to avoid getting blocked by the website for sending too many requests in a short time
            infoDict = {
                "chapter": chapterNumber,
                "originalLanguage": self.currentLanguage,
                "currentLanguage": self.currentLanguage,
            }
            link = urljoin(self.baseLink, uriWithFormat.format(chapterNumber))
            print(link)

            try:
                response = requests.get(link, headers=self._REQUEST_HEADERS, timeout=self._REQUEST_TIMEOUT)
            except requests.RequestException as error:
                print(f"skipping Chapter {chapterNumber}. Request failed: {error}")
                continue
            if not response.ok:
                print(f"skipping Chapter {chapterNumber}. HTTP status {response.status_code}")
                continue

            soup = BeautifulSoup(response.content, "html.parser")
            chapterContent = self._getChapterContent(soup)
            if chapterContent is None:
                print(f"skipping Chapter {chapterNumber}. No content found")
                continue

            self._removeUnwantedThinsFromHtml(chapterContent)
            chapterTitle = self._addChapterTitle(soup, chapterContent, chapterNumber, infoDict)
            infoDict["chapterTitle"] = chapterTitle

            # The title itself is kept as scraped; only the file name is made
            # file-system safe.
            fileName = self._sanitizeFileName(chapterTitle) or f"Chapter {chapterNumber}"
            filePath = os.path.join(self.htmlFolderPath, f"{fileName}.html")
            writeToFile("wb", filePath, chapterContent.renderContents("utf-8", True, 4))
            writeToJsonFile(filePath.replace(".html", ".json"), infoDict)

    def _getChapterContent(self, soup: "BeautifulSoup"):
        """Return the element holding the chapter text, or ``None`` if it is not found."""
        chapterContent = None
        if "fanmtl.com" in self.baseLink:
            chapterContent = soup.find("div", {"class": "chapter-content"})
        elif "syosetu.com" in self.baseLink:
            body = soup.select("div.p-novel__body div.js-novel-text.p-novel__text")
            # Several text blocks can match; the last one longer than 1000
            # characters wins.
            # NOTE(review): chapters shorter than that are skipped entirely — confirm this is intended.
            for x in body:
                if len(x.text) > 1000:
                    chapterContent = x
        elif "fenrirealm.com" in self.baseLink:
            # select_one yields None instead of raising when the layout does not match.
            chapterContent = soup.select_one("div.chapter-view > div.content-area")
        return chapterContent

    def _removeUnwantedThinsFromHtml(self, content: "BeautifulSoup | NavigableString"):
        """Strip advertisement containers and all ``<script>`` tags from ``content`` in place."""
        if content is None:
            return
        # FanMTL advertisements
        if "fanmtl.com" in self.baseLink:
            for div in content.find_all('div', {'align': 'center'}):
                if div.find('script'):
                    div.decompose()
        # general
        for script in content.find_all('script'):
            script.decompose()

    def _addChapterTitle(self, soup: "BeautifulSoup", content: "BeautifulSoup | NavigableString", chapterNumber, infoDict: dict):
        """Fill series metadata into ``infoDict`` and prepend the chapter title to ``content``.

        Sets ``seriesTitle`` and ``author`` in ``infoDict`` for the supported
        sites, inserts ``<h1>title</h1><br><br>`` at the top of ``content`` and
        returns the chapter title (``"Chapter N"`` for unknown sites).
        """
        chapterTitle = f"Chapter {chapterNumber}"
        if "fanmtl.com" in self.baseLink:
            infoDict["seriesTitle"] = self._FANMTL_TITLE_NOISE.sub("", soup.select("div.titles h1 a")[0].text)
            chapterTitle = self._FANMTL_TITLE_NOISE.sub("", soup.select("div.titles h2")[0].text).strip()
            infoDict["author"] = ""
        elif "syosetu.com" in self.baseLink:
            chapterTitle = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True).strip()
            infoDict["seriesTitle"] = re.sub(r"\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip()
            infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
        elif "fenrirealm.com" in self.baseLink:
            chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True).split(":")[-1].strip()
            infoDict["seriesTitle"] = soup.select_one("title").get_text(strip=True).split(" - ")[0].strip()
            infoDict["author"] = "unknown"

        titleElement = soup.new_tag("h1")
        titleElement.string = chapterTitle
        content.insert(0, titleElement)
        content.insert(1, soup.new_tag("br"))
        content.insert(2, soup.new_tag("br"))
        return chapterTitle
+30
View File
@@ -0,0 +1,30 @@
import json
import os
from mysql.connector.aio.charsets import charsets
def writeToFile(mode:str, path:str, content: str | bytes):
encoding = None if "b" in mode else "utf-8"
with open(path, mode, encoding=encoding) as file:
file.write(content)
def readFromFile(mode:str, path:str):
    """Return the whole content of ``path`` opened with ``mode``.

    Text modes are decoded as UTF-8 and return ``str``; any mode containing
    ``"b"`` is opened without an encoding and returns ``bytes``.
    """
    textEncoding = "utf-8" if "b" not in mode else None
    with open(path, mode, encoding=textEncoding) as handle:
        data = handle.read()
    return data
def makeDir(directory:str):
    """Create ``directory`` (including missing parents) if it does not exist yet.

    Uses ``exist_ok=True`` instead of a separate existence check, so there is
    no window in which another process can create the folder between the
    check and the creation.
    """
    os.makedirs(directory, exist_ok=True)
def writeToJsonFile(path:str, content:dict):
    """Serialise ``content`` as JSON into the UTF-8 file at ``path``.

    Non-ASCII characters (e.g. Japanese chapter titles) are written as-is
    rather than as ``\\uXXXX`` escapes; the file is UTF-8, so the result stays
    readable and loads back to the same data.
    """
    with open(path, "w", encoding="utf-8") as file:
        json.dump(content, file, ensure_ascii=False)
def readFromJsonFile(path:str):
    """Parse the UTF-8 encoded JSON file at ``path`` and return the decoded value."""
    with open(path, "r", encoding="utf-8") as handle:
        rawText = handle.read()
    return json.loads(rawText)
+69
View File
@@ -0,0 +1,69 @@
from pprint import pprint
from transformers import MarianMTModel, MarianTokenizer, T5Tokenizer, T5Model
from bs4 import BeautifulSoup
import torch
import os
class TranslatorOld:
    """Earlier, T5-based variant of the HTML translator.

    Loads a T5 tokenizer/model from ``modalPath`` and rewrites the text nodes
    of one HTML file with the model output.
    """

    def __init__(self, modalPath:str, inputFolder:str, outputFolder:str):
        """Load tokenizer and model from ``modalPath`` and move the model to the GPU if available."""
        # The bare T5Model has no language-model head, so generate() cannot
        # produce text with it; the conditional-generation class is required.
        from transformers import T5ForConditionalGeneration

        # Set before anything touches CUDA; afterwards it has no effect.
        os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
        self.tokenizer = T5Tokenizer.from_pretrained(modalPath)
        self.model = T5ForConditionalGeneration.from_pretrained(modalPath)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        pprint(self.device)
        self.model = self.model.to(self.device)
        self.inputFolder = inputFolder
        self.outputFolder = outputFolder

    def downloadModal(self, modalName:str, savePath:str):
        """Download ``modalName`` with the Marian classes and store it under ``savePath``.

        NOTE(review): this downloads with Marian classes while ``__init__``
        loads with T5 classes — confirm which model family is intended.
        """
        tokenizer = MarianTokenizer.from_pretrained(modalName)
        model = MarianMTModel.from_pretrained(modalName)
        # Save locally
        tokenizer.save_pretrained(savePath)
        model.save_pretrained(savePath)
        print(f"Modell gespeichert unter {savePath}")

    def _is_valid_text(self, text):
        """Return True when ``text`` contains at least one non-whitespace character."""
        return bool(text and text.strip())

    def _translate(self, text):
        """Translate one string; return ``None`` if the text is empty or generation fails."""
        if not self._is_valid_text(text):
            print("Ungültiger Text, überspringe Übersetzung.")
            return None
        batch = self.tokenizer([text], return_tensors="pt", padding=True, truncation=True).to(self.device)
        with torch.no_grad():
            try:
                # Sampled decoding: repeated runs may give different output.
                generated = self.model.generate(**batch, do_sample=True, top_k=50, top_p=0.95, temperature=0.7)
            except Exception as e:
                print(f"Fehler bei der Modellvorhersage: {e}")
                return None
        return self.tokenizer.decode(generated[0], skip_special_tokens=True)

    def doTranslate(self, inputFile=None, outputFile=None):
        """Translate every visible text node of one HTML file and write the result.

        Args:
            inputFile: HTML file to translate. Defaults to the original
                hard-coded test chapter.
            outputFile: Destination file. Defaults to the original hard-coded
                test output.
        """
        # Plain text nodes only; Doctype/Comment/etc. are NavigableString
        # subclasses and must not be rewritten.
        from bs4 import NavigableString

        if inputFile is None:
            inputFile = r"W:\Temp\html\第1話 「レア」.html"
        if outputFile is None:
            outputFile = r"W:\Temp\translate test\test1.html"

        with open(inputFile, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, "html.parser")

        # --- 2. Find and translate text ---
        for elem in soup.find_all(string=True):
            if type(elem) is not NavigableString:
                continue
            if elem.parent.name in ('script', 'style'):  # do not translate JS or CSS
                continue
            original_text = elem.strip()
            if not original_text:  # nothing to translate
                continue
            print(original_text)
            try:
                translated_text = self._translate(original_text)
            except Exception as e:
                print(f"Fehler beim Übersetzen von: {original_text[:30]}... => {e}")
                continue
            if translated_text is None:
                # Translation failed: keep the original text in the document.
                continue
            elem.replace_with(translated_text)
            print(translated_text)

        # --- 3. Save the translated file ---
        with open(outputFile, "w", encoding="utf-8") as f:
            f.write(str(soup))
+43
View File
@@ -0,0 +1,43 @@
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, M2M100Model, M2M100Tokenizer, NllbTokenizer, \
NllbMoeModel, NllbTokenizerFast, T5Tokenizer, T5Model, T5ForConditionalGeneration
import torch
import os
# Set before the model is moved to the GPU below (CUDA debugging aid).
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# Model id of the NLLB checkpoint to load
model_name = r"facebook/nllb-200-3.3B"
# Load tokenizer (source language: Japanese) and model (half precision)
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="jpn_Jpan")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16)
# Keep a local copy of model and tokenizer; runs on every execution of this script.
model.save_pretrained(f"E:\\4K Anime\\models\\{model_name}", safe_serialization=False)
tokenizer.save_pretrained(f"E:\\4K Anime\\models\\{model_name}")
# Select device: GPU when available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Translation function
def translate(text):
    """Translate ``text`` to English with the module-level model.

    Returns the decoded first generated sequence, or ``None`` (after printing
    the error) when generation fails.
    """
    # Tokenize and move the tensors to the same device as the model.
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    try:
        with torch.no_grad():
            # Deterministic decoding, forced to start with the English language token.
            output_ids = model.generate(
                **encoded,
                do_sample=False,
                forced_bos_token_id=tokenizer.convert_tokens_to_ids("eng_Latn"),
                max_new_tokens=150,
            )
    except Exception as e:
        print(f"Fehler bei der Modellvorhersage: {e}")
        return None
    # Decode the translation
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
# Translate a sample sentence and print the result (None if generation failed)
text = f"新暦12年。人類は地球の重力という枷から解き放たれる前に、肉体という枷から逃げ出すほうに注力していた。"
result = translate(text)
print(result)