init
This commit is contained in:
@@ -0,0 +1,36 @@
|
|||||||
|
import random
|
||||||
|
|
||||||
|
from src.EpubHandler import EpubHandler
|
||||||
|
from src.Translator import Translator
|
||||||
|
from src.WebScrapper import WebScrapper
|
||||||
|
|
||||||
|
# Press the green button in the gutter to run the script.
|
||||||
|
if __name__ == '__main__':
    # ------------------------------------------------------------------
    # Earlier runs, kept as usage examples.
    # ------------------------------------------------------------------
    # Game of the World Tree (fanmtl.com):
    # scrapper = WebScrapper("https://www.fanmtl.com/novel/", r"E:\temp\WN\Game of the World Tree\HTML", "en")
    # scrapper.getHtml("game-of-the-world-tree_{}.html", 1, 987)
    # epubHandler = EpubHandler(r"E:\temp\WN\Game of the World Tree\HTML", r"E:\temp\WN\Game of the World Tree\epub")
    # epubHandler.convertHtmlToEpub("en", r"C:\Users\JohannesBOZZ\Downloads\01605-game-of-the-world-tree.jpg")

    # Golden Experience (syosetu.com):
    #scrapper = WebScrapper("https://ncode.syosetu.com/n0806fu/", r"E:\temp\WN\Golden Experience\HTML", "jp")
    #scrapper.getHtml("{}/", 334, 620) # 612

    # Dorothy's Forbidden Grimoire (fenrirealm.com), scraping step:
    # scrapper = WebScrapper("https://fenrirealm.com/series/dorothys-forbidden-grimoire/", r"E:\temp\WN\Dorothy’s Forbidden Grimoire\HTML", "en")
    # scrapper.getHtml("{}", 377, 828, 2)

    # ------------------------------------------------------------------
    # Current run: build the EPUB from the already scraped chapters.
    # ------------------------------------------------------------------
    htmlFolder = r"E:\temp\WN\Dorothy’s Forbidden Grimoire\HTML"
    epubFolder = r"E:\temp\WN\Dorothy’s Forbidden Grimoire\EPUB"
    coverImage = r"C:\Users\JohannesBOZZ\Downloads\6e1de333d6af7aaa3fdf3ffa66ac6f55.png"

    epubHandler = EpubHandler(htmlFolder, epubFolder)
    epubHandler.convertHtmlToEpub("en", coverImage)

    # ------------------------------------------------------------------
    # Further experiments, kept as usage examples.
    # ------------------------------------------------------------------
    # epubHandler = EpubHandler(r"E:\temp\WN\Golden Experience\501-609", r"E:\temp\WN\Golden Experience\epub")
    # epubHandler = EpubHandler(r"C:\Users\JohannesBOZZ\Downloads\黄金の経験値 VI 特定災害生物「魔王」暗躍マグナメルム\html", r"C:\Users\JohannesBOZZ\Downloads\黄金の経験値 VI 特定災害生物「魔王」暗躍マグナメルム")
    # epubHandler.epub_to_html(r"C:\Users\JohannesBOZZ\Downloads\黄金の経験値 VI 特定災害生物「魔王」暗躍マグナメルム\黄金の経験値 VI 特定災害生物「魔王」暗躍マグナメルム.epub")

    # epubHandler = EpubHandler(r"W:\Temp\html", r"W:\Temp\epub")
    #epubHandler.convertHtmlToEpub("jp")

    # translator = Translator(r"facebook/nllb-200-3.3B", r"W:\Temp\html", r"W:\Temp\translate test")
    # translator.doTranslate()
|
||||||
|
|
||||||
@@ -0,0 +1,12 @@
|
|||||||
|
requests
|
||||||
|
bs4
|
||||||
|
markdownify
|
||||||
|
aspose-words
|
||||||
|
ebooklib
|
||||||
|
natsort
|
||||||
|
transformers
|
||||||
|
sentencepiece
|
||||||
|
torch
|
||||||
|
tensorflow
|
||||||
|
flax
|
||||||
|
protobuf==3.20.*
|
||||||
@@ -0,0 +1,162 @@
|
|||||||
|
import json
|
||||||
|
import os.path
|
||||||
|
from enum import unique
|
||||||
|
from os import scandir, listdir
|
||||||
|
|
||||||
|
import natsort
|
||||||
|
from ebooklib import epub
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import aspose.words as aw
|
||||||
|
from pprint import pprint
|
||||||
|
|
||||||
|
from sympy import false
|
||||||
|
|
||||||
|
from src.functions import makeDir, readFromFile, readFromJsonFile
|
||||||
|
|
||||||
|
|
||||||
|
class EpubHandler:
    """Convert between a folder of chapter HTML files and EPUB books.

    ``htmlPath`` holds the chapter ``.html`` files (plus, for
    :meth:`convertHtmlToEpub`, one ``.json`` metadata file per chapter) and
    ``epubPath`` receives the EPUB written by :meth:`convertHtmlToEpub`.
    """

    def __init__(self, htmlPath, epubPath):
        """Store both folder paths and create the folders if they are missing."""
        self.htmlPath = htmlPath
        self.epubPath = epubPath
        makeDir(self.htmlPath)
        makeDir(self.epubPath)

    def _coverFileName(self, coverImagePath):
        """Return the in-book cover file name, keeping the image's own extension.

        Falls back to ``cover.jpg`` when the path has no extension.
        """
        extension = os.path.splitext(coverImagePath)[1].lower()
        return f"cover{extension or '.jpg'}"

    def convertHtmlToEpub(self, language: str, coverImagePath=None):
        """Bundle every chapter in ``htmlPath`` into ``<seriesTitle>.epub``.

        Each ``<name>.html`` file needs a sibling ``<name>.json`` with the keys
        ``seriesTitle``, ``currentLanguage``, ``chapterTitle`` and ``chapter``.
        Chapters are ordered by their numeric ``chapter`` value.

        Args:
            language: Language code set on every chapter document.
            coverImagePath: Optional path of an image used as the book cover.

        Raises:
            FileNotFoundError: If ``htmlPath`` contains no ``.html`` file.
        """
        # Only real .html files count; sub-folders (e.g. "images") and the
        # .json metadata files are not chapters.
        htmlFiles = natsort.os_sorted([
            entry.path for entry in scandir(self.htmlPath)
            if entry.is_file() and entry.path.endswith(".html")
        ])
        if not htmlFiles:
            raise FileNotFoundError(f"No .html chapter files found in {self.htmlPath}")

        # The series-level metadata is read from the first chapter's JSON file.
        seriesInfo = readFromJsonFile(htmlFiles[0].replace(".html", ".json"))

        book = epub.EpubBook()
        book.set_title(seriesInfo["seriesTitle"])
        book.set_language(seriesInfo["currentLanguage"])
        if coverImagePath:
            # Keep the real extension so a PNG cover is not stored as "cover.jpg".
            book.set_cover(self._coverFileName(coverImagePath), readFromFile("rb", coverImagePath))

        chapterDict = {}
        for htmlFile in htmlFiles:
            chapterInfo = readFromJsonFile(htmlFile.replace(".html", ".json"))

            # A chapter without a numeric chapter number cannot be ordered:
            # report it and leave it out of the book.
            try:
                chapterNumber = int(chapterInfo["chapter"])
            except (TypeError, ValueError):
                pprint(chapterInfo["chapter"])
                continue

            chapter = epub.EpubHtml(
                title=chapterInfo["chapterTitle"],
                file_name=f"chapter{chapterInfo['chapter']}.xhtml",
                lang=language,
            )
            chapter.content = readFromFile("r", htmlFile)
            book.add_item(chapter)
            chapterDict[chapterNumber] = chapter

        # Sort by chapter number and add the chapters to the TOC and the spine.
        toc = []
        spine = ["nav"]
        for key in sorted(chapterDict):
            chapter = chapterDict[key]
            toc.append(epub.Link(chapter.file_name, chapter.title, chapter.title))
            spine.append(chapter)

        book.toc = toc
        book.spine = spine
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())
        epub.write_epub(os.path.join(self.epubPath, f"{book.title}.epub"), book)

    # ======================================================
    # EPUB → HTML
    # ======================================================
    def epub_to_html(self, epub_file):
        """Unpack ``epub_file`` into ``htmlPath``.

        Spine documents are written as ``NNN_<original name>`` in reading
        order; images are written to ``htmlPath/images``.
        """
        # The item-type constants are defined on the ebooklib package itself,
        # not on the ebooklib.epub module.
        import ebooklib

        book = epub.read_epub(epub_file)

        images_dir = os.path.join(self.htmlPath, "images")
        makeDir(images_dir)

        chapter_index = 1
        for item_id, _ in book.spine:
            item = book.get_item_with_id(item_id)
            # Skip spine entries that do not resolve to a document.
            if item is None or item.get_type() != ebooklib.ITEM_DOCUMENT:
                continue

            filename = f"{chapter_index:03d}_{os.path.basename(item.file_name)}"
            with open(os.path.join(self.htmlPath, filename), "wb") as f:
                f.write(item.get_content())
            chapter_index += 1

        # Images are not part of the spine, so collect them from all items.
        for item in book.get_items():
            if item.get_type() in (ebooklib.ITEM_IMAGE, ebooklib.ITEM_COVER):
                image_path = os.path.join(images_dir, os.path.basename(item.file_name))
                with open(image_path, "wb") as f:
                    f.write(item.get_content())

        print(f"✔ EPUB nach HTML exportiert ({chapter_index - 1} Kapitel)")

    # ======================================================
    # HTML → EPUB
    # ======================================================
    def html_to_epub(self, output_epub, title="Translated Book", lang="de"):
        """Build ``output_epub`` from the ``.html`` files in ``htmlPath``.

        Files are added in sorted file-name order; every file in
        ``htmlPath/images`` is embedded under ``images/``.

        Args:
            output_epub: Path of the EPUB file to write.
            title: Book title.
            lang: Book language code.
        """
        book = epub.EpubBook()
        book.set_title(title)
        book.set_language(lang)

        # Load the chapters (sorted!)
        html_files = sorted(
            f for f in os.listdir(self.htmlPath)
            if f.endswith(".html")
        )

        spine = ["nav"]
        chapters = []
        for html_file in html_files:
            with open(os.path.join(self.htmlPath, html_file), "r", encoding="utf-8") as f:
                content = f.read()

            chapter = epub.EpubHtml(
                title=html_file,
                file_name=html_file,
                content=content
            )
            book.add_item(chapter)
            chapters.append(chapter)
            spine.append(chapter)

        # Re-embed the images
        images_dir = os.path.join(self.htmlPath, "images")
        if os.path.exists(images_dir):
            for img in os.listdir(images_dir):
                img_path = os.path.join(images_dir, img)
                with open(img_path, "rb") as f:
                    image = epub.EpubItem(
                        uid=img,
                        file_name=f"images/{img}",
                        media_type=self._guess_mime(img),
                        content=f.read()
                    )
                book.add_item(image)

        # Use the collected chapters as table of contents so the navigation
        # documents are not empty.
        book.toc = chapters
        book.spine = spine
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())

        epub.write_epub(output_epub, book)
        print("✔ EPUB neu erstellt")

    # ======================================================
    # MIME helper
    # ======================================================
    def _guess_mime(self, filename):
        """Return the image MIME type for ``filename`` based on its extension.

        Unknown extensions map to ``application/octet-stream``.
        """
        ext = filename.lower().split(".")[-1]
        return {
            "jpg": "image/jpeg",
            "jpeg": "image/jpeg",
            "png": "image/png",
            "gif": "image/gif",
            "svg": "image/svg+xml",
            "webp": "image/webp"
        }.get(ext, "application/octet-stream")
|
||||||
@@ -0,0 +1,69 @@
|
|||||||
|
from pprint import pprint
|
||||||
|
|
||||||
|
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import torch
|
||||||
|
import os
|
||||||
|
|
||||||
|
class Translator:
    """Translate the text nodes of an HTML file with a Hugging Face seq2seq model.

    The tokenizer is created with ``src_lang="jpn_Jpan"`` and generation is
    forced to start with the ``eng_Latn`` token.
    """

    def __init__(self, modalPath:str, inputFolder:str, outputFolder:str):
        """Load tokenizer and model from ``modalPath`` and move the model to the GPU if available.

        Args:
            modalPath: Model name or local path passed to ``from_pretrained``.
            inputFolder: Stored on the instance; not read by the methods of this class.
            outputFolder: Stored on the instance; not read by the methods of this class.
        """
        # Change the model and the tokenizer to google/madlad400-3b-mt
        self.tokenizer = AutoTokenizer.from_pretrained(modalPath, src_lang="jpn_Jpan")
        self.model = AutoModelForSeq2SeqLM.from_pretrained(modalPath, torch_dtype=torch.float16)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        pprint(self.device)
        self.model = self.model.to(self.device)
        self.inputFolder = inputFolder
        self.outputFolder = outputFolder

        # NOTE(review): this is set after the model was moved to the device;
        # it may need to be set before CUDA is initialised to have an effect — confirm.
        os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

    def downloadModal(self, modalName:str, savePath:str):
        """Download the T5 tokenizer and model ``modalName`` and save both to ``savePath``."""
        tokenizer = T5Tokenizer.from_pretrained(modalName)
        model = T5ForConditionalGeneration.from_pretrained(modalName)

        # Save locally
        tokenizer.save_pretrained(savePath)
        model.save_pretrained(savePath)

        print(f"Modell gespeichert unter {savePath}")

    def _is_valid_text(self, text):
        """Return True if ``text`` contains anything besides whitespace."""
        return bool(text and text.strip())

    def _translate(self, text):
        """Translate ``text`` and return the result.

        Returns:
            The translated string, or ``None`` when ``text`` is empty or
            whitespace-only or when generation fails.
        """
        if not self._is_valid_text(text):
            print("Ungültiger Text, überspringe Übersetzung.")
            return None

        batch = self.tokenizer([text], return_tensors="pt", padding=True, truncation=True).to(self.device)
        with torch.no_grad():
            try:
                # Use generate() of the loaded seq2seq model.
                # do_sample=True means the output can differ between runs.
                generated = self.model.generate(**batch, do_sample=True, forced_bos_token_id=self.tokenizer.convert_tokens_to_ids("eng_Latn"), max_new_tokens=150)
            except Exception as e:
                print(f"Fehler bei der Modellvorhersage: {e}")
                return None
        translated = self.tokenizer.decode(generated[0], skip_special_tokens=True)
        return translated

    def doTranslate(self, inputFile: str = r"W:\Temp\html\第1話 「レア」.html",
                    outputFile: str = r"W:\Temp\translate test\test2.html"):
        """Translate every text node of ``inputFile`` and write the result to ``outputFile``.

        Text inside ``<script>`` and ``<style>`` is left untouched. A node whose
        translation fails keeps its original text.

        Args:
            inputFile: HTML file to read; defaults to the original test file.
            outputFile: HTML file to write; defaults to the original test output.
        """
        with open(inputFile, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, "html.parser")

        # --- 2. Find the text and translate it ---
        for elem in soup.find_all(text=True):
            if elem.parent.name in ['script', 'style']:  # do not translate JS or CSS
                continue

            original_text = elem.strip()
            print(original_text)
            if not original_text:  # only if there is something to translate
                continue

            try:
                translated_text = self._translate(original_text)
                if translated_text is None:
                    # Replacing a node with None would drop the original text
                    # from the document, so a failed translation is skipped.
                    continue
                elem.replace_with(translated_text)
                print(translated_text)
            except Exception as e:
                print(f"Fehler beim Übersetzen von: {original_text[:30]}... => {e}")

        # --- 3. Save the translated file ---
        with open(outputFile, "w", encoding="utf-8") as f:
            f.write(str(soup))
|
||||||
@@ -0,0 +1,116 @@
|
|||||||
|
import os
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
from pprint import pprint
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup, NavigableString
|
||||||
|
from src.functions import writeToFile, makeDir, writeToJsonFile
|
||||||
|
|
||||||
|
|
||||||
|
# https://ncode.syosetu.com/n0806fu
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class WebScrapper:
    """Download web-novel chapters and store each one as HTML plus JSON metadata.

    The site-specific selectors are chosen by looking for ``fanmtl.com``,
    ``syosetu.com`` or ``fenrirealm.com`` in ``baseLink``.
    """

    # Characters that must not appear in a file name (Windows rules).
    _FORBIDDEN_FILENAME_CHARS = re.compile(r'[<>|?:*"\\/]')
    # Same characters plus parenthesised remarks, stripped from fanmtl titles.
    _FANMTL_TITLE_NOISE = re.compile(r'\(.*?\)|[<>|?:*"\\/]')
    # Browser-like headers sent with every request.
    _REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
        #'Referer': 'https://ncode.syosetu.com/',
        #'Accept-Language': 'de,en;q=0.9',
    }
    # Seconds to wait for a response before the chapter is skipped.
    _REQUEST_TIMEOUT = 30

    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
        """Store the settings and create ``htmlFolderPath`` if it is missing."""
        self.baseLink = baseLink
        self.htmlFolderPath = htmlFolderPath
        self.currentLanguage = currentLanguage
        makeDir(self.htmlFolderPath)

    @staticmethod
    def _sanitizeFileName(name: str) -> str:
        """Return ``name`` without forbidden file-name characters and outer whitespace."""
        return WebScrapper._FORBIDDEN_FILENAME_CHARS.sub("", name).strip()

    def getHtml(self, uriWithFormat:str, fromChapter:int, toChapter:int, sleepTime:float=0):
        """Download the chapters ``fromChapter``..``toChapter`` (inclusive).

        For every chapter a ``<title>.html`` file with the cleaned chapter
        content and a ``<title>.json`` file with its metadata are written to
        ``htmlFolderPath``. Chapters that cannot be downloaded or contain no
        recognisable content are reported and skipped.

        Args:
            uriWithFormat: inserts the current chapter number into the {} brackets
            fromChapter: First chapter number.
            toChapter: Last chapter number.
            sleepTime: Seconds to wait before each request.
        """
        for chapterNumber in range(fromChapter, toChapter + 1):
            time.sleep(sleepTime)  # to avoid getting blocked by the website for sending too many requests in a short time

            infoDict = {
                "chapter": chapterNumber,
                "originalLanguage": self.currentLanguage,
                "currentLanguage": self.currentLanguage,
            }
            link = urljoin(self.baseLink, uriWithFormat.format(chapterNumber))
            print(link)

            # A parsed document is always truthy, so failures have to be
            # detected on the HTTP response instead of on the soup.
            try:
                response = requests.get(link, headers=self._REQUEST_HEADERS, timeout=self._REQUEST_TIMEOUT)
                response.raise_for_status()
            except requests.RequestException as error:
                print(f"skipping Chapter {chapterNumber}: {error}")
                continue

            soup = BeautifulSoup(response.content, "html.parser")

            chapterContent = self._getChapterContent(soup)
            if chapterContent is None:
                print(f"skipping Chapter {chapterNumber}. No content found")
                continue

            self._removeUnwantedThinsFromHtml(chapterContent)
            chapterTitle = self._addChapterTitle(soup, chapterContent, chapterNumber, infoDict)
            infoDict["chapterTitle"] = chapterTitle

            # The metadata keeps the real title; only the file name is sanitised.
            fileName = self._sanitizeFileName(chapterTitle) or f"Chapter {chapterNumber}"
            filePath = os.path.join(self.htmlFolderPath, f"{fileName}.html")
            writeToFile("wb", filePath, chapterContent.renderContents("utf-8", True, 4))
            writeToJsonFile(filePath.replace(".html", ".json"), infoDict)

    def _getChapterContent(self, soup: "BeautifulSoup"):
        """Return the element holding the chapter text, or ``None`` if it is not found."""
        if "fanmtl.com" in self.baseLink:
            return soup.find("div", {"class": "chapter-content"})

        if "syosetu.com" in self.baseLink:
            chapterContent = None
            # Keeps the last matching block with more than 1000 characters.
            # NOTE(review): presumably this filters out short preface/afterword
            # blocks; shorter chapters are skipped as "no content" — confirm.
            for candidate in soup.select("div.p-novel__body div.js-novel-text.p-novel__text"):
                if len(candidate.text) > 1000:
                    chapterContent = candidate
            return chapterContent

        if "fenrirealm.com" in self.baseLink:
            # select_one yields None instead of raising when nothing matches.
            return soup.select_one("div.chapter-view > div.content-area")

        return None

    def _removeUnwantedThinsFromHtml(self, content: "BeautifulSoup | NavigableString"):
        """Remove advertisement containers and all ``<script>`` tags from ``content`` in place."""
        if not content:
            return

        # FanMTL advertisements
        if "fanmtl.com" in self.baseLink:
            for div in content.find_all('div', {'align': 'center'}):
                if div.find('script'):
                    div.decompose()

        # general
        for script in content.find_all('script'):
            script.decompose()

    def _addChapterTitle(self, soup: "BeautifulSoup", content: "BeautifulSoup | NavigableString", chapterNumber, infoDict:dict):
        """Insert the chapter title as ``<h1>`` at the top of ``content``.

        Also fills ``seriesTitle`` and ``author`` in ``infoDict`` for the
        supported sites.

        Returns:
            The chapter title; ``"Chapter <n>"`` when the site is not recognised.
        """
        chapterTitle = f"Chapter {chapterNumber}"
        if "fanmtl.com" in self.baseLink:
            infoDict["seriesTitle"] = self._FANMTL_TITLE_NOISE.sub("", soup.select("div.titles h1 a")[0].text)
            chapterTitle = self._FANMTL_TITLE_NOISE.sub("", soup.select("div.titles h2")[0].text)
            infoDict["chapterTitle"] = chapterTitle
            infoDict["author"] = ""
        elif "syosetu.com" in self.baseLink:
            chapterTitle = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True).strip()
            # = soup.select("h1.p-novel__title font font")
            infoDict["seriesTitle"] = re.sub(r"\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip()
            infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
        elif "fenrirealm.com" in self.baseLink:
            chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True).split(":")[-1].strip()
            infoDict["seriesTitle"] = soup.select_one("title").get_text(strip=True).split(" - ")[0].strip()
            infoDict["author"] = "unknown"

        # Heading followed by two line breaks in front of the chapter text.
        titleElement = soup.new_tag("h1")
        titleElement.string = chapterTitle
        content.insert(0, titleElement)
        content.insert(1, soup.new_tag("br"))
        content.insert(2, soup.new_tag("br"))

        return chapterTitle
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -0,0 +1,30 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
from mysql.connector.aio.charsets import charsets
|
||||||
|
|
||||||
|
|
||||||
|
def writeToFile(mode:str, path:str, content: str | bytes):
|
||||||
|
encoding = None if "b" in mode else "utf-8"
|
||||||
|
with open(path, mode, encoding=encoding) as file:
|
||||||
|
file.write(content)
|
||||||
|
|
||||||
|
|
||||||
|
def readFromFile(mode:str, path:str):
    """Return the whole content of ``path`` read with the given ``open`` mode.

    Modes containing ``"b"`` return bytes; all other modes decode the file as
    UTF-8 and return a string.
    """
    textEncoding = "utf-8" if "b" not in mode else None
    with open(path, mode, encoding=textEncoding) as handle:
        data = handle.read()
    return data
|
||||||
|
|
||||||
|
|
||||||
|
def makeDir(directory:str):
    """Create ``directory`` including missing parent folders.

    An already existing directory is left untouched. ``exist_ok=True`` avoids
    the gap between checking for the folder and creating it.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)
|
||||||
|
|
||||||
|
def writeToJsonFile(path:str, content:dict):
    """Serialise ``content`` as JSON and write it to ``path`` as UTF-8 text."""
    serialised = json.dumps(content)
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(serialised)
|
||||||
|
|
||||||
|
|
||||||
|
def readFromJsonFile(path:str):
    """Read the UTF-8 encoded JSON file at ``path`` and return the parsed value."""
    with open(path, "r", encoding="utf-8") as handle:
        rawText = handle.read()
    return json.loads(rawText)
|
||||||
@@ -0,0 +1,69 @@
|
|||||||
|
from pprint import pprint
|
||||||
|
|
||||||
|
from transformers import MarianMTModel, MarianTokenizer, T5Tokenizer, T5Model
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import torch
|
||||||
|
import os
|
||||||
|
|
||||||
|
class TranslatorOld:
    """Earlier translator variant: translate the text nodes of an HTML file with a T5 model."""

    def __init__(self, modalPath:str, inputFolder:str, outputFolder:str):
        """Load tokenizer and model from ``modalPath`` and move the model to the GPU if available.

        Args:
            modalPath: Model name or local path passed to ``from_pretrained``.
            inputFolder: Stored on the instance; not read by the methods of this class.
            outputFolder: Stored on the instance; not read by the methods of this class.
        """
        # T5Model is the bare encoder-decoder without a language-model head
        # and cannot generate text, so the generation-capable class is loaded.
        from transformers import T5ForConditionalGeneration

        self.tokenizer = T5Tokenizer.from_pretrained(modalPath)
        self.model = T5ForConditionalGeneration.from_pretrained(modalPath)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        pprint(self.device)
        self.model = self.model.to(self.device)
        self.inputFolder = inputFolder
        self.outputFolder = outputFolder

        # NOTE(review): this is set after the model was moved to the device;
        # it may need to be set before CUDA is initialised to have an effect — confirm.
        os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

    def downloadModal(self, modalName:str, savePath:str):
        """Download the Marian tokenizer and model ``modalName`` and save both to ``savePath``."""
        tokenizer = MarianTokenizer.from_pretrained(modalName)
        model = MarianMTModel.from_pretrained(modalName)

        # Save locally
        tokenizer.save_pretrained(savePath)
        model.save_pretrained(savePath)

        print(f"Modell gespeichert unter {savePath}")

    def _is_valid_text(self, text):
        """Return True if ``text`` contains anything besides whitespace."""
        return bool(text and text.strip())

    def _translate(self, text):
        """Translate ``text`` and return the result.

        Returns:
            The translated string, or ``None`` when ``text`` is empty or
            whitespace-only or when generation fails.
        """
        if not self._is_valid_text(text):
            print("Ungültiger Text, überspringe Übersetzung.")
            return None

        batch = self.tokenizer([text], return_tensors="pt", padding=True, truncation=True).to(self.device)
        with torch.no_grad():
            try:
                generated = self.model.generate(**batch, do_sample=True, top_k=50, top_p=0.95, temperature=0.7)
            except Exception as e:
                print(f"Fehler bei der Modellvorhersage: {e}")
                return None
        translated = self.tokenizer.decode(generated[0], skip_special_tokens=True)
        return translated

    def doTranslate(self, inputFile: str = r"W:\Temp\html\第1話 「レア」.html",
                    outputFile: str = r"W:\Temp\translate test\test1.html"):
        """Translate every text node of ``inputFile`` and write the result to ``outputFile``.

        Text inside ``<script>`` and ``<style>`` is left untouched. A node whose
        translation fails keeps its original text.

        Args:
            inputFile: HTML file to read; defaults to the original test file.
            outputFile: HTML file to write; defaults to the original test output.
        """
        with open(inputFile, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, "html.parser")

        # --- 2. Find the text and translate it ---
        for elem in soup.find_all(text=True):
            if elem.parent.name in ['script', 'style']:  # do not translate JS or CSS
                continue

            original_text = elem.strip()
            print(original_text)
            if not original_text:  # only if there is something to translate
                continue

            try:
                translated_text = self._translate(original_text)
                if translated_text is None:
                    # Replacing a node with None would drop the original text
                    # from the document, so a failed translation is skipped.
                    continue
                elem.replace_with(translated_text)
                print(translated_text)
            except Exception as e:
                print(f"Fehler beim Übersetzen von: {original_text[:30]}... => {e}")

        # --- 3. Save the translated file ---
        with open(outputFile, "w", encoding="utf-8") as f:
            f.write(str(soup))
|
||||||
@@ -0,0 +1,43 @@
|
|||||||
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, M2M100Model, M2M100Tokenizer, NllbTokenizer, \
|
||||||
|
NllbMoeModel, NllbTokenizerFast, T5Tokenizer, T5Model, T5ForConditionalGeneration
|
||||||
|
import torch
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Model name used for the NLLB tokenizer and model
model_name = r"facebook/nllb-200-3.3B"

# Load the tokenizer (source language Japanese) and the model in half precision
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="jpn_Jpan")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16)

# Save a local copy of the model and the tokenizer
model.save_pretrained(f"E:\\4K Anime\\models\\{model_name}", safe_serialization=False)
tokenizer.save_pretrained(f"E:\\4K Anime\\models\\{model_name}")

# Select the device: GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
|
||||||
|
|
||||||
|
# Übersetzungsfunktion
|
||||||
|
def translate(text):
    """Translate ``text`` with the module-level model and return the result.

    The input is truncated to 512 tokens and generation is forced to start
    with the ``eng_Latn`` token. Returns ``None`` if generation fails.
    """
    # Tokenize the text and move it to the same device as the model.
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

    try:
        with torch.no_grad():
            # Produce the model prediction.
            output_ids = model.generate(
                **encoded,
                do_sample=False,
                forced_bos_token_id=tokenizer.convert_tokens_to_ids("eng_Latn"),
                max_new_tokens=150,
            )
    except Exception as e:
        print(f"Fehler bei der Modellvorhersage: {e}")
        return None

    # Decode the first generated sequence.
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
||||||
|
|
||||||
|
# Translate a sample text and print the result
text = f"新暦12年。人類は地球の重力という枷から解き放たれる前に、肉体という枷から逃げ出すほうに注力していた。"
result = translate(text)
print(result)
|
||||||
Reference in New Issue
Block a user