init
This commit is contained in:
@@ -0,0 +1,36 @@
|
||||
import random
|
||||
|
||||
from src.EpubHandler import EpubHandler
|
||||
from src.Translator import Translator
|
||||
from src.WebScrapper import WebScrapper
|
||||
|
||||
# Press the green button in the gutter to run the script.
|
||||
if __name__ == "__main__":
    # ------------------------------------------------------------------
    # Earlier runs, kept as usage examples.
    # ------------------------------------------------------------------
    # FanMTL: download chapters 1-987, then bundle them into an EPUB.
    # scrapper = WebScrapper("https://www.fanmtl.com/novel/", r"E:\temp\WN\Game of the World Tree\HTML", "en")
    # scrapper.getHtml("game-of-the-world-tree_{}.html", 1, 987)
    # epubHandler = EpubHandler(r"E:\temp\WN\Game of the World Tree\HTML", r"E:\temp\WN\Game of the World Tree\epub")
    # epubHandler.convertHtmlToEpub("en", r"C:\Users\JohannesBOZZ\Downloads\01605-game-of-the-world-tree.jpg")
    #
    # Syosetu: download chapters 334-620 (612).
    # scrapper = WebScrapper("https://ncode.syosetu.com/n0806fu/", r"E:\temp\WN\Golden Experience\HTML", "jp")
    # scrapper.getHtml("{}/", 334, 620)
    #
    # Fenrir Realm: download chapters 377-828, waiting 2 seconds between requests.
    # scrapper = WebScrapper("https://fenrirealm.com/series/dorothys-forbidden-grimoire/", r"E:\temp\WN\Dorothy’s Forbidden Grimoire\HTML", "en")
    # scrapper.getHtml("{}", 377, 828, 2)
    #
    # Unpack an existing EPUB into HTML files.
    # epubHandler = EpubHandler(r"E:\temp\WN\Golden Experience\501-609", r"E:\temp\WN\Golden Experience\epub")
    # epubHandler = EpubHandler(r"C:\Users\JohannesBOZZ\Downloads\黄金の経験値 VI 特定災害生物「魔王」暗躍マグナメルム\html", r"C:\Users\JohannesBOZZ\Downloads\黄金の経験値 VI 特定災害生物「魔王」暗躍マグナメルム")
    # epubHandler.epub_to_html(r"C:\Users\JohannesBOZZ\Downloads\黄金の経験値 VI 特定災害生物「魔王」暗躍マグナメルム\黄金の経験値 VI 特定災害生物「魔王」暗躍マグナメルム.epub")
    #
    # Bundle a Japanese HTML folder into an EPUB.
    # epubHandler = EpubHandler(r"W:\Temp\html", r"W:\Temp\epub")
    # epubHandler.convertHtmlToEpub("jp")
    #
    # Machine-translate with NLLB.
    # translator = Translator(r"facebook/nllb-200-3.3B", r"W:\Temp\html", r"W:\Temp\translate test")
    # translator.doTranslate()

    # ------------------------------------------------------------------
    # Current run: build the EPUB for "Dorothy’s Forbidden Grimoire".
    # ------------------------------------------------------------------
    html_dir = r"E:\temp\WN\Dorothy’s Forbidden Grimoire\HTML"
    epub_dir = r"E:\temp\WN\Dorothy’s Forbidden Grimoire\EPUB"
    cover_image = r"C:\Users\JohannesBOZZ\Downloads\6e1de333d6af7aaa3fdf3ffa66ac6f55.png"

    epubHandler = EpubHandler(html_dir, epub_dir)
    epubHandler.convertHtmlToEpub("en", cover_image)
|
||||
|
||||
@@ -0,0 +1,12 @@
|
||||
requests
|
||||
bs4
|
||||
markdownify
|
||||
aspose-words
|
||||
ebooklib
|
||||
natsort
|
||||
transformers
|
||||
sentencepiece
|
||||
torch
|
||||
tensorflow
|
||||
flax
|
||||
protobuf==3.20.*
|
||||
@@ -0,0 +1,162 @@
|
||||
import json
|
||||
import os.path
|
||||
from enum import unique
|
||||
from os import scandir, listdir
|
||||
|
||||
import natsort
|
||||
from ebooklib import epub
|
||||
from bs4 import BeautifulSoup
|
||||
import aspose.words as aw
|
||||
from pprint import pprint
|
||||
|
||||
from sympy import false
|
||||
|
||||
from src.functions import makeDir, readFromFile, readFromJsonFile
|
||||
|
||||
|
||||
class EpubHandler:
    """Convert between a folder of chapter HTML files and EPUB books.

    ``htmlPath`` is the folder holding the chapter ``.html`` files and
    ``epubPath`` is the folder that ``convertHtmlToEpub`` writes books to.
    """

    def __init__(self, htmlPath, epubPath):
        """Store both folder paths and create the folders via ``makeDir``."""
        self.htmlPath = htmlPath
        self.epubPath = epubPath
        makeDir(self.htmlPath)
        makeDir(self.epubPath)

    @staticmethod
    def _sidecarPath(htmlFile):
        """Return the path of the ``.json`` info file belonging to ``htmlFile``.

        Only the file name is rewritten, so a folder whose name happens to
        contain ``.html`` is left untouched.
        """
        directory, name = os.path.split(htmlFile)
        return os.path.join(directory, name.replace(".html", ".json"))

    def convertHtmlToEpub(self, language: str, coverImagePath=None):
        """Bundle every ``.html`` chapter in ``htmlPath`` into one EPUB.

        Each chapter needs a ``.json`` info file with the same base name that
        provides ``chapter``, ``chapterTitle``, ``seriesTitle`` and
        ``currentLanguage``. The book is written to
        ``<epubPath>/<seriesTitle>.epub``.

        Args:
            language: language code stored on every chapter document.
            coverImagePath: optional path of an image to use as the cover.

        Raises:
            FileNotFoundError: if ``htmlPath`` contains no ``.html`` file.
        """
        # Only real chapter files: skips the JSON info files, the "images"
        # folder created by epub_to_html and anything else lying around.
        htmlFiles = natsort.os_sorted([
            entry.path for entry in scandir(self.htmlPath)
            if entry.is_file() and entry.path.endswith(".html")
        ])
        if not htmlFiles:
            raise FileNotFoundError(f"No .html chapter files found in {self.htmlPath}")

        # Series-wide metadata is read from the first chapter's info file.
        bookInfo = readFromJsonFile(self._sidecarPath(htmlFiles[0]))

        book = epub.EpubBook()
        book.set_title(bookInfo["seriesTitle"])
        book.set_language(bookInfo["currentLanguage"])
        if coverImagePath:
            # Keep the real extension so a PNG cover is not stored as "cover.jpg".
            coverExtension = os.path.splitext(coverImagePath)[1].lower() or ".jpg"
            book.set_cover(f"cover{coverExtension}", readFromFile("rb", coverImagePath))

        chaptersByNumber = {}
        for htmlFile in htmlFiles:
            chapterInfo = readFromJsonFile(self._sidecarPath(htmlFile))
            try:
                chapterNumber = int(chapterInfo["chapter"])
            except (KeyError, TypeError, ValueError):
                # Without a usable chapter number the chapter cannot be ordered.
                print(f"skipping {htmlFile}: invalid chapter number")
                pprint(chapterInfo)
                continue

            chapter = epub.EpubHtml(
                title=chapterInfo["chapterTitle"],
                file_name=f"chapter{chapterInfo['chapter']}.xhtml",
                lang=language,
            )
            chapter.content = readFromFile("r", htmlFile)
            book.add_item(chapter)
            chaptersByNumber[chapterNumber] = chapter

        # Add the chapters to the TOC and spine in ascending chapter order.
        toc = []
        spine = ["nav"]
        for chapterNumber in sorted(chaptersByNumber):
            chapter = chaptersByNumber[chapterNumber]
            toc.append(epub.Link(chapter.file_name, chapter.title, chapter.title))
            spine.append(chapter)

        book.toc = toc
        book.spine = spine
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())
        epub.write_epub(os.path.join(self.epubPath, f"{book.title}.epub"), book)

    # ======================================================
    # EPUB → HTML
    # ======================================================
    def epub_to_html(self, epub_file):
        """Unpack ``epub_file`` into ``htmlPath``.

        Spine documents are written in reading order as
        ``NNN_<original file name>``; all image items of the book are written
        to ``<htmlPath>/images``.
        """
        # The ITEM_* type constants are documented on the top-level package.
        import ebooklib

        book = epub.read_epub(epub_file)

        images_dir = os.path.join(self.htmlPath, "images")
        makeDir(images_dir)

        chapter_count = 0
        for spine_entry in book.spine:
            # Spine entries are (item id, linear) pairs; tolerate bare ids too.
            item_id = spine_entry[0] if isinstance(spine_entry, (tuple, list)) else spine_entry
            item = book.get_item_with_id(item_id)
            if item is None or item.get_type() != ebooklib.ITEM_DOCUMENT:
                continue

            chapter_count += 1
            filename = f"{chapter_count:03d}_{os.path.basename(item.file_name)}"
            with open(os.path.join(self.htmlPath, filename), "wb") as f:
                f.write(item.get_content())

        # Images are not part of the spine, so they are collected from the
        # book's item list instead.
        for image in book.get_items_of_type(ebooklib.ITEM_IMAGE):
            image_path = os.path.join(images_dir, os.path.basename(image.file_name))
            with open(image_path, "wb") as f:
                f.write(image.get_content())

        print(f"✔ EPUB nach HTML exportiert ({chapter_count} Kapitel)")

    # ======================================================
    # HTML → EPUB
    # ======================================================
    def html_to_epub(self, output_epub, title="Translated Book", lang="de"):
        """Build ``output_epub`` from the ``.html`` files in ``htmlPath``.

        Files are added in sorted file-name order, each file name doubling as
        chapter title. Files found in ``<htmlPath>/images`` are embedded under
        ``images/``.
        """
        book = epub.EpubBook()
        book.set_title(title)
        book.set_language(lang)

        # Load the chapters in sorted order.
        html_files = sorted(
            name for name in os.listdir(self.htmlPath) if name.endswith(".html")
        )

        spine = ["nav"]
        chapters = []
        for html_file in html_files:
            with open(os.path.join(self.htmlPath, html_file), "r", encoding="utf-8") as f:
                content = f.read()

            chapter = epub.EpubHtml(title=html_file, file_name=html_file, content=content)
            book.add_item(chapter)
            chapters.append(chapter)
            spine.append(chapter)

        # Re-embed the images.
        images_dir = os.path.join(self.htmlPath, "images")
        if os.path.exists(images_dir):
            for img in os.listdir(images_dir):
                img_path = os.path.join(images_dir, img)
                if not os.path.isfile(img_path):
                    continue  # sub-folders cannot be embedded as an image
                with open(img_path, "rb") as f:
                    image = epub.EpubItem(
                        uid=img,
                        file_name=f"images/{img}",
                        media_type=self._guess_mime(img),
                        content=f.read(),
                    )
                book.add_item(image)

        # The collected chapters double as the table of contents.
        book.toc = chapters
        book.spine = spine
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())

        epub.write_epub(output_epub, book)
        print("✔ EPUB neu erstellt")

    # ======================================================
    # MIME helper
    # ======================================================
    def _guess_mime(self, filename):
        """Return the image MIME type for ``filename`` based on its extension.

        Unknown or missing extensions yield ``application/octet-stream``.
        """
        _, dot, ext = filename.lower().rpartition(".")
        if not dot:
            # No extension at all: a file literally named "png" is not a PNG.
            return "application/octet-stream"
        return {
            "jpg": "image/jpeg",
            "jpeg": "image/jpeg",
            "png": "image/png",
            "gif": "image/gif",
            "svg": "image/svg+xml",
            "webp": "image/webp",
        }.get(ext, "application/octet-stream")
|
||||
@@ -0,0 +1,69 @@
|
||||
from pprint import pprint
|
||||
|
||||
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
|
||||
from bs4 import BeautifulSoup
|
||||
import torch
|
||||
import os
|
||||
|
||||
class Translator:
    """Translate the text nodes of an HTML file with a seq2seq model.

    The tokenizer is created with ``src_lang="jpn_Jpan"`` and generation is
    forced to start with the ``eng_Latn`` token, i.e. the class is set up for
    Japanese → English with an NLLB-style checkpoint.
    """

    def __init__(self, modalPath: str, inputFolder: str, outputFolder: str):
        """Load tokenizer and model from ``modalPath`` and move the model to the GPU if present.

        Args:
            modalPath: model id or local folder accepted by ``from_pretrained``.
            inputFolder: stored on the instance; not read by the methods below.
            outputFolder: stored on the instance; not read by the methods below.
        """
        # Set before the first CUDA call; setting it after the model already
        # lives on the device (as before) is too late to take effect.
        os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        pprint(self.device)

        # Half precision only on the GPU; on CPU fall back to float32.
        # NOTE(review): float16 generation on CPU is slow or unsupported for
        # some ops depending on the torch version.
        dtype = torch.float16 if self.device.type == "cuda" else torch.float32

        self.tokenizer = AutoTokenizer.from_pretrained(modalPath, src_lang="jpn_Jpan")
        self.model = AutoModelForSeq2SeqLM.from_pretrained(modalPath, torch_dtype=dtype)
        self.model = self.model.to(self.device)
        self.inputFolder = inputFolder
        self.outputFolder = outputFolder

    def downloadModal(self, modalName: str, savePath: str):
        """Download ``modalName`` and save tokenizer and model to ``savePath``.

        Uses the same Auto classes as ``__init__`` so that the saved folder can
        be passed back in as ``modalPath``.
        """
        tokenizer = AutoTokenizer.from_pretrained(modalName)
        model = AutoModelForSeq2SeqLM.from_pretrained(modalName)

        # Store locally.
        tokenizer.save_pretrained(savePath)
        model.save_pretrained(savePath)

        print(f"Modell gespeichert unter {savePath}")

    def _is_valid_text(self, text):
        """Return True if ``text`` contains anything besides whitespace."""
        return bool(text) and len(text.strip()) > 0

    def _translate(self, text):
        """Translate ``text``; return None for blank input or when generation fails."""
        if not self._is_valid_text(text):
            print("Ungültiger Text, überspringe Übersetzung.")
            return None

        batch = self.tokenizer([text], return_tensors="pt", padding=True, truncation=True).to(self.device)
        with torch.no_grad():
            try:
                # NOTE(review): do_sample=True makes the output non-deterministic.
                generated = self.model.generate(
                    **batch,
                    do_sample=True,
                    forced_bos_token_id=self.tokenizer.convert_tokens_to_ids("eng_Latn"),
                    max_new_tokens=150,
                )
            except Exception as e:
                print(f"Fehler bei der Modellvorhersage: {e}")
                return None
            translated = self.tokenizer.decode(generated[0], skip_special_tokens=True)
        return translated

    def doTranslate(self, inputFile: str = r"W:\Temp\html\第1話 「レア」.html",
                    outputFile: str = r"W:\Temp\translate test\test2.html"):
        """Translate every text node of ``inputFile`` and write the result to ``outputFile``.

        Text inside ``<script>`` and ``<style>`` is left alone. A node whose
        translation fails keeps its original text.

        Args:
            inputFile: HTML file to read (defaults to the previous fixed path).
            outputFile: HTML file to write (defaults to the previous fixed path).
        """
        with open(inputFile, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, "html.parser")

        # --- 2. Find the text nodes and translate them ---
        for elem in soup.find_all(string=True):
            if elem.parent.name in ("script", "style"):
                continue  # never translate JS or CSS

            original_text = elem.strip()
            print(original_text)
            if not original_text:
                continue

            try:
                translated_text = self._translate(original_text)
            except Exception as e:
                print(f"Fehler beim Übersetzen von: {original_text[:30]}... => {e}")
                continue

            if translated_text is None:
                continue  # generation failed; keep the original text

            elem.replace_with(translated_text)
            print(translated_text)

        # --- 3. Save the translated file ---
        with open(outputFile, "w", encoding="utf-8") as f:
            f.write(str(soup))
|
||||
@@ -0,0 +1,116 @@
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from urllib.parse import urljoin
|
||||
from pprint import pprint
|
||||
import requests
|
||||
from bs4 import BeautifulSoup, NavigableString
|
||||
from src.functions import writeToFile, makeDir, writeToJsonFile
|
||||
|
||||
|
||||
# https://ncode.syosetu.com/n0806fu
|
||||
|
||||
|
||||
|
||||
class WebScrapper:
    """Download web-novel chapters and store each one as HTML plus a JSON info file.

    Supported sites are recognised by a substring of ``baseLink``:
    ``fanmtl.com``, ``syosetu.com`` and ``fenrirealm.com``.
    """

    # Characters that are not allowed in Windows file names.
    _INVALID_FILENAME_CHARS = re.compile(r'[<>|?:*"\\/]')
    # FanMTL titles: additionally drop anything in parentheses.
    _FANMTL_TITLE_NOISE = re.compile(r'\(.*?\)|[<>|?:*"\\/]')
    # Seconds to wait for a response before giving up on a chapter.
    _REQUEST_TIMEOUT = 30
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
        #'Referer': 'https://ncode.syosetu.com/',
        #'Accept-Language': 'de,en;q=0.9',
    }

    def __init__(self, baseLink: str, htmlFolderPath: str, currentLanguage: str):
        """Store the settings and create ``htmlFolderPath`` via ``makeDir``."""
        self.baseLink = baseLink
        self.htmlFolderPath = htmlFolderPath
        self.currentLanguage = currentLanguage
        makeDir(self.htmlFolderPath)

    @staticmethod
    def _sanitizeFileName(name: str) -> str:
        """Return ``name`` without characters that are invalid in file names."""
        return WebScrapper._INVALID_FILENAME_CHARS.sub("", name).strip()

    def getHtml(self, uriWithFormat: str, fromChapter: int, toChapter: int, sleepTime: float = 0):
        """Download chapters ``fromChapter``..``toChapter`` (inclusive).

        For every chapter the cleaned content is written to
        ``<htmlFolderPath>/<chapter title>.html`` and the chapter metadata to
        a ``.json`` file with the same base name. Chapters that cannot be
        downloaded or contain no recognisable content are skipped.

        Args:
            uriWithFormat: inserts the current chapter number into the {} brackets
            fromChapter: first chapter number to download.
            toChapter: last chapter number to download.
            sleepTime: seconds to wait before each request.
        """
        for chapterNumber in range(fromChapter, toChapter + 1):
            # to avoid getting blocked by the website for sending too many requests in a short time
            time.sleep(sleepTime)

            infoDict = {
                "chapter": chapterNumber,
                "originalLanguage": self.currentLanguage,
                "currentLanguage": self.currentLanguage,
            }
            link = urljoin(self.baseLink, uriWithFormat.format(chapterNumber))
            print(link)

            # A parsed soup object is not a usable failure signal, so the
            # HTTP result is checked explicitly.
            try:
                response = requests.get(link, headers=self._HEADERS, timeout=self._REQUEST_TIMEOUT)
                response.raise_for_status()
            except requests.RequestException as error:
                print(f"skipping Chapter {chapterNumber}: {error}")
                continue

            soup = BeautifulSoup(response.content, "html.parser")

            chapterContent = self._getChapterContent(soup)
            if chapterContent is None:
                print(f"skipping Chapter {chapterNumber}. No content found")
                continue

            self._removeUnwantedThinsFromHtml(chapterContent)
            chapterTitle = self._addChapterTitle(soup, chapterContent, chapterNumber, infoDict)
            infoDict["chapterTitle"] = chapterTitle

            # The title itself may contain characters such as "?" or "/".
            fileName = self._sanitizeFileName(chapterTitle) or f"Chapter {chapterNumber}"
            filePath = os.path.join(self.htmlFolderPath, f"{fileName}.html")
            # Pretty-printed UTF-8, starting at indent level 4.
            writeToFile("wb", filePath, chapterContent.encode_contents(indent_level=4, encoding="utf-8"))
            writeToJsonFile(os.path.join(self.htmlFolderPath, f"{fileName}.json"), infoDict)

    def _getChapterContent(self, soup: "BeautifulSoup"):
        """Return the element holding the chapter text, or None if none is found."""
        chapterContent = None

        if "fanmtl.com" in self.baseLink:
            chapterContent = soup.find("div", {"class": "chapter-content"})
        elif "syosetu.com" in self.baseLink:
            # Several blocks can match; only one with more than 1000
            # characters of text is accepted (the last such block wins).
            for candidate in soup.select("div.p-novel__body div.js-novel-text.p-novel__text"):
                if len(candidate.text) > 1000:
                    chapterContent = candidate
        elif "fenrirealm.com" in self.baseLink:
            # select_one returns None instead of raising when nothing matches.
            chapterContent = soup.select_one("div.chapter-view > div.content-area")

        return chapterContent

    def _removeUnwantedThinsFromHtml(self, content: "BeautifulSoup | NavigableString"):
        """Remove advertisement containers and ``<script>`` tags from ``content`` in place."""
        if not content:
            return

        # FanMTL advertisements
        if "fanmtl.com" in self.baseLink:
            for div in content.find_all('div', {'align': 'center'}):
                if div.find('script'):
                    div.decompose()

        # general
        for script in content.find_all('script'):
            script.decompose()

    def _addChapterTitle(self, soup: "BeautifulSoup", content: "BeautifulSoup | NavigableString",
                         chapterNumber, infoDict: dict):
        """Insert the chapter title as ``<h1>`` at the top of ``content``.

        Also fills ``seriesTitle`` and ``author`` in ``infoDict`` for the
        supported sites.

        Returns:
            The chapter title; ``"Chapter <chapterNumber>"`` if the site is not
            one of the supported ones.
        """
        chapterTitle = f"Chapter {chapterNumber}"
        if "fanmtl.com" in self.baseLink:
            infoDict["seriesTitle"] = self._FANMTL_TITLE_NOISE.sub("", soup.select("div.titles h1 a")[0].text)
            chapterTitle = self._FANMTL_TITLE_NOISE.sub("", soup.select("div.titles h2")[0].text)
            infoDict["chapterTitle"] = chapterTitle
            infoDict["author"] = ""
        elif "syosetu.com" in self.baseLink:
            chapterTitle = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True).strip()
            # = soup.select("h1.p-novel__title font font")
            infoDict["seriesTitle"] = re.sub(r"\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip()
            infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
        elif "fenrirealm.com" in self.baseLink:
            # Keep only the part after the last ":" of the heading.
            chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True).split(":")[-1].strip()
            infoDict["seriesTitle"] = soup.select_one("title").get_text(strip=True).split(" - ")[0].strip()
            infoDict["author"] = "unknown"

        titleElement = soup.new_tag("h1")
        titleElement.string = chapterTitle
        content.insert(0, titleElement)
        content.insert(1, soup.new_tag("br"))
        content.insert(2, soup.new_tag("br"))

        return chapterTitle
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
from mysql.connector.aio.charsets import charsets
|
||||
|
||||
|
||||
def writeToFile(mode:str, path:str, content: str | bytes):
|
||||
encoding = None if "b" in mode else "utf-8"
|
||||
with open(path, mode, encoding=encoding) as file:
|
||||
file.write(content)
|
||||
|
||||
|
||||
def readFromFile(mode: str, path: str):
    """Return the whole content of ``path`` opened with ``mode``.

    Text modes are decoded as UTF-8; binary modes (a ``b`` in ``mode``) return
    bytes.
    """
    isBinary = "b" in mode
    textEncoding = "utf-8" if not isBinary else None
    with open(path, mode, encoding=textEncoding) as handle:
        data = handle.read()
    return data
|
||||
|
||||
|
||||
def makeDir(directory: str):
    """Create ``directory`` including missing parent folders.

    Does nothing if the directory already exists. ``exist_ok=True`` avoids
    the gap between checking for the folder and creating it.
    """
    os.makedirs(directory, exist_ok=True)
|
||||
|
||||
def writeToJsonFile(path: str, content: dict):
    """Serialize ``content`` as JSON and write it to ``path`` (UTF-8 text file)."""
    serialized = json.dumps(content)
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(serialized)
|
||||
|
||||
|
||||
def readFromJsonFile(path: str):
    """Read the UTF-8 file at ``path`` and return its parsed JSON content."""
    with open(path, "r", encoding="utf-8") as handle:
        rawText = handle.read()
    return json.loads(rawText)
|
||||
@@ -0,0 +1,69 @@
|
||||
from pprint import pprint
|
||||
|
||||
from transformers import MarianMTModel, MarianTokenizer, T5Tokenizer, T5Model
|
||||
from bs4 import BeautifulSoup
|
||||
import torch
|
||||
import os
|
||||
|
||||
class TranslatorOld:
    """Earlier translator variant built on a T5 tokenizer and model.

    Translates the text nodes of one HTML file and writes the result to a
    second file.
    """

    def __init__(self, modalPath: str, inputFolder: str, outputFolder: str):
        """Load tokenizer and model from ``modalPath`` and move the model to the GPU if present.

        Args:
            modalPath: model id or local folder accepted by ``from_pretrained``.
            inputFolder: stored on the instance; not read by the methods below.
            outputFolder: stored on the instance; not read by the methods below.
        """
        # The bare T5Model has no language-model head and therefore cannot
        # generate text; the conditional-generation variant is required.
        from transformers import T5ForConditionalGeneration

        # Set before the first CUDA call; afterwards it has no effect.
        os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

        self.tokenizer = T5Tokenizer.from_pretrained(modalPath)
        self.model = T5ForConditionalGeneration.from_pretrained(modalPath)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        pprint(self.device)
        self.model = self.model.to(self.device)
        self.inputFolder = inputFolder
        self.outputFolder = outputFolder

    def downloadModal(self, modalName: str, savePath: str):
        """Download the Marian model ``modalName`` and save it to ``savePath``.

        NOTE(review): this saves a Marian checkpoint while ``__init__`` loads
        T5 classes — confirm which model family this class is meant for.
        """
        tokenizer = MarianTokenizer.from_pretrained(modalName)
        model = MarianMTModel.from_pretrained(modalName)

        # Store locally.
        tokenizer.save_pretrained(savePath)
        model.save_pretrained(savePath)

        print(f"Modell gespeichert unter {savePath}")

    def _is_valid_text(self, text):
        """Return True if ``text`` contains anything besides whitespace."""
        return bool(text) and len(text.strip()) > 0

    def _translate(self, text):
        """Translate ``text``; return None for blank input or when generation fails."""
        if not self._is_valid_text(text):
            print("Ungültiger Text, überspringe Übersetzung.")
            return None

        batch = self.tokenizer([text], return_tensors="pt", padding=True, truncation=True).to(self.device)
        with torch.no_grad():
            try:
                generated = self.model.generate(**batch, do_sample=True, top_k=50, top_p=0.95, temperature=0.7)
            except Exception as e:
                print(f"Fehler bei der Modellvorhersage: {e}")
                return None
            translated = self.tokenizer.decode(generated[0], skip_special_tokens=True)
        return translated

    def doTranslate(self, inputFile: str = r"W:\Temp\html\第1話 「レア」.html",
                    outputFile: str = r"W:\Temp\translate test\test1.html"):
        """Translate every text node of ``inputFile`` and write the result to ``outputFile``.

        Text inside ``<script>`` and ``<style>`` is left alone. A node whose
        translation fails keeps its original text.

        Args:
            inputFile: HTML file to read (defaults to the previous fixed path).
            outputFile: HTML file to write (defaults to the previous fixed path).
        """
        with open(inputFile, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, "html.parser")

        # --- 2. Find the text nodes and translate them ---
        for elem in soup.find_all(string=True):
            if elem.parent.name in ("script", "style"):
                continue  # never translate JS or CSS

            original_text = elem.strip()
            print(original_text)
            if not original_text:
                continue

            try:
                translated_text = self._translate(original_text)
            except Exception as e:
                print(f"Fehler beim Übersetzen von: {original_text[:30]}... => {e}")
                continue

            if translated_text is None:
                continue  # generation failed; keep the original text

            elem.replace_with(translated_text)
            print(translated_text)

        # --- 3. Save the translated file ---
        with open(outputFile, "w", encoding="utf-8") as f:
            f.write(str(soup))
|
||||
@@ -0,0 +1,43 @@
|
||||
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, M2M100Model, M2M100Tokenizer, NllbTokenizer, \
|
||||
NllbMoeModel, NllbTokenizerFast, T5Tokenizer, T5Model, T5ForConditionalGeneration
|
||||
import torch
|
||||
import os
|
||||
|
||||
|
||||
# Debug switch for CUDA; set before any model is loaded.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Checkpoint used for this NLLB test.
model_name = r"facebook/nllb-200-3.3B"

# Load the tokenizer (Japanese source text) and the model in half precision.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    src_lang="jpn_Jpan",
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
)

# Keep a local copy of the model and the tokenizer.
model.save_pretrained(
    f"E:\\4K Anime\\models\\{model_name}",
    safe_serialization=False,
)
tokenizer.save_pretrained(f"E:\\4K Anime\\models\\{model_name}")

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)
|
||||
|
||||
# Übersetzungsfunktion
|
||||
def translate(text):
    """Translate ``text`` to English with the module-level tokenizer and model.

    Returns the decoded translation, or None if generation raises.
    """
    # Tokenize (at most 512 tokens) and move the tensors to the model's device.
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    ).to(device)

    with torch.no_grad():
        try:
            # Generate without sampling; the first generated token is forced to "eng_Latn".
            output_ids = model.generate(
                **encoded,
                do_sample=False,
                forced_bos_token_id=tokenizer.convert_tokens_to_ids("eng_Latn"),
                max_new_tokens=150,
            )
        except Exception as e:
            print(f"Fehler bei der Modellvorhersage: {e}")
            return None

    # Decode the first (and only) generated sequence.
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
||||
|
||||
# Beispieltext übersetzen
|
||||
# Translate one sample sentence and show the result.
text = "新暦12年。人類は地球の重力という枷から解き放たれる前に、肉体という枷から逃げ出すほうに注力していた。"
result = translate(text)
print(result)
|
||||
Reference in New Issue
Block a user