This commit is contained in:
2026-05-21 08:16:10 +02:00
parent 9690367d70
commit 54c7b40737
5 changed files with 431 additions and 9 deletions
+4
View File
@@ -0,0 +1,4 @@
.idea
test*.py
test.*
*.log
+374
View File
@@ -0,0 +1,374 @@
import customtkinter as ctk
import threading
import sys
from io import StringIO
from tkinter import filedialog, messagebox
from src.EpubHandler import EpubHandler
from src.WebScrapper import WebScrapper
class LogRedirector:
    """File-like object that appends everything written to it to a text widget.

    Meant to be assigned to ``sys.stdout`` so that ``print()`` output shows up
    in the GUI. The widget is expected to offer ``configure``, ``insert``,
    ``see``, ``update_idletasks`` and ``after`` (as Tk text widgets do).
    """

    def __init__(self, text_widget):
        """Remember the widget that receives the text."""
        self.text_widget = text_widget
        # Not read by this class; kept so existing users of the attribute keep working.
        self.buffer = StringIO()

    def write(self, message):
        """Append *message* to the widget and return the number of characters taken.

        Calls made on the main thread update the widget immediately. Calls from
        any other thread are queued with ``after(0, ...)`` so the widget is only
        modified by the thread that runs the event loop.
        NOTE(review): this assumes the Tk main loop runs on Python's main thread.
        """
        if not message:
            # Nothing to show; skip the unlock/lock round trip on the widget.
            return 0
        if threading.current_thread() is threading.main_thread():
            self._append(message)
        else:
            self.text_widget.after(0, self._append, message)
        return len(message)

    def _append(self, message):
        """Insert *message* at the end of the read-only widget and scroll to it."""
        widget = self.text_widget
        # The widget is kept disabled so the user cannot edit the log;
        # it has to be unlocked for the duration of the insert.
        widget.configure(state="normal")
        widget.insert("end", message)
        widget.see("end")
        widget.configure(state="disabled")
        widget.update_idletasks()

    def flush(self):
        """Do nothing; text is handed to the widget as soon as it is written."""
        pass
class App(ctk.CTk):
    """Main window of the scraper / EPUB GUI.

    The window holds a tab view with two tabs, "WebScrapper" and "EpubHandler".
    Each tab collects its parameters in entry widgets, runs the actual work on
    a daemon thread, and shows a progress bar plus the ``print()`` output that
    was captured while the task ran.
    """

    def __init__(self):
        """Create the window, the tab view and the content of both tabs."""
        super().__init__()
        self.title("J-Novel Scrapper & Translator")
        self.geometry("900x700")

        # Theme
        ctk.set_appearance_mode("dark")
        ctk.set_default_color_theme("blue")

        # Bookkeeping for the stdout redirection done by background tasks.
        # Tasks on both tabs may overlap, so the redirectors are tracked in
        # start order and the real stdout is only restored after the last one.
        self._stdout_lock = threading.Lock()
        self._active_redirectors = []
        self._saved_stdout = sys.stdout

        # Tab view with one tab per tool
        self.tabview = ctk.CTkTabview(self, width=850, height=650)
        self.tabview.pack(padx=20, pady=20, fill="both", expand=True)
        self.tabview.add("WebScrapper")
        self.tabview.add("EpubHandler")

        self.setup_webscrapper_tab()
        self.setup_epubhandler_tab()

    # ==================== Shared helpers ====================
    def _add_entry_row(self, frame, row, label, placeholder, width, sticky="ew", browse=None):
        """Add a "label + entry (+ browse button)" row to *frame* and return the entry."""
        ctk.CTkLabel(frame, text=label).grid(row=row, column=0, padx=5, pady=5, sticky="w")
        entry = ctk.CTkEntry(frame, width=width, placeholder_text=placeholder)
        entry.grid(row=row, column=1, padx=5, pady=5, sticky=sticky)
        if browse is not None:
            ctk.CTkButton(frame, text="...", width=50, command=browse).grid(
                row=row, column=2, padx=5, pady=5
            )
        return entry

    def _add_language_row(self, frame, row):
        """Add the language combo box (preset to "en") to *frame* and return it."""
        ctk.CTkLabel(frame, text="Sprache:").grid(row=row, column=0, padx=5, pady=5, sticky="w")
        box = ctk.CTkComboBox(frame, values=["en", "jp", "de"], width=100)
        box.set("en")
        box.grid(row=row, column=1, padx=5, pady=5, sticky="w")
        return box

    def _add_progress_and_log(self, tab):
        """Add a progress bar and a read-only log box to *tab*; return both."""
        progress = ctk.CTkProgressBar(tab, width=400)
        progress.pack(padx=10, pady=5, fill="x")
        progress.set(0)
        ctk.CTkLabel(tab, text="Logs:", anchor="w").pack(padx=10, pady=(10, 0), fill="x")
        log = ctk.CTkTextbox(tab, height=200, state="disabled")
        log.pack(padx=10, pady=5, fill="both", expand=True)
        return progress, log

    @staticmethod
    def _set_entry_text(entry, text):
        """Replace the content of *entry* with *text*."""
        entry.delete(0, "end")
        entry.insert(0, text)

    @staticmethod
    def _clear_log(log):
        """Empty a read-only log box (it must be unlocked while it is modified)."""
        log.configure(state="normal")
        log.delete("1.0", "end")
        log.configure(state="disabled")

    def _browse_folder_into(self, entry):
        """Ask for a directory and, unless the dialog was cancelled, put it into *entry*."""
        folder = filedialog.askdirectory()
        if folder:
            self._set_entry_text(entry, folder)

    def _on_ui(self, func, *args, **kwargs):
        """Queue ``func(*args, **kwargs)`` to run from the Tk event loop.

        Worker threads use this for every widget update and dialog so that the
        widgets' own Python code runs on the event-loop thread.
        """
        self.after(0, lambda: func(*args, **kwargs))

    def _begin_log_capture(self, log_widget):
        """Point ``sys.stdout`` at *log_widget* and return the redirector in use."""
        redirector = LogRedirector(log_widget)
        with self._stdout_lock:
            if not self._active_redirectors:
                # Only the first active task sees the real stdout.
                self._saved_stdout = sys.stdout
            self._active_redirectors.append(redirector)
            sys.stdout = redirector
        return redirector

    def _end_log_capture(self, redirector):
        """Undo :meth:`_begin_log_capture` for *redirector*.

        stdout falls back to the most recently started task that is still
        running, or to the real stdout when none is left. This prevents stdout
        from staying redirected forever when two tasks overlap.
        """
        with self._stdout_lock:
            self._active_redirectors.remove(redirector)
            if self._active_redirectors:
                sys.stdout = self._active_redirectors[-1]
            else:
                sys.stdout = self._saved_stdout

    def _run_task(self, log_widget, work, success_log, success_message, reset_ui):
        """Run *work* on the current (worker) thread with stdout captured.

        On success *success_log* is printed and an info dialog with
        *success_message* is shown; on any ``Exception`` the error is printed
        and shown in an error dialog. In every case stdout is restored and
        *reset_ui* is queued on the event loop. The controls are reset before
        the dialog is queued, so they are usable again while it is open.
        """
        redirector = self._begin_log_capture(log_widget)
        notify = None
        try:
            work()
            print(success_log)
            notify = (messagebox.showinfo, "Erfolg", success_message)
        except Exception as e:
            print(f"\n❌ Fehler: {str(e)}")
            notify = (messagebox.showerror, "Fehler", f"Ein Fehler ist aufgetreten:\n{str(e)}")
        finally:
            self._end_log_capture(redirector)
            self._on_ui(reset_ui)
            if notify is not None:
                self._on_ui(*notify)

    # ==================== WebScrapper Tab ====================
    def setup_webscrapper_tab(self):
        """Build the input form, start button, progress bar and log of the scraper tab."""
        tab = self.tabview.tab("WebScrapper")

        # Frame holding the input fields
        input_frame = ctk.CTkFrame(tab)
        input_frame.pack(padx=10, pady=10, fill="x")

        self.ws_base_url = self._add_entry_row(
            input_frame, 0, "Base URL:", "https://example.com/novel/", 400
        )
        self.ws_html_path = self._add_entry_row(
            input_frame, 1, "HTML Ordner:", r"E:\temp\WN\Novel\HTML", 300,
            browse=self.browse_ws_html_folder,
        )
        self.ws_uri_format = self._add_entry_row(
            input_frame, 2, "URI Format:", "chapter_{}.html oder {}/", 400
        )
        self.ws_from_chapter = self._add_entry_row(
            input_frame, 3, "Von Kapitel:", "1", 100, sticky="w"
        )
        self.ws_to_chapter = self._add_entry_row(
            input_frame, 4, "Bis Kapitel:", "100", 100, sticky="w"
        )
        self.ws_sleep_time = self._add_entry_row(
            input_frame, 5, "Sleep Time (s):", "0", 100, sticky="w"
        )
        self.ws_language = self._add_language_row(input_frame, 6)
        # Let the entry column absorb extra horizontal space
        input_frame.columnconfigure(1, weight=1)

        self.ws_start_btn = ctk.CTkButton(
            tab, text="Scraping starten", command=self.start_webscrapper, height=40
        )
        self.ws_start_btn.pack(padx=10, pady=10, fill="x")

        self.ws_progress, self.ws_log = self._add_progress_and_log(tab)

    def browse_ws_html_folder(self):
        """Pick the scraper's HTML output folder with a directory dialog."""
        self._browse_folder_into(self.ws_html_path)

    def start_webscrapper(self):
        """Validate the scraper form and start scraping on a daemon thread."""
        base_url = self.ws_base_url.get().strip()
        html_path = self.ws_html_path.get().strip()
        uri_format = self.ws_uri_format.get().strip()
        if not (base_url and html_path and uri_format):
            messagebox.showerror("Fehler", "Bitte alle Pflichtfelder ausfüllen!")
            return

        try:
            # Empty fields fall back to chapter 1 / no pause
            from_chapter = int(self.ws_from_chapter.get() or 1)
            to_chapter = int(self.ws_to_chapter.get() or 1)
            sleep_time = float(self.ws_sleep_time.get() or 0)
        except ValueError:
            messagebox.showerror("Fehler", "Kapitel und Sleep Time müssen Zahlen sein!")
            return

        # An inverted range would scrape nothing and still report success.
        if from_chapter > to_chapter:
            messagebox.showerror("Fehler", "'Von Kapitel' darf nicht größer als 'Bis Kapitel' sein!")
            return
        if sleep_time < 0:
            messagebox.showerror("Fehler", "Sleep Time darf nicht negativ sein!")
            return

        # Block a second start while the task runs and reset the output widgets
        self.ws_start_btn.configure(state="disabled", text="Läuft...")
        self.ws_progress.set(0)
        self._clear_log(self.ws_log)

        threading.Thread(
            target=self.run_webscrapper,
            args=(base_url, html_path, self.ws_language.get(),
                  uri_format, from_chapter, to_chapter, sleep_time),
            daemon=True,
        ).start()

    def run_webscrapper(self, base_url, html_path, language, uri_format, from_ch, to_ch, sleep_time):
        """Scrape chapters *from_ch* to *to_ch* (inclusive); runs on a worker thread.

        Chapters are fetched one ``getHtml`` call at a time so the progress bar
        can advance after each finished chapter.
        """
        def scrape():
            scrapper = WebScrapper(base_url, html_path, language)
            total = to_ch - from_ch + 1
            for done, chapter in enumerate(range(from_ch, to_ch + 1), start=1):
                scrapper.getHtml(uri_format, chapter, chapter, sleep_time)
                # Report progress only once the chapter is actually done
                self._on_ui(self.ws_progress.set, done / total)

        def reset_ui():
            self.ws_start_btn.configure(state="normal", text="Scraping starten")
            self.ws_progress.set(1.0)

        self._run_task(
            self.ws_log,
            scrape,
            "\n✅ Scraping erfolgreich abgeschlossen!",
            "Scraping abgeschlossen!",
            reset_ui,
        )

    # ==================== EpubHandler Tab ====================
    def setup_epubhandler_tab(self):
        """Build the input form, both convert buttons, progress bar and log of the EPUB tab."""
        tab = self.tabview.tab("EpubHandler")

        # Frame holding the input fields
        input_frame = ctk.CTkFrame(tab)
        input_frame.pack(padx=10, pady=10, fill="x")

        self.eh_html_path = self._add_entry_row(
            input_frame, 0, "HTML Ordner:", r"E:\temp\WN\Novel\HTML", 300,
            browse=self.browse_eh_html_folder,
        )
        self.eh_epub_path = self._add_entry_row(
            input_frame, 1, "EPUB Ordner:", r"E:\temp\WN\Novel\EPUB", 300,
            browse=self.browse_eh_epub_folder,
        )
        self.eh_cover_path = self._add_entry_row(
            input_frame, 2, "Cover Bild (optional):", "cover.jpg (optional)", 300,
            browse=self.browse_eh_cover_image,
        )
        self.eh_language = self._add_language_row(input_frame, 3)
        # Let the entry column absorb extra horizontal space
        input_frame.columnconfigure(1, weight=1)

        # Both convert buttons share one row
        button_frame = ctk.CTkFrame(tab)
        button_frame.pack(padx=10, pady=10, fill="x")

        self.eh_html_to_epub_btn = ctk.CTkButton(
            button_frame,
            text="HTML → EPUB konvertieren",
            command=self.start_html_to_epub,
            height=40,
        )
        self.eh_html_to_epub_btn.pack(side="left", padx=5, fill="x", expand=True)

        self.eh_epub_to_html_btn = ctk.CTkButton(
            button_frame,
            text="EPUB → HTML konvertieren",
            command=self.start_epub_to_html,
            height=40,
            fg_color="gray40",
        )
        self.eh_epub_to_html_btn.pack(side="left", padx=5, fill="x", expand=True)

        self.eh_progress, self.eh_log = self._add_progress_and_log(tab)

    def browse_eh_html_folder(self):
        """Pick the HTML folder of the EPUB tab with a directory dialog."""
        self._browse_folder_into(self.eh_html_path)

    def browse_eh_epub_folder(self):
        """Pick the EPUB folder with a directory dialog."""
        self._browse_folder_into(self.eh_epub_path)

    def browse_eh_cover_image(self):
        """Pick the optional cover image with a file dialog."""
        file = filedialog.askopenfilename(
            title="Cover Bild auswählen",
            filetypes=[("Bilddateien", "*.jpg *.jpeg *.png"), ("Alle Dateien", "*.*")],
        )
        if file:
            self._set_entry_text(self.eh_cover_path, file)

    def _read_epub_tab_paths(self):
        """Return ``(html_path, epub_path)`` from the EPUB tab, or ``None`` after
        showing an error dialog when either one is empty."""
        html_path = self.eh_html_path.get().strip()
        epub_path = self.eh_epub_path.get().strip()
        if not (html_path and epub_path):
            messagebox.showerror("Fehler", "Bitte HTML- und EPUB-Ordner angeben!")
            return None
        return html_path, epub_path

    def start_html_to_epub(self):
        """Validate the EPUB form and start the HTML → EPUB conversion on a daemon thread."""
        paths = self._read_epub_tab_paths()
        if paths is None:
            return
        html_path, epub_path = paths

        # Block both conversions while one is running and reset the output widgets
        self.eh_html_to_epub_btn.configure(state="disabled", text="Läuft...")
        self.eh_epub_to_html_btn.configure(state="disabled")
        self.eh_progress.set(0)
        self._clear_log(self.eh_log)

        threading.Thread(
            target=self.run_html_to_epub,
            # An empty cover field means "no cover"
            args=(html_path, epub_path, self.eh_language.get(),
                  self.eh_cover_path.get().strip() or None),
            daemon=True,
        ).start()

    def run_html_to_epub(self, html_path, epub_path, language, cover_path):
        """Convert the HTML folder into an EPUB; runs on a worker thread.

        The progress values 0.3 / 0.5 / 1.0 are fixed milestones, not measured progress.
        """
        def convert():
            self._on_ui(self.eh_progress.set, 0.3)
            print("Starte HTML → EPUB Konvertierung...")
            epub_handler = EpubHandler(html_path, epub_path)
            self._on_ui(self.eh_progress.set, 0.5)
            epub_handler.convertHtmlToEpub(language, cover_path)
            self._on_ui(self.eh_progress.set, 1.0)

        def reset_ui():
            self.eh_html_to_epub_btn.configure(state="normal", text="HTML → EPUB konvertieren")
            self.eh_epub_to_html_btn.configure(state="normal")

        self._run_task(
            self.eh_log,
            convert,
            "\n✅ EPUB erfolgreich erstellt!",
            "EPUB wurde erfolgreich erstellt!",
            reset_ui,
        )

    def start_epub_to_html(self):
        """Validate the EPUB form, ask for an EPUB file and start the EPUB → HTML
        conversion on a daemon thread."""
        paths = self._read_epub_tab_paths()
        if paths is None:
            return
        html_path, epub_path = paths

        # Let the user pick the EPUB, starting in the configured EPUB folder
        epub_file = filedialog.askopenfilename(
            title="EPUB-Datei auswählen",
            initialdir=epub_path,
            filetypes=[("EPUB Dateien", "*.epub"), ("Alle Dateien", "*.*")],
        )
        if not epub_file:
            return

        # Block both conversions while one is running and reset the output widgets
        self.eh_html_to_epub_btn.configure(state="disabled")
        self.eh_epub_to_html_btn.configure(state="disabled", text="Läuft...")
        self.eh_progress.set(0)
        self._clear_log(self.eh_log)

        threading.Thread(
            target=self.run_epub_to_html,
            args=(html_path, epub_path, epub_file),
            daemon=True,
        ).start()

    def run_epub_to_html(self, html_path, epub_path, epub_file):
        """Unpack *epub_file* into HTML files; runs on a worker thread.

        The progress values 0.3 / 0.5 / 1.0 are fixed milestones, not measured progress.
        """
        def convert():
            self._on_ui(self.eh_progress.set, 0.3)
            print("Starte EPUB → HTML Konvertierung...")
            epub_handler = EpubHandler(html_path, epub_path)
            self._on_ui(self.eh_progress.set, 0.5)
            epub_handler.epub_to_html(epub_file)
            self._on_ui(self.eh_progress.set, 1.0)

        def reset_ui():
            self.eh_html_to_epub_btn.configure(state="normal")
            self.eh_epub_to_html_btn.configure(state="normal", text="EPUB → HTML konvertieren")

        self._run_task(
            self.eh_log,
            convert,
            "\n✅ HTML-Dateien erfolgreich erstellt!",
            "HTML-Dateien wurden erfolgreich erstellt!",
            reset_ui,
        )
if __name__ == "__main__":
    # Script entry point: build the main window and hand control to the Tk event loop.
    app = App()
    app.mainloop()
+7 -3
View File
@@ -3,9 +3,13 @@ import random
from src.EpubHandler import EpubHandler from src.EpubHandler import EpubHandler
from src.Translator import Translator from src.Translator import Translator
from src.WebScrapper import WebScrapper from src.WebScrapper import WebScrapper
from gui import App
# Press the green button in the gutter to run the script. # Press the green button in the gutter to run the script.
if __name__ == '__main__': if __name__ == '__main__':
# GUI starten
# Alte Beispiele (auskommentiert):
# scrapper = WebScrapper("https://www.fanmtl.com/novel/", r"E:\temp\WN\Game of the World Tree\HTML", "en") # scrapper = WebScrapper("https://www.fanmtl.com/novel/", r"E:\temp\WN\Game of the World Tree\HTML", "en")
# scrapper.getHtml("game-of-the-world-tree_{}.html", 1, 987) # scrapper.getHtml("game-of-the-world-tree_{}.html", 1, 987)
@@ -17,9 +21,9 @@ if __name__ == '__main__':
#scrapper = WebScrapper("https://ncode.syosetu.com/n0806fu/", r"E:\temp\WN\Golden Experience\HTML", "jp") #scrapper = WebScrapper("https://ncode.syosetu.com/n0806fu/", r"E:\temp\WN\Golden Experience\HTML", "jp")
#scrapper.getHtml("{}/", 334, 620) # 612 #scrapper.getHtml("{}/", 334, 620) # 612
# scrapper = WebScrapper("https://fenrirealm.com/series/dorothys-forbidden-grimoire/", r"E:\temp\WN\Dorothys Forbidden Grimoire\HTML", "en") # scrapper = WebScrapper("https://fenrirealm.com/series/dorothys-forbidden-grimoire/", r"E:\temp\WN\Dorothy's Forbidden Grimoire\HTML", "en")
# scrapper.getHtml("{}", 377, 828, 2) # scrapper.getHtml("{}", 1, 828, 2)
epubHandler = EpubHandler(r"E:\temp\WN\Dorothys Forbidden Grimoire\HTML", r"E:\temp\WN\Dorothys Forbidden Grimoire\EPUB") epubHandler = EpubHandler(r"E:\temp\WN\Dorothy's Forbidden Grimoire\HTML", r"E:\temp\WN\Dorothy's Forbidden Grimoire\EPUB")
epubHandler.convertHtmlToEpub("en", r"C:\Users\JohannesBOZZ\Downloads\6e1de333d6af7aaa3fdf3ffa66ac6f55.png") epubHandler.convertHtmlToEpub("en", r"C:\Users\JohannesBOZZ\Downloads\6e1de333d6af7aaa3fdf3ffa66ac6f55.png")
# epubHandler = EpubHandler(r"E:\temp\WN\Golden Experience\501-609", r"E:\temp\WN\Golden Experience\epub") # epubHandler = EpubHandler(r"E:\temp\WN\Golden Experience\501-609", r"E:\temp\WN\Golden Experience\epub")
+1
View File
@@ -10,3 +10,4 @@ torch
tensorflow tensorflow
flax flax
protobuf==3.20.* protobuf==3.20.*
customtkinter
+44 -5
View File
@@ -13,12 +13,45 @@ from src.functions import writeToFile, makeDir, writeToJsonFile
class WebScrapper: class WebScrapper:
def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str): def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
self.baseLink = baseLink self.baseLink = baseLink
self.htmlFolderPath = htmlFolderPath self.htmlFolderPath = htmlFolderPath
self.currentLanguage = currentLanguage self.currentLanguage = currentLanguage
makeDir(self.htmlFolderPath) makeDir(self.htmlFolderPath)
@staticmethod
def _sanitizeFilename(filename: str) -> str:
    r"""
    Turn an arbitrary title into a file name that is valid on Windows and Linux.

    The extension is not part of the input; callers append it afterwards.

    Steps, in this order:
      1. Every run of whitespace (tabs and line breaks included) becomes one space.
      2. Characters that are not allowed in file names are dropped:
         < > : " / \ | ? * and the control characters 0x00-0x1f
         (Linux itself only forbids / and the NUL byte).
      3. The name is trimmed, cut to 200 characters, and trailing spaces and
         dots are removed (Windows does not accept them at the end of a name).
      4. Windows device names such as CON, NUL or COM1 get a leading underscore.
      5. An empty result is replaced by "chapter".

    The 200-character cut leaves room for an extension below the usual
    255-character limit for a single file name.
    """
    # Collapse whitespace BEFORE dropping control characters; otherwise a line
    # break between two words is deleted and the words end up glued together.
    name = re.sub(r'\s+', ' ', filename)
    name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', name)
    # Dropping a character that stood between two spaces leaves a double space.
    name = re.sub(r' {2,}', ' ', name)
    # Trim again after cutting: the cut may expose a trailing space or dot.
    name = name.strip()[:200].rstrip(' .')
    # Windows treats these names as devices, with or without an extension.
    stem = name.split('.', 1)[0].rstrip()
    if re.fullmatch(r'(?i)(?:con|prn|aux|nul|com[1-9]|lpt[1-9])', stem):
        name = '_' + name
    # The fallback comes last so it also covers names emptied by the trimming.
    return name or "chapter"
def getHtml(self, uriWithFormat:str, fromChapter:int, toChapter:int, sleepTime:float=0): def getHtml(self, uriWithFormat:str, fromChapter:int, toChapter:int, sleepTime:float=0):
""" """
uriWithFormat: inserts the current chapter number into the {} brackets uriWithFormat: inserts the current chapter number into the {} brackets
@@ -53,10 +86,16 @@ class WebScrapper:
chapterTitle = self._addChapterTitle(soup, chapterContent, chapterNumber, infoDict) chapterTitle = self._addChapterTitle(soup, chapterContent, chapterNumber, infoDict)
infoDict["chapterTitle"] = chapterTitle infoDict["chapterTitle"] = chapterTitle
filePath = os.path.join(self.htmlFolderPath, f"{chapterTitle}.html") # Dateinamen bereinigen
safe_filename = self._sanitizeFilename(chapterTitle)
if safe_filename != chapterTitle:
print(f" → Dateiname bereinigt: '{chapterTitle}''{safe_filename}'")
filePath = os.path.join(self.htmlFolderPath, f"{safe_filename}.html")
writeToFile("wb", filePath, chapterContent.renderContents("utf-8", True, 4)) writeToFile("wb", filePath, chapterContent.renderContents("utf-8", True, 4))
writeToJsonFile(filePath.replace(".html", ".json"), infoDict) writeToJsonFile(filePath.replace(".html", ".json"), infoDict)
def _getChapterContent(self, soup:BeautifulSoup): def _getChapterContent(self, soup:BeautifulSoup):
chapterContent = None chapterContent = None
@@ -72,6 +111,7 @@ class WebScrapper:
return chapterContent return chapterContent
def _removeUnwantedThinsFromHtml(self, content:BeautifulSoup|NavigableString): def _removeUnwantedThinsFromHtml(self, content:BeautifulSoup|NavigableString):
if not content: if not content:
return return
@@ -90,17 +130,16 @@ class WebScrapper:
def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, chapterNumber, infoDict:dict): def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, chapterNumber, infoDict:dict):
chapterTitle = f"Chapter {chapterNumber}" chapterTitle = f"Chapter {chapterNumber}"
if "fanmtl.com" in self.baseLink: if "fanmtl.com" in self.baseLink:
infoDict["seriesTitle"] = re.sub("\(.*?\)|[<>|\?:\*\"\\/]", "", soup.select("div.titles h1 a")[0].text) infoDict["seriesTitle"] = soup.select("div.titles h1 a")[0].text.strip()
chapterTitle = re.sub("\(.*?\)|[<>|\?:\*\"\\/]", "", soup.select("div.titles h2")[0].text) chapterTitle = soup.select("div.titles h2")[0].text.strip()
infoDict["chapterTitle"] = chapterTitle infoDict["chapterTitle"] = chapterTitle
infoDict["author"] = "" infoDict["author"] = ""
elif "syosetu.com" in self.baseLink: elif "syosetu.com" in self.baseLink:
chapterTitle = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True).strip() chapterTitle = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True).strip()
# = soup.select("h1.p-novel__title font font")
infoDict["seriesTitle"] = re.sub("\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip() infoDict["seriesTitle"] = re.sub("\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip()
infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip() infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
elif "fenrirealm.com" in self.baseLink: elif "fenrirealm.com" in self.baseLink:
chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True).split(":")[-1].strip() chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True)
infoDict["seriesTitle"] = soup.select_one("title").get_text(strip=True).split(" - ")[0].strip() infoDict["seriesTitle"] = soup.select_one("title").get_text(strip=True).split(" - ")[0].strip()
infoDict["author"] = "unknown" infoDict["author"] = "unknown"