diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e6d450b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.idea
+test*.py
+test.*
+*.log
\ No newline at end of file
diff --git a/gui.py b/gui.py
new file mode 100644
index 0000000..854806a
--- /dev/null
+++ b/gui.py
@@ -0,0 +1,436 @@
+import customtkinter as ctk
+import queue
+import threading
+import sys
+from io import StringIO
+from tkinter import filedialog, messagebox
+from src.EpubHandler import EpubHandler
+from src.WebScrapper import WebScrapper
+
+
+class LogRedirector:
+    """Redirect print() output into a text widget.
+
+    Tk widgets must only be touched from the thread that runs the main loop.
+    If ``dispatch`` is given, every widget update is handed to it as
+    ``dispatch(func, *args)`` so the caller can run it on the UI thread;
+    without it the widget is updated directly (safe on the UI thread only).
+    """
+
+    def __init__(self, text_widget, dispatch=None):
+        self.text_widget = text_widget
+        self.buffer = StringIO()
+        self._dispatch = dispatch
+
+    def write(self, message):
+        if self._dispatch is None:
+            self._append(message)
+            self.text_widget.update_idletasks()
+        else:
+            self._dispatch(self._append, message)
+
+    def _append(self, message):
+        # The textbox is read-only for the user; unlock it only for the insert.
+        self.text_widget.configure(state="normal")
+        self.text_widget.insert("end", message)
+        self.text_widget.see("end")
+        self.text_widget.configure(state="disabled")
+
+    def flush(self):
+        pass
+
+
+class App(ctk.CTk):
+    # Interval (ms) at which the UI thread runs callbacks queued by workers.
+    UI_POLL_INTERVAL_MS = 50
+
+    def __init__(self):
+        super().__init__()
+
+        self.title("J-Novel Scrapper & Translator")
+        self.geometry("900x700")
+
+        # Theme
+        ctk.set_appearance_mode("dark")
+        ctk.set_default_color_theme("blue")
+
+        # Callbacks queued by worker threads; executed on the UI thread only.
+        self._ui_queue = queue.Queue()
+        # sys.stdout is process-wide, so only one job may redirect it at a time.
+        self._job_running = False
+
+        # Tab view
+        self.tabview = ctk.CTkTabview(self, width=850, height=650)
+        self.tabview.pack(padx=20, pady=20, fill="both", expand=True)
+
+        # Create the tabs
+        self.tabview.add("WebScrapper")
+        self.tabview.add("EpubHandler")
+
+        # Build the tab contents
+        self.setup_webscrapper_tab()
+        self.setup_epubhandler_tab()
+
+        self.after(self.UI_POLL_INTERVAL_MS, self._drain_ui_queue)
+
+    # ==================== Threading helpers ====================
+    def _run_on_ui(self, func, *args, **kwargs):
+        """Queue ``func(*args, **kwargs)`` to run on the UI thread (thread-safe)."""
+        self._ui_queue.put((func, args, kwargs))
+
+    def _drain_ui_queue(self):
+        """Run every queued callback on the UI thread, then re-arm the timer."""
+        try:
+            while True:
+                func, args, kwargs = self._ui_queue.get_nowait()
+                func(*args, **kwargs)
+        except queue.Empty:
+            pass
+        finally:
+            self.after(self.UI_POLL_INTERVAL_MS, self._drain_ui_queue)
+
+    def _begin_job(self, log_widget, progress_bar):
+        """Reserve the single job slot and reset the given log and progress bar.
+
+        Returns False (after informing the user) when another job is running.
+        """
+        if self._job_running:
+            messagebox.showwarning("Hinweis", "Es läuft bereits ein Vorgang. Bitte warten!")
+            return False
+        self._job_running = True
+        progress_bar.set(0)
+        log_widget.configure(state="normal")
+        log_widget.delete("1.0", "end")
+        log_widget.configure(state="disabled")
+        return True
+
+    # ==================== WebScrapper Tab ====================
+    def setup_webscrapper_tab(self):
+        tab = self.tabview.tab("WebScrapper")
+
+        # Frame for the input fields
+        input_frame = ctk.CTkFrame(tab)
+        input_frame.pack(padx=10, pady=10, fill="x")
+
+        # Base URL
+        ctk.CTkLabel(input_frame, text="Base URL:").grid(row=0, column=0, padx=5, pady=5, sticky="w")
+        self.ws_base_url = ctk.CTkEntry(input_frame, width=400, placeholder_text="https://example.com/novel/")
+        self.ws_base_url.grid(row=0, column=1, padx=5, pady=5, sticky="ew")
+
+        # HTML folder path
+        ctk.CTkLabel(input_frame, text="HTML Ordner:").grid(row=1, column=0, padx=5, pady=5, sticky="w")
+        self.ws_html_path = ctk.CTkEntry(input_frame, width=300, placeholder_text=r"E:\temp\WN\Novel\HTML")
+        self.ws_html_path.grid(row=1, column=1, padx=5, pady=5, sticky="ew")
+        ctk.CTkButton(input_frame, text="...", width=50, command=self.browse_ws_html_folder).grid(row=1, column=2, padx=5, pady=5)
+
+        # URI format
+        ctk.CTkLabel(input_frame, text="URI Format:").grid(row=2, column=0, padx=5, pady=5, sticky="w")
+        self.ws_uri_format = ctk.CTkEntry(input_frame, width=400, placeholder_text="chapter_{}.html oder {}/")
+        self.ws_uri_format.grid(row=2, column=1, padx=5, pady=5, sticky="ew")
+
+        # From/to chapter
+        ctk.CTkLabel(input_frame, text="Von Kapitel:").grid(row=3, column=0, padx=5, pady=5, sticky="w")
+        self.ws_from_chapter = ctk.CTkEntry(input_frame, width=100, placeholder_text="1")
+        self.ws_from_chapter.grid(row=3, column=1, padx=5, pady=5, sticky="w")
+
+        ctk.CTkLabel(input_frame, text="Bis Kapitel:").grid(row=4, column=0, padx=5, pady=5, sticky="w")
+        self.ws_to_chapter = ctk.CTkEntry(input_frame, width=100, placeholder_text="100")
+        self.ws_to_chapter.grid(row=4, column=1, padx=5, pady=5, sticky="w")
+
+        # Sleep time
+        ctk.CTkLabel(input_frame, text="Sleep Time (s):").grid(row=5, column=0, padx=5, pady=5, sticky="w")
+        self.ws_sleep_time = ctk.CTkEntry(input_frame, width=100, placeholder_text="0")
+        self.ws_sleep_time.grid(row=5, column=1, padx=5, pady=5, sticky="w")
+
+        # Language
+        ctk.CTkLabel(input_frame, text="Sprache:").grid(row=6, column=0, padx=5, pady=5, sticky="w")
+        self.ws_language = ctk.CTkComboBox(input_frame, values=["en", "jp", "de"], width=100)
+        self.ws_language.set("en")
+        self.ws_language.grid(row=6, column=1, padx=5, pady=5, sticky="w")
+
+        input_frame.columnconfigure(1, weight=1)
+
+        # Start button
+        self.ws_start_btn = ctk.CTkButton(tab, text="Scraping starten", command=self.start_webscrapper, height=40)
+        self.ws_start_btn.pack(padx=10, pady=10, fill="x")
+
+        # Progress bar
+        self.ws_progress = ctk.CTkProgressBar(tab, width=400)
+        self.ws_progress.pack(padx=10, pady=5, fill="x")
+        self.ws_progress.set(0)
+
+        # Log textbox (read-only)
+        ctk.CTkLabel(tab, text="Logs:", anchor="w").pack(padx=10, pady=(10, 0), fill="x")
+        self.ws_log = ctk.CTkTextbox(tab, height=200, state="disabled")
+        self.ws_log.pack(padx=10, pady=5, fill="both", expand=True)
+
+    def browse_ws_html_folder(self):
+        folder = filedialog.askdirectory()
+        if folder:
+            self.ws_html_path.delete(0, "end")
+            self.ws_html_path.insert(0, folder)
+
+    def start_webscrapper(self):
+        # Validation
+        if not self.ws_base_url.get() or not self.ws_html_path.get() or not self.ws_uri_format.get():
+            messagebox.showerror("Fehler", "Bitte alle Pflichtfelder ausfüllen!")
+            return
+
+        try:
+            from_chapter = int(self.ws_from_chapter.get() or 1)
+            to_chapter = int(self.ws_to_chapter.get() or 1)
+            sleep_time = float(self.ws_sleep_time.get() or 0)
+        except ValueError:
+            messagebox.showerror("Fehler", "Kapitel und Sleep Time müssen Zahlen sein!")
+            return
+
+        # A reversed range would scrape nothing and still report success.
+        if to_chapter < from_chapter or sleep_time < 0:
+            messagebox.showerror("Fehler", "'Bis Kapitel' darf nicht kleiner als 'Von Kapitel' sein und Sleep Time nicht negativ!")
+            return
+
+        if not self._begin_job(self.ws_log, self.ws_progress):
+            return
+        self.ws_start_btn.configure(state="disabled", text="Läuft...")
+
+        # Start the worker thread
+        thread = threading.Thread(
+            target=self.run_webscrapper,
+            args=(self.ws_base_url.get(), self.ws_html_path.get(), self.ws_language.get(),
+                  self.ws_uri_format.get(), from_chapter, to_chapter, sleep_time),
+            daemon=True
+        )
+        thread.start()
+
+    def run_webscrapper(self, base_url, html_path, language, uri_format, from_ch, to_ch, sleep_time):
+        """Worker thread: scrape chapters from_ch..to_ch; UI updates go through the queue."""
+        # Redirect print() output into the log textbox
+        old_stdout = sys.stdout
+        sys.stdout = LogRedirector(self.ws_log, self._run_on_ui)
+
+        try:
+            scrapper = WebScrapper(base_url, html_path, language)
+
+            total = to_ch - from_ch + 1
+            for done, chapter in enumerate(range(from_ch, to_ch + 1), start=1):
+                # Scrape a single chapter, then report it as done
+                scrapper.getHtml(uri_format, chapter, chapter, sleep_time)
+                self._run_on_ui(self.ws_progress.set, done / total)
+
+            print("\n✅ Scraping erfolgreich abgeschlossen!")
+            self._run_on_ui(messagebox.showinfo, "Erfolg", "Scraping abgeschlossen!")
+
+        except Exception as e:
+            print(f"\n❌ Fehler: {str(e)}")
+            self._run_on_ui(messagebox.showerror, "Fehler", f"Ein Fehler ist aufgetreten:\n{str(e)}")
+
+        finally:
+            sys.stdout = old_stdout
+            self._run_on_ui(self._finish_webscrapper)
+
+    def _finish_webscrapper(self):
+        """UI thread: re-enable the start button and release the job slot."""
+        self.ws_start_btn.configure(state="normal", text="Scraping starten")
+        self.ws_progress.set(1.0)
+        self._job_running = False
+
+    # ==================== EpubHandler Tab ====================
+    def setup_epubhandler_tab(self):
+        tab = self.tabview.tab("EpubHandler")
+
+        # Frame for the input fields
+        input_frame = ctk.CTkFrame(tab)
+        input_frame.pack(padx=10, pady=10, fill="x")
+
+        # HTML folder path
+        ctk.CTkLabel(input_frame, text="HTML Ordner:").grid(row=0, column=0, padx=5, pady=5, sticky="w")
+        self.eh_html_path = ctk.CTkEntry(input_frame, width=300, placeholder_text=r"E:\temp\WN\Novel\HTML")
+        self.eh_html_path.grid(row=0, column=1, padx=5, pady=5, sticky="ew")
+        ctk.CTkButton(input_frame, text="...", width=50, command=self.browse_eh_html_folder).grid(row=0, column=2, padx=5, pady=5)
+
+        # EPUB folder path
+        ctk.CTkLabel(input_frame, text="EPUB Ordner:").grid(row=1, column=0, padx=5, pady=5, sticky="w")
+        self.eh_epub_path = ctk.CTkEntry(input_frame, width=300, placeholder_text=r"E:\temp\WN\Novel\EPUB")
+        self.eh_epub_path.grid(row=1, column=1, padx=5, pady=5, sticky="ew")
+        ctk.CTkButton(input_frame, text="...", width=50, command=self.browse_eh_epub_folder).grid(row=1, column=2, padx=5, pady=5)
+
+        # Cover image path
+        ctk.CTkLabel(input_frame, text="Cover Bild (optional):").grid(row=2, column=0, padx=5, pady=5, sticky="w")
+        self.eh_cover_path = ctk.CTkEntry(input_frame, width=300, placeholder_text="cover.jpg (optional)")
+        self.eh_cover_path.grid(row=2, column=1, padx=5, pady=5, sticky="ew")
+        ctk.CTkButton(input_frame, text="...", width=50, command=self.browse_eh_cover_image).grid(row=2, column=2, padx=5, pady=5)
+
+        # Language
+        ctk.CTkLabel(input_frame, text="Sprache:").grid(row=3, column=0, padx=5, pady=5, sticky="w")
+        self.eh_language = ctk.CTkComboBox(input_frame, values=["en", "jp", "de"], width=100)
+        self.eh_language.set("en")
+        self.eh_language.grid(row=3, column=1, padx=5, pady=5, sticky="w")
+
+        input_frame.columnconfigure(1, weight=1)
+
+        # Button frame
+        button_frame = ctk.CTkFrame(tab)
+        button_frame.pack(padx=10, pady=10, fill="x")
+
+        # HTML -> EPUB button
+        self.eh_html_to_epub_btn = ctk.CTkButton(
+            button_frame,
+            text="HTML → EPUB konvertieren",
+            command=self.start_html_to_epub,
+            height=40
+        )
+        self.eh_html_to_epub_btn.pack(side="left", padx=5, fill="x", expand=True)
+
+        # EPUB -> HTML button
+        self.eh_epub_to_html_btn = ctk.CTkButton(
+            button_frame,
+            text="EPUB → HTML konvertieren",
+            command=self.start_epub_to_html,
+            height=40,
+            fg_color="gray40"
+        )
+        self.eh_epub_to_html_btn.pack(side="left", padx=5, fill="x", expand=True)
+
+        # Progress bar
+        self.eh_progress = ctk.CTkProgressBar(tab, width=400)
+        self.eh_progress.pack(padx=10, pady=5, fill="x")
+        self.eh_progress.set(0)
+
+        # Log textbox (read-only)
+        ctk.CTkLabel(tab, text="Logs:", anchor="w").pack(padx=10, pady=(10, 0), fill="x")
+        self.eh_log = ctk.CTkTextbox(tab, height=200, state="disabled")
+        self.eh_log.pack(padx=10, pady=5, fill="both", expand=True)
+
+    def browse_eh_html_folder(self):
+        folder = filedialog.askdirectory()
+        if folder:
+            self.eh_html_path.delete(0, "end")
+            self.eh_html_path.insert(0, folder)
+
+    def browse_eh_epub_folder(self):
+        folder = filedialog.askdirectory()
+        if folder:
+            self.eh_epub_path.delete(0, "end")
+            self.eh_epub_path.insert(0, folder)
+
+    def browse_eh_cover_image(self):
+        file = filedialog.askopenfilename(
+            title="Cover Bild auswählen",
+            filetypes=[("Bilddateien", "*.jpg *.jpeg *.png"), ("Alle Dateien", "*.*")]
+        )
+        if file:
+            self.eh_cover_path.delete(0, "end")
+            self.eh_cover_path.insert(0, file)
+
+    def start_html_to_epub(self):
+        # Validation
+        if not self.eh_html_path.get() or not self.eh_epub_path.get():
+            messagebox.showerror("Fehler", "Bitte HTML- und EPUB-Ordner angeben!")
+            return
+
+        if not self._begin_job(self.eh_log, self.eh_progress):
+            return
+        self.eh_html_to_epub_btn.configure(state="disabled", text="Läuft...")
+        self.eh_epub_to_html_btn.configure(state="disabled")
+
+        # Start the worker thread
+        thread = threading.Thread(
+            target=self.run_html_to_epub,
+            args=(self.eh_html_path.get(), self.eh_epub_path.get(),
+                  self.eh_language.get(), self.eh_cover_path.get() or None),
+            daemon=True
+        )
+        thread.start()
+
+    def run_html_to_epub(self, html_path, epub_path, language, cover_path):
+        """Worker thread: convert the HTML folder to an EPUB; UI updates go through the queue."""
+        # Redirect print() output into the log textbox
+        old_stdout = sys.stdout
+        sys.stdout = LogRedirector(self.eh_log, self._run_on_ui)
+
+        try:
+            self._run_on_ui(self.eh_progress.set, 0.3)
+            print("Starte HTML → EPUB Konvertierung...")
+
+            epub_handler = EpubHandler(html_path, epub_path)
+            self._run_on_ui(self.eh_progress.set, 0.5)
+
+            epub_handler.convertHtmlToEpub(language, cover_path)
+            self._run_on_ui(self.eh_progress.set, 1.0)
+
+            print("\n✅ EPUB erfolgreich erstellt!")
+            self._run_on_ui(messagebox.showinfo, "Erfolg", "EPUB wurde erfolgreich erstellt!")
+
+        except Exception as e:
+            print(f"\n❌ Fehler: {str(e)}")
+            self._run_on_ui(messagebox.showerror, "Fehler", f"Ein Fehler ist aufgetreten:\n{str(e)}")
+
+        finally:
+            sys.stdout = old_stdout
+            self._run_on_ui(self._finish_epub_job)
+
+    def start_epub_to_html(self):
+        # Validation
+        if not self.eh_epub_path.get() or not self.eh_html_path.get():
+            messagebox.showerror("Fehler", "Bitte HTML- und EPUB-Ordner angeben!")
+            return
+
+        # Let the user pick the EPUB file
+        epub_file = filedialog.askopenfilename(
+            title="EPUB-Datei auswählen",
+            initialdir=self.eh_epub_path.get(),
+            filetypes=[("EPUB Dateien", "*.epub"), ("Alle Dateien", "*.*")]
+        )
+        if not epub_file:
+            return
+
+        if not self._begin_job(self.eh_log, self.eh_progress):
+            return
+        self.eh_html_to_epub_btn.configure(state="disabled")
+        self.eh_epub_to_html_btn.configure(state="disabled", text="Läuft...")
+
+        # Start the worker thread
+        thread = threading.Thread(
+            target=self.run_epub_to_html,
+            args=(self.eh_html_path.get(), self.eh_epub_path.get(), epub_file),
+            daemon=True
+        )
+        thread.start()
+
+    def run_epub_to_html(self, html_path, epub_path, epub_file):
+        """Worker thread: unpack an EPUB into HTML files; UI updates go through the queue."""
+        # Redirect print() output into the log textbox
+        old_stdout = sys.stdout
+        sys.stdout = LogRedirector(self.eh_log, self._run_on_ui)
+
+        try:
+            self._run_on_ui(self.eh_progress.set, 0.3)
+            print("Starte EPUB → HTML Konvertierung...")
+
+            epub_handler = EpubHandler(html_path, epub_path)
+            self._run_on_ui(self.eh_progress.set, 0.5)
+
+            epub_handler.epub_to_html(epub_file)
+            self._run_on_ui(self.eh_progress.set, 1.0)
+
+            print("\n✅ HTML-Dateien erfolgreich erstellt!")
+            self._run_on_ui(messagebox.showinfo, "Erfolg", "HTML-Dateien wurden erfolgreich erstellt!")
+
+        except Exception as e:
+            print(f"\n❌ Fehler: {str(e)}")
+            self._run_on_ui(messagebox.showerror, "Fehler", f"Ein Fehler ist aufgetreten:\n{str(e)}")
+
+        finally:
+            sys.stdout = old_stdout
+            self._run_on_ui(self._finish_epub_job)
+
+    def _finish_epub_job(self):
+        """UI thread: re-enable both conversion buttons and release the job slot."""
+        self.eh_html_to_epub_btn.configure(state="normal", text="HTML → EPUB konvertieren")
+        self.eh_epub_to_html_btn.configure(state="normal", text="EPUB → HTML konvertieren")
+        self._job_running = False
+
+
+if __name__ == "__main__":
+    app = App()
+    app.mainloop()
diff --git a/main.py b/main.py
index 6a2b743..a25b3d5 100644
--- a/main.py
+++ b/main.py
@@ -3,9 +3,13 @@ import random
 from src.EpubHandler import EpubHandler
 from src.Translator import Translator
 from src.WebScrapper import WebScrapper
+from gui import App
 
 # Press the green button in the gutter to run the script.
if __name__ == '__main__': + # GUI starten + + # Alte Beispiele (auskommentiert): # scrapper = WebScrapper("https://www.fanmtl.com/novel/", r"E:\temp\WN\Game of the World Tree\HTML", "en") # scrapper.getHtml("game-of-the-world-tree_{}.html", 1, 987) @@ -17,9 +21,9 @@ if __name__ == '__main__': #scrapper = WebScrapper("https://ncode.syosetu.com/n0806fu/", r"E:\temp\WN\Golden Experience\HTML", "jp") #scrapper.getHtml("{}/", 334, 620) # 612 - # scrapper = WebScrapper("https://fenrirealm.com/series/dorothys-forbidden-grimoire/", r"E:\temp\WN\Dorothy’s Forbidden Grimoire\HTML", "en") - # scrapper.getHtml("{}", 377, 828, 2) - epubHandler = EpubHandler(r"E:\temp\WN\Dorothy’s Forbidden Grimoire\HTML", r"E:\temp\WN\Dorothy’s Forbidden Grimoire\EPUB") + # scrapper = WebScrapper("https://fenrirealm.com/series/dorothys-forbidden-grimoire/", r"E:\temp\WN\Dorothy's Forbidden Grimoire\HTML", "en") + # scrapper.getHtml("{}", 1, 828, 2) + epubHandler = EpubHandler(r"E:\temp\WN\Dorothy's Forbidden Grimoire\HTML", r"E:\temp\WN\Dorothy's Forbidden Grimoire\EPUB") epubHandler.convertHtmlToEpub("en", r"C:\Users\JohannesBOZZ\Downloads\6e1de333d6af7aaa3fdf3ffa66ac6f55.png") # epubHandler = EpubHandler(r"E:\temp\WN\Golden Experience\501-609", r"E:\temp\WN\Golden Experience\epub") diff --git a/requierments.txt b/requierments.txt index f74cd03..913ff32 100644 --- a/requierments.txt +++ b/requierments.txt @@ -10,3 +10,4 @@ torch tensorflow flax protobuf==3.20.* +customtkinter diff --git a/src/WebScrapper.py b/src/WebScrapper.py index 857831d..c1e827b 100644 --- a/src/WebScrapper.py +++ b/src/WebScrapper.py @@ -13,12 +13,45 @@ from src.functions import writeToFile, makeDir, writeToJsonFile class WebScrapper: + def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str): self.baseLink = baseLink self.htmlFolderPath = htmlFolderPath self.currentLanguage = currentLanguage makeDir(self.htmlFolderPath) + + @staticmethod + def _sanitizeFilename(filename: str) -> str: + """ + Entfernt 
ungültige Zeichen für Windows und Linux Dateinamen. + Windows verboten: < > : " / \ | ? * + Linux verboten: / und \0 (null byte) + Zusätzlich: Leerzeichen am Anfang/Ende entfernen, mehrfache Leerzeichen reduzieren + """ + # Ungültige Zeichen für Windows und Linux entfernen + filename = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', filename) + + # Mehrfache Leerzeichen durch einzelnes ersetzen + filename = re.sub(r'\s+', ' ', filename) + + # Leerzeichen am Anfang/Ende entfernen + filename = filename.strip() + + # Punkte am Ende entfernen (Windows-Problem) + filename = filename.rstrip('.') + + # Falls Dateiname leer ist, Fallback verwenden + if not filename: + filename = "chapter" + + # Dateiname auf maximal 255 Zeichen begrenzen (ohne Erweiterung) + if len(filename) > 200: + filename = filename[:200] + + return filename + + def getHtml(self, uriWithFormat:str, fromChapter:int, toChapter:int, sleepTime:float=0): """ uriWithFormat: inserts the current chapter number into the {} brackets @@ -53,10 +86,16 @@ class WebScrapper: chapterTitle = self._addChapterTitle(soup, chapterContent, chapterNumber, infoDict) infoDict["chapterTitle"] = chapterTitle - filePath = os.path.join(self.htmlFolderPath, f"{chapterTitle}.html") + # Dateinamen bereinigen + safe_filename = self._sanitizeFilename(chapterTitle) + if safe_filename != chapterTitle: + print(f" → Dateiname bereinigt: '{chapterTitle}' → '{safe_filename}'") + + filePath = os.path.join(self.htmlFolderPath, f"{safe_filename}.html") writeToFile("wb", filePath, chapterContent.renderContents("utf-8", True, 4)) writeToJsonFile(filePath.replace(".html", ".json"), infoDict) + def _getChapterContent(self, soup:BeautifulSoup): chapterContent = None @@ -72,6 +111,7 @@ class WebScrapper: return chapterContent + def _removeUnwantedThinsFromHtml(self, content:BeautifulSoup|NavigableString): if not content: return @@ -90,17 +130,16 @@ class WebScrapper: def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, 
chapterNumber, infoDict:dict): chapterTitle = f"Chapter {chapterNumber}" if "fanmtl.com" in self.baseLink: - infoDict["seriesTitle"] = re.sub("\(.*?\)|[<>|\?:\*\"\\/]", "", soup.select("div.titles h1 a")[0].text) - chapterTitle = re.sub("\(.*?\)|[<>|\?:\*\"\\/]", "", soup.select("div.titles h2")[0].text) - infoDict["chapterTitle"] =chapterTitle + infoDict["seriesTitle"] = soup.select("div.titles h1 a")[0].text.strip() + chapterTitle = soup.select("div.titles h2")[0].text.strip() + infoDict["chapterTitle"] = chapterTitle infoDict["author"] = "" elif "syosetu.com" in self.baseLink: chapterTitle = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True).strip() - # = soup.select("h1.p-novel__title font font") infoDict["seriesTitle"] = re.sub("\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip() infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip() elif "fenrirealm.com" in self.baseLink: - chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True).split(":")[-1].strip() + chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True) infoDict["seriesTitle"] = soup.select_one("title").get_text(strip=True).split(" - ")[0].strip() infoDict["author"] = "unknown"