init2.0
This commit is contained in:
@@ -0,0 +1,4 @@
|
|||||||
|
.idea
|
||||||
|
test*.py
|
||||||
|
test.*
|
||||||
|
*.log
|
||||||
@@ -0,0 +1,374 @@
|
|||||||
|
import customtkinter as ctk
|
||||||
|
import threading
|
||||||
|
import sys
|
||||||
|
from io import StringIO
|
||||||
|
from tkinter import filedialog, messagebox
|
||||||
|
from src.EpubHandler import EpubHandler
|
||||||
|
from src.WebScrapper import WebScrapper
|
||||||
|
|
||||||
|
|
||||||
|
class LogRedirector:
    """File-like object that forwards print() output to a text widget.

    Assign an instance to ``sys.stdout`` to show printed text in the widget.
    The widget is switched to "normal" only while text is inserted and is put
    back to "disabled" afterwards, so it stays read-only for the user.

    Writes coming from a background thread are not applied directly; they are
    handed to the widget's ``after()`` scheduler so the actual widget changes
    run inside the Tk event loop (assumes the Tk main loop runs on the
    process main thread).
    """

    def __init__(self, text_widget):
        self.text_widget = text_widget
        # Not used by this class; kept so the attribute remains available.
        self.buffer = StringIO()

    def write(self, message):
        """Append ``message`` to the widget and return the number of characters.

        Empty messages are ignored and reported as 0 characters written.
        """
        if not message:
            return 0

        if threading.current_thread() is threading.main_thread():
            # Already on the main thread: update synchronously and refresh
            # the display right away, as callers printing from the UI expect.
            self._append(message)
            self.text_widget.update_idletasks()
        else:
            # Worker thread: let the event loop perform the widget changes.
            self.text_widget.after(0, self._append, message)

        return len(message)

    def _append(self, message):
        """Insert ``message`` at the end of the widget and scroll to it."""
        widget = self.text_widget
        widget.configure(state="normal")
        widget.insert("end", message)
        widget.see("end")
        widget.configure(state="disabled")

    def flush(self):
        """No-op; present because stream consumers may call flush()."""
        pass
|
||||||
|
|
||||||
|
|
||||||
|
class App(ctk.CTk):
    """Main window with a "WebScrapper" tab and an "EpubHandler" tab.

    Each tab collects its settings in entry fields, starts the selected job
    on a daemon worker thread and shows the job's print() output in a
    read-only textbox. Worker threads never touch widgets or message boxes
    directly; they hand those calls to the Tk event loop via
    ``_run_on_ui_thread``.

    NOTE: every job temporarily replaces the process-wide ``sys.stdout``.
    The two tabs can run at the same time, in which case their output may end
    up in the wrong log and stdout may not be restored to the original stream.
    """

    def __init__(self):
        super().__init__()

        self.title("J-Novel Scrapper & Translator")
        self.geometry("900x700")

        # Theme
        ctk.set_appearance_mode("dark")
        ctk.set_default_color_theme("blue")

        # Tab container
        self.tabview = ctk.CTkTabview(self, width=850, height=650)
        self.tabview.pack(padx=20, pady=20, fill="both", expand=True)

        # Create the tabs
        self.tabview.add("WebScrapper")
        self.tabview.add("EpubHandler")

        # Build the tab contents
        self.setup_webscrapper_tab()
        self.setup_epubhandler_tab()

    # ==================== Shared helpers ====================
    def _run_on_ui_thread(self, func, *args, **kwargs):
        """Schedule ``func(*args, **kwargs)`` to run inside the Tk event loop."""
        self.after(0, lambda: func(*args, **kwargs))

    @staticmethod
    def _start_worker(target, *args):
        """Run ``target(*args)`` on a daemon thread so it cannot block exit."""
        threading.Thread(target=target, args=args, daemon=True).start()

    @staticmethod
    def _clear_log(textbox):
        """Empty a read-only log textbox."""
        textbox.configure(state="normal")
        textbox.delete("1.0", "end")
        textbox.configure(state="disabled")

    @staticmethod
    def _set_entry_text(entry, value):
        """Replace the content of ``entry`` with ``value``."""
        entry.delete(0, "end")
        entry.insert(0, value)

    @staticmethod
    def _add_entry_row(frame, row, label, placeholder, width, sticky):
        """Add a label in column 0 and an entry in column 1; return the entry."""
        ctk.CTkLabel(frame, text=label).grid(row=row, column=0, padx=5, pady=5, sticky="w")
        entry = ctk.CTkEntry(frame, width=width, placeholder_text=placeholder)
        entry.grid(row=row, column=1, padx=5, pady=5, sticky=sticky)
        return entry

    @staticmethod
    def _add_browse_button(frame, row, command):
        """Add the "..." picker button in column 2 of ``row``."""
        ctk.CTkButton(frame, text="...", width=50, command=command).grid(row=row, column=2, padx=5, pady=5)

    @staticmethod
    def _add_language_row(frame, row):
        """Add the language label and combo box (default "en"); return the box."""
        ctk.CTkLabel(frame, text="Sprache:").grid(row=row, column=0, padx=5, pady=5, sticky="w")
        box = ctk.CTkComboBox(frame, values=["en", "jp", "de"], width=100)
        box.set("en")
        box.grid(row=row, column=1, padx=5, pady=5, sticky="w")
        return box

    @staticmethod
    def _add_progress_and_log(tab):
        """Add a progress bar (at 0) and a read-only log box; return both."""
        progress = ctk.CTkProgressBar(tab, width=400)
        progress.pack(padx=10, pady=5, fill="x")
        progress.set(0)

        ctk.CTkLabel(tab, text="Logs:", anchor="w").pack(padx=10, pady=(10, 0), fill="x")
        log = ctk.CTkTextbox(tab, height=200, state="disabled")
        log.pack(padx=10, pady=5, fill="both", expand=True)
        return progress, log

    def _run_logged_job(self, log_box, job, success_text, restore_controls):
        """Run ``job()`` on the calling worker thread with print() sent to ``log_box``.

        ``restore_controls`` is always scheduled on the UI thread once the job
        ends. Afterwards a success dialog with ``success_text`` or an error
        dialog with the exception text is scheduled as well.
        """
        previous_stdout = sys.stdout
        sys.stdout = LogRedirector(log_box)
        try:
            job()
        except Exception as e:
            # Top-level boundary of the worker: report instead of crashing.
            print(f"\n❌ Fehler: {str(e)}")
            dialog = (messagebox.showerror, "Fehler", f"Ein Fehler ist aufgetreten:\n{str(e)}")
        else:
            dialog = (messagebox.showinfo, "Erfolg", success_text)
        finally:
            sys.stdout = previous_stdout
            # Re-enable the controls first so they do not stay locked while
            # the modal result dialog is open.
            self._run_on_ui_thread(restore_controls)
        self._run_on_ui_thread(*dialog)

    # ==================== WebScrapper Tab ====================
    def setup_webscrapper_tab(self):
        """Create all widgets of the "WebScrapper" tab."""
        tab = self.tabview.tab("WebScrapper")

        # Frame holding the input fields
        input_frame = ctk.CTkFrame(tab)
        input_frame.pack(padx=10, pady=10, fill="x")

        self.ws_base_url = self._add_entry_row(
            input_frame, 0, "Base URL:", "https://example.com/novel/", 400, "ew")

        self.ws_html_path = self._add_entry_row(
            input_frame, 1, "HTML Ordner:", r"E:\temp\WN\Novel\HTML", 300, "ew")
        self._add_browse_button(input_frame, 1, self.browse_ws_html_folder)

        self.ws_uri_format = self._add_entry_row(
            input_frame, 2, "URI Format:", "chapter_{}.html oder {}/", 400, "ew")

        self.ws_from_chapter = self._add_entry_row(input_frame, 3, "Von Kapitel:", "1", 100, "w")
        self.ws_to_chapter = self._add_entry_row(input_frame, 4, "Bis Kapitel:", "100", 100, "w")
        self.ws_sleep_time = self._add_entry_row(input_frame, 5, "Sleep Time (s):", "0", 100, "w")

        self.ws_language = self._add_language_row(input_frame, 6)

        # Let the entry column absorb extra horizontal space
        input_frame.columnconfigure(1, weight=1)

        # Start button
        self.ws_start_btn = ctk.CTkButton(tab, text="Scraping starten", command=self.start_webscrapper, height=40)
        self.ws_start_btn.pack(padx=10, pady=10, fill="x")

        # Progress bar and read-only log
        self.ws_progress, self.ws_log = self._add_progress_and_log(tab)

    def browse_ws_html_folder(self):
        """Let the user pick the HTML output folder of the scraper tab."""
        folder = filedialog.askdirectory()
        if folder:
            self._set_entry_text(self.ws_html_path, folder)

    def start_webscrapper(self):
        """Validate the scraper inputs and start the scraping worker thread."""
        base_url = self.ws_base_url.get().strip()
        html_path = self.ws_html_path.get().strip()
        uri_format = self.ws_uri_format.get().strip()

        if not base_url or not html_path or not uri_format:
            messagebox.showerror("Fehler", "Bitte alle Pflichtfelder ausfüllen!")
            return

        try:
            from_chapter = int(self.ws_from_chapter.get() or 1)
            to_chapter = int(self.ws_to_chapter.get() or 1)
            sleep_time = float(self.ws_sleep_time.get() or 0)
        except ValueError:
            messagebox.showerror("Fehler", "Kapitel und Sleep Time müssen Zahlen sein!")
            return

        # An inverted range would scrape nothing but still report success.
        if from_chapter > to_chapter:
            messagebox.showerror("Fehler", "'Von Kapitel' darf nicht größer als 'Bis Kapitel' sein!")
            return
        if sleep_time < 0:
            messagebox.showerror("Fehler", "Sleep Time darf nicht negativ sein!")
            return

        # Lock the button and reset progress/log for the new run
        self.ws_start_btn.configure(state="disabled", text="Läuft...")
        self.ws_progress.set(0)
        self._clear_log(self.ws_log)

        self._start_worker(
            self.run_webscrapper,
            base_url, html_path, self.ws_language.get(),
            uri_format, from_chapter, to_chapter, sleep_time,
        )

    def _restore_ws_controls(self):
        """Re-enable the scraper start button and show the bar as finished."""
        self.ws_start_btn.configure(state="normal", text="Scraping starten")
        self.ws_progress.set(1.0)

    def run_webscrapper(self, base_url, html_path, language, uri_format, from_ch, to_ch, sleep_time):
        """Worker: scrape chapters ``from_ch``..``to_ch`` one at a time.

        Chapters are requested individually so the progress bar can advance
        after each finished chapter.
        """
        def job():
            scrapper = WebScrapper(base_url, html_path, language)
            total = to_ch - from_ch + 1
            for done, chapter in enumerate(range(from_ch, to_ch + 1), start=1):
                scrapper.getHtml(uri_format, chapter, chapter, sleep_time)
                self._run_on_ui_thread(self.ws_progress.set, done / total)
            print("\n✅ Scraping erfolgreich abgeschlossen!")

        self._run_logged_job(self.ws_log, job, "Scraping abgeschlossen!", self._restore_ws_controls)

    # ==================== EpubHandler Tab ====================
    def setup_epubhandler_tab(self):
        """Create all widgets of the "EpubHandler" tab."""
        tab = self.tabview.tab("EpubHandler")

        # Frame holding the input fields
        input_frame = ctk.CTkFrame(tab)
        input_frame.pack(padx=10, pady=10, fill="x")

        self.eh_html_path = self._add_entry_row(
            input_frame, 0, "HTML Ordner:", r"E:\temp\WN\Novel\HTML", 300, "ew")
        self._add_browse_button(input_frame, 0, self.browse_eh_html_folder)

        self.eh_epub_path = self._add_entry_row(
            input_frame, 1, "EPUB Ordner:", r"E:\temp\WN\Novel\EPUB", 300, "ew")
        self._add_browse_button(input_frame, 1, self.browse_eh_epub_folder)

        self.eh_cover_path = self._add_entry_row(
            input_frame, 2, "Cover Bild (optional):", "cover.jpg (optional)", 300, "ew")
        self._add_browse_button(input_frame, 2, self.browse_eh_cover_image)

        self.eh_language = self._add_language_row(input_frame, 3)

        # Let the entry column absorb extra horizontal space
        input_frame.columnconfigure(1, weight=1)

        # Frame holding the two conversion buttons
        button_frame = ctk.CTkFrame(tab)
        button_frame.pack(padx=10, pady=10, fill="x")

        self.eh_html_to_epub_btn = ctk.CTkButton(
            button_frame,
            text="HTML → EPUB konvertieren",
            command=self.start_html_to_epub,
            height=40
        )
        self.eh_html_to_epub_btn.pack(side="left", padx=5, fill="x", expand=True)

        self.eh_epub_to_html_btn = ctk.CTkButton(
            button_frame,
            text="EPUB → HTML konvertieren",
            command=self.start_epub_to_html,
            height=40,
            fg_color="gray40"
        )
        self.eh_epub_to_html_btn.pack(side="left", padx=5, fill="x", expand=True)

        # Progress bar and read-only log
        self.eh_progress, self.eh_log = self._add_progress_and_log(tab)

    def browse_eh_html_folder(self):
        """Let the user pick the HTML folder of the EPUB tab."""
        folder = filedialog.askdirectory()
        if folder:
            self._set_entry_text(self.eh_html_path, folder)

    def browse_eh_epub_folder(self):
        """Let the user pick the EPUB folder of the EPUB tab."""
        folder = filedialog.askdirectory()
        if folder:
            self._set_entry_text(self.eh_epub_path, folder)

    def browse_eh_cover_image(self):
        """Let the user pick the optional cover image file."""
        file = filedialog.askopenfilename(
            title="Cover Bild auswählen",
            filetypes=[("Bilddateien", "*.jpg *.jpeg *.png"), ("Alle Dateien", "*.*")]
        )
        if file:
            self._set_entry_text(self.eh_cover_path, file)

    def _lock_epub_controls(self, running_button):
        """Disable both conversion buttons and reset progress/log for a run."""
        self.eh_html_to_epub_btn.configure(state="disabled")
        self.eh_epub_to_html_btn.configure(state="disabled")
        running_button.configure(text="Läuft...")
        self.eh_progress.set(0)
        self._clear_log(self.eh_log)

    def _restore_epub_controls(self):
        """Re-enable both conversion buttons with their regular captions."""
        self.eh_html_to_epub_btn.configure(state="normal", text="HTML → EPUB konvertieren")
        self.eh_epub_to_html_btn.configure(state="normal", text="EPUB → HTML konvertieren")

    def start_html_to_epub(self):
        """Validate the folders and start the HTML → EPUB worker thread."""
        html_path = self.eh_html_path.get().strip()
        epub_path = self.eh_epub_path.get().strip()
        if not html_path or not epub_path:
            messagebox.showerror("Fehler", "Bitte HTML- und EPUB-Ordner angeben!")
            return

        self._lock_epub_controls(self.eh_html_to_epub_btn)

        self._start_worker(
            self.run_html_to_epub,
            html_path, epub_path,
            self.eh_language.get(), self.eh_cover_path.get().strip() or None,
        )

    def run_html_to_epub(self, html_path, epub_path, language, cover_path):
        """Worker: convert the HTML folder into an EPUB via EpubHandler."""
        def job():
            self._run_on_ui_thread(self.eh_progress.set, 0.3)
            print("Starte HTML → EPUB Konvertierung...")

            epub_handler = EpubHandler(html_path, epub_path)
            self._run_on_ui_thread(self.eh_progress.set, 0.5)

            epub_handler.convertHtmlToEpub(language, cover_path)
            self._run_on_ui_thread(self.eh_progress.set, 1.0)

            print("\n✅ EPUB erfolgreich erstellt!")

        self._run_logged_job(
            self.eh_log, job, "EPUB wurde erfolgreich erstellt!", self._restore_epub_controls)

    def start_epub_to_html(self):
        """Validate the folders, ask for an EPUB file and start the worker."""
        html_path = self.eh_html_path.get().strip()
        epub_path = self.eh_epub_path.get().strip()
        if not epub_path or not html_path:
            messagebox.showerror("Fehler", "Bitte HTML- und EPUB-Ordner angeben!")
            return

        # Ask which EPUB file to convert; cancelling aborts silently
        epub_file = filedialog.askopenfilename(
            title="EPUB-Datei auswählen",
            initialdir=epub_path,
            filetypes=[("EPUB Dateien", "*.epub"), ("Alle Dateien", "*.*")]
        )
        if not epub_file:
            return

        self._lock_epub_controls(self.eh_epub_to_html_btn)

        self._start_worker(self.run_epub_to_html, html_path, epub_path, epub_file)

    def run_epub_to_html(self, html_path, epub_path, epub_file):
        """Worker: convert ``epub_file`` into HTML files via EpubHandler."""
        def job():
            self._run_on_ui_thread(self.eh_progress.set, 0.3)
            print("Starte EPUB → HTML Konvertierung...")

            epub_handler = EpubHandler(html_path, epub_path)
            self._run_on_ui_thread(self.eh_progress.set, 0.5)

            epub_handler.epub_to_html(epub_file)
            self._run_on_ui_thread(self.eh_progress.set, 1.0)

            print("\n✅ HTML-Dateien erfolgreich erstellt!")

        self._run_logged_job(
            self.eh_log, job, "HTML-Dateien wurden erfolgreich erstellt!", self._restore_epub_controls)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Build the main window and hand control to the Tk event loop.
    App().mainloop()
|
||||||
@@ -3,9 +3,13 @@ import random
|
|||||||
from src.EpubHandler import EpubHandler
|
from src.EpubHandler import EpubHandler
|
||||||
from src.Translator import Translator
|
from src.Translator import Translator
|
||||||
from src.WebScrapper import WebScrapper
|
from src.WebScrapper import WebScrapper
|
||||||
|
from gui import App
|
||||||
|
|
||||||
# Press the green button in the gutter to run the script.
|
# Press the green button in the gutter to run the script.
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
# GUI starten
|
||||||
|
|
||||||
|
# Alte Beispiele (auskommentiert):
|
||||||
# scrapper = WebScrapper("https://www.fanmtl.com/novel/", r"E:\temp\WN\Game of the World Tree\HTML", "en")
|
# scrapper = WebScrapper("https://www.fanmtl.com/novel/", r"E:\temp\WN\Game of the World Tree\HTML", "en")
|
||||||
# scrapper.getHtml("game-of-the-world-tree_{}.html", 1, 987)
|
# scrapper.getHtml("game-of-the-world-tree_{}.html", 1, 987)
|
||||||
|
|
||||||
@@ -17,9 +21,9 @@ if __name__ == '__main__':
|
|||||||
#scrapper = WebScrapper("https://ncode.syosetu.com/n0806fu/", r"E:\temp\WN\Golden Experience\HTML", "jp")
|
#scrapper = WebScrapper("https://ncode.syosetu.com/n0806fu/", r"E:\temp\WN\Golden Experience\HTML", "jp")
|
||||||
#scrapper.getHtml("{}/", 334, 620) # 612
|
#scrapper.getHtml("{}/", 334, 620) # 612
|
||||||
|
|
||||||
# scrapper = WebScrapper("https://fenrirealm.com/series/dorothys-forbidden-grimoire/", r"E:\temp\WN\Dorothy’s Forbidden Grimoire\HTML", "en")
|
# scrapper = WebScrapper("https://fenrirealm.com/series/dorothys-forbidden-grimoire/", r"E:\temp\WN\Dorothy's Forbidden Grimoire\HTML", "en")
|
||||||
# scrapper.getHtml("{}", 377, 828, 2)
|
# scrapper.getHtml("{}", 1, 828, 2)
|
||||||
epubHandler = EpubHandler(r"E:\temp\WN\Dorothy’s Forbidden Grimoire\HTML", r"E:\temp\WN\Dorothy’s Forbidden Grimoire\EPUB")
|
epubHandler = EpubHandler(r"E:\temp\WN\Dorothy's Forbidden Grimoire\HTML", r"E:\temp\WN\Dorothy's Forbidden Grimoire\EPUB")
|
||||||
epubHandler.convertHtmlToEpub("en", r"C:\Users\JohannesBOZZ\Downloads\6e1de333d6af7aaa3fdf3ffa66ac6f55.png")
|
epubHandler.convertHtmlToEpub("en", r"C:\Users\JohannesBOZZ\Downloads\6e1de333d6af7aaa3fdf3ffa66ac6f55.png")
|
||||||
|
|
||||||
# epubHandler = EpubHandler(r"E:\temp\WN\Golden Experience\501-609", r"E:\temp\WN\Golden Experience\epub")
|
# epubHandler = EpubHandler(r"E:\temp\WN\Golden Experience\501-609", r"E:\temp\WN\Golden Experience\epub")
|
||||||
|
|||||||
@@ -10,3 +10,4 @@ torch
|
|||||||
tensorflow
|
tensorflow
|
||||||
flax
|
flax
|
||||||
protobuf==3.20.*
|
protobuf==3.20.*
|
||||||
|
customtkinter
|
||||||
|
|||||||
+45
-6
@@ -13,12 +13,45 @@ from src.functions import writeToFile, makeDir, writeToJsonFile
|
|||||||
|
|
||||||
|
|
||||||
class WebScrapper:
|
class WebScrapper:
|
||||||
|
|
||||||
def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
    """Store the scraper settings and pass the HTML folder path to makeDir.

    baseLink: base URL of the series.
    htmlFolderPath: folder for the scraped HTML files (handed to makeDir,
        which presumably creates it — confirm in src.functions).
    currentLanguage: language code stored on the instance.
    """
    self.baseLink, self.htmlFolderPath, self.currentLanguage = (
        baseLink, htmlFolderPath, currentLanguage,
    )
    makeDir(htmlFolderPath)
|
||||||
|
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _sanitizeFilename(filename: str) -> str:
|
||||||
|
"""
|
||||||
|
Entfernt ungültige Zeichen für Windows und Linux Dateinamen.
|
||||||
|
Windows verboten: < > : " / \ | ? *
|
||||||
|
Linux verboten: / und \0 (null byte)
|
||||||
|
Zusätzlich: Leerzeichen am Anfang/Ende entfernen, mehrfache Leerzeichen reduzieren
|
||||||
|
"""
|
||||||
|
# Ungültige Zeichen für Windows und Linux entfernen
|
||||||
|
filename = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', filename)
|
||||||
|
|
||||||
|
# Mehrfache Leerzeichen durch einzelnes ersetzen
|
||||||
|
filename = re.sub(r'\s+', ' ', filename)
|
||||||
|
|
||||||
|
# Leerzeichen am Anfang/Ende entfernen
|
||||||
|
filename = filename.strip()
|
||||||
|
|
||||||
|
# Punkte am Ende entfernen (Windows-Problem)
|
||||||
|
filename = filename.rstrip('.')
|
||||||
|
|
||||||
|
# Falls Dateiname leer ist, Fallback verwenden
|
||||||
|
if not filename:
|
||||||
|
filename = "chapter"
|
||||||
|
|
||||||
|
# Dateiname auf maximal 255 Zeichen begrenzen (ohne Erweiterung)
|
||||||
|
if len(filename) > 200:
|
||||||
|
filename = filename[:200]
|
||||||
|
|
||||||
|
return filename
|
||||||
|
|
||||||
|
|
||||||
def getHtml(self, uriWithFormat:str, fromChapter:int, toChapter:int, sleepTime:float=0):
|
def getHtml(self, uriWithFormat:str, fromChapter:int, toChapter:int, sleepTime:float=0):
|
||||||
"""
|
"""
|
||||||
uriWithFormat: inserts the current chapter number into the {} brackets
|
uriWithFormat: inserts the current chapter number into the {} brackets
|
||||||
@@ -53,10 +86,16 @@ class WebScrapper:
|
|||||||
chapterTitle = self._addChapterTitle(soup, chapterContent, chapterNumber, infoDict)
|
chapterTitle = self._addChapterTitle(soup, chapterContent, chapterNumber, infoDict)
|
||||||
infoDict["chapterTitle"] = chapterTitle
|
infoDict["chapterTitle"] = chapterTitle
|
||||||
|
|
||||||
filePath = os.path.join(self.htmlFolderPath, f"{chapterTitle}.html")
|
# Dateinamen bereinigen
|
||||||
|
safe_filename = self._sanitizeFilename(chapterTitle)
|
||||||
|
if safe_filename != chapterTitle:
|
||||||
|
print(f" → Dateiname bereinigt: '{chapterTitle}' → '{safe_filename}'")
|
||||||
|
|
||||||
|
filePath = os.path.join(self.htmlFolderPath, f"{safe_filename}.html")
|
||||||
writeToFile("wb", filePath, chapterContent.renderContents("utf-8", True, 4))
|
writeToFile("wb", filePath, chapterContent.renderContents("utf-8", True, 4))
|
||||||
writeToJsonFile(filePath.replace(".html", ".json"), infoDict)
|
writeToJsonFile(filePath.replace(".html", ".json"), infoDict)
|
||||||
|
|
||||||
|
|
||||||
def _getChapterContent(self, soup:BeautifulSoup):
|
def _getChapterContent(self, soup:BeautifulSoup):
|
||||||
chapterContent = None
|
chapterContent = None
|
||||||
|
|
||||||
@@ -72,6 +111,7 @@ class WebScrapper:
|
|||||||
|
|
||||||
return chapterContent
|
return chapterContent
|
||||||
|
|
||||||
|
|
||||||
def _removeUnwantedThinsFromHtml(self, content:BeautifulSoup|NavigableString):
|
def _removeUnwantedThinsFromHtml(self, content:BeautifulSoup|NavigableString):
|
||||||
if not content:
|
if not content:
|
||||||
return
|
return
|
||||||
@@ -90,17 +130,16 @@ class WebScrapper:
|
|||||||
def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, chapterNumber, infoDict:dict):
|
def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, chapterNumber, infoDict:dict):
|
||||||
chapterTitle = f"Chapter {chapterNumber}"
|
chapterTitle = f"Chapter {chapterNumber}"
|
||||||
if "fanmtl.com" in self.baseLink:
|
if "fanmtl.com" in self.baseLink:
|
||||||
infoDict["seriesTitle"] = re.sub("\(.*?\)|[<>|\?:\*\"\\/]", "", soup.select("div.titles h1 a")[0].text)
|
infoDict["seriesTitle"] = soup.select("div.titles h1 a")[0].text.strip()
|
||||||
chapterTitle = re.sub("\(.*?\)|[<>|\?:\*\"\\/]", "", soup.select("div.titles h2")[0].text)
|
chapterTitle = soup.select("div.titles h2")[0].text.strip()
|
||||||
infoDict["chapterTitle"] =chapterTitle
|
infoDict["chapterTitle"] = chapterTitle
|
||||||
infoDict["author"] = ""
|
infoDict["author"] = ""
|
||||||
elif "syosetu.com" in self.baseLink:
|
elif "syosetu.com" in self.baseLink:
|
||||||
chapterTitle = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True).strip()
|
chapterTitle = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True).strip()
|
||||||
# = soup.select("h1.p-novel__title font font")
|
|
||||||
infoDict["seriesTitle"] = re.sub("\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip()
|
infoDict["seriesTitle"] = re.sub("\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip()
|
||||||
infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
|
infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
|
||||||
elif "fenrirealm.com" in self.baseLink:
|
elif "fenrirealm.com" in self.baseLink:
|
||||||
chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True).split(":")[-1].strip()
|
chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True)
|
||||||
infoDict["seriesTitle"] = soup.select_one("title").get_text(strip=True).split(" - ")[0].strip()
|
infoDict["seriesTitle"] = soup.select_one("title").get_text(strip=True).split(" - ")[0].strip()
|
||||||
infoDict["author"] = "unknown"
|
infoDict["author"] = "unknown"
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user