init2.0

2026-05-21 08:16:10 +02:00
parent 9690367d70
commit 54c7b40737
5 changed files with 431 additions and 9 deletions
@@ -13,12 +13,45 @@ from src.functions import writeToFile, makeDir, writeToJsonFile


 class WebScrapper:
+
    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
        """
        Remember the scraper settings and pass the HTML folder path to makeDir.

        baseLink: site URL; other methods test it for known host names to
            pick the matching selectors
        htmlFolderPath: folder the scraped chapter files are written into
        currentLanguage: stored on the instance as-is
        """
        self.baseLink, self.htmlFolderPath, self.currentLanguage = (
            baseLink,
            htmlFolderPath,
            currentLanguage,
        )
        # NOTE(review): makeDir is imported from src.functions; presumably it
        # creates the folder when it is missing — confirm in that module.
        makeDir(htmlFolderPath)

+
+    @staticmethod
+    def _sanitizeFilename(filename: str) -> str:
+        """
+        Turn an arbitrary title into a file name stem that is valid on Windows and Linux.
+
+        Steps, in order:
+          1. every whitespace character (tab, newline, ideographic space, ...) becomes
+             a plain space, so words separated by a line break are not fused together
+          2. characters Windows forbids (< > : " / backslash | ? *) and ASCII control
+             characters (0x00-0x1f, which covers the NUL byte Linux forbids) are dropped
+          3. runs of spaces are collapsed and leading/trailing spaces removed
+          4. the result is cut to 200 characters, leaving room for an extension
+             below the usual 255 character file name limit
+          5. trailing dots and spaces are removed (Windows rejects or silently strips them)
+          6. an empty result falls back to "chapter"
+          7. Windows reserved device names (CON, NUL, COM1, ...) get a leading underscore
+
+        filename: raw title text
+        returns: the cleaned, never empty file name stem (without extension)
+        """
+        # Normalise whitespace BEFORE deleting control characters; otherwise
+        # "\t", "\n" and "\r" would simply vanish and glue neighbouring words together.
+        filename = re.sub(r'\s', ' ', filename)
+
+        # Remove characters that are invalid on Windows and/or Linux
+        filename = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', filename)
+
+        # Removing characters can leave several spaces in a row ("a : b" -> "a  b")
+        filename = re.sub(r' +', ' ', filename).strip()
+
+        # Limit to 200 characters (extension not included)
+        filename = filename[:200]
+
+        # Strip trailing dots/spaces last, so neither "name ." nor a truncated
+        # name can end in a character Windows does not accept
+        filename = filename.rstrip('. ')
+
+        # Fallback when nothing usable is left
+        if not filename:
+            filename = "chapter"
+
+        # Windows treats these names as devices, even with an extension appended
+        stem = filename.split('.', 1)[0].rstrip(' ')
+        if re.fullmatch(r'(?i)CON|PRN|AUX|NUL|COM[1-9]|LPT[1-9]', stem):
+            filename = f"_{filename}"
+
+        return filename
+
+
    def getHtml(self, uriWithFormat:str, fromChapter:int, toChapter:int, sleepTime:float=0):
        """
        uriWithFormat: inserts the current chapter number into the {} brackets
@@ -53,10 +86,16 @@ class WebScrapper:
            chapterTitle = self._addChapterTitle(soup, chapterContent, chapterNumber, infoDict)
            infoDict["chapterTitle"] = chapterTitle

-            filePath = os.path.join(self.htmlFolderPath, f"{chapterTitle}.html")
+            # Dateinamen bereinigen
+            safe_filename = self._sanitizeFilename(chapterTitle)
+            if safe_filename != chapterTitle:
+                print(f"  → Dateiname bereinigt: '{chapterTitle}' → '{safe_filename}'")
+
+            filePath = os.path.join(self.htmlFolderPath, f"{safe_filename}.html")
            writeToFile("wb", filePath, chapterContent.renderContents("utf-8", True, 4))
            writeToJsonFile(filePath.replace(".html", ".json"), infoDict)

+
    def _getChapterContent(self, soup:BeautifulSoup):
        chapterContent = None

@@ -72,6 +111,7 @@ class WebScrapper:

        return chapterContent

+
    def _removeUnwantedThinsFromHtml(self, content:BeautifulSoup|NavigableString):
        if not content:
            return
@@ -90,17 +130,16 @@ class WebScrapper:
    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, chapterNumber, infoDict:dict):
        chapterTitle = f"Chapter {chapterNumber}"
        if "fanmtl.com" in self.baseLink:
-            infoDict["seriesTitle"] = re.sub("\(.*?\)|[<>|\?:\*\"\\/]", "", soup.select("div.titles h1 a")[0].text)
-            chapterTitle = re.sub("\(.*?\)|[<>|\?:\*\"\\/]", "", soup.select("div.titles h2")[0].text)
-            infoDict["chapterTitle"] =chapterTitle
+            infoDict["seriesTitle"] = soup.select("div.titles h1 a")[0].text.strip()
+            chapterTitle = soup.select("div.titles h2")[0].text.strip()
+            infoDict["chapterTitle"] = chapterTitle
            infoDict["author"] = ""
        elif "syosetu.com" in self.baseLink:
            chapterTitle = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True).strip()
-            # = soup.select("h1.p-novel__title font font")
            infoDict["seriesTitle"] = re.sub("\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip()
            infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
        elif "fenrirealm.com" in self.baseLink:
-            chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True).split(":")[-1].strip()
+            chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True)
            infoDict["seriesTitle"] = soup.select_one("title").get_text(strip=True).split(" - ")[0].strip()
            infoDict["author"] = "unknown"