This commit is contained in:
2026-05-21 08:16:10 +02:00
parent 9690367d70
commit 54c7b40737
5 changed files with 431 additions and 9 deletions
+45 -6
View File
@@ -13,12 +13,45 @@ from src.functions import writeToFile, makeDir, writeToJsonFile
class WebScrapper:
def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
    """
    Store the scraper configuration and prepare the output folder.

    baseLink: link of the site to scrape; other methods inspect it to
        decide which site-specific selectors to use
    htmlFolderPath: folder the scraped chapter files are written into
    currentLanguage: language identifier, stored as-is on the instance
    """
    (
        self.baseLink,
        self.htmlFolderPath,
        self.currentLanguage,
    ) = (baseLink, htmlFolderPath, currentLanguage)
    # NOTE(review): makeDir is a project helper; presumably it creates the
    # folder when it does not exist yet -- confirm in src.functions.
    makeDir(htmlFolderPath)
@staticmethod
def _sanitizeFilename(filename: str) -> str:
"""
Entfernt ungültige Zeichen für Windows und Linux Dateinamen.
Windows verboten: < > : " / \ | ? *
Linux verboten: / und \0 (null byte)
Zusätzlich: Leerzeichen am Anfang/Ende entfernen, mehrfache Leerzeichen reduzieren
"""
# Ungültige Zeichen für Windows und Linux entfernen
filename = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', filename)
# Mehrfache Leerzeichen durch einzelnes ersetzen
filename = re.sub(r'\s+', ' ', filename)
# Leerzeichen am Anfang/Ende entfernen
filename = filename.strip()
# Punkte am Ende entfernen (Windows-Problem)
filename = filename.rstrip('.')
# Falls Dateiname leer ist, Fallback verwenden
if not filename:
filename = "chapter"
# Dateiname auf maximal 255 Zeichen begrenzen (ohne Erweiterung)
if len(filename) > 200:
filename = filename[:200]
return filename
def getHtml(self, uriWithFormat:str, fromChapter:int, toChapter:int, sleepTime:float=0):
"""
uriWithFormat: inserts the current chapter number into the {} brackets
@@ -53,10 +86,16 @@ class WebScrapper:
chapterTitle = self._addChapterTitle(soup, chapterContent, chapterNumber, infoDict)
infoDict["chapterTitle"] = chapterTitle
filePath = os.path.join(self.htmlFolderPath, f"{chapterTitle}.html")
# Dateinamen bereinigen
safe_filename = self._sanitizeFilename(chapterTitle)
if safe_filename != chapterTitle:
print(f" → Dateiname bereinigt: '{chapterTitle}''{safe_filename}'")
filePath = os.path.join(self.htmlFolderPath, f"{safe_filename}.html")
writeToFile("wb", filePath, chapterContent.renderContents("utf-8", True, 4))
writeToJsonFile(filePath.replace(".html", ".json"), infoDict)
def _getChapterContent(self, soup:BeautifulSoup):
chapterContent = None
@@ -72,6 +111,7 @@ class WebScrapper:
return chapterContent
def _removeUnwantedThinsFromHtml(self, content:BeautifulSoup|NavigableString):
if not content:
return
@@ -90,17 +130,16 @@ class WebScrapper:
def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, chapterNumber, infoDict:dict):
chapterTitle = f"Chapter {chapterNumber}"
if "fanmtl.com" in self.baseLink:
infoDict["seriesTitle"] = re.sub("\(.*?\)|[<>|\?:\*\"\\/]", "", soup.select("div.titles h1 a")[0].text)
chapterTitle = re.sub("\(.*?\)|[<>|\?:\*\"\\/]", "", soup.select("div.titles h2")[0].text)
infoDict["chapterTitle"] =chapterTitle
infoDict["seriesTitle"] = soup.select("div.titles h1 a")[0].text.strip()
chapterTitle = soup.select("div.titles h2")[0].text.strip()
infoDict["chapterTitle"] = chapterTitle
infoDict["author"] = ""
elif "syosetu.com" in self.baseLink:
chapterTitle = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True).strip()
# = soup.select("h1.p-novel__title font font")
infoDict["seriesTitle"] = re.sub("\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip()
infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
elif "fenrirealm.com" in self.baseLink:
chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True).split(":")[-1].strip()
chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True)
infoDict["seriesTitle"] = soup.select_one("title").get_text(strip=True).split(" - ")[0].strip()
infoDict["author"] = "unknown"