init2.0
This commit is contained in:
+45
-6
@@ -13,12 +13,45 @@ from src.functions import writeToFile, makeDir, writeToJsonFile
|
||||
|
||||
|
||||
class WebScrapper:
|
||||
|
||||
def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
    """
    Store the scraper configuration on the instance.

    baseLink: base URL of the site being scraped
    htmlFolderPath: folder that scraped chapter files are written into
    currentLanguage: language tag kept on the instance for later use

    makeDir is called on htmlFolderPath (presumably to create the
    folder if it is missing - confirm against src.functions.makeDir).
    """
    self.baseLink, self.htmlFolderPath, self.currentLanguage = (
        baseLink,
        htmlFolderPath,
        currentLanguage,
    )
    makeDir(htmlFolderPath)
|
||||
|
||||
|
||||
@staticmethod
|
||||
def _sanitizeFilename(filename: str) -> str:
|
||||
"""
|
||||
Entfernt ungültige Zeichen für Windows und Linux Dateinamen.
|
||||
Windows verboten: < > : " / \ | ? *
|
||||
Linux verboten: / und \0 (null byte)
|
||||
Zusätzlich: Leerzeichen am Anfang/Ende entfernen, mehrfache Leerzeichen reduzieren
|
||||
"""
|
||||
# Ungültige Zeichen für Windows und Linux entfernen
|
||||
filename = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', filename)
|
||||
|
||||
# Mehrfache Leerzeichen durch einzelnes ersetzen
|
||||
filename = re.sub(r'\s+', ' ', filename)
|
||||
|
||||
# Leerzeichen am Anfang/Ende entfernen
|
||||
filename = filename.strip()
|
||||
|
||||
# Punkte am Ende entfernen (Windows-Problem)
|
||||
filename = filename.rstrip('.')
|
||||
|
||||
# Falls Dateiname leer ist, Fallback verwenden
|
||||
if not filename:
|
||||
filename = "chapter"
|
||||
|
||||
# Dateiname auf maximal 255 Zeichen begrenzen (ohne Erweiterung)
|
||||
if len(filename) > 200:
|
||||
filename = filename[:200]
|
||||
|
||||
return filename
|
||||
|
||||
|
||||
def getHtml(self, uriWithFormat:str, fromChapter:int, toChapter:int, sleepTime:float=0):
|
||||
"""
|
||||
uriWithFormat: inserts the current chapter number into the {} brackets
|
||||
@@ -53,10 +86,16 @@ class WebScrapper:
|
||||
chapterTitle = self._addChapterTitle(soup, chapterContent, chapterNumber, infoDict)
|
||||
infoDict["chapterTitle"] = chapterTitle
|
||||
|
||||
filePath = os.path.join(self.htmlFolderPath, f"{chapterTitle}.html")
|
||||
# Dateinamen bereinigen
|
||||
safe_filename = self._sanitizeFilename(chapterTitle)
|
||||
if safe_filename != chapterTitle:
|
||||
print(f" → Dateiname bereinigt: '{chapterTitle}' → '{safe_filename}'")
|
||||
|
||||
filePath = os.path.join(self.htmlFolderPath, f"{safe_filename}.html")
|
||||
writeToFile("wb", filePath, chapterContent.renderContents("utf-8", True, 4))
|
||||
writeToJsonFile(filePath.replace(".html", ".json"), infoDict)
|
||||
|
||||
|
||||
def _getChapterContent(self, soup:BeautifulSoup):
|
||||
chapterContent = None
|
||||
|
||||
@@ -72,6 +111,7 @@ class WebScrapper:
|
||||
|
||||
return chapterContent
|
||||
|
||||
|
||||
def _removeUnwantedThinsFromHtml(self, content:BeautifulSoup|NavigableString):
|
||||
if not content:
|
||||
return
|
||||
@@ -90,17 +130,16 @@ class WebScrapper:
|
||||
def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, chapterNumber, infoDict:dict):
|
||||
chapterTitle = f"Chapter {chapterNumber}"
|
||||
if "fanmtl.com" in self.baseLink:
|
||||
infoDict["seriesTitle"] = re.sub("\(.*?\)|[<>|\?:\*\"\\/]", "", soup.select("div.titles h1 a")[0].text)
|
||||
chapterTitle = re.sub("\(.*?\)|[<>|\?:\*\"\\/]", "", soup.select("div.titles h2")[0].text)
|
||||
infoDict["chapterTitle"] =chapterTitle
|
||||
infoDict["seriesTitle"] = soup.select("div.titles h1 a")[0].text.strip()
|
||||
chapterTitle = soup.select("div.titles h2")[0].text.strip()
|
||||
infoDict["chapterTitle"] = chapterTitle
|
||||
infoDict["author"] = ""
|
||||
elif "syosetu.com" in self.baseLink:
|
||||
chapterTitle = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True).strip()
|
||||
# = soup.select("h1.p-novel__title font font")
|
||||
infoDict["seriesTitle"] = re.sub("\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip()
|
||||
infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
|
||||
elif "fenrirealm.com" in self.baseLink:
|
||||
chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True).split(":")[-1].strip()
|
||||
chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True)
|
||||
infoDict["seriesTitle"] = soup.select_one("title").get_text(strip=True).split(" - ")[0].strip()
|
||||
infoDict["author"] = "unknown"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user