Scrapper changes

2026-05-21 10:31:19 +02:00
parent 54c7b40737
commit 6c09053ff0
7 changed files with 185 additions and 40 deletions
@@ -0,0 +1,45 @@
+import re
+import time
+from urllib.parse import urljoin
+from bs4 import BeautifulSoup, NavigableString
+from src.WebScrapper.WebScrapper import WebScrapper
+
+
+class FanmtlWebScrapper(WebScrapper):
+    """Scraper for chapter pages hosted on fanmtl.com."""
+
+
+    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
+        super().__init__(baseLink, htmlFolderPath, currentLanguage)
+
+
+    def _getChapterContent(self, soup:BeautifulSoup) -> str:
+        """Return the first ``<div class="chapter-content">`` element (None when the page has none)."""
+        return soup.find("div", {"class": "chapter-content"})
+
+
+    def _removeUnwantedThinsFromHtml(self, soup:BeautifulSoup) -> BeautifulSoup:
+        """Remove FanMTL advertisement blocks, then apply the generic base-class cleanup.
+
+        An advertisement block is a ``<div align="center">`` that contains a
+        ``<script>``.  The tree is modified in place and also returned.
+        """
+        # Fixed: the loop used the undefined name ``content`` instead of the
+        # ``soup`` parameter.
+        for div in soup.find_all('div', {'align': 'center'}):
+            if div.find('script'):
+                div.decompose()
+
+        super()._removeUnwantedThinsFromHtml(soup)
+        return soup
+
+
+    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
+        """Fill ``infoDict`` with series/chapter title and author, then let the base class insert the heading.
+
+        Raises IndexError when the page has no ``div.titles`` heading elements.
+        """
+        infoDict["seriesTitle"] = soup.select("div.titles h1 a")[0].text.strip()
+        infoDict["chapterTitle"] = soup.select("div.titles h2")[0].text.strip()
+        infoDict["author"] = ""
+
+        # Fixed: the base method takes (soup, content, infoDict); the extra,
+        # undefined ``chapterNumber`` argument raised NameError.
+        return super()._addChapterTitle(soup, content, infoDict)
+
+
+
+    def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
+        """Yield ``(chapterNumber, url)`` for every chapter in ``fromChapter..toChapter`` (inclusive).
+
+        uriWithFormat: relative URI whose ``{}`` placeholder receives the chapter number.
+        Raises Exception (on first iteration) when ``uriWithFormat`` is missing.
+        """
+        if uriWithFormat is None:
+            raise Exception("uriWithFormat must be provided for Fanmtl.")
+
+        for chapterNumber in range(fromChapter, toChapter + 1):
+            # The caller unpacks each item as (chapterNumber, link), so yield a pair.
+            yield chapterNumber, urljoin(self.baseLink, uriWithFormat.format(chapterNumber))
@@ -0,0 +1,24 @@
+import re
+import time
+from urllib.parse import urljoin
+from bs4 import BeautifulSoup, NavigableString
+from src.WebScrapper.WebScrapper import WebScrapper
+
+
+class FenrirealmWebScrapper(WebScrapper):
+    """Scraper for chapter pages hosted on fenrirealm.com."""
+
+
+    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
+        super().__init__(baseLink, htmlFolderPath, currentLanguage)
+
+
+    def _getChapterContent(self, soup:BeautifulSoup) -> str:
+        """Return the first ``div.content-area`` directly inside ``div.chapter-view``.
+
+        Raises IndexError when the selector matches nothing.
+        """
+        contentAreas = soup.select("div.chapter-view > div.content-area")
+        return contentAreas[0]
+
+
+    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
+        """Record chapter title, series title and author in ``infoDict``, then delegate to the base class."""
+        headingElement = soup.select_one(".chapter-view > div > h2")
+        infoDict["chapterTitle"] = headingElement.get_text(strip=True)
+
+        # The series title is the part of the page <title> before the first " - ".
+        pageTitle = soup.select_one("title").get_text(strip=True)
+        infoDict["seriesTitle"] = pageTitle.split(" - ")[0].strip()
+
+        infoDict["author"] = "unknown"
+        return super()._addChapterTitle(soup, content, infoDict)
+
@@ -0,0 +1,37 @@
+import re
+import time
+from urllib.parse import urljoin
+from bs4 import BeautifulSoup, NavigableString
+from src.WebScrapper.WebScrapper import WebScrapper
+
+
+class OniichanyameteWebScrapper(WebScrapper):
+    """Scraper for chapter pages hosted on oniichanyamete.moe."""
+
+
+    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
+        super().__init__(baseLink, htmlFolderPath, currentLanguage)
+
+
+    def _getChapterContent(self, soup:BeautifulSoup) -> str:
+        """Return the first ``<div class="chapter-content">`` element (None when the page has none)."""
+        # Fixed: ``select()`` takes a CSS selector and treats a second positional
+        # argument as namespaces, so it returned a list of *every* <div>.
+        # ``find()`` is the call that accepts an attribute filter.
+        return soup.find("div", {"class": "chapter-content"})
+
+
+    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
+        """Set placeholder metadata in ``infoDict`` and delegate to the base class."""
+        # NOTE(review): these are empty placeholders; with "chapterTitle" present
+        # the inserted heading will be empty -- confirm whether that is intended.
+        infoDict["seriesTitle"] = ""
+        infoDict["chapterTitle"] = ""
+        infoDict["author"] = ""
+
+        # Fixed: the base method takes (soup, content, infoDict); the extra,
+        # undefined ``chapterNumber`` argument raised NameError.
+        return super()._addChapterTitle(soup, content, infoDict)
+
+
+
+    def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
+        """Yield ``(chapterNumber, url)`` for chapters listed on the index page at ``self.baseLink``.
+
+        The chapter number is the first run of digits in each link text; links
+        without digits are skipped, and only numbers within
+        ``fromChapter..toChapter`` (inclusive) are yielded.
+        ``uriWithFormat`` is not used.  When ``soup`` is given it is used as the
+        index page instead of downloading it.
+        """
+        if soup is None:
+            # Imported here because this module has no file-level import of requests.
+            import requests
+
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
+            }
+            response = requests.get(self.baseLink, headers=headers, timeout=30)
+            soup = BeautifulSoup(response.content, "html.parser")
+
+        for link in soup.select("p[style*='padding-left:60px'] > a"):
+            numberMatch = re.search(r'\d+', link.text)
+            if numberMatch is None:
+                continue
+
+            chapterNumber = int(numberMatch.group())
+            if fromChapter <= chapterNumber <= toChapter:
+                yield chapterNumber, urljoin(self.baseLink, link["href"])
@@ -0,0 +1,38 @@
+import re
+import time
+from urllib.parse import urljoin
+from bs4 import BeautifulSoup, NavigableString
+from src.WebScrapper.WebScrapper import WebScrapper
+
+
+class SyosetuWebScrapper(WebScrapper):
+    """Scraper for chapter pages hosted on syosetu.com."""
+
+
+    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
+        super().__init__(baseLink, htmlFolderPath, currentLanguage)
+
+
+    def _getChapterContent(self, soup:BeautifulSoup) -> str:
+        """Return the first novel-text block whose text is longer than 1000 characters.
+
+        Shorter matches are skipped (presumably preface/afterword blocks --
+        TODO confirm).  Returns None when no block qualifies.
+        """
+        candidates = soup.select("div.p-novel__body div.js-novel-text.p-novel__text")
+        return next((block for block in candidates if len(block.text) > 1000), None)
+
+
+    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
+        """Record chapter title, series title and author in ``infoDict``, then delegate to the base class."""
+        infoDict["chapterTitle"] = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True)
+        # Raw string: "\[" in a normal string literal is an invalid escape sequence.
+        # The pattern removes bracketed fragments from the series link text.
+        infoDict["seriesTitle"] = re.sub(r"\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip()
+        infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
+        return super()._addChapterTitle(soup, content, infoDict)
+
+
+
+    def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
+        """Yield ``(chapterNumber, url)`` for every chapter in ``fromChapter..toChapter`` (inclusive).
+
+        uriWithFormat: relative URI whose ``{}`` placeholder receives the chapter
+        number; defaults to ``"{}/"``.
+        """
+        if uriWithFormat is None:
+            uriWithFormat = "{}/"
+
+        for chapterNumber in range(fromChapter, toChapter + 1):
+            # The caller unpacks each item as (chapterNumber, link), so yield a pair.
+            yield chapterNumber, urljoin(self.baseLink, uriWithFormat.format(chapterNumber))
@@ -8,9 +8,6 @@ from bs4 import BeautifulSoup, NavigableString
 from src.functions import writeToFile, makeDir, writeToJsonFile


-# https://ncode.syosetu.com/n0806fu
-
-

 class WebScrapper:

@@ -20,6 +17,19 @@ class WebScrapper:
        self.currentLanguage = currentLanguage
        makeDir(self.htmlFolderPath)

+    @staticmethod
+    def Get(baseLink:str, htmlFolderPath:str, currentLanguage:str) -> "WebScrapper":
+        """Return the site-specific scraper whose domain occurs in ``baseLink``.
+
+        All three arguments are forwarded unchanged to the chosen scraper's
+        constructor.  Raises Exception when ``baseLink`` matches no supported site.
+        """
+        # The return annotation is a string: annotations are evaluated when the
+        # function is defined, and the previous ``-> self`` raised NameError there.
+        # NOTE(review): the scraper classes below are referenced by bare name;
+        # confirm they are importable from this module (they import WebScrapper
+        # themselves, so a late/local import may be needed to avoid a cycle).
+        if "fanmtl.com" in baseLink:
+            return FanmtlWebScrapper(baseLink, htmlFolderPath, currentLanguage)
+        elif "syosetu.com" in baseLink:
+            return SyosetuWebScrapper(baseLink, htmlFolderPath, currentLanguage)
+        elif "fenrirealm.com" in baseLink:
+            return FenrirealmWebScrapper(baseLink, htmlFolderPath, currentLanguage)
+        elif "oniichanyamete.moe" in baseLink:
+            return OniichanyameteWebScrapper(baseLink, htmlFolderPath, currentLanguage)
+        else:
+            raise Exception(f"Website {baseLink} is not supported for scraping yet.")
+    

    @staticmethod
    def _sanitizeFilename(filename: str) -> str:
@@ -52,20 +62,19 @@ class WebScrapper:
        return filename


-    def getHtml(self, uriWithFormat:str, fromChapter:int, toChapter:int, sleepTime:float=0):
+    def run(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, sleepTime:float=0):
        """
        uriWithFormat: inserts the current chapter number into the {} brackets
        """
-        for chapterNumber in range(fromChapter, toChapter + 1):
+        for chapterNumber, link in self._getChapterLinks(fromChapter, toChapter, uriWithFormat):
            time.sleep(sleepTime) # to avoid getting blocked by the website for sending too many requests in a short time
+            print(link)

            infoDict = {
                "chapter": chapterNumber,
                "originalLanguage": self.currentLanguage,
                "currentLanguage": self.currentLanguage,
            }
-            link = urljoin(self.baseLink, uriWithFormat.format(chapterNumber))
-            print(link)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
                #'Referer': 'https://ncode.syosetu.com/',
@@ -83,8 +92,7 @@ class WebScrapper:
                continue

            self._removeUnwantedThinsFromHtml(chapterContent)
-            chapterTitle = self._addChapterTitle(soup, chapterContent, chapterNumber, infoDict)
-            infoDict["chapterTitle"] = chapterTitle
+            self._addChapterTitle(soup, chapterContent, infoDict)

            # Dateinamen bereinigen
            safe_filename = self._sanitizeFilename(chapterTitle)
@@ -112,44 +120,25 @@ class WebScrapper:
        return chapterContent


-    def _removeUnwantedThinsFromHtml(self, content:BeautifulSoup|NavigableString):
-        if not content:
-            return
-
-        # FanMTL advertisements
-        if "fanmtl.com" in self.baseLink:
-            for div in content.find_all('div', {'align': 'center'}):
-                if div.find('script'):
-                    div.decompose()
-
+    def _removeUnwantedThinsFromHtml(self, soup:BeautifulSoup|NavigableString):
+        """Remove every ``<script>`` element from ``soup`` in place (site-independent cleanup)."""
        #general
-        for script in content.find_all('script'):
+        for script in soup.find_all('script'):
            script.decompose()


-    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, chapterNumber, infoDict:dict):
-        chapterTitle = f"Chapter {chapterNumber}"
-        if "fanmtl.com" in self.baseLink:
-            infoDict["seriesTitle"] = soup.select("div.titles h1 a")[0].text.strip()
-            chapterTitle = soup.select("div.titles h2")[0].text.strip()
-            infoDict["chapterTitle"] = chapterTitle
-            infoDict["author"] = ""
-        elif "syosetu.com" in self.baseLink:
-            chapterTitle = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True).strip()
-            infoDict["seriesTitle"] = re.sub("\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip()
-            infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
-        elif "fenrirealm.com" in self.baseLink:
-            chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True)
-            infoDict["seriesTitle"] = soup.select_one("title").get_text(strip=True).split(" - ")[0].strip()
-            infoDict["author"] = "unknown"
-
+    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict):
+        """Insert an ``<h1>`` chapter heading followed by two ``<br>`` tags at the start of ``content``.
+
+        The heading text is ``infoDict["chapterTitle"]`` when that key exists,
+        otherwise ``"Chapter <infoDict['chapter']>"``.  ``soup`` is only used to
+        create the new tags.
+        """
        titleElement = soup.new_tag("h1")
-        titleElement.string = chapterTitle
+        # The fallback applies only when the key is missing; an empty title is used as-is.
+        titleElement.string = infoDict["chapterTitle"] if "chapterTitle" in infoDict else f"Chapter {infoDict['chapter']}"
        content.insert(0, titleElement)
        content.insert(1, soup.new_tag("br"))
        content.insert(2, soup.new_tag("br"))

-        return chapterTitle
-

+    def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
+        """Yield ``(chapterNumber, url)`` for every chapter in ``fromChapter..toChapter`` (inclusive).
+
+        uriWithFormat: relative URI whose ``{}`` placeholder receives the chapter
+        number; when omitted the bare chapter number is used as the relative URI.
+        ``soup`` is not used by this default implementation.
+        """
+        for chapterNumber in range(fromChapter, toChapter + 1):
+            # urljoin() requires str arguments; passing the int chapter number
+            # directly raised TypeError and ignored uriWithFormat.
+            relativeUri = str(chapterNumber) if uriWithFormat is None else uriWithFormat.format(chapterNumber)
+            yield chapterNumber, urljoin(self.baseLink, relativeUri)
+