Scrapper changes

2026-05-21 10:31:19 +02:00
parent 54c7b40737
commit 6c09053ff0
7 changed files with 185 additions and 40 deletions
@@ -1,4 +1,4 @@
 .idea
-test*.py
+*test*.py
 test.*
 *.log
@@ -0,0 +1,12 @@
 # Manual smoke script: resolve the scraper for an index page through
 # WebScrapper.Get and print every (chapterNumber, link) pair it discovers.
 # The package is `src` (as used by the scraper modules themselves) and the
 # Fenrirealm class is named FenrirealmWebScrapper.
 from src.WebScrapper.OniichanyameteWebScrapper import OniichanyameteWebScrapper
 from src.WebScrapper.FenrirealmWebScrapper import FenrirealmWebScrapper
 from src.WebScrapper.WebScrapper import WebScrapper
 from src.WebScrapper.FanmtlWebScrapper import FanmtlWebScrapper
 from src.WebScrapper.SyosetuWebScrapper import SyosetuWebScrapper
 scrapper = WebScrapper.Get(r"https://oniichanyamete.moe/index/bunnygirl/", "html", "en")
 for chapterNumber, link in scrapper._getChapterLinks(1, None):
    print(chapterNumber, link)
@@ -0,0 +1,45 @@
 import re
 import time
 from urllib.parse import urljoin
 from bs4 import BeautifulSoup, NavigableString
 from src.WebScrapper.WebScrapper import WebScrapper
 class FanmtlWebScrapper(WebScrapper):
    """Scraper hooks for fanmtl.com chapter pages.

    Overrides the site-specific steps of ``WebScrapper``: locating the chapter
    body, removing advertisement blocks, filling the title metadata and
    generating the chapter links.
    """
    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
        super().__init__(baseLink, htmlFolderPath, currentLanguage)
    def _getChapterContent(self, soup:BeautifulSoup) -> BeautifulSoup|NavigableString|None:
        """Return the ``div.chapter-content`` element, or ``None`` when the page has none."""
        return soup.find("div", {"class": "chapter-content"})
    def _removeUnwantedThinsFromHtml(self, soup:BeautifulSoup) -> None:
        """Drop centered ``<div>`` blocks that contain a ``<script>``, then run the generic cleanup.

        ``soup`` is modified in place; nothing is returned.
        """
        if not soup:
            return
        # The parameter is named `soup`; the previous body iterated over an
        # undefined `content` and raised NameError on every call.
        for div in soup.find_all('div', {'align': 'center'}):
            if div.find('script'):
                div.decompose()
        super()._removeUnwantedThinsFromHtml(soup)
    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
        """Fill ``seriesTitle``, ``chapterTitle`` and ``author`` in ``infoDict`` from the page.

        Raises IndexError when the ``div.titles`` headings are missing.
        Returns whatever the base implementation returns.
        """
        infoDict["seriesTitle"] = soup.select("div.titles h1 a")[0].text.strip()
        infoDict["chapterTitle"] = soup.select("div.titles h2")[0].text.strip()
        infoDict["author"] = ""
        # The base method takes (soup, content, infoDict); there is no chapterNumber argument.
        return super()._addChapterTitle(soup, content, infoDict)
    def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
        """Yield ``(chapterNumber, link)`` for every chapter from fromChapter to toChapter inclusive.

        ``uriWithFormat`` must contain a ``{}`` placeholder that receives the
        chapter number; the result is joined onto ``self.baseLink``.
        Raises Exception on first iteration when ``uriWithFormat`` is missing.
        """
        if uriWithFormat is None:
            raise Exception("uriWithFormat must be provided for Fanmtl.")
        for currentChapter in range(fromChapter, toChapter + 1):
            # Yield a pair: the caller unpacks `chapterNumber, link`.
            yield currentChapter, urljoin(self.baseLink, uriWithFormat.format(currentChapter))
@@ -0,0 +1,24 @@
 import re
 import time
 from urllib.parse import urljoin
 from bs4 import BeautifulSoup, NavigableString
 from src.WebScrapper.WebScrapper import WebScrapper
 class FenrirealmWebScrapper(WebScrapper):
    """Scraper hooks for fenrirealm.com chapter pages."""
    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
        super().__init__(baseLink, htmlFolderPath, currentLanguage)
    def _getChapterContent(self, soup:BeautifulSoup) -> BeautifulSoup|NavigableString|None:
        """Return the first ``div.chapter-view > div.content-area`` element.

        Returns ``None`` instead of raising IndexError when the page has no
        such element, so a missing chapter body can be handled by the caller.
        """
        return soup.select_one("div.chapter-view > div.content-area")
    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
        """Fill ``chapterTitle``, ``seriesTitle`` and ``author`` in ``infoDict`` from the page.

        The series title is the part of ``<title>`` before the first " - ".
        Raises AttributeError when the heading or ``<title>`` element is missing.
        Returns whatever the base implementation returns.
        """
        infoDict["chapterTitle"] = soup.select_one(".chapter-view > div > h2").get_text(strip=True)
        infoDict["seriesTitle"] = soup.select_one("title").get_text(strip=True).split(" - ")[0].strip()
        infoDict["author"] = "unknown"
        return super()._addChapterTitle(soup, content, infoDict)
@@ -0,0 +1,37 @@
 import re
 import time
 from urllib.parse import urljoin
 from bs4 import BeautifulSoup, NavigableString
 from src.WebScrapper.WebScrapper import WebScrapper
 class OniichanyameteWebScrapper(WebScrapper):
    """Scraper hooks for oniichanyamete.moe.

    Chapter links are read from the index page at ``self.baseLink`` rather
    than generated from a URI format string.
    """
    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
        super().__init__(baseLink, htmlFolderPath, currentLanguage)
    def _getChapterContent(self, soup:BeautifulSoup) -> BeautifulSoup|NavigableString|None:
        """Return the ``div.chapter-content`` element, or ``None`` when the page has none."""
        # select() takes a CSS selector only; passing the attribute dict to it
        # returned a list of every <div>. find() accepts the attribute filter
        # and returns a single element.
        return soup.find("div", {"class": "chapter-content"})
    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
        """Set the title metadata in ``infoDict`` and delegate to the base implementation."""
        # NOTE(review): all three values are empty placeholders; nothing is
        # extracted from the page yet. Confirm the intended selectors.
        infoDict["seriesTitle"] = ""
        infoDict["chapterTitle"] = ""
        infoDict["author"] = ""
        # The base method takes (soup, content, infoDict); there is no chapterNumber argument.
        return super()._addChapterTitle(soup, content, infoDict)
    def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
        """Yield ``(chapterNumber, link)`` for the chapter links found on the index page.

        fromChapter / toChapter: inclusive bounds on the chapter number; pass
            ``None`` to leave that side unbounded.
        uriWithFormat: unused, the links come from the index page.
        soup: an already parsed index page; when ``None`` the page at
            ``self.baseLink`` is downloaded.

        The chapter number is the first run of digits in the link text, as an
        int. Links without digits or without an ``href`` are skipped.
        """
        if soup is None:
            # requests is not imported at the top of this module.
            import requests
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
            }
            response = requests.get(self.baseLink, headers=headers, timeout=30)
            soup = BeautifulSoup(response.content, "html.parser")
        for link in soup.select("p[style*='padding-left:60px'] > a"):
            numberMatch = re.search(r'\d+', link.text)
            href = link.get("href")
            if numberMatch is None or not href:
                continue
            chapterNumber = int(numberMatch.group())
            if fromChapter is not None and chapterNumber < fromChapter:
                continue
            if toChapter is not None and chapterNumber > toChapter:
                continue
            yield chapterNumber, urljoin(self.baseLink, href)
@@ -0,0 +1,38 @@
 import re
 import time
 from urllib.parse import urljoin
 from bs4 import BeautifulSoup, NavigableString
 from src.WebScrapper.WebScrapper import WebScrapper
 class SyosetuWebScrapper(WebScrapper):
    """Scraper hooks for syosetu.com chapter pages."""
    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
        super().__init__(baseLink, htmlFolderPath, currentLanguage)
    def _getChapterContent(self, soup:BeautifulSoup) -> BeautifulSoup|NavigableString|None:
        """Return the first novel-text block whose text is longer than 1000 characters.

        Returns ``None`` when no block on the page is that long.
        """
        for candidate in soup.select("div.p-novel__body div.js-novel-text.p-novel__text"):
            if len(candidate.text) > 1000:
                return candidate
        return None
    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
        """Fill ``chapterTitle``, ``seriesTitle`` and ``author`` in ``infoDict`` from the page.

        Bracketed segments (``[...]``) are removed from the series title.
        Raises AttributeError / IndexError when the expected elements are missing.
        Returns whatever the base implementation returns.
        """
        infoDict["chapterTitle"] = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True)
        # Raw string: "\[" in a normal string literal is an invalid escape sequence.
        infoDict["seriesTitle"] = re.sub(r"\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip()
        infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
        return super()._addChapterTitle(soup, content, infoDict)
    def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
        """Yield ``(chapterNumber, link)`` for every chapter from fromChapter to toChapter inclusive.

        ``uriWithFormat`` receives the chapter number in its ``{}`` placeholder
        and defaults to ``"{}/"``; the result is joined onto ``self.baseLink``.
        """
        if uriWithFormat is None:
            uriWithFormat = "{}/"
        for currentChapter in range(fromChapter, toChapter + 1):
            # Yield a pair: the caller unpacks `chapterNumber, link`.
            yield currentChapter, urljoin(self.baseLink, uriWithFormat.format(currentChapter))
@@ -8,9 +8,6 @@ from bs4 import BeautifulSoup, NavigableString
 from src.functions import writeToFile, makeDir, writeToJsonFile
 # https://ncode.syosetu.com/n0806fu
 class WebScrapper:
@@ -20,6 +17,19 @@ class WebScrapper:
        self.currentLanguage = currentLanguage
        makeDir(self.htmlFolderPath)
    @staticmethod
    def Get(baseLink:str, htmlFolderPath:str, currentLanguage:str) -> "WebScrapper":
        """Return the scraper subclass that handles the website in ``baseLink``.

        The site is chosen by checking whether its domain occurs in
        ``baseLink``; the three arguments are passed to the subclass
        constructor unchanged.

        Raises Exception when no supported domain occurs in ``baseLink``.
        """
        # The subclasses are imported lazily inside each branch: their modules
        # import WebScrapper from src.WebScrapper.WebScrapper, so importing
        # them while that module is still loading would be circular.
        if "fanmtl.com" in baseLink:
            from src.WebScrapper.FanmtlWebScrapper import FanmtlWebScrapper
            return FanmtlWebScrapper(baseLink, htmlFolderPath, currentLanguage)
        if "syosetu.com" in baseLink:
            from src.WebScrapper.SyosetuWebScrapper import SyosetuWebScrapper
            return SyosetuWebScrapper(baseLink, htmlFolderPath, currentLanguage)
        if "fenrirealm.com" in baseLink:
            from src.WebScrapper.FenrirealmWebScrapper import FenrirealmWebScrapper
            return FenrirealmWebScrapper(baseLink, htmlFolderPath, currentLanguage)
        if "oniichanyamete.moe" in baseLink:
            from src.WebScrapper.OniichanyameteWebScrapper import OniichanyameteWebScrapper
            return OniichanyameteWebScrapper(baseLink, htmlFolderPath, currentLanguage)
        raise Exception(f"Website {baseLink} is not supported for scraping yet.")
    @staticmethod
    def _sanitizeFilename(filename: str) -> str:
@@ -52,20 +62,19 @@ class WebScrapper:
        return filename
-    def getHtml(self, uriWithFormat:str, fromChapter:int, toChapter:int, sleepTime:float=0):
+    def run(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, sleepTime:float=0):
        """
        uriWithFormat: inserts the current chapter number into the {} brackets
        """
-        for chapterNumber in range(fromChapter, toChapter + 1):
+        for chapterNumber, link in self._getChapterLinks(fromChapter, toChapter, uriWithFormat):
            time.sleep(sleepTime) # to avoid getting blocked by the website for sending too many requests in a short time
            print(link)
            infoDict = {
                "chapter": chapterNumber,
                "originalLanguage": self.currentLanguage,
                "currentLanguage": self.currentLanguage,
            }
            link = urljoin(self.baseLink, uriWithFormat.format(chapterNumber))
            print(link)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
                #'Referer': 'https://ncode.syosetu.com/',
@@ -83,8 +92,7 @@ class WebScrapper:
                continue
            self._removeUnwantedThinsFromHtml(chapterContent)
-            chapterTitle = self._addChapterTitle(soup, chapterContent, chapterNumber, infoDict)
+            self._addChapterTitle(soup, chapterContent, infoDict)
            infoDict["chapterTitle"] = chapterTitle
            # Dateinamen bereinigen
            safe_filename = self._sanitizeFilename(chapterTitle)
@@ -112,44 +120,25 @@ class WebScrapper:
        return chapterContent
-    def _removeUnwantedThinsFromHtml(self, content:BeautifulSoup|NavigableString):
+    def _removeUnwantedThinsFromHtml(self, soup:BeautifulSoup|NavigableString):
        if not content:
            return
        # FanMTL advertisements
        if "fanmtl.com" in self.baseLink:
            for div in content.find_all('div', {'align': 'center'}):
                if div.find('script'):
                    div.decompose()
        #general
-        for script in content.find_all('script'):
+        for script in soup.find_all('script'):
            script.decompose()
-    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, chapterNumber, infoDict:dict):
+    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict):
        chapterTitle = f"Chapter {chapterNumber}"
        if "fanmtl.com" in self.baseLink:
            infoDict["seriesTitle"] = soup.select("div.titles h1 a")[0].text.strip()
            chapterTitle = soup.select("div.titles h2")[0].text.strip()
            infoDict["chapterTitle"] = chapterTitle
            infoDict["author"] = ""
        elif "syosetu.com" in self.baseLink:
            chapterTitle = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True).strip()
            infoDict["seriesTitle"] = re.sub("\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip()
            infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
        elif "fenrirealm.com" in self.baseLink:
            chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True)
            infoDict["seriesTitle"] = soup.select_one("title").get_text(strip=True).split(" - ")[0].strip()
            infoDict["author"] = "unknown"
        titleElement = soup.new_tag("h1")
-        titleElement.string = chapterTitle
+        titleElement.string = infoDict["chapterTitle"] if "chapterTitle" in infoDict else f"Chapter {infoDict['chapter']}"
        content.insert(0, titleElement)
        content.insert(1, soup.new_tag("br"))
        content.insert(2, soup.new_tag("br"))
        return chapterTitle
    def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
        """Yield ``(chapterNumber, link)`` for every chapter from fromChapter to toChapter inclusive.

        uriWithFormat: relative URI whose ``{}`` placeholder receives the
            chapter number; when ``None`` the bare chapter number is used.
        soup: unused by this default implementation.

        The relative URI is joined onto ``self.baseLink``.
        """
        for currentChapter in range(fromChapter, toChapter + 1):
            # urljoin needs a str; passing the int chapter number raised TypeError.
            if uriWithFormat is None:
                relativeUri = str(currentChapter)
            else:
                relativeUri = uriWithFormat.format(currentChapter)
            yield currentChapter, urljoin(self.baseLink, relativeUri)