j-novel-scrapper-translator/src/WebScrapper/SyosetuWebScrapper.py

import re
import time
from urllib.parse import urljoin
from bs4 import BeautifulSoup, NavigableString
from src.WebScrapper.WebScrapper import WebScrapper


class SyosetuWebScrapper(WebScrapper):
    """WebScrapper specialization that reads chapter pages using the
    ``p-novel__*`` / ``c-announce`` CSS classes.

    It overrides the hooks for locating the chapter body, collecting the
    chapter metadata (chapter title, series title, author) and generating
    the chapter URLs to visit.
    """


    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
        """Forward all arguments unchanged to the ``WebScrapper`` constructor."""
        super().__init__(baseLink, htmlFolderPath, currentLanguage)


    def _getChapterContent(self, soup:BeautifulSoup) -> str:
        """Return the first novel-text element whose text exceeds 1000 characters.

        Candidates are the elements matching
        ``div.p-novel__body div.js-novel-text.p-novel__text``, examined in
        document order.

        NOTE(review): the declared return type is ``str``, but the value
        actually returned is the matched element itself (or ``None`` when no
        candidate is long enough) -- confirm against the base class contract.
        """
        # Presumably the length threshold skips short sibling blocks that
        # share the same classes (e.g. author notes) -- TODO confirm.
        minimumTextLength = 1000
        candidates = soup.select("div.p-novel__body div.js-novel-text.p-novel__text")
        for candidate in candidates:
            if len(candidate.text) > minimumTextLength:
                return candidate
        return None


    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
        """Fill ``infoDict`` with chapter metadata, then defer to the base class.

        Keys written to ``infoDict``:
            chapterTitle: text of ``h1.p-novel__title.p-novel__title--rensai``.
            seriesTitle: text of the first direct ``<a>`` child of
                ``.c-announce`` with every ``[...]`` segment removed.
            author: text of the second direct ``<a>`` child of ``.c-announce``.

        Returns:
            Whatever the parent implementation of ``_addChapterTitle`` returns.

        Raises:
            AttributeError: if the title heading is not present in ``soup``.
            IndexError: if either ``.c-announce`` link is not present.
        """
        titleHeading = soup.select_one("h1.p-novel__title.p-novel__title--rensai")
        # get_text(strip=True) already strips every text fragment, so no
        # additional .strip() is required on the result.
        infoDict["chapterTitle"] = titleHeading.get_text(strip=True)

        seriesLinkText = soup.select(".c-announce > a:first-of-type")[0].text
        # Raw string: "\[" in a normal literal is an invalid escape sequence
        # and triggers a SyntaxWarning on recent Python versions.
        infoDict["seriesTitle"] = re.sub(r"\[.*?\]", "", seriesLinkText).strip()

        infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
        return super()._addChapterTitle(soup, content, infoDict)


    def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str|None=None, soup:BeautifulSoup=None):
        """Yield one URL per chapter number from ``fromChapter`` to ``toChapter`` inclusive.

        Args:
            fromChapter: first chapter number to generate a link for.
            toChapter: last chapter number to generate a link for (inclusive).
            uriWithFormat: ``str.format`` template that receives the chapter
                number; defaults to ``"{}/"``. The formatted value is joined
                onto ``self.baseLink`` with ``urljoin``.
            soup: not used by this implementation.

        Yields:
            The URL of each chapter, in ascending chapter order. Nothing is
            yielded when ``fromChapter`` is greater than ``toChapter``.
        """
        if uriWithFormat is None:
            uriWithFormat = "{}/"

        for chapterNumber in range(fromChapter, toChapter + 1):
            yield urljoin(self.baseLink, uriWithFormat.format(chapterNumber))