import re
import time
from collections.abc import Iterator
from urllib.parse import urljoin

from bs4 import BeautifulSoup, NavigableString, Tag

from src.WebScrapper.WebScrapper import WebScrapper

# Matches a bracketed segment such as "[...]" (non-greedy) so it can be removed
# from the series title text. Raw string: "\[" is an invalid escape sequence in
# a normal string literal and triggers a SyntaxWarning on Python 3.12+.
_BRACKETED_SEGMENT_RE = re.compile(r"\[.*?\]")


class SyosetuWebScrapper(WebScrapper):
    """WebScrapper specialisation using the `p-novel__*` / `c-announce` selectors.

    NOTE(review): the class name suggests these selectors target Syosetu
    chapter pages; the markup itself is not visible from this file.
    """

    def __init__(self, baseLink: str, htmlFolderPath: str, currentLanguage: str):
        """Forward all arguments unchanged to the `WebScrapper` constructor."""
        super().__init__(baseLink, htmlFolderPath, currentLanguage)

    def _getChapterContent(self, soup: BeautifulSoup) -> Tag | None:
        """Return the first novel-text element whose text exceeds 1000 characters.

        Args:
            soup: Parsed chapter page.

        Returns:
            The first element matching
            `div.p-novel__body div.js-novel-text.p-novel__text` whose `.text`
            is longer than 1000 characters, or `None` when no element
            qualifies.
        """
        candidates = soup.select("div.p-novel__body div.js-novel-text.p-novel__text")
        for candidate in candidates:
            # NOTE(review): the 1000-character threshold presumably skips short
            # non-chapter blocks (e.g. foreword/afterword) — confirm.
            if len(candidate.text) > 1000:
                return candidate
        return None

    def _addChapterTitle(
        self,
        soup: BeautifulSoup,
        content: BeautifulSoup | NavigableString,
        infoDict: dict,
    ) -> str:
        """Fill `infoDict` with title/author metadata, then defer to the base class.

        Sets `infoDict["chapterTitle"]`, `infoDict["seriesTitle"]` (with any
        bracketed `[...]` segments removed) and `infoDict["author"]` from the
        page, and returns whatever the parent `_addChapterTitle` returns.

        Args:
            soup: Parsed chapter page.
            content: Chapter content passed through to the parent method.
            infoDict: Mutable mapping updated in place with the metadata.

        Raises:
            AttributeError: If the chapter title heading is not found
                (`select_one` returns `None`).
            IndexError: If the `.c-announce` links are missing
                (`select` returns an empty list).
        """
        titleTag = soup.select_one("h1.p-novel__title.p-novel__title--rensai")
        # get_text(strip=True) already strips every text fragment, so no extra
        # .strip() is needed on the joined result.
        infoDict["chapterTitle"] = titleTag.get_text(strip=True)

        seriesLinkText = soup.select(".c-announce > a:first-of-type")[0].text
        infoDict["seriesTitle"] = _BRACKETED_SEGMENT_RE.sub("", seriesLinkText).strip()

        authorLinkText = soup.select(".c-announce > a:nth-of-type(2)")[0].text
        infoDict["author"] = authorLinkText.strip()

        return super()._addChapterTitle(soup, content, infoDict)

    def _getChapterLinks(
        self,
        fromChapter: int,
        toChapter: int,
        uriWithFormat: str | None = None,
        soup: BeautifulSoup | None = None,
    ) -> Iterator[str]:
        """Yield one absolute chapter URL per chapter number in the given range.

        Args:
            fromChapter: First chapter number (inclusive).
            toChapter: Last chapter number (inclusive).
            uriWithFormat: `str.format` template receiving the chapter number;
                defaults to `"{}/"`.
            soup: Unused by this implementation.

        Yields:
            `urljoin(self.baseLink, uriWithFormat.format(n))` for each `n`
            from `fromChapter` to `toChapter`; nothing if
            `fromChapter > toChapter`.
        """
        if uriWithFormat is None:
            uriWithFormat = "{}/"
        for chapterNumber in range(fromChapter, toChapter + 1):
            yield urljoin(self.baseLink, uriWithFormat.format(chapterNumber))