Files
j-novel-scrapper-translator/src/WebScrapper/SyosetuWebScrapper.py
T
2026-05-21 10:31:19 +02:00

38 lines
1.4 KiB
Python

import re
import time
from urllib.parse import urljoin
from bs4 import BeautifulSoup, NavigableString
from src.WebScrapper.WebScrapper import WebScrapper
class SyosetuWebScrapper(WebScrapper):
    """WebScrapper specialisation driven by ``p-novel`` / ``c-announce`` page markup.

    Overrides the content, title and chapter-link hooks of ``WebScrapper``;
    everything else is inherited from the base class.
    """

    # A candidate text block must be strictly longer than this many characters
    # to be accepted as the chapter body.
    # NOTE(review): presumably this skips short blocks (e.g. foreword/afterword)
    # that carry the same CSS classes as the main text -- confirm.
    _MIN_CHAPTER_TEXT_LENGTH = 1000

    # Non-greedy match of "[...]" segments that are removed from the series
    # title. Must be a raw string: "\[" in a plain literal is an invalid
    # escape sequence (SyntaxWarning on Python 3.12+).
    _BRACKETED_SEGMENT = re.compile(r"\[.*?\]")

    def __init__(self, baseLink: str, htmlFolderPath: str, currentLanguage: str) -> None:
        """Forward all constructor arguments unchanged to ``WebScrapper``."""
        super().__init__(baseLink, htmlFolderPath, currentLanguage)

    def _getChapterContent(self, soup: BeautifulSoup) -> str:
        """Return the first sufficiently long chapter-text element of the page.

        Scans every ``div.js-novel-text.p-novel__text`` located inside a
        ``div.p-novel__body`` and returns the first one whose text is longer
        than ``_MIN_CHAPTER_TEXT_LENGTH`` characters.

        NOTE(review): the ``-> str`` annotation is kept for signature
        compatibility, but the value returned is the matched element itself
        (not its text), or ``None`` when no candidate is long enough.
        """
        candidates = soup.select("div.p-novel__body div.js-novel-text.p-novel__text")
        for candidate in candidates:
            if len(candidate.text) > self._MIN_CHAPTER_TEXT_LENGTH:
                return candidate
        # Explicit rather than falling off the end: nothing qualified.
        return None

    def _addChapterTitle(self, soup: BeautifulSoup, content: BeautifulSoup | NavigableString, infoDict: dict) -> str:
        """Fill ``infoDict`` with page metadata, then defer to the base class.

        Keys written into ``infoDict``:
            ``chapterTitle``: text of ``h1.p-novel__title.p-novel__title--rensai``.
            ``seriesTitle``: text of the first ``<a>`` directly under
                ``.c-announce``, with every ``[...]`` segment removed.
            ``author``: text of the second ``<a>`` directly under ``.c-announce``.

        Returns:
            Whatever ``WebScrapper._addChapterTitle`` returns for the same
            arguments.

        Raises:
            AttributeError: if the chapter title heading is missing.
            IndexError: if ``.c-announce`` has no first or no second link.
        """
        titleHeading = soup.select_one("h1.p-novel__title.p-novel__title--rensai")
        # get_text(strip=True) already yields whitespace-trimmed text, so no
        # additional .strip() is needed here.
        infoDict["chapterTitle"] = titleHeading.get_text(strip=True)

        seriesLink = soup.select(".c-announce > a:first-of-type")[0]
        infoDict["seriesTitle"] = self._BRACKETED_SEGMENT.sub("", seriesLink.text).strip()

        authorLink = soup.select(".c-announce > a:nth-of-type(2)")[0]
        infoDict["author"] = authorLink.text.strip()

        return super()._addChapterTitle(soup, content, infoDict)

    def _getChapterLinks(self, fromChapter: int, toChapter: int, uriWithFormat: str | None = None, soup: BeautifulSoup = None):
        """Yield one absolute URL per chapter number in ``[fromChapter, toChapter]``.

        Args:
            fromChapter: first chapter number (inclusive).
            toChapter: last chapter number (inclusive); nothing is yielded
                when it is smaller than ``fromChapter``.
            uriWithFormat: ``str.format`` template that receives the chapter
                number; defaults to ``"{}/"``.
            soup: not used by this implementation; accepted so the signature
                stays call-compatible.

        Yields:
            ``urljoin(self.baseLink, uriWithFormat.format(chapter))`` for each
            chapter number, in ascending order.
        """
        if uriWithFormat is None:
            uriWithFormat = "{}/"
        for chapterNumber in range(fromChapter, toChapter + 1):
            yield urljoin(self.baseLink, uriWithFormat.format(chapterNumber))