38 lines
1.4 KiB
Python
38 lines
1.4 KiB
Python
import re
|
|
import time
|
|
from urllib.parse import urljoin
|
|
from bs4 import BeautifulSoup, NavigableString
|
|
from src.WebScrapper.WebScrapper import WebScrapper
|
|
|
|
|
|
class SyosetuWebScrapper(WebScrapper):
    """WebScrapper specialisation for pages that use Syosetu's ``p-novel__*`` markup.

    Overrides the hooks that locate the chapter body, fill in chapter
    metadata and enumerate chapter URLs; everything else is inherited
    from ``WebScrapper``.
    """

    # A body candidate must contain more than this many characters of text
    # to be accepted as the chapter content.
    # NOTE(review): presumably this filters out short non-chapter blocks that
    # share the same CSS classes; chapters at or below this length are
    # dropped too -- confirm that is intended.
    _MIN_BODY_TEXT_LENGTH = 1000

    # Non-greedy match of one "[...]" segment. Written as a raw string so the
    # backslashes reach the regex engine instead of being treated as
    # (invalid) string escape sequences.
    _BRACKETED_TEXT_RE = re.compile(r"\[.*?\]")

    def __init__(self, baseLink: str, htmlFolderPath: str, currentLanguage: str):
        """Forward all arguments unchanged to ``WebScrapper.__init__``."""
        super().__init__(baseLink, htmlFolderPath, currentLanguage)

    def _getChapterContent(self, soup: BeautifulSoup) -> str:
        """Return the element that holds the chapter text.

        Scans every ``div.js-novel-text.p-novel__text`` found inside
        ``div.p-novel__body`` and returns the first one whose text is longer
        than ``_MIN_BODY_TEXT_LENGTH`` characters.

        NOTE(review): the ``-> str`` annotation is kept as written, but the
        value returned is the matched element itself, not a string.

        Returns:
            The first sufficiently long element, or ``None`` when no
            candidate qualifies.
        """
        candidates = soup.select("div.p-novel__body div.js-novel-text.p-novel__text")
        for candidate in candidates:
            if len(candidate.text) > self._MIN_BODY_TEXT_LENGTH:
                return candidate
        return None

    def _addChapterTitle(self, soup: BeautifulSoup, content: BeautifulSoup | NavigableString, infoDict: dict) -> str:
        """Populate ``infoDict`` with page metadata, then defer to the parent.

        Keys written:
            ``chapterTitle``: text of ``h1.p-novel__title.p-novel__title--rensai``.
            ``seriesTitle``: text of the first link directly under
                ``.c-announce`` with every ``[...]`` segment removed.
            ``author``: text of the second link directly under ``.c-announce``.

        Returns:
            Whatever ``WebScrapper._addChapterTitle`` returns.

        Raises:
            AttributeError: if the title heading is missing from the page.
            IndexError: if either ``.c-announce`` link is missing.
        """
        # get_text(strip=True) already strips every text fragment, so no
        # further stripping of the joined result is needed.
        titleTag = soup.select_one("h1.p-novel__title.p-novel__title--rensai")
        infoDict["chapterTitle"] = titleTag.get_text(strip=True)

        seriesText = soup.select(".c-announce > a:first-of-type")[0].text
        infoDict["seriesTitle"] = self._BRACKETED_TEXT_RE.sub("", seriesText).strip()

        infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()

        return super()._addChapterTitle(soup, content, infoDict)

    def _getChapterLinks(self, fromChapter: int, toChapter: int, uriWithFormat: str | None = None, soup: BeautifulSoup | None = None):
        """Yield the URL of each chapter from ``fromChapter`` to ``toChapter``.

        Both bounds are inclusive; nothing is yielded when ``fromChapter`` is
        greater than ``toChapter``.

        Args:
            fromChapter: First chapter number.
            toChapter: Last chapter number.
            uriWithFormat: ``str.format`` template that receives the chapter
                number and is joined onto ``self.baseLink``. Defaults to
                ``"{}/"``.
            soup: Not used by this implementation; accepted so the signature
                stays call-compatible.

        Yields:
            One URL string per chapter, in ascending chapter order.
        """
        if uriWithFormat is None:
            uriWithFormat = "{}/"

        for chapterNumber in range(fromChapter, toChapter + 1):
            yield urljoin(self.baseLink, uriWithFormat.format(chapterNumber))