Scrapper changes
This commit is contained in:
@@ -0,0 +1,38 @@
|
||||
import re
|
||||
import time
|
||||
from urllib.parse import urljoin
|
||||
from bs4 import BeautifulSoup, NavigableString
|
||||
from src.WebScrapper.WebScrapper import WebScrapper
|
||||
|
||||
|
||||
class SyosetuWebScrapper(WebScrapper):
    """``WebScrapper`` specialisation for Syosetu-style chapter pages.

    Overrides the hooks that extract the chapter body, fill in the chapter
    metadata (chapter title, series title, author) and enumerate chapter
    URLs. The CSS selectors used below target the ``p-novel__*`` and
    ``c-announce`` class names of the page markup.
    """

    # Candidate text blocks whose text is not longer than this are ignored
    # when looking for the chapter body.
    # NOTE(review): presumably this filters out short blocks (e.g. author
    # notes) that match the same selector -- confirm. A genuine chapter at or
    # under this length would also be skipped.
    _MIN_CHAPTER_TEXT_LENGTH = 1000

    def __init__(self, baseLink: str, htmlFolderPath: str, currentLanguage: str):
        """Forward all arguments unchanged to ``WebScrapper.__init__``."""
        super().__init__(baseLink, htmlFolderPath, currentLanguage)

    def _getChapterContent(self, soup: BeautifulSoup) -> str:
        """Return the first novel-text block that is long enough to be the chapter.

        Args:
            soup: Parsed chapter page.

        Returns:
            The first element matching
            ``div.p-novel__body div.js-novel-text.p-novel__text`` whose text
            is longer than ``_MIN_CHAPTER_TEXT_LENGTH`` characters, or
            ``None`` when no element qualifies.
        """
        # NOTE(review): the value returned is the matched element itself, not
        # a ``str`` as the annotation says; the annotation is left untouched
        # because the base-class contract is not visible here -- confirm.
        candidates = soup.select("div.p-novel__body div.js-novel-text.p-novel__text")
        for candidate in candidates:
            if len(candidate.text) > self._MIN_CHAPTER_TEXT_LENGTH:
                return candidate
        return None

    def _addChapterTitle(self, soup: BeautifulSoup, content: BeautifulSoup | NavigableString, infoDict: dict) -> str:
        """Fill ``infoDict`` with page metadata, then defer to the base class.

        Sets ``infoDict["chapterTitle"]``, ``infoDict["seriesTitle"]`` and
        ``infoDict["author"]`` from the page before calling the parent
        implementation.

        Args:
            soup: Parsed chapter page.
            content: Chapter content, passed through to the parent unchanged.
            infoDict: Metadata dictionary; mutated in place.

        Returns:
            Whatever ``WebScrapper._addChapterTitle`` returns.

        Raises:
            AttributeError: If the chapter title heading is not found.
            IndexError: If the ``.c-announce`` links are not found.
        """
        heading = soup.select_one("h1.p-novel__title.p-novel__title--rensai")
        # get_text(strip=True) already strips every text fragment, so no
        # additional .strip() is required on the result.
        infoDict["chapterTitle"] = heading.get_text(strip=True)

        # Series title: first link of the announce bar, with any
        # "[...]" bracketed segment removed. The pattern is a raw string so
        # that "\[" is a regex escape and not an invalid string escape.
        seriesLink = soup.select(".c-announce > a:first-of-type")[0]
        infoDict["seriesTitle"] = re.sub(r"\[.*?\]", "", seriesLink.text).strip()

        # Author: second link of the announce bar.
        authorLink = soup.select(".c-announce > a:nth-of-type(2)")[0]
        infoDict["author"] = authorLink.text.strip()

        return super()._addChapterTitle(soup, content, infoDict)

    def _getChapterLinks(self, fromChapter: int, toChapter: int, uriWithFormat: str | None = None, soup: BeautifulSoup | None = None):
        """Yield the URL of every chapter from ``fromChapter`` to ``toChapter``.

        Args:
            fromChapter: First chapter number (inclusive).
            toChapter: Last chapter number (inclusive). Nothing is yielded
                when it is smaller than ``fromChapter``.
            uriWithFormat: ``str.format`` template that receives the chapter
                number; defaults to ``"{}/"``.
            soup: Not used by this implementation.

        Yields:
            ``urljoin(self.baseLink, uriWithFormat.format(n))`` for each
            chapter number ``n`` in the range. ``urljoin`` resolves the
            template relative to ``self.baseLink``, so a base link without a
            trailing slash has its last path segment replaced.
        """
        if uriWithFormat is None:
            uriWithFormat = "{}/"

        for chapterNumber in range(fromChapter, toChapter + 1):
            yield urljoin(self.baseLink, uriWithFormat.format(chapterNumber))
|
||||
Reference in New Issue
Block a user