Scrapper changes
This commit is contained in:
@@ -0,0 +1,45 @@
|
||||
import re
|
||||
import time
|
||||
from urllib.parse import urljoin
|
||||
from bs4 import BeautifulSoup, NavigableString
|
||||
from src.WebScrapper.WebScrapper import WebScrapper
|
||||
|
||||
|
||||
class FanmtlWebScrapper(WebScrapper):
    """Scraper specialisation for chapters served from fanmtl.com."""

    def __init__(self, baseLink: str, htmlFolderPath: str, currentLanguage: str):
        super().__init__(baseLink, htmlFolderPath, currentLanguage)

    def _getChapterContent(self, soup: BeautifulSoup):
        """Return the ``<div class="chapter-content">`` element of the page.

        Returns ``None`` when the page has no such element (``find`` semantics).
        """
        return soup.find("div", {"class": "chapter-content"})

    def _removeUnwantedThinsFromHtml(self, soup: BeautifulSoup) -> BeautifulSoup:
        """Remove FanMTL advertisement containers, then run the generic clean-up.

        Args:
            soup: The chapter content element to clean in place.

        Returns:
            The same ``soup`` object that was passed in.
        """
        if soup is None:
            return soup

        # Fixed: this loop used an undefined name ``content`` instead of the
        # ``soup`` parameter and raised NameError on every call.
        for div in soup.find_all("div", {"align": "center"}):
            # Only centred divs that wrap a <script> are treated as ads.
            if div.find("script"):
                div.decompose()

        super()._removeUnwantedThinsFromHtml(soup)
        return soup

    def _addChapterTitle(self, soup: BeautifulSoup, content: BeautifulSoup | NavigableString, infoDict: dict) -> str:
        """Fill ``infoDict`` with title metadata and delegate to the base class.

        Sets ``seriesTitle`` and ``chapterTitle`` from the ``div.titles``
        header and sets ``author`` to an empty string.

        Raises:
            IndexError: If the expected ``div.titles`` elements are missing.
        """
        infoDict["seriesTitle"] = soup.select("div.titles h1 a")[0].text.strip()
        infoDict["chapterTitle"] = soup.select("div.titles h2")[0].text.strip()
        infoDict["author"] = ""

        # Fixed: the call passed an undefined ``chapterNumber`` as an extra
        # argument; the sibling scrapers call the base with three arguments.
        return super()._addChapterTitle(soup, content, infoDict)

    def _getChapterLinks(self, fromChapter: int, toChapter: int, uriWithFormat: str = None, soup: BeautifulSoup = None):
        """Yield ``(chapterNumber, url)`` for every chapter in the inclusive range.

        Args:
            fromChapter: First chapter number.
            toChapter: Last chapter number (inclusive).
            uriWithFormat: Relative URI template; ``{}`` receives the chapter number.
            soup: Unused by this scraper.

        Raises:
            ValueError: If ``uriWithFormat`` is not provided.
        """
        if uriWithFormat is None:
            raise ValueError("uriWithFormat must be provided for Fanmtl.")

        for chapterNumber in range(fromChapter, toChapter + 1):
            # Yield a pair: the consuming loop unpacks ``chapterNumber, link``.
            yield chapterNumber, urljoin(self.baseLink, uriWithFormat.format(chapterNumber))
|
||||
@@ -0,0 +1,24 @@
|
||||
import re
|
||||
import time
|
||||
from urllib.parse import urljoin
|
||||
from bs4 import BeautifulSoup, NavigableString
|
||||
from src.WebScrapper.WebScrapper import WebScrapper
|
||||
|
||||
|
||||
class FenrirealmWebScrapper(WebScrapper):
    """Scraper specialisation for chapters served from fenrirealm.com."""

    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
        super().__init__(baseLink, htmlFolderPath, currentLanguage)

    def _getChapterContent(self, soup:BeautifulSoup) -> str:
        """Return the first ``div.content-area`` directly under ``div.chapter-view``.

        Raises IndexError when the selector matches nothing.
        """
        contentAreas = soup.select("div.chapter-view > div.content-area")
        return contentAreas[0]

    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
        """Populate ``infoDict`` with title metadata, then defer to the base class.

        The chapter title comes from the chapter view's ``<h2>``; the series
        title is the part of the page ``<title>`` before the first " - ".
        The author is recorded as "unknown".
        """
        headingTag = soup.select_one(".chapter-view > div > h2")
        infoDict["chapterTitle"] = headingTag.get_text(strip=True)

        pageTitle = soup.select_one("title").get_text(strip=True)
        seriesPart = pageTitle.split(" - ")[0]
        infoDict["seriesTitle"] = seriesPart.strip()

        infoDict["author"] = "unknown"
        return super()._addChapterTitle(soup, content, infoDict)
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
import re
|
||||
import time
|
||||
from urllib.parse import urljoin
|
||||
from bs4 import BeautifulSoup, NavigableString
|
||||
from src.WebScrapper.WebScrapper import WebScrapper
|
||||
|
||||
|
||||
class OniichanyameteWebScrapper(WebScrapper):
    """Scraper specialisation for chapters served from oniichanyamete.moe."""

    def __init__(self, baseLink: str, htmlFolderPath: str, currentLanguage: str):
        super().__init__(baseLink, htmlFolderPath, currentLanguage)

    def _getChapterContent(self, soup: BeautifulSoup):
        """Return the ``<div class="chapter-content">`` element, or ``None``.

        Fixed: the original called ``soup.select("div", {...})``. ``select``
        takes a CSS selector, so the dict was not an attribute filter and the
        call returned a list of every ``<div>`` instead of a single element.
        """
        # NOTE(review): the class name is taken from the original code; confirm
        # it matches the markup this site actually uses.
        return soup.find("div", {"class": "chapter-content"})

    def _addChapterTitle(self, soup: BeautifulSoup, content: BeautifulSoup | NavigableString, infoDict: dict) -> str:
        """Record placeholder metadata and delegate to the base class.

        TODO: ``seriesTitle``, ``chapterTitle`` and ``author`` are still empty
        placeholders; extract them from the page once selectors are known.
        """
        infoDict["seriesTitle"] = ""
        infoDict["chapterTitle"] = ""
        infoDict["author"] = ""

        # Fixed: the call passed an undefined ``chapterNumber`` as an extra
        # argument; the sibling scrapers call the base with three arguments.
        return super()._addChapterTitle(soup, content, infoDict)

    def _getChapterLinks(self, fromChapter: int, toChapter: int, uriWithFormat: str = None, soup: BeautifulSoup = None):
        """Yield ``(chapterNumber, url)`` pairs read from the index page.

        Args:
            fromChapter: Lowest chapter number to yield (inclusive).
            toChapter: Highest chapter number to yield (inclusive).
            uriWithFormat: Unused by this scraper; links come from the index page.
            soup: Optional pre-parsed index page. When omitted, ``self.baseLink``
                is downloaded and parsed.

        Links whose text contains no digits, or that have no ``href``, are skipped.
        """
        if soup is None:
            # Local import: ``requests`` is used here but is not among the
            # imports visible at the top of this file.
            import requests

            # Fixed: ``headers`` was an undefined name. The User-Agent value
            # mirrors the one used for chapter downloads elsewhere in the project.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
            }
            response = requests.get(self.baseLink, headers=headers, timeout=30)
            soup = BeautifulSoup(response.content, "html.parser")

        # The leftover debug call to ``pprint`` (an undefined name) was removed.
        for link in soup.select("p[style*='padding-left:60px'] > a"):
            numberMatch = re.search(r'\d+', link.text)
            href = link.get("href")
            if numberMatch is None or not href:
                continue

            # Yield an int so the chapter number has the same type as in the
            # range-based scrapers, and honour the requested chapter range.
            chapterNumber = int(numberMatch.group())
            if fromChapter <= chapterNumber <= toChapter:
                yield chapterNumber, urljoin(self.baseLink, href)
|
||||
@@ -0,0 +1,38 @@
|
||||
import re
|
||||
import time
|
||||
from urllib.parse import urljoin
|
||||
from bs4 import BeautifulSoup, NavigableString
|
||||
from src.WebScrapper.WebScrapper import WebScrapper
|
||||
|
||||
|
||||
class SyosetuWebScrapper(WebScrapper):
    """Scraper specialisation for chapters served from syosetu.com."""

    def __init__(self, baseLink: str, htmlFolderPath: str, currentLanguage: str):
        super().__init__(baseLink, htmlFolderPath, currentLanguage)

    def _getChapterContent(self, soup: BeautifulSoup):
        """Return the first novel text block longer than 1000 characters.

        Returns ``None`` when no candidate block exceeds that length.
        """
        candidates = soup.select("div.p-novel__body div.js-novel-text.p-novel__text")
        for block in candidates:
            # NOTE(review): the 1000-character threshold presumably filters out
            # short non-body blocks; confirm it does not drop short chapters.
            if len(block.text) > 1000:
                return block
        return None

    def _addChapterTitle(self, soup: BeautifulSoup, content: BeautifulSoup | NavigableString, infoDict: dict) -> str:
        """Fill ``infoDict`` with title/author metadata and delegate to the base class.

        The series title is the first link in ``.c-announce`` with any
        ``[...]`` segments removed; the author is the second link.

        Raises:
            AttributeError: If the chapter title heading is missing.
            IndexError: If the ``.c-announce`` links are missing.
        """
        infoDict["chapterTitle"] = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True)

        # Raw string: "\[" in a normal string literal is an invalid escape
        # sequence and triggers a warning on current Python versions.
        announcedTitle = soup.select(".c-announce > a:first-of-type")[0].text
        infoDict["seriesTitle"] = re.sub(r"\[.*?\]", "", announcedTitle).strip()

        infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
        return super()._addChapterTitle(soup, content, infoDict)

    def _getChapterLinks(self, fromChapter: int, toChapter: int, uriWithFormat: str = None, soup: BeautifulSoup = None):
        """Yield ``(chapterNumber, url)`` for every chapter in the inclusive range.

        Args:
            fromChapter: First chapter number.
            toChapter: Last chapter number (inclusive).
            uriWithFormat: Relative URI template; defaults to ``"{}/"``.
            soup: Unused by this scraper.
        """
        if uriWithFormat is None:
            uriWithFormat = "{}/"

        for chapterNumber in range(fromChapter, toChapter + 1):
            # Yield a pair: the consuming loop unpacks ``chapterNumber, link``.
            yield chapterNumber, urljoin(self.baseLink, uriWithFormat.format(chapterNumber))
|
||||
@@ -8,9 +8,6 @@ from bs4 import BeautifulSoup, NavigableString
|
||||
from src.functions import writeToFile, makeDir, writeToJsonFile
|
||||
|
||||
|
||||
# https://ncode.syosetu.com/n0806fu
|
||||
|
||||
|
||||
|
||||
class WebScrapper:
|
||||
|
||||
@@ -20,6 +17,19 @@ class WebScrapper:
|
||||
self.currentLanguage = currentLanguage
|
||||
makeDir(self.htmlFolderPath)
|
||||
|
||||
@staticmethod
|
||||
def Get(baseLink:str, htmlFolderPath:str, currentLanguage:str) -> self:
|
||||
if "fanmtl.com" in baseLink:
|
||||
return FanmtlWebScrapper(baseLink, htmlFolderPath, currentLanguage)
|
||||
elif "syosetu.com" in baseLink:
|
||||
return SyosetuWebScrapper(baseLink, htmlFolderPath, currentLanguage)
|
||||
elif "fenrirealm.com" in baseLink:
|
||||
return FenrirealmWebScrapper(baseLink, htmlFolderPath, currentLanguage)
|
||||
elif "oniichanyamete.moe" in baseLink:
|
||||
return OniichanyameteWebScrapper(baseLink, htmlFolderPath, currentLanguage)
|
||||
else:
|
||||
raise Exception(f"Website {baseLink} is not supported for scraping yet.")
|
||||
|
||||
|
||||
@staticmethod
|
||||
def _sanitizeFilename(filename: str) -> str:
|
||||
@@ -52,20 +62,19 @@ class WebScrapper:
|
||||
return filename
|
||||
|
||||
|
||||
def getHtml(self, uriWithFormat:str, fromChapter:int, toChapter:int, sleepTime:float=0):
|
||||
def run(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, sleepTime:float=0):
|
||||
"""
|
||||
uriWithFormat: inserts the current chapter number into the {} brackets
|
||||
"""
|
||||
for chapterNumber in range(fromChapter, toChapter + 1):
|
||||
for chapterNumber, link in self._getChapterLinks(fromChapter, toChapter, uriWithFormat):
|
||||
time.sleep(sleepTime) # to avoid getting blocked by the website for sending too many requests in a short time
|
||||
print(link)
|
||||
|
||||
infoDict = {
|
||||
"chapter": chapterNumber,
|
||||
"originalLanguage": self.currentLanguage,
|
||||
"currentLanguage": self.currentLanguage,
|
||||
}
|
||||
link = urljoin(self.baseLink, uriWithFormat.format(chapterNumber))
|
||||
print(link)
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
|
||||
#'Referer': 'https://ncode.syosetu.com/',
|
||||
@@ -83,8 +92,7 @@ class WebScrapper:
|
||||
continue
|
||||
|
||||
self._removeUnwantedThinsFromHtml(chapterContent)
|
||||
chapterTitle = self._addChapterTitle(soup, chapterContent, chapterNumber, infoDict)
|
||||
infoDict["chapterTitle"] = chapterTitle
|
||||
self._addChapterTitle(soup, chapterContent, infoDict)
|
||||
|
||||
# Sanitize the file name
|
||||
safe_filename = self._sanitizeFilename(chapterTitle)
|
||||
@@ -112,44 +120,25 @@ class WebScrapper:
|
||||
return chapterContent
|
||||
|
||||
|
||||
def _removeUnwantedThinsFromHtml(self, content:BeautifulSoup|NavigableString):
|
||||
if not content:
|
||||
return
|
||||
|
||||
# FanMTL advertisements
|
||||
if "fanmtl.com" in self.baseLink:
|
||||
for div in content.find_all('div', {'align': 'center'}):
|
||||
if div.find('script'):
|
||||
div.decompose()
|
||||
|
||||
def _removeUnwantedThinsFromHtml(self, soup:BeautifulSoup|NavigableString):
|
||||
#general
|
||||
for script in content.find_all('script'):
|
||||
for script in soup.find_all('script'):
|
||||
script.decompose()
|
||||
|
||||
|
||||
def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, chapterNumber, infoDict:dict):
|
||||
chapterTitle = f"Chapter {chapterNumber}"
|
||||
if "fanmtl.com" in self.baseLink:
|
||||
infoDict["seriesTitle"] = soup.select("div.titles h1 a")[0].text.strip()
|
||||
chapterTitle = soup.select("div.titles h2")[0].text.strip()
|
||||
infoDict["chapterTitle"] = chapterTitle
|
||||
infoDict["author"] = ""
|
||||
elif "syosetu.com" in self.baseLink:
|
||||
chapterTitle = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True).strip()
|
||||
infoDict["seriesTitle"] = re.sub("\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip()
|
||||
infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
|
||||
elif "fenrirealm.com" in self.baseLink:
|
||||
chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True)
|
||||
infoDict["seriesTitle"] = soup.select_one("title").get_text(strip=True).split(" - ")[0].strip()
|
||||
infoDict["author"] = "unknown"
|
||||
|
||||
def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict):
|
||||
titleElement = soup.new_tag("h1")
|
||||
titleElement.string = chapterTitle
|
||||
titleElement.string = infoDict["chapterTitle"] if "chapterTitle" in infoDict else f"Chapter {infoDict['chapter']}"
|
||||
content.insert(0, titleElement)
|
||||
content.insert(1, soup.new_tag("br"))
|
||||
content.insert(2, soup.new_tag("br"))
|
||||
|
||||
return chapterTitle
|
||||
|
||||
|
||||
def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
|
||||
currentChapter = fromChapter
|
||||
|
||||
while currentChapter <= toChapter:
|
||||
yield currentChapter, urljoin(self.baseLink, currentChapter)
|
||||
currentChapter += 1
|
||||
|
||||
|
||||
Reference in New Issue
Block a user