Scrapper changes

This commit is contained in:
2026-05-21 10:31:19 +02:00
parent 54c7b40737
commit 6c09053ff0
7 changed files with 185 additions and 40 deletions
+1 -1
View File
@@ -1,4 +1,4 @@
.idea
test*.py
*test*.py
test.*
*.log
+12
View File
@@ -0,0 +1,12 @@
# Manual smoke test: print every chapter link the scraper discovers on an
# index page. Needs network access; not an automated test.
# The package is named ``src`` (every scraper module imports from it) and the
# Fenrirealm class is ``FenrirealmWebScrapper``; the previous spellings made
# this script fail with ImportError before doing anything.
from src.WebScrapper.OniichanyameteWebScrapper import OniichanyameteWebScrapper
from src.WebScrapper.FenrirealmWebScrapper import FenrirealmWebScrapper
from src.WebScrapper.WebScrapper import WebScrapper
from src.WebScrapper.FanmtlWebScrapper import FanmtlWebScrapper
from src.WebScrapper.SyosetuWebScrapper import SyosetuWebScrapper

scrapper = WebScrapper.Get(r"https://oniichanyamete.moe/index/bunnygirl/", "html", "en")
# toChapter=None is passed through unchanged to the scraper's link generator.
for chapterNumber, link in scrapper._getChapterLinks(1, None):
    print(chapterNumber, link)
+45
View File
@@ -0,0 +1,45 @@
import re
import time
from urllib.parse import urljoin
from bs4 import BeautifulSoup, NavigableString
from src.WebScrapper.WebScrapper import WebScrapper
class FanmtlWebScrapper(WebScrapper):
    """Scraper specialisation for chapters hosted on fanmtl.com."""

    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
        super().__init__(baseLink, htmlFolderPath, currentLanguage)

    def _getChapterContent(self, soup:BeautifulSoup):
        """Return the ``div.chapter-content`` element, or None if the page has none."""
        return soup.find("div", {"class": "chapter-content"})

    def _removeUnwantedThinsFromHtml(self, soup:BeautifulSoup) -> None:
        """Remove centred advertisement blocks, then run the generic cleanup.

        A ``<div align="center">`` is treated as an advertisement only when it
        contains a ``<script>`` element.
        """
        if not soup:
            return
        # The parameter is named ``soup``; the old body read an undefined
        # ``content`` and raised NameError on every call.
        for div in soup.find_all('div', {'align': 'center'}):
            if div.find('script'):
                div.decompose()
        super()._removeUnwantedThinsFromHtml(soup)

    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
        """Fill series/chapter/author metadata from the page and prepend the title.

        Raises IndexError when the expected ``div.titles`` headings are absent.
        """
        infoDict["seriesTitle"] = soup.select("div.titles h1 a")[0].text.strip()
        infoDict["chapterTitle"] = soup.select("div.titles h2")[0].text.strip()
        infoDict["author"] = ""
        # The parent takes (soup, content, infoDict); the chapter number is
        # carried inside infoDict rather than passed separately.
        return super()._addChapterTitle(soup, content, infoDict)

    def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
        """Yield ``(chapterNumber, url)`` for each chapter in the inclusive range.

        uriWithFormat: the chapter number is inserted into its ``{}`` placeholder
        and the result is resolved against ``self.baseLink``.

        Raises Exception (on first iteration) when uriWithFormat is missing.
        """
        if uriWithFormat is None:
            raise Exception("uriWithFormat must be provided for Fanmtl.")

        # run() unpacks each item as (chapterNumber, link), so yield pairs.
        for chapterNumber in range(fromChapter, toChapter + 1):
            yield chapterNumber, urljoin(self.baseLink, uriWithFormat.format(chapterNumber))
+24
View File
@@ -0,0 +1,24 @@
import re
import time
from urllib.parse import urljoin
from bs4 import BeautifulSoup, NavigableString
from src.WebScrapper.WebScrapper import WebScrapper
class FenrirealmWebScrapper(WebScrapper):
    """Scraper specialisation for chapters hosted on fenrirealm.com."""

    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
        super().__init__(baseLink, htmlFolderPath, currentLanguage)

    def _getChapterContent(self, soup:BeautifulSoup) -> str:
        """Return the first content area inside the chapter view.

        Raises IndexError when the selector matches nothing.
        """
        contentAreas = soup.select("div.chapter-view > div.content-area")
        return contentAreas[0]

    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
        """Record chapter/series/author metadata, then delegate to the parent."""
        headingTag = soup.select_one(".chapter-view > div > h2")
        infoDict["chapterTitle"] = headingTag.get_text(strip=True)

        # The series name is the part of the page <title> before the first " - ".
        pageTitle = soup.select_one("title").get_text(strip=True)
        seriesPart = pageTitle.split(" - ")[0]
        infoDict["seriesTitle"] = seriesPart.strip()

        infoDict["author"] = "unknown"
        return super()._addChapterTitle(soup, content, infoDict)
@@ -0,0 +1,37 @@
import re
import time
from urllib.parse import urljoin
from bs4 import BeautifulSoup, NavigableString
from src.WebScrapper.WebScrapper import WebScrapper
class OniichanyameteWebScrapper(WebScrapper):
    """Scraper specialisation for series indexed on oniichanyamete.moe."""

    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
        super().__init__(baseLink, htmlFolderPath, currentLanguage)

    def _getChapterContent(self, soup:BeautifulSoup):
        """Return the ``div.chapter-content`` element, or None if absent.

        ``select()`` takes a CSS selector and treats a second positional
        argument as a namespace map, so the old call returned every ``<div>``
        on the page. ``find()`` is the API that accepts an attribute dict.
        """
        # NOTE(review): the class name is kept from the original code; confirm
        # it matches the markup this site actually serves.
        return soup.find("div", {"class": "chapter-content"})

    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
        """Record placeholder metadata and prepend a generic chapter heading."""
        infoDict["seriesTitle"] = ""
        # No title is extracted from the page, so use a numbered heading
        # instead of an empty <h1>.
        infoDict["chapterTitle"] = f"Chapter {infoDict['chapter']}"
        infoDict["author"] = ""
        return super()._addChapterTitle(soup, content, infoDict)

    def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
        """Yield ``(chapterNumber, url)`` pairs read from the series index page.

        fromChapter / toChapter: inclusive bounds on the chapter number; None
        leaves that side unbounded.
        uriWithFormat: unused here, links come from the index page itself.
        soup: an already parsed index page; when None the page at
        ``self.baseLink`` is downloaded.
        """
        if soup is None:
            # Imported here because this module does not import requests at
            # the top, which left the name undefined.
            import requests

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
            }
            response = requests.get(self.baseLink, headers=headers, timeout=30)
            soup = BeautifulSoup(response.content, "html.parser")

        for link in soup.select("p[style*='padding-left:60px'] > a"):
            numberMatch = re.search(r'\d+', link.text)
            if numberMatch is None:
                # Link text without a number cannot be mapped to a chapter.
                continue
            chapterNumber = int(numberMatch.group())
            if fromChapter is not None and chapterNumber < fromChapter:
                continue
            if toChapter is not None and chapterNumber > toChapter:
                continue
            yield chapterNumber, urljoin(self.baseLink, link["href"])
+38
View File
@@ -0,0 +1,38 @@
import re
import time
from urllib.parse import urljoin
from bs4 import BeautifulSoup, NavigableString
from src.WebScrapper.WebScrapper import WebScrapper
class SyosetuWebScrapper(WebScrapper):
    """Scraper specialisation for chapters hosted on syosetu.com."""

    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
        super().__init__(baseLink, htmlFolderPath, currentLanguage)

    def _getChapterContent(self, soup:BeautifulSoup):
        """Return the first novel-text block longer than 1000 characters.

        Returns None when no block is that long.
        """
        candidates = soup.select("div.p-novel__body div.js-novel-text.p-novel__text")
        return next((block for block in candidates if len(block.text) > 1000), None)

    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
        """Fill chapter/series/author metadata from the page and prepend the title."""
        infoDict["chapterTitle"] = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True)
        # Raw string: "\[" in a normal string literal is an invalid escape.
        # The pattern removes bracketed tags from the series link text.
        infoDict["seriesTitle"] = re.sub(r"\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip()
        infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
        return super()._addChapterTitle(soup, content, infoDict)

    def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
        """Yield ``(chapterNumber, url)`` for each chapter in the inclusive range.

        uriWithFormat: the chapter number is inserted into its ``{}``
        placeholder; defaults to ``"{}/"``.
        """
        if uriWithFormat is None:
            uriWithFormat = "{}/"

        # run() unpacks each item as (chapterNumber, link), so yield pairs.
        for chapterNumber in range(fromChapter, toChapter + 1):
            yield chapterNumber, urljoin(self.baseLink, uriWithFormat.format(chapterNumber))
@@ -8,9 +8,6 @@ from bs4 import BeautifulSoup, NavigableString
from src.functions import writeToFile, makeDir, writeToJsonFile
# https://ncode.syosetu.com/n0806fu
class WebScrapper:
@@ -20,6 +17,19 @@ class WebScrapper:
self.currentLanguage = currentLanguage
makeDir(self.htmlFolderPath)
@staticmethod
def Get(baseLink:str, htmlFolderPath:str, currentLanguage:str) -> self:
if "fanmtl.com" in baseLink:
return FanmtlWebScrapper(baseLink, htmlFolderPath, currentLanguage)
elif "syosetu.com" in baseLink:
return SyosetuWebScrapper(baseLink, htmlFolderPath, currentLanguage)
elif "fenrirealm.com" in baseLink:
return FenrirealmWebScrapper(baseLink, htmlFolderPath, currentLanguage)
elif "oniichanyamete.moe" in baseLink:
return OniichanyameteWebScrapper(baseLink, htmlFolderPath, currentLanguage)
else:
raise Exception(f"Website {baseLink} is not supported for scraping yet.")
@staticmethod
def _sanitizeFilename(filename: str) -> str:
@@ -52,20 +62,19 @@ class WebScrapper:
return filename
def getHtml(self, uriWithFormat:str, fromChapter:int, toChapter:int, sleepTime:float=0):
def run(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, sleepTime:float=0):
"""
uriWithFormat: inserts the current chapter number into the {} brackets
"""
for chapterNumber in range(fromChapter, toChapter + 1):
for chapterNumber, link in self._getChapterLinks(fromChapter, toChapter, uriWithFormat):
time.sleep(sleepTime) # to avoid getting blocked by the website for sending too many requests in a short time
print(link)
infoDict = {
"chapter": chapterNumber,
"originalLanguage": self.currentLanguage,
"currentLanguage": self.currentLanguage,
}
link = urljoin(self.baseLink, uriWithFormat.format(chapterNumber))
print(link)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
#'Referer': 'https://ncode.syosetu.com/',
@@ -83,8 +92,7 @@ class WebScrapper:
continue
self._removeUnwantedThinsFromHtml(chapterContent)
chapterTitle = self._addChapterTitle(soup, chapterContent, chapterNumber, infoDict)
infoDict["chapterTitle"] = chapterTitle
self._addChapterTitle(soup, chapterContent, infoDict)
# Dateinamen bereinigen
safe_filename = self._sanitizeFilename(chapterTitle)
@@ -112,44 +120,25 @@ class WebScrapper:
return chapterContent
def _removeUnwantedThinsFromHtml(self, soup:BeautifulSoup|NavigableString):
    """Strip every ``<script>`` element from *soup* in place.

    Does nothing when *soup* is None or empty. Site-specific cleanup belongs
    in the subclass overrides, which call this method afterwards.
    """
    if not soup:
        return

    # general
    for script in soup.find_all('script'):
        script.decompose()
def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict):
    """Prepend an ``<h1>`` heading and two line breaks to *content*.

    The heading text is ``infoDict["chapterTitle"]``. When that key is missing
    or empty it falls back to ``"Chapter <n>"``, built from
    ``infoDict["chapter"]``. The text actually used is written back to
    ``infoDict["chapterTitle"]`` and returned.

    Site-specific metadata extraction belongs in the subclass overrides, which
    fill *infoDict* before delegating here.
    """
    chapterTitle = infoDict.get("chapterTitle") or f"Chapter {infoDict['chapter']}"
    # Write the title back so callers that read infoDict see the fallback too.
    infoDict["chapterTitle"] = chapterTitle

    titleElement = soup.new_tag("h1")
    titleElement.string = chapterTitle
    content.insert(0, titleElement)
    content.insert(1, soup.new_tag("br"))
    content.insert(2, soup.new_tag("br"))

    return chapterTitle
def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
    """Yield ``(chapterNumber, url)`` for each chapter in the inclusive range.

    uriWithFormat: the chapter number is inserted into its ``{}`` placeholder
    and the result is resolved against ``self.baseLink``. When omitted, the
    bare chapter number is used as the relative path.
    soup: unused by this default implementation.
    """
    if uriWithFormat is None:
        uriWithFormat = "{}"

    # urljoin() needs a string; passing the int chapter number raised TypeError.
    for chapterNumber in range(fromChapter, toChapter + 1):
        yield chapterNumber, urljoin(self.baseLink, uriWithFormat.format(chapterNumber))