This commit is contained in:
2026-05-21 10:53:56 +02:00
parent 6c09053ff0
commit 2bee0bc362
3 changed files with 28 additions and 24 deletions
+3 -7
View File
@@ -1,12 +1,8 @@
from scr.WebScrapper.OniichanyameteWebScrapper import OniichanyameteWebScrapper from src.WebScrapper.OniichanyameteWebScrapper import OniichanyameteWebScrapper
from scr.WebScrapper.FenrirealmWebScrapper import FenrirealmWeb from src.WebScrapper.WebScrapper import WebScrapper
from scr.WebScrapper.WebScrapper import WebScrapper
from scr.WebScrapper.FanmtlWebScrapper import FanmtlWebScrapper
from scr.WebScrapper.SyosetuWebScrapper import SyosetuWebScrapper
scrapper = OniichanyameteWebScrapper(r"https://oniichanyamete.moe/index/bunnygirl/", "html", "en")
scrapper = WebScrapper.Get(r"https://oniichanyamete.moe/index/bunnygirl/", "html", "en")
for chapterNumber, link in scrapper._getChapterLinks(1, None): for chapterNumber, link in scrapper._getChapterLinks(1, None):
print(chapterNumber, link) print(chapterNumber, link)
+7 -4
View File
@@ -1,6 +1,9 @@
import re import re
import time import time
from urllib.parse import urljoin from urllib.parse import urljoin
import requests
from pprint import pprint
from bs4 import BeautifulSoup, NavigableString from bs4 import BeautifulSoup, NavigableString
from src.WebScrapper.WebScrapper import WebScrapper from src.WebScrapper.WebScrapper import WebScrapper
@@ -13,7 +16,7 @@ class OniichanyameteWebScrapper(WebScrapper):
def _getChapterContent(self, soup:BeautifulSoup) -> str: def _getChapterContent(self, soup:BeautifulSoup) -> str:
return soup.select("div", {"class": "chapter-content"}) return soup.select("div", {"class": "chapter-content"})[0].prettify()
def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str: def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
@@ -21,15 +24,15 @@ class OniichanyameteWebScrapper(WebScrapper):
infoDict["chapterTitle"] = "" infoDict["chapterTitle"] = ""
infoDict["author"] = "" infoDict["author"] = ""
return super()._addChapterTitle(soup, content, chapterNumber, infoDict) return super()._addChapterTitle(soup, content, infoDict)
def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None): def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
soup = BeautifulSoup(requests.get(self.baseLink, headers=headers).content, "html.parser") soup = BeautifulSoup(requests.get(self.baseLink).content, "html.parser")
links =soup.select("p[style*='padding-left:60px'] > a") links =soup.select("p[style*='padding-left:60px'] > a")
pprint (links) pprint(links)
for link in links: for link in links:
+18 -13
View File
@@ -5,6 +5,11 @@ from urllib.parse import urljoin
from pprint import pprint from pprint import pprint
import requests import requests
from bs4 import BeautifulSoup, NavigableString from bs4 import BeautifulSoup, NavigableString
# from src.WebScrapper.FanmtlWebScrapper import FanmtlWebScrapper
# from src.WebScrapper.FenrirealmWebScrapper import FenrirealmWebScrapper
# from src.WebScrapper.OniichanyameteWebScrapper import OniichanyameteWebScrapper
# from src.WebScrapper.SyosetuWebScrapper import SyosetuWebScrapper
from src.functions import writeToFile, makeDir, writeToJsonFile from src.functions import writeToFile, makeDir, writeToJsonFile
@@ -17,18 +22,18 @@ class WebScrapper:
self.currentLanguage = currentLanguage self.currentLanguage = currentLanguage
makeDir(self.htmlFolderPath) makeDir(self.htmlFolderPath)
@staticmethod # @staticmethod
def Get(baseLink:str, htmlFolderPath:str, currentLanguage:str) -> self: # def Get(baseLink:str, htmlFolderPath:str, currentLanguage:str):
if "fanmtl.com" in baseLink: # if "fanmtl.com" in baseLink:
return FanmtlWebScrapper(baseLink, htmlFolderPath, currentLanguage) # return FanmtlWebScrapper(baseLink, htmlFolderPath, currentLanguage)
elif "syosetu.com" in baseLink: # elif "syosetu.com" in baseLink:
return SyosetuWebScrapper(baseLink, htmlFolderPath, currentLanguage) # return SyosetuWebScrapper(baseLink, htmlFolderPath, currentLanguage)
elif "fenrirealm.com" in baseLink: # elif "fenrirealm.com" in baseLink:
return FenrirealmWebScrapper(baseLink, htmlFolderPath, currentLanguage) # return FenrirealmWebScrapper(baseLink, htmlFolderPath, currentLanguage)
elif "oniichanyamete.moe" in baseLink: # elif "oniichanyamete.moe" in baseLink:
return OniichanyameteWebScrapper(baseLink, htmlFolderPath, currentLanguage) # return OniichanyameteWebScrapper(baseLink, htmlFolderPath, currentLanguage)
else: # else:
raise Exception(f"Website {baseLink} is not supported for scraping yet.") # raise Exception(f"Website {baseLink} is not supported for scraping yet.")
@staticmethod @staticmethod
@@ -138,7 +143,7 @@ class WebScrapper:
currentChapter = fromChapter currentChapter = fromChapter
while currentChapter <= toChapter: while currentChapter <= toChapter:
yield currentChapter, urljoin(self.baseLink, currentChapter) yield currentChapter, urljoin(self.baseLink, str(currentChapter))
currentChapter += 1 currentChapter += 1