diff --git a/.gitignore b/.gitignore
index e6d450b..3aae54c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
 .idea
-test*.py
+*test*.py
 test.*
 *.log
\ No newline at end of file
diff --git a/scrapperTest.py b/scrapperTest.py
new file mode 100644
index 0000000..af324ec
--- /dev/null
+++ b/scrapperTest.py
@@ -0,0 +1,12 @@
+from src.WebScrapper.OniichanyameteWebScrapper import OniichanyameteWebScrapper
+from src.WebScrapper.FenrirealmWebScrapper import FenrirealmWebScrapper
+from src.WebScrapper.WebScrapper import WebScrapper
+from src.WebScrapper.FanmtlWebScrapper import FanmtlWebScrapper
+from src.WebScrapper.SyosetuWebScrapper import SyosetuWebScrapper
+
+
+
+scrapper = WebScrapper.Get(r"https://oniichanyamete.moe/index/bunnygirl/", "html", "en")
+for chapterNumber, link in scrapper._getChapterLinks(1, None):
+    print(chapterNumber, link)
+
diff --git a/src/WebScrapper/FanmtlWebScrapper.py b/src/WebScrapper/FanmtlWebScrapper.py
new file mode 100644
index 0000000..0b01ebc
--- /dev/null
+++ b/src/WebScrapper/FanmtlWebScrapper.py
@@ -0,0 +1,41 @@
+import re
+import time
+from urllib.parse import urljoin
+from bs4 import BeautifulSoup, NavigableString
+from src.WebScrapper.WebScrapper import WebScrapper
+
+
+class FanmtlWebScrapper(WebScrapper):
+
+
+    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
+        super().__init__(baseLink, htmlFolderPath, currentLanguage)
+
+
+    def _getChapterContent(self, soup:BeautifulSoup) -> str:
+        return soup.find("div", {"class": "chapter-content"})
+
+
+    def _removeUnwantedThinsFromHtml(self, soup:BeautifulSoup) -> None:
+        # FanMTL advertisements: centered <div> wrappers that contain a <script>
+        for div in soup.find_all('div', {'align': 'center'}):
+            if div.find('script'):
+                div.decompose()
+
+        super()._removeUnwantedThinsFromHtml(soup)
+
+
+    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
+        infoDict["seriesTitle"] = soup.select("div.titles h1 a")[0].text.strip()
+        infoDict["chapterTitle"] = soup.select("div.titles h2")[0].text.strip()
+        infoDict["author"] = ""
+
+        return super()._addChapterTitle(soup, content, infoDict)
+
+
+    def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
+        if uriWithFormat is None:
+            raise Exception("uriWithFormat must be provided for Fanmtl.")
+
+        # The base class yields the (chapterNumber, link) pairs that run() unpacks.
+        yield from super()._getChapterLinks(fromChapter, toChapter, uriWithFormat, soup)
diff --git a/src/WebScrapper/FenrirealmWebScrapper.py b/src/WebScrapper/FenrirealmWebScrapper.py
new file mode 100644
index 0000000..44bc636
--- /dev/null
+++ b/src/WebScrapper/FenrirealmWebScrapper.py
@@ -0,0 +1,24 @@
+import re
+import time
+from urllib.parse import urljoin
+from bs4 import BeautifulSoup, NavigableString
+from src.WebScrapper.WebScrapper import WebScrapper
+
+
+class FenrirealmWebScrapper(WebScrapper):
+
+
+    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
+        super().__init__(baseLink, htmlFolderPath, currentLanguage)
+
+
+    def _getChapterContent(self, soup:BeautifulSoup) -> str:
+        return soup.select("div.chapter-view > div.content-area")[0]
+
+
+    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
+        infoDict["chapterTitle"] = soup.select_one(".chapter-view > div > h2").get_text(strip=True)
+        infoDict["seriesTitle"] = soup.select_one("title").get_text(strip=True).split(" - ")[0].strip()
+        infoDict["author"] = "unknown"
+        return super()._addChapterTitle(soup, content, infoDict)
+
diff --git a/src/WebScrapper/OniichanyameteWebScrapper.py b/src/WebScrapper/OniichanyameteWebScrapper.py
new file mode 100644
index 0000000..1d28b78
--- /dev/null
+++ b/src/WebScrapper/OniichanyameteWebScrapper.py
@@ -0,0 +1,44 @@
+import re
+import time
+from urllib.parse import urljoin
+import requests
+from bs4 import BeautifulSoup, NavigableString
+from src.WebScrapper.WebScrapper import WebScrapper
+
+
+class OniichanyameteWebScrapper(WebScrapper):
+
+
+    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
+        super().__init__(baseLink, htmlFolderPath, currentLanguage)
+
+
+    def _getChapterContent(self, soup:BeautifulSoup) -> str:
+        # NOTE(review): selector is the same one the Fanmtl scrapper uses - confirm it matches this site.
+        return soup.find("div", {"class": "chapter-content"})
+
+
+    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
+        infoDict["seriesTitle"] = ""
+        infoDict["chapterTitle"] = ""
+        infoDict["author"] = ""
+
+        return super()._addChapterTitle(soup, content, infoDict)
+
+
+    def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
+        """Yield (chapterNumber, link) pairs read from the index page at baseLink; toChapter=None means no upper bound."""
+        if soup is None:
+            headers = {"User-Agent": "Mozilla/5.0"}
+            soup = BeautifulSoup(requests.get(self.baseLink, headers=headers).content, "html.parser")
+
+        for link in soup.select("p[style*='padding-left:60px'] > a"):
+            match = re.search(r'\d+', link.text)
+            if match is None:
+                continue  # link text without a chapter number
+
+            chapterNumber = int(match.group())
+            if chapterNumber < fromChapter or (toChapter is not None and chapterNumber > toChapter):
+                continue
+
+            yield chapterNumber, urljoin(self.baseLink, link["href"])
diff --git a/src/WebScrapper/SyosetuWebScrapper.py b/src/WebScrapper/SyosetuWebScrapper.py
new file mode 100644
index 0000000..cdb53fc
--- /dev/null
+++ b/src/WebScrapper/SyosetuWebScrapper.py
@@ -0,0 +1,34 @@
+import re
+import time
+from urllib.parse import urljoin
+from bs4 import BeautifulSoup, NavigableString
+from src.WebScrapper.WebScrapper import WebScrapper
+
+
+class SyosetuWebScrapper(WebScrapper):
+
+
+    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
+        super().__init__(baseLink, htmlFolderPath, currentLanguage)
+
+
+    def _getChapterContent(self, soup:BeautifulSoup) -> str:
+        body = soup.select("div.p-novel__body div.js-novel-text.p-novel__text")
+        for x in body:
+            if len(x.text) > 1000:
+                return x
+
+
+    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
+        infoDict["chapterTitle"] = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True).strip()
+        infoDict["seriesTitle"] = re.sub(r"\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip()
+        infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
+        return super()._addChapterTitle(soup, content, infoDict)
+
+
+    def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
+        if uriWithFormat is None:
+            uriWithFormat = "{}/"
+
+        # The base class yields the (chapterNumber, link) pairs that run() unpacks.
+        yield from super()._getChapterLinks(fromChapter, toChapter, uriWithFormat, soup)
diff --git a/src/WebScrapper.py b/src/WebScrapper/WebScrapper.py
similarity index 68%
rename from src/WebScrapper.py
rename to src/WebScrapper/WebScrapper.py
index c1e827b..226831c 100644
--- a/src/WebScrapper.py
+++ b/src/WebScrapper/WebScrapper.py
@@ -8,9 +8,6 @@ from bs4 import BeautifulSoup, NavigableString
 from src.functions import writeToFile, makeDir, writeToJsonFile
 
 
-# https://ncode.syosetu.com/n0806fu
-
-
 class WebScrapper:
 
 
@@ -20,6 +17,26 @@ class WebScrapper:
         self.currentLanguage = currentLanguage
         makeDir(self.htmlFolderPath)
 
+    @staticmethod
+    def Get(baseLink:str, htmlFolderPath:str, currentLanguage:str) -> "WebScrapper":
+        """Return the scrapper subclass responsible for the website in baseLink."""
+        # Imported lazily: the subclass modules import this module, so a top-level import would be circular.
+        from src.WebScrapper.FanmtlWebScrapper import FanmtlWebScrapper
+        from src.WebScrapper.FenrirealmWebScrapper import FenrirealmWebScrapper
+        from src.WebScrapper.OniichanyameteWebScrapper import OniichanyameteWebScrapper
+        from src.WebScrapper.SyosetuWebScrapper import SyosetuWebScrapper
+
+        if "fanmtl.com" in baseLink:
+            return FanmtlWebScrapper(baseLink, htmlFolderPath, currentLanguage)
+        elif "syosetu.com" in baseLink:
+            return SyosetuWebScrapper(baseLink, htmlFolderPath, currentLanguage)
+        elif "fenrirealm.com" in baseLink:
+            return FenrirealmWebScrapper(baseLink, htmlFolderPath, currentLanguage)
+        elif "oniichanyamete.moe" in baseLink:
+            return OniichanyameteWebScrapper(baseLink, htmlFolderPath, currentLanguage)
+        else:
+            raise Exception(f"Website {baseLink} is not supported for scraping yet.")
+
 
     @staticmethod
     def _sanitizeFilename(filename: str) -> str:
@@ -52,20 +69,19 @@ class WebScrapper:
         return filename
 
 
-    def getHtml(self, uriWithFormat:str, fromChapter:int, toChapter:int, sleepTime:float=0):
+    def run(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, sleepTime:float=0):
         """
         uriWithFormat: inserts the current chapter number into the {} brackets
         """
-        for chapterNumber in range(fromChapter, toChapter + 1):
+        for chapterNumber, link in self._getChapterLinks(fromChapter, toChapter, uriWithFormat):
             time.sleep(sleepTime) # to avoid getting blocked by the website for sending too many requests in a short time
+            print(link)
             infoDict = {
                 "chapter": chapterNumber,
                 "originalLanguage": self.currentLanguage,
                 "currentLanguage": self.currentLanguage,
             }
 
-            link = urljoin(self.baseLink, uriWithFormat.format(chapterNumber))
-            print(link)
             headers = {
                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
                 #'Referer': 'https://ncode.syosetu.com/',
@@ -83,8 +99,7 @@ class WebScrapper:
                 continue
 
             self._removeUnwantedThinsFromHtml(chapterContent)
-            chapterTitle = self._addChapterTitle(soup, chapterContent, chapterNumber, infoDict)
-            infoDict["chapterTitle"] = chapterTitle
+            chapterTitle = self._addChapterTitle(soup, chapterContent, infoDict)
 
-            # Dateinamen bereinigen
+            # Sanitize the file name
             safe_filename = self._sanitizeFilename(chapterTitle)
@@ -112,42 +127,30 @@ class WebScrapper:
         return chapterContent
 
 
-    def _removeUnwantedThinsFromHtml(self, content:BeautifulSoup|NavigableString):
-        if not content:
-            return
-
-        # FanMTL advertisements
-        if "fanmtl.com" in self.baseLink:
-            for div in content.find_all('div', {'align': 'center'}):
-                if div.find('script'):
-                    div.decompose()
-
+    def _removeUnwantedThinsFromHtml(self, soup:BeautifulSoup|NavigableString):
         #general
-        for script in content.find_all('script'):
+        for script in soup.find_all('script'):
             script.decompose()
 
 
-    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, chapterNumber, infoDict:dict):
-        chapterTitle = f"Chapter {chapterNumber}"
-        if "fanmtl.com" in self.baseLink:
-            infoDict["seriesTitle"] = soup.select("div.titles h1 a")[0].text.strip()
-            chapterTitle = soup.select("div.titles h2")[0].text.strip()
-            infoDict["chapterTitle"] = chapterTitle
-            infoDict["author"] = ""
-        elif "syosetu.com" in self.baseLink:
-            chapterTitle = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True).strip()
-            infoDict["seriesTitle"] = re.sub("\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip()
-            infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
-        elif "fenrirealm.com" in self.baseLink:
-            chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True)
-            infoDict["seriesTitle"] = soup.select_one("title").get_text(strip=True).split(" - ")[0].strip()
-            infoDict["author"] = "unknown"
+    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
+        # Subclasses store the scraped title in infoDict; fall back to "Chapter N" when it is missing or empty.
+        chapterTitle = infoDict.get("chapterTitle") or f"Chapter {infoDict['chapter']}"
+        infoDict["chapterTitle"] = chapterTitle
 
         titleElement = soup.new_tag("h1")
         titleElement.string = chapterTitle
         content.insert(0, titleElement)
         content.insert(1, soup.new_tag("br"))
         content.insert(2, soup.new_tag("br"))
 
         return chapterTitle
 
+
+    def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
+        """Yield (chapterNumber, link) pairs for fromChapter..toChapter (inclusive)."""
+        if uriWithFormat is None:
+            uriWithFormat = "{}"
+
+        for chapterNumber in range(fromChapter, toChapter + 1):
+            yield chapterNumber, urljoin(self.baseLink, uriWithFormat.format(chapterNumber))