Scrapper changes
This commit is contained in:
+1
-1
@@ -1,4 +1,4 @@
|
|||||||
.idea
|
.idea
|
||||||
test*.py
|
*test*.py
|
||||||
test.*
|
test.*
|
||||||
*.log
|
*.log
|
||||||
@@ -0,0 +1,12 @@
|
|||||||
|
# Manual smoke test: print every (chapter number, link) pair that the scraper
# selected by WebScrapper.Get discovers for one series index page.
#
# The package is spelled `src` (not `scr`) to match the imports used by the
# scraper modules themselves, and the Fenrirealm class is imported under its
# full name `FenrirealmWebScrapper`.
from src.WebScrapper.OniichanyameteWebScrapper import OniichanyameteWebScrapper
from src.WebScrapper.FenrirealmWebScrapper import FenrirealmWebScrapper
from src.WebScrapper.WebScrapper import WebScrapper
from src.WebScrapper.FanmtlWebScrapper import FanmtlWebScrapper
from src.WebScrapper.SyosetuWebScrapper import SyosetuWebScrapper


if __name__ == "__main__":
    # Guarded so that merely importing this module does not hit the network.
    scrapper = WebScrapper.Get(r"https://oniichanyamete.moe/index/bunnygirl/", "html", "en")
    for chapterNumber, link in scrapper._getChapterLinks(1, None):
        print(chapterNumber, link)
|
||||||
|
|
||||||
@@ -0,0 +1,45 @@
|
|||||||
|
import re
|
||||||
|
import time
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
from bs4 import BeautifulSoup, NavigableString
|
||||||
|
from src.WebScrapper.WebScrapper import WebScrapper
|
||||||
|
|
||||||
|
|
||||||
|
class FanmtlWebScrapper(WebScrapper):
    """Scraper specialisation used for links that contain ``fanmtl.com``."""

    def __init__(self, baseLink: str, htmlFolderPath: str, currentLanguage: str):
        super().__init__(baseLink, htmlFolderPath, currentLanguage)

    def _getChapterContent(self, soup: BeautifulSoup):
        """Return the ``div`` with class ``chapter-content``.

        ``find`` yields ``None`` when the page has no such element, so the
        result is an element or ``None`` (not a ``str``).
        """
        return soup.find("div", {"class": "chapter-content"})

    def _removeUnwantedThinsFromHtml(self, soup: BeautifulSoup) -> None:
        """Strip FanMTL advertisement blocks, then apply the base-class cleanup.

        :param soup: the element tree to clean in place.
        """
        # Advertisement blocks are center-aligned divs that wrap a <script>.
        # The loop must walk the `soup` parameter; the previous code referenced
        # an undefined name `content` and raised NameError.
        for div in soup.find_all("div", {"align": "center"}):
            if div.find("script"):
                div.decompose()

        super()._removeUnwantedThinsFromHtml(soup)

    def _addChapterTitle(self, soup: BeautifulSoup, content: BeautifulSoup | NavigableString, infoDict: dict):
        """Fill ``infoDict`` with series/chapter title and author, then delegate.

        :param soup: the parsed chapter page the titles are read from.
        :param content: the chapter content element passed on to the base class.
        :param infoDict: metadata dict; ``seriesTitle``, ``chapterTitle`` and
            ``author`` are written to it.
        :return: whatever the base-class ``_addChapterTitle`` returns.
        :raises IndexError: if the page lacks the expected title elements.
        """
        infoDict["seriesTitle"] = soup.select("div.titles h1 a")[0].text.strip()
        infoDict["chapterTitle"] = soup.select("div.titles h2")[0].text.strip()
        infoDict["author"] = ""

        # The base method takes (soup, content, infoDict); the chapter number
        # is no longer a parameter, so it must not be forwarded.
        return super()._addChapterTitle(soup, content, infoDict)

    def _getChapterLinks(self, fromChapter: int, toChapter: int, uriWithFormat: str = None, soup: BeautifulSoup = None):
        """Yield ``(chapterNumber, link)`` for every chapter in the inclusive range.

        :param fromChapter: first chapter number.
        :param toChapter: last chapter number (inclusive).
        :param uriWithFormat: URI template; the chapter number is inserted
            into its ``{}`` placeholder. Required for this site.
        :param soup: unused here; kept for a uniform signature.
        :raises Exception: if ``uriWithFormat`` is not provided.
        """
        if uriWithFormat is None:
            raise Exception("uriWithFormat must be provided for Fanmtl.")

        # run() unpacks each item as `chapterNumber, link`, so a pair is
        # yielded rather than the bare URL.
        for chapterNumber in range(fromChapter, toChapter + 1):
            yield chapterNumber, urljoin(self.baseLink, uriWithFormat.format(chapterNumber))
|
||||||
@@ -0,0 +1,24 @@
|
|||||||
|
import re
|
||||||
|
import time
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
from bs4 import BeautifulSoup, NavigableString
|
||||||
|
from src.WebScrapper.WebScrapper import WebScrapper
|
||||||
|
|
||||||
|
|
||||||
|
class FenrirealmWebScrapper(WebScrapper):
    """Scraper specialisation used for links that contain ``fenrirealm.com``."""

    def __init__(self, baseLink:str, htmlFolderPath:str, currentLanguage:str):
        # No site-specific state: construction is handled by the base class.
        super().__init__(baseLink, htmlFolderPath, currentLanguage)

    def _getChapterContent(self, soup:BeautifulSoup) -> str:
        """Return the first ``div.content-area`` that is a direct child of ``div.chapter-view``.

        Raises ``IndexError`` when the selector matches nothing.
        """
        matches = soup.select("div.chapter-view > div.content-area")
        return matches[0]

    def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
        """Write chapter title, series title and author into ``infoDict``, then delegate to the base class.

        The chapter title is the text of the ``h2`` heading inside
        ``.chapter-view``; the series title is the part of the page
        ``<title>`` before the first ``" - "``; the author is the fixed
        value ``"unknown"``.
        """
        headingTag = soup.select_one(".chapter-view > div > h2")
        infoDict["chapterTitle"] = headingTag.get_text(strip=True)

        pageTitle = soup.select_one("title").get_text(strip=True)
        seriesPart = pageTitle.split(" - ")[0]
        infoDict["seriesTitle"] = seriesPart.strip()

        infoDict["author"] = "unknown"
        return super()._addChapterTitle(soup, content, infoDict)
|
||||||
|
|
||||||
@@ -0,0 +1,37 @@
|
|||||||
|
import re
|
||||||
|
import time
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
from bs4 import BeautifulSoup, NavigableString
|
||||||
|
from src.WebScrapper.WebScrapper import WebScrapper
|
||||||
|
|
||||||
|
|
||||||
|
class OniichanyameteWebScrapper(WebScrapper):
    """Scraper specialisation used for links that contain ``oniichanyamete.moe``."""

    # Browser-like User-Agent sent when the index page is fetched.
    _REQUEST_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    }

    def __init__(self, baseLink: str, htmlFolderPath: str, currentLanguage: str):
        super().__init__(baseLink, htmlFolderPath, currentLanguage)

    def _getChapterContent(self, soup: BeautifulSoup):
        """Return the ``div`` with class ``chapter-content``, or ``None`` if absent.

        ``select`` expects a CSS selector string; passing an attribute dict as
        the second argument does not filter by class and returned a list of
        every ``div``. ``find`` performs the intended attribute lookup.
        """
        # NOTE(review): the class name is carried over unchanged from the
        # original code - confirm it matches this site's markup.
        return soup.find("div", {"class": "chapter-content"})

    def _addChapterTitle(self, soup: BeautifulSoup, content: BeautifulSoup | NavigableString, infoDict: dict):
        """Write placeholder metadata into ``infoDict`` and delegate to the base class.

        :return: whatever the base-class ``_addChapterTitle`` returns.
        """
        # TODO: titles and author are not extracted from the page yet.
        infoDict["seriesTitle"] = ""
        infoDict["chapterTitle"] = ""
        infoDict["author"] = ""

        # The base method takes (soup, content, infoDict); the previously
        # forwarded `chapterNumber` was an undefined name.
        return super()._addChapterTitle(soup, content, infoDict)

    def _getChapterLinks(self, fromChapter: int, toChapter: int, uriWithFormat: str = None, soup: BeautifulSoup = None):
        """Yield ``(chapterNumber, link)`` for chapters listed on the index page.

        :param fromChapter: lowest chapter number to yield; ``None`` means no
            lower bound.
        :param toChapter: highest chapter number to yield (inclusive);
            ``None`` means no upper bound.
        :param uriWithFormat: unused; links are read from the index page.
        :param soup: an already parsed index page. When ``None`` the page at
            ``self.baseLink`` is downloaded and parsed.
        """
        if soup is None:
            # Function-scope import: this block already relied on `requests`
            # but the module does not import it at the top.
            import requests

            response = requests.get(self.baseLink, headers=self._REQUEST_HEADERS)
            soup = BeautifulSoup(response.content, "html.parser")

        for link in soup.select("p[style*='padding-left:60px'] > a"):
            numberMatch = re.search(r"\d+", link.text)
            if numberMatch is None:
                # Link text without any digits carries no chapter number.
                continue

            # Converted to int so it compares against the requested range and
            # matches the int chapter numbers yielded by the other scrapers.
            chapterNumber = int(numberMatch.group())
            if fromChapter is not None and chapterNumber < fromChapter:
                continue
            if toChapter is not None and chapterNumber > toChapter:
                continue

            yield chapterNumber, urljoin(self.baseLink, link["href"])
|
||||||
@@ -0,0 +1,38 @@
|
|||||||
|
import re
|
||||||
|
import time
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
from bs4 import BeautifulSoup, NavigableString
|
||||||
|
from src.WebScrapper.WebScrapper import WebScrapper
|
||||||
|
|
||||||
|
|
||||||
|
class SyosetuWebScrapper(WebScrapper):
    """Scraper specialisation used for links that contain ``syosetu.com``."""

    def __init__(self, baseLink: str, htmlFolderPath: str, currentLanguage: str):
        super().__init__(baseLink, htmlFolderPath, currentLanguage)

    def _getChapterContent(self, soup: BeautifulSoup):
        """Return the first novel-text element whose text exceeds 1000 characters.

        :return: the matching element, or ``None`` when no candidate is long
            enough.
        """
        # NOTE(review): the length threshold presumably skips short
        # foreword/afterword blocks - confirm; chapters shorter than this
        # are not returned either.
        for candidate in soup.select("div.p-novel__body div.js-novel-text.p-novel__text"):
            if len(candidate.text) > 1000:
                return candidate
        return None

    def _addChapterTitle(self, soup: BeautifulSoup, content: BeautifulSoup | NavigableString, infoDict: dict):
        """Fill ``infoDict`` with chapter title, series title and author, then delegate.

        The series title is the text of the first link inside ``.c-announce``
        with any ``[...]`` segments removed; the author is the text of the
        second link.

        :return: whatever the base-class ``_addChapterTitle`` returns.
        :raises AttributeError: if the chapter heading is missing.
        :raises IndexError: if the announce links are missing.
        """
        infoDict["chapterTitle"] = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True)
        # Raw string: "\[" and "\]" are invalid escapes in a normal literal.
        infoDict["seriesTitle"] = re.sub(r"\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip()
        infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
        return super()._addChapterTitle(soup, content, infoDict)

    def _getChapterLinks(self, fromChapter: int, toChapter: int, uriWithFormat: str = None, soup: BeautifulSoup = None):
        """Yield ``(chapterNumber, link)`` for every chapter in the inclusive range.

        :param fromChapter: first chapter number.
        :param toChapter: last chapter number (inclusive).
        :param uriWithFormat: URI template with a ``{}`` placeholder for the
            chapter number; defaults to ``"{}/"``.
        :param soup: unused here; kept for a uniform signature.
        """
        if uriWithFormat is None:
            uriWithFormat = "{}/"

        # run() unpacks each item as `chapterNumber, link`, so a pair is
        # yielded rather than the bare URL.
        for chapterNumber in range(fromChapter, toChapter + 1):
            yield chapterNumber, urljoin(self.baseLink, uriWithFormat.format(chapterNumber))
|
||||||
@@ -8,9 +8,6 @@ from bs4 import BeautifulSoup, NavigableString
|
|||||||
from src.functions import writeToFile, makeDir, writeToJsonFile
|
from src.functions import writeToFile, makeDir, writeToJsonFile
|
||||||
|
|
||||||
|
|
||||||
# https://ncode.syosetu.com/n0806fu
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class WebScrapper:
|
class WebScrapper:
|
||||||
|
|
||||||
@@ -20,6 +17,19 @@ class WebScrapper:
|
|||||||
self.currentLanguage = currentLanguage
|
self.currentLanguage = currentLanguage
|
||||||
makeDir(self.htmlFolderPath)
|
makeDir(self.htmlFolderPath)
|
||||||
|
|
||||||
|
@staticmethod
def Get(baseLink: str, htmlFolderPath: str, currentLanguage: str) -> "WebScrapper":
    """Create the scraper subclass that handles the website in ``baseLink``.

    :param baseLink: link of the series; the site is detected by substring.
    :param htmlFolderPath: forwarded to the scraper constructor.
    :param currentLanguage: forwarded to the scraper constructor.
    :return: an instance of the matching ``WebScrapper`` subclass.
    :raises Exception: if no supported site name occurs in ``baseLink``.
    """
    # Imported inside the function: each subclass module imports WebScrapper
    # from this module, so importing them at the top would be circular.
    # NOTE(review): module paths assume one module per class under
    # src.WebScrapper - confirm against the repository layout.
    from src.WebScrapper.FanmtlWebScrapper import FanmtlWebScrapper
    from src.WebScrapper.FenrirealmWebScrapper import FenrirealmWebScrapper
    from src.WebScrapper.OniichanyameteWebScrapper import OniichanyameteWebScrapper
    from src.WebScrapper.SyosetuWebScrapper import SyosetuWebScrapper

    # Checked in order; the first site name found in the link wins.
    scrappersBySite = (
        ("fanmtl.com", FanmtlWebScrapper),
        ("syosetu.com", SyosetuWebScrapper),
        ("fenrirealm.com", FenrirealmWebScrapper),
        ("oniichanyamete.moe", OniichanyameteWebScrapper),
    )
    for siteName, scrapperClass in scrappersBySite:
        if siteName in baseLink:
            return scrapperClass(baseLink, htmlFolderPath, currentLanguage)

    raise Exception(f"Website {baseLink} is not supported for scraping yet.")
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _sanitizeFilename(filename: str) -> str:
|
def _sanitizeFilename(filename: str) -> str:
|
||||||
@@ -52,20 +62,19 @@ class WebScrapper:
|
|||||||
return filename
|
return filename
|
||||||
|
|
||||||
|
|
||||||
def getHtml(self, uriWithFormat:str, fromChapter:int, toChapter:int, sleepTime:float=0):
|
def run(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, sleepTime:float=0):
|
||||||
"""
|
"""
|
||||||
uriWithFormat: inserts the current chapter number into the {} brackets
|
uriWithFormat: inserts the current chapter number into the {} brackets
|
||||||
"""
|
"""
|
||||||
for chapterNumber in range(fromChapter, toChapter + 1):
|
for chapterNumber, link in self._getChapterLinks(fromChapter, toChapter, uriWithFormat):
|
||||||
time.sleep(sleepTime) # to avoid getting blocked by the website for sending too many requests in a short time
|
time.sleep(sleepTime) # to avoid getting blocked by the website for sending too many requests in a short time
|
||||||
|
print(link)
|
||||||
|
|
||||||
infoDict = {
|
infoDict = {
|
||||||
"chapter": chapterNumber,
|
"chapter": chapterNumber,
|
||||||
"originalLanguage": self.currentLanguage,
|
"originalLanguage": self.currentLanguage,
|
||||||
"currentLanguage": self.currentLanguage,
|
"currentLanguage": self.currentLanguage,
|
||||||
}
|
}
|
||||||
link = urljoin(self.baseLink, uriWithFormat.format(chapterNumber))
|
|
||||||
print(link)
|
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
|
||||||
#'Referer': 'https://ncode.syosetu.com/',
|
#'Referer': 'https://ncode.syosetu.com/',
|
||||||
@@ -83,8 +92,7 @@ class WebScrapper:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
self._removeUnwantedThinsFromHtml(chapterContent)
|
self._removeUnwantedThinsFromHtml(chapterContent)
|
||||||
chapterTitle = self._addChapterTitle(soup, chapterContent, chapterNumber, infoDict)
|
self._addChapterTitle(soup, chapterContent, infoDict)
|
||||||
infoDict["chapterTitle"] = chapterTitle
|
|
||||||
|
|
||||||
# Dateinamen bereinigen
|
# Dateinamen bereinigen
|
||||||
safe_filename = self._sanitizeFilename(chapterTitle)
|
safe_filename = self._sanitizeFilename(chapterTitle)
|
||||||
@@ -112,44 +120,25 @@ class WebScrapper:
|
|||||||
return chapterContent
|
return chapterContent
|
||||||
|
|
||||||
|
|
||||||
def _removeUnwantedThinsFromHtml(self, content:BeautifulSoup|NavigableString):
|
def _removeUnwantedThinsFromHtml(self, soup:BeautifulSoup|NavigableString):
|
||||||
if not content:
|
|
||||||
return
|
|
||||||
|
|
||||||
# FanMTL advertisements
|
|
||||||
if "fanmtl.com" in self.baseLink:
|
|
||||||
for div in content.find_all('div', {'align': 'center'}):
|
|
||||||
if div.find('script'):
|
|
||||||
div.decompose()
|
|
||||||
|
|
||||||
#general
|
#general
|
||||||
for script in content.find_all('script'):
|
for script in soup.find_all('script'):
|
||||||
script.decompose()
|
script.decompose()
|
||||||
|
|
||||||
|
|
||||||
def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, chapterNumber, infoDict:dict):
|
def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict):
|
||||||
chapterTitle = f"Chapter {chapterNumber}"
|
|
||||||
if "fanmtl.com" in self.baseLink:
|
|
||||||
infoDict["seriesTitle"] = soup.select("div.titles h1 a")[0].text.strip()
|
|
||||||
chapterTitle = soup.select("div.titles h2")[0].text.strip()
|
|
||||||
infoDict["chapterTitle"] = chapterTitle
|
|
||||||
infoDict["author"] = ""
|
|
||||||
elif "syosetu.com" in self.baseLink:
|
|
||||||
chapterTitle = soup.select_one("h1.p-novel__title.p-novel__title--rensai").get_text(strip=True).strip()
|
|
||||||
infoDict["seriesTitle"] = re.sub("\[.*?\]", "", soup.select(".c-announce > a:first-of-type")[0].text).strip()
|
|
||||||
infoDict["author"] = soup.select(".c-announce > a:nth-of-type(2)")[0].text.strip()
|
|
||||||
elif "fenrirealm.com" in self.baseLink:
|
|
||||||
chapterTitle = soup.select_one(".chapter-view > div > h2").get_text(strip=True)
|
|
||||||
infoDict["seriesTitle"] = soup.select_one("title").get_text(strip=True).split(" - ")[0].strip()
|
|
||||||
infoDict["author"] = "unknown"
|
|
||||||
|
|
||||||
titleElement = soup.new_tag("h1")
|
titleElement = soup.new_tag("h1")
|
||||||
titleElement.string = chapterTitle
|
titleElement.string = infoDict["chapterTitle"] if "chapterTitle" in infoDict else f"Chapter {infoDict['chapter']}"
|
||||||
content.insert(0, titleElement)
|
content.insert(0, titleElement)
|
||||||
content.insert(1, soup.new_tag("br"))
|
content.insert(1, soup.new_tag("br"))
|
||||||
content.insert(2, soup.new_tag("br"))
|
content.insert(2, soup.new_tag("br"))
|
||||||
|
|
||||||
return chapterTitle
|
|
||||||
|
|
||||||
|
|
||||||
|
def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
|
||||||
|
currentChapter = fromChapter
|
||||||
|
|
||||||
|
while currentChapter <= toChapter:
|
||||||
|
yield currentChapter, urljoin(self.baseLink, currentChapter)
|
||||||
|
currentChapter += 1
|
||||||
|
|
||||||
|
|
||||||
Reference in New Issue
Block a user