fix
This commit is contained in:
+3
-7
@@ -1,12 +1,8 @@
|
|||||||
from scr.WebScrapper.OniichanyameteWebScrapper import OniichanyameteWebScrapper
|
from src.WebScrapper.OniichanyameteWebScrapper import OniichanyameteWebScrapper
|
||||||
from scr.WebScrapper.FenrirealmWebScrapper import FenrirealmWeb
|
from src.WebScrapper.WebScrapper import WebScrapper
|
||||||
from scr.WebScrapper.WebScrapper import WebScrapper
|
|
||||||
from scr.WebScrapper.FanmtlWebScrapper import FanmtlWebScrapper
|
|
||||||
from scr.WebScrapper.SyosetuWebScrapper import SyosetuWebScrapper
|
|
||||||
|
|
||||||
|
|
||||||
|
scrapper = OniichanyameteWebScrapper(r"https://oniichanyamete.moe/index/bunnygirl/", "html", "en")
|
||||||
scrapper = WebScrapper.Get(r"https://oniichanyamete.moe/index/bunnygirl/", "html", "en")
|
|
||||||
for chapterNumber, link in scrapper._getChapterLinks(1, None):
|
for chapterNumber, link in scrapper._getChapterLinks(1, None):
|
||||||
print(chapterNumber, link)
|
print(chapterNumber, link)
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from pprint import pprint
|
||||||
from bs4 import BeautifulSoup, NavigableString
|
from bs4 import BeautifulSoup, NavigableString
|
||||||
from src.WebScrapper.WebScrapper import WebScrapper
|
from src.WebScrapper.WebScrapper import WebScrapper
|
||||||
|
|
||||||
@@ -13,7 +16,7 @@ class OniichanyameteWebScrapper(WebScrapper):
|
|||||||
|
|
||||||
|
|
||||||
def _getChapterContent(self, soup:BeautifulSoup) -> str:
|
def _getChapterContent(self, soup:BeautifulSoup) -> str:
|
||||||
return soup.select("div", {"class": "chapter-content"})
|
return soup.select("div", {"class": "chapter-content"})[0].prettify()
|
||||||
|
|
||||||
|
|
||||||
def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
|
def _addChapterTitle(self, soup:BeautifulSoup, content:BeautifulSoup|NavigableString, infoDict:dict) -> str:
|
||||||
@@ -21,12 +24,12 @@ class OniichanyameteWebScrapper(WebScrapper):
|
|||||||
infoDict["chapterTitle"] = ""
|
infoDict["chapterTitle"] = ""
|
||||||
infoDict["author"] = ""
|
infoDict["author"] = ""
|
||||||
|
|
||||||
return super()._addChapterTitle(soup, content, chapterNumber, infoDict)
|
return super()._addChapterTitle(soup, content, infoDict)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
|
def _getChapterLinks(self, fromChapter:int, toChapter:int, uriWithFormat:str=None, soup:BeautifulSoup=None):
|
||||||
soup = BeautifulSoup(requests.get(self.baseLink, headers=headers).content, "html.parser")
|
soup = BeautifulSoup(requests.get(self.baseLink).content, "html.parser")
|
||||||
|
|
||||||
links =soup.select("p[style*='padding-left:60px'] > a")
|
links =soup.select("p[style*='padding-left:60px'] > a")
|
||||||
pprint(links)
|
pprint(links)
|
||||||
|
|||||||
@@ -5,6 +5,11 @@ from urllib.parse import urljoin
|
|||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup, NavigableString
|
from bs4 import BeautifulSoup, NavigableString
|
||||||
|
|
||||||
|
# from src.WebScrapper.FanmtlWebScrapper import FanmtlWebScrapper
|
||||||
|
# from src.WebScrapper.FenrirealmWebScrapper import FenrirealmWebScrapper
|
||||||
|
# from src.WebScrapper.OniichanyameteWebScrapper import OniichanyameteWebScrapper
|
||||||
|
# from src.WebScrapper.SyosetuWebScrapper import SyosetuWebScrapper
|
||||||
from src.functions import writeToFile, makeDir, writeToJsonFile
|
from src.functions import writeToFile, makeDir, writeToJsonFile
|
||||||
|
|
||||||
|
|
||||||
@@ -17,18 +22,18 @@ class WebScrapper:
|
|||||||
self.currentLanguage = currentLanguage
|
self.currentLanguage = currentLanguage
|
||||||
makeDir(self.htmlFolderPath)
|
makeDir(self.htmlFolderPath)
|
||||||
|
|
||||||
@staticmethod
|
# @staticmethod
|
||||||
def Get(baseLink:str, htmlFolderPath:str, currentLanguage:str) -> self:
|
# def Get(baseLink:str, htmlFolderPath:str, currentLanguage:str):
|
||||||
if "fanmtl.com" in baseLink:
|
# if "fanmtl.com" in baseLink:
|
||||||
return FanmtlWebScrapper(baseLink, htmlFolderPath, currentLanguage)
|
# return FanmtlWebScrapper(baseLink, htmlFolderPath, currentLanguage)
|
||||||
elif "syosetu.com" in baseLink:
|
# elif "syosetu.com" in baseLink:
|
||||||
return SyosetuWebScrapper(baseLink, htmlFolderPath, currentLanguage)
|
# return SyosetuWebScrapper(baseLink, htmlFolderPath, currentLanguage)
|
||||||
elif "fenrirealm.com" in baseLink:
|
# elif "fenrirealm.com" in baseLink:
|
||||||
return FenrirealmWebScrapper(baseLink, htmlFolderPath, currentLanguage)
|
# return FenrirealmWebScrapper(baseLink, htmlFolderPath, currentLanguage)
|
||||||
elif "oniichanyamete.moe" in baseLink:
|
# elif "oniichanyamete.moe" in baseLink:
|
||||||
return OniichanyameteWebScrapper(baseLink, htmlFolderPath, currentLanguage)
|
# return OniichanyameteWebScrapper(baseLink, htmlFolderPath, currentLanguage)
|
||||||
else:
|
# else:
|
||||||
raise Exception(f"Website {baseLink} is not supported for scraping yet.")
|
# raise Exception(f"Website {baseLink} is not supported for scraping yet.")
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@@ -138,7 +143,7 @@ class WebScrapper:
|
|||||||
currentChapter = fromChapter
|
currentChapter = fromChapter
|
||||||
|
|
||||||
while currentChapter <= toChapter:
|
while currentChapter <= toChapter:
|
||||||
yield currentChapter, urljoin(self.baseLink, currentChapter)
|
yield currentChapter, urljoin(self.baseLink, str(currentChapter))
|
||||||
currentChapter += 1
|
currentChapter += 1
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user