feat: better crawling
@@ -12,13 +12,15 @@ class GBConvert():
     def __init__(self,
         url:str,
+        standalone = False,
     ):
         # NOTE move non-code files to data folder
         self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
         self.blocklist = open(pkg_resources.files('epub2go').joinpath('blocklist.txt')).read().splitlines()
-        self.root = os.path.dirname(url)
+        self.root = os.path.dirname(url) if url.endswith('html') else url
         self.url = urlparse(self.root)
         self.output = self.url.netloc + self.url.path
+        self.standalone = standalone
 
     def get_meta(self):
         response = requests.get(self.root)
@@ -29,6 +31,7 @@ class GBConvert():
         self.toc = soup.find('ul').find_all('a')
 
     def save_page(self, url):
+        # TODO fix redownloading of shared content
         # https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
         command = f'''wget \
             --page-requisites \
@@ -67,7 +70,7 @@ class GBConvert():
 
         map(lambda x: self.save_page(os.path.join(self.root, x['href'])), self.toc)
         self.chapters = []
-        for item in tqdm(self.toc):
+        for item in (tqdm(self.toc) if self.standalone else self.toc):
             item_title= item.get_text()
             item_url = os.path.join(self.root, item['href'])
             self.save_page(url=item_url)
@@ -81,7 +84,7 @@ class GBConvert():
 
 
 def main():
-    g = GBConvert(sys.argv[1])
+    g = GBConvert(sys.argv[1], standalone=True)
     g.run()
src/epub2go/crawl.py (new file, 21 lines)
@@ -0,0 +1,21 @@
+import requests
+from tqdm import tqdm
+
+import os
+
+from convert import GBConvert
+import utils
+
+def main():
+    books = utils.get_all_book_urls()
+    # NOTE consider making this a map()
+    for book in tqdm(books):
+        book_title = book.get_text()
+        book_url_relative = book.get('href')
+        if book_url_relative is not None:
+            book_url = utils.root_url + os.path.dirname(book_url_relative)[5:]
+            GBConvert(book_url).run()
+
+
+if __name__ == "__main__":
+    main()
(deleted file, 25 lines)
@@ -1,25 +0,0 @@
-import requests
-from bs4 import BeautifulSoup
-
-import os
-
-from convert import GBConvert
-
-def main():
-    allbooks_relative_url ='/info/texte/allworka.html'
-    root_url = 'https://www.projekt-gutenberg.org'
-    allbooks_url = root_url + allbooks_relative_url
-    response = requests.get(allbooks_url)
-    if (response.status_code != 200): raise Exception(f'Couldnt fetch root page {self.root}')
-    soup = BeautifulSoup(response.content, 'html.parser')
-    books = soup.find('dl').find_all('a')
-    for book in books:
-        book_title = book.get_text()
-        book_url_relative = book.get('href')
-        if book_url_relative is not None:
-            book_url = root_url + os.path.dirname(book_url_relative)[5:]
-            gb = GBConvert(book_url)
-
-
-if __name__ == "__main__":
-    main()
src/epub2go/utils.py (new file, 14 lines)
@@ -0,0 +1,14 @@
+from bs4 import BeautifulSoup
+import requests
+
+from urllib.parse import urlparse
+
+allbooks_url ='https://www.projekt-gutenberg.org/info/texte/allworka.html'
+root_url = '{url.scheme}://{url.netloc}'.format(url = urlparse(allbooks_url))
+
+def get_all_book_urls():
+    response = requests.get(allbooks_url)
+    response.raise_for_status()
+    soup = BeautifulSoup(response.content, 'html.parser')
+    books = soup.find('dl').find_all('a')
+    return books