feat: better crawling

2025-02-24 21:17:32 +01:00
parent 8c37822a02
commit daddb58c3c
4 changed files with 41 additions and 28 deletions
--- a/src/epub2go/convert.py
+++ b/src/epub2go/convert.py
@@ -12,13 +12,15 @@ class GBConvert():
    
    def __init__(self,
        url:str,
+        standalone = False,
        ):
        # NOTE move non-code files to data folder
        self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
        self.blocklist = open(pkg_resources.files('epub2go').joinpath('blocklist.txt')).read().splitlines()
-        self.root = os.path.dirname(url)
+        self.root = os.path.dirname(url) if url.endswith('html') else url
        self.url = urlparse(self.root)
        self.output = self.url.netloc + self.url.path
+        self.standalone = standalone
        
    def get_meta(self):
        response = requests.get(self.root)
@@ -29,6 +31,7 @@ class GBConvert():
        self.toc = soup.find('ul').find_all('a')
    
    def save_page(self, url):
+        # TODO fix redownloading of shared content
        # https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
        command = f'''wget \
                    --page-requisites \
@@ -67,7 +70,7 @@ class GBConvert():

        map(lambda x: self.save_page(os.path.join(self.root, x['href'])), self.toc)
        self.chapters = []
-        for item in tqdm(self.toc):
+        for item in (tqdm(self.toc) if self.standalone else self.toc):
            item_title= item.get_text()
            item_url = os.path.join(self.root, item['href'])
            self.save_page(url=item_url)
@@ -81,7 +84,7 @@ class GBConvert():
    
    
 def main():
-    g = GBConvert(sys.argv[1])
+    g = GBConvert(sys.argv[1], standalone=True)
    g.run()


--- a/src/epub2go/crawl.py
+++ b/src/epub2go/crawl.py
@@ -0,0 +1,21 @@
+import requests
+from tqdm import tqdm
+
+import os
+
+from convert import GBConvert
+import utils
+
+def main():
+    books = utils.get_all_book_urls()
+    # NOTE consider making this a map()
+    for book in tqdm(books):
+        book_title = book.get_text()
+        book_url_relative = book.get('href')
+        if book_url_relative is not None:
+            book_url = utils.root_url + os.path.dirname(book_url_relative)[5:]
+            GBConvert(book_url).run()
+
+
+if __name__ == "__main__":
+    main()
--- a/src/epub2go/test.py
+++ b/src/epub2go/test.py
@@ -1,25 +0,0 @@
-import requests
-from bs4 import BeautifulSoup
-
-import os
-
-from convert import GBConvert
-
-def main():
-    allbooks_relative_url ='/info/texte/allworka.html'
-    root_url = 'https://www.projekt-gutenberg.org'
-    allbooks_url = root_url + allbooks_relative_url
-    response = requests.get(allbooks_url)
-    if (response.status_code != 200): raise Exception(f'Couldnt fetch root page {self.root}')
-    soup = BeautifulSoup(response.content, 'html.parser')
-    books = soup.find('dl').find_all('a')
-    for book in books:
-        book_title = book.get_text()
-        book_url_relative = book.get('href')
-        if book_url_relative is not None:
-            book_url = root_url + os.path.dirname(book_url_relative)[5:]
-            gb = GBConvert(book_url)
-
-
-if __name__ == "__main__":
-    main()
--- a/src/epub2go/utils.py
+++ b/src/epub2go/utils.py
@@ -0,0 +1,14 @@
+from bs4 import BeautifulSoup
+import requests
+
+from urllib.parse import urlparse
+
+allbooks_url ='https://www.projekt-gutenberg.org/info/texte/allworka.html'
+root_url = '{url.scheme}://{url.netloc}'.format(url = urlparse(allbooks_url))
+
+def get_all_book_urls():
+    response = requests.get(allbooks_url)
+    response.raise_for_status()
+    soup = BeautifulSoup(response.content, 'html.parser')
+    books = soup.find('dl').find_all('a')
+    return books