refactor: move crawling code to correct file

This commit is contained in:
eneller
2025-03-03 22:26:18 +01:00
parent 967f97f381
commit 8f77a97733
2 changed files with 14 additions and 12 deletions

View File

@@ -116,13 +116,6 @@ def get_all_books() -> list:
books.append(book)
return books
def get_all_book_tags() -> ResultSet:
    """Fetch the all-books index page and return its <dl> anchor tags.

    Returns:
        A BeautifulSoup ResultSet of every ``<a>`` tag found inside the
        first ``<dl>`` element of the page at ``allbooks_url``.

    Raises:
        requests.HTTPError: if the index page request returns an error status.
    """
    resp = requests.get(allbooks_url)
    resp.raise_for_status()
    page = BeautifulSoup(resp.content, 'html.parser', from_encoding='utf-8')
    # NOTE(review): assumes the page always contains a <dl>; find() returning
    # None would raise AttributeError here.
    return page.find('dl').find_all('a')
# run main cli
def main():
sys.argv.pop(0)

View File

@@ -1,17 +1,26 @@
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
from bs4 import ResultSet
import os
from urllib.parse import urljoin
from convert import GBConvert, get_all_book_tags, allbooks_url
from convert import GBConvert, allbooks_url
def parse_book_tags() -> list:
    """Fetch the all-books index page and parse it into (title, href) pairs.

    Returns:
        A list of ``(link_text, relative_href)`` tuples, one per ``<a>`` tag
        found inside the first ``<dl>`` of the page at ``allbooks_url``.
        ``relative_href`` may be None when an anchor carries no href.

    Raises:
        requests.HTTPError: if the index page request returns an error status.
    """
    response = requests.get(allbooks_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    # NOTE(review): assumes the page always contains a <dl>; find() returning
    # None would raise AttributeError here.
    books = soup.find('dl').find_all('a')
    # Original annotated the return as ResultSet, but this comprehension
    # produces a plain list of tuples — the annotation was fixed to match.
    return [(book.get_text(), book.get('href')) for book in books]
def main():
    """Download and convert every book listed on the all-books index page.

    Fetches the (title, relative URL) pairs once, then runs ``GBConvert``
    on the absolute URL of each entry, with a tqdm progress bar.
    """
    # Fetch the index exactly once — the diff left both an unused
    # `books = parse_book_tags()` binding and a second call inside tqdm(),
    # which would have hit the network twice.
    books = parse_book_tags()
    # NOTE consider making this a map()
    for book_title, book_url_relative in tqdm(books):
        # Anchors without an href yield None and are skipped.
        if book_url_relative is not None:
            book_url = urljoin(allbooks_url, book_url_relative)
            GBConvert(book_url).run()