refactor: move crawling code to correct file

This commit is contained in:
eneller
2025-03-03 22:26:18 +01:00
parent 967f97f381
commit 8f77a97733
2 changed files with 14 additions and 12 deletions

View File

@@ -116,13 +116,6 @@ def get_all_books() -> list:
books.append(book)
return books
def get_all_book_tags() -> ResultSet:
    """Fetch the all-books index page and return its <dl> anchor tags.

    Returns:
        A BeautifulSoup ResultSet of every ``<a>`` tag found inside the
        first ``<dl>`` element of the page at ``allbooks_url``.

    Raises:
        requests.HTTPError: if the index page request returns an error status.
    """
    resp = requests.get(allbooks_url)
    resp.raise_for_status()
    page = BeautifulSoup(resp.content, 'html.parser', from_encoding='utf-8')
    # NOTE(review): assumes the page always contains a <dl>; find() returning
    # None would raise AttributeError here.
    return page.find('dl').find_all('a')
# run main cli
def main():
sys.argv.pop(0)

View File

@@ -1,17 +1,26 @@
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
from bs4 import ResultSet
import os
from urllib.parse import urljoin
from convert import GBConvert, get_all_book_tags, allbooks_url
from convert import GBConvert, allbooks_url
def parse_book_tags() -> list:
    """Fetch the all-books index page and parse it into (title, href) pairs.

    Returns:
        A list of ``(link_text, relative_href)`` tuples, one per ``<a>`` tag
        found inside the first ``<dl>`` of the page at ``allbooks_url``.
        ``relative_href`` may be None when an anchor carries no href.

    Raises:
        requests.HTTPError: if the index page request returns an error status.
    """
    response = requests.get(allbooks_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    # NOTE(review): assumes the page always contains a <dl>; find() returning
    # None would raise AttributeError here.
    books = soup.find('dl').find_all('a')
    # Original annotated the return as ResultSet, but this comprehension
    # produces a plain list of tuples — the annotation was fixed to match.
    return [(book.get_text(), book.get('href')) for book in books]
def main():
    """Download and convert every book listed on the all-books index page.

    Fetches the (title, relative URL) pairs once, then runs ``GBConvert``
    on the absolute URL of each entry, with a tqdm progress bar.
    """
    # Fetch the index exactly once — the diff left both an unused
    # `books = parse_book_tags()` binding and a second call inside tqdm(),
    # which would have hit the network twice.
    books = parse_book_tags()
    # NOTE consider making this a map()
    for book_title, book_url_relative in tqdm(books):
        # Anchors without an href yield None and are skipped.
        if book_url_relative is not None:
            book_url = urljoin(allbooks_url, book_url_relative)
            GBConvert(book_url).run()