refactor: move crawling code to correct file

This commit is contained in:
eneller
2025-03-03 22:26:18 +01:00
parent 967f97f381
commit 8f77a97733
2 changed files with 14 additions and 12 deletions

View File

@@ -116,13 +116,6 @@ def get_all_books() -> list:
books.append(book) books.append(book)
return books return books
def get_all_book_tags() -> ResultSet:
    """Fetch the all-books index page and return its book link tags.

    Downloads ``allbooks_url``, parses it as HTML and collects every ``<a>``
    tag found inside the first ``<dl>`` element of the page.

    Returns:
        The ``ResultSet`` of ``<a>`` tags produced by ``find_all('a')``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page contains no ``<dl>`` element.
    """
    response = requests.get(allbooks_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    listing = soup.find('dl')
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the chained find_all() call.
    if listing is None:
        raise ValueError(f"no <dl> book listing found at {allbooks_url}")
    return listing.find_all('a')
# run main cli # run main cli
def main(): def main():
sys.argv.pop(0) sys.argv.pop(0)

View File

@@ -1,17 +1,26 @@
import requests import requests
from tqdm import tqdm from tqdm import tqdm
from bs4 import BeautifulSoup
from bs4 import ResultSet
import os import os
from urllib.parse import urljoin from urllib.parse import urljoin
from convert import GBConvert, get_all_book_tags, allbooks_url from convert import GBConvert, allbooks_url
def parse_book_tags() -> list:
    """Fetch the all-books index page and extract every book link.

    Downloads ``allbooks_url``, parses it as HTML and reads every ``<a>`` tag
    found inside the first ``<dl>`` element of the page.

    Returns:
        A list of ``(title, href)`` tuples, one per link. ``title`` is the
        link text; ``href`` is the value of the tag's ``href`` attribute, or
        ``None`` when the tag has no such attribute.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page contains no ``<dl>`` element.
    """
    response = requests.get(allbooks_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    listing = soup.find('dl')
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the chained find_all() call.
    if listing is None:
        raise ValueError(f"no <dl> book listing found at {allbooks_url}")
    # Plain tuples (not Tag objects) are returned, hence the `list` annotation.
    return [(tag.get_text(), tag.get('href')) for tag in listing.find_all('a')]
def main():
    """Convert every book linked from the all-books index page.

    Fetches the list of ``(title, relative_url)`` pairs once, resolves each
    relative URL against ``allbooks_url`` and runs ``GBConvert`` on it,
    showing progress with ``tqdm``. Entries without a URL are skipped.
    """
    # Fetch the index a single time; calling parse_book_tags() again inside
    # the loop header would download and parse the page twice.
    books = parse_book_tags()
    # NOTE consider making this a map()
    for book_title, book_url_relative in tqdm(books):
        if book_url_relative is not None:
            book_url = urljoin(allbooks_url, book_url_relative)
            GBConvert(book_url).run()