refactor: move crawling code to correct file
This commit is contained in:
@@ -116,13 +116,6 @@ def get_all_books() -> list:
|
|||||||
books.append(book)
|
books.append(book)
|
||||||
return books
|
return books
|
||||||
|
|
||||||
def get_all_book_tags() -> ResultSet:
    """Download the all-books index page and return its book link tags.

    Issues a GET request to ``allbooks_url``, raises on an HTTP error
    status, parses the body as UTF-8 HTML and returns every ``<a>`` tag
    found inside the first ``<dl>`` element of the page.
    """
    page = requests.get(allbooks_url)
    page.raise_for_status()
    document = BeautifulSoup(page.content, 'html.parser', from_encoding='utf-8')
    # The links are taken from the first <dl> element only.
    listing = document.find('dl')
    return listing.find_all('a')
|
|
||||||
|
|
||||||
# run main cli
|
# run main cli
|
||||||
def main():
|
def main():
|
||||||
sys.argv.pop(0)
|
sys.argv.pop(0)
|
||||||
|
|||||||
@@ -1,17 +1,26 @@
|
|||||||
import requests
|
import requests
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4 import ResultSet
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
from convert import GBConvert, get_all_book_tags, allbooks_url
|
from convert import GBConvert, allbooks_url
|
||||||
|
|
||||||
|
def parse_book_tags() -> list:
    """Download the all-books index page and extract its book links.

    Issues a GET request to ``allbooks_url``, raises on an HTTP error
    status, parses the body as UTF-8 HTML and collects every ``<a>`` tag
    inside the first ``<dl>`` element.

    Returns:
        A list of ``(title, href)`` tuples, one per link, where ``title``
        is the link text and ``href`` is the value of the tag's ``href``
        attribute (``None`` when the tag has no such attribute).
    """
    response = requests.get(allbooks_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    link_tags = soup.find('dl').find_all('a')
    # Reduce the tags to plain data so callers do not depend on bs4 types.
    return [(tag.get_text(), tag.get('href')) for tag in link_tags]
|
||||||
|
|
||||||
def main():
    """Run the converter on every book listed on the all-books index page.

    The index is fetched once via ``parse_book_tags()``. Each entry's
    relative link is resolved against ``allbooks_url`` and passed to
    ``GBConvert(...).run()``. Entries without a link are skipped.
    """
    # Fetch the index exactly once; calling parse_book_tags() again in the
    # loop header would download and parse the same page a second time.
    books = parse_book_tags()
    # NOTE consider making this a map()
    for book_title, book_url_relative in tqdm(books):
        if book_url_relative is not None:
            book_url = urljoin(allbooks_url, book_url_relative)
            GBConvert(book_url).run()
|
||||||
|
|||||||
Reference in New Issue
Block a user