fix: restructure test

Author: eneller
Date:   2025-03-16 18:57:40 +01:00
Parent: 4a8d4f945d
Commit: 6754f47e9f

@@ -1,6 +1,5 @@
 import requests
 from bs4 import BeautifulSoup
-from bs4 import ResultSet
 from urllib.parse import urljoin
 from urllib.request import urlparse
 from tqdm import tqdm
@@ -21,35 +20,25 @@ class Book():
     author: str
     title: str
     url: str
 class GBConvert():
-    def __init__(self,
-                 downloaddir,
-                 ):
+    def __init__(self, downloaddir):
         # NOTE move non-code files to data folder
         self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
         with open(pkg_resources.files('epub2go').joinpath('blocklist.txt')) as blocklist:
             self.blocklist = blocklist.read().splitlines()
         self.dir_download = downloaddir
-    def download(self,
-                 url:str,
-                 author:str = None,
-                 title:str = None,
-                 showprogress: bool = False,
-                 cleanpages: bool = True,
-                 ):
+    def download(self, url: str, author: str = None, title: str = None, showprogress: bool = False):
         tocpage = os.path.dirname(url) # ToC website url
         url = urlparse(tocpage)
         dir_output = os.path.join(self.dir_download, url.netloc + url.path) # directories created by wget recreating the URL
-        logger.debug('Downloading to %s, expecting files in in %s', self.dir_download, dir_output)
-        author = author
-        title = title
+        logger.debug('Downloading in %s, expecting files in %s', self.dir_download, dir_output)
         #parse_meta
         response = requests.get(tocpage)
         response.raise_for_status()
         soup = BeautifulSoup(response.content, 'html.parser')
         # TODO allow setting these from interactive mode where those parameters are figured out from the list
         if not author:
             try:
                 author = soup.find('meta', {'name': 'author'})['content']
@@ -60,43 +49,37 @@ class GBConvert():
                 title = soup.find('meta', {'name': 'title'})['content']
             except:
                 title = "UnknownTitle"
-        chapter_urls = soup.find('ul').find_all('a')
-        logger.debug('Found ToC with %d entries', len(chapter_urls))
-        #run
-        #TODO include images flag
-        # download all files in toc (chapters)
-        chapter_files = []
-        for item in (tqdm(chapter_urls) if showprogress else chapter_urls):
+        toc = soup.find('ul').find_all('a')
+        logger.debug('Found ToC with %d entries', len(toc))
+        chapters = []
+        for item in (tqdm(toc) if showprogress else toc):
             item_url = self.parse_toc_entry(tocpage, item)
             parsed_url = urlparse(item_url)
             filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
-            if cleanpages: self.parse_page(filepath)
-            chapter_files.append(os.path.basename(item_url))
+            self.parse_page(filepath)
+            chapters.append(os.path.basename(item_url))
-        return self.create_epub(author,title,chapter_files,dir_output)
+        return self.create_epub(author, title, chapters, dir_output)
     def parse_toc_entry(self, tocpage, entry):
         url = os.path.join(tocpage, entry['href'])
         self.save_page(url)
         return url
-    # apply blocklist to file
     def parse_page(self, file_path):
-        #TODO clean up file opening, mmap?
         count=0
         logger.debug('Parsing page at %s', file_path)
         with open(file_path, 'r+') as f:
             soup = BeautifulSoup(f.read(), 'html.parser')
             for blocker in self.blocklist:
                 for item in soup.select(blocker):
                     item.decompose()
                     count+=1
             f.seek(0)
             f.truncate()
             f.write(str(soup))
         logger.debug('Removed %d tags from page %s during parsing', count, file_path)
     def create_epub(self, author, title, chapters, dir_output) -> int:
         #TODO --epub-cover-image
         #TODO toc if it isnt described by <h> tags, e.g. https://www.projekt-gutenberg.org/adlersfe/maskenba/
         filename = f'{title} - {author}.epub'
         logger.debug('Creating epub as "%s"', filename)
         command = f'''pandoc -f html -t epub \
@@ -111,7 +94,6 @@ class GBConvert():
     def save_page(self, url):
         logger.debug('Saving page at %s', url)
         # https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
         command = f'''wget \
             --timestamping \
             --page-requisites \
@@ -129,18 +111,13 @@ def get_all_books() -> List[Book]:
     tags = soup.find('dl').findChildren()
     books = []
     for tag in tags:
-        # is description tag, i.e. contains author name
         if tag.name == 'dt':
-            # update author
-            # special case when author name and Alphabetical list is in same tag
             br_tag = tag.find('br')
             if br_tag:
                 book_author = str(br_tag.next_sibling)
-            # default case, dt only contains author name
             else:
                 book_author = tag.get_text(strip=True)
             book_author = ' '.join(book_author.split())
-        # is details tag, contains book url
         elif tag.name == 'dd':
             book_tag = tag.a
             if book_tag:
@@ -153,13 +130,10 @@ def get_all_books() -> List[Book]:
 # run main cli
 @click.command()
 #TODO include images flag
 @click.option('--debug', '-d', is_flag=True, help='Set the log level to DEBUG')
 @click.option('--silent', '-s', is_flag=True, help='Disable the progress bar')
-@click.option('--path','-p',type=str,default='./', help='The path to which files are saved' )
-@click.option('--no-clean',is_flag=True,help='Do not parse html files with blocklist')
 @click.argument('args', nargs=-1)
-def main(args, debug, silent, path, no_clean):
+def main(args, debug, silent):
     '''
     Download ePUBs from https://www.projekt-gutenberg.org/
     Provide either 0 arguments to enter interactive mode or an arbitrary number of URLs to download from
@@ -169,23 +143,21 @@ def main(args, debug, silent, path, no_clean):
     # non-interactive mode
     if len(args) > 0:
         books = args
     # interactive mode using fzf
     else:
         logger.debug('Received no CLI arguments, starting interactive mode')
         delimiter = ';'
         # create lines for fzf
         books = [f"{ item.author } - { item.title } {delimiter} { item.url }" for item in get_all_books()]
         fzf = FzfPrompt()
         selection = fzf.prompt(choices=books, fzf_options=r'--exact --with-nth 1 -m -d\;')
         books = [item.split(';')[1].strip() for item in selection]
     logger.debug('Attempting to download from %d URL(s)', len(books))
-    converter = GBConvert(path)
+    converter = GBConvert('./')
     if len(books) == 1:
-        converter.download(books[0], showprogress=not silent, cleanpages= not no_clean)
+        converter.download(books[0], showprogress=not silent)
     else:
         for book in (tqdm(books) if not silent else books):
-            converter.download(book, cleanpages= not no_clean)
+            converter.download(book)
 if __name__ == "__main__":
     main()
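
After this change, library callers pass the download root to GBConvert directly, and blocklist cleaning can no longer be skipped: parse_page() now runs on every fetched chapter. A minimal usage sketch of the restructured API, assuming GBConvert is importable from the epub2go package (the diff shows the package name but not the defining module) and using a hypothetical chapter URL:

    from epub2go import GBConvert  # assumed import path; module not shown in the diff

    # Download root; the CLI now hardcodes './' here.
    converter = GBConvert('./')

    # 'cleanpages' is gone from download(): every chapter is cleaned against
    # blocklist.txt before the ePUB is assembled by create_epub().
    converter.download(
        'https://www.projekt-gutenberg.org/adlersfe/maskenba/kapitel1.html',  # hypothetical URL; download() derives the ToC page via os.path.dirname()
        showprogress=True,
    )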