fix: restructure test

eneller
2025-03-16 18:57:40 +01:00
parent 4a8d4f945d
commit 6754f47e9f


@@ -1,6 +1,5 @@
 import requests
 from bs4 import BeautifulSoup
-from bs4 import ResultSet
 from urllib.parse import urljoin
 from urllib.request import urlparse
 from tqdm import tqdm
@@ -14,42 +13,32 @@ from typing import List
 logger = logging.getLogger(__name__)
 
-allbooks_url ='https://www.projekt-gutenberg.org/info/texte/allworka.html'
+allbooks_url = 'https://www.projekt-gutenberg.org/info/texte/allworka.html'
 
 @dataclass
 class Book():
     author: str
     title: str
     url: str
 
 class GBConvert():
-    def __init__(self,
-                 downloaddir,
-                 ):
+    def __init__(self, downloaddir):
         # NOTE move non-code files to data folder
         self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
         with open(pkg_resources.files('epub2go').joinpath('blocklist.txt')) as blocklist:
             self.blocklist = blocklist.read().splitlines()
         self.dir_download = downloaddir
 
-    def download(self,
-                 url:str,
-                 author:str = None,
-                 title:str = None,
-                 showprogress: bool = False,
-                 cleanpages: bool = True,
-                 ):
+    def download(self, url: str, author: str = None, title: str = None, showprogress: bool = False):
         tocpage = os.path.dirname(url) # ToC website url
         url = urlparse(tocpage)
-        dir_output = os.path.join(self.dir_download, url.netloc + url.path )# directories created by wget recreating the URL
-        logger.debug('Downloading to %s, expecting files in in %s', self.dir_download, dir_output)
-        author = author
-        title = title
-        #parse_meta
+        dir_output = os.path.join(self.dir_download, url.netloc + url.path) # directories created by wget recreating the URL
+        logger.debug('Downloading in %s, expecting files in %s', self.dir_download, dir_output)
         response = requests.get(tocpage)
         response.raise_for_status()
         soup = BeautifulSoup(response.content, 'html.parser')
-        # TODO allow setting these from interactive mode where those parameters are figured out from the list
         if not author:
             try:
                 author = soup.find('meta', {'name': 'author'})['content']
@@ -60,45 +49,39 @@ class GBConvert():
                 title = soup.find('meta', {'name': 'title'})['content']
             except:
                 title = "UnknownTitle"
 
-        chapter_urls = soup.find('ul').find_all('a')
-        logger.debug('Found ToC with %d entries', len(chapter_urls))
-        #run
-        #TODO include images flag
-        # download all files in toc (chapters)
-        chapter_files = []
-        for item in (tqdm(chapter_urls) if showprogress else chapter_urls):
+        toc = soup.find('ul').find_all('a')
+        logger.debug('Found ToC with %d entries', len(toc))
+        chapters = []
+        for item in (tqdm(toc) if showprogress else toc):
             item_url = self.parse_toc_entry(tocpage, item)
             parsed_url = urlparse(item_url)
             filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
-            if cleanpages: self.parse_page(filepath)
-            chapter_files.append(os.path.basename(item_url))
-        return self.create_epub(author,title,chapter_files,dir_output)
+            self.parse_page(filepath)
+            chapters.append(os.path.basename(item_url))
+        return self.create_epub(author, title, chapters, dir_output)
 
     def parse_toc_entry(self, tocpage, entry):
         url = os.path.join(tocpage, entry['href'])
         self.save_page(url)
         return url
 
-    # apply blocklist to file
-    def parse_page(self,file_path):
-        #TODO clean up file opening, mmap?
-        count=0
+    def parse_page(self, file_path):
+        logger.debug('Parsing page at %s', file_path)
         with open(file_path, 'r+') as f:
             soup = BeautifulSoup(f.read(), 'html.parser')
             for blocker in self.blocklist:
                 for item in soup.select(blocker):
                     item.decompose()
-                    count+=1
+            f.seek(0)
+            f.truncate()
             f.write(str(soup))
-        logger.debug('Removed %d tags from page %s during parsing', count, file_path)
 
-    def create_epub(self, author, title, chapters, dir_output)-> int:
-        #TODO --epub-cover-image
-        #TODO toc if it isnt described by <h> tags, e.g. https://www.projekt-gutenberg.org/adlersfe/maskenba/
+    def create_epub(self, author, title, chapters, dir_output) -> int:
         filename = f'{title} - {author}.epub'
-        logger.debug('Creating epub as "%s"',filename)
+        logger.debug('Creating epub as "%s"', filename)
         command = f'''pandoc -f html -t epub \
             -o "{filename}" \
             --reference-location=section \
@@ -111,7 +94,6 @@ class GBConvert():
     def save_page(self, url):
         logger.debug('Saving page at %s', url)
-        # https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
         command = f'''wget \
             --timestamping \
             --page-requisites \
@@ -129,18 +111,13 @@ def get_all_books() -> List[Book]:
     tags = soup.find('dl').findChildren()
     books = []
     for tag in tags:
-        # is description tag, i.e. contains author name
-        if tag.name =='dt':
-            # update author
-            # special case when author name and Alphabetical list is in same tag
+        if tag.name == 'dt':
             br_tag = tag.find('br')
             if br_tag:
                 book_author = str(br_tag.next_sibling)
-            # default case, dt only contains author name
             else:
                 book_author = tag.get_text(strip=True)
             book_author = ' '.join(book_author.split())
-        # is details tag, contains book url
         elif tag.name == 'dd':
             book_tag = tag.a
             if book_tag:
@@ -153,39 +130,34 @@ def get_all_books() -> List[Book]:
 # run main cli
 @click.command()
-#TODO include images flag
 @click.option('--debug', '-d', is_flag=True, help='Set the log level to DEBUG')
 @click.option('--silent', '-s', is_flag=True, help='Disable the progress bar')
-@click.option('--path','-p',type=str,default='./', help='The path to which files are saved' )
-@click.option('--no-clean',is_flag=True,help='Do not parse html files with blocklist')
 @click.argument('args', nargs=-1)
-def main(args, debug, silent, path, no_clean):
+def main(args, debug, silent):
     '''
     Download ePUBs from https://www.projekt-gutenberg.org/
     Provide either 0 arguments to enter interactive mode or an arbitrary number of URLs to download from
     '''
-    logging.basicConfig(level=logging.ERROR,format='%(asctime)s - %(levelname)s - %(message)s')
+    logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')
     if(debug): logger.setLevel(logging.DEBUG)
     # non-interactive mode
-    if len(args) > 0 :
+    if len(args) > 0:
         books = args
-    # interactive mode using fzf
     else:
         logger.debug('Received no CLI arguments, starting interactive mode')
         delimiter = ';'
-        # create lines for fzf
         books = [f"{ item.author } - { item.title } {delimiter} { item.url }" for item in get_all_books()]
         fzf = FzfPrompt()
         selection = fzf.prompt(choices=books, fzf_options=r'--exact --with-nth 1 -m -d\;')
         books = [item.split(';')[1].strip() for item in selection]
     logger.debug('Attempting to download from %d URL(s)', len(books))
-    converter = GBConvert(path)
-    if len(books)==1:
-        converter.download(books[0], showprogress=not silent, cleanpages= not no_clean)
+    converter = GBConvert('./')
+    if len(books) == 1:
+        converter.download(books[0], showprogress=not silent)
     else:
         for book in (tqdm(books) if not silent else books):
-            converter.download(book, cleanpages= not no_clean)
+            converter.download(book)
 
 if __name__ == "__main__":
     main()
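
Note on the parse_page change: f.read() leaves the file cursor at end-of-file, so the old code's f.write() appended the cleaned HTML after the original markup instead of replacing it. Adding f.seek(0) and f.truncate() makes the rewrite happen in place. A minimal standalone sketch of that read-modify-rewrite pattern (the helper name, file name, and selector below are hypothetical, not part of this commit):

from bs4 import BeautifulSoup

def strip_selectors(file_path, selectors):
    # Hypothetical helper mirroring parse_page: drop blocklisted tags, rewrite the file in place.
    with open(file_path, 'r+') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')  # read() moves the cursor to EOF
        for selector in selectors:
            for item in soup.select(selector):
                item.decompose()  # remove the tag and everything inside it
        f.seek(0)      # rewind; writing at EOF would append rather than replace
        f.truncate()   # discard leftover bytes in case the cleaned HTML is shorter
        f.write(str(soup))

# strip_selectors('chapter1.html', ['div.navigation'])  # hypothetical file and selector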