fix: restructure test

eneller
2025-03-16 18:57:40 +01:00
parent 4a8d4f945d
commit 6754f47e9f


@@ -1,6 +1,5 @@
 import requests
 from bs4 import BeautifulSoup
-from bs4 import ResultSet
 from urllib.parse import urljoin
 from urllib.request import urlparse
 from tqdm import tqdm
@@ -14,42 +13,32 @@ from typing import List
 logger = logging.getLogger(__name__)
 
-allbooks_url ='https://www.projekt-gutenberg.org/info/texte/allworka.html'
+allbooks_url = 'https://www.projekt-gutenberg.org/info/texte/allworka.html'
 
 @dataclass
 class Book():
     author: str
     title: str
     url: str
 
 class GBConvert():
-    def __init__(self,
-                 downloaddir,
-                 ):
+    def __init__(self, downloaddir):
         # NOTE move non-code files to data folder
         self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
         with open(pkg_resources.files('epub2go').joinpath('blocklist.txt')) as blocklist:
             self.blocklist = blocklist.read().splitlines()
         self.dir_download = downloaddir
 
-    def download(self,
-                 url:str,
-                 author:str = None,
-                 title:str = None,
-                 showprogress: bool = False,
-                 cleanpages: bool = True,
-                 ):
+    def download(self, url: str, author: str = None, title: str = None, showprogress: bool = False):
         tocpage = os.path.dirname(url) # ToC website url
         url = urlparse(tocpage)
-        dir_output = os.path.join(self.dir_download, url.netloc + url.path )# directories created by wget recreating the URL
-        logger.debug('Downloading to %s, expecting files in in %s', self.dir_download, dir_output)
-        author = author
-        title = title
-        #parse_meta
+        dir_output = os.path.join(self.dir_download, url.netloc + url.path) # directories created by wget recreating the URL
+        logger.debug('Downloading in %s, expecting files in %s', self.dir_download, dir_output)
         response = requests.get(tocpage)
         response.raise_for_status()
         soup = BeautifulSoup(response.content, 'html.parser')
-        # TODO allow setting these from interactive mode where those parameters are figured out from the list
         if not author:
             try:
                 author = soup.find('meta', {'name': 'author'})['content']
@@ -60,45 +49,39 @@ class GBConvert():
                 title = soup.find('meta', {'name': 'title'})['content']
             except:
                 title = "UnknownTitle"
 
-        chapter_urls = soup.find('ul').find_all('a')
-        logger.debug('Found ToC with %d entries', len(chapter_urls))
-        #run
-        #TODO include images flag
-        # download all files in toc (chapters)
-        chapter_files = []
-        for item in (tqdm(chapter_urls) if showprogress else chapter_urls):
+        toc = soup.find('ul').find_all('a')
+        logger.debug('Found ToC with %d entries', len(toc))
+        chapters = []
+        for item in (tqdm(toc) if showprogress else toc):
             item_url = self.parse_toc_entry(tocpage, item)
             parsed_url = urlparse(item_url)
             filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
-            if cleanpages: self.parse_page(filepath)
-            chapter_files.append(os.path.basename(item_url))
-        return self.create_epub(author,title,chapter_files,dir_output)
+            self.parse_page(filepath)
+            chapters.append(os.path.basename(item_url))
+        return self.create_epub(author, title, chapters, dir_output)
 
     def parse_toc_entry(self, tocpage, entry):
         url = os.path.join(tocpage, entry['href'])
         self.save_page(url)
         return url
 
-    # apply blocklist to file
-    def parse_page(self,file_path):
-        #TODO clean up file opening, mmap?
-        count=0
+    def parse_page(self, file_path):
+        logger.debug('Parsing page at %s', file_path)
         with open(file_path, 'r+') as f:
             soup = BeautifulSoup(f.read(), 'html.parser')
             for blocker in self.blocklist:
                 for item in soup.select(blocker):
                     item.decompose()
-                    count+=1
+            f.seek(0)
+            f.truncate()
             f.write(str(soup))
-        logger.debug('Removed %d tags from page %s during parsing', count, file_path)
 
-    def create_epub(self, author, title, chapters, dir_output)-> int:
-        #TODO --epub-cover-image
-        #TODO toc if it isnt described by <h> tags, e.g. https://www.projekt-gutenberg.org/adlersfe/maskenba/
+    def create_epub(self, author, title, chapters, dir_output) -> int:
         filename = f'{title} - {author}.epub'
-        logger.debug('Creating epub as "%s"',filename)
+        logger.debug('Creating epub as "%s"', filename)
         command = f'''pandoc -f html -t epub \
             -o "{filename}" \
             --reference-location=section \
@@ -111,7 +94,6 @@ class GBConvert():
     def save_page(self, url):
         logger.debug('Saving page at %s', url)
-        # https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
         command = f'''wget \
             --timestamping \
             --page-requisites \
@@ -129,18 +111,13 @@ def get_all_books() -> List[Book]:
     tags = soup.find('dl').findChildren()
     books = []
     for tag in tags:
-        # is description tag, i.e. contains author name
-        if tag.name =='dt':
-            # update author
-            # special case when author name and Alphabetical list is in same tag
+        if tag.name == 'dt':
             br_tag = tag.find('br')
             if br_tag:
                 book_author = str(br_tag.next_sibling)
-            # default case, dt only contains author name
             else:
                 book_author = tag.get_text(strip=True)
             book_author = ' '.join(book_author.split())
-        # is details tag, contains book url
         elif tag.name == 'dd':
             book_tag = tag.a
             if book_tag:
@@ -153,39 +130,34 @@ def get_all_books() -> List[Book]:
 # run main cli
 @click.command()
-#TODO include images flag
 @click.option('--debug', '-d', is_flag=True, help='Set the log level to DEBUG')
 @click.option('--silent', '-s', is_flag=True, help='Disable the progress bar')
-@click.option('--path','-p',type=str,default='./', help='The path to which files are saved' )
-@click.option('--no-clean',is_flag=True,help='Do not parse html files with blocklist')
 @click.argument('args', nargs=-1)
-def main(args, debug, silent, path, no_clean):
+def main(args, debug, silent):
     '''
     Download ePUBs from https://www.projekt-gutenberg.org/
     Provide either 0 arguments to enter interactive mode or an arbitrary number of URLs to download from
     '''
-    logging.basicConfig(level=logging.ERROR,format='%(asctime)s - %(levelname)s - %(message)s')
+    logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')
     if(debug): logger.setLevel(logging.DEBUG)
     # non-interactive mode
-    if len(args) > 0 :
+    if len(args) > 0:
         books = args
-    # interactive mode using fzf
     else:
         logger.debug('Received no CLI arguments, starting interactive mode')
         delimiter = ';'
-        # create lines for fzf
         books = [f"{ item.author } - { item.title } {delimiter} { item.url }" for item in get_all_books()]
         fzf = FzfPrompt()
         selection = fzf.prompt(choices=books, fzf_options=r'--exact --with-nth 1 -m -d\;')
         books = [item.split(';')[1].strip() for item in selection]
     logger.debug('Attempting to download from %d URL(s)', len(books))
-    converter = GBConvert(path)
-    if len(books)==1:
-        converter.download(books[0], showprogress=not silent, cleanpages= not no_clean)
+    converter = GBConvert('./')
+    if len(books) == 1:
+        converter.download(books[0], showprogress=not silent)
     else:
         for book in (tqdm(books) if not silent else books):
-            converter.download(book, cleanpages= not no_clean)
+            converter.download(book)
 
 if __name__ == "__main__":
     main()
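
Note on the parse_page change: f.read() leaves the file cursor at end-of-file, so the old code's f.write() appended the cleaned HTML after the original markup instead of replacing it. Adding f.seek(0) and f.truncate() makes the rewrite happen in place. A minimal standalone sketch of that read-modify-rewrite pattern (the helper name, file name, and selector below are hypothetical, not part of this commit):

from bs4 import BeautifulSoup

def strip_selectors(file_path, selectors):
    # Hypothetical helper mirroring parse_page: drop blocklisted tags, rewrite the file in place.
    with open(file_path, 'r+') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')  # read() moves the cursor to EOF
        for selector in selectors:
            for item in soup.select(selector):
                item.decompose()  # remove the tag and everything inside it
        f.seek(0)      # rewind; writing at EOF would append rather than replace
        f.truncate()   # discard leftover bytes in case the cleaned HTML is shorter
        f.write(str(soup))

# strip_selectors('chapter1.html', ['div.navigation'])  # hypothetical file and selector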