feat: restructure for memory efficiency

2025-03-16 19:00:01 +01:00
parent 6754f47e9f
commit 5d063d8597
4 changed files with 62 additions and 213 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "epub2go"
-version = "1.3"
+version = "2.0"
 description = "EPUB converter using wget, pandoc and python glue"
 readme = "README.md"
 requires-python = ">=3.12"
--- a/src/epub2go/convert.py
+++ b/src/epub2go/convert.py
@@ -23,75 +23,93 @@ class Book():
    url: str
 class GBConvert():
    def __init__(self,
-        url:str,
+        downloaddir,
        author:str = None,
        title:str = None,
        downloaddir = './',
        showprogress:bool = False,
        ):
        # NOTE move non-code files to data folder
        self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
        with open(pkg_resources.files('epub2go').joinpath('blocklist.txt')) as blocklist:
            self.blocklist = blocklist.read().splitlines()
        self.tocpage = os.path.dirname(url) # ToC website url
        url = urlparse(self.tocpage)
        self.dir_download = downloaddir
        self.dir_output = os.path.join(self.dir_download, url.netloc + url.path )# directories created by wget recreating the URL
        logger.debug('Downloading in %s, expecting files in in %s', self.dir_download, self.dir_output)
        self.showprogress = showprogress
        self.author = author
        self.title = title
        self.chapters = []
-        self.parse_meta()
+    def download(self,
        url:str,
        author:str = None,
        title:str = None,
        showprogress: bool = False,
        cleanpages: bool = True,
    ):
        tocpage = os.path.dirname(url) # ToC website url
        url = urlparse(tocpage)
        dir_output = os.path.join(self.dir_download, url.netloc + url.path )# directories created by wget recreating the URL
        logger.debug('Downloading to %s, expecting files in in %s', self.dir_download, dir_output)
        author = author
        title = title
-    def parse_meta(self):
+        #parse_meta
-        response = requests.get(self.tocpage)
+        response = requests.get(tocpage)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # TODO allow setting these from interactive mode where those parameters are figured out from the list
-        if not self.author:
+        if not author:
            try:
-                self.author = soup.find('meta', {'name': 'author'})['content']
+                author = soup.find('meta', {'name': 'author'})['content']
            except:
-                self.author = "UnknownAuthor"
+                author = "UnknownAuthor"
-        if not self.title:
+        if not title:
            try:
-                self.title = soup.find('meta', {'name': 'title'})['content']
+                title = soup.find('meta', {'name': 'title'})['content']
            except:
-                self.title = "UnknownTitle"
+                title = "UnknownTitle"
-        self.toc = soup.find('ul').find_all('a')
+        chapter_urls = soup.find('ul').find_all('a')
-        logger.debug('Found ToC with %d entries', len(self.toc))
+        logger.debug('Found ToC with %d entries', len(chapter_urls))
-    def parse_toc_entry(self, entry):
+        #run
-        url = os.path.join(self.tocpage, entry['href'])
+        #TODO include images flag
        # download all files in toc (chapters)
        chapter_files = []
        for item in (tqdm(chapter_urls) if showprogress else chapter_urls):
            item_url = self.parse_toc_entry(tocpage, item)
            parsed_url = urlparse(item_url)
            filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
            if cleanpages: self.parse_page(filepath)
            chapter_files.append(os.path.basename(item_url))
        return self.create_epub(author,title,chapter_files,dir_output)
    def parse_toc_entry(self, tocpage, entry):
        """Download the page a table-of-contents entry points to.

        tocpage: base URL of the table of contents the entry belongs to.
        entry:   object exposing the link target as ``entry['href']``.

        Returns the joined URL that was handed to ``save_page``.
        """
        # NOTE(review): os.path.join uses the local path separator, so this
        # only yields a valid URL where that separator is '/'.
        url = os.path.join(tocpage, entry['href'])
        # The return code of save_page is ignored; a failed download is not
        # detected here.
        self.save_page(url)
        return url
    # apply blocklist to file
    def parse_page(self,file_path):
        #TODO clean up file opening, mmap?
-        logger.debug('Parsing page at %s', file_path)
+        count=0
        with open(file_path, 'r+') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
            for blocker in self.blocklist:
                for item in soup.select(blocker):
                    item.decompose()
                    count+=1
            f.seek(0)
            f.truncate()
            f.write(str(soup))
        logger.debug('Removed %d tags from page %s during parsing', count, file_path)
-    def create_epub(self,  filename='out.epub')-> int:
+    def create_epub(self, author, title, chapters, dir_output)-> int:
        #TODO --epub-cover-image
        #TODO toc if it isnt described by <h> tags, e.g. https://www.projekt-gutenberg.org/adlersfe/maskenba/
        filename = f'{title} - {author}.epub'
        logger.debug('Creating epub as "%s"',filename)
        command = f'''pandoc -f html -t epub \
                    -o "{filename}" \
                    --reference-location=section \
                    --css="{self.style_path_drama}" \
-                    --metadata title="{self.title}" \
+                    --metadata title="{title}" \
-                    --metadata author="{self.author}" \
+                    --metadata author="{author}" \
                    --epub-title-page=false \
-                    {" ".join(self.chapters)} '''
+                    {" ".join(chapters)} '''
-        return subprocess.run(shlex.split(command), cwd=self.dir_output).returncode
+        return subprocess.run(shlex.split(command), cwd=dir_output).returncode
    def save_page(self, url):
        logger.debug('Saving page at %s', url)
@@ -104,18 +122,6 @@ class GBConvert():
                    --quiet \
                    {url}'''
        return subprocess.run(shlex.split(command), cwd=self.dir_download).returncode
    def run(self):
        """Download every ToC entry, clean each page and build the EPUB.

        Iterates over ``self.toc`` (wrapped in a tqdm progress bar when
        ``self.showprogress`` is set), records each chapter's file name in
        ``self.chapters`` and returns whatever ``create_epub`` returns.
        """
        #TODO include images flag
        # download all files in toc (chapters)
        for item in (tqdm(self.toc) if self.showprogress else self.toc):
            item_url = self.parse_toc_entry(item)
            parsed_url = urlparse(item_url)
            # Host + path below the download directory; presumably the location
            # where the downloader stored the page - verify against save_page.
            filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
            self.parse_page(filepath)
            # Only the base name is kept for the chapter list.
            self.chapters.append(os.path.basename(item_url))
        return self.create_epub(f'{self.title} - {self.author}.epub')
 # get a list of all books for interactive selection or scraping
 def get_all_books() -> List[Book]:
@@ -149,10 +155,13 @@ def get_all_books() -> List[Book]:
 # run main cli
@click.command()
 #TODO include images flag
@click.option('--debug', '-d', is_flag=True, help='Set the log level to DEBUG')
@click.option('--silent', '-s', is_flag=True, help='Disable the progress bar')
@click.option('--path','-p',type=str,default='./', help='The path to which files are saved' )
@click.option('--no-clean',is_flag=True,help='Do not parse html files with blocklist')
@click.argument('args', nargs=-1)
-def main(args, debug, silent):
+def main(args, debug, silent, path, no_clean):
    '''
    Download ePUBs from https://www.projekt-gutenberg.org/
    Provide either 0 arguments to enter interactive mode or an arbitrary number of URLs to download from
@@ -173,10 +182,12 @@ def main(args, debug, silent):
        books = [item.split(';')[1].strip() for item in selection]
    logger.debug('Attempting to download from %d URL(s)', len(books))
    converter = GBConvert(path)
    if len(books)==1:
-        GBConvert(books[0], showprogress=not silent).run()
+        converter.download(books[0], showprogress=not silent, cleanpages= not no_clean)
    else:
        for book in (tqdm(books) if not silent else books):
-                GBConvert(book).run()
+            converter.download(book, cleanpages= not no_clean)
 if __name__ == "__main__":
    main()
--- a/src/epub2go/crawl.py
+++ b/src/epub2go/crawl.py
@@ -11,9 +11,10 @@ from convert import GBConvert, allbooks_url, get_all_books, Book
 def main():
    books = get_all_books()
    # NOTE consider making this a map()
    converter = GBConvert('./')
    for book in tqdm(books):
        if book.url is not None:
-            GBConvert(book.url).run()
+            converter.download(book.url)
 if __name__ == "__main__":
--- a/src/epub2go/test.py
+++ b/src/epub2go/test.py
@@ -1,163 +0,0 @@
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 from urllib.request import urlparse
 from tqdm import tqdm
 from pyfzf.pyfzf import FzfPrompt
 import click
 import os, subprocess, shlex, logging
 import importlib.resources as pkg_resources
 from dataclasses import dataclass
 from typing import List
 logger = logging.getLogger(__name__)
 allbooks_url = 'https://www.projekt-gutenberg.org/info/texte/allworka.html'
@dataclass
 class Book():
    """One entry of the book catalogue: who wrote it, what it is called, where it lives."""
    # Author name.
    author: str
    # Title of the work.
    title: str
    # URL of the book.
    url: str
 class GBConvert():
    """Download a book chapter by chapter with wget and convert it to EPUB with pandoc.

    One instance can be reused for any number of books: it only holds the
    stylesheet path, the blocklist and the download directory.
    """

    def __init__(self, downloaddir):
        """Load the packaged stylesheet path and blocklist.

        downloaddir: directory wget is run in; downloaded pages are looked up
                     below it.
        """
        # NOTE move non-code files to data folder
        self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
        with open(pkg_resources.files('epub2go').joinpath('blocklist.txt')) as blocklist:
            # One selector per line; each is later passed to soup.select().
            self.blocklist = blocklist.read().splitlines()
        self.dir_download = downloaddir

    @staticmethod
    def _meta_content(soup, name, fallback):
        """Return the content of <meta name=NAME>, or FALLBACK if it is absent."""
        try:
            return soup.find('meta', {'name': name})['content']
        except (TypeError, KeyError):
            # TypeError: no such <meta> tag (find() returned None).
            # KeyError:  the tag exists but has no content attribute.
            return fallback

    def download(self, url: str, author: str = None, title: str = None, showprogress: bool = False):
        """Download all chapters of the book at URL and build an EPUB from them.

        url:          URL of a page of the book; its parent "directory" is
                      used as the table-of-contents page.
        author/title: metadata overrides; when falsy they are read from the
                      ToC page's <meta> tags, falling back to
                      "UnknownAuthor" / "UnknownTitle".
        showprogress: wrap the chapter loop in a tqdm progress bar.

        Returns the return code of the pandoc run (see create_epub).
        Propagates the error raised by ``response.raise_for_status()`` when
        the ToC page cannot be fetched.
        """
        tocpage = os.path.dirname(url)  # ToC website url
        toc_location = urlparse(tocpage)
        # directories created by wget recreating the URL
        dir_output = os.path.join(self.dir_download, toc_location.netloc + toc_location.path)
        logger.debug('Downloading in %s, expecting files in %s', self.dir_download, dir_output)
        response = requests.get(tocpage)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        if not author:
            author = self._meta_content(soup, 'author', "UnknownAuthor")
        if not title:
            title = self._meta_content(soup, 'title', "UnknownTitle")
        toc = soup.find('ul').find_all('a')
        logger.debug('Found ToC with %d entries', len(toc))
        chapters = []
        for item in (tqdm(toc) if showprogress else toc):
            item_url = self.parse_toc_entry(tocpage, item)
            parsed_url = urlparse(item_url)
            filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
            self.parse_page(filepath)
            chapters.append(os.path.basename(item_url))
        return self.create_epub(author, title, chapters, dir_output)

    def parse_toc_entry(self, tocpage, entry):
        """Download the page referenced by a ToC link and return its URL."""
        url = os.path.join(tocpage, entry['href'])
        self.save_page(url)
        return url

    def parse_page(self, file_path):
        """Remove every element matching a blocklist selector from the file, in place."""
        logger.debug('Parsing page at %s', file_path)
        with open(file_path, 'r+') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
            for blocker in self.blocklist:
                for item in soup.select(blocker):
                    item.decompose()
            # Rewind and truncate so the cleaned markup replaces the old
            # content instead of being appended to it.
            f.seek(0)
            f.truncate()
            f.write(str(soup))

    def create_epub(self, author, title, chapters, dir_output) -> int:
        """Run pandoc in DIR_OUTPUT to build "<title> - <author>.epub".

        chapters: file names relative to dir_output, in reading order.
        Returns pandoc's return code.
        """
        filename = f'{title} - {author}.epub'
        logger.debug('Creating epub as "%s"', filename)
        # Pass an argument list rather than a formatted string: titles,
        # authors and file names come from scraped HTML and may contain quotes
        # or spaces that would corrupt a re-split command line.
        command = [
            'pandoc', '-f', 'html', '-t', 'epub',
            '-o', filename,
            '--reference-location=section',
            f'--css={self.style_path_drama}',
            '--metadata', f'title={title}',
            '--metadata', f'author={author}',
            '--epub-title-page=false',
            *chapters,
        ]
        return subprocess.run(command, cwd=dir_output).returncode

    def save_page(self, url):
        """Fetch URL with wget into the download directory; return wget's return code."""
        logger.debug('Saving page at %s', url)
        # Argument list for the same reason as in create_epub: the URL is
        # taken from scraped links and must stay a single argument.
        command = [
            'wget',
            '--timestamping',
            '--page-requisites',
            '--convert-links',
            '--tries=5',
            '--quiet',
            url,
        ]
        return subprocess.run(command, cwd=self.dir_download).returncode
 # get a list of all books for interactive selection or scraping
 def get_all_books() -> List[Book]:
    """Scrape the catalogue page at ``allbooks_url`` into a list of Book records.

    The page's <dl> is walked in document order: a <dt> sets the current
    author and every following <dd> containing a link yields one Book
    for that author.

    Propagates the error raised by ``response.raise_for_status()`` when the
    catalogue page cannot be fetched.
    """
    response = requests.get(allbooks_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    tags = soup.find('dl').find_all()
    books = []
    # Defined up front so a <dd> that appears before any <dt> gets a
    # placeholder author instead of raising on an unbound name.
    book_author = "UnknownAuthor"
    for tag in tags:
        if tag.name == 'dt':
            br_tag = tag.find('br')
            if br_tag:
                # With a <br>, the author is whatever directly follows it.
                book_author = str(br_tag.next_sibling)
            else:
                book_author = tag.get_text(strip=True)
            # Collapse runs of whitespace (including newlines) to single spaces.
            book_author = ' '.join(book_author.split())
        elif tag.name == 'dd':
            book_tag = tag.a
            if book_tag:
                book_href = book_tag.get('href')
                book_url = urljoin(allbooks_url, book_href)
                book_title = ' '.join(book_tag.get_text().split())
                books.append(Book(book_author, book_title, book_url))
    return books
 # run main cli
@click.command()
@click.option('--debug', '-d', is_flag=True, help='Set the log level to DEBUG')
@click.option('--silent', '-s', is_flag=True, help='Disable the progress bar')
@click.argument('args', nargs=-1)
 def main(args, debug, silent):
    '''
    Download ePUBs from https://www.projekt-gutenberg.org/
    Provide either 0 arguments to enter interactive mode or an arbitrary number of URLs to download from
    '''
    logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')
    if(debug): logger.setLevel(logging.DEBUG)
    # non-interactive mode
    if len(args) > 0:
        books = args
    else:
        logger.debug('Received no CLI arguments, starting interactive mode')
        delimiter = ';'
        books = [f"{ item.author } - { item.title } {delimiter} { item.url }" for item in get_all_books()]
        fzf = FzfPrompt()
        selection = fzf.prompt(choices=books, fzf_options=r'--exact --with-nth 1 -m -d\;')
        books = [item.split(';')[1].strip() for item in selection]
    logger.debug('Attempting to download from %d URL(s)', len(books))
    converter = GBConvert('./')
    if len(books) == 1:
        converter.download(books[0], showprogress=not silent)
    else:
        for book in (tqdm(books) if not silent else books):
            converter.download(book)
 if __name__ == "__main__":
    main()