9 Commits
v1.3 ... v2.2.3

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| eneller | 75974ae119 | fix: slugify filenames | 2025-04-06 10:29:19 +02:00 |
| eneller | b3cd49326f | feat: prettier logging | 2025-04-05 01:42:20 +02:00 |
| eneller | 401d02e0ca | fix: parameter getdir | 2025-04-02 11:26:24 +02:00 |
| eneller | 660af7fab0 | feat: allow getting directory without download | 2025-03-23 23:55:05 +01:00 |
| eneller | c49a1be369 | docs: readme | 2025-03-20 22:11:12 +01:00 |
| eneller | 4267700763 | feat: return epub path (errors from wget and pandoc are thrown up) | 2025-03-16 20:30:42 +01:00 |
| eneller | 5d063d8597 | feat: restructure for memory efficiency | 2025-03-16 19:06:33 +01:00 |
| eneller | 6754f47e9f | fix: restructure test | 2025-03-16 18:57:40 +01:00 |
| eneller | 4a8d4f945d | begin restructure | 2025-03-16 18:34:12 +01:00 |
5 changed files with 115 additions and 68 deletions

**README.md**

````diff
@@ -1,18 +1,39 @@
 # epub2go.py
-web to epub converter for https://projekt-gutenberg.org.
+Web to ePUB Converter for [projekt-gutenberg.org](https://projekt-gutenberg.org) developed in conjunction with a [web interface](https://github.com/eneller/epub2go-web).
+## Installation
 Requires:
 - [pandoc](https://pandoc.org/)
 - [wget](https://www.gnu.org/software/wget/)
-- [fzf](https://github.com/junegunn/fzf) (only for interactive mode)
-- python (duh)
-## Usage
-Invoke the script using the url of any page of the book you would like to download:
-```
-epub2go https://www.projekt-gutenberg.org/ibsen/solness/
-```
-## Installation
+- [fzf](https://github.com/junegunn/fzf) (optional, only for interactive mode)
+- [python](https://www.python.org/) (duh)
 Assuming you have a recent version of python installed, run
 ```
 pip install git+https://github.com/eneller/epub2go.py
 ```
-This will provide the 'epub2go' command.
+This will provide the `epub2go` command.
+## Usage
+```
+Usage: epub2go [OPTIONS] [ARGS]...
+
+  Download ePUBs from https://www.projekt-gutenberg.org/
+
+  Provide either 0 arguments to enter interactive mode or an arbitrary number
+  of URLs to download from
+
+Options:
+  -d, --debug      Set the log level to DEBUG
+  -s, --silent     Disable the progress bar
+  -p, --path TEXT  The path to which files are saved
+  --no-clean       Do not parse html files with blocklist
+  --help           Show this message and exit.
+```
+Examples:
+```bash
+epub2go https://www.projekt-gutenberg.org/ibsen/solness/
+epub2go # will enter interactive mode
+```
````
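Taken together, the documented options compose in the obvious way; a hypothetical invocation using both of the new flags:

```bash
# save downloads under ./books and skip the blocklist cleanup pass
epub2go --path ./books --no-clean https://www.projekt-gutenberg.org/ibsen/solness/
```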

**pyproject.toml**

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "epub2go"
-version = "1.3"
+version = "2.2.3"
 description = "EPUB converter using wget, pandoc and python glue"
 readme = "README.md"
 requires-python = ">=3.12"
```

**convert.py** (module name inferred from the `from convert import …` in the scraping script below)

```diff
@@ -7,7 +7,7 @@ from tqdm import tqdm
 from pyfzf.pyfzf import FzfPrompt
 import click
-import os, subprocess, shlex, logging
+import os, subprocess, shlex, logging, re
 import importlib.resources as pkg_resources
 from dataclasses import dataclass
 from typing import List
@@ -23,75 +23,100 @@ class Book():
     url: str

 class GBConvert():
     def __init__(self,
-                 url:str,
-                 author:str = None,
-                 title:str = None,
-                 downloaddir = './',
-                 showprogress:bool = False,
+                 downloaddir,
                  ):
         # NOTE move non-code files to data folder
         self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
         with open(pkg_resources.files('epub2go').joinpath('blocklist.txt')) as blocklist:
             self.blocklist = blocklist.read().splitlines()
-        self.tocpage = os.path.dirname(url) # ToC website url
-        url = urlparse(self.tocpage)
         self.dir_download = downloaddir
-        self.dir_output = os.path.join(self.dir_download, url.netloc + url.path) # directories created by wget recreating the URL
-        logger.debug('Downloading in %s, expecting files in %s', self.dir_download, self.dir_output)
-        self.showprogress = showprogress
-        self.author = author
-        self.title = title
-        self.chapters = []
-        self.parse_meta()

-    def parse_meta(self):
-        response = requests.get(self.tocpage)
+    def getDir(self, url):
+        tocpage = os.path.dirname(url) # ToC website url
+        parsed_url = urlparse(tocpage)
+        # directories created by wget recreating the URL
+        dir_output = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
+        return dir_output
+
+    def download(self,
+                 url:str,
+                 author:str = None,
+                 title:str = None,
+                 showprogress: bool = False,
+                 cleanpages: bool = True,
+                 ):
+        tocpage = os.path.dirname(url) # ToC website url
+        dir_output = self.getDir(url)
+        logger.debug('Downloading to %s, expecting files in %s', self.dir_download, dir_output)
+        # parse_meta
+        response = requests.get(tocpage)
         response.raise_for_status()
         soup = BeautifulSoup(response.content, 'html.parser')
         # TODO allow setting these from interactive mode where those parameters are figured out from the list
-        if not self.author:
+        if not author:
             try:
-                self.author = soup.find('meta', {'name': 'author'})['content']
+                author = soup.find('meta', {'name': 'author'})['content']
             except:
-                self.author = "UnknownAuthor"
-        if not self.title:
+                author = "UnknownAuthor"
+        if not title:
             try:
-                self.title = soup.find('meta', {'name': 'title'})['content']
+                title = soup.find('meta', {'name': 'title'})['content']
             except:
-                self.title = "UnknownTitle"
-        self.toc = soup.find('ul').find_all('a')
-        logger.debug('Found ToC with %d entries', len(self.toc))
+                title = "UnknownTitle"
+        chapter_urls = soup.find('ul').find_all('a')
+        logger.debug('Found ToC with %d entries', len(chapter_urls))
+        # run
+        #TODO include images flag
+        # download all files in toc (chapters)
+        chapter_files = []
+        for item in (tqdm(chapter_urls) if showprogress else chapter_urls):
+            item_url = self.parse_toc_entry(tocpage, item)
+            parsed_url = urlparse(item_url)
+            filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
+            if cleanpages: self.parse_page(filepath)
+            chapter_files.append(os.path.basename(item_url))
+        return self.create_epub(author, title, chapter_files, dir_output)

-    def parse_toc_entry(self, entry):
-        url = os.path.join(self.tocpage, entry['href'])
+    def parse_toc_entry(self, tocpage, entry):
+        url = os.path.join(tocpage, entry['href'])
         self.save_page(url)
         return url

     # apply blocklist to file
     def parse_page(self, file_path):
         #TODO clean up file opening, mmap?
-        logger.debug('Parsing page at %s', file_path)
+        count = 0
         with open(file_path, 'r+') as f:
             soup = BeautifulSoup(f.read(), 'html.parser')
             for blocker in self.blocklist:
                 for item in soup.select(blocker):
                     item.decompose()
+                    count += 1
+            f.seek(0)
+            f.truncate()
             f.write(str(soup))
+        logger.debug('Removed %d tags from page %s during parsing', count, file_path)

-    def create_epub(self, filename='out.epub') -> int:
+    def create_epub(self, author, title, chapters, dir_output):
         #TODO --epub-cover-image
         #TODO toc if it isnt described by <h> tags, e.g. https://www.projekt-gutenberg.org/adlersfe/maskenba/
-        logger.debug('Creating epub as "%s"', filename)
+        filename = slugify(f'{title} - {author}.epub')
         command = f'''pandoc -f html -t epub \
                 -o "{filename}" \
                 --reference-location=section \
                 --css="{self.style_path_drama}" \
-                --metadata title="{self.title}" \
-                --metadata author="{self.author}" \
+                --metadata title="{title}" \
+                --metadata author="{author}" \
                 --epub-title-page=false \
-                {" ".join(self.chapters)} '''
-        return subprocess.run(shlex.split(command), cwd=self.dir_output).returncode
+                {" ".join(chapters)} '''
+        logger.debug('Calling "%s"', command)
+        subprocess.run(shlex.split(command), cwd=dir_output, check=True)
+        return os.path.abspath(os.path.join(dir_output, filename))

     def save_page(self, url):
         logger.debug('Saving page at %s', url)
```
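Worth noting on `parse_page`: after `f.read()` the file offset sits at end-of-file, so the old version's bare `f.write()` appended the cleaned HTML after the original document instead of replacing it. The added `seek(0)`/`truncate()` pair makes it a genuine in-place rewrite; a minimal standalone sketch of that pattern (with a stand-in for the blocklist pass):

```python
# Minimal sketch of the in-place rewrite pattern parse_page now follows.
with open("page.html", "r+") as f:
    html = f.read()                       # offset is now at EOF
    cleaned = html.replace("\r\n", "\n")  # stand-in for the blocklist cleanup
    f.seek(0)      # rewind; otherwise write() would append after the old content
    f.truncate()   # drop the old bytes so a shorter result leaves no stale tail
    f.write(cleaned)
```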
```diff
@@ -103,19 +128,7 @@ class GBConvert():
                 --tries=5 \
                 --quiet \
                 {url}'''
-        return subprocess.run(shlex.split(command), cwd=self.dir_download).returncode
-
-    def run(self):
-        #TODO include images flag
-        # download all files in toc (chapters)
-        for item in (tqdm(self.toc) if self.showprogress else self.toc):
-            item_url = self.parse_toc_entry(item)
-            parsed_url = urlparse(item_url)
-            filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
-            self.parse_page(filepath)
-            self.chapters.append(os.path.basename(item_url))
-        return self.create_epub(f'{self.title} - {self.author}.epub')
+        subprocess.run(shlex.split(command), cwd=self.dir_download, check=True)

 # get a list of all books for interactive selection or scraping
 def get_all_books() -> List[Book]:
```
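The removal of `run()` completes the restructure: per-book state (`self.toc`, `self.chapters`, `self.author`, `self.title`) no longer accumulates on the instance, so a single `GBConvert` bound only to a download directory can be reused across books. A sketch of the resulting call pattern, as implied by the diff:

```python
# before (v1.3): one throwaway instance per book, state kept on self
# GBConvert(url, showprogress=True).run()

# after (v2.x): one reusable converter; download() returns the epub's absolute path
converter = GBConvert('./downloads')
epub_path = converter.download(
    'https://www.projekt-gutenberg.org/ibsen/solness/',
    showprogress=True,
)
print(epub_path)
```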
```diff
@@ -147,14 +160,24 @@ def get_all_books() -> List[Book]:
             books.append(book)
     return books

+def slugify(value, replacement='_'):
+    value = re.sub(r'[<>:"/\\|?*\x00-\x1F]', replacement, value)
+    # Remove leading/trailing whitespace or dots
+    value = value.strip().strip(".")
+    # Optionally truncate to safe length (e.g. 255 chars for most filesystems)
+    return value[:255] or "untitled"
+
 # run main cli
 @click.command()
+#TODO include images flag
 @click.option('--debug', '-d', is_flag=True, help='Set the log level to DEBUG')
 @click.option('--silent', '-s', is_flag=True, help='Disable the progress bar')
+@click.option('--path', '-p', type=str, default='./', help='The path to which files are saved')
+@click.option('--no-clean', is_flag=True, help='Do not parse html files with blocklist')
 @click.argument('args', nargs=-1)
-def main(args, debug, silent):
+def main(args, debug, silent, path, no_clean):
     '''
-    Download ePUBs from https://www.projekt-gutenberg.org/
+    Download ePUBs from https://www.projekt-gutenberg.org/ \n
     Provide either 0 arguments to enter interactive mode or an arbitrary number of URLs to download from
     '''
     logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')
```
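The character class in the new `slugify` helper covers the characters Windows forbids in filenames (`<>:"/\|?*` plus ASCII control characters), a superset of the POSIX restrictions (`/` and NUL), so generated names are safe on either family of filesystems. Expected behavior of the helper as written:

```python
>>> slugify('Baumeister Solness - Henrik Ibsen.epub')
'Baumeister Solness - Henrik Ibsen.epub'
>>> slugify('Wer war es? - Anonym.epub')  # '?' is reserved on Windows
'Wer war es_ - Anonym.epub'
>>> slugify(' . ')                         # degenerate input falls back
'untitled'
```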
```diff
@@ -173,10 +196,12 @@ def main(args, debug, silent):
         books = [item.split(';')[1].strip() for item in selection]

     logger.debug('Attempting to download from %d URL(s)', len(books))
+    converter = GBConvert(path)
     if len(books) == 1:
-        GBConvert(books[0], showprogress=not silent).run()
+        converter.download(books[0], showprogress=not silent, cleanpages=not no_clean)
     else:
         for book in (tqdm(books) if not silent else books):
-            GBConvert(book).run()
+            converter.download(book, cleanpages=not no_clean)

 if __name__ == "__main__":
     main()
```

**Bulk scraping script** (filename not shown)

```diff
@@ -11,9 +11,10 @@ from convert import GBConvert, allbooks_url, get_all_books, Book
 def main():
     books = get_all_books()
     # NOTE consider making this a map()
+    converter = GBConvert('./')
     for book in tqdm(books):
         if book.url is not None:
-            GBConvert(book.url).run()
+            converter.download(book.url)

 if __name__ == "__main__":
```

**uv.lock** (generated)

```diff
@@ -81,7 +81,7 @@ wheels = [
 [[package]]
 name = "epub2go"
-version = "1.2"
+version = "2.2"
 source = { editable = "." }
 dependencies = [
     { name = "beautifulsoup4" },
```