fix: slugify filenames

feat: prettier logging
fix: parameter getdir
2025-04-06 10:29:19 +02:00 · 2025-04-05 01:42:20 +02:00 · 2025-04-02 11:26:24 +02:00 · 2025-03-23 23:55:05 +01:00 · 2025-03-20 22:11:12 +01:00
4 changed files with 55 additions and 21 deletions
--- a/README.md
+++ b/README.md
@@ -1,18 +1,39 @@
 # epub2go.py
-web to epub converter for https://projekt-gutenberg.org.
+Web to ePUB converter for [projekt-gutenberg.org](https://projekt-gutenberg.org), developed in conjunction with a [web interface](https://github.com/eneller/epub2go-web).
+
+## Installation
 Requires:
 - [pandoc](https://pandoc.org/)
 - [wget](https://www.gnu.org/software/wget/)
- [fzf](https://github.com/junegunn/fzf) (only for interactive mode)
- python (duh)
-## Usage
-Invoke the script using the url of any page of the book you would like to download:
-``` 
-epub2go https://www.projekt-gutenberg.org/ibsen/solness/
-```
-## Installation
+- [fzf](https://github.com/junegunn/fzf) (optional, only for interactive mode)
+- [python](https://www.python.org/) (duh)
+
 Assuming you have a recent version of python installed, run
+
 ```
 pip install git+https://github.com/eneller/epub2go.py
 ```
-   This will provide the 'epub2go' command.
+This will provide the `epub2go` command.
+
+## Usage
+```
+Usage: epub2go [OPTIONS] [ARGS]...
+
+  Download ePUBs from https://www.projekt-gutenberg.org/
+
+  Provide either 0 arguments to enter interactive mode or an arbitrary number
+  of URLs to download from
+
+Options:
+  -d, --debug      Set the log level to DEBUG
+  -s, --silent     Disable the progress bar
+  -p, --path TEXT  The path to which files are saved
+  --no-clean       Do not parse html files with blocklist
+  --help           Show this message and exit.
+```
+
+Examples:
+```bash
+epub2go https://www.projekt-gutenberg.org/ibsen/solness/
+epub2go # will enter interactive mode
+```
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "epub2go"
-version = "2.1"
+version = "2.2.3"
 description = "EPUB converter using wget, pandoc and python glue"
 readme = "README.md"
 requires-python = ">=3.12"
--- a/src/epub2go/convert.py
+++ b/src/epub2go/convert.py
@@ -7,7 +7,7 @@ from tqdm import tqdm
 from pyfzf.pyfzf import FzfPrompt
 import click

-import os, subprocess, shlex, logging
+import os, subprocess, shlex, logging, re
 import importlib.resources as pkg_resources
 from dataclasses import dataclass
 from typing import List
@@ -31,6 +31,13 @@ class GBConvert():
            self.blocklist = blocklist.read().splitlines()
        self.dir_download = downloaddir

+    def getDir(self, url):
+        tocpage = os.path.dirname(url) # ToC website url
+        parsed_url = urlparse(tocpage)
+        # directories created by wget recreating the URL
+        dir_output = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path )
+        return dir_output
+        
    def download(self,
        url:str,
        author:str = None,
@@ -39,8 +46,7 @@ class GBConvert():
        cleanpages: bool = True,
    ):
        tocpage = os.path.dirname(url) # ToC website url
-        url = urlparse(tocpage)
-        dir_output = os.path.join(self.dir_download, url.netloc + url.path )# directories created by wget recreating the URL
+        dir_output = self.getDir(url)
        logger.debug('Downloading to %s, expecting files in in %s', self.dir_download, dir_output)
        author = author
        title = title
@@ -99,8 +105,7 @@ class GBConvert():
    def create_epub(self, author, title, chapters, dir_output):
        #TODO --epub-cover-image
        #TODO toc if it isnt described by <h> tags, e.g. https://www.projekt-gutenberg.org/adlersfe/maskenba/
-        filename = f'{title} - {author}.epub'
-        logger.debug('Creating epub as "%s"',filename)
+        filename = slugify(f'{title} - {author}.epub')
        command = f'''pandoc -f html -t epub \
                    -o "{filename}" \
                    --reference-location=section \
@@ -109,6 +114,7 @@ class GBConvert():
                    --metadata author="{author}" \
                    --epub-title-page=false \
                    {" ".join(chapters)} '''
+        logger.debug('Calling "%s"', command)
        subprocess.run(shlex.split(command), cwd=dir_output, check=True)
        return os.path.abspath(os.path.join(dir_output,filename))

@@ -154,6 +160,13 @@ def get_all_books() -> List[Book]:
                books.append(book)
    return books

+def slugify(value, replacement='_'):
+    value = re.sub(r'[<>:"/\\|?*\x00-\x1F]', replacement, value)
+    # Remove leading/trailing whitespace or dots
+    value = value.strip().strip(".")
+    # Optionally truncate to safe length (e.g. 255 chars for most filesystems)
+    return value[:255] or "untitled"
+
 # run main cli
@click.command()
 #TODO include images flag
@@ -164,7 +177,7 @@ def get_all_books() -> List[Book]:
@click.argument('args', nargs=-1)
 def main(args, debug, silent, path, no_clean):
    '''
-    Download ePUBs from https://www.projekt-gutenberg.org/
+    Download ePUBs from https://www.projekt-gutenberg.org/ \n
    Provide either 0 arguments to enter interactive mode or an arbitrary number of URLs to download from
    '''
    logging.basicConfig(level=logging.ERROR,format='%(asctime)s - %(levelname)s - %(message)s')
--- a/uv.lock
+++ b/uv.lock
@@ -81,7 +81,7 @@ wheels = [

 [[package]]
 name = "epub2go"
-version = "1.2"
+version = "2.2"
 source = { editable = "." }
 dependencies = [
    { name = "beautifulsoup4" },
Author	SHA1	Message	Date
eneller	75974ae119	fix: slugify filenames	2025-04-06 10:29:19 +02:00
eneller	b3cd49326f	feat: prettier logging	2025-04-05 01:42:20 +02:00
eneller	401d02e0ca	fix: parameter getdir	2025-04-02 11:26:24 +02:00
eneller	660af7fab0	feat: allow getting directory without download	2025-03-23 23:55:05 +01:00
eneller	c49a1be369	docs: readme	2025-03-20 22:11:12 +01:00