fix: slugify filenames

feat: prettier logging
fix: parameter getdir
2025-04-06 10:29:19 +02:00 · 2025-04-05 01:42:20 +02:00 · 2025-04-02 11:26:24 +02:00 · 2025-03-23 23:55:05 +01:00 · 2025-03-20 22:11:12 +01:00 · 2025-03-16 20:30:42 +01:00
8 changed files with 219 additions and 135 deletions
--- a/README.md
+++ b/README.md
@@ -1,18 +1,39 @@
 # epub2go.py
-web to epub converter for https://projekt-gutenberg.org.
+Web to ePUB converter for [projekt-gutenberg.org](https://projekt-gutenberg.org), developed in conjunction with a [web interface](https://github.com/eneller/epub2go-web).
+
+## Installation
 Requires:
 - [pandoc](https://pandoc.org/)
 - [wget](https://www.gnu.org/software/wget/)
- [fzf](https://github.com/junegunn/fzf) (only for interactive mode)
- python (duh)
-## Usage
-Invoke the script using the url of any page of the book you would like to download:
-``` 
-epub2go https://www.projekt-gutenberg.org/ibsen/solness/
+- [fzf](https://github.com/junegunn/fzf) (optional, only for interactive mode)
+- [python](https://www.python.org/) (duh)
+
+Assuming you have Python 3.12 or newer installed, run
+
+```
+pip install git+https://github.com/eneller/epub2go.py
+```
+This will provide the `epub2go` command.
+
+## Usage
+```
+Usage: epub2go [OPTIONS] [ARGS]...
+
+  Download ePUBs from https://www.projekt-gutenberg.org/
+
+  Provide either 0 arguments to enter interactive mode or an arbitrary number
+  of URLs to download from
+
+Options:
+  -d, --debug      Set the log level to DEBUG
+  -s, --silent     Disable the progress bar
+  -p, --path TEXT  The path to which files are saved
+  --no-clean       Do not parse html files with blocklist
+  --help           Show this message and exit.
+```
+
+Examples:
+```bash
+epub2go https://www.projekt-gutenberg.org/ibsen/solness/
+epub2go # will enter interactive mode
 ```
-## Installation
-   Assuming you have a recent version of python installed, run
-   ```
-   pip install git+https://github.com/eneller/epub2go.py
-   ```
-   This will provide the 'epub2go' command.
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,11 +1,12 @@
 [project]
 name = "epub2go"
-version = "1.2"
+version = "2.2.3"
 description = "EPUB converter using wget, pandoc and python glue"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
    "beautifulsoup4==4.12.3",
+    "click>=8.1.8",
    "pyfzf>=0.3.1", # hasnt been updated for some time
    "requests==2.32.3",
    "tqdm>=4.67.1",
@@ -21,4 +22,4 @@ include-package-data = true
 requires = ["setuptools>=64", "setuptools_scm>=8"]

 [tool.setuptools_scm]
-# can be empty if no extra settings are needed, presence enables setuptools-scm
+# can be empty if no extra settings are needed, presence enables setuptools-scm
--- a/src/epub2go/convert.py
+++ b/src/epub2go/convert.py
@@ -5,38 +5,121 @@ from urllib.parse import urljoin
 from urllib.request import  urlparse
 from tqdm import tqdm
 from pyfzf.pyfzf import FzfPrompt
+import click

-import os, sys
+import os, subprocess, shlex, logging, re
 import importlib.resources as pkg_resources
+from dataclasses import dataclass
+from typing import List

+logger = logging.getLogger(__name__)

 allbooks_url ='https://www.projekt-gutenberg.org/info/texte/allworka.html'
-root_url = '{url.scheme}://{url.netloc}'.format(url = urlparse(allbooks_url))

+@dataclass
+class Book():
+    author: str
+    title: str
+    url: str
 class GBConvert():
-    #TODO fix toc / headings
-    
    def __init__(self,
-        url:str,
-        standalone = False,
+        downloaddir,
        ):
        # NOTE move non-code files to data folder
        self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
-        self.blocklist = open(pkg_resources.files('epub2go').joinpath('blocklist.txt')).read().splitlines()
-        self.root = os.path.dirname(url)
-        self.url = urlparse(self.root)
-        self.output = self.url.netloc + self.url.path
-        self.standalone = standalone
+        with open(pkg_resources.files('epub2go').joinpath('blocklist.txt')) as blocklist:
+            self.blocklist = blocklist.read().splitlines()
+        self.dir_download = downloaddir
+
+    def getDir(self, url):
+        tocpage = os.path.dirname(url) # ToC website url
+        parsed_url = urlparse(tocpage)
+        # directories created by wget recreating the URL
+        dir_output = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path )
+        return dir_output
        
-    def get_meta(self):
-        response = requests.get(self.root)
+    def download(self,
+        url:str,
+        author:str = None,
+        title:str = None,
+        showprogress: bool = False,
+        cleanpages: bool = True,
+    ):
+        tocpage = os.path.dirname(url) # ToC website url
+        dir_output = self.getDir(url)
+        logger.debug('Downloading to %s, expecting files in in %s', self.dir_download, dir_output)
+        author = author
+        title = title
+
+        #parse_meta
+        response = requests.get(tocpage)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
-        self.author = soup.find('meta', {'name': 'author'})['content']
-        self.title = soup.find('meta', {'name': 'title'})['content']
-        self.toc = soup.find('ul').find_all('a')
-    
+        # TODO allow setting these from interactive mode where those parameters are figured out from the list
+        if not author:
+            try:
+                author = soup.find('meta', {'name': 'author'})['content']
+            except:
+                author = "UnknownAuthor"
+        if not title:
+            try:
+                title = soup.find('meta', {'name': 'title'})['content']
+            except:
+                title = "UnknownTitle"
+        chapter_urls = soup.find('ul').find_all('a')
+        logger.debug('Found ToC with %d entries', len(chapter_urls))
+
+        #run
+        #TODO include images flag
+        # download all files in toc (chapters)
+        chapter_files = []
+        for item in (tqdm(chapter_urls) if showprogress else chapter_urls):
+            item_url = self.parse_toc_entry(tocpage, item)
+            parsed_url = urlparse(item_url)
+            filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
+            if cleanpages: self.parse_page(filepath)
+            chapter_files.append(os.path.basename(item_url))
+        
+        return self.create_epub(author,title,chapter_files,dir_output)
+        
+    def parse_toc_entry(self, tocpage, entry):
+        url = os.path.join(tocpage, entry['href'])
+        self.save_page(url)
+        return url
+
+    # apply blocklist to file
+    def parse_page(self,file_path):
+        #TODO clean up file opening, mmap?
+        count=0
+        with open(file_path, 'r+') as f:
+            soup = BeautifulSoup(f.read(), 'html.parser')
+            for blocker in self.blocklist:
+                for item in soup.select(blocker):
+                    item.decompose()
+                    count+=1
+            f.seek(0)
+            f.truncate()
+            f.write(str(soup))
+        logger.debug('Removed %d tags from page %s during parsing', count, file_path)
+
+    def create_epub(self, author, title, chapters, dir_output):
+        #TODO --epub-cover-image
+        #TODO build the toc when it isn't described by <h> tags, e.g. https://www.projekt-gutenberg.org/adlersfe/maskenba/
+        filename = slugify(f'{title} - {author}.epub')
+        command = f'''pandoc -f html -t epub \
+                    -o "{filename}" \
+                    --reference-location=section \
+                    --css="{self.style_path_drama}" \
+                    --metadata title="{title}" \
+                    --metadata author="{author}" \
+                    --epub-title-page=false \
+                    {" ".join(chapters)} '''
+        logger.debug('Calling "%s"', command)
+        subprocess.run(shlex.split(command), cwd=dir_output, check=True)
+        return os.path.abspath(os.path.join(dir_output,filename))
+
    def save_page(self, url):
+        logger.debug('Saving page at %s', url)
        # https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
        command = f'''wget \
                    --timestamping \
@@ -45,48 +128,10 @@ class GBConvert():
                    --tries=5 \
                    --quiet \
                    {url}'''
-        os.system(command)
+        subprocess.run(shlex.split(command), cwd=self.dir_download, check=True)

-    def clean_page(self,file_path):
-        f = open(file_path, 'r').read()
-        soup = BeautifulSoup(f, 'html.parser')
-        for blocker in self.blocklist:
-            for item in soup.select(blocker):
-                item.decompose()
-        open(file_path, 'w').write(str(soup))
-
-
-    def create_epub(self,  filename='out.epub'):
-        os.chdir(self.output)
-        command = f'''pandoc -f html -t epub \
-                    -o "{filename}" \
-                    --reference-location=section \
-                    --css="{self.style_path_drama}" \
-                    --metadata title="{self.title}" \
-                    --metadata author="{self.author}" \
-                    --epub-title-page=false \
-                    {" ".join(self.chapters)} '''#TODO --epub-cover-image
-        os.system(command)
-
-    def run(self):
-        #TODO include images flag
-
-        self.get_meta()
-
-        map(lambda x: self.save_page(os.path.join(self.root, x['href'])), self.toc)
-        self.chapters = []
-        for item in (tqdm(self.toc) if self.standalone else self.toc):
-            item_title= item.get_text()
-            item_url = os.path.join(self.root, item['href'])
-            self.save_page(url=item_url)
-            parsed_url = urlparse(item_url)
-            filepath = parsed_url.netloc + parsed_url.path
-            self.clean_page(filepath)
-            self.chapters.append(item['href'])
-        
-        self.create_epub(f'{self.title} - {self.author}.epub')
-        
-def get_all_books() -> list:
+# get a list of all books for interactive selection or scraping
+def get_all_books() -> List[Book]:
    response = requests.get(allbooks_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
@@ -111,35 +156,52 @@ def get_all_books() -> list:
                book_href = book_tag.get('href')
                book_url = urljoin(allbooks_url, book_href)
                book_title = ' '.join(book_tag.getText().split())
-                book = {'author': book_author, 'title': book_title, 'url': book_url}
+                book = Book(book_author, book_title, book_url)
                books.append(book)
    return books

-def get_all_book_tags ()-> ResultSet:
-    response = requests.get(allbooks_url)
-    response.raise_for_status()
-    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
-    books = soup.find('dl').find_all('a')
-    return books
-    
-def main():
-    sys.argv.pop(0)
+def slugify(value, replacement='_'):
+    value = re.sub(r'[<>:"/\\|?*\x00-\x1F]', replacement, value)
+    # Remove leading/trailing whitespace or dots
+    value = value.strip().strip(".")
+    # Always truncate to a safe length (e.g. 255 chars for most filesystems)
+    return value[:255] or "untitled"
+
+# run main cli
+@click.command()
+#TODO include images flag
+@click.option('--debug', '-d', is_flag=True, help='Set the log level to DEBUG')
+@click.option('--silent', '-s', is_flag=True, help='Disable the progress bar')
+@click.option('--path','-p',type=str,default='./', help='The path to which files are saved' )
+@click.option('--no-clean',is_flag=True,help='Do not parse html files with blocklist')
+@click.argument('args', nargs=-1)
+def main(args, debug, silent, path, no_clean):
+    '''
+    Download ePUBs from https://www.projekt-gutenberg.org/ \n
+    Provide either 0 arguments to enter interactive mode or an arbitrary number of URLs to download from
+    '''
+    logging.basicConfig(level=logging.ERROR,format='%(asctime)s - %(levelname)s - %(message)s')
+    if(debug): logger.setLevel(logging.DEBUG)
    # non-interactive mode
-    if len(sys.argv) > 0 :
-        books = sys.argv
+    if len(args) > 0 :
+        books = args
    # interactive mode using fzf
    else:
+        logger.debug('Received no CLI arguments, starting interactive mode')
        delimiter = ';'
        # create lines for fzf
-        books = [f"{item['author']} - {item['title']} {delimiter} {item['url']}" for item in get_all_books()]
+        books = [f"{ item.author } - { item.title } {delimiter} { item.url }" for item in get_all_books()]
        fzf = FzfPrompt()
        selection = fzf.prompt(choices=books,  fzf_options=r'--exact --with-nth 1 -m -d\;')
        books = [item.split(';')[1].strip() for item in selection]

+    logger.debug('Attempting to download from %d URL(s)', len(books))
+    converter = GBConvert(path)
    if len(books)==1:
-        GBConvert(books[0], standalone=True).run()
+        converter.download(books[0], showprogress=not silent, cleanpages= not no_clean)
    else:
-        for book in tqdm(books):
-                GBConvert(book).run()
+        for book in (tqdm(books) if not silent else books):
+            converter.download(book, cleanpages= not no_clean)
+
 if __name__ == "__main__":
    main()
--- a/src/epub2go/crawl.py
+++ b/src/epub2go/crawl.py
@@ -1,20 +1,20 @@
 import requests
 from tqdm import tqdm
+from bs4 import BeautifulSoup
+from bs4 import ResultSet

 import os
 from urllib.parse import urljoin

-from convert import GBConvert, get_all_book_tags, allbooks_url
+from convert import GBConvert, allbooks_url, get_all_books, Book

 def main():
-    books = get_all_book_tags()
+    books = get_all_books()
    # NOTE consider making this a map()
+    converter = GBConvert('./')
    for book in tqdm(books):
-        book_title = book.get_text()
-        book_url_relative = book.get('href')
-        if book_url_relative is not None:
-            book_url = urljoin(allbooks_url, book_url_relative)
-            GBConvert(book_url).run()
+        if book.url is not None:
+            converter.download(book.url)


 if __name__ == "__main__":
--- a/src/epub2go/web.py
+++ b/src/epub2go/web.py
@@ -1,38 +0,0 @@
-# run using `django-admin runserver --pythonpath=. --settings=web`
-from django.urls import path
-from django.http import HttpResponse, HttpRequest
-from django.shortcuts import redirect, render 
-import requests
-
-from convert import GBConvert, allbooks_url
-import json
-DEBUG = True
-ROOT_URLCONF = __name__
-SECRET_KEY='1'
-TEMPLATES = [
-        {
-            'BACKEND': 'django.template.backends.django.DjangoTemplates',
-            'DIRS': [
-                'templates/'
-            ],
-        },
-    ]
-
-def root(request: HttpRequest):
-    title = 'epub2go'
-    targetParam = request.GET.get('t', None)
-    if targetParam is not None:
-        getEpub(targetParam)
-    return render(request, 'index.html', locals())
-
-urlpatterns = [
-    path('', root, name='root'),
-]
-
-def getEpub(param):
-    # TODO validate / sanitize input
-    # TODO check for existing file and age
-    # TODO download
-    # TODO redirect to loading page
-    # TODO redirect to download page
-    raise NotImplementedError
--- a/test/init.py
+++ b/test/init.py
--- a/test/test_epub.py
+++ b/test/test_epub.py
@@ -0,0 +1,23 @@
+from tqdm import tqdm
+
+from src.epub2go.convert import GBConvert, get_all_books
+
+import unittest
+
+# run using `python -m unittest test/test_epub.py`
+class TestEpub(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.schiller_raeuber = GBConvert('https://www.projekt-gutenberg.org/schiller/raeuber/')
+        cls.anzengru_allersee = GBConvert('https://www.projekt-gutenberg.org/anzengru/allersee/')
+
+    def test_schiller_raeuber_toc(self):
+        self.assertEqual(len(self.schiller_raeuber.toc), 7)
+
+    def test_anzengru_allersee_toc(self):
+        self.assertEqual(len(self.anzengru_allersee.toc), 1)
+        
+
+if __name__ == '__main__':
+    unittest.main()
--- a/uv.lock
+++ b/uv.lock
@@ -1,4 +1,5 @@
 version = 1
+revision = 1
 requires-python = ">=3.12"

 [[package]]
@@ -57,6 +58,18 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/0e/f6/65ecc6878a89bb1c23a086ea335ad4bf21a588990c3f535a227b9eea9108/charset_normalizer-3.4.1-py3-none-any.whl", hash = "sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85", size = 49767 },
 ]

+[[package]]
+name = "click"
+version = "8.1.8"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188 },
+]
+
 [[package]]
 name = "colorama"
 version = "0.4.6"
@@ -68,10 +81,11 @@ wheels = [

 [[package]]
 name = "epub2go"
-version = "1.0"
+version = "2.2"
 source = { editable = "." }
 dependencies = [
    { name = "beautifulsoup4" },
+    { name = "click" },
    { name = "pyfzf" },
    { name = "requests" },
    { name = "tqdm" },
@@ -81,6 +95,7 @@ dependencies = [
 [package.metadata]
 requires-dist = [
    { name = "beautifulsoup4", specifier = "==4.12.3" },
+    { name = "click", specifier = ">=8.1.8" },
    { name = "pyfzf", specifier = ">=0.3.1" },
    { name = "requests", specifier = "==2.32.3" },
    { name = "tqdm", specifier = ">=4.67.1" },
Author	SHA1	Message	Date
eneller	75974ae119	fix: slugify filenames	2025-04-06 10:29:19 +02:00
eneller	b3cd49326f	feat: prettier logging	2025-04-05 01:42:20 +02:00
eneller	401d02e0ca	fix: parameter getdir	2025-04-02 11:26:24 +02:00
eneller	660af7fab0	feat: allow getting directory without download	2025-03-23 23:55:05 +01:00
eneller	c49a1be369	docs: readme	2025-03-20 22:11:12 +01:00
eneller	4267700763	feat: return epub path errors from wget and pandoc are thrown up	2025-03-16 20:30:42 +01:00
eneller	5d063d8597	feat: restructure for memory efficiency	2025-03-16 19:06:33 +01:00
eneller	6754f47e9f	fix: restructure test	2025-03-16 18:57:40 +01:00
eneller	4a8d4f945d	begin restructure	2025-03-16 18:34:12 +01:00
eneller	4903a58619	feat: cli using click	2025-03-16 17:46:53 +01:00
eneller	7dfab60f18	feat: allow setting of downloaddir	2025-03-15 17:02:28 +01:00
eneller	9736c6135f	chore: logging	2025-03-15 16:41:38 +01:00
eneller	d7ae0cc5a2	test: basic file count	2025-03-04 18:45:27 +01:00
eneller	c78aac28ab	chore: error handling	2025-03-04 18:28:17 +01:00
eneller	00f6cef743	chore: move web code to new repo	2025-03-04 11:02:56 +01:00
eneller	9ae25e40ad	refactor: better typing dataclass replacing dict	2025-03-03 23:11:07 +01:00
eneller	7be0fbc126	refactor: crawl unified from list	2025-03-03 22:56:58 +01:00
eneller	8f77a97733	refactor: move crawling code to correct file	2025-03-03 22:34:59 +01:00
eneller	967f97f381	refactor: command invocation now avoiding chdir using subprocess instead of os.system	2025-03-03 22:14:08 +01:00
eneller	4d8cd00298	refactor: split parsing and logic	2025-03-03 21:34:04 +01:00