feat: cli using click

feat: allow setting of downloaddir
chore: logging
2025-03-16 17:46:53 +01:00 · 2025-03-15 17:02:28 +01:00 · 2025-03-15 16:41:38 +01:00 · 2025-03-04 18:45:27 +01:00 · 2025-03-04 18:28:17 +01:00 · 2025-03-04 11:02:56 +01:00
10 changed files with 860 additions and 83 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -418,3 +418,4 @@ wheels/
 *.css
 *.js
 *.txt
 *.json
--- a/README.md
+++ b/README.md
@@ -2,8 +2,9 @@
 Web-to-EPUB converter for https://projekt-gutenberg.org.
 Requires:
 - [pandoc](https://pandoc.org/)
- wget
+- [wget](https://www.gnu.org/software/wget/)
- python
+- [fzf](https://github.com/junegunn/fzf) (only for interactive mode)
 - python (duh)
 ## Usage
 Invoke the script using the url of any page of the book you would like to download:
 ``` 
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,11 +1,13 @@
 [project]
 name = "epub2go"
-version = "1.0"
+version = "1.3"
 description = "EPUB converter using wget, pandoc and python glue"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
    "beautifulsoup4==4.12.3",
    "click>=8.1.8",
    "pyfzf>=0.3.1", # hasnt been updated for some time
    "requests==2.32.3",
    "tqdm>=4.67.1",
    "urllib3==2.2.2",
--- a/src/epub2go/convert.py
+++ b/src/epub2go/convert.py
@@ -1,55 +1,88 @@
 import requests
 from bs4 import BeautifulSoup
 from bs4 import ResultSet
 from urllib.parse import urljoin
-from urllib.request import urlopen, urlparse
+from urllib.request import  urlparse
 from tqdm import tqdm
 from pyfzf.pyfzf import FzfPrompt
 import click
-import os, sys
+import os, subprocess, shlex, logging
 import importlib.resources as pkg_resources
-from pathlib import Path
+from dataclasses import dataclass
 from typing import List
 logger = logging.getLogger(__name__)
 allbooks_url ='https://www.projekt-gutenberg.org/info/texte/allworka.html'
@dataclass
class Book:
    """One catalogue entry: who wrote it, what it is called, where it lives."""

    # author name as a plain string
    author: str
    # title of the work
    title: str
    # URL of the book
    url: str
 class GBConvert():
    #TODO fix toc / headings
    def __init__(self,
        url:str,
        author:str = None,
        title:str = None,
        downloaddir = './',
        showprogress:bool = False,
        ):
        # NOTE move non-code files to data folder
        self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
-        self.blocklist = open(pkg_resources.files('epub2go').joinpath('blocklist.txt')).read().splitlines()
+        with open(pkg_resources.files('epub2go').joinpath('blocklist.txt')) as blocklist:
-        self.root = os.path.dirname(url)
+            self.blocklist = blocklist.read().splitlines()
-        self.url = urlparse(self.root)
+        self.tocpage = os.path.dirname(url) # ToC website url
-        self.output = self.url.netloc + self.url.path
+        url = urlparse(self.tocpage)
        self.dir_download = downloaddir
        self.dir_output = os.path.join(self.dir_download, url.netloc + url.path )# directories created by wget recreating the URL
        logger.debug('Downloading in %s, expecting files in in %s', self.dir_download, self.dir_output)
        self.showprogress = showprogress
        self.author = author
        self.title = title
        self.chapters = []
        self.parse_meta()
-    def get_meta(self):
+    def parse_meta(self):
-        response = requests.get(self.root)
+        response = requests.get(self.tocpage)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
-        self.author = soup.find('meta', {'name': 'author'})['content']
+        # TODO allow setting these from interactive mode where those parameters are figured out from the list
-        self.title = soup.find('meta', {'name': 'title'})['content']
+        if not self.author:
            try:
                self.author = soup.find('meta', {'name': 'author'})['content']
            except:
                self.author = "UnknownAuthor"
        if not self.title:
            try:
                self.title = soup.find('meta', {'name': 'title'})['content']
            except:
                self.title = "UnknownTitle"
        self.toc = soup.find('ul').find_all('a')
-    
+        logger.debug('Found ToC with %d entries', len(self.toc))
-    def save_page(self, url):
+        
-        # https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
+    def parse_toc_entry(self, entry):
-        command = f'''wget \
+        url = os.path.join(self.tocpage, entry['href'])
-                    --page-requisites \
+        self.save_page(url)
-                    --convert-links \
+        return url
                    --execute \
                    --tries=5 \
                    --quiet \
                    {url}'''
        os.system(command)
-    def clean_page(self,file_path):
+    # apply blocklist to file
-        f = open(file_path, 'r').read()
+    def parse_page(self,file_path):
-        soup = BeautifulSoup(f, 'html.parser')
+        #TODO clean up file opening, mmap?
-        for blocker in self.blocklist:
+        logger.debug('Parsing page at %s', file_path)
-            for item in soup.select(blocker):
+        with open(file_path, 'r+') as f:
-                item.decompose()
+            soup = BeautifulSoup(f.read(), 'html.parser')
-        open(file_path, 'w').write(str(soup))
+            for blocker in self.blocklist:
                for item in soup.select(blocker):
                    item.decompose()
            f.write(str(soup))
-
+    def create_epub(self,  filename='out.epub')-> int:
-    def create_epub(self,  filename='out.epub'):
+        #TODO --epub-cover-image
-        os.chdir(self.output)
+        #TODO toc if it isnt described by <h> tags, e.g. https://www.projekt-gutenberg.org/adlersfe/maskenba/
        logger.debug('Creating epub as "%s"',filename)
        command = f'''pandoc -f html -t epub \
                    -o "{filename}" \
                    --reference-location=section \
@@ -57,33 +90,93 @@ class GBConvert():
                    --metadata title="{self.title}" \
                    --metadata author="{self.author}" \
                    --epub-title-page=false \
-                    {" ".join(self.chapters)} '''#TODO --epub-cover-image
+                    {" ".join(self.chapters)} '''
-        os.system(command)
+        return subprocess.run(shlex.split(command), cwd=self.dir_output).returncode
    def save_page(self, url):
        logger.debug('Saving page at %s', url)
        # https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
        command = f'''wget \
                    --timestamping \
                    --page-requisites \
                    --convert-links \
                    --tries=5 \
                    --quiet \
                    {url}'''
        return subprocess.run(shlex.split(command), cwd=self.dir_download).returncode
    def run(self):
        #TODO include images flag
-        self.get_meta()
+        # download all files in toc (chapters)
-
+        for item in (tqdm(self.toc) if self.showprogress else self.toc):
-        map(lambda x: self.save_page(os.path.join(self.root, x['href'])), self.toc)
+            item_url = self.parse_toc_entry(item)
        self.chapters = []
        for item in tqdm(self.toc):
            item_title= item.get_text()
            item_url = os.path.join(self.root, item['href'])
            self.save_page(url=item_url)
            parsed_url = urlparse(item_url)
-            filepath = parsed_url.netloc + parsed_url.path
+            filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
-            self.clean_page(filepath)
+            self.parse_page(filepath)
-            self.chapters.append(item['href'])
+            self.chapters.append(os.path.basename(item_url))
-        self.create_epub(f'{self.title} - {self.author}.epub')
+        return self.create_epub(f'{self.title} - {self.author}.epub')
 def main():
    g = GBConvert(sys.argv[1])
    g.run()
 # get a list of all books for interactive selection or scraping
 def get_all_books() -> List[Book]:
    """Scrape the works index at ``allbooks_url`` and return every listed book.

    The first ``<dl>`` of the page is walked in document order: each ``<dt>``
    updates the current author, and each ``<dd>`` containing a link yields a
    ``Book`` for that author.

    Returns:
        A list of ``Book`` entries with whitespace-normalised author and
        title and an absolute URL.

    Raises:
        requests.HTTPError: if the index page answers with an error status.
        AttributeError: if the page contains no ``<dl>`` element.
    """
    response = requests.get(allbooks_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    # Fallback for a <dd> that appears before any <dt> has named an author;
    # without it the first Book(...) call would hit an unbound local.
    book_author = 'UnknownAuthor'
    books = []
    # Only <dt>/<dd> are relevant; all other descendants are ignored.
    for tag in soup.find('dl').find_all(['dt', 'dd']):
        # is description tag, i.e. contains author name
        if tag.name == 'dt':
            # special case when author name and alphabetical list are in the
            # same tag: the name is the node right after the <br>
            br_tag = tag.find('br')
            if br_tag is not None and br_tag.next_sibling is not None:
                raw_author = str(br_tag.next_sibling)
            # default case, dt only contains the author name (also used when
            # the <br> is the last child, to avoid the literal string "None")
            else:
                raw_author = tag.get_text(strip=True)
            # collapse line breaks and runs of spaces
            book_author = ' '.join(raw_author.split())
        # is details tag, contains book url
        else:
            book_tag = tag.a
            if book_tag:
                book_url = urljoin(allbooks_url, book_tag.get('href'))
                book_title = ' '.join(book_tag.get_text().split())
                books.append(Book(book_author, book_title, book_url))
    return books
 # run main cli
@click.command()
@click.option('--debug', '-d', is_flag=True, help='Set the log level to DEBUG')
@click.option('--silent', '-s', is_flag=True, help='Disable the progress bar')
@click.argument('args', nargs=-1)
def main(args, debug, silent):
    '''
    Download ePUBs from https://www.projekt-gutenberg.org/
    Provide either 0 arguments to enter interactive mode or an arbitrary number of URLs to download from
    '''
    # args:   tuple of book URLs given on the command line (may be empty)
    # debug:  raise this module's logger to DEBUG
    # silent: suppress the tqdm progress bar
    logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')
    if debug:
        logger.setLevel(logging.DEBUG)
    # non-interactive mode
    if args:
        books = args
    # interactive mode using fzf
    else:
        logger.debug('Received no CLI arguments, starting interactive mode')
        delimiter = ';'
        # create lines for fzf
        choices = [f"{ item.author } - { item.title } {delimiter} { item.url }" for item in get_all_books()]
        fzf = FzfPrompt()
        selection = fzf.prompt(choices=choices, fzf_options=r'--exact --with-nth 1 -m -d\;')
        # The URL is always the LAST field. Splitting from the right keeps the
        # lookup correct when an author or title itself contains the delimiter.
        books = [item.rsplit(delimiter, 1)[-1].strip() for item in selection]
    logger.debug('Attempting to download from %d URL(s)', len(books))
    if len(books) == 1:
        # single book: show per-chapter progress inside the converter
        GBConvert(books[0], showprogress=not silent).run()
    else:
        # several books: one outer progress bar over the books instead
        for book in (tqdm(books) if not silent else books):
            GBConvert(book).run()
 if __name__ == "__main__":
    main()
--- a/src/epub2go/crawl.py
+++ b/src/epub2go/crawl.py
@@ -0,0 +1,20 @@
 import requests
 from tqdm import tqdm
 from bs4 import BeautifulSoup
 from bs4 import ResultSet
 import os
 from urllib.parse import urljoin
 from convert import GBConvert, allbooks_url, get_all_books, Book
 def main():
    """Convert every book returned by get_all_books(), showing a progress bar."""
    # NOTE consider making this a map()
    for entry in tqdm(get_all_books()):
        # entries without a URL cannot be converted
        if entry.url is None:
            continue
        GBConvert(entry.url).run()
 if __name__ == "__main__":
    main()
--- a/src/epub2go/prosa.css
+++ b/src/epub2go/prosa.css
@@ -0,0 +1,636 @@
 /* Modifizierte Prosa Styles von www.projekt-gutenberg.org - Stand: Januar 2020 */
@page {
  margin: 5pt;
 }
 body {
  font-family: serif;
  /*font-family: arial;margin-right: 10%;margin-left: 10%;margin-top: 0%;margin-bottom: 3%;*/
 }
 /* Inhaltsverzeichnis */
 .toc {
  display: none;
 }
 /********************* Links *********************/
 a:link {
  color: #039;
  text-decoration: none;
 }
 /* Titelseite */
 .title {
  /*to be set */
 }
 .subtitle {
  color: darkgray;
 }
 .author {
  color: gray;
 }
 .authorlist {
  text-align: left;
 }
 .box {
  margin-top: 1.5em;
  margin-left: 15%;
  margin-right: 15%;
  margin-bottom: 1.5em;
  padding-top: 1em;
  padding-left: 1em;
  padding-right: 1em;
  padding-bottom: 1em;
  border-top: 1px #666 solid;
  border-right: 1px #666 solid;
  border-bottom: 1px #666 solid;
  border-left: 1px #666 solid;
 }
 .dedication {
  text-indent: 0;
  text-align: center;
  font-size: large;
  margin-top: 2em;
  margin-bottom: 2em;
  margin-left: 20%;
  margin-right: 20%;
 }
 /* Abbildungen */
 img {
  max-width: 100%;
 }
 img.initial {
  float: left;
  margin-top: 0;
  margin-bottom: 0;
  margin-right: 0.3em;
 }
 img.left {
  float: left;
  margin-top: 0.5em;
  margin-bottom: 0.5em;
  margin-right: 0.5em;
 }
 img.right {
  float: right;
  margin-top: 0.5em;
  margin-bottom: 0.5em;
  margin-left: 0.5em;
 }
 img.deko {
  margin-bottom: 20px;
  margin-top: 10px;
  border: 1px solid #606060;
  text-align: center;
 }
 .figcaption {
  text-indent: 0;
  text-align: center;
  font-style: italic;
 }
 .figure {
  text-indent: 0;
  text-align: center;
  margin-top: 1em;
  margin-bottom: 1em;
 }
 /* Textformatierungen */
 .fraktur {
  font-family: "Frankenstein", Times, serif;
 }
 .smallcaps {
  font-variant: small-caps;
 }
 .lektorat {
  color: darkgrey;
  font-size: small;
 }
 .motto {
  text-indent: 0;
  margin-left: 50%;
  margin-top: 1em;
  margin-bottom: 1em;
 }
 .note {
  line-height: 90%;
  font-size: 90%;
 }
 .recipient {
  margin-left: -1em;
  margin-top: 1em;
  margin-bottom: 1em;
 }
 /* Regie-Anweisung im Schauspiel */
 .regie, .action {
  font-size: 90%;
  font-style: italic;
 }
 /*.sender {
  margin-left: 2em;
  font-style: italic;
  font-weight: bold;
  color: darkblue;
  margin-left: 2em;
 }*/
 .signatur, .signature {
  text-align: right;
  margin-right: 2em;
 }
 /* Sprecher im Schauspiel. geändert. Re. */
 .speaker {
  color: #333;
  font-weight: bold;
 }
 /* Sperrsatz (Duden: Satzzeichen außer Punkt und Anführungszeichen werden mit gesperrt, Zahlen werden's nicht), wird von einigen Readern nicht unterstützt */
 .wide, .spaced {
  letter-spacing: 0.15em;
 }
 /******************** Überschriften ********************/
 h1, h2, h3, h4, h6 {
  text-align: center;
 }
 h5 {
  text-align: center;
  font-size: 90%;
  color: #808080;
  font-weight: normal;
 }
 /******************** Fließtext ********************/
 p {
  margin-top: 0.4em;
  margin-bottom: 0.4em;
  text-indent: 0.8em;
  text-align: justify;
  widows: 2;
  orphans: 2;
 }
 p.abstract {
  font-size: 90%;
  font-style: italic;
  margin-left: 3em;
  margin-right: 3em;
  text-indent: 0;
 }
 p.center {
  text-indent: 0;
  text-align: center;
 }
 p.centerbig {
  margin-bottom: 0.6em;
  margin-top: 0.6em;
  text-indent: 0;
  text-align: center;
  font-size: 115%;
 }
 p.centersml {
  text-indent: 0;
  text-align: center;
  font-size: 90%;
  margin-bottom: 0.3em;
  margin-top: 0.3em;
 }
 p.dblmarg {
  text-indent: 0;
  margin-left: 10%;
  margin-right: 10%;
  text-align: justify;
 }
 p.drama {
  margin-left: 2em;
  text-indent: -2em;
  margin-top: 0.5em;
  margin-bottom: 0.5em;
 }
 p.epigraph {
  text-indent: 0;
  text-align: right;
  margin-right: 5%;
  font-style: italic;
 }
 p.left {
  text-indent: 0;
  text-align: left;
  text-align: justify;
 }
 p.initial {
  text-indent: 0;
 }
 p.leftjust {
  text-indent: 0;
  text-align: justify;
 }
 /*p.leftmarg {
  text-indent: 0;
  text-align: left;
  margin-left: 2em;
  text-align: justify;
 }*/
 /********************* Linien im Text *********************/
 hr {
  text-align: center;
  color: #999;
  margin-top: 0.5em;
  margin-bottom: 0.5em;
  /*border-top: 1px solid;border-right: 1px solid;border-bottom: 1px solid;border-left: 1px solid;*/
  border: 1px solid;
 }
 hr.short {
  color: #666;
  margin-top: 2em;
  margin-bottom: 2em;
  width: 20%;
  height: 1px;
  margin-left: 40%;
 }
 hr.star {
  margin-top: 1em;
  margin-bottom: 1em;
  width: 20%;
  margin-left: 40%;
 }
 /********************* Absatzübergreifende Formatierung ********************/
 div.epigraph {
  margin-left: 50%;
  margin-right: 5%;
  font-style: italic;
 }
 div.impressum {
  display: none;
 }
 div.motto p {
  text-align: right;
  text-indent: 0;
 }
 div.navi {
  text-align: center;
 }
 div.titlepage {
  text-align: center;
 }
 /********************* Gedichte *********************/
 div.poem {
  margin-left: 20%;
  margin-right: 20%;
  margin-bottom: 2em;
 }
 div.poem blockquote {
  margin-left: 3em;
  margin-right: 3em;
 }
 div.vers {
  text-indent: 0;
  text-align: left;
  margin-left: 2em;
  margin-top: 1em;
  margin-bottom: 1em;
 }
 div.vers p {
  text-indent: 0;
  margin-top: 0;
  margin-bottom: 0;
 }
 p.line {
  text-align: left;
  text-indent: 0;
  margin-top: 0;
  margin-bottom: 0;
 }
 p.poem, p.vers {
  text-align: left;
  text-indent: 0;
  margin-top: 1em;
  margin-left: 2em;
  margin-right: 2em;
  margin-bottom: 1em;
 }
 /********************* Briefe *********************/
 div.letter {
  text-align: left;
  margin-left: 1.5em;
  margin-top: 1em;
  margin-bottom: 1em;
 }
 p.address {
  text-align: right;
  text-indent: 0;
  font-style: italic;
 }
 p.date {
  text-align: right;
  font-style: italic;
 }
 /********************* Tabellen  *********************/
 tbody {
  /*font-family: arial;*/
 }
 td {
  /*font-family: arial;*/
 }
 /* Wird für mehrspaltige 0hmldir.xml gebraucht */
 table.dirtoc {
  margin-top: 0.3em;
  margin-bottom: 0.3em;
  text-align: left;
 }
 /* 0.4em Horizontal-Abstand zu Trennlinien, Folgezeilen um 1em eingerückt */
 table.dirtoc td {
  padding-top: 0;
  padding-bottom: 0;
  padding-left: 1.4em;
  padding-right: 0.4em;
  text-indent: -1em;
 }
 /* Notwendig, wenn jemand heimlichtückisch <div align="center"> davorsetzt: */
 table.left {
  margin-left: 0;
  text-align: left;
 }
 table.motto {
  margin-left: 30%;
  margin-right: 0;
 }
 table.right {
  margin-right: 0;
 }
 table.toc {
  margin-top: 0.3em;
 }
 table.toc td {
  padding-top: 0;
  padding-left: 0.25em;
  padding-right: 0.25em;
  padding-bottom: 0;
  text-align: left;
 }
 table.true, table.real {
  margin-top: 0.3em;
  margin-bottom: 0.3em;
  text-align: left;
 }
 /* Definitionsliste */
 dd {
  margin-left: 2em;
 }
 dl {
  margin-left: 1.5em;
  margin-top: 1em;
  margin-bottom: 1em;
 }
 dt {
  font-weight: bold;
  margin-top: 4pt;
 }
 /* Ungeordnete Liste */
 ul {
  margin-top: 1em;
  margin-bottom: 1em;
 }
 /* Löschung und Einfügung */
 del {
  color: red;
 }
 ins {
  color: blue;
 }
 /* Zeile mit 3 Sternen: <p class="stars"><sup>*</sup> <sub>*</sub> <sup>*</sup></p> */
 p.stars {
  text-indent: 0;
  text-align: center;
  font-size: 200%;
  letter-spacing: 0.3em;
  margin-top: 0.5em;
  margin-bottom: 0;
 }
 /* Hochstellung ohne Vergößerung des Zeilenabstandes */
 sup {
  font-size: 70%;
  vertical-align: text-top;
 }
 sub {
  font-size: 70%;
  vertical-align: text-bottom;
 }
 /* Formatierung von Brüchen */
 sup.fract {
  font-size: 70%;
  vertical-align: text-top;
 }
 sub.fract {
  font-size: 70%;
  vertical-align: text-bottom;
 }
 .mainnav {
  /*font-family: Arial;*/
  background-color: #fff;
  text-align: center;
  border-top: 1px #d26402 solid;
  border-bottom: 1px #d26402 solid;
 }
 .autalpha {
  /*font-family: Arial;*/
  text-align: center;
 }
 .trenner {
  font-size: 10pt;
  font-weight: bold;
  color: #d26402;
 }
 .right {
  text-align: right;
 }
 .left {
  text-align: left;
 }
 /* Zu überprüfende Klassen: sind sie korrekt oder machen sie Sinn für ein EBook? */
 .hidden, .hide {
  display: none;
 }
 upper {
  /* .upper ? */
  text-transform: uppercase;
 }
 p.initial:first-letter {
  /* funktioniert nicht bei allen Readern */
  font-size: 150%;
 }
 .online {
  display: none;
 }
 /* Seitennummern */
 .pageref {
  /* noch nicht definiert */
 }
 a.pageref {
  display: none;
 }
 a.pageref:before {
  content: "[";
 }
 a.pageref:after {
  content: "]";
 }
 tt {
  font-family: Courier;
 }
 /* besser 
 span.truetype {
  font-family: monospace;
 }*/
 /********************* Anmerkungen und Fußnoten. geändert. Re. *********************/
 a:visited {
  color: #039;
  text-decoration: none;
 }
 a:hover {
  color: #039;
  text-decoration: none;
  background-color: #e0e0e0;
 }
 a:active {
  color: #039;
  text-decoration: none;
 }
 span.tooltip {
  color: #800000;
 }
 span.footnote a:hover {
  background-color: #2B2E21;
  color: #fff;
 }
 span.footnote a:link span, span.footnote a:visited span {
  display: none;
 }
 span.footnote a:hover span.fntext {
  position: absolute;
  margin: 20px;
  background-color: beige;
  max-width: 400px;
  padding: 5px 10px 5px 10px;
  border: 1px solid #C0C0C0;
  font: normal 12px/14px arial;
  color: #000;
  text-align: left;
  display: block;
  text-decoration: none;
  left: 10px;
 }
 span.footnote:before {
  content: " [Fußnote: ";
  color: #505050;
 }
 span.footnote:after {
  content: "] ";
  color: #505050;
 }
 span.footnote {
  color: #505050;
  display: inline;
  font-size: 90%;
 }
 span.teletype {
  font-family: monospace;
 }
 /* Alte Klassen */
 .overline {
  text-decoration: overline;
 }
 .upper {
  text-transform: uppercase;
 }
 p.end {
  text-indent: 0;
  text-align: center;
 }
 p.right {
  text-indent: 0;
  text-align: right;
 }
 div.footnote {
  display: inline;
 }
 table.poem, table.vers {
  margin-left: auto;
  margin-right: auto;
 }
 td.left {
  text-align: left;
 }
 td.right {
  text-align: right;
 }
 td.center {
  text-align: center;
 }
 /* mit dem lang-Attribut markierte Tags. geändert. Re. */
 *[lang=""] {
  color: grey;
 }
 *[lang="fr"] {
  color: red;
 }
 *[lang="la"] {
  color: blue;
 }
 *[lang="en"] {
  color: green;
 }
 *[lang="it"] {
  color: violet;
 }
 *[lang="el"] {
  color: brown;
 }
 /* ******************************************************************* */
 /*    Zusätzliche Definitionen ohne Layout für Text-Strukturierung     */
 /* ******************************************************************* */
 div.ballad {
  /* styles hier einfügen */
 }
 div.chapter {
  /* styles hier einfügen */
 }
 div.part {
  /* styles hier einfügen */
 }
 div.preface {
  /* styles hier einfügen */
 }
 div.section {
  /* styles hier einfügen */
 }
 div.volume {
  /* styles hier einfügen */
 }
 h3.date {
  /* styles hier einfügen */
 }
 h3.subtitle {
  /* styles hier einfügen */
 }
 h3.translator {
  /* styles hier einfügen */
 }
 h4.date {
  /* styles hier einfügen */
 }
 h4.pseudo {
  /* styles hier einfügen */
 }
 h4.publisher {
  /* styles hier einfügen */
 }
 h4.subtitle {
  /* styles hier einfügen */
 }
 h4.translator {
  /* styles hier einfügen */
 }
 h5.date {
  /* styles hier einfügen */
 }
 h5.translator {
  /* styles hier einfügen */
 }
 div.toc {
  display: none;
 }
 p.toc {
  display: none;
 }
--- a/src/epub2go/test.py
+++ b/src/epub2go/test.py
@@ -1,25 +0,0 @@
 import requests
 from bs4 import BeautifulSoup
 import os
 from convert import GBConvert
 def main():
    allbooks_relative_url ='/info/texte/allworka.html'
    root_url = 'https://www.projekt-gutenberg.org'
    allbooks_url = root_url + allbooks_relative_url
    response = requests.get(allbooks_url)
    if (response.status_code != 200): raise Exception(f'Couldnt fetch root page {self.root}')
    soup = BeautifulSoup(response.content, 'html.parser')
    books = soup.find('dl').find_all('a')
    for book in books:
        book_title = book.get_text()
        book_url_relative = book.get('href')
        if book_url_relative is not None:
            book_url = root_url + os.path.dirname(book_url_relative)[5:]
            gb = GBConvert(book_url)
 if __name__ == "__main__":
    main()
--- a/test/init.py
+++ b/test/init.py
--- a/test/test_epub.py
+++ b/test/test_epub.py
@@ -0,0 +1,23 @@
 from tqdm import tqdm
 from src.epub2go.convert import GBConvert, get_all_books
 import unittest
 # run using `python -m unittest test/test_epub.py`
 class TestEpub(unittest.TestCase):
    # Checks the number of table-of-contents entries GBConvert finds for two
    # known books.
    @classmethod
    def setUpClass(cls):
        # Both converters are built once for the whole class and shared by the
        # tests below.
        # NOTE(review): constructing GBConvert appears to fetch the ToC page
        # over the network, so these tests need connectivity — confirm.
        cls.schiller_raeuber = GBConvert('https://www.projekt-gutenberg.org/schiller/raeuber/')
        cls.anzengru_allersee = GBConvert('https://www.projekt-gutenberg.org/anzengru/allersee/')
    def test_schiller_raeuber_toc(self):
        # expects 7 ToC entries for this book
        self.assertEqual(len(self.schiller_raeuber.toc), 7)
    def test_anzengru_allersee_toc(self):
        # expects a single ToC entry for this book
        self.assertEqual(len(self.anzengru_allersee.toc), 1)
 if __name__ == '__main__':
    unittest.main()
--- a/uv.lock
+++ b/uv.lock
@@ -1,4 +1,5 @@
 version = 1
 revision = 1
 requires-python = ">=3.12"
 [[package]]
@@ -57,6 +58,18 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/0e/f6/65ecc6878a89bb1c23a086ea335ad4bf21a588990c3f535a227b9eea9108/charset_normalizer-3.4.1-py3-none-any.whl", hash = "sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85", size = 49767 },
 ]
 [[package]]
 name = "click"
 version = "8.1.8"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "colorama", marker = "sys_platform == 'win32'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 }
 wheels = [
    { url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188 },
 ]
 [[package]]
 name = "colorama"
 version = "0.4.6"
@@ -68,10 +81,12 @@ wheels = [
 [[package]]
 name = "epub2go"
-version = "1.0"
+version = "1.2"
 source = { editable = "." }
 dependencies = [
    { name = "beautifulsoup4" },
    { name = "click" },
    { name = "pyfzf" },
    { name = "requests" },
    { name = "tqdm" },
    { name = "urllib3" },
@@ -80,6 +95,8 @@ dependencies = [
 [package.metadata]
 requires-dist = [
    { name = "beautifulsoup4", specifier = "==4.12.3" },
    { name = "click", specifier = ">=8.1.8" },
    { name = "pyfzf", specifier = ">=0.3.1" },
    { name = "requests", specifier = "==2.32.3" },
    { name = "tqdm", specifier = ">=4.67.1" },
    { name = "urllib3", specifier = "==2.2.2" },
@@ -94,6 +111,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 },
 ]
 [[package]]
 name = "pyfzf"
 version = "0.3.1"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/d4/4c/c0c658a1e1e9f0e01932990d7947579515fe048d0a515f07458ecd992b8f/pyfzf-0.3.1.tar.gz", hash = "sha256:dd902e34cffeca9c3082f96131593dd20b4b3a9bba5b9dde1b0688e424b46bd2", size = 3652 }
 wheels = [
    { url = "https://files.pythonhosted.org/packages/99/35/6a6c7b95390ec58904646a04f54e1b56fd57d7a247588b791c6331697797/pyfzf-0.3.1-py3-none-any.whl", hash = "sha256:736f71563461b75f6f85b55345bdc638fa0dc14c32c857c59e8b1ca1cfa3cf4a", size = 4315 },
 ]
 [[package]]
 name = "requests"
 version = "2.32.3"
Author	SHA1	Message	Date
eneller	4903a58619	feat: cli using click	2025-03-16 17:46:53 +01:00
eneller	7dfab60f18	feat: allow setting of downloaddir	2025-03-15 17:02:28 +01:00
eneller	9736c6135f	chore: logging	2025-03-15 16:41:38 +01:00
eneller	d7ae0cc5a2	test: basic file count	2025-03-04 18:45:27 +01:00
eneller	c78aac28ab	chore: error handling	2025-03-04 18:28:17 +01:00
eneller	00f6cef743	chore: move web code to new repo	2025-03-04 11:02:56 +01:00
eneller	9ae25e40ad	refactor: better typing dataclass replacing dict	2025-03-03 23:11:07 +01:00
eneller	7be0fbc126	refactor: crawl unified from list	2025-03-03 22:56:58 +01:00
eneller	8f77a97733	refactor: move crawling code to correct file	2025-03-03 22:34:59 +01:00
eneller	967f97f381	refactor: command invocation now avoiding chdir using subprocess instead of os.system	2025-03-03 22:14:08 +01:00
eneller	4d8cd00298	refactor: split parsing and logic	2025-03-03 21:34:04 +01:00
eneller	bde605cc90	chore: version bump	2025-02-25 23:07:55 +01:00
eneller	f9942a75d3	feat: display authors	2025-02-25 20:09:31 +01:00
eneller	90bdf83950	chore: webserver stuff	2025-02-25 15:51:57 +01:00
eneller	55e1472e1d	fix: crawl	2025-02-25 14:09:28 +01:00
eneller	4ffe110bc4	fix: redownloading `wget --timestamping` (alternatively `-N`) is now used to skip already existing files	2025-02-25 13:24:51 +01:00
eneller	8e0d92d796	feat: interactive cli using fzf wrapped by pyfzf	2025-02-25 12:22:12 +01:00
eneller	7f488c638c	begin django webserver	2025-02-25 03:40:12 +01:00
eneller	daddb58c3c	feat: better crawling	2025-02-24 23:35:33 +01:00