fix: redownloading

`wget --timestamping` (alternatively `-N`) is now used to skip already existing files
feat: interactive cli
2025-02-25 13:24:51 +01:00 · 2025-02-25 12:22:12 +01:00 · 2025-02-25 03:40:12 +01:00 · 2025-02-24 23:35:33 +01:00
8 changed files with 114 additions and 36 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -418,3 +418,4 @@ wheels/
 *.css
 *.js
 *.txt
+*.json
--- a/README.md
+++ b/README.md
@@ -2,8 +2,9 @@
 web to epub converter for https://projekt-gutenberg.org.
 Requires:
 - [pandoc](https://pandoc.org/)
- wget
- python
+- [wget](https://www.gnu.org/software/wget/)
+- [fzf](https://github.com/junegunn/fzf) (only for interactive mode)
+- python >= 3.12
 ## Usage
 Invoke the script using the url of any page of the book you would like to download:
 ``` 
@@ -12,6 +13,6 @@ epub2go https://www.projekt-gutenberg.org/ibsen/solness/
 ## Installation
   Assuming you have a recent version of python installed, run
   ```
-   pip install git+https://github.com/eneller/epub2go.py
+   pip install git+https://github.com/eneller/epub2go.py@latest
   ```
   This will provide the 'epub2go' command.
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,6 +6,7 @@ readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
    "beautifulsoup4==4.12.3",
+    "pyfzf>=0.3.1", # hasn't been updated for some time
    "requests==2.32.3",
    "tqdm>=4.67.1",
    "urllib3==2.2.2",
--- a/src/epub2go/convert.py
+++ b/src/epub2go/convert.py
@@ -1,17 +1,24 @@
 import requests
 from bs4 import BeautifulSoup
+from bs4 import ResultSet
 from urllib.parse import urljoin
-from urllib.request import urlopen, urlparse
+from urllib.request import  urlparse
 from tqdm import tqdm
+from pyfzf.pyfzf import FzfPrompt

 import os, sys
 import importlib.resources as pkg_resources
-from pathlib import Path
+
+
+allbooks_url ='https://www.projekt-gutenberg.org/info/texte/allworka.html'
+root_url = '{url.scheme}://{url.netloc}'.format(url = urlparse(allbooks_url))
+
 class GBConvert():
    #TODO fix toc / headings
    
    def __init__(self,
        url:str,
+        standalone = False,
        ):
        # NOTE move non-code files to data folder
        self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
@@ -19,6 +26,7 @@ class GBConvert():
        self.root = os.path.dirname(url)
        self.url = urlparse(self.root)
        self.output = self.url.netloc + self.url.path
+        self.standalone = standalone
        
    def get_meta(self):
        response = requests.get(self.root)
@@ -31,9 +39,9 @@ class GBConvert():
    def save_page(self, url):
        # https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
        command = f'''wget \
+                    --timestamping \
                    --page-requisites \
                    --convert-links \
-                    --execute \
                    --tries=5 \
                    --quiet \
                    {url}'''
@@ -67,7 +75,7 @@ class GBConvert():

        map(lambda x: self.save_page(os.path.join(self.root, x['href'])), self.toc)
        self.chapters = []
-        for item in tqdm(self.toc):
+        for item in (tqdm(self.toc) if self.standalone else self.toc):
            item_title= item.get_text()
            item_url = os.path.join(self.root, item['href'])
            self.save_page(url=item_url)
@@ -78,12 +86,43 @@ class GBConvert():
        
        self.create_epub(f'{self.title} - {self.author}.epub')
        
+def get_all_books() -> list:
+    """Return a list of {'title', 'url'} dicts, one per book tag that has an href."""
+    entries = []
+    for tag in get_all_book_tags():
+        href = tag.get('href')
+        if href is None:
+            continue  # tag without a link target: skipped
+        title = tag.getText().translate(str.maketrans('', '', '\n\t'))
+        entries.append({'title': title, 'url': urljoin(allbooks_url, href)})
+    return entries

+def get_all_book_tags ()-> ResultSet:
+    """Fetch allbooks_url and return every <a> tag inside the page's first <dl>."""
+    page = requests.get(allbooks_url)
+    page.raise_for_status()  # fail on an HTTP error status instead of parsing an error page
+    index = BeautifulSoup(page.content, 'html.parser', from_encoding='utf-8')
+    return index.find('dl').find_all('a')
    
 def main():
-    g = GBConvert(sys.argv[1])
-    g.run()
-
+    """Convert the books given as URL arguments, or picked interactively in fzf."""
+    sys.argv.pop(0)
+    # non-interactive mode
+    if len(sys.argv) > 0 :
+        books = sys.argv
+    # interactive mode using fzf
+    else:
+        delimiter = ';'
+        # create lines for fzf; TODO display author
+        books = [f"{item['title']} {delimiter} {item['url']}" for item in get_all_books()]
+        fzf = FzfPrompt()
+        selection = fzf.prompt(choices=books,  fzf_options=r'--exact --with-nth 1 -m -d\;')
+        books = [item.rsplit(delimiter, 1)[-1].strip() for item in selection]  # url is the last field

+    if len(books)==1:
+        GBConvert(books[0], standalone=True).run()  # single book: per-chapter progress bar
+    else:
+        for book in tqdm(books):  # several books: one progress bar over the books
+            GBConvert(book).run()
 if __name__ == "__main__":
    main()
--- a/src/epub2go/crawl.py
+++ b/src/epub2go/crawl.py
@@ -0,0 +1,22 @@
+import requests
+from tqdm import tqdm
+
+import os
+from urllib.parse import urljoin
+
+from convert import GBConvert
+import utils
+
+def main():
+    """Run GBConvert on every tag with an href returned by utils.get_all_book_tags()."""
+    from convert import allbooks_url  # base for urljoin; not imported at module level here
+    books = utils.get_all_book_tags()
+    # NOTE consider making this a map()
+    for book in tqdm(books):
+        book_url_relative = book.get('href')
+        if book_url_relative is not None:
+            GBConvert(urljoin(allbooks_url, book_url_relative)).run()
+
+
+if __name__ == "__main__":
+    main()
--- a/src/epub2go/test.py
+++ b/src/epub2go/test.py
@@ -1,25 +0,0 @@
-import requests
-from bs4 import BeautifulSoup
-
-import os
-
-from convert import GBConvert
-
-def main():
-    allbooks_relative_url ='/info/texte/allworka.html'
-    root_url = 'https://www.projekt-gutenberg.org'
-    allbooks_url = root_url + allbooks_relative_url
-    response = requests.get(allbooks_url)
-    if (response.status_code != 200): raise Exception(f'Couldnt fetch root page {self.root}')
-    soup = BeautifulSoup(response.content, 'html.parser')
-    books = soup.find('dl').find_all('a')
-    for book in books:
-        book_title = book.get_text()
-        book_url_relative = book.get('href')
-        if book_url_relative is not None:
-            book_url = root_url + os.path.dirname(book_url_relative)[5:]
-            gb = GBConvert(book_url)
-
-
-if __name__ == "__main__":
-    main()
--- a/src/epub2go/web.py
+++ b/src/epub2go/web.py
@@ -0,0 +1,28 @@
+# run using `django-admin runserver --pythonpath=. --settings=web`
+from django.urls import path
+from django.http import HttpResponse
+from django.shortcuts import redirect, render 
+import requests
+
+from convert import GBConvert, allbooks_url
+import json
+DEBUG = True
+ROOT_URLCONF = __name__
+SECRET_KEY='1'
+TEMPLATES = [
+        {
+            'BACKEND': 'django.template.backends.django.DjangoTemplates',
+            'DIRS': [
+                'templates/'
+            ],
+        },
+    ]
+
+def home(request):
+    """Render index.html with the request, the page title and the parsed dict.json."""
+    with open('dict.json', 'r') as fh:  # closed on exit; the bare open() leaked the handle
+        return render(request, 'index.html', {'request': request, 'title': 'epub2go', 'items': json.load(fh)})
+
+urlpatterns = [
+    path('', home, name='homepage'),
+]
--- a/uv.lock
+++ b/uv.lock
@@ -72,6 +72,7 @@ version = "1.0"
 source = { editable = "." }
 dependencies = [
    { name = "beautifulsoup4" },
+    { name = "pyfzf" },
    { name = "requests" },
    { name = "tqdm" },
    { name = "urllib3" },
@@ -80,6 +81,7 @@ dependencies = [
 [package.metadata]
 requires-dist = [
    { name = "beautifulsoup4", specifier = "==4.12.3" },
+    { name = "pyfzf", specifier = ">=0.3.1" },
    { name = "requests", specifier = "==2.32.3" },
    { name = "tqdm", specifier = ">=4.67.1" },
    { name = "urllib3", specifier = "==2.2.2" },
@@ -94,6 +96,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 },
 ]

+[[package]]
+name = "pyfzf"
+version = "0.3.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d4/4c/c0c658a1e1e9f0e01932990d7947579515fe048d0a515f07458ecd992b8f/pyfzf-0.3.1.tar.gz", hash = "sha256:dd902e34cffeca9c3082f96131593dd20b4b3a9bba5b9dde1b0688e424b46bd2", size = 3652 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/99/35/6a6c7b95390ec58904646a04f54e1b56fd57d7a247588b791c6331697797/pyfzf-0.3.1-py3-none-any.whl", hash = "sha256:736f71563461b75f6f85b55345bdc638fa0dc14c32c857c59e8b1ca1cfa3cf4a", size = 4315 },
+]
+
 [[package]]
 name = "requests"
 version = "2.32.3"
Author	SHA1	Message	Date
eneller	4ffe110bc4	fix: redownloading `wget --timestamping` (alternatively `-N`) is now used to skip already existing files	2025-02-25 13:24:51 +01:00
eneller	8e0d92d796	feat: interactive cli using fzf wrapped by pyfzf	2025-02-25 12:22:12 +01:00
eneller	7f488c638c	begin django webserver	2025-02-25 03:40:12 +01:00
eneller	daddb58c3c	feat: better crawling	2025-02-24 23:35:33 +01:00