3 Commits
v1.0 ... v1.1

Author SHA1 Message Date
eneller
8e0d92d796 feat: interactive cli
using fzf wrapped by pyfzf
2025-02-25 12:22:12 +01:00
eneller
7f488c638c begin django webserver 2025-02-25 03:40:12 +01:00
eneller
daddb58c3c feat: better crawling 2025-02-24 23:35:33 +01:00
8 changed files with 114 additions and 35 deletions

1
.gitignore vendored
View File

@@ -418,3 +418,4 @@ wheels/
*.css
*.js
*.txt
*.json

View File

@@ -2,8 +2,9 @@
web to epub converter for https://projekt-gutenberg.org.
Requires:
- [pandoc](https://pandoc.org/)
- wget
- python
- [wget](https://www.gnu.org/software/wget/)
- [fzf](https://github.com/junegunn/fzf) (only for interactive mode)
- python (duh)
## Usage
Invoke the script using the url of any page of the book you would like to download:
```
@@ -12,6 +13,6 @@ epub2go https://www.projekt-gutenberg.org/ibsen/solness/
## Installation
Assuming you have a recent version of python installed, run
```
pip install git+https://github.com/eneller/epub2go.py
pip install git+https://github.com/eneller/epub2go.py@latest
```
This will provide the 'epub2go' command.

View File

@@ -6,6 +6,7 @@ readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"beautifulsoup4==4.12.3",
"pyfzf>=0.3.1", # hasnt been updated for some time
"requests==2.32.3",
"tqdm>=4.67.1",
"urllib3==2.2.2",

View File

@@ -1,17 +1,24 @@
import requests
from bs4 import BeautifulSoup
from bs4 import ResultSet
from urllib.parse import urljoin
from urllib.request import urlopen, urlparse
from urllib.request import urlparse
from tqdm import tqdm
from pyfzf.pyfzf import FzfPrompt
import os, sys
import importlib.resources as pkg_resources
from pathlib import Path
allbooks_url ='https://www.projekt-gutenberg.org/info/texte/allworka.html'
root_url = '{url.scheme}://{url.netloc}'.format(url = urlparse(allbooks_url))
class GBConvert():
#TODO fix toc / headings
def __init__(self,
url:str,
standalone = False,
):
# NOTE move non-code files to data folder
self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
@@ -19,6 +26,7 @@ class GBConvert():
self.root = os.path.dirname(url)
self.url = urlparse(self.root)
self.output = self.url.netloc + self.url.path
self.standalone = standalone
def get_meta(self):
response = requests.get(self.root)
@@ -29,6 +37,7 @@ class GBConvert():
self.toc = soup.find('ul').find_all('a')
def save_page(self, url):
# TODO fix redownloading of shared content
# https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
command = f'''wget \
--page-requisites \
@@ -67,7 +76,7 @@ class GBConvert():
map(lambda x: self.save_page(os.path.join(self.root, x['href'])), self.toc)
self.chapters = []
for item in tqdm(self.toc):
for item in (tqdm(self.toc) if self.standalone else self.toc):
item_title= item.get_text()
item_url = os.path.join(self.root, item['href'])
self.save_page(url=item_url)
@@ -78,12 +87,43 @@ class GBConvert():
self.create_epub(f'{self.title} - {self.author}.epub')
def get_all_books() -> list:
    """Return the full catalogue as a list of {'title': ..., 'url': ...} dicts.

    Anchor tags without an href attribute are skipped; titles have
    newlines and tabs stripped out.
    """
    strip_ws = str.maketrans('', '', '\n\t')
    return [
        {
            'title': tag.getText().translate(strip_ws),
            'url': urljoin(allbooks_url, tag.get('href')),
        }
        for tag in get_all_book_tags()
        if tag.get('href') is not None
    ]
def get_all_book_tags() -> ResultSet:
    """Download the catalogue index page and return every <a> tag in its <dl> list.

    Raises requests.HTTPError for a non-2xx response.
    """
    page = requests.get(allbooks_url)
    page.raise_for_status()
    parsed = BeautifulSoup(page.content, 'html.parser', from_encoding='utf-8')
    return parsed.find('dl').find_all('a')
def main():
    """CLI entry point.

    With URL arguments: convert each given book (non-interactive mode).
    Without arguments: present the full catalogue in fzf and convert the
    selection (interactive mode).
    """
    # Bug fix: the block previously started with `g = GBConvert(sys.argv[1])`
    # followed by `g.run()` — leftover pre-refactor lines that raised
    # IndexError in interactive mode and double-converted in CLI mode.
    sys.argv.pop(0)  # drop the program name, leaving only user arguments
    # non-interactive mode
    if len(sys.argv) > 0:
        books = sys.argv
    # interactive mode using fzf
    else:
        delimiter = ';'
        # create lines for fzf
        # TODO display author
        books = [f"{item['title']} {delimiter} {item['url']}" for item in get_all_books()]
        fzf = FzfPrompt()
        selection = fzf.prompt(choices=books, fzf_options=r'--exact --with-nth 1 -m -d\;')
        # Bug fix: take the LAST field instead of index 1 so book titles that
        # themselves contain ';' do not shift the URL out of position.
        books = [item.split(delimiter)[-1].strip() for item in selection]
    if len(books) == 1:
        # single book: show the per-chapter progress bar
        GBConvert(books[0], standalone=True).run()
    else:
        # several books: one outer progress bar over the books instead
        for book in tqdm(books):
            GBConvert(book).run()
if __name__ == "__main__":
    main()

22
src/epub2go/crawl.py Normal file
View File

@@ -0,0 +1,22 @@
import requests
from tqdm import tqdm
import os
from urllib.parse import urljoin
from convert import GBConvert
import utils
def main():
    """Crawl the entire catalogue and convert every book to epub."""
    # Catalogue index page; also the base URL for resolving relative book links.
    # Bug fix: the original referenced `allbooks_url` without defining or
    # importing it, which raised NameError on first use.
    allbooks_url = 'https://www.projekt-gutenberg.org/info/texte/allworka.html'
    books = utils.get_all_book_tags()
    # NOTE consider making this a map()
    for book in tqdm(books):
        book_url_relative = book.get('href')
        if book_url_relative is not None:
            # Bug fix: the original passed undefined `book_href`; the variable
            # actually bound above is `book_url_relative`.
            book_url = urljoin(allbooks_url, book_url_relative)
            GBConvert(book_url).run()
if __name__ == "__main__":
    main()

View File

@@ -1,25 +0,0 @@
import requests
from bs4 import BeautifulSoup
import os
from convert import GBConvert
def main():
    """List every book on the catalogue index and build a GBConvert for each."""
    allbooks_relative_url = '/info/texte/allworka.html'
    root_url = 'https://www.projekt-gutenberg.org'
    allbooks_url = root_url + allbooks_relative_url
    response = requests.get(allbooks_url)
    # Bug fix: the error message referenced `self.root`, which does not exist
    # in a module-level function — the raise path itself raised NameError.
    if response.status_code != 200:
        raise Exception(f'Couldnt fetch root page {allbooks_url}')
    soup = BeautifulSoup(response.content, 'html.parser')
    books = soup.find('dl').find_all('a')
    for book in books:
        book_url_relative = book.get('href')
        if book_url_relative is not None:
            # [5:] trims a fixed-length prefix from the dirname;
            # NOTE(review): confirm this matches the site's URL layout
            book_url = root_url + os.path.dirname(book_url_relative)[5:]
            # NOTE(review): .run() is never invoked on gb — confirm intent
            gb = GBConvert(book_url)
if __name__ == "__main__":
    main()

28
src/epub2go/web.py Normal file
View File

@@ -0,0 +1,28 @@
# run using `django-admin runserver --pythonpath=. --settings=web`
from django.urls import path
from django.http import HttpResponse
from django.shortcuts import redirect, render
import requests
import utils
import json
DEBUG = True
ROOT_URLCONF = __name__
SECRET_KEY='1'
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [
'templates/'
],
},
]
def home(request):
    """Render the homepage with the book list loaded from dict.json."""
    title = 'epub2go'
    # Bug fix: the original `json.load(open(...))` leaked the file handle;
    # a context manager guarantees it is closed.
    with open('dict.json', 'r') as f:
        items = json.load(f)
    # Pass an explicit context instead of locals() so the template's inputs
    # are visible at a glance and no stray local names leak into it.
    return render(request, 'index.html', {'title': title, 'items': items})
urlpatterns = [
    path('', home, name='homepage'),
]

11
uv.lock generated
View File

@@ -72,6 +72,7 @@ version = "1.0"
source = { editable = "." }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "pyfzf" },
{ name = "requests" },
{ name = "tqdm" },
{ name = "urllib3" },
@@ -80,6 +81,7 @@ dependencies = [
[package.metadata]
requires-dist = [
{ name = "beautifulsoup4", specifier = "==4.12.3" },
{ name = "pyfzf", specifier = ">=0.3.1" },
{ name = "requests", specifier = "==2.32.3" },
{ name = "tqdm", specifier = ">=4.67.1" },
{ name = "urllib3", specifier = "==2.2.2" },
@@ -94,6 +96,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 },
]
[[package]]
name = "pyfzf"
version = "0.3.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d4/4c/c0c658a1e1e9f0e01932990d7947579515fe048d0a515f07458ecd992b8f/pyfzf-0.3.1.tar.gz", hash = "sha256:dd902e34cffeca9c3082f96131593dd20b4b3a9bba5b9dde1b0688e424b46bd2", size = 3652 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/99/35/6a6c7b95390ec58904646a04f54e1b56fd57d7a247588b791c6331697797/pyfzf-0.3.1-py3-none-any.whl", hash = "sha256:736f71563461b75f6f85b55345bdc638fa0dc14c32c857c59e8b1ca1cfa3cf4a", size = 4315 },
]
[[package]]
name = "requests"
version = "2.32.3"