11 Commits
v1.2 ... v1.3

Author   SHA1         Message                                              Date
eneller  4903a58619   feat: cli using click                                2025-03-16 17:46:53 +01:00
eneller  7dfab60f18   feat: allow setting of downloaddir                   2025-03-15 17:02:28 +01:00
eneller  9736c6135f   chore: logging                                       2025-03-15 16:41:38 +01:00
eneller  d7ae0cc5a2   test: basic file count                               2025-03-04 18:45:27 +01:00
eneller  c78aac28ab   chore: error handling                                2025-03-04 18:28:17 +01:00
eneller  00f6cef743   chore: move web code to new repo                     2025-03-04 11:02:56 +01:00
eneller  9ae25e40ad   refactor: better typing (dataclass replacing dict)   2025-03-03 23:11:07 +01:00
eneller  7be0fbc126   refactor: crawl unified from list                    2025-03-03 22:56:58 +01:00
eneller  8f77a97733   refactor: move crawling code to correct file         2025-03-03 22:34:59 +01:00
eneller  967f97f381   refactor: command invocation now avoiding chdir (using subprocess instead of os.system)  2025-03-03 22:14:08 +01:00
eneller  4d8cd00298   refactor: split parsing and logic                    2025-03-03 21:34:04 +01:00
7 changed files with 151 additions and 114 deletions
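Commit 967f97f381 replaces the old os.chdir() + os.system() invocation with subprocess.run(..., cwd=...), as the converter diff below shows. A minimal sketch of that pattern; the helper name and the pandoc command are illustrative, not project code:

import shlex
import subprocess

def run_in_dir(command: str, workdir: str) -> int:
    # shlex.split keeps quoted arguments (e.g. filenames with spaces) intact
    args = shlex.split(command)
    # cwd= affects only the child process; the parent never changes its own
    # working directory, so nothing else in the program is disturbed
    return subprocess.run(args, cwd=workdir).returncode

exit_code = run_in_dir('pandoc --version', '.')  # illustrative call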

pyproject.toml
@@ -1,11 +1,12 @@
[project]
name = "epub2go"
version = "1.2"
version = "1.3"
description = "EPUB converter using wget, pandoc and python glue"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    "beautifulsoup4==4.12.3",
    "click>=8.1.8",
    "pyfzf>=0.3.1", # hasn't been updated for some time
    "requests==2.32.3",
    "tqdm>=4.67.1",

src/epub2go/convert.py
@@ -5,38 +5,96 @@ from urllib.parse import urljoin
from urllib.request import urlparse
from tqdm import tqdm
from pyfzf.pyfzf import FzfPrompt
import click
import os, sys
import os, subprocess, shlex, logging
import importlib.resources as pkg_resources
from dataclasses import dataclass
from typing import List

logger = logging.getLogger(__name__)

allbooks_url = 'https://www.projekt-gutenberg.org/info/texte/allworka.html'
root_url = '{url.scheme}://{url.netloc}'.format(url = urlparse(allbooks_url))

@dataclass
class Book():
    author: str
    title: str
    url: str

class GBConvert():
    #TODO fix toc / headings
    def __init__(self,
                 url: str,
                 standalone = False,
                 author: str = None,
                 title: str = None,
                 downloaddir = './',
                 showprogress: bool = False,
                 ):
        # NOTE move non-code files to data folder
        self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
        self.blocklist = open(pkg_resources.files('epub2go').joinpath('blocklist.txt')).read().splitlines()
        self.root = os.path.dirname(url)
        self.url = urlparse(self.root)
        self.output = self.url.netloc + self.url.path
        self.standalone = standalone
        with open(pkg_resources.files('epub2go').joinpath('blocklist.txt')) as blocklist:
            self.blocklist = blocklist.read().splitlines()
        self.tocpage = os.path.dirname(url)  # ToC website url
        url = urlparse(self.tocpage)
        self.dir_download = downloaddir
        self.dir_output = os.path.join(self.dir_download, url.netloc + url.path)  # directories created by wget recreating the URL
        logger.debug('Downloading in %s, expecting files in %s', self.dir_download, self.dir_output)
        self.showprogress = showprogress
        self.author = author
        self.title = title
        self.chapters = []

    def get_meta(self):
        response = requests.get(self.root)
        self.parse_meta()

    def parse_meta(self):
        response = requests.get(self.tocpage)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # TODO allow setting these from interactive mode where those parameters are figured out from the list
        if not self.author:
            try:
                self.author = soup.find('meta', {'name': 'author'})['content']
            except:
                self.author = "UnknownAuthor"
        if not self.title:
            try:
                self.title = soup.find('meta', {'name': 'title'})['content']
            except:
                self.title = "UnknownTitle"
        self.toc = soup.find('ul').find_all('a')
        logger.debug('Found ToC with %d entries', len(self.toc))

    def parse_toc_entry(self, entry):
        url = os.path.join(self.tocpage, entry['href'])
        self.save_page(url)
        return url

    # apply blocklist to file
    def parse_page(self, file_path):
        #TODO clean up file opening, mmap?
        logger.debug('Parsing page at %s', file_path)
        with open(file_path, 'r+') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
            for blocker in self.blocklist:
                for item in soup.select(blocker):
                    item.decompose()
            f.write(str(soup))

    def create_epub(self, filename='out.epub') -> int:
        #TODO --epub-cover-image
        #TODO toc if it isn't described by <h> tags, e.g. https://www.projekt-gutenberg.org/adlersfe/maskenba/
        logger.debug('Creating epub as "%s"', filename)
        command = f'''pandoc -f html -t epub \
            -o "{filename}" \
            --reference-location=section \
            --css="{self.style_path_drama}" \
            --metadata title="{self.title}" \
            --metadata author="{self.author}" \
            --epub-title-page=false \
            {" ".join(self.chapters)} '''
        return subprocess.run(shlex.split(command), cwd=self.dir_output).returncode

    def save_page(self, url):
        logger.debug('Saving page at %s', url)
        # https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
        command = f'''wget \
            --timestamping \
@@ -45,48 +103,22 @@
            --tries=5 \
            --quiet \
            {url}'''
        os.system(command)

    def clean_page(self, file_path):
        f = open(file_path, 'r').read()
        soup = BeautifulSoup(f, 'html.parser')
        for blocker in self.blocklist:
            for item in soup.select(blocker):
                item.decompose()
        open(file_path, 'w').write(str(soup))

    def create_epub(self, filename='out.epub'):
        os.chdir(self.output)
        command = f'''pandoc -f html -t epub \
            -o "{filename}" \
            --reference-location=section \
            --css="{self.style_path_drama}" \
            --metadata title="{self.title}" \
            --metadata author="{self.author}" \
            --epub-title-page=false \
            {" ".join(self.chapters)} '''  #TODO --epub-cover-image
        os.system(command)
        return subprocess.run(shlex.split(command), cwd=self.dir_download).returncode

    def run(self):
        #TODO include images flag
        self.get_meta()
        map(lambda x: self.save_page(os.path.join(self.root, x['href'])), self.toc)
        self.chapters = []
        for item in (tqdm(self.toc) if self.standalone else self.toc):
            item_title = item.get_text()
            item_url = os.path.join(self.root, item['href'])
            self.save_page(url=item_url)
        # download all files in toc (chapters)
        for item in (tqdm(self.toc) if self.showprogress else self.toc):
            item_url = self.parse_toc_entry(item)
            parsed_url = urlparse(item_url)
            filepath = parsed_url.netloc + parsed_url.path
            self.clean_page(filepath)
            self.chapters.append(item['href'])
            filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
            self.parse_page(filepath)
            self.chapters.append(os.path.basename(item_url))
        self.create_epub(f'{self.title} - {self.author}.epub')
        return self.create_epub(f'{self.title} - {self.author}.epub')

def get_all_books() -> list:
# get a list of all books for interactive selection or scraping
def get_all_books() -> List[Book]:
    response = requests.get(allbooks_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
@@ -111,35 +143,40 @@ def get_all_books() -> list:
        book_href = book_tag.get('href')
        book_url = urljoin(allbooks_url, book_href)
        book_title = ' '.join(book_tag.getText().split())
        book = {'author': book_author, 'title': book_title, 'url': book_url}
        book = Book(book_author, book_title, book_url)
        books.append(book)
    return books

def get_all_book_tags() -> ResultSet:
    response = requests.get(allbooks_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    books = soup.find('dl').find_all('a')
    return books

def main():
    sys.argv.pop(0)
# run main cli
@click.command()
@click.option('--debug', '-d', is_flag=True, help='Set the log level to DEBUG')
@click.option('--silent', '-s', is_flag=True, help='Disable the progress bar')
@click.argument('args', nargs=-1)
def main(args, debug, silent):
    '''
    Download ePUBs from https://www.projekt-gutenberg.org/
    Provide either 0 arguments to enter interactive mode or an arbitrary number of URLs to download from
    '''
    logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')
    if(debug): logger.setLevel(logging.DEBUG)
    # non-interactive mode
    if len(sys.argv) > 0:
        books = sys.argv
    if len(args) > 0:
        books = args
    # interactive mode using fzf
    else:
        logger.debug('Received no CLI arguments, starting interactive mode')
        delimiter = ';'
        # create lines for fzf
        books = [f"{item['author']} - {item['title']} {delimiter} {item['url']}" for item in get_all_books()]
        books = [f"{item.author} - {item.title} {delimiter} {item.url}" for item in get_all_books()]
        fzf = FzfPrompt()
        selection = fzf.prompt(choices=books, fzf_options=r'--exact --with-nth 1 -m -d\;')
        books = [item.split(';')[1].strip() for item in selection]
    logger.debug('Attempting to download from %d URL(s)', len(books))
    if len(books) == 1:
        GBConvert(books[0], standalone=True).run()
        GBConvert(books[0], showprogress=not silent).run()
    else:
        for book in tqdm(books):
        for book in (tqdm(books) if not silent else books):
            GBConvert(book).run()

if __name__ == "__main__":
    main()
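Commit 9ae25e40ad replaces the ad-hoc book dict with the Book dataclass shown above. A minimal sketch of the difference; the sample values are borrowed from the test URLs below, not project code:

from dataclasses import dataclass

@dataclass
class Book:
    author: str
    title: str
    url: str

book = Book('Schiller', 'Die Räuber', 'https://www.projekt-gutenberg.org/schiller/raeuber/')
# attribute access replaces string-keyed lookups such as book['author'];
# type checkers and IDEs can now verify the field names and types
print(f'{book.author} - {book.title}')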

@@ -1,20 +1,19 @@
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
from bs4 import ResultSet
import os
from urllib.parse import urljoin
from convert import GBConvert, get_all_book_tags, allbooks_url
from convert import GBConvert, allbooks_url, get_all_books, Book

def main():
    books = get_all_book_tags()
    books = get_all_books()
    # NOTE consider making this a map()
    for book in tqdm(books):
        book_title = book.get_text()
        book_url_relative = book.get('href')
        if book_url_relative is not None:
            book_url = urljoin(allbooks_url, book_url_relative)
            GBConvert(book_url).run()
        if book.url is not None:
            GBConvert(book.url).run()

if __name__ == "__main__":

web.py
@@ -1,38 +0,0 @@
# run using `django-admin runserver --pythonpath=. --settings=web`
from django.urls import path
from django.http import HttpResponse, HttpRequest
from django.shortcuts import redirect, render
import requests
from convert import GBConvert, allbooks_url
import json

DEBUG = True
ROOT_URLCONF = __name__
SECRET_KEY = '1'
TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [
            'templates/'
        ],
    },
]

def root(request: HttpRequest):
    title = 'epub2go'
    targetParam = request.GET.get('t', None)
    if targetParam is not None:
        getEpub(targetParam)
    return render(request, 'index.html', locals())

urlpatterns = [
    path('', root, name='root'),
]

def getEpub(param):
    # TODO validate / sanitize input
    # TODO check for existing file and age
    # TODO download
    # TODO redirect to loading page
    # TODO redirect to download page
    raise NotImplementedError

test/__init__.py (new, empty file)

test/test_epub.py (new file, 23 lines)

@@ -0,0 +1,23 @@
from tqdm import tqdm
from src.epub2go.convert import GBConvert, get_all_books
import unittest

# run using `python -m unittest test/test_epub.py`
class TestEpub(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.schiller_raeuber = GBConvert('https://www.projekt-gutenberg.org/schiller/raeuber/')
        cls.schiller_raeuber.parse_meta()  # populate .toc before counting entries
        cls.anzengru_allersee = GBConvert('https://www.projekt-gutenberg.org/anzengru/allersee/')
        cls.anzengru_allersee.parse_meta()

    def test_schiller_raeuber_toc(self):
        self.assertEqual(len(self.schiller_raeuber.toc), 7)

    def test_anzengru_allersee_toc(self):
        self.assertEqual(len(self.anzengru_allersee.toc), 1)

if __name__ == '__main__':
    unittest.main()
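For orientation, a usage sketch of the API these tests exercise, combining the new downloaddir and showprogress keyword arguments from this release; the /tmp path is arbitrary, and wget, pandoc and network access are assumed to be available:

from src.epub2go.convert import GBConvert

converter = GBConvert(
    'https://www.projekt-gutenberg.org/schiller/raeuber/',
    downloaddir='/tmp/epub2go',  # wget mirrors the chapter pages under here
    showprogress=True,           # tqdm progress bar over the ToC entries
)
exit_code = converter.run()  # run() returns pandoc's exit code via create_epub()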

uv.lock (generated, 17 changed lines)

@@ -1,4 +1,5 @@
version = 1
revision = 1
requires-python = ">=3.12"

[[package]]
@@ -57,6 +58,18 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/0e/f6/65ecc6878a89bb1c23a086ea335ad4bf21a588990c3f535a227b9eea9108/charset_normalizer-3.4.1-py3-none-any.whl", hash = "sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85", size = 49767 },
]

[[package]]
name = "click"
version = "8.1.8"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "colorama", marker = "sys_platform == 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188 },
]

[[package]]
name = "colorama"
version = "0.4.6"

@@ -68,10 +81,11 @@ wheels = [
[[package]]
name = "epub2go"
version = "1.0"
version = "1.2"
source = { editable = "." }
dependencies = [
    { name = "beautifulsoup4" },
    { name = "click" },
    { name = "pyfzf" },
    { name = "requests" },
    { name = "tqdm" },

@@ -81,6 +95,7 @@ dependencies = [
[package.metadata]
requires-dist = [
    { name = "beautifulsoup4", specifier = "==4.12.3" },
    { name = "click", specifier = ">=8.1.8" },
    { name = "pyfzf", specifier = ">=0.3.1" },
    { name = "requests", specifier = "==2.32.3" },
    { name = "tqdm", specifier = ">=4.67.1" },