8 Commits
v1.0 ... v1.2

Author SHA1 Message Date
eneller
bde605cc90 chore: version bump 2025-02-25 23:07:55 +01:00
eneller
f9942a75d3 feat: display authors 2025-02-25 20:09:31 +01:00
eneller
90bdf83950 chore: webserver stuff 2025-02-25 15:51:57 +01:00
eneller
55e1472e1d fix: crawl 2025-02-25 14:09:28 +01:00
eneller
4ffe110bc4 fix: redownloading
`wget --timestamping` (alternatively `-N`) is now used to skip already
existing files
2025-02-25 13:24:51 +01:00
eneller
8e0d92d796 feat: interactive cli
using fzf wrapped by pyfzf
2025-02-25 12:22:12 +01:00
eneller
7f488c638c begin django webserver 2025-02-25 03:40:12 +01:00
eneller
daddb58c3c feat: better crawling 2025-02-24 23:35:33 +01:00
9 changed files with 776 additions and 36 deletions

1
.gitignore vendored
View File

@@ -418,3 +418,4 @@ wheels/
*.css
*.js
*.txt
*.json

View File

@@ -2,8 +2,9 @@
Web to EPUB converter for https://projekt-gutenberg.org.
Requires:
- [pandoc](https://pandoc.org/)
- wget
- python
- [wget](https://www.gnu.org/software/wget/)
- [fzf](https://github.com/junegunn/fzf) (only for interactive mode)
- python (duh)
## Usage
Invoke the script using the URL of any page of the book you would like to download:
```

View File

@@ -1,11 +1,12 @@
[project]
name = "epub2go"
version = "1.0"
version = "1.2"
description = "EPUB converter using wget, pandoc and python glue"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"beautifulsoup4==4.12.3",
"pyfzf>=0.3.1", # hasnt been updated for some time
"requests==2.32.3",
"tqdm>=4.67.1",
"urllib3==2.2.2",

View File

@@ -1,17 +1,24 @@
import requests
from bs4 import BeautifulSoup
from bs4 import ResultSet
from urllib.parse import urljoin
from urllib.request import urlopen, urlparse
from urllib.request import urlparse
from tqdm import tqdm
from pyfzf.pyfzf import FzfPrompt
import os, sys
import importlib.resources as pkg_resources
from pathlib import Path
allbooks_url ='https://www.projekt-gutenberg.org/info/texte/allworka.html'
root_url = '{url.scheme}://{url.netloc}'.format(url = urlparse(allbooks_url))
class GBConvert():
#TODO fix toc / headings
def __init__(self,
url:str,
standalone = False,
):
# NOTE move non-code files to data folder
self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
@@ -19,6 +26,7 @@ class GBConvert():
self.root = os.path.dirname(url)
self.url = urlparse(self.root)
self.output = self.url.netloc + self.url.path
self.standalone = standalone
def get_meta(self):
response = requests.get(self.root)
@@ -31,9 +39,9 @@ class GBConvert():
def save_page(self, url):
# https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
command = f'''wget \
--timestamping \
--page-requisites \
--convert-links \
--execute \
--tries=5 \
--quiet \
{url}'''
@@ -67,7 +75,7 @@ class GBConvert():
map(lambda x: self.save_page(os.path.join(self.root, x['href'])), self.toc)
self.chapters = []
for item in tqdm(self.toc):
for item in (tqdm(self.toc) if self.standalone else self.toc):
item_title= item.get_text()
item_url = os.path.join(self.root, item['href'])
self.save_page(url=item_url)
@@ -78,12 +86,60 @@ class GBConvert():
self.create_epub(f'{self.title} - {self.author}.epub')
def get_all_books() -> list:
    """Scrape the catalogue page at ``allbooks_url`` into a list of book records.

    Walks every descendant of the first ``<dl>`` element in document order:
    a ``<dt>`` updates the current author, and each ``<dd>`` that contains a
    link yields one book attributed to that author.

    Returns:
        A list of dicts with the keys ``'author'``, ``'title'`` and ``'url'``;
        ``'url'`` is resolved against ``allbooks_url``.

    Raises:
        requests.HTTPError: if the catalogue page answers with an error status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    # Without a timeout a stalled connection would block forever.
    response = requests.get(allbooks_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    tags = soup.find('dl').findChildren()
    books = []
    # Defined up front so a <dd> that appears before the first <dt> is
    # recorded with an empty author instead of raising UnboundLocalError.
    book_author = ''
    for tag in tags:
        # is description tag, i.e. contains author name
        if tag.name == 'dt':
            br_tag = tag.find('br')
            if br_tag:
                # special case: author name and alphabetical list share one
                # tag; the name is whatever follows the <br>
                sibling = br_tag.next_sibling
                # a trailing <br> has no sibling; avoid the literal 'None'
                book_author = '' if sibling is None else str(sibling)
            else:
                # default case, dt only contains author name
                book_author = tag.get_text(strip=True)
            # collapse line breaks and runs of whitespace inside the name
            book_author = ' '.join(book_author.split())
        # is details tag, contains book url
        elif tag.name == 'dd':
            book_tag = tag.a
            if book_tag:
                book_url = urljoin(allbooks_url, book_tag.get('href'))
                book_title = ' '.join(book_tag.getText().split())
                books.append({'author': book_author, 'title': book_title, 'url': book_url})
    return books
def get_all_book_tags() -> ResultSet:
    """Return every anchor tag found inside the first ``<dl>`` of the catalogue page.

    The page at ``allbooks_url`` is downloaded and parsed as UTF-8 HTML.

    Raises:
        requests.HTTPError: if the catalogue page answers with an error status.
    """
    page = requests.get(allbooks_url)
    page.raise_for_status()
    catalogue = BeautifulSoup(page.content, 'html.parser', from_encoding='utf-8').find('dl')
    return catalogue.find_all('a')
def main():
    """Command-line entry point: convert the given books or pick them with fzf.

    Every command-line argument is treated as the URL of a book page. When no
    argument is given, the catalogue from ``get_all_books`` is offered in an
    fzf multi-select prompt and the chosen entries are converted.

    A single book is converted with ``standalone=True``; several books are
    iterated under one tqdm progress bar.
    """
    # non-interactive mode: read the arguments without mutating sys.argv
    books = sys.argv[1:]
    # interactive mode using fzf
    if not books:
        delimiter = ';'
        # one line per book; fzf displays only the first field (--with-nth 1),
        # the URL travels along as the hidden last field
        choices = [f"{item['author']} - {item['title']} {delimiter} {item['url']}" for item in get_all_books()]
        fzf = FzfPrompt()
        selection = fzf.prompt(choices=choices, fzf_options=r'--exact --with-nth 1 -m -d\;')
        # split from the right: author or title may contain the delimiter
        # themselves, the URL is always the last field
        books = [item.rsplit(delimiter, 1)[-1].strip() for item in selection]
    if len(books) == 1:
        GBConvert(books[0], standalone=True).run()
    else:
        for book in tqdm(books):
            GBConvert(book).run()
if __name__ == "__main__":
main()

21
src/epub2go/crawl.py Normal file
View File

@@ -0,0 +1,21 @@
import requests
from tqdm import tqdm
import os
from urllib.parse import urljoin
from convert import GBConvert, get_all_book_tags, allbooks_url
def main():
    """Convert every book linked from the catalogue page.

    Iterates over the anchor tags returned by ``get_all_book_tags`` and runs a
    ``GBConvert`` for each one that carries an ``href``, resolved against
    ``allbooks_url``. Progress is shown with a tqdm bar.
    """
    # NOTE consider making this a map()
    for book in tqdm(get_all_book_tags()):
        book_url_relative = book.get('href')
        # anchors without an href cannot be resolved to a book page
        if book_url_relative is None:
            continue
        GBConvert(urljoin(allbooks_url, book_url_relative)).run()
if __name__ == "__main__":
main()

636
src/epub2go/prosa.css vendored Normal file
View File

@@ -0,0 +1,636 @@
/* Modifizierte Prosa Styles von www.projekt-gutenberg.org - Stand: Januar 2020 */
@page {
margin: 5pt;
}
body {
font-family: serif;
/*font-family: arial;margin-right: 10%;margin-left: 10%;margin-top: 0%;margin-bottom: 3%;*/
}
/* Inhaltsverzeichnis */
.toc {
display: none;
}
/********************* Links *********************/
a:link {
color: #039;
text-decoration: none;
}
/* Titelseite */
.title {
/*to be set */
}
.subtitle {
color: darkgray;
}
.author {
color: gray;
}
.authorlist {
text-align: left;
}
.box {
margin-top: 1.5em;
margin-left: 15%;
margin-right: 15%;
margin-bottom: 1.5em;
padding-top: 1em;
padding-left: 1em;
padding-right: 1em;
padding-bottom: 1em;
border-top: 1px #666 solid;
border-right: 1px #666 solid;
border-bottom: 1px #666 solid;
border-left: 1px #666 solid;
}
.dedication {
text-indent: 0;
text-align: center;
font-size: large;
margin-top: 2em;
margin-bottom: 2em;
margin-left: 20%;
margin-right: 20%;
}
/* Abbildungen */
img {
max-width: 100%;
}
img.initial {
float: left;
margin-top: 0;
margin-bottom: 0;
margin-right: 0.3em;
}
img.left {
float: left;
margin-top: 0.5em;
margin-bottom: 0.5em;
margin-right: 0.5em;
}
img.right {
float: right;
margin-top: 0.5em;
margin-bottom: 0.5em;
margin-left: 0.5em;
}
img.deko {
margin-bottom: 20px;
margin-top: 10px;
border: 1px solid #606060;
text-align: center;
}
.figcaption {
text-indent: 0;
text-align: center;
font-style: italic;
}
.figure {
text-indent: 0;
text-align: center;
margin-top: 1em;
margin-bottom: 1em;
}
/* Textformatierungen */
.fraktur {
font-family: "Frankenstein", Times, serif;
}
.smallcaps {
font-variant: small-caps;
}
.lektorat {
color: darkgrey;
font-size: small;
}
.motto {
text-indent: 0;
margin-left: 50%;
margin-top: 1em;
margin-bottom: 1em;
}
.note {
line-height: 90%;
font-size: 90%;
}
.recipient {
margin-left: -1em;
margin-top: 1em;
margin-bottom: 1em;
}
/* Regie-Anweisung im Schauspiel */
.regie, .action {
font-size: 90%;
font-style: italic;
}
/*.sender {
margin-left: 2em;
font-style: italic;
font-weight: bold;
color: darkblue;
margin-left: 2em;
}*/
.signatur, .signature {
text-align: right;
margin-right: 2em;
}
/* Sprecher im Schauspiel. geändert. Re. */
.speaker {
color: #333;
font-weight: bold;
}
/* Sperrsatz (Duden: Satzzeichen außer Punkt und Anführungszeichen werden mit gesperrt, Zahlen werden's nicht), wird von einigen Readern nicht unterstützt */
.wide, .spaced {
letter-spacing: 0.15em;
}
/******************** Überschriften ********************/
h1, h2, h3, h4, h6 {
text-align: center;
}
h5 {
text-align: center;
font-size: 90%;
color: #808080;
font-weight: normal;
}
/******************** Fließtext ********************/
p {
margin-top: 0.4em;
margin-bottom: 0.4em;
text-indent: 0.8em;
text-align: justify;
widows: 2;
orphans: 2;
}
p.abstract {
font-size: 90%;
font-style: italic;
margin-left: 3em;
margin-right: 3em;
text-indent: 0;
}
p.center {
text-indent: 0;
text-align: center;
}
p.centerbig {
margin-bottom: 0.6em;
margin-top: 0.6em;
text-indent: 0;
text-align: center;
font-size: 115%;
}
p.centersml {
text-indent: 0;
text-align: center;
font-size: 90%;
margin-bottom: 0.3em;
margin-top: 0.3em;
}
p.dblmarg {
text-indent: 0;
margin-left: 10%;
margin-right: 10%;
text-align: justify;
}
p.drama {
margin-left: 2em;
text-indent: -2em;
margin-top: 0.5em;
margin-bottom: 0.5em;
}
p.epigraph {
text-indent: 0;
text-align: right;
margin-right: 5%;
font-style: italic;
}
p.left {
    text-indent: 0;
    /* Justified despite the class name: this is the alignment that was
       already in effect, the shadowed "left" declaration has been dropped. */
    text-align: justify;
}
p.initial {
text-indent: 0;
}
p.leftjust {
text-indent: 0;
text-align: justify;
}
/*p.leftmarg {
text-indent: 0;
text-align: left;
margin-left: 2em;
text-align: justify;
}*/
/********************* Linien im Text *********************/
hr {
text-align: center;
color: #999;
margin-top: 0.5em;
margin-bottom: 0.5em;
/*border-top: 1px solid;border-right: 1px solid;border-bottom: 1px solid;border-left: 1px solid;*/
border: 1px solid;
}
hr.short {
color: #666;
margin-top: 2em;
margin-bottom: 2em;
width: 20%;
height: 1px;
margin-left: 40%;
}
hr.star {
margin-top: 1em;
margin-bottom: 1em;
width: 20%;
margin-left: 40%;
}
/********************* Absatzübergreifende Formatierung ********************/
div.epigraph {
margin-left: 50%;
margin-right: 5%;
font-style: italic;
}
div.impressum {
display: none;
}
div.motto p {
text-align: right;
text-indent: 0;
}
div.navi {
text-align: center;
}
div.titlepage {
text-align: center;
}
/********************* Gedichte *********************/
div.poem {
margin-left: 20%;
margin-right: 20%;
margin-bottom: 2em;
}
div.poem blockquote {
margin-left: 3em;
margin-right: 3em;
}
div.vers {
text-indent: 0;
text-align: left;
margin-left: 2em;
margin-top: 1em;
margin-bottom: 1em;
}
div.vers p {
text-indent: 0;
margin-top: 0;
margin-bottom: 0;
}
p.line {
text-align: left;
text-indent: 0;
margin-top: 0;
margin-bottom: 0;
}
p.poem, p.vers {
text-align: left;
text-indent: 0;
margin-top: 1em;
margin-left: 2em;
margin-right: 2em;
margin-bottom: 1em;
}
/********************* Briefe *********************/
div.letter {
text-align: left;
margin-left: 1.5em;
margin-top: 1em;
margin-bottom: 1em;
}
p.address {
text-align: right;
text-indent: 0;
font-style: italic;
}
p.date {
text-align: right;
font-style: italic;
}
/********************* Tabellen *********************/
tbody {
/*font-family: arial;*/
}
td {
/*font-family: arial;*/
}
/* Wird für mehrspaltige 0hmldir.xml gebraucht */
table.dirtoc {
margin-top: 0.3em;
margin-bottom: 0.3em;
text-align: left;
}
/* 0.4em Horizontal-Abstand zu Trennlinien, Folgezeilen um 1em eingerückt */
table.dirtoc td {
padding-top: 0;
padding-bottom: 0;
padding-left: 1.4em;
padding-right: 0.4em;
text-indent: -1em;
}
/* Notwendig, wenn jemand heimlichtückisch <div align="center"> davorsetzt: */
table.left {
margin-left: 0;
text-align: left;
}
table.motto {
margin-left: 30%;
margin-right: 0;
}
table.right {
margin-right: 0;
}
table.toc {
margin-top: 0.3em;
}
table.toc td {
padding-top: 0;
padding-left: 0.25em;
padding-right: 0.25em;
padding-bottom: 0;
text-align: left;
}
table.true, table.real {
margin-top: 0.3em;
margin-bottom: 0.3em;
text-align: left;
}
/* Definitionsliste */
dd {
margin-left: 2em;
}
dl {
margin-left: 1.5em;
margin-top: 1em;
margin-bottom: 1em;
}
dt {
font-weight: bold;
margin-top: 4pt;
}
/* Ungeordnete Liste */
ul {
margin-top: 1em;
margin-bottom: 1em;
}
/* Löschung und Einfügung */
del {
color: red;
}
ins {
color: blue;
}
/* Zeile mit 3 Sternen: <p class="stars"><sup>*</sup> <sub>*</sub> <sup>*</sup></p> */
p.stars {
text-indent: 0;
text-align: center;
font-size: 200%;
letter-spacing: 0.3em;
margin-top: 0.5em;
margin-bottom: 0;
}
/* Hochstellung ohne Vergößerung des Zeilenabstandes */
sup {
font-size: 70%;
vertical-align: text-top;
}
sub {
font-size: 70%;
vertical-align: text-bottom;
}
/* Formatierung von Brüchen */
sup.fract {
font-size: 70%;
vertical-align: text-top;
}
sub.fract {
font-size: 70%;
vertical-align: text-bottom;
}
.mainnav {
/*font-family: Arial;*/
background-color: #fff;
text-align: center;
border-top: 1px #d26402 solid;
border-bottom: 1px #d26402 solid;
}
.autalpha {
/*font-family: Arial;*/
text-align: center;
}
.trenner {
font-size: 10pt;
font-weight: bold;
color: #d26402;
}
.right {
text-align: right;
}
.left {
text-align: left;
}
/* Zu überprüfende Klassen: sind sie korrekt oder machen sie Sinn für ein EBook? */
.hidden, .hide {
display: none;
}
upper {
/* .upper ? */
text-transform: uppercase;
}
p.initial:first-letter {
/* funktioniert nicht bei allen Readern */
font-size: 150%;
}
.online {
display: none;
}
/* Seitennummern */
.pageref {
/* noch nicht definiert */
}
a.pageref {
display: none;
}
a.pageref:before {
content: "[";
}
a.pageref:after {
content: "]";
}
tt {
font-family: Courier;
}
/* besser
span.truetype {
font-family: monospace;
}*/
/********************* Anmerkungen und Fußnoten. geändert. Re. *********************/
a:visited {
color: #039;
text-decoration: none;
}
a:hover {
color: #039;
text-decoration: none;
background-color: #e0e0e0;
}
a:active {
color: #039;
text-decoration: none;
}
span.tooltip {
color: #800000;
}
span.footnote a:hover {
background-color: #2B2E21;
color: #fff;
}
span.footnote a:link span, span.footnote a:visited span {
display: none;
}
span.footnote a:hover span.fntext {
position: absolute;
margin: 20px;
background-color: beige;
max-width: 400px;
padding: 5px 10px 5px 10px;
border: 1px solid #C0C0C0;
font: normal 12px/14px arial;
color: #000;
text-align: left;
display: block;
text-decoration: none;
left: 10px;
}
span.footnote:before {
content: " [Fußnote: ";
color: #505050;
}
span.footnote:after {
content: "] ";
color: #505050;
}
span.footnote {
color: #505050;
display: inline;
font-size: 90%;
}
span.teletype {
font-family: monospace;
}
/* Alte Klassen */
.overline {
text-decoration: overline;
}
.upper {
text-transform: uppercase;
}
p.end {
text-indent: 0;
text-align: center;
}
p.right {
text-indent: 0;
text-align: right;
}
div.footnote {
display: inline;
}
table.poem, table.vers {
margin-left: auto;
margin-right: auto;
}
td.left {
text-align: left;
}
td.right {
text-align: right;
}
td.center {
text-align: center;
}
/* mit dem lang-Attribut markierte Tags. geändert. Re. */
*[lang=""] {
color: grey;
}
*[lang="fr"] {
color: red;
}
*[lang="la"] {
color: blue;
}
*[lang="en"] {
color: green;
}
*[lang="it"] {
color: violet;
}
*[lang="el"] {
color: brown;
}
/* ******************************************************************* */
/* Zusätzliche Definitionen ohne Layout für Text-Strukturierung */
/* ******************************************************************* */
div.ballad {
/* styles hier einfügen */
}
div.chapter {
/* styles hier einfügen */
}
div.part {
/* styles hier einfügen */
}
div.preface {
/* styles hier einfügen */
}
div.section {
/* styles hier einfügen */
}
div.volume {
/* styles hier einfügen */
}
h3.date {
/* styles hier einfügen */
}
h3.subtitle {
/* styles hier einfügen */
}
h3.translator {
/* styles hier einfügen */
}
h4.date {
/* styles hier einfügen */
}
h4.pseudo {
/* styles hier einfügen */
}
h4.publisher {
/* styles hier einfügen */
}
h4.subtitle {
/* styles hier einfügen */
}
h4.translator {
/* styles hier einfügen */
}
h5.date {
/* styles hier einfügen */
}
h5.translator {
/* styles hier einfügen */
}
div.toc {
display: none;
}
p.toc {
display: none;
}

View File

@@ -1,25 +0,0 @@
import requests
from bs4 import BeautifulSoup
import os
from convert import GBConvert
def main():
    """Fetch the catalogue page and construct a ``GBConvert`` for every linked book.

    Raises:
        Exception: if the catalogue page does not answer with status 200.
    """
    allbooks_relative_url = '/info/texte/allworka.html'
    root_url = 'https://www.projekt-gutenberg.org'
    allbooks_url = root_url + allbooks_relative_url
    response = requests.get(allbooks_url)
    if response.status_code != 200:
        # This is a plain function, so there is no ``self``; report the URL
        # that was actually requested.
        raise Exception(f'Couldnt fetch root page {allbooks_url}')
    soup = BeautifulSoup(response.content, 'html.parser')
    books = soup.find('dl').find_all('a')
    for book in books:
        book_url_relative = book.get('href')
        if book_url_relative is not None:
            # NOTE(review): the [5:] slice presumably strips a relative
            # '../..' prefix from the link's directory part - confirm.
            book_url = root_url + os.path.dirname(book_url_relative)[5:]
            gb = GBConvert(book_url)
if __name__ == "__main__":
main()

38
src/epub2go/web.py Normal file
View File

@@ -0,0 +1,38 @@
# run using `django-admin runserver --pythonpath=. --settings=web`
from django.urls import path
from django.http import HttpResponse, HttpRequest
from django.shortcuts import redirect, render
import requests
from convert import GBConvert, allbooks_url
import json
# Minimal Django settings: this module is loaded as the settings module
# (see the run command at the top) and, via ROOT_URLCONF, also as the URLconf.
# NOTE(review): debug mode is hard-coded on - confirm this never runs in production.
DEBUG = True
ROOT_URLCONF = __name__
# NOTE(review): placeholder secret key - looks intended for local development only; confirm.
SECRET_KEY='1'
# Templates are looked up in the relative directory 'templates/'.
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [
'templates/'
],
},
]
# Landing page view: renders index.html. When the query parameter `t` is
# present, its value is handed to getEpub before rendering.
def root(request: HttpRequest):
title = 'epub2go'
targetParam = request.GET.get('t', None)
if targetParam is not None:
# getEpub currently always raises NotImplementedError, so a request carrying `t` fails here
getEpub(targetParam)
# locals() passes request, title and targetParam to the template under exactly
# these names - renaming a local changes the template context.
return render(request, 'index.html', locals())
# URL routing: the site root is the only route and is served by root().
urlpatterns = [
path('', root, name='root'),
]
# Placeholder for producing an EPUB for the requested target; not implemented yet.
# `param` is the raw value of the `t` query parameter as passed in by root().
# Always raises NotImplementedError; the TODOs below list the planned steps.
def getEpub(param):
# TODO validate / sanitize input
# TODO check for existing file and age
# TODO download
# TODO redirect to loading page
# TODO redirect to download page
raise NotImplementedError

11
uv.lock generated
View File

@@ -72,6 +72,7 @@ version = "1.0"
source = { editable = "." }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "pyfzf" },
{ name = "requests" },
{ name = "tqdm" },
{ name = "urllib3" },
@@ -80,6 +81,7 @@ dependencies = [
[package.metadata]
requires-dist = [
{ name = "beautifulsoup4", specifier = "==4.12.3" },
{ name = "pyfzf", specifier = ">=0.3.1" },
{ name = "requests", specifier = "==2.32.3" },
{ name = "tqdm", specifier = ">=4.67.1" },
{ name = "urllib3", specifier = "==2.2.2" },
@@ -94,6 +96,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 },
]
[[package]]
name = "pyfzf"
version = "0.3.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d4/4c/c0c658a1e1e9f0e01932990d7947579515fe048d0a515f07458ecd992b8f/pyfzf-0.3.1.tar.gz", hash = "sha256:dd902e34cffeca9c3082f96131593dd20b4b3a9bba5b9dde1b0688e424b46bd2", size = 3652 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/99/35/6a6c7b95390ec58904646a04f54e1b56fd57d7a247588b791c6331697797/pyfzf-0.3.1-py3-none-any.whl", hash = "sha256:736f71563461b75f6f85b55345bdc638fa0dc14c32c857c59e8b1ca1cfa3cf4a", size = 4315 },
]
[[package]]
name = "requests"
version = "2.32.3"