25 Commits
v1.0 ... v2.2

Author SHA1 Message Date
eneller  660af7fab0  feat: allow getting directory without download  2025-03-23 23:55:05 +01:00
eneller  c49a1be369  docs: readme  2025-03-20 22:11:12 +01:00
eneller  4267700763  feat: return epub path  2025-03-16 20:30:42 +01:00
         errors from wget and pandoc are thrown up
eneller  5d063d8597  feat: restructure for memory efficiency  2025-03-16 19:06:33 +01:00
eneller  6754f47e9f  fix: restructure test  2025-03-16 18:57:40 +01:00
eneller  4a8d4f945d  begin restructure  2025-03-16 18:34:12 +01:00
eneller  4903a58619  feat: cli using click  2025-03-16 17:46:53 +01:00
eneller  7dfab60f18  feat: allow setting of downloaddir  2025-03-15 17:02:28 +01:00
eneller  9736c6135f  chore: logging  2025-03-15 16:41:38 +01:00
eneller  d7ae0cc5a2  test: basic file count  2025-03-04 18:45:27 +01:00
eneller  c78aac28ab  chore: error handling  2025-03-04 18:28:17 +01:00
eneller  00f6cef743  chore: move web code to new repo  2025-03-04 11:02:56 +01:00
eneller  9ae25e40ad  refactor: better typing  2025-03-03 23:11:07 +01:00
         dataclass replacing dict
eneller  7be0fbc126  refactor: crawl unified from list  2025-03-03 22:56:58 +01:00
eneller  8f77a97733  refactor: move crawling code to correct file  2025-03-03 22:34:59 +01:00
eneller  967f97f381  refactor: command invocation now avoiding chdir  2025-03-03 22:14:08 +01:00
         using subprocess instead of os.system
eneller  4d8cd00298  refactor: split parsing and logic  2025-03-03 21:34:04 +01:00
eneller  bde605cc90  chore: version bump  2025-02-25 23:07:55 +01:00
eneller  f9942a75d3  feat: display authors  2025-02-25 20:09:31 +01:00
eneller  90bdf83950  chore: webserver stuff  2025-02-25 15:51:57 +01:00
eneller  55e1472e1d  fix: crawl  2025-02-25 14:09:28 +01:00
eneller  4ffe110bc4  fix: redownloading  2025-02-25 13:24:51 +01:00
         `wget --timestamping` (alternatively `-N`) is now used to skip already existing files
eneller  8e0d92d796  feat: interactive cli  2025-02-25 12:22:12 +01:00
         using fzf wrapped by pyfzf
eneller  7f488c638c  begin django webserver  2025-02-25 03:40:12 +01:00
eneller  daddb58c3c  feat: better crawling  2025-02-24 23:35:33 +01:00
10 changed files with 918 additions and 101 deletions

.gitignore

@@ -418,3 +418,4 @@ wheels/
*.css
*.js
*.txt
*.json

README.md

@@ -1,17 +1,39 @@
# epub2go.py
web to epub converter for https://projekt-gutenberg.org.
Web to ePUB Converter for [projekt-gutenberg.org](https://projekt-gutenberg.org) developed in conjunction with a [web interface](https://github.com/eneller/epub2go-web).
## Installation
Requires:
- [pandoc](https://pandoc.org/)
- wget
- python
## Usage
Invoke the script using the url of any page of the book you would like to download:
```
epub2go https://www.projekt-gutenberg.org/ibsen/solness/
- [wget](https://www.gnu.org/software/wget/)
- [fzf](https://github.com/junegunn/fzf) (optional, only for interactive mode)
- [python](https://www.python.org/) (duh)
Assuming you have a recent version of python installed, run
```
pip install git+https://github.com/eneller/epub2go.py
```
This will provide the `epub2go` command.
## Usage
```
Usage: epub2go [OPTIONS] [ARGS]...
Download ePUBs from https://www.projekt-gutenberg.org/
Provide either 0 arguments to enter interactive mode or an arbitrary number
of URLs to download from
Options:
-d, --debug Set the log level to DEBUG
-s, --silent Disable the progress bar
-p, --path TEXT The path to which files are saved
--no-clean Do not parse html files with blocklist
--help Show this message and exit.
```
Examples:
```bash
epub2go https://www.projekt-gutenberg.org/ibsen/solness/
epub2go # will enter interactive mode
```
## Installation
Assuming you have a recent version of python installed, run
```
pip install git+https://github.com/eneller/epub2go.py
```
This will provide the 'epub2go' command.

pyproject.toml

@@ -1,11 +1,13 @@
[project]
name = "epub2go"
version = "1.0"
version = "2.2"
description = "EPUB converter using wget, pandoc and python glue"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"beautifulsoup4==4.12.3",
"click>=8.1.8",
"pyfzf>=0.3.1", # hasnt been updated for some time
"requests==2.32.3",
"tqdm>=4.67.1",
"urllib3==2.2.2",
@@ -20,4 +22,4 @@ include-package-data = true
requires = ["setuptools>=64", "setuptools_scm>=8"]
[tool.setuptools_scm]
# can be empty if no extra settings are needed, presence enables setuptools-scm
# can be empty if no extra settings are needed, presence enables setuptools-scm

src/epub2go/convert.py

@@ -1,89 +1,200 @@
import requests
from bs4 import BeautifulSoup
from bs4 import ResultSet
from urllib.parse import urljoin
from urllib.request import urlopen, urlparse
from urllib.request import urlparse
from tqdm import tqdm
from pyfzf.pyfzf import FzfPrompt
import click
import os, sys
import os, subprocess, shlex, logging
import importlib.resources as pkg_resources
from pathlib import Path
from dataclasses import dataclass
from typing import List
logger = logging.getLogger(__name__)
allbooks_url ='https://www.projekt-gutenberg.org/info/texte/allworka.html'
@dataclass
class Book():
author: str
title: str
url: str
class GBConvert():
#TODO fix toc / headings
def __init__(self,
url:str,
downloaddir,
):
# NOTE move non-code files to data folder
self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
self.blocklist = open(pkg_resources.files('epub2go').joinpath('blocklist.txt')).read().splitlines()
self.root = os.path.dirname(url)
self.url = urlparse(self.root)
self.output = self.url.netloc + self.url.path
with open(pkg_resources.files('epub2go').joinpath('blocklist.txt')) as blocklist:
self.blocklist = blocklist.read().splitlines()
self.dir_download = downloaddir
def getDir(self, url):
tocpage = os.path.dirname(url) # ToC website url
parsed_url = urlparse(tocpage)
# directories created by wget recreating the URL
dir_output = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path )
return dir_output
def get_meta(self):
response = requests.get(self.root)
def download(self,
url:str,
author:str = None,
title:str = None,
showprogress: bool = False,
cleanpages: bool = True,
):
tocpage = os.path.dirname(url) # ToC website url
dir_output = self.getDir(url)
logger.debug('Downloading to %s, expecting files in %s', self.dir_download, dir_output)
author = author
title = title
#parse_meta
response = requests.get(tocpage)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
self.author = soup.find('meta', {'name': 'author'})['content']
self.title = soup.find('meta', {'name': 'title'})['content']
self.toc = soup.find('ul').find_all('a')
def save_page(self, url):
# https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
command = f'''wget \
--page-requisites \
--convert-links \
--execute \
--tries=5 \
--quiet \
{url}'''
os.system(command)
# TODO allow setting these from interactive mode where those parameters are figured out from the list
if not author:
try:
author = soup.find('meta', {'name': 'author'})['content']
except (TypeError, KeyError):
author = "UnknownAuthor"
if not title:
try:
title = soup.find('meta', {'name': 'title'})['content']
except (TypeError, KeyError):
title = "UnknownTitle"
chapter_urls = soup.find('ul').find_all('a')
logger.debug('Found ToC with %d entries', len(chapter_urls))
def clean_page(self,file_path):
f = open(file_path, 'r').read()
soup = BeautifulSoup(f, 'html.parser')
for blocker in self.blocklist:
for item in soup.select(blocker):
item.decompose()
open(file_path, 'w').write(str(soup))
#run
#TODO include images flag
# download all files in toc (chapters)
chapter_files = []
for item in (tqdm(chapter_urls) if showprogress else chapter_urls):
item_url = self.parse_toc_entry(tocpage, item)
parsed_url = urlparse(item_url)
filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
if cleanpages: self.parse_page(filepath)
chapter_files.append(os.path.basename(item_url))
return self.create_epub(author,title,chapter_files,dir_output)
def parse_toc_entry(self, tocpage, entry):
url = os.path.join(tocpage, entry['href'])
self.save_page(url)
return url
# apply blocklist to file
def parse_page(self,file_path):
#TODO clean up file opening, mmap?
count=0
with open(file_path, 'r+') as f:
soup = BeautifulSoup(f.read(), 'html.parser')
for blocker in self.blocklist:
for item in soup.select(blocker):
item.decompose()
count+=1
f.seek(0)
f.truncate()
f.write(str(soup))
logger.debug('Removed %d tags from page %s during parsing', count, file_path)
def create_epub(self, filename='out.epub'):
os.chdir(self.output)
def create_epub(self, author, title, chapters, dir_output):
#TODO --epub-cover-image
#TODO toc if it isn't described by <h> tags, e.g. https://www.projekt-gutenberg.org/adlersfe/maskenba/
filename = f'{title} - {author}.epub'
logger.debug('Creating epub as "%s"',filename)
command = f'''pandoc -f html -t epub \
-o "{filename}" \
--reference-location=section \
--css="{self.style_path_drama}" \
--metadata title="{self.title}" \
--metadata author="{self.author}" \
--metadata title="{title}" \
--metadata author="{author}" \
--epub-title-page=false \
{" ".join(self.chapters)} '''#TODO --epub-cover-image
os.system(command)
{" ".join(chapters)} '''
subprocess.run(shlex.split(command), cwd=dir_output, check=True)
return os.path.abspath(os.path.join(dir_output,filename))
def run(self):
#TODO include images flag
def save_page(self, url):
logger.debug('Saving page at %s', url)
# https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
command = f'''wget \
--timestamping \
--page-requisites \
--convert-links \
--tries=5 \
--quiet \
{url}'''
subprocess.run(shlex.split(command), cwd=self.dir_download, check=True)
self.get_meta()
# get a list of all books for interactive selection or scraping
def get_all_books() -> List[Book]:
response = requests.get(allbooks_url)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
tags = soup.find('dl').findChildren()
books = []
for tag in tags:
# is description tag, i.e. contains author name
if tag.name =='dt':
# update author
# special case when author name and Alphabetical list is in same tag
br_tag = tag.find('br')
if br_tag:
book_author = str(br_tag.next_sibling)
# default case, dt only contains author name
else:
book_author = tag.get_text(strip=True)
book_author = ' '.join(book_author.split())
# is details tag, contains book url
elif tag.name == 'dd':
book_tag = tag.a
if book_tag:
book_href = book_tag.get('href')
book_url = urljoin(allbooks_url, book_href)
book_title = ' '.join(book_tag.getText().split())
book = Book(book_author, book_title, book_url)
books.append(book)
return books
map(lambda x: self.save_page(os.path.join(self.root, x['href'])), self.toc)
self.chapters = []
for item in tqdm(self.toc):
item_title= item.get_text()
item_url = os.path.join(self.root, item['href'])
self.save_page(url=item_url)
parsed_url = urlparse(item_url)
filepath = parsed_url.netloc + parsed_url.path
self.clean_page(filepath)
self.chapters.append(item['href'])
self.create_epub(f'{self.title} - {self.author}.epub')
def main():
g = GBConvert(sys.argv[1])
g.run()
# run main cli
@click.command()
#TODO include images flag
@click.option('--debug', '-d', is_flag=True, help='Set the log level to DEBUG')
@click.option('--silent', '-s', is_flag=True, help='Disable the progress bar')
@click.option('--path','-p',type=str,default='./', help='The path to which files are saved' )
@click.option('--no-clean',is_flag=True,help='Do not parse html files with blocklist')
@click.argument('args', nargs=-1)
def main(args, debug, silent, path, no_clean):
'''
Download ePUBs from https://www.projekt-gutenberg.org/ \n
Provide either 0 arguments to enter interactive mode or an arbitrary number of URLs to download from
'''
logging.basicConfig(level=logging.ERROR,format='%(asctime)s - %(levelname)s - %(message)s')
if(debug): logger.setLevel(logging.DEBUG)
# non-interactive mode
if len(args) > 0 :
books = args
# interactive mode using fzf
else:
logger.debug('Received no CLI arguments, starting interactive mode')
delimiter = ';'
# create lines for fzf
books = [f"{ item.author } - { item.title } {delimiter} { item.url }" for item in get_all_books()]
fzf = FzfPrompt()
selection = fzf.prompt(choices=books, fzf_options=r'--exact --with-nth 1 -m -d\;')
books = [item.split(';')[1].strip() for item in selection]
logger.debug('Attempting to download from %d URL(s)', len(books))
converter = GBConvert(path)
if len(books)==1:
converter.download(books[0], showprogress=not silent, cleanpages= not no_clean)
else:
for book in (tqdm(books) if not silent else books):
converter.download(book, cleanpages= not no_clean)
if __name__ == "__main__":
main()
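
Since the restructure, the conversion logic can also be driven without the click CLI. The snippet below is a minimal sketch and not part of the diff: it assumes the package has been installed per the README (so the module is importable as `epub2go.convert`, with wget and pandoc on the PATH) and uses only the `GBConvert(path)`, `download()`, and `get_all_books()` signatures shown above.

```python
# Minimal sketch of programmatic use (assumption: installed as the `epub2go`
# package, with wget and pandoc available on the PATH).
from epub2go.convert import GBConvert, get_all_books

converter = GBConvert('./')  # download directory, same default as the CLI's --path

# Download a single book; download() returns the absolute path of the epub.
epub_path = converter.download(
    'https://www.projekt-gutenberg.org/ibsen/solness/',
    showprogress=True,   # tqdm bar over the ToC entries
    cleanpages=True,     # strip elements matched by the blocklist selectors
)
print(epub_path)

# Or choose from the full catalogue, as interactive mode and crawl.py do.
book = get_all_books()[0]
converter.download(book.url, author=book.author, title=book.title)
```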

src/epub2go/crawl.py (new file)

@@ -0,0 +1,21 @@
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
from bs4 import ResultSet
import os
from urllib.parse import urljoin
from convert import GBConvert, allbooks_url, get_all_books, Book
def main():
books = get_all_books()
# NOTE consider making this a map()
converter = GBConvert('./')
for book in tqdm(books):
if book.url is not None:
converter.download(book.url)
if __name__ == "__main__":
main()

src/epub2go/prosa.css (new file)

@@ -0,0 +1,636 @@
/* Modified prose styles from www.projekt-gutenberg.org - as of January 2020 */
@page {
margin: 5pt;
}
body {
font-family: serif;
/*font-family: arial;margin-right: 10%;margin-left: 10%;margin-top: 0%;margin-bottom: 3%;*/
}
/* Table of contents */
.toc {
display: none;
}
/********************* Links *********************/
a:link {
color: #039;
text-decoration: none;
}
/* Title page */
.title {
/*to be set */
}
.subtitle {
color: darkgray;
}
.author {
color: gray;
}
.authorlist {
text-align: left;
}
.box {
margin-top: 1.5em;
margin-left: 15%;
margin-right: 15%;
margin-bottom: 1.5em;
padding-top: 1em;
padding-left: 1em;
padding-right: 1em;
padding-bottom: 1em;
border-top: 1px #666 solid;
border-right: 1px #666 solid;
border-bottom: 1px #666 solid;
border-left: 1px #666 solid;
}
.dedication {
text-indent: 0;
text-align: center;
font-size: large;
margin-top: 2em;
margin-bottom: 2em;
margin-left: 20%;
margin-right: 20%;
}
/* Illustrations */
img {
max-width: 100%;
}
img.initial {
float: left;
margin-top: 0;
margin-bottom: 0;
margin-right: 0.3em;
}
img.left {
float: left;
margin-top: 0.5em;
margin-bottom: 0.5em;
margin-right: 0.5em;
}
img.right {
float: right;
margin-top: 0.5em;
margin-bottom: 0.5em;
margin-left: 0.5em;
}
img.deko {
margin-bottom: 20px;
margin-top: 10px;
border: 1px solid #606060;
text-align: center;
}
.figcaption {
text-indent: 0;
text-align: center;
font-style: italic;
}
.figure {
text-indent: 0;
text-align: center;
margin-top: 1em;
margin-bottom: 1em;
}
/* Text formatting */
.fraktur {
font-family: "Frankenstein", Times, serif;
}
.smallcaps {
font-variant: small-caps;
}
.lektorat {
color: darkgrey;
font-size: small;
}
.motto {
text-indent: 0;
margin-left: 50%;
margin-top: 1em;
margin-bottom: 1em;
}
.note {
line-height: 90%;
font-size: 90%;
}
.recipient {
margin-left: -1em;
margin-top: 1em;
margin-bottom: 1em;
}
/* Stage directions in a play */
.regie, .action {
font-size: 90%;
font-style: italic;
}
/*.sender {
margin-left: 2em;
font-style: italic;
font-weight: bold;
color: darkblue;
margin-left: 2em;
}*/
.signatur, .signature {
text-align: right;
margin-right: 2em;
}
/* Speaker in a play. Modified. Re. */
.speaker {
color: #333;
font-weight: bold;
}
/* Letter-spaced text (Duden: punctuation other than periods and quotation marks is spaced along with it, numbers are not); not supported by some readers */
.wide, .spaced {
letter-spacing: 0.15em;
}
/******************** Headings ********************/
h1, h2, h3, h4, h6 {
text-align: center;
}
h5 {
text-align: center;
font-size: 90%;
color: #808080;
font-weight: normal;
}
/******************** Body text ********************/
p {
margin-top: 0.4em;
margin-bottom: 0.4em;
text-indent: 0.8em;
text-align: justify;
widows: 2;
orphans: 2;
}
p.abstract {
font-size: 90%;
font-style: italic;
margin-left: 3em;
margin-right: 3em;
text-indent: 0;
}
p.center {
text-indent: 0;
text-align: center;
}
p.centerbig {
margin-bottom: 0.6em;
margin-top: 0.6em;
text-indent: 0;
text-align: center;
font-size: 115%;
}
p.centersml {
text-indent: 0;
text-align: center;
font-size: 90%;
margin-bottom: 0.3em;
margin-top: 0.3em;
}
p.dblmarg {
text-indent: 0;
margin-left: 10%;
margin-right: 10%;
text-align: justify;
}
p.drama {
margin-left: 2em;
text-indent: -2em;
margin-top: 0.5em;
margin-bottom: 0.5em;
}
p.epigraph {
text-indent: 0;
text-align: right;
margin-right: 5%;
font-style: italic;
}
p.left {
text-indent: 0;
text-align: left;
text-align: justify;
}
p.initial {
text-indent: 0;
}
p.leftjust {
text-indent: 0;
text-align: justify;
}
/*p.leftmarg {
text-indent: 0;
text-align: left;
margin-left: 2em;
text-align: justify;
}*/
/********************* Rules within the text *********************/
hr {
text-align: center;
color: #999;
margin-top: 0.5em;
margin-bottom: 0.5em;
/*border-top: 1px solid;border-right: 1px solid;border-bottom: 1px solid;border-left: 1px solid;*/
border: 1px solid;
}
hr.short {
color: #666;
margin-top: 2em;
margin-bottom: 2em;
width: 20%;
height: 1px;
margin-left: 40%;
}
hr.star {
margin-top: 1em;
margin-bottom: 1em;
width: 20%;
margin-left: 40%;
}
/********************* Formatting spanning paragraphs ********************/
div.epigraph {
margin-left: 50%;
margin-right: 5%;
font-style: italic;
}
div.impressum {
display: none;
}
div.motto p {
text-align: right;
text-indent: 0;
}
div.navi {
text-align: center;
}
div.titlepage {
text-align: center;
}
/********************* Poems *********************/
div.poem {
margin-left: 20%;
margin-right: 20%;
margin-bottom: 2em;
}
div.poem blockquote {
margin-left: 3em;
margin-right: 3em;
}
div.vers {
text-indent: 0;
text-align: left;
margin-left: 2em;
margin-top: 1em;
margin-bottom: 1em;
}
div.vers p {
text-indent: 0;
margin-top: 0;
margin-bottom: 0;
}
p.line {
text-align: left;
text-indent: 0;
margin-top: 0;
margin-bottom: 0;
}
p.poem, p.vers {
text-align: left;
text-indent: 0;
margin-top: 1em;
margin-left: 2em;
margin-right: 2em;
margin-bottom: 1em;
}
/********************* Letters *********************/
div.letter {
text-align: left;
margin-left: 1.5em;
margin-top: 1em;
margin-bottom: 1em;
}
p.address {
text-align: right;
text-indent: 0;
font-style: italic;
}
p.date {
text-align: right;
font-style: italic;
}
/********************* Tables *********************/
tbody {
/*font-family: arial;*/
}
td {
/*font-family: arial;*/
}
/* Needed for the multi-column 0hmldir.xml */
table.dirtoc {
margin-top: 0.3em;
margin-bottom: 0.3em;
text-align: left;
}
/* 0.4em horizontal spacing to separator lines, continuation lines indented by 1em */
table.dirtoc td {
padding-top: 0;
padding-bottom: 0;
padding-left: 1.4em;
padding-right: 0.4em;
text-indent: -1em;
}
/* Needed when someone sneakily puts a <div align="center"> in front: */
table.left {
margin-left: 0;
text-align: left;
}
table.motto {
margin-left: 30%;
margin-right: 0;
}
table.right {
margin-right: 0;
}
table.toc {
margin-top: 0.3em;
}
table.toc td {
padding-top: 0;
padding-left: 0.25em;
padding-right: 0.25em;
padding-bottom: 0;
text-align: left;
}
table.true, table.real {
margin-top: 0.3em;
margin-bottom: 0.3em;
text-align: left;
}
/* Definition list */
dd {
margin-left: 2em;
}
dl {
margin-left: 1.5em;
margin-top: 1em;
margin-bottom: 1em;
}
dt {
font-weight: bold;
margin-top: 4pt;
}
/* Unordered list */
ul {
margin-top: 1em;
margin-bottom: 1em;
}
/* Deletion and insertion */
del {
color: red;
}
ins {
color: blue;
}
/* Line with 3 stars: <p class="stars"><sup>*</sup> <sub>*</sub> <sup>*</sup></p> */
p.stars {
text-indent: 0;
text-align: center;
font-size: 200%;
letter-spacing: 0.3em;
margin-top: 0.5em;
margin-bottom: 0;
}
/* Superscript without increasing the line height */
sup {
font-size: 70%;
vertical-align: text-top;
}
sub {
font-size: 70%;
vertical-align: text-bottom;
}
/* Formatting of fractions */
sup.fract {
font-size: 70%;
vertical-align: text-top;
}
sub.fract {
font-size: 70%;
vertical-align: text-bottom;
}
.mainnav {
/*font-family: Arial;*/
background-color: #fff;
text-align: center;
border-top: 1px #d26402 solid;
border-bottom: 1px #d26402 solid;
}
.autalpha {
/*font-family: Arial;*/
text-align: center;
}
.trenner {
font-size: 10pt;
font-weight: bold;
color: #d26402;
}
.right {
text-align: right;
}
.left {
text-align: left;
}
/* Classes to review: are they correct and do they make sense for an e-book? */
.hidden, .hide {
display: none;
}
upper {
/* .upper ? */
text-transform: uppercase;
}
p.initial:first-letter {
/* does not work in all readers */
font-size: 150%;
}
.online {
display: none;
}
/* Page numbers */
.pageref {
/* not yet defined */
}
a.pageref {
display: none;
}
a.pageref:before {
content: "[";
}
a.pageref:after {
content: "]";
}
tt {
font-family: Courier;
}
/* better
span.truetype {
font-family: monospace;
}*/
/********************* Notes and footnotes. Modified. Re. *********************/
a:visited {
color: #039;
text-decoration: none;
}
a:hover {
color: #039;
text-decoration: none;
background-color: #e0e0e0;
}
a:active {
color: #039;
text-decoration: none;
}
span.tooltip {
color: #800000;
}
span.footnote a:hover {
background-color: #2B2E21;
color: #fff;
}
span.footnote a:link span, span.footnote a:visited span {
display: none;
}
span.footnote a:hover span.fntext {
position: absolute;
margin: 20px;
background-color: beige;
max-width: 400px;
padding: 5px 10px 5px 10px;
border: 1px solid #C0C0C0;
font: normal 12px/14px arial;
color: #000;
text-align: left;
display: block;
text-decoration: none;
left: 10px;
}
span.footnote:before {
content: " [Fußnote: ";
color: #505050;
}
span.footnote:after {
content: "] ";
color: #505050;
}
span.footnote {
color: #505050;
display: inline;
font-size: 90%;
}
span.teletype {
font-family: monospace;
}
/* Old classes */
.overline {
text-decoration: overline;
}
.upper {
text-transform: uppercase;
}
p.end {
text-indent: 0;
text-align: center;
}
p.right {
text-indent: 0;
text-align: right;
}
div.footnote {
display: inline;
}
table.poem, table.vers {
margin-left: auto;
margin-right: auto;
}
td.left {
text-align: left;
}
td.right {
text-align: right;
}
td.center {
text-align: center;
}
/* Tags marked with the lang attribute. Modified. Re. */
*[lang=""] {
color: grey;
}
*[lang="fr"] {
color: red;
}
*[lang="la"] {
color: blue;
}
*[lang="en"] {
color: green;
}
*[lang="it"] {
color: violet;
}
*[lang="el"] {
color: brown;
}
/* ******************************************************************* */
/* Additional definitions without layout, for text structuring */
/* ******************************************************************* */
div.ballad {
/* insert styles here */
}
div.chapter {
/* insert styles here */
}
div.part {
/* insert styles here */
}
div.preface {
/* insert styles here */
}
div.section {
/* insert styles here */
}
div.volume {
/* insert styles here */
}
h3.date {
/* insert styles here */
}
h3.subtitle {
/* insert styles here */
}
h3.translator {
/* insert styles here */
}
h4.date {
/* insert styles here */
}
h4.pseudo {
/* insert styles here */
}
h4.publisher {
/* insert styles here */
}
h4.subtitle {
/* insert styles here */
}
h4.translator {
/* insert styles here */
}
h5.date {
/* insert styles here */
}
h5.translator {
/* insert styles here */
}
div.toc {
display: none;
}
p.toc {
display: none;
}


@@ -1,25 +0,0 @@
import requests
from bs4 import BeautifulSoup
import os
from convert import GBConvert
def main():
allbooks_relative_url ='/info/texte/allworka.html'
root_url = 'https://www.projekt-gutenberg.org'
allbooks_url = root_url + allbooks_relative_url
response = requests.get(allbooks_url)
if (response.status_code != 200): raise Exception(f'Couldnt fetch root page {self.root}')
soup = BeautifulSoup(response.content, 'html.parser')
books = soup.find('dl').find_all('a')
for book in books:
book_title = book.get_text()
book_url_relative = book.get('href')
if book_url_relative is not None:
book_url = root_url + os.path.dirname(book_url_relative)[5:]
gb = GBConvert(book_url)
if __name__ == "__main__":
main()

test/__init__.py (new, empty file)

test/test_epub.py (new file)

@@ -0,0 +1,23 @@
from tqdm import tqdm
from src.epub2go.convert import GBConvert, get_all_books
import unittest
# run using `python -m unittest test/test_epub.py`
class TestEpub(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.schiller_raeuber = GBConvert('https://www.projekt-gutenberg.org/schiller/raeuber/')
cls.anzengru_allersee = GBConvert('https://www.projekt-gutenberg.org/anzengru/allersee/')
def test_schiller_raeuber_toc(self):
self.assertEqual(len(self.schiller_raeuber.toc), 7)
def test_anzengru_allersee_toc(self):
self.assertEqual(len(self.anzengru_allersee.toc), 1)
if __name__ == '__main__':
unittest.main()

uv.lock (generated)

@@ -1,4 +1,5 @@
version = 1
revision = 1
requires-python = ">=3.12"
[[package]]
@@ -57,6 +58,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/0e/f6/65ecc6878a89bb1c23a086ea335ad4bf21a588990c3f535a227b9eea9108/charset_normalizer-3.4.1-py3-none-any.whl", hash = "sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85", size = 49767 },
]
[[package]]
name = "click"
version = "8.1.8"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "colorama", marker = "sys_platform == 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188 },
]
[[package]]
name = "colorama"
version = "0.4.6"
@@ -68,10 +81,12 @@ wheels = [
[[package]]
name = "epub2go"
version = "1.0"
version = "2.2"
source = { editable = "." }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "click" },
{ name = "pyfzf" },
{ name = "requests" },
{ name = "tqdm" },
{ name = "urllib3" },
@@ -80,6 +95,8 @@ dependencies = [
[package.metadata]
requires-dist = [
{ name = "beautifulsoup4", specifier = "==4.12.3" },
{ name = "click", specifier = ">=8.1.8" },
{ name = "pyfzf", specifier = ">=0.3.1" },
{ name = "requests", specifier = "==2.32.3" },
{ name = "tqdm", specifier = ">=4.67.1" },
{ name = "urllib3", specifier = "==2.2.2" },
@@ -94,6 +111,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 },
]
[[package]]
name = "pyfzf"
version = "0.3.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d4/4c/c0c658a1e1e9f0e01932990d7947579515fe048d0a515f07458ecd992b8f/pyfzf-0.3.1.tar.gz", hash = "sha256:dd902e34cffeca9c3082f96131593dd20b4b3a9bba5b9dde1b0688e424b46bd2", size = 3652 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/99/35/6a6c7b95390ec58904646a04f54e1b56fd57d7a247588b791c6331697797/pyfzf-0.3.1-py3-none-any.whl", hash = "sha256:736f71563461b75f6f85b55345bdc638fa0dc14c32c857c59e8b1ca1cfa3cf4a", size = 4315 },
]
[[package]]
name = "requests"
version = "2.32.3"