Initial Commit

This commit is contained in:
eneller
2024-08-17 23:28:26 +02:00
commit 2e83eed2ca
6 changed files with 931 additions and 0 deletions

345
.gitignore vendored Normal file
View File

@@ -0,0 +1,345 @@
/nbproject/
/system/IECache/**/*
/multiCMS/updateFramework.php
*.epub
#/multiCMS/system/DBData/Installation.pfdb.php
# Created by https://www.toptal.com/developers/gitignore/api/phpstorm+all,python,visualstudiocode,pycharm+all
# Edit at https://www.toptal.com/developers/gitignore?templates=phpstorm+all,python,visualstudiocode,pycharm+all
### PhpStorm+all ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
# AWS User-specific
.idea/**/aws.xml
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# SonarLint plugin
.idea/sonarlint/
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
### PhpStorm+all Patch ###
# Ignore everything but code style settings and run configurations
# that are supposed to be shared within teams.
.idea/*
!.idea/codeStyles
!.idea/runConfigurations
### PyCharm+all ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
# AWS User-specific
# Generated files
# Sensitive or high-churn files
# Gradle
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
# Mongo Explorer plugin
# File-based project format
# IntelliJ
# mpeltonen/sbt-idea plugin
# JIRA plugin
# Cursive Clojure plugin
# SonarLint plugin
# Crashlytics plugin (for Android Studio and IntelliJ)
# Editor-based Rest Client
# Android studio 3.1+ serialized cache file
### PyCharm+all Patch ###
# Ignore everything but code style settings and run configurations
# that are supposed to be shared within teams.
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml
# ruff
.ruff_cache/
# LSP config files
pyrightconfig.json
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets
# Local History for Visual Studio Code
.history/
# Built Visual Studio Code Extensions
*.vsix
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide
# End of https://www.toptal.com/developers/gitignore/api/phpstorm+all,python,visualstudiocode,pycharm+all

10
blocklist.txt Normal file
View File

@@ -0,0 +1,10 @@
html body div.navi-gb
html body h5
html body div.dropdown
html body div.anzeige-chap
html body a
html body hr
html body h3.author
html body h2.title
html body div.bottomnavi-gb
html head

84
convert.py Normal file
View File

@@ -0,0 +1,84 @@
import os
import subprocess
import sys
from pathlib import Path
from urllib.parse import urljoin, urlparse
from urllib.request import urlopen

import requests
from bs4 import BeautifulSoup
class GBConvert:
    """Download a book published as HTML chapter pages and build an EPUB.

    The workflow implemented by :meth:`run` is: read author, title and the
    chapter list from the book's root page, mirror every chapter with
    ``wget``, strip unwanted elements listed in ``blocklist.txt`` from the
    downloaded HTML, and finally call ``pandoc`` to assemble the EPUB.

    Attributes set by ``__init__``:
        root: URL of the book directory (``url`` without its last component).
        url: ``urlparse`` result of ``root``.
        output: Local directory (``<host><path>``) the pages are expected in.
        blocklist: CSS selectors whose matches are removed from every page.

    Attributes set later:
        author, title, toc: Filled in by :meth:`get_meta`.
        chapters: Chapter files relative to ``output``; filled in by :meth:`run`.
    """

    # TODO fix toc / headings
    def __init__(self, url: str):
        """Derive the book root from ``url`` and load the selector blocklist.

        Args:
            url: URL of a page inside the book directory (for example its
                index page). Only the directory part is kept.

        Raises:
            OSError: If ``blocklist.txt`` cannot be read from the current
                working directory.
        """
        # Everything before the last "/" is treated as the book's root.
        self.root = os.path.dirname(url)
        self.url = urlparse(self.root)
        # The code assumes wget mirrors pages into "<host>/<path>".
        self.output = self.url.netloc + self.url.path
        with open('blocklist.txt', 'r', encoding='utf-8') as blocklist_file:
            # Blank lines are skipped: an empty selector is not valid CSS.
            self.blocklist = [
                line for line in blocklist_file.read().splitlines()
                if line.strip()
            ]

    @staticmethod
    def _meta_content(soup, name):
        """Return the ``content`` of ``<meta name=NAME>`` or raise ValueError."""
        tag = soup.find('meta', {'name': name})
        if tag is None or tag.get('content') is None:
            raise ValueError(
                f'Page has no <meta name="{name}"> tag with a content attribute'
            )
        return tag['content']

    def get_meta(self):
        """Fetch the root page and extract author, title and chapter links.

        Sets ``self.author``, ``self.title`` and ``self.toc`` (the ``<a>``
        tags with an ``href`` inside the first ``<ul>`` of the page).

        Raises:
            requests.HTTPError: If the root page answers with an error status.
            ValueError: If the metadata tags or the chapter list are missing.
        """
        # A timeout keeps an unresponsive server from hanging the run forever.
        response = requests.get(self.root, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        self.author = self._meta_content(soup, 'author')
        self.title = self._meta_content(soup, 'title')
        toc_list = soup.find('ul')
        if toc_list is None:
            raise ValueError(f'No chapter list (<ul>) found at {self.root}')
        # Anchors without href cannot be downloaded, so they are ignored.
        self.toc = toc_list.find_all('a', href=True)

    def save_page(self, url):
        """Mirror ``url`` and its page requisites into the working directory.

        The download is best effort: a non-zero wget exit status is ignored.

        Args:
            url: Absolute URL of the page to download.

        Raises:
            FileNotFoundError: If the ``wget`` executable is not installed.
        """
        # https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
        # An argument list (no shell) keeps characters such as "&" or ";" in
        # the URL from being interpreted by a shell.
        # "--execute" requires a wgetrc command as its argument; it was given
        # none, so it is omitted and "--tries=5" is passed as a normal option.
        command = [
            'wget',
            '--page-requisites',
            '--convert-links',
            '--tries=5',
            '--quiet',
            url,
        ]
        subprocess.run(command, check=False)

    def clean_page(self, file_path):
        """Remove every element matching a blocklist selector, in place.

        Args:
            file_path: Path of the downloaded HTML file to rewrite.

        Raises:
            OSError: If the file cannot be read or written.
        """
        # Platform default text encoding is used for reading and writing.
        with open(file_path, 'r') as page:
            soup = BeautifulSoup(page.read(), 'html.parser')
        for blocker in self.blocklist:
            for item in soup.select(blocker):
                item.decompose()
        with open(file_path, 'w') as page:
            page.write(str(soup))

    def create_epub(self, filename='out.epub'):
        """Run pandoc inside ``self.output`` to build the EPUB.

        Requires ``self.title``, ``self.author`` and ``self.chapters`` to be
        set. pandoc runs with ``self.output`` as its working directory, so
        ``filename``, the chapter paths and the stylesheet path are resolved
        relative to it. The working directory of this process is not changed,
        and pandoc's exit status is ignored.

        Args:
            filename: Name of the EPUB file to create inside ``self.output``.

        Raises:
            FileNotFoundError: If ``pandoc`` is not installed or
                ``self.output`` does not exist.
        """
        # Title and author come from a remote page; passing them as separate
        # arguments avoids shell quoting problems and command injection.
        command = [
            'pandoc', '-f', 'html', '-t', 'epub',
            '-o', filename,
            '--reference-location=section',
            '--css=../../../drama.css',
            '--metadata', f'title={self.title}',
            '--metadata', f'author={self.author}',
            '--epub-title-page=false',
            *self.chapters,
        ]  # TODO --epub-cover-image
        subprocess.run(command, cwd=self.output, check=False)

    def run(self):
        """Download, clean and convert the whole book.

        Each chapter from the table of contents is downloaded exactly once,
        cleaned, and recorded in ``self.chapters``; afterwards the EPUB
        ``"<title> - <author>.epub"`` is created.
        """
        # TODO include images flag
        self.get_meta()
        self.chapters = []
        # urljoin only appends to the base when it ends with a slash.
        base_url = self.root + '/'
        for item in self.toc:
            item_url = urljoin(base_url, item['href'])
            self.save_page(url=item_url)
            parsed_url = urlparse(item_url)
            filepath = parsed_url.netloc + parsed_url.path
            self.clean_page(filepath)
            # pandoc runs inside self.output, so chapters are stored relative
            # to it (for a plain "chapter.html" link this is the href itself).
            self.chapters.append(os.path.relpath(filepath, self.output))
        self.create_epub(f'{self.title} - {self.author}.epub')
def main():
    """Convert the book whose URL is given as the first command-line argument.

    Exits with a usage message (non-zero status) when no URL is supplied.
    """
    if len(sys.argv) < 2:
        # Without this check a missing argument surfaces as a bare IndexError.
        sys.exit(f'usage: {sys.argv[0]} <book-url>')
    g = GBConvert(sys.argv[1])
    g.run()


if __name__ == "__main__":
    main()

464
drama.css Normal file
View File

@@ -0,0 +1,464 @@
/* drama.css - Stand März 2024 */
body {
margin-right: 10%;
margin-left: 10%;
margin-top: 0%;
margin-bottom: 1%;
}
.toc {
display: none;
}
a:link {
color: #039;
text-decoration: none;
}
a:visited {
color: #039;
text-decoration: none;
}
a:hover {
color: #039;
text-decoration: none;
background-color: #e0e0e0;
}
a:active {
color: #039;
text-decoration: none;
}
td {
font-family: "Arial", sans-serif;
}
.mainnav {
font-family: "Arial", sans-serif;
font-variant: small-caps;
background-color: #fff;
text-align: center;
border-top: 1px #d26402 solid;
border-bottom: 1px #d26402 solid;
}
.mainnav-ed {
font-family: "Arial", sans-serif;
background-color: #fff;
text-align: center;
border-top: 1px #cc6060 solid;
border-bottom: 1px #cc6060 solid;
}
.autalpha {
font-family: "Arial", sans-serif;
text-align: center;
}
.trenner {
font-size: 10pt;
font-weight: bold;
color: #d26402;
}
.right {
text-align: right;
}
.left {
text-align: left;
}
.authorlist {
text-align: left;
}
.author {
color: gray;
}
.box {
margin: 1.5em 15%;
border: 1px solid #666;
padding: 1em;
}
.dedication {
text-indent: 0;
text-align: center;
font-size: 1.5em;
margin-top: 2em;
margin-bottom: 2em;
margin-left: 20%;
margin-right: 20%;
}
.figcaption {
text-indent: 0;
text-align: center;
font-style: italic;
}
.figure {
text-indent: 0;
text-align: center;
margin-top: 1em;
margin-bottom: 1em;
}
.fraktur {
font-family: "Frankenstein", "Times", serif;
}
.hidden {
display: none;
}
.lektorat {
color: red;
}
.motto {
text-indent: 0;
margin-right: 5em;
margin-left: 50%;
margin-top: 1em;
margin-bottom: 1em;
}
.note {
line-height: 90%;
font-size: 90%;
}
.online {
display: none;
}
.recipient {
margin-left: -1em;
margin-top: 1em;
margin-bottom: 1em;
}
.regie, .action {
font-size: 90%;
font-style: italic;
}
.sender {
margin-left: 2em;
font-style: italic;
font-weight: bold;
color: darkblue;
}
.signatur, .signature {
text-align: right;
margin-right: 2em;
}
.smallcaps {
font-variant: small-caps;
}
.speaker {
color: #333;
font-weight: bold;
}
.subtitle {
color: darkgray;
}
.title {
font-size: 2em;
}
.upper {
text-transform: uppercase;
}
.wide, .spaced {
letter-spacing: 0.15em;
}
h1, h2, h3, h4, h6 {
text-align: center;
}
h5 {
text-align: center;
font-size: 90%;
color: #808080;
font-weight: normal;
}
p {
margin-top: 0.5em;
margin-bottom: 0.5em;
margin-left: 2em;
text-indent: -2em;
}
p.abstract, p.stage {
font-size: 90%;
font-style: italic;
margin-left: 3em;
margin-right: 3em;
text-indent: 0;
text-align: justify;
}
p.address {
text-align: right;
text-indent: 0;
font-style: italic;
}
p.center {
text-indent: 0;
margin-left: 0;
text-align: center;
}
p.centerbig {
margin-left: 0;
margin-bottom: 0.6em;
margin-top: 0.6em;
text-indent: 0;
text-align: center;
font-size: 115%;
}
p.centersml {
margin-left: 0;
text-indent: 0;
text-align: center;
font-size: 90%;
margin-bottom: 0.3em;
margin-top: 0.3em;
}
p.date {
text-align: right;
font-style: italic;
}
p.dblmarg {
text-indent: 0;
margin-left: 10%;
margin-right: 10%;
}
p.epigraph {
text-indent: 0;
text-align: right;
margin-right: 5%;
font-style: italic;
}
p.initial {
text-indent: 0;
margin-left: 0;
text-align: justify;
}
p.initial:first-letter {
font-size: 180%;
}
p.left {
margin-left: 0;
text-indent: 0;
text-align: left;
}
p.leftjust {
text-indent: 0;
margin-left: 0;
text-align: justify;
}
p.leftmarg {
text-indent: 0;
text-align: left;
margin-left: 2em;
}
p.line {
text-align: left;
text-indent: 0;
margin-top: 0;
margin-bottom: 0;
}
p.poem, p.vers {
text-indent: 0;
text-align: left;
margin-left: 2em;
}
p.prosa {
margin-left: 0;
margin-top: 0.5em;
margin-bottom: 0.5em;
text-indent: 0.8em;
text-align: justify;
}
p.right {
margin-left: 0;
text-indent: 0;
text-align: right;
}
p.scene {
text-align: center;
margin-left: 0;
text-indent: 0;
font-style: italic;
}
hr {
border: 1px solid;
text-align: center;
color: #999;
margin-top: 0.5em;
margin-bottom: 0.5em;
}
hr.short {
margin-top: 1em;
margin-bottom: 1em;
margin-left: 40%;
width: 20%;
text-align: center;
}
hr.star {
margin-top: 1em;
margin-bottom: 1em;
width: 20%;
}
span.tooltip {
color: #800000;
}
/* span.footnote { display:inline }
span.footnote a:hover { background-color: #2B2E21; color:#fff }
span.footnote a:link span, span.footnote a:visited span { display: none }
span.footnote a:hover span.fntext { position: absolute; margin:20px; background-color: beige;
max-width:400px; padding: 5px 10px 5px 10px; border: 1px solid #C0C0C0;
font: normal 12px/14px arial; color: #000; text-align:left;
display: block; text-decoration:none; left:10px } */
span.footnote:before {
content: " [Fußnote: ";
color: #505050;
}
span.footnote:after {
content: "] ";
color: #505050;
}
span.footnote {
color: #505050;
font-size: 90%;
}
div.epigraph {
margin-left: 50%;
margin-right: 5%;
font-style: italic;
}
div.motto p {
text-align: left;
text-indent: 0;
margin-right: 5em;
}
div.titlepage {
text-align: center;
}
.bottomnavi-gb {
margin-top: 1em;
margin-bottom: 1em;
}
table.left {
margin-left: 0;
margin-right: auto;
}
table.poem {
margin-left: auto;
margin-right: auto;
text-align: left;
}
table.right {
margin-left: auto;
margin-right: 0;
}
table.toc {
margin-left: auto;
margin-right: auto;
margin-top: 0.3em;
text-align: center;
}
table.toc td {
padding: 0 0.25em;
text-align: left;
}
table.true {
margin-left: auto;
margin-right: auto;
margin-top: 0.3em;
margin-bottom: 0.3em;
text-align: left;
}
img.left {
float: left;
margin-top: 0.5em;
margin-bottom: 0.5em;
margin-right: 0.5em;
}
img.right {
float: right;
margin-top: 0.5em;
margin-bottom: 0.5em;
margin-left: 0.5em;
}
div.letter {
text-align: left;
margin-left: 1.5em;
margin-top: 1em;
margin-bottom: 1em;
}
div.letter p {
text-indent: 0;
}
a.pageref {
display: none;
}
a.pageref:before {
content: "[";
}
a.pageref:after {
content: "]";
}
dd {
margin-left: 2em;
}
dl {
margin-left: 1.5em;
margin-top: 1em;
margin-bottom: 1em;
}
dt {
font-weight: bold;
margin-top: 4pt;
}
del {
color: red;
}
ins {
color: blue;
}
p.stars {
text-indent: 0;
text-align: center;
font-size: 200%;
letter-spacing: 0.3em;
margin-top: 0.5em;
margin-bottom: 0;
}
sup {
font-size: 70%;
vertical-align: text-top;
}
sup.fract {
font-size: 70%;
vertical-align: text-top;
}
sub.fract {
font-size: 70%;
vertical-align: text-bottom;
}
tt {
font-family: "Courier", monospace;
}
ul {
margin-top: 1em;
margin-bottom: 1em;
}
div.poem {
text-align: left;
margin: 1em 2em;
}
div.poem blockquote {
text-align: left;
text-indent: 0;
margin-left: 1em;
}
div.poem blockquote p {
text-align: left;
text-indent: 0;
margin-left: 1em;
}
div.vers {
text-indent: 0;
text-align: left;
margin: 1em 2em;
}
div.vers p {
text-indent: 0;
margin-top: 0;
margin-bottom: 0;
}
div.anzeige-chap {
background-color: #eeeeee;
padding-top: 0.5em;
padding-left: 2.5em;
padding-right: 2.5em;
padding-bottom: 1em;
margin-top: 1em;
margin-bottom: 1em;
}

3
requirements.txt Normal file
View File

@@ -0,0 +1,3 @@
requests==2.32.3
beautifulsoup4==4.12.3
urllib3==2.2.2

25
test.py Normal file
View File

@@ -0,0 +1,25 @@
import requests
from bs4 import BeautifulSoup
import os
from convert import GBConvert
def main():
    """Fetch the list of all works and build a GBConvert for every link.

    The overview page is downloaded, every ``<a>`` inside its first ``<dl>``
    is read, and a ``GBConvert`` is constructed for each link that has an
    ``href``. The converters are only constructed, not run.

    Raises:
        Exception: If the overview page does not answer with status 200.
    """
    allbooks_relative_url = '/info/texte/allworka.html'
    root_url = 'https://www.projekt-gutenberg.org'
    allbooks_url = root_url + allbooks_relative_url
    response = requests.get(allbooks_url)
    if response.status_code != 200:
        # Report the URL that was requested; there is no `self` in this function.
        raise Exception(f'Couldnt fetch root page {allbooks_url}')
    soup = BeautifulSoup(response.content, 'html.parser')
    books = soup.find('dl').find_all('a')
    for book in books:
        book_url_relative = book.get('href')
        if book_url_relative is not None:
            # NOTE(review): [5:] presumably strips a leading "../.." from the
            # link's directory part - confirm against the overview page.
            book_url = root_url + os.path.dirname(book_url_relative)[5:]
            GBConvert(book_url)


if __name__ == "__main__":
    main()