Compare commits
20 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
75974ae119 | ||
|
|
b3cd49326f | ||
|
|
401d02e0ca | ||
|
|
660af7fab0 | ||
|
|
c49a1be369 | ||
|
|
4267700763 | ||
|
|
5d063d8597 | ||
|
|
6754f47e9f | ||
|
|
4a8d4f945d | ||
|
|
4903a58619 | ||
|
|
7dfab60f18 | ||
|
|
9736c6135f | ||
|
|
d7ae0cc5a2 | ||
|
|
c78aac28ab | ||
|
|
00f6cef743 | ||
|
|
9ae25e40ad | ||
|
|
7be0fbc126 | ||
|
|
8f77a97733 | ||
|
|
967f97f381 | ||
|
|
4d8cd00298 |
47
README.md
47
README.md
@@ -1,18 +1,39 @@
|
||||
# epub2go.py
|
||||
web to epub converter for https://projekt-gutenberg.org.
|
||||
Web to ePUB Converter for [projekt-gutenberg.org](https://projekt-gutenberg.org) developed in conjunction with a [web interface](https://github.com/eneller/epub2go-web).
|
||||
|
||||
## Installation
|
||||
Requires:
|
||||
- [pandoc](https://pandoc.org/)
|
||||
- [wget](https://www.gnu.org/software/wget/)
|
||||
- [fzf](https://github.com/junegunn/fzf) (only for interactive mode)
|
||||
- python (duh)
|
||||
## Usage
|
||||
Invoke the script using the url of any page of the book you would like to download:
|
||||
```
|
||||
epub2go https://www.projekt-gutenberg.org/ibsen/solness/
|
||||
- [fzf](https://github.com/junegunn/fzf) (optional, only for interactive mode)
|
||||
- [python](https://www.python.org/) (duh)
|
||||
|
||||
Assuming you have a recent version of python installed, run
|
||||
|
||||
```
|
||||
pip install git+https://github.com/eneller/epub2go.py
|
||||
```
|
||||
This will provide the `epub2go` command.
|
||||
|
||||
## Usage
|
||||
```
|
||||
Usage: epub2go [OPTIONS] [ARGS]...
|
||||
|
||||
Download ePUBs from https://www.projekt-gutenberg.org/
|
||||
|
||||
Provide either 0 arguments to enter interactive mode or an arbitrary number
|
||||
of URLs to download from
|
||||
|
||||
Options:
|
||||
-d, --debug Set the log level to DEBUG
|
||||
-s, --silent Disable the progress bar
|
||||
-p, --path TEXT The path to which files are saved
|
||||
--no-clean Do not parse html files with blocklist
|
||||
--help Show this message and exit.
|
||||
```
|
||||
|
||||
Examples:
|
||||
```bash
|
||||
epub2go https://www.projekt-gutenberg.org/ibsen/solness/
|
||||
epub2go # will enter interactive mode
|
||||
```
|
||||
## Installation
|
||||
Assuming you have a recent version of python installed, run
|
||||
```
|
||||
pip install git+https://github.com/eneller/epub2go.py
|
||||
```
|
||||
This will provide the 'epub2go' command.
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
[project]
|
||||
name = "epub2go"
|
||||
version = "1.2"
|
||||
version = "2.2.3"
|
||||
description = "EPUB converter using wget, pandoc and python glue"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"beautifulsoup4==4.12.3",
|
||||
"click>=8.1.8",
|
||||
"pyfzf>=0.3.1", # hasnt been updated for some time
|
||||
"requests==2.32.3",
|
||||
"tqdm>=4.67.1",
|
||||
@@ -21,4 +22,4 @@ include-package-data = true
|
||||
requires = ["setuptools>=64", "setuptools_scm>=8"]
|
||||
|
||||
[tool.setuptools_scm]
|
||||
# can be empty if no extra settings are needed, presence enables setuptools-scm
|
||||
# can be empty if no extra settings are needed, presence enables setuptools-scm
|
||||
|
||||
@@ -5,38 +5,121 @@ from urllib.parse import urljoin
|
||||
from urllib.request import urlparse
|
||||
from tqdm import tqdm
|
||||
from pyfzf.pyfzf import FzfPrompt
|
||||
import click
|
||||
|
||||
import os, sys
|
||||
import os, subprocess, shlex, logging, re
|
||||
import importlib.resources as pkg_resources
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
allbooks_url ='https://www.projekt-gutenberg.org/info/texte/allworka.html'
|
||||
root_url = '{url.scheme}://{url.netloc}'.format(url = urlparse(allbooks_url))
|
||||
|
||||
@dataclass
|
||||
class Book():
|
||||
author: str
|
||||
title: str
|
||||
url: str
|
||||
class GBConvert():
|
||||
#TODO fix toc / headings
|
||||
|
||||
def __init__(self,
|
||||
url:str,
|
||||
standalone = False,
|
||||
downloaddir,
|
||||
):
|
||||
# NOTE move non-code files to data folder
|
||||
self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
|
||||
self.blocklist = open(pkg_resources.files('epub2go').joinpath('blocklist.txt')).read().splitlines()
|
||||
self.root = os.path.dirname(url)
|
||||
self.url = urlparse(self.root)
|
||||
self.output = self.url.netloc + self.url.path
|
||||
self.standalone = standalone
|
||||
with open(pkg_resources.files('epub2go').joinpath('blocklist.txt')) as blocklist:
|
||||
self.blocklist = blocklist.read().splitlines()
|
||||
self.dir_download = downloaddir
|
||||
|
||||
def getDir(self, url):
|
||||
tocpage = os.path.dirname(url) # ToC website url
|
||||
parsed_url = urlparse(tocpage)
|
||||
# directories created by wget recreating the URL
|
||||
dir_output = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path )
|
||||
return dir_output
|
||||
|
||||
def get_meta(self):
|
||||
response = requests.get(self.root)
|
||||
def download(self,
|
||||
url:str,
|
||||
author:str = None,
|
||||
title:str = None,
|
||||
showprogress: bool = False,
|
||||
cleanpages: bool = True,
|
||||
):
|
||||
tocpage = os.path.dirname(url) # ToC website url
|
||||
dir_output = self.getDir(url)
|
||||
logger.debug('Downloading to %s, expecting files in in %s', self.dir_download, dir_output)
|
||||
author = author
|
||||
title = title
|
||||
|
||||
#parse_meta
|
||||
response = requests.get(tocpage)
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
self.author = soup.find('meta', {'name': 'author'})['content']
|
||||
self.title = soup.find('meta', {'name': 'title'})['content']
|
||||
self.toc = soup.find('ul').find_all('a')
|
||||
|
||||
# TODO allow setting these from interactive mode where those parameters are figured out from the list
|
||||
if not author:
|
||||
try:
|
||||
author = soup.find('meta', {'name': 'author'})['content']
|
||||
except:
|
||||
author = "UnknownAuthor"
|
||||
if not title:
|
||||
try:
|
||||
title = soup.find('meta', {'name': 'title'})['content']
|
||||
except:
|
||||
title = "UnknownTitle"
|
||||
chapter_urls = soup.find('ul').find_all('a')
|
||||
logger.debug('Found ToC with %d entries', len(chapter_urls))
|
||||
|
||||
#run
|
||||
#TODO include images flag
|
||||
# download all files in toc (chapters)
|
||||
chapter_files = []
|
||||
for item in (tqdm(chapter_urls) if showprogress else chapter_urls):
|
||||
item_url = self.parse_toc_entry(tocpage, item)
|
||||
parsed_url = urlparse(item_url)
|
||||
filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
|
||||
if cleanpages: self.parse_page(filepath)
|
||||
chapter_files.append(os.path.basename(item_url))
|
||||
|
||||
return self.create_epub(author,title,chapter_files,dir_output)
|
||||
|
||||
def parse_toc_entry(self, tocpage, entry):
|
||||
url = os.path.join(tocpage, entry['href'])
|
||||
self.save_page(url)
|
||||
return url
|
||||
|
||||
# apply blocklist to file
|
||||
def parse_page(self,file_path):
|
||||
#TODO clean up file opening, mmap?
|
||||
count=0
|
||||
with open(file_path, 'r+') as f:
|
||||
soup = BeautifulSoup(f.read(), 'html.parser')
|
||||
for blocker in self.blocklist:
|
||||
for item in soup.select(blocker):
|
||||
item.decompose()
|
||||
count+=1
|
||||
f.seek(0)
|
||||
f.truncate()
|
||||
f.write(str(soup))
|
||||
logger.debug('Removed %d tags from page %s during parsing', count, file_path)
|
||||
|
||||
def create_epub(self, author, title, chapters, dir_output):
|
||||
#TODO --epub-cover-image
|
||||
#TODO toc if it isnt described by <h> tags, e.g. https://www.projekt-gutenberg.org/adlersfe/maskenba/
|
||||
filename = slugify(f'{title} - {author}.epub')
|
||||
command = f'''pandoc -f html -t epub \
|
||||
-o "{filename}" \
|
||||
--reference-location=section \
|
||||
--css="{self.style_path_drama}" \
|
||||
--metadata title="{title}" \
|
||||
--metadata author="{author}" \
|
||||
--epub-title-page=false \
|
||||
{" ".join(chapters)} '''
|
||||
logger.debug('Calling "%s"', command)
|
||||
subprocess.run(shlex.split(command), cwd=dir_output, check=True)
|
||||
return os.path.abspath(os.path.join(dir_output,filename))
|
||||
|
||||
def save_page(self, url):
|
||||
logger.debug('Saving page at %s', url)
|
||||
# https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
|
||||
command = f'''wget \
|
||||
--timestamping \
|
||||
@@ -45,48 +128,10 @@ class GBConvert():
|
||||
--tries=5 \
|
||||
--quiet \
|
||||
{url}'''
|
||||
os.system(command)
|
||||
subprocess.run(shlex.split(command), cwd=self.dir_download, check=True)
|
||||
|
||||
def clean_page(self,file_path):
|
||||
f = open(file_path, 'r').read()
|
||||
soup = BeautifulSoup(f, 'html.parser')
|
||||
for blocker in self.blocklist:
|
||||
for item in soup.select(blocker):
|
||||
item.decompose()
|
||||
open(file_path, 'w').write(str(soup))
|
||||
|
||||
|
||||
def create_epub(self, filename='out.epub'):
|
||||
os.chdir(self.output)
|
||||
command = f'''pandoc -f html -t epub \
|
||||
-o "{filename}" \
|
||||
--reference-location=section \
|
||||
--css="{self.style_path_drama}" \
|
||||
--metadata title="{self.title}" \
|
||||
--metadata author="{self.author}" \
|
||||
--epub-title-page=false \
|
||||
{" ".join(self.chapters)} '''#TODO --epub-cover-image
|
||||
os.system(command)
|
||||
|
||||
def run(self):
|
||||
#TODO include images flag
|
||||
|
||||
self.get_meta()
|
||||
|
||||
map(lambda x: self.save_page(os.path.join(self.root, x['href'])), self.toc)
|
||||
self.chapters = []
|
||||
for item in (tqdm(self.toc) if self.standalone else self.toc):
|
||||
item_title= item.get_text()
|
||||
item_url = os.path.join(self.root, item['href'])
|
||||
self.save_page(url=item_url)
|
||||
parsed_url = urlparse(item_url)
|
||||
filepath = parsed_url.netloc + parsed_url.path
|
||||
self.clean_page(filepath)
|
||||
self.chapters.append(item['href'])
|
||||
|
||||
self.create_epub(f'{self.title} - {self.author}.epub')
|
||||
|
||||
def get_all_books() -> list:
|
||||
# get a list of all books for interactive selection or scraping
|
||||
def get_all_books() -> List[Book]:
|
||||
response = requests.get(allbooks_url)
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
|
||||
@@ -111,35 +156,52 @@ def get_all_books() -> list:
|
||||
book_href = book_tag.get('href')
|
||||
book_url = urljoin(allbooks_url, book_href)
|
||||
book_title = ' '.join(book_tag.getText().split())
|
||||
book = {'author': book_author, 'title': book_title, 'url': book_url}
|
||||
book = Book(book_author, book_title, book_url)
|
||||
books.append(book)
|
||||
return books
|
||||
|
||||
def get_all_book_tags ()-> ResultSet:
|
||||
response = requests.get(allbooks_url)
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
|
||||
books = soup.find('dl').find_all('a')
|
||||
return books
|
||||
|
||||
def main():
|
||||
sys.argv.pop(0)
|
||||
def slugify(value, replacement='_'):
|
||||
value = re.sub(r'[<>:"/\\|?*\x00-\x1F]', replacement, value)
|
||||
# Remove leading/trailing whitespace or dots
|
||||
value = value.strip().strip(".")
|
||||
# Optionally truncate to safe length (e.g. 255 chars for most filesystems)
|
||||
return value[:255] or "untitled"
|
||||
|
||||
# run main cli
|
||||
@click.command()
|
||||
#TODO include images flag
|
||||
@click.option('--debug', '-d', is_flag=True, help='Set the log level to DEBUG')
|
||||
@click.option('--silent', '-s', is_flag=True, help='Disable the progress bar')
|
||||
@click.option('--path','-p',type=str,default='./', help='The path to which files are saved' )
|
||||
@click.option('--no-clean',is_flag=True,help='Do not parse html files with blocklist')
|
||||
@click.argument('args', nargs=-1)
|
||||
def main(args, debug, silent, path, no_clean):
|
||||
'''
|
||||
Download ePUBs from https://www.projekt-gutenberg.org/ \n
|
||||
Provide either 0 arguments to enter interactive mode or an arbitrary number of URLs to download from
|
||||
'''
|
||||
logging.basicConfig(level=logging.ERROR,format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
if(debug): logger.setLevel(logging.DEBUG)
|
||||
# non-interactive mode
|
||||
if len(sys.argv) > 0 :
|
||||
books = sys.argv
|
||||
if len(args) > 0 :
|
||||
books = args
|
||||
# interactive mode using fzf
|
||||
else:
|
||||
logger.debug('Received no CLI arguments, starting interactive mode')
|
||||
delimiter = ';'
|
||||
# create lines for fzf
|
||||
books = [f"{item['author']} - {item['title']} {delimiter} {item['url']}" for item in get_all_books()]
|
||||
books = [f"{ item.author } - { item.title } {delimiter} { item.url }" for item in get_all_books()]
|
||||
fzf = FzfPrompt()
|
||||
selection = fzf.prompt(choices=books, fzf_options=r'--exact --with-nth 1 -m -d\;')
|
||||
books = [item.split(';')[1].strip() for item in selection]
|
||||
|
||||
logger.debug('Attempting to download from %d URL(s)', len(books))
|
||||
converter = GBConvert(path)
|
||||
if len(books)==1:
|
||||
GBConvert(books[0], standalone=True).run()
|
||||
converter.download(books[0], showprogress=not silent, cleanpages= not no_clean)
|
||||
else:
|
||||
for book in tqdm(books):
|
||||
GBConvert(book).run()
|
||||
for book in (tqdm(books) if not silent else books):
|
||||
converter.download(book, cleanpages= not no_clean)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,20 +1,20 @@
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4 import ResultSet
|
||||
|
||||
import os
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from convert import GBConvert, get_all_book_tags, allbooks_url
|
||||
from convert import GBConvert, allbooks_url, get_all_books, Book
|
||||
|
||||
def main():
|
||||
books = get_all_book_tags()
|
||||
books = get_all_books()
|
||||
# NOTE consider making this a map()
|
||||
converter = GBConvert('./')
|
||||
for book in tqdm(books):
|
||||
book_title = book.get_text()
|
||||
book_url_relative = book.get('href')
|
||||
if book_url_relative is not None:
|
||||
book_url = urljoin(allbooks_url, book_url_relative)
|
||||
GBConvert(book_url).run()
|
||||
if book.url is not None:
|
||||
converter.download(book.url)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,38 +0,0 @@
|
||||
# run using `django-admin runserver --pythonpath=. --settings=web`
|
||||
from django.urls import path
|
||||
from django.http import HttpResponse, HttpRequest
|
||||
from django.shortcuts import redirect, render
|
||||
import requests
|
||||
|
||||
from convert import GBConvert, allbooks_url
|
||||
import json
|
||||
DEBUG = True
|
||||
ROOT_URLCONF = __name__
|
||||
SECRET_KEY='1'
|
||||
TEMPLATES = [
|
||||
{
|
||||
'BACKEND': 'django.template.backends.django.DjangoTemplates',
|
||||
'DIRS': [
|
||||
'templates/'
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
def root(request: HttpRequest):
|
||||
title = 'epub2go'
|
||||
targetParam = request.GET.get('t', None)
|
||||
if targetParam is not None:
|
||||
getEpub(targetParam)
|
||||
return render(request, 'index.html', locals())
|
||||
|
||||
urlpatterns = [
|
||||
path('', root, name='root'),
|
||||
]
|
||||
|
||||
def getEpub(param):
|
||||
# TODO validate / sanitize input
|
||||
# TODO check for existing file and age
|
||||
# TODO download
|
||||
# TODO redirect to loading page
|
||||
# TODO redirect to download page
|
||||
raise NotImplementedError
|
||||
0
test/__init__.py
Normal file
0
test/__init__.py
Normal file
23
test/test_epub.py
Normal file
23
test/test_epub.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from tqdm import tqdm
|
||||
|
||||
from src.epub2go.convert import GBConvert, get_all_books
|
||||
|
||||
import unittest
|
||||
|
||||
# run using `python -m unittest test/test_epub.py`
|
||||
class TestEpub(unittest.TestCase):
|
||||
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
cls.schiller_raeuber = GBConvert('https://www.projekt-gutenberg.org/schiller/raeuber/')
|
||||
cls.anzengru_allersee = GBConvert('https://www.projekt-gutenberg.org/anzengru/allersee/')
|
||||
|
||||
def test_schiller_raeuber_toc(self):
|
||||
self.assertEqual(len(self.schiller_raeuber.toc), 7)
|
||||
|
||||
def test_anzengru_allersee_toc(self):
|
||||
self.assertEqual(len(self.anzengru_allersee.toc), 1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
17
uv.lock
generated
17
uv.lock
generated
@@ -1,4 +1,5 @@
|
||||
version = 1
|
||||
revision = 1
|
||||
requires-python = ">=3.12"
|
||||
|
||||
[[package]]
|
||||
@@ -57,6 +58,18 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/0e/f6/65ecc6878a89bb1c23a086ea335ad4bf21a588990c3f535a227b9eea9108/charset_normalizer-3.4.1-py3-none-any.whl", hash = "sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85", size = 49767 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "click"
|
||||
version = "8.1.8"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "colorama", marker = "sys_platform == 'win32'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "colorama"
|
||||
version = "0.4.6"
|
||||
@@ -68,10 +81,11 @@ wheels = [
|
||||
|
||||
[[package]]
|
||||
name = "epub2go"
|
||||
version = "1.0"
|
||||
version = "2.2"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "beautifulsoup4" },
|
||||
{ name = "click" },
|
||||
{ name = "pyfzf" },
|
||||
{ name = "requests" },
|
||||
{ name = "tqdm" },
|
||||
@@ -81,6 +95,7 @@ dependencies = [
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "beautifulsoup4", specifier = "==4.12.3" },
|
||||
{ name = "click", specifier = ">=8.1.8" },
|
||||
{ name = "pyfzf", specifier = ">=0.3.1" },
|
||||
{ name = "requests", specifier = "==2.32.3" },
|
||||
{ name = "tqdm", specifier = ">=4.67.1" },
|
||||
|
||||
Reference in New Issue
Block a user