9 Commits
v1.3 ... v2.2.3

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| eneller | 75974ae119 | fix: slugify filenames | 2025-04-06 10:29:19 +02:00 |
| eneller | b3cd49326f | feat: prettier logging | 2025-04-05 01:42:20 +02:00 |
| eneller | 401d02e0ca | fix: parameter getdir | 2025-04-02 11:26:24 +02:00 |
| eneller | 660af7fab0 | feat: allow getting directory without download | 2025-03-23 23:55:05 +01:00 |
| eneller | c49a1be369 | docs: readme | 2025-03-20 22:11:12 +01:00 |
| eneller | 4267700763 | feat: return epub path (errors from wget and pandoc are thrown up) | 2025-03-16 20:30:42 +01:00 |
| eneller | 5d063d8597 | feat: restructure for memory efficiency | 2025-03-16 19:06:33 +01:00 |
| eneller | 6754f47e9f | fix: restructure test | 2025-03-16 18:57:40 +01:00 |
| eneller | 4a8d4f945d | begin restructure | 2025-03-16 18:34:12 +01:00 |
5 changed files with 115 additions and 68 deletions

**README.md**

````diff
@@ -1,18 +1,39 @@
 # epub2go.py
-web to epub converter for https://projekt-gutenberg.org.
+Web to ePUB Converter for [projekt-gutenberg.org](https://projekt-gutenberg.org) developed in conjunction with a [web interface](https://github.com/eneller/epub2go-web).
+## Installation
 Requires:
 - [pandoc](https://pandoc.org/)
 - [wget](https://www.gnu.org/software/wget/)
-- [fzf](https://github.com/junegunn/fzf) (only for interactive mode)
-- python (duh)
-## Usage
-Invoke the script using the url of any page of the book you would like to download:
-```
-epub2go https://www.projekt-gutenberg.org/ibsen/solness/
-```
-## Installation
+- [fzf](https://github.com/junegunn/fzf) (optional, only for interactive mode)
+- [python](https://www.python.org/) (duh)
 Assuming you have a recent version of python installed, run
 ```
 pip install git+https://github.com/eneller/epub2go.py
 ```
-This will provide the 'epub2go' command.
+This will provide the `epub2go` command.
+## Usage
+```
+Usage: epub2go [OPTIONS] [ARGS]...
+
+  Download ePUBs from https://www.projekt-gutenberg.org/
+
+  Provide either 0 arguments to enter interactive mode or an arbitrary number
+  of URLs to download from
+
+Options:
+  -d, --debug      Set the log level to DEBUG
+  -s, --silent     Disable the progress bar
+  -p, --path TEXT  The path to which files are saved
+  --no-clean       Do not parse html files with blocklist
+  --help           Show this message and exit.
+```
+Examples:
+```bash
+epub2go https://www.projekt-gutenberg.org/ibsen/solness/
+epub2go # will enter interactive mode
+```
````
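Taken together, the documented options compose in the obvious way; a hypothetical invocation using both of the new flags:

```bash
# save downloads under ./books and skip the blocklist cleanup pass
epub2go --path ./books --no-clean https://www.projekt-gutenberg.org/ibsen/solness/
```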

**pyproject.toml**

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "epub2go"
-version = "1.3"
+version = "2.2.3"
 description = "EPUB converter using wget, pandoc and python glue"
 readme = "README.md"
 requires-python = ">=3.12"
```

**convert.py** (module name inferred from the `from convert import …` in the scraping script below)

```diff
@@ -7,7 +7,7 @@ from tqdm import tqdm
 from pyfzf.pyfzf import FzfPrompt
 import click
-import os, subprocess, shlex, logging
+import os, subprocess, shlex, logging, re
 import importlib.resources as pkg_resources
 from dataclasses import dataclass
 from typing import List
@@ -23,75 +23,100 @@ class Book():
     url: str

 class GBConvert():
     def __init__(self,
-                 url:str,
-                 author:str = None,
-                 title:str = None,
-                 downloaddir = './',
-                 showprogress:bool = False,
+                 downloaddir,
                  ):
         # NOTE move non-code files to data folder
         self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
         with open(pkg_resources.files('epub2go').joinpath('blocklist.txt')) as blocklist:
             self.blocklist = blocklist.read().splitlines()
-        self.tocpage = os.path.dirname(url) # ToC website url
-        url = urlparse(self.tocpage)
         self.dir_download = downloaddir
-        self.dir_output = os.path.join(self.dir_download, url.netloc + url.path) # directories created by wget recreating the URL
-        logger.debug('Downloading in %s, expecting files in %s', self.dir_download, self.dir_output)
-        self.showprogress = showprogress
-        self.author = author
-        self.title = title
-        self.chapters = []
-        self.parse_meta()

-    def parse_meta(self):
-        response = requests.get(self.tocpage)
+    def getDir(self, url):
+        tocpage = os.path.dirname(url) # ToC website url
+        parsed_url = urlparse(tocpage)
+        # directories created by wget recreating the URL
+        dir_output = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
+        return dir_output
+
+    def download(self,
+                 url:str,
+                 author:str = None,
+                 title:str = None,
+                 showprogress: bool = False,
+                 cleanpages: bool = True,
+                 ):
+        tocpage = os.path.dirname(url) # ToC website url
+        dir_output = self.getDir(url)
+        logger.debug('Downloading to %s, expecting files in %s', self.dir_download, dir_output)
+        # parse_meta
+        response = requests.get(tocpage)
         response.raise_for_status()
         soup = BeautifulSoup(response.content, 'html.parser')
         # TODO allow setting these from interactive mode where those parameters are figured out from the list
-        if not self.author:
+        if not author:
             try:
-                self.author = soup.find('meta', {'name': 'author'})['content']
+                author = soup.find('meta', {'name': 'author'})['content']
             except:
-                self.author = "UnknownAuthor"
-        if not self.title:
+                author = "UnknownAuthor"
+        if not title:
             try:
-                self.title = soup.find('meta', {'name': 'title'})['content']
+                title = soup.find('meta', {'name': 'title'})['content']
             except:
-                self.title = "UnknownTitle"
-        self.toc = soup.find('ul').find_all('a')
-        logger.debug('Found ToC with %d entries', len(self.toc))
+                title = "UnknownTitle"
+        chapter_urls = soup.find('ul').find_all('a')
+        logger.debug('Found ToC with %d entries', len(chapter_urls))
+        # run
+        #TODO include images flag
+        # download all files in toc (chapters)
+        chapter_files = []
+        for item in (tqdm(chapter_urls) if showprogress else chapter_urls):
+            item_url = self.parse_toc_entry(tocpage, item)
+            parsed_url = urlparse(item_url)
+            filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
+            if cleanpages: self.parse_page(filepath)
+            chapter_files.append(os.path.basename(item_url))
+        return self.create_epub(author, title, chapter_files, dir_output)

-    def parse_toc_entry(self, entry):
-        url = os.path.join(self.tocpage, entry['href'])
+    def parse_toc_entry(self, tocpage, entry):
+        url = os.path.join(tocpage, entry['href'])
         self.save_page(url)
         return url

     # apply blocklist to file
     def parse_page(self, file_path):
         #TODO clean up file opening, mmap?
-        logger.debug('Parsing page at %s', file_path)
+        count = 0
         with open(file_path, 'r+') as f:
             soup = BeautifulSoup(f.read(), 'html.parser')
             for blocker in self.blocklist:
                 for item in soup.select(blocker):
                     item.decompose()
+                    count += 1
+            f.seek(0)
+            f.truncate()
             f.write(str(soup))
+        logger.debug('Removed %d tags from page %s during parsing', count, file_path)

-    def create_epub(self, filename='out.epub') -> int:
+    def create_epub(self, author, title, chapters, dir_output):
         #TODO --epub-cover-image
         #TODO toc if it isnt described by <h> tags, e.g. https://www.projekt-gutenberg.org/adlersfe/maskenba/
-        logger.debug('Creating epub as "%s"', filename)
+        filename = slugify(f'{title} - {author}.epub')
         command = f'''pandoc -f html -t epub \
                 -o "{filename}" \
                 --reference-location=section \
                 --css="{self.style_path_drama}" \
-                --metadata title="{self.title}" \
-                --metadata author="{self.author}" \
+                --metadata title="{title}" \
+                --metadata author="{author}" \
                 --epub-title-page=false \
-                {" ".join(self.chapters)} '''
-        return subprocess.run(shlex.split(command), cwd=self.dir_output).returncode
+                {" ".join(chapters)} '''
+        logger.debug('Calling "%s"', command)
+        subprocess.run(shlex.split(command), cwd=dir_output, check=True)
+        return os.path.abspath(os.path.join(dir_output, filename))

     def save_page(self, url):
         logger.debug('Saving page at %s', url)
```
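Worth noting on `parse_page`: after `f.read()` the file offset sits at end-of-file, so the old version's bare `f.write()` appended the cleaned HTML after the original document instead of replacing it. The added `seek(0)`/`truncate()` pair makes it a genuine in-place rewrite; a minimal standalone sketch of that pattern (with a stand-in for the blocklist pass):

```python
# Minimal sketch of the in-place rewrite pattern parse_page now follows.
with open("page.html", "r+") as f:
    html = f.read()                       # offset is now at EOF
    cleaned = html.replace("\r\n", "\n")  # stand-in for the blocklist cleanup
    f.seek(0)      # rewind; otherwise write() would append after the old content
    f.truncate()   # drop the old bytes so a shorter result leaves no stale tail
    f.write(cleaned)
```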
```diff
@@ -103,19 +128,7 @@ class GBConvert():
                 --tries=5 \
                 --quiet \
                 {url}'''
-        return subprocess.run(shlex.split(command), cwd=self.dir_download).returncode
-
-    def run(self):
-        #TODO include images flag
-        # download all files in toc (chapters)
-        for item in (tqdm(self.toc) if self.showprogress else self.toc):
-            item_url = self.parse_toc_entry(item)
-            parsed_url = urlparse(item_url)
-            filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
-            self.parse_page(filepath)
-            self.chapters.append(os.path.basename(item_url))
-        return self.create_epub(f'{self.title} - {self.author}.epub')
+        subprocess.run(shlex.split(command), cwd=self.dir_download, check=True)

 # get a list of all books for interactive selection or scraping
 def get_all_books() -> List[Book]:
```
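The removal of `run()` completes the restructure: per-book state (`self.toc`, `self.chapters`, `self.author`, `self.title`) no longer accumulates on the instance, so a single `GBConvert` bound only to a download directory can be reused across books. A sketch of the resulting call pattern, as implied by the diff:

```python
# before (v1.3): one throwaway instance per book, state kept on self
# GBConvert(url, showprogress=True).run()

# after (v2.x): one reusable converter; download() returns the epub's absolute path
converter = GBConvert('./downloads')
epub_path = converter.download(
    'https://www.projekt-gutenberg.org/ibsen/solness/',
    showprogress=True,
)
print(epub_path)
```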
```diff
@@ -147,14 +160,24 @@ def get_all_books() -> List[Book]:
             books.append(book)
     return books

+def slugify(value, replacement='_'):
+    value = re.sub(r'[<>:"/\\|?*\x00-\x1F]', replacement, value)
+    # Remove leading/trailing whitespace or dots
+    value = value.strip().strip(".")
+    # Optionally truncate to safe length (e.g. 255 chars for most filesystems)
+    return value[:255] or "untitled"
+
 # run main cli
 @click.command()
+#TODO include images flag
 @click.option('--debug', '-d', is_flag=True, help='Set the log level to DEBUG')
 @click.option('--silent', '-s', is_flag=True, help='Disable the progress bar')
+@click.option('--path', '-p', type=str, default='./', help='The path to which files are saved')
+@click.option('--no-clean', is_flag=True, help='Do not parse html files with blocklist')
 @click.argument('args', nargs=-1)
-def main(args, debug, silent):
+def main(args, debug, silent, path, no_clean):
     '''
-    Download ePUBs from https://www.projekt-gutenberg.org/
+    Download ePUBs from https://www.projekt-gutenberg.org/ \n
     Provide either 0 arguments to enter interactive mode or an arbitrary number of URLs to download from
     '''
     logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')
```
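The character class in the new `slugify` helper covers the characters Windows forbids in filenames (`<>:"/\|?*` plus ASCII control characters), a superset of the POSIX restrictions (`/` and NUL), so generated names are safe on either family of filesystems. Expected behavior of the helper as written:

```python
>>> slugify('Baumeister Solness - Henrik Ibsen.epub')
'Baumeister Solness - Henrik Ibsen.epub'
>>> slugify('Wer war es? - Anonym.epub')  # '?' is reserved on Windows
'Wer war es_ - Anonym.epub'
>>> slugify(' . ')                         # degenerate input falls back
'untitled'
```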
```diff
@@ -173,10 +196,12 @@ def main(args, debug, silent):
         books = [item.split(';')[1].strip() for item in selection]

     logger.debug('Attempting to download from %d URL(s)', len(books))
+    converter = GBConvert(path)
     if len(books) == 1:
-        GBConvert(books[0], showprogress=not silent).run()
+        converter.download(books[0], showprogress=not silent, cleanpages=not no_clean)
     else:
         for book in (tqdm(books) if not silent else books):
-            GBConvert(book).run()
+            converter.download(book, cleanpages=not no_clean)

 if __name__ == "__main__":
     main()
```

**Bulk scraping script** (filename not shown)

```diff
@@ -11,9 +11,10 @@ from convert import GBConvert, allbooks_url, get_all_books, Book
 def main():
     books = get_all_books()
     # NOTE consider making this a map()
+    converter = GBConvert('./')
     for book in tqdm(books):
         if book.url is not None:
-            GBConvert(book.url).run()
+            converter.download(book.url)

 if __name__ == "__main__":
```

**uv.lock** (generated)

```diff
@@ -81,7 +81,7 @@ wheels = [
 [[package]]
 name = "epub2go"
-version = "1.2"
+version = "2.2"
 source = { editable = "." }
 dependencies = [
     { name = "beautifulsoup4" },
```