6 Commits
v1.3 ... v2.2

Author SHA1 Message Date
eneller
660af7fab0 feat: allow getting directory without download 2025-03-23 23:55:05 +01:00
eneller
c49a1be369 docs: readme 2025-03-20 22:11:12 +01:00
eneller
4267700763 feat: return epub path
errors from wget and pandoc are thrown up
2025-03-16 20:30:42 +01:00
eneller
5d063d8597 feat: restructure for memory efficiency 2025-03-16 19:06:33 +01:00
eneller
6754f47e9f fix: restructure test 2025-03-16 18:57:40 +01:00
eneller
4a8d4f945d begin restructure 2025-03-16 18:34:12 +01:00
5 changed files with 106 additions and 66 deletions

View File

@@ -1,18 +1,39 @@
# epub2go.py # epub2go.py
web to epub converter for https://projekt-gutenberg.org. Web to ePUB Converter for [projekt-gutenberg.org](https://projekt-gutenberg.org) developed in conjunction with a [web interface](https://github.com/eneller/epub2go-web).
## Installation
Requires: Requires:
- [pandoc](https://pandoc.org/) - [pandoc](https://pandoc.org/)
- [wget](https://www.gnu.org/software/wget/) - [wget](https://www.gnu.org/software/wget/)
- [fzf](https://github.com/junegunn/fzf) (only for interactive mode) - [fzf](https://github.com/junegunn/fzf) (optional, only for interactive mode)
- python (duh) - [python](https://www.python.org/) (duh)
## Usage
Invoke the script using the url of any page of the book you would like to download: Assuming you have a recent version of python installed, run
```
epub2go https://www.projekt-gutenberg.org/ibsen/solness/ ```
pip install git+https://github.com/eneller/epub2go.py
```
This will provide the `epub2go` command.
## Usage
```
Usage: epub2go [OPTIONS] [ARGS]...
Download ePUBs from https://www.projekt-gutenberg.org/
Provide either 0 arguments to enter interactive mode or an arbitrary number
of URLs to download from
Options:
-d, --debug Set the log level to DEBUG
-s, --silent Disable the progress bar
-p, --path TEXT The path to which files are saved
--no-clean Do not parse html files with blocklist
--help Show this message and exit.
```
Examples:
```bash
epub2go https://www.projekt-gutenberg.org/ibsen/solness/
epub2go # will enter interactive mode
``` ```
## Installation
Assuming you have a recent version of python installed, run
```
pip install git+https://github.com/eneller/epub2go.py
```
This will provide the 'epub2go' command.

View File

@@ -1,6 +1,6 @@
[project] [project]
name = "epub2go" name = "epub2go"
version = "1.3" version = "2.2"
description = "EPUB converter using wget, pandoc and python glue" description = "EPUB converter using wget, pandoc and python glue"
readme = "README.md" readme = "README.md"
requires-python = ">=3.12" requires-python = ">=3.12"
@@ -22,4 +22,4 @@ include-package-data = true
requires = ["setuptools>=64", "setuptools_scm>=8"] requires = ["setuptools>=64", "setuptools_scm>=8"]
[tool.setuptools_scm] [tool.setuptools_scm]
# can be empty if no extra settings are needed, presence enables setuptools-scm # can be empty if no extra settings are needed, presence enables setuptools-scm

View File

@@ -23,75 +23,100 @@ class Book():
url: str url: str
class GBConvert(): class GBConvert():
def __init__(self, def __init__(self,
url:str, downloaddir,
author:str = None,
title:str = None,
downloaddir = './',
showprogress:bool = False,
): ):
# NOTE move non-code files to data folder # NOTE move non-code files to data folder
self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css") self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
with open(pkg_resources.files('epub2go').joinpath('blocklist.txt')) as blocklist: with open(pkg_resources.files('epub2go').joinpath('blocklist.txt')) as blocklist:
self.blocklist = blocklist.read().splitlines() self.blocklist = blocklist.read().splitlines()
self.tocpage = os.path.dirname(url) # ToC website url
url = urlparse(self.tocpage)
self.dir_download = downloaddir self.dir_download = downloaddir
self.dir_output = os.path.join(self.dir_download, url.netloc + url.path )# directories created by wget recreating the URL
logger.debug('Downloading in %s, expecting files in in %s', self.dir_download, self.dir_output)
self.showprogress = showprogress
self.author = author
self.title = title
self.chapters = []
self.parse_meta() def getDir(self, url):
tocpage = os.path.dirname(url) # ToC website url
parsed_url = urlparse(tocpage)
# directories created by wget recreating the URL
dir_output = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path )
return dir_output
def parse_meta(self): def download(self,
response = requests.get(self.tocpage) url:str,
author:str = None,
title:str = None,
showprogress: bool = False,
cleanpages: bool = True,
):
tocpage = os.path.dirname(url) # ToC website url
dir_output = self.getDir()
logger.debug('Downloading to %s, expecting files in in %s', self.dir_download, dir_output)
author = author
title = title
#parse_meta
response = requests.get(tocpage)
response.raise_for_status() response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser') soup = BeautifulSoup(response.content, 'html.parser')
# TODO allow setting these from interactive mode where those parameters are figured out from the list # TODO allow setting these from interactive mode where those parameters are figured out from the list
if not self.author: if not author:
try: try:
self.author = soup.find('meta', {'name': 'author'})['content'] author = soup.find('meta', {'name': 'author'})['content']
except: except:
self.author = "UnknownAuthor" author = "UnknownAuthor"
if not self.title: if not title:
try: try:
self.title = soup.find('meta', {'name': 'title'})['content'] title = soup.find('meta', {'name': 'title'})['content']
except: except:
self.title = "UnknownTitle" title = "UnknownTitle"
self.toc = soup.find('ul').find_all('a') chapter_urls = soup.find('ul').find_all('a')
logger.debug('Found ToC with %d entries', len(self.toc)) logger.debug('Found ToC with %d entries', len(chapter_urls))
#run
#TODO include images flag
# download all files in toc (chapters)
chapter_files = []
for item in (tqdm(chapter_urls) if showprogress else chapter_urls):
item_url = self.parse_toc_entry(tocpage, item)
parsed_url = urlparse(item_url)
filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
if cleanpages: self.parse_page(filepath)
chapter_files.append(os.path.basename(item_url))
def parse_toc_entry(self, entry): return self.create_epub(author,title,chapter_files,dir_output)
url = os.path.join(self.tocpage, entry['href'])
def parse_toc_entry(self, tocpage, entry):
url = os.path.join(tocpage, entry['href'])
self.save_page(url) self.save_page(url)
return url return url
# apply blocklist to file # apply blocklist to file
def parse_page(self,file_path): def parse_page(self,file_path):
#TODO clean up file opening, mmap? #TODO clean up file opening, mmap?
logger.debug('Parsing page at %s', file_path) count=0
with open(file_path, 'r+') as f: with open(file_path, 'r+') as f:
soup = BeautifulSoup(f.read(), 'html.parser') soup = BeautifulSoup(f.read(), 'html.parser')
for blocker in self.blocklist: for blocker in self.blocklist:
for item in soup.select(blocker): for item in soup.select(blocker):
item.decompose() item.decompose()
count+=1
f.seek(0)
f.truncate()
f.write(str(soup)) f.write(str(soup))
logger.debug('Removed %d tags from page %s during parsing', count, file_path)
def create_epub(self, filename='out.epub')-> int: def create_epub(self, author, title, chapters, dir_output):
#TODO --epub-cover-image #TODO --epub-cover-image
#TODO toc if it isnt described by <h> tags, e.g. https://www.projekt-gutenberg.org/adlersfe/maskenba/ #TODO toc if it isnt described by <h> tags, e.g. https://www.projekt-gutenberg.org/adlersfe/maskenba/
filename = f'{title} - {author}.epub'
logger.debug('Creating epub as "%s"',filename) logger.debug('Creating epub as "%s"',filename)
command = f'''pandoc -f html -t epub \ command = f'''pandoc -f html -t epub \
-o "{filename}" \ -o "{filename}" \
--reference-location=section \ --reference-location=section \
--css="{self.style_path_drama}" \ --css="{self.style_path_drama}" \
--metadata title="{self.title}" \ --metadata title="{title}" \
--metadata author="{self.author}" \ --metadata author="{author}" \
--epub-title-page=false \ --epub-title-page=false \
{" ".join(self.chapters)} ''' {" ".join(chapters)} '''
return subprocess.run(shlex.split(command), cwd=self.dir_output).returncode subprocess.run(shlex.split(command), cwd=dir_output, check=True)
return os.path.abspath(os.path.join(dir_output,filename))
def save_page(self, url): def save_page(self, url):
logger.debug('Saving page at %s', url) logger.debug('Saving page at %s', url)
@@ -103,19 +128,7 @@ class GBConvert():
--tries=5 \ --tries=5 \
--quiet \ --quiet \
{url}''' {url}'''
return subprocess.run(shlex.split(command), cwd=self.dir_download).returncode subprocess.run(shlex.split(command), cwd=self.dir_download, check=True)
def run(self):
#TODO include images flag
# download all files in toc (chapters)
for item in (tqdm(self.toc) if self.showprogress else self.toc):
item_url = self.parse_toc_entry(item)
parsed_url = urlparse(item_url)
filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
self.parse_page(filepath)
self.chapters.append(os.path.basename(item_url))
return self.create_epub(f'{self.title} - {self.author}.epub')
# get a list of all books for interactive selection or scraping # get a list of all books for interactive selection or scraping
def get_all_books() -> List[Book]: def get_all_books() -> List[Book]:
@@ -149,12 +162,15 @@ def get_all_books() -> List[Book]:
# run main cli # run main cli
@click.command() @click.command()
#TODO include images flag
@click.option('--debug', '-d', is_flag=True, help='Set the log level to DEBUG') @click.option('--debug', '-d', is_flag=True, help='Set the log level to DEBUG')
@click.option('--silent', '-s', is_flag=True, help='Disable the progress bar') @click.option('--silent', '-s', is_flag=True, help='Disable the progress bar')
@click.option('--path','-p',type=str,default='./', help='The path to which files are saved' )
@click.option('--no-clean',is_flag=True,help='Do not parse html files with blocklist')
@click.argument('args', nargs=-1) @click.argument('args', nargs=-1)
def main(args, debug, silent): def main(args, debug, silent, path, no_clean):
''' '''
Download ePUBs from https://www.projekt-gutenberg.org/ Download ePUBs from https://www.projekt-gutenberg.org/ \n
Provide either 0 arguments to enter interactive mode or an arbitrary number of URLs to download from Provide either 0 arguments to enter interactive mode or an arbitrary number of URLs to download from
''' '''
logging.basicConfig(level=logging.ERROR,format='%(asctime)s - %(levelname)s - %(message)s') logging.basicConfig(level=logging.ERROR,format='%(asctime)s - %(levelname)s - %(message)s')
@@ -173,10 +189,12 @@ def main(args, debug, silent):
books = [item.split(';')[1].strip() for item in selection] books = [item.split(';')[1].strip() for item in selection]
logger.debug('Attempting to download from %d URL(s)', len(books)) logger.debug('Attempting to download from %d URL(s)', len(books))
converter = GBConvert(path)
if len(books)==1: if len(books)==1:
GBConvert(books[0], showprogress=not silent).run() converter.download(books[0], showprogress=not silent, cleanpages= not no_clean)
else: else:
for book in (tqdm(books) if not silent else books): for book in (tqdm(books) if not silent else books):
GBConvert(book).run() converter.download(book, cleanpages= not no_clean)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@@ -11,9 +11,10 @@ from convert import GBConvert, allbooks_url, get_all_books, Book
def main(): def main():
books = get_all_books() books = get_all_books()
# NOTE consider making this a map() # NOTE consider making this a map()
converter = GBConvert('./')
for book in tqdm(books): for book in tqdm(books):
if book.url is not None: if book.url is not None:
GBConvert(book.url).run() converter.download(book.url)
if __name__ == "__main__": if __name__ == "__main__":

2
uv.lock generated
View File

@@ -81,7 +81,7 @@ wheels = [
[[package]] [[package]]
name = "epub2go" name = "epub2go"
version = "1.2" version = "2.2"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "beautifulsoup4" }, { name = "beautifulsoup4" },