Compare commits
6 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 660af7fab0 | |
| | c49a1be369 | |
| | 4267700763 | |
| | 5d063d8597 | |
| | 6754f47e9f | |
| | 4a8d4f945d | |
README.md

@@ -1,18 +1,39 @@
 # epub2go.py
-web to epub converter for https://projekt-gutenberg.org.
+Web to ePUB Converter for [projekt-gutenberg.org](https://projekt-gutenberg.org) developed in conjunction with a [web interface](https://github.com/eneller/epub2go-web).
 
+## Installation
 Requires:
 - [pandoc](https://pandoc.org/)
 - [wget](https://www.gnu.org/software/wget/)
-- [fzf](https://github.com/junegunn/fzf) (only for interactive mode)
-- python (duh)
+- [fzf](https://github.com/junegunn/fzf) (optional, only for interactive mode)
+- [python](https://www.python.org/) (duh)
 
+Assuming you have a recent version of python installed, run
+
+```
+pip install git+https://github.com/eneller/epub2go.py
+```
+This will provide the `epub2go` command.
+
 ## Usage
-Invoke the script using the url of any page of the book you would like to download:
 ```
+Usage: epub2go [OPTIONS] [ARGS]...
+
+  Download ePUBs from https://www.projekt-gutenberg.org/
+
+  Provide either 0 arguments to enter interactive mode or an arbitrary number
+  of URLs to download from
+
+Options:
+  -d, --debug      Set the log level to DEBUG
+  -s, --silent     Disable the progress bar
+  -p, --path TEXT  The path to which files are saved
+  --no-clean       Do not parse html files with blocklist
+  --help           Show this message and exit.
+```
+
+Examples:
+```bash
 epub2go https://www.projekt-gutenberg.org/ibsen/solness/
+epub2go # will enter interactive mode
 ```
-## Installation
-Assuming you have a recent version of python installed, run
-```
-pip install git+https://github.com/eneller/epub2go.py
-```
-This will provide the 'epub2go' command.
pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "epub2go"
-version = "1.3"
+version = "2.2"
 description = "EPUB converter using wget, pandoc and python glue"
 readme = "README.md"
 requires-python = ">=3.12"
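The only change to the project metadata is the version bump from 1.3 to 2.2, which accompanies the `GBConvert` API change in the commits below. If you need to confirm which version is installed, a minimal runtime check could look like this; it only assumes the distribution name `epub2go` from the `[project]` table above.

```python
# Minimal sketch: read the installed distribution version at runtime.
# Assumes the package is installed under the distribution name "epub2go".
from importlib.metadata import PackageNotFoundError, version

try:
    print(version("epub2go"))  # prints "2.2" once this change is installed
except PackageNotFoundError:
    print("epub2go is not installed in this environment")
```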
convert.py

@@ -23,75 +23,100 @@ class Book():
     url: str
 class GBConvert():
     def __init__(self,
-                 url:str,
-                 author:str = None,
-                 title:str = None,
-                 downloaddir = './',
-                 showprogress:bool = False,
+                 downloaddir,
                  ):
         # NOTE move non-code files to data folder
         self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
         with open(pkg_resources.files('epub2go').joinpath('blocklist.txt')) as blocklist:
             self.blocklist = blocklist.read().splitlines()
-        self.tocpage = os.path.dirname(url) # ToC website url
-        url = urlparse(self.tocpage)
         self.dir_download = downloaddir
-        self.dir_output = os.path.join(self.dir_download, url.netloc + url.path )# directories created by wget recreating the URL
-        logger.debug('Downloading in %s, expecting files in in %s', self.dir_download, self.dir_output)
-        self.showprogress = showprogress
-        self.author = author
-        self.title = title
-        self.chapters = []

-        self.parse_meta()
+    def getDir(self, url):
+        tocpage = os.path.dirname(url) # ToC website url
+        parsed_url = urlparse(tocpage)
+        # directories created by wget recreating the URL
+        dir_output = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path )
+        return dir_output

-    def parse_meta(self):
-        response = requests.get(self.tocpage)
+    def download(self,
+                 url:str,
+                 author:str = None,
+                 title:str = None,
+                 showprogress: bool = False,
+                 cleanpages: bool = True,
+                 ):
+        tocpage = os.path.dirname(url) # ToC website url
+        dir_output = self.getDir()
+        logger.debug('Downloading to %s, expecting files in in %s', self.dir_download, dir_output)
+        author = author
+        title = title
+
+        #parse_meta
+        response = requests.get(tocpage)
         response.raise_for_status()
         soup = BeautifulSoup(response.content, 'html.parser')
         # TODO allow setting these from interactive mode where those parameters are figured out from the list
-        if not self.author:
+        if not author:
             try:
-                self.author = soup.find('meta', {'name': 'author'})['content']
+                author = soup.find('meta', {'name': 'author'})['content']
             except:
-                self.author = "UnknownAuthor"
-        if not self.title:
+                author = "UnknownAuthor"
+        if not title:
             try:
-                self.title = soup.find('meta', {'name': 'title'})['content']
+                title = soup.find('meta', {'name': 'title'})['content']
             except:
-                self.title = "UnknownTitle"
-        self.toc = soup.find('ul').find_all('a')
-        logger.debug('Found ToC with %d entries', len(self.toc))
+                title = "UnknownTitle"
+        chapter_urls = soup.find('ul').find_all('a')
+        logger.debug('Found ToC with %d entries', len(chapter_urls))

-    def parse_toc_entry(self, entry):
-        url = os.path.join(self.tocpage, entry['href'])
+        #run
+        #TODO include images flag
+        # download all files in toc (chapters)
+        chapter_files = []
+        for item in (tqdm(chapter_urls) if showprogress else chapter_urls):
+            item_url = self.parse_toc_entry(tocpage, item)
+            parsed_url = urlparse(item_url)
+            filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
+            if cleanpages: self.parse_page(filepath)
+            chapter_files.append(os.path.basename(item_url))
+
+        return self.create_epub(author,title,chapter_files,dir_output)
+
+    def parse_toc_entry(self, tocpage, entry):
+        url = os.path.join(tocpage, entry['href'])
         self.save_page(url)
         return url

     # apply blocklist to file
     def parse_page(self,file_path):
         #TODO clean up file opening, mmap?
-        logger.debug('Parsing page at %s', file_path)
+        count=0
         with open(file_path, 'r+') as f:
             soup = BeautifulSoup(f.read(), 'html.parser')
             for blocker in self.blocklist:
                 for item in soup.select(blocker):
                     item.decompose()
+                    count+=1
+            f.seek(0)
+            f.truncate()
             f.write(str(soup))
+        logger.debug('Removed %d tags from page %s during parsing', count, file_path)

-    def create_epub(self, filename='out.epub')-> int:
+    def create_epub(self, author, title, chapters, dir_output):
         #TODO --epub-cover-image
         #TODO toc if it isnt described by <h> tags, e.g. https://www.projekt-gutenberg.org/adlersfe/maskenba/
+        filename = f'{title} - {author}.epub'
         logger.debug('Creating epub as "%s"',filename)
         command = f'''pandoc -f html -t epub \
             -o "{filename}" \
             --reference-location=section \
             --css="{self.style_path_drama}" \
-            --metadata title="{self.title}" \
-            --metadata author="{self.author}" \
+            --metadata title="{title}" \
+            --metadata author="{author}" \
             --epub-title-page=false \
-            {" ".join(self.chapters)} '''
-        return subprocess.run(shlex.split(command), cwd=self.dir_output).returncode
+            {" ".join(chapters)} '''
+        subprocess.run(shlex.split(command), cwd=dir_output, check=True)
+        return os.path.abspath(os.path.join(dir_output,filename))

     def save_page(self, url):
         logger.debug('Saving page at %s', url)
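This hunk turns `GBConvert` from a one-shot, one-book object into a reusable converter: the constructor now only takes the download directory, while the per-book state (URL, author, title, progress flag) moves into the new `download()` method, which fetches the table of contents, downloads and optionally cleans every chapter, and returns the absolute path of the generated ePUB. A minimal sketch of the new call pattern, going only by the signatures visible in this diff; the import path is an assumption (the batch script further down uses `from convert import GBConvert`).

```python
# Sketch of the new per-instance/per-call split; the import path is assumed
# from the module name used elsewhere in this compare.
from convert import GBConvert

converter = GBConvert('./downloads')   # one converter per download directory
epub_path = converter.download(
    'https://www.projekt-gutenberg.org/ibsen/solness/',  # any page of the book
    showprogress=True,    # tqdm progress bar over the chapters
    cleanpages=True,      # apply the blocklist to each downloaded page
)
print(epub_path)          # absolute path of the generated ePUB
```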
@@ -103,19 +128,7 @@ class GBConvert():
             --tries=5 \
             --quiet \
             {url}'''
-        return subprocess.run(shlex.split(command), cwd=self.dir_download).returncode
-    def run(self):
-        #TODO include images flag
-
-        # download all files in toc (chapters)
-        for item in (tqdm(self.toc) if self.showprogress else self.toc):
-            item_url = self.parse_toc_entry(item)
-            parsed_url = urlparse(item_url)
-            filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
-            self.parse_page(filepath)
-            self.chapters.append(os.path.basename(item_url))
-
-        return self.create_epub(f'{self.title} - {self.author}.epub')
+        subprocess.run(shlex.split(command), cwd=self.dir_download, check=True)

 # get a list of all books for interactive selection or scraping
 def get_all_books() -> List[Book]:
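With this hunk both external calls (pandoc in `create_epub`, wget in `save_page`) switch from returning the raw exit code to `subprocess.run(..., check=True)`, so a failing tool now raises `subprocess.CalledProcessError` instead of being silently ignored, and the old `run()` method is folded into `download()`. A caller that wants to keep going after one failed book could catch that exception, roughly as sketched here; the wrapper itself is illustrative and not part of the diff.

```python
# Illustrative wrapper around the new error behaviour (check=True); not part of the diff.
import logging
import subprocess

logger = logging.getLogger(__name__)

def download_or_warn(converter, url):
    """Convert one book, logging a warning instead of aborting if wget or pandoc fails."""
    try:
        return converter.download(url)
    except subprocess.CalledProcessError as exc:
        logger.warning('External tool failed for %s: %s', url, exc)
        return None
```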
@@ -149,12 +162,15 @@ def get_all_books() -> List[Book]:

 # run main cli
 @click.command()
+#TODO include images flag
 @click.option('--debug', '-d', is_flag=True, help='Set the log level to DEBUG')
 @click.option('--silent', '-s', is_flag=True, help='Disable the progress bar')
+@click.option('--path','-p',type=str,default='./', help='The path to which files are saved' )
+@click.option('--no-clean',is_flag=True,help='Do not parse html files with blocklist')
 @click.argument('args', nargs=-1)
-def main(args, debug, silent):
+def main(args, debug, silent, path, no_clean):
     '''
-    Download ePUBs from https://www.projekt-gutenberg.org/
+    Download ePUBs from https://www.projekt-gutenberg.org/ \n
     Provide either 0 arguments to enter interactive mode or an arbitrary number of URLs to download from
     '''
     logging.basicConfig(level=logging.ERROR,format='%(asctime)s - %(levelname)s - %(message)s')
@@ -173,10 +189,12 @@ def main(args, debug, silent):
         books = [item.split(';')[1].strip() for item in selection]

     logger.debug('Attempting to download from %d URL(s)', len(books))
+    converter = GBConvert(path)
     if len(books)==1:
-        GBConvert(books[0], showprogress=not silent).run()
+        converter.download(books[0], showprogress=not silent, cleanpages= not no_clean)
     else:
         for book in (tqdm(books) if not silent else books):
-            GBConvert(book).run()
+            converter.download(book, cleanpages= not no_clean)

 if __name__ == "__main__":
     main()
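The CLI gains `--path`/`-p` and `--no-clean`, and `main()` now builds a single `GBConvert(path)` and calls `download()` per URL. One way to exercise the new flags without a shell is click's test runner, as in this rough sketch; the `from convert import main` path is an assumption, and invoking it really does run wget and pandoc, so it needs network access and both tools installed.

```python
# Rough sketch: drive the new CLI options through click's CliRunner.
# Import path is assumed; running this performs a real download.
from click.testing import CliRunner

from convert import main  # the click command defined above

runner = CliRunner()
result = runner.invoke(
    main,
    ['--path', './downloads', '--no-clean',
     'https://www.projekt-gutenberg.org/ibsen/solness/'],
)
print(result.exit_code)  # 0 on success
print(result.output)
```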
@@ -11,9 +11,10 @@ from convert import GBConvert, allbooks_url, get_all_books, Book
 def main():
     books = get_all_books()
     # NOTE consider making this a map()
+    converter = GBConvert('./')
     for book in tqdm(books):
         if book.url is not None:
-            GBConvert(book.url).run()
+            converter.download(book.url)


 if __name__ == "__main__":
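The batch entry point now creates one `GBConvert('./')` and reuses it for every book instead of instantiating a converter per URL. Since `download()` returns the path of the finished ePUB, the loop could also collect those paths, for example as in this sketch; the collection itself is illustrative and not part of the commit.

```python
# Illustrative extension of the batch loop: reuse one converter and collect
# the resulting ePUB paths. Imports follow the hunk above; the path collection
# is not part of the diff.
from tqdm import tqdm

from convert import GBConvert, get_all_books

converter = GBConvert('./')
epubs = []
for book in tqdm(get_all_books()):
    if book.url is not None:
        epubs.append(converter.download(book.url))
print(f'Created {len(epubs)} ePUB files')
```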