Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8e0d92d796 | ||
|
|
7f488c638c | ||
|
|
daddb58c3c |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -418,3 +418,4 @@ wheels/
|
|||||||
*.css
|
*.css
|
||||||
*.js
|
*.js
|
||||||
*.txt
|
*.txt
|
||||||
|
*.json
|
||||||
|
|||||||
@@ -2,8 +2,9 @@
|
|||||||
web to epub converter for https://projekt-gutenberg.org.
|
web to epub converter for https://projekt-gutenberg.org.
|
||||||
Requires:
|
Requires:
|
||||||
- [pandoc](https://pandoc.org/)
|
- [pandoc](https://pandoc.org/)
|
||||||
- wget
|
- [wget](https://www.gnu.org/software/wget/)
|
||||||
- python
|
- [fzf](https://github.com/junegunn/fzf) (only for interactive mode)
|
||||||
|
- python (duh)
|
||||||
## Usage
|
## Usage
|
||||||
Invoke the script using the url of any page of the book you would like to download:
|
Invoke the script using the url of any page of the book you would like to download:
|
||||||
```
|
```
|
||||||
@@ -12,6 +13,6 @@ epub2go https://www.projekt-gutenberg.org/ibsen/solness/
|
|||||||
## Installation
|
## Installation
|
||||||
Assuming you have a recent version of python installed, run
|
Assuming you have a recent version of python installed, run
|
||||||
```
|
```
|
||||||
pip install git+https://github.com/eneller/epub2go.py
|
pip install git+https://github.com/eneller/epub2go.py@latest
|
||||||
```
|
```
|
||||||
This will provide the 'epub2go' command.
|
This will provide the 'epub2go' command.
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ readme = "README.md"
|
|||||||
requires-python = ">=3.12"
|
requires-python = ">=3.12"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"beautifulsoup4==4.12.3",
|
"beautifulsoup4==4.12.3",
|
||||||
|
"pyfzf>=0.3.1", # hasn't been updated for some time
|
||||||
"requests==2.32.3",
|
"requests==2.32.3",
|
||||||
"tqdm>=4.67.1",
|
"tqdm>=4.67.1",
|
||||||
"urllib3==2.2.2",
|
"urllib3==2.2.2",
|
||||||
|
|||||||
@@ -1,17 +1,24 @@
|
|||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4 import ResultSet
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
from urllib.request import urlopen, urlparse
|
from urllib.request import urlparse
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
from pyfzf.pyfzf import FzfPrompt
|
||||||
|
|
||||||
import os, sys
|
import os, sys
|
||||||
import importlib.resources as pkg_resources
|
import importlib.resources as pkg_resources
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
allbooks_url ='https://www.projekt-gutenberg.org/info/texte/allworka.html'
|
||||||
|
root_url = '{url.scheme}://{url.netloc}'.format(url = urlparse(allbooks_url))
|
||||||
|
|
||||||
class GBConvert():
|
class GBConvert():
|
||||||
#TODO fix toc / headings
|
#TODO fix toc / headings
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
url:str,
|
url:str,
|
||||||
|
standalone = False,
|
||||||
):
|
):
|
||||||
# NOTE move non-code files to data folder
|
# NOTE move non-code files to data folder
|
||||||
self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
|
self.style_path_drama = pkg_resources.files('epub2go').joinpath("drama.css")
|
||||||
@@ -19,6 +26,7 @@ class GBConvert():
|
|||||||
self.root = os.path.dirname(url)
|
self.root = os.path.dirname(url)
|
||||||
self.url = urlparse(self.root)
|
self.url = urlparse(self.root)
|
||||||
self.output = self.url.netloc + self.url.path
|
self.output = self.url.netloc + self.url.path
|
||||||
|
self.standalone = standalone
|
||||||
|
|
||||||
def get_meta(self):
|
def get_meta(self):
|
||||||
response = requests.get(self.root)
|
response = requests.get(self.root)
|
||||||
@@ -29,6 +37,7 @@ class GBConvert():
|
|||||||
self.toc = soup.find('ul').find_all('a')
|
self.toc = soup.find('ul').find_all('a')
|
||||||
|
|
||||||
def save_page(self, url):
|
def save_page(self, url):
|
||||||
|
# TODO fix redownloading of shared content
|
||||||
# https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
|
# https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
|
||||||
command = f'''wget \
|
command = f'''wget \
|
||||||
--page-requisites \
|
--page-requisites \
|
||||||
@@ -67,7 +76,7 @@ class GBConvert():
|
|||||||
|
|
||||||
map(lambda x: self.save_page(os.path.join(self.root, x['href'])), self.toc)
|
map(lambda x: self.save_page(os.path.join(self.root, x['href'])), self.toc)
|
||||||
self.chapters = []
|
self.chapters = []
|
||||||
for item in tqdm(self.toc):
|
for item in (tqdm(self.toc) if self.standalone else self.toc):
|
||||||
item_title= item.get_text()
|
item_title= item.get_text()
|
||||||
item_url = os.path.join(self.root, item['href'])
|
item_url = os.path.join(self.root, item['href'])
|
||||||
self.save_page(url=item_url)
|
self.save_page(url=item_url)
|
||||||
@@ -78,12 +87,43 @@ class GBConvert():
|
|||||||
|
|
||||||
self.create_epub(f'{self.title} - {self.author}.epub')
|
self.create_epub(f'{self.title} - {self.author}.epub')
|
||||||
|
|
||||||
|
def get_all_books() -> list:
    """Return title and URL of every book linked from the index page.

    Returns:
        A list of dicts, one per anchor that has an ``href``. Each dict has
        the keys ``'title'`` (the link text with newlines and tabs removed)
        and ``'url'`` (the ``href`` resolved against ``allbooks_url``).
    """
    # Build the translation table once instead of once per book.
    strip_newlines_and_tabs = str.maketrans('', '', '\n\t')
    result = []
    for book in get_all_book_tags():
        book_href = book.get('href')
        if book_href is None:
            # anchor without a target, nothing to link to
            continue
        result.append({
            'title': book.get_text().translate(strip_newlines_and_tabs),
            'url': urljoin(allbooks_url, book_href),
        })
    return result
|
||||||
|
|
||||||
|
def get_all_book_tags(timeout: float = 30.0) -> ResultSet:
    """Fetch the index page and return the anchor tags of all listed books.

    Args:
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the program indefinitely.

    Returns:
        All ``<a>`` tags inside the first ``<dl>`` element of the page at
        ``allbooks_url``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
        RuntimeError: If the page contains no ``<dl>`` element.
    """
    response = requests.get(allbooks_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    book_list = soup.find('dl')
    if book_list is None:
        # find() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on the next call.
        raise RuntimeError(f'No book list (<dl>) found on {allbooks_url}')
    return book_list.find_all('a')
|
||||||
|
|
||||||
def main():
    """Command-line entry point: convert one or more books to EPUB.

    Non-interactive mode: every command-line argument is taken as the URL of
    a book to convert.
    Interactive mode (no arguments): the full book list is fetched and shown
    in an fzf prompt in which one or more books can be selected.
    """
    delimiter = ';'
    # Slice instead of pop() so sys.argv itself is left untouched.
    books = sys.argv[1:]
    if not books:
        # interactive mode using fzf: one "<title> ; <url>" line per book
        # TODO display author
        choices = [f"{item['title']} {delimiter} {item['url']}" for item in get_all_books()]
        fzf = FzfPrompt()
        selection = fzf.prompt(choices=choices, fzf_options=r'--exact --with-nth 1 -m -d\;')
        # The URL is the last field. Splitting from the right keeps a title
        # that itself contains the delimiter from being returned as the URL.
        books = [line.rsplit(delimiter, 1)[-1].strip() for line in selection]

    if len(books) == 1:
        # a single book runs standalone, i.e. with its own chapter progress bar
        GBConvert(books[0], standalone=True).run()
    else:
        # several (or zero) books: one progress bar over the books
        for book in tqdm(books):
            GBConvert(book).run()
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
22
src/epub2go/crawl.py
Normal file
22
src/epub2go/crawl.py
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
import requests
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
import os
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
from convert import GBConvert
|
||||||
|
import utils
|
||||||
|
|
||||||
|
def main():
    """Convert every book linked from the index page to EPUB."""
    # allbooks_url is defined in the module that also defines GBConvert; it is
    # imported here because this file's top-level imports do not provide it.
    from convert import allbooks_url

    # NOTE(review): assumes utils exposes get_all_book_tags() - confirm
    books = utils.get_all_book_tags()
    # NOTE consider making this a map()
    for book in tqdm(books):
        book_url_relative = book.get('href')
        if book_url_relative is None:
            # anchor without a target, nothing to download
            continue
        # Resolve the (possibly relative) href against the index page URL.
        book_url = urljoin(allbooks_url, book_url_relative)
        GBConvert(book_url).run()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -1,25 +0,0 @@
|
|||||||
import requests
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
import os
|
|
||||||
|
|
||||||
from convert import GBConvert
|
|
||||||
|
|
||||||
def main():
|
|
||||||
allbooks_relative_url ='/info/texte/allworka.html'
|
|
||||||
root_url = 'https://www.projekt-gutenberg.org'
|
|
||||||
allbooks_url = root_url + allbooks_relative_url
|
|
||||||
response = requests.get(allbooks_url)
|
|
||||||
if (response.status_code != 200): raise Exception(f'Couldnt fetch root page {self.root}')
|
|
||||||
soup = BeautifulSoup(response.content, 'html.parser')
|
|
||||||
books = soup.find('dl').find_all('a')
|
|
||||||
for book in books:
|
|
||||||
book_title = book.get_text()
|
|
||||||
book_url_relative = book.get('href')
|
|
||||||
if book_url_relative is not None:
|
|
||||||
book_url = root_url + os.path.dirname(book_url_relative)[5:]
|
|
||||||
gb = GBConvert(book_url)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
28
src/epub2go/web.py
Normal file
28
src/epub2go/web.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
# run using `django-admin runserver --pythonpath=. --settings=web`
|
||||||
|
from django.urls import path
|
||||||
|
from django.http import HttpResponse
|
||||||
|
from django.shortcuts import redirect, render
|
||||||
|
import requests
|
||||||
|
|
||||||
|
import utils
|
||||||
|
import json
|
||||||
|
DEBUG = True
|
||||||
|
ROOT_URLCONF = __name__
|
||||||
|
SECRET_KEY='1'
|
||||||
|
TEMPLATES = [
|
||||||
|
{
|
||||||
|
'BACKEND': 'django.template.backends.django.DjangoTemplates',
|
||||||
|
'DIRS': [
|
||||||
|
'templates/'
|
||||||
|
],
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
def home(request):
    """Render ``index.html`` with the parsed contents of ``dict.json``.

    Args:
        request: The incoming Django request.

    Returns:
        The rendered response. The template receives ``request``, ``title``
        and ``items`` (the data loaded from ``dict.json``).
    """
    title = 'epub2go'
    # Close the file deterministically and decode it as UTF-8 regardless of
    # the platform's locale encoding.
    with open('dict.json', 'r', encoding='utf-8') as json_file:
        items = json.load(json_file)
    # Explicit context instead of locals(), so helper variables such as the
    # file handle do not leak into the template.
    context = {'request': request, 'title': title, 'items': items}
    return render(request, 'index.html', context)
|
||||||
|
|
||||||
|
urlpatterns = [
|
||||||
|
path('', home, name='homepage'),
|
||||||
|
]
|
||||||
11
uv.lock
generated
11
uv.lock
generated
@@ -72,6 +72,7 @@ version = "1.0"
|
|||||||
source = { editable = "." }
|
source = { editable = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "beautifulsoup4" },
|
{ name = "beautifulsoup4" },
|
||||||
|
{ name = "pyfzf" },
|
||||||
{ name = "requests" },
|
{ name = "requests" },
|
||||||
{ name = "tqdm" },
|
{ name = "tqdm" },
|
||||||
{ name = "urllib3" },
|
{ name = "urllib3" },
|
||||||
@@ -80,6 +81,7 @@ dependencies = [
|
|||||||
[package.metadata]
|
[package.metadata]
|
||||||
requires-dist = [
|
requires-dist = [
|
||||||
{ name = "beautifulsoup4", specifier = "==4.12.3" },
|
{ name = "beautifulsoup4", specifier = "==4.12.3" },
|
||||||
|
{ name = "pyfzf", specifier = ">=0.3.1" },
|
||||||
{ name = "requests", specifier = "==2.32.3" },
|
{ name = "requests", specifier = "==2.32.3" },
|
||||||
{ name = "tqdm", specifier = ">=4.67.1" },
|
{ name = "tqdm", specifier = ">=4.67.1" },
|
||||||
{ name = "urllib3", specifier = "==2.2.2" },
|
{ name = "urllib3", specifier = "==2.2.2" },
|
||||||
@@ -94,6 +96,15 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 },
|
{ url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pyfzf"
|
||||||
|
version = "0.3.1"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/d4/4c/c0c658a1e1e9f0e01932990d7947579515fe048d0a515f07458ecd992b8f/pyfzf-0.3.1.tar.gz", hash = "sha256:dd902e34cffeca9c3082f96131593dd20b4b3a9bba5b9dde1b0688e424b46bd2", size = 3652 }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/99/35/6a6c7b95390ec58904646a04f54e1b56fd57d7a247588b791c6331697797/pyfzf-0.3.1-py3-none-any.whl", hash = "sha256:736f71563461b75f6f85b55345bdc638fa0dc14c32c857c59e8b1ca1cfa3cf4a", size = 4315 },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "requests"
|
name = "requests"
|
||||||
version = "2.32.3"
|
version = "2.32.3"
|
||||||
|
|||||||
Reference in New Issue
Block a user