Compare commits
5 Commits
master
...
feat/paral
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
98070cc2c4 | ||
|
|
0d171a6ea3 | ||
|
|
024bea93f8 | ||
|
|
6d8cd43202 | ||
|
|
cc08210d36 |
14
.vscode/extensions.json
vendored
Normal file
14
.vscode/extensions.json
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
// See https://go.microsoft.com/fwlink/?LinkId=827846 to learn about workspace recommendations.
|
||||
// Extension identifier format: ${publisher}.${name}. Example: vscode.csharp
|
||||
|
||||
// List of extensions which should be recommended for users of this workspace.
|
||||
"recommendations": [
|
||||
"detachhead.basedpyright",
|
||||
"charliermarsh.ruff",
|
||||
],
|
||||
// List of extensions recommended by VS Code that should not be recommended for users of this workspace.
|
||||
"unwantedRecommendations": [
|
||||
|
||||
]
|
||||
}
|
||||
3
.vscode/settings.json
vendored
Normal file
3
.vscode/settings.json
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
{
|
||||
"basedpyright.disableLanguageServices": true
|
||||
}
|
||||
@@ -26,6 +26,7 @@ requires = ["setuptools>=64", "setuptools_scm>=8"]
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"basedpyright>=1.28.5",
|
||||
"pysonar>=1.0.2.1722",
|
||||
]
|
||||
|
||||
|
||||
@@ -1,18 +1,23 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4 import ResultSet
|
||||
from urllib.parse import urljoin
|
||||
from urllib.request import urlparse
|
||||
from tqdm import tqdm
|
||||
from pyfzf.pyfzf import FzfPrompt
|
||||
import click
|
||||
|
||||
import os, subprocess, shlex, logging, re
|
||||
import os
|
||||
import subprocess
|
||||
import shlex
|
||||
import logging
|
||||
import re
|
||||
import concurrent.futures
|
||||
import importlib.resources as pkg_resources
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.basicConfig(level=logging.NOTSET,format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
|
||||
allbooks_url ='https://www.projekt-gutenberg.org/info/texte/allworka.html'
|
||||
|
||||
@@ -44,6 +49,7 @@ class GBConvert():
|
||||
title:str = None,
|
||||
showprogress: bool = False,
|
||||
cleanpages: bool = True,
|
||||
max_workers: int = 10,
|
||||
):
|
||||
tocpage = os.path.dirname(url) # ToC website url
|
||||
dir_output = self.getDir(url)
|
||||
@@ -72,13 +78,22 @@ class GBConvert():
|
||||
#run
|
||||
#TODO include images flag
|
||||
# download all files in toc (chapters)
|
||||
chapter_files = []
|
||||
for item in (tqdm(chapter_urls) if showprogress else chapter_urls):
|
||||
def process_item(item):
|
||||
item_url = self.parse_toc_entry(tocpage, item)
|
||||
parsed_url = urlparse(item_url)
|
||||
filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
|
||||
if cleanpages: self.parse_page(filepath)
|
||||
chapter_files.append(os.path.basename(item_url))
|
||||
if cleanpages:
|
||||
self.parse_page(filepath)
|
||||
return os.path.basename(item_url)
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
futures = []
|
||||
items = tqdm(chapter_urls) if showprogress else chapter_urls
|
||||
for item in items:
|
||||
futures.append(executor.submit(process_item, item))
|
||||
|
||||
# Wait for completion and preserve order
|
||||
chapter_files = [future.result() for future in futures]
|
||||
|
||||
return self.create_epub(author,title,chapter_files,dir_output)
|
||||
|
||||
@@ -174,8 +189,9 @@ def slugify(value, replacement='_'):
|
||||
@click.option('--silent', '-s', is_flag=True, help='Disable the progress bar')
|
||||
@click.option('--path','-p',type=str,default='./', help='The path to which files are saved' )
|
||||
@click.option('--no-clean',is_flag=True,help='Do not parse html files with blocklist')
|
||||
@click.option('--max-workers', '-w', type=int, default=10, help='Number of concurrent download workers')
|
||||
@click.argument('args', nargs=-1)
|
||||
def main(args, debug, silent, path, no_clean):
|
||||
def main(args, debug, silent, path, no_clean, max_workers):
|
||||
'''
|
||||
Download ePUBs from https://www.projekt-gutenberg.org/ \n
|
||||
Provide either 0 arguments to enter interactive mode or an arbitrary number of URLs to download from
|
||||
@@ -198,10 +214,10 @@ def main(args, debug, silent, path, no_clean):
|
||||
logger.debug('Attempting to download from %d URL(s)', len(books))
|
||||
converter = GBConvert(path)
|
||||
if len(books)==1:
|
||||
converter.download(books[0], showprogress=not silent, cleanpages= not no_clean)
|
||||
converter.download(books[0], showprogress=not silent, cleanpages= not no_clean, max_workers=max_workers)
|
||||
else:
|
||||
for book in (tqdm(books) if not silent else books):
|
||||
converter.download(book, cleanpages= not no_clean)
|
||||
converter.download(book, cleanpages= not no_clean, max_workers=max_workers)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
36
uv.lock
generated
36
uv.lock
generated
@@ -1,7 +1,19 @@
|
||||
version = 1
|
||||
revision = 2
|
||||
revision = 3
|
||||
requires-python = ">=3.12"
|
||||
|
||||
[[package]]
|
||||
name = "basedpyright"
|
||||
version = "1.28.5"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "nodejs-wheel-binaries" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/ce/43/7a85507882cfbda82b9003a849924dcb6f58ef74051c13f2dea9d1c30067/basedpyright-1.28.5.tar.gz", hash = "sha256:f2f13d1158c77edffe827b0a8366a8d6ec8c3a69aa9f4c938ec8fe6026d1e309", size = 21748156, upload-time = "2025-04-09T11:41:36.092Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/77/57/8aabf496d7c9c22fe259494fddc28b130e8f5d099f41025e616139b80428/basedpyright-1.28.5-py3-none-any.whl", hash = "sha256:33dab5a88832c17dbce6207e2c9df244c227ad0bf71d7e38d8728a227244e980", size = 11387863, upload-time = "2025-04-09T11:41:32.713Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "beautifulsoup4"
|
||||
version = "4.12.3"
|
||||
@@ -94,6 +106,7 @@ dependencies = [
|
||||
|
||||
[package.dev-dependencies]
|
||||
dev = [
|
||||
{ name = "basedpyright" },
|
||||
{ name = "pysonar" },
|
||||
]
|
||||
|
||||
@@ -108,7 +121,10 @@ requires-dist = [
|
||||
]
|
||||
|
||||
[package.metadata.requires-dev]
|
||||
dev = [{ name = "pysonar", specifier = ">=1.0.2.1722" }]
|
||||
dev = [
|
||||
{ name = "basedpyright", specifier = ">=1.28.5" },
|
||||
{ name = "pysonar", specifier = ">=1.0.2.1722" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
@@ -131,6 +147,22 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/75/6f/55bc5837e9fe7a86a5acb553adec901257d709062bfaef7debd4d8cfee12/jproperties-2.1.2-py2.py3-none-any.whl", hash = "sha256:4108e868353a9f4a12bb86a92df5462d0e18d00119169533972ce473029be79a", size = 17981, upload-time = "2024-07-21T20:40:49.034Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nodejs-wheel-binaries"
|
||||
version = "22.14.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/d7/c7/4fd3871d2b7fd5122216245e273201ab98eda92bbd6fe9ad04846b758c56/nodejs_wheel_binaries-22.14.0.tar.gz", hash = "sha256:c1dc43713598c7310d53795c764beead861b8c5021fe4b1366cb912ce1a4c8bf", size = 8055, upload-time = "2025-02-11T18:15:17.714Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/61/b6/66ef4ef75ea7389ea788f2d5505bf9a8e5c3806d56c7a90cf46a6942f1cf/nodejs_wheel_binaries-22.14.0-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:d8ab8690516a3e98458041286e3f0d6458de176d15c14f205c3ea2972131420d", size = 50326597, upload-time = "2025-02-11T18:14:18.467Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7d/78/023d91a293ba73572a643bc89d11620d189f35f205a309dd8296aa45e69a/nodejs_wheel_binaries-22.14.0-py2.py3-none-macosx_11_0_x86_64.whl", hash = "sha256:b2f200f23b3610bdbee01cf136279e005ffdf8ee74557aa46c0940a7867956f6", size = 51158258, upload-time = "2025-02-11T18:14:25.693Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/af/86/324f6342c79e5034a13319b02ba9ed1f4ac8813af567d223c9a9e56cd338/nodejs_wheel_binaries-22.14.0-py2.py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0877832abd7a9c75c8c5caafa37f986c9341ee025043c2771213d70c4c1defa", size = 57180264, upload-time = "2025-02-11T18:14:34.123Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/6d/9f/42bdaab26137e31732bff00147b9aca2185d475b5752b57a443e6c7ba93f/nodejs_wheel_binaries-22.14.0-py2.py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8fded5a70a8a55c2135e67bd580d8b7f2e94fcbafcc679b6a2d5b92f88373d69", size = 57693251, upload-time = "2025-02-11T18:14:42.071Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ab/d7/94f8f269aa86cf35f9ed2b70d09aca48dc971fb5656fdc4a3b69364b189f/nodejs_wheel_binaries-22.14.0-py2.py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:c1ade6f3ece458b40c02e89c91d5103792a9f18aaad5026da533eb0dcb87090e", size = 58841717, upload-time = "2025-02-11T18:14:49.971Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2d/a0/43b7316eaf22b4ee9bfb897ee36c724efceac7b89d7d1bedca28057b7be1/nodejs_wheel_binaries-22.14.0-py2.py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:34fa5ed4cf3f65cbfbe9b45c407ffc2fc7d97a06cd8993e6162191ff81f29f48", size = 59808791, upload-time = "2025-02-11T18:14:59.428Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/10/0a/814491f751a25136e37de68a2728c9a9e3c1d20494aba5ff3c230d5f9c2d/nodejs_wheel_binaries-22.14.0-py2.py3-none-win_amd64.whl", hash = "sha256:ca7023276327455988b81390fa6bbfa5191c1da7fc45bc57c7abc281ba9967e9", size = 40478921, upload-time = "2025-02-11T18:15:07.3Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f4/5c/cab444afaa387dceac8debb817b52fd00596efcd2d54506c27311c6fe6a8/nodejs_wheel_binaries-22.14.0-py2.py3-none-win_arm64.whl", hash = "sha256:fd59c8e9a202221e316febe1624a1ae3b42775b7fb27737bf12ec79565983eaf", size = 36206637, upload-time = "2025-02-11T18:15:13.39Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyfakefs"
|
||||
version = "5.9.1"
|
||||
|
||||
Reference in New Issue
Block a user