5 Commits

Author SHA1 Message Date
eneller
98070cc2c4 Merge branch 'master' into feat/parallel 2025-08-22 10:49:54 +02:00
eneller
0d171a6ea3 test: logging 2025-07-03 22:09:59 +02:00
eneller
024bea93f8 chore: vscode settings 2025-04-13 23:27:55 +02:00
eneller
6d8cd43202 chore: add basedpywright 2025-04-10 12:51:06 +02:00
eneller
cc08210d36 begin parallel downloads 2025-04-07 21:19:25 +02:00
5 changed files with 77 additions and 11 deletions

14
.vscode/extensions.json vendored Normal file
View File

@@ -0,0 +1,14 @@
{
// See https://go.microsoft.com/fwlink/?LinkId=827846 to learn about workspace recommendations.
// Extension identifier format: ${publisher}.${name}. Example: vscode.csharp
// List of extensions which should be recommended for users of this workspace.
"recommendations": [
"detachhead.basedpyright",
"charliermarsh.ruff",
],
// List of extensions recommended by VS Code that should not be recommended for users of this workspace.
"unwantedRecommendations": [
]
}

3
.vscode/settings.json vendored Normal file
View File

@@ -0,0 +1,3 @@
{
"basedpyright.disableLanguageServices": true
}

View File

@@ -26,6 +26,7 @@ requires = ["setuptools>=64", "setuptools_scm>=8"]
[dependency-groups]
dev = [
"basedpyright>=1.28.5",
"pysonar>=1.0.2.1722",
]

View File

@@ -1,18 +1,23 @@
import requests
from bs4 import BeautifulSoup
from bs4 import ResultSet
from urllib.parse import urljoin
from urllib.request import urlparse
from tqdm import tqdm
from pyfzf.pyfzf import FzfPrompt
import click
import os, subprocess, shlex, logging, re
import os
import subprocess
import shlex
import logging
import re
import concurrent.futures
import importlib.resources as pkg_resources
from dataclasses import dataclass
from typing import List
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.NOTSET,format='%(asctime)s - %(levelname)s - %(message)s')
allbooks_url ='https://www.projekt-gutenberg.org/info/texte/allworka.html'
@@ -44,6 +49,7 @@ class GBConvert():
title:str = None,
showprogress: bool = False,
cleanpages: bool = True,
max_workers: int = 10,
):
tocpage = os.path.dirname(url) # ToC website url
dir_output = self.getDir(url)
@@ -72,13 +78,22 @@ class GBConvert():
#run
#TODO include images flag
# download all files in toc (chapters)
chapter_files = []
for item in (tqdm(chapter_urls) if showprogress else chapter_urls):
def process_item(item):
item_url = self.parse_toc_entry(tocpage, item)
parsed_url = urlparse(item_url)
filepath = os.path.join(self.dir_download, parsed_url.netloc + parsed_url.path)
if cleanpages: self.parse_page(filepath)
chapter_files.append(os.path.basename(item_url))
if cleanpages:
self.parse_page(filepath)
return os.path.basename(item_url)
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = []
items = tqdm(chapter_urls) if showprogress else chapter_urls
for item in items:
futures.append(executor.submit(process_item, item))
# Wait for completion and preserve order
chapter_files = [future.result() for future in futures]
return self.create_epub(author,title,chapter_files,dir_output)
@@ -174,8 +189,9 @@ def slugify(value, replacement='_'):
@click.option('--silent', '-s', is_flag=True, help='Disable the progress bar')
@click.option('--path','-p',type=str,default='./', help='The path to which files are saved' )
@click.option('--no-clean',is_flag=True,help='Do not parse html files with blocklist')
@click.option('--max-workers', '-w', type=int, default=10, help='Number of concurrent download workers')
@click.argument('args', nargs=-1)
def main(args, debug, silent, path, no_clean):
def main(args, debug, silent, path, no_clean, max_workers):
'''
Download ePUBs from https://www.projekt-gutenberg.org/ \n
Provide either 0 arguments to enter interactive mode or an arbitrary number of URLs to download from
@@ -198,10 +214,10 @@ def main(args, debug, silent, path, no_clean):
logger.debug('Attempting to download from %d URL(s)', len(books))
converter = GBConvert(path)
if len(books)==1:
converter.download(books[0], showprogress=not silent, cleanpages= not no_clean)
converter.download(books[0], showprogress=not silent, cleanpages= not no_clean, max_workers=max_workers)
else:
for book in (tqdm(books) if not silent else books):
converter.download(book, cleanpages= not no_clean)
converter.download(book, cleanpages= not no_clean, max_workers=max_workers)
if __name__ == "__main__":
main()

36
uv.lock generated
View File

@@ -1,7 +1,19 @@
version = 1
revision = 2
revision = 3
requires-python = ">=3.12"
[[package]]
name = "basedpyright"
version = "1.28.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "nodejs-wheel-binaries" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ce/43/7a85507882cfbda82b9003a849924dcb6f58ef74051c13f2dea9d1c30067/basedpyright-1.28.5.tar.gz", hash = "sha256:f2f13d1158c77edffe827b0a8366a8d6ec8c3a69aa9f4c938ec8fe6026d1e309", size = 21748156, upload-time = "2025-04-09T11:41:36.092Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/77/57/8aabf496d7c9c22fe259494fddc28b130e8f5d099f41025e616139b80428/basedpyright-1.28.5-py3-none-any.whl", hash = "sha256:33dab5a88832c17dbce6207e2c9df244c227ad0bf71d7e38d8728a227244e980", size = 11387863, upload-time = "2025-04-09T11:41:32.713Z" },
]
[[package]]
name = "beautifulsoup4"
version = "4.12.3"
@@ -94,6 +106,7 @@ dependencies = [
[package.dev-dependencies]
dev = [
{ name = "basedpyright" },
{ name = "pysonar" },
]
@@ -108,7 +121,10 @@ requires-dist = [
]
[package.metadata.requires-dev]
dev = [{ name = "pysonar", specifier = ">=1.0.2.1722" }]
dev = [
{ name = "basedpyright", specifier = ">=1.28.5" },
{ name = "pysonar", specifier = ">=1.0.2.1722" },
]
[[package]]
name = "idna"
@@ -131,6 +147,22 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/75/6f/55bc5837e9fe7a86a5acb553adec901257d709062bfaef7debd4d8cfee12/jproperties-2.1.2-py2.py3-none-any.whl", hash = "sha256:4108e868353a9f4a12bb86a92df5462d0e18d00119169533972ce473029be79a", size = 17981, upload-time = "2024-07-21T20:40:49.034Z" },
]
[[package]]
name = "nodejs-wheel-binaries"
version = "22.14.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d7/c7/4fd3871d2b7fd5122216245e273201ab98eda92bbd6fe9ad04846b758c56/nodejs_wheel_binaries-22.14.0.tar.gz", hash = "sha256:c1dc43713598c7310d53795c764beead861b8c5021fe4b1366cb912ce1a4c8bf", size = 8055, upload-time = "2025-02-11T18:15:17.714Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/61/b6/66ef4ef75ea7389ea788f2d5505bf9a8e5c3806d56c7a90cf46a6942f1cf/nodejs_wheel_binaries-22.14.0-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:d8ab8690516a3e98458041286e3f0d6458de176d15c14f205c3ea2972131420d", size = 50326597, upload-time = "2025-02-11T18:14:18.467Z" },
{ url = "https://files.pythonhosted.org/packages/7d/78/023d91a293ba73572a643bc89d11620d189f35f205a309dd8296aa45e69a/nodejs_wheel_binaries-22.14.0-py2.py3-none-macosx_11_0_x86_64.whl", hash = "sha256:b2f200f23b3610bdbee01cf136279e005ffdf8ee74557aa46c0940a7867956f6", size = 51158258, upload-time = "2025-02-11T18:14:25.693Z" },
{ url = "https://files.pythonhosted.org/packages/af/86/324f6342c79e5034a13319b02ba9ed1f4ac8813af567d223c9a9e56cd338/nodejs_wheel_binaries-22.14.0-py2.py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0877832abd7a9c75c8c5caafa37f986c9341ee025043c2771213d70c4c1defa", size = 57180264, upload-time = "2025-02-11T18:14:34.123Z" },
{ url = "https://files.pythonhosted.org/packages/6d/9f/42bdaab26137e31732bff00147b9aca2185d475b5752b57a443e6c7ba93f/nodejs_wheel_binaries-22.14.0-py2.py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8fded5a70a8a55c2135e67bd580d8b7f2e94fcbafcc679b6a2d5b92f88373d69", size = 57693251, upload-time = "2025-02-11T18:14:42.071Z" },
{ url = "https://files.pythonhosted.org/packages/ab/d7/94f8f269aa86cf35f9ed2b70d09aca48dc971fb5656fdc4a3b69364b189f/nodejs_wheel_binaries-22.14.0-py2.py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:c1ade6f3ece458b40c02e89c91d5103792a9f18aaad5026da533eb0dcb87090e", size = 58841717, upload-time = "2025-02-11T18:14:49.971Z" },
{ url = "https://files.pythonhosted.org/packages/2d/a0/43b7316eaf22b4ee9bfb897ee36c724efceac7b89d7d1bedca28057b7be1/nodejs_wheel_binaries-22.14.0-py2.py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:34fa5ed4cf3f65cbfbe9b45c407ffc2fc7d97a06cd8993e6162191ff81f29f48", size = 59808791, upload-time = "2025-02-11T18:14:59.428Z" },
{ url = "https://files.pythonhosted.org/packages/10/0a/814491f751a25136e37de68a2728c9a9e3c1d20494aba5ff3c230d5f9c2d/nodejs_wheel_binaries-22.14.0-py2.py3-none-win_amd64.whl", hash = "sha256:ca7023276327455988b81390fa6bbfa5191c1da7fc45bc57c7abc281ba9967e9", size = 40478921, upload-time = "2025-02-11T18:15:07.3Z" },
{ url = "https://files.pythonhosted.org/packages/f4/5c/cab444afaa387dceac8debb817b52fd00596efcd2d54506c27311c6fe6a8/nodejs_wheel_binaries-22.14.0-py2.py3-none-win_arm64.whl", hash = "sha256:fd59c8e9a202221e316febe1624a1ae3b42775b7fb27737bf12ec79565983eaf", size = 36206637, upload-time = "2025-02-11T18:15:13.39Z" },
]
[[package]]
name = "pyfakefs"
version = "5.9.1"