chore: move to uv project structure
This commit is contained in:
87
src/convert.py
Normal file
87
src/convert.py
Normal file
@@ -0,0 +1,87 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin
|
||||
from urllib.request import urlopen, urlparse
|
||||
|
||||
import os, sys
|
||||
from pathlib import Path
|
||||
class GBConvert():
    """Download a book from a Projekt Gutenberg-style site and convert it to EPUB.

    Workflow: fetch the book's root page to read author/title and the table
    of contents, mirror every chapter locally with wget, strip the elements
    listed in data/blocklist.txt from each saved page, then bundle the
    cleaned chapters into an EPUB with pandoc.
    """
    # TODO fix toc / headings

    def __init__(self, url: str):
        """Derive working paths from *url* and load the element blocklist.

        url: address of (a page inside) the book; its directory part is
        treated as the book's root page.
        """
        self.dir_root = os.path.dirname(os.path.realpath(__file__))
        # sibling "data/" directory next to the source directory
        self.dir_data = os.path.join(os.path.dirname(self.dir_root), "data/")
        self.root = os.path.dirname(url)
        self.url = urlparse(self.root)
        # directory wget mirrors into on disk: netloc + path
        self.output = self.url.netloc + self.url.path
        # CSS selectors of elements to strip from downloaded pages,
        # one per line; use a context manager so the file is closed.
        with open(os.path.join(self.dir_data, "blocklist.txt"), 'r') as f:
            self.blocklist = f.read().splitlines()

    def get_meta(self):
        """Fetch the root page and extract author, title and the TOC links.

        Sets self.author, self.title and self.toc (list of <a> tags).
        Raises requests.HTTPError on a non-2xx response.
        """
        response = requests.get(self.root)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        self.author = soup.find('meta', {'name': 'author'})['content']
        self.title = soup.find('meta', {'name': 'title'})['content']
        # assumes the first <ul> on the page is the table of contents
        self.toc = soup.find('ul').find_all('a')

    def save_page(self, url):
        """Mirror *url* (plus its page requisites) into the current directory.

        NOTE(review): url is interpolated into a shell command string — only
        safe for trusted URLs; consider subprocess.run([...], shell=False).
        NOTE(review): --execute normally expects an argument (e.g.
        robots=off) — verify the intended wgetrc command.
        """
        # https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
        command = f'''wget \
            --page-requisites \
            --convert-links \
            --execute \
            --tries=5 \
            --quiet \
            {url}'''
        os.system(command)

    def clean_page(self, file_path):
        """Remove every blocklisted element from the saved HTML file in place."""
        with open(file_path, 'r') as fh:
            soup = BeautifulSoup(fh.read(), 'html.parser')
        for blocker in self.blocklist:
            for item in soup.select(blocker):
                item.decompose()
        with open(file_path, 'w') as fh:
            fh.write(str(soup))

    def create_epub(self, filename='out.epub'):
        """Run pandoc over the cleaned chapters to produce *filename*.

        Changes the working directory to the mirrored site directory first.
        """
        os.chdir(self.output)
        # BUG FIX: *filename* was previously ignored; it is now passed to
        # pandoc's -o so the caller-supplied name is honoured.
        command = f'''pandoc -f html -t epub \
            -o "{filename}" \
            --reference-location=section \
            --css="{os.path.join(self.dir_data, "drama.css")}" \
            --metadata title="{self.title}" \
            --metadata author="{self.author}" \
            --epub-title-page=false \
            {" ".join(self.chapters)} '''  # TODO --epub-cover-image
        os.system(command)

    def run(self):
        """Download, clean, and convert the whole book to an EPUB."""
        # TODO include images flag
        self.get_meta()
        # BUG FIX: removed a dead `map(lambda ...)` call here — map() is
        # lazy and its result was never consumed, so it did nothing; the
        # loop below already downloads every chapter.
        self.chapters = []
        for item in self.toc:
            item_url = os.path.join(self.root, item['href'])
            self.save_page(url=item_url)
            parsed_url = urlparse(item_url)
            # wget saves the page under netloc/path relative to the cwd
            filepath = parsed_url.netloc + parsed_url.path
            self.clean_page(filepath)
            self.chapters.append(item['href'])
        self.create_epub(f'{self.title} - {self.author}.epub')
|
||||
|
||||
|
||||
|
||||
def main():
    """CLI entry point: convert the book at the URL given as the first argument."""
    # Fail with a clear usage message instead of a bare IndexError
    # when the script is invoked without an argument.
    if len(sys.argv) < 2:
        sys.exit("usage: convert.py <book-url>")
    g = GBConvert(sys.argv[1])
    g.run()
|
||||
|
||||
|
||||
# Run the converter CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
25
src/test.py
Normal file
25
src/test.py
Normal file
@@ -0,0 +1,25 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
import os
|
||||
|
||||
from convert import GBConvert
|
||||
|
||||
def main():
    """Walk the full works index on projekt-gutenberg.org and build a GBConvert per book.

    Raises Exception if the index page cannot be fetched.
    """
    allbooks_relative_url = '/info/texte/allworka.html'
    root_url = 'https://www.projekt-gutenberg.org'
    allbooks_url = root_url + allbooks_relative_url
    response = requests.get(allbooks_url)
    # BUG FIX: the error message previously interpolated `self.root`, which
    # does not exist in this module-level function — any non-200 response
    # raised a NameError instead of the intended Exception.
    if response.status_code != 200:
        raise Exception(f'Couldnt fetch root page {allbooks_url}')
    soup = BeautifulSoup(response.content, 'html.parser')
    # assumes the first <dl> element holds the complete list of book links
    books = soup.find('dl').find_all('a')
    for book in books:
        book_url_relative = book.get('href')
        if book_url_relative is not None:
            # presumably strips a leading "../.." (5 chars) before joining
            # with the site root — TODO confirm against actual hrefs
            book_url = root_url + os.path.dirname(book_url_relative)[5:]
            gb = GBConvert(book_url)
            # NOTE(review): gb.run() is never called here — confirm whether
            # constructing the converter without running it is intentional.
|
||||
|
||||
|
||||
# Run the crawler only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user