chore: move to uv project structure
This commit is contained in:
87
src/convert.py
Normal file
87
src/convert.py
Normal file
@@ -0,0 +1,87 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin
|
||||
from urllib.request import urlopen, urlparse
|
||||
|
||||
import os, sys
|
||||
from pathlib import Path
|
||||
class GBConvert():
    """Download a book from a Projekt Gutenberg-style site and convert it to EPUB.

    Workflow: fetch the book's root page to read author/title and the table
    of contents, mirror every chapter locally with wget, strip the elements
    listed in data/blocklist.txt from each saved page, then bundle the
    cleaned chapters into an EPUB with pandoc.
    """
    # TODO fix toc / headings

    def __init__(self, url: str):
        """Derive working paths from *url* and load the element blocklist.

        url: address of (a page inside) the book; its directory part is
        treated as the book's root page.
        """
        self.dir_root = os.path.dirname(os.path.realpath(__file__))
        # sibling "data/" directory next to the source directory
        self.dir_data = os.path.join(os.path.dirname(self.dir_root), "data/")
        self.root = os.path.dirname(url)
        self.url = urlparse(self.root)
        # directory wget mirrors into on disk: netloc + path
        self.output = self.url.netloc + self.url.path
        # CSS selectors of elements to strip from downloaded pages,
        # one per line; use a context manager so the file is closed.
        with open(os.path.join(self.dir_data, "blocklist.txt"), 'r') as f:
            self.blocklist = f.read().splitlines()

    def get_meta(self):
        """Fetch the root page and extract author, title and the TOC links.

        Sets self.author, self.title and self.toc (list of <a> tags).
        Raises requests.HTTPError on a non-2xx response.
        """
        response = requests.get(self.root)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        self.author = soup.find('meta', {'name': 'author'})['content']
        self.title = soup.find('meta', {'name': 'title'})['content']
        # assumes the first <ul> on the page is the table of contents
        self.toc = soup.find('ul').find_all('a')

    def save_page(self, url):
        """Mirror *url* (plus its page requisites) into the current directory.

        NOTE(review): url is interpolated into a shell command string — only
        safe for trusted URLs; consider subprocess.run([...], shell=False).
        NOTE(review): --execute normally expects an argument (e.g.
        robots=off) — verify the intended wgetrc command.
        """
        # https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
        command = f'''wget \
            --page-requisites \
            --convert-links \
            --execute \
            --tries=5 \
            --quiet \
            {url}'''
        os.system(command)

    def clean_page(self, file_path):
        """Remove every blocklisted element from the saved HTML file in place."""
        with open(file_path, 'r') as fh:
            soup = BeautifulSoup(fh.read(), 'html.parser')
        for blocker in self.blocklist:
            for item in soup.select(blocker):
                item.decompose()
        with open(file_path, 'w') as fh:
            fh.write(str(soup))

    def create_epub(self, filename='out.epub'):
        """Run pandoc over the cleaned chapters to produce *filename*.

        Changes the working directory to the mirrored site directory first.
        """
        os.chdir(self.output)
        # BUG FIX: *filename* was previously ignored; it is now passed to
        # pandoc's -o so the caller-supplied name is honoured.
        command = f'''pandoc -f html -t epub \
            -o "{filename}" \
            --reference-location=section \
            --css="{os.path.join(self.dir_data, "drama.css")}" \
            --metadata title="{self.title}" \
            --metadata author="{self.author}" \
            --epub-title-page=false \
            {" ".join(self.chapters)} '''  # TODO --epub-cover-image
        os.system(command)

    def run(self):
        """Download, clean, and convert the whole book to an EPUB."""
        # TODO include images flag
        self.get_meta()
        # BUG FIX: removed a dead `map(lambda ...)` call here — map() is
        # lazy and its result was never consumed, so it did nothing; the
        # loop below already downloads every chapter.
        self.chapters = []
        for item in self.toc:
            item_url = os.path.join(self.root, item['href'])
            self.save_page(url=item_url)
            parsed_url = urlparse(item_url)
            # wget saves the page under netloc/path relative to the cwd
            filepath = parsed_url.netloc + parsed_url.path
            self.clean_page(filepath)
            self.chapters.append(item['href'])
        self.create_epub(f'{self.title} - {self.author}.epub')
|
||||
|
||||
|
||||
|
||||
def main():
    """CLI entry point: convert the book at the URL given as the first argument."""
    # Fail with a clear usage message instead of a bare IndexError
    # when the script is invoked without an argument.
    if len(sys.argv) < 2:
        sys.exit("usage: convert.py <book-url>")
    g = GBConvert(sys.argv[1])
    g.run()
|
||||
|
||||
|
||||
# Run the converter CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
25
src/test.py
Normal file
25
src/test.py
Normal file
@@ -0,0 +1,25 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
import os
|
||||
|
||||
from convert import GBConvert
|
||||
|
||||
def main():
    """Walk the full works index on projekt-gutenberg.org and build a GBConvert per book.

    Raises Exception if the index page cannot be fetched.
    """
    allbooks_relative_url = '/info/texte/allworka.html'
    root_url = 'https://www.projekt-gutenberg.org'
    allbooks_url = root_url + allbooks_relative_url
    response = requests.get(allbooks_url)
    # BUG FIX: the error message previously interpolated `self.root`, which
    # does not exist in this module-level function — any non-200 response
    # raised a NameError instead of the intended Exception.
    if response.status_code != 200:
        raise Exception(f'Couldnt fetch root page {allbooks_url}')
    soup = BeautifulSoup(response.content, 'html.parser')
    # assumes the first <dl> element holds the complete list of book links
    books = soup.find('dl').find_all('a')
    for book in books:
        book_url_relative = book.get('href')
        if book_url_relative is not None:
            # presumably strips a leading "../.." (5 chars) before joining
            # with the site root — TODO confirm against actual hrefs
            book_url = root_url + os.path.dirname(book_url_relative)[5:]
            gb = GBConvert(book_url)
            # NOTE(review): gb.run() is never called here — confirm whether
            # constructing the converter without running it is intentional.
|
||||
|
||||
|
||||
# Run the crawler only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user