chore: error handling

This commit is contained in:
eneller
2025-03-04 18:28:17 +01:00
parent 00f6cef743
commit c78aac28ab

View File

@@ -23,23 +23,39 @@ class Book():
class GBConvert(): class GBConvert():
def __init__(self,
             url:str,
             author:str = None,
             title:str = None,
             standalone = False,
             ):
    """Set up a conversion job for the book that *url* belongs to.

    url: address of a page of the book; its parent directory
        (os.path.dirname(url)) is used as the book root.
    author: optional author override; when falsy, parse_meta() fills it in.
    title: optional title override; when falsy, parse_meta() fills it in.
    standalone: when true, run() wraps the chapter loop in a tqdm progress bar.

    The constructor finishes by calling parse_meta(), which issues an HTTP
    request for the book root and calls raise_for_status() on the response.
    """
    # NOTE move non-code files to data folder
    package_files = pkg_resources.files('epub2go')
    self.style_path_drama = package_files.joinpath("drama.css")
    # Read through the resource object itself instead of the builtin open():
    # the handle is closed for us, the encoding no longer depends on the
    # locale, and no real filesystem path is required.
    # NOTE(review): assumes pkg_resources is importlib.resources (or the
    # importlib_resources backport), whose files() returns a Traversable - confirm.
    self.blocklist = package_files.joinpath('blocklist.txt').read_text(encoding='utf-8').splitlines()
    self.root = os.path.dirname(url)
    self.url = urlparse(self.root)
    # download/output directory mirrors host + path of the book root
    self.output = self.url.netloc + self.url.path
    self.standalone = standalone
    self.author = author
    self.title = title
    self.chapters = []
    # fetch metadata and table of contents right away so the instance is ready to use
    self.parse_meta()
def parse_meta(self):
    """Fetch the book root page and derive author, title and table of contents.

    self.author and self.title are only looked up when they have not been
    supplied already; if the corresponding <meta> tag (or its content
    attribute) is missing, the placeholders "UnknownAuthor" / "UnknownTitle"
    are used. self.toc is set to every <a> inside the first <ul> of the page.

    Propagates whatever requests.get() raises, and the HTTP error raised by
    raise_for_status() for a non-success response. Raises AttributeError when
    the page contains no <ul> element.
    """
    response = requests.get(self.root)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # TODO allow setting these from interactive mode where those parameters are figured out from the list
    if not self.author:
        try:
            self.author = soup.find('meta', {'name': 'author'})['content']
        # find() returns None when the tag is absent (-> TypeError on subscript);
        # a tag without a content attribute raises KeyError. Anything else is a
        # real error and must not be silenced.
        except (TypeError, KeyError):
            self.author = "UnknownAuthor"
    if not self.title:
        try:
            self.title = soup.find('meta', {'name': 'title'})['content']
        except (TypeError, KeyError):
            self.title = "UnknownTitle"
    self.toc = soup.find('ul').find_all('a')
def parse_toc_entry(self, entry): def parse_toc_entry(self, entry):
@@ -47,16 +63,17 @@ class GBConvert():
self.save_page(url) self.save_page(url)
return url return url
def parse_page(self,file_path):
    """Apply the blocklist to the HTML file at *file_path*, rewriting it in place.

    Every element matched by one of the CSS selectors in self.blocklist is
    removed from the document, and the file is overwritten with the result.
    """
    #TODO clean up file opening, mmap?
    with open(file_path, 'r+') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
        for blocker in self.blocklist:
            for item in soup.select(blocker):
                item.decompose()
        # read() left the position at EOF; without rewinding, the filtered
        # markup would be appended after the original content.
        f.seek(0)
        f.write(str(soup))
        # the filtered document is usually shorter - drop the stale tail
        f.truncate()
def create_epub(self, filename='out.epub')-> int:
def create_epub(self, filename='out.epub'):
#TODO --epub-cover-image #TODO --epub-cover-image
#TODO toc if it isn't described by <h> tags, e.g. https://www.projekt-gutenberg.org/adlersfe/maskenba/ #TODO toc if it isn't described by <h> tags, e.g. https://www.projekt-gutenberg.org/adlersfe/maskenba/
command = f'''pandoc -f html -t epub \ command = f'''pandoc -f html -t epub \
@@ -67,7 +84,7 @@ class GBConvert():
--metadata author="{self.author}" \ --metadata author="{self.author}" \
--epub-title-page=false \ --epub-title-page=false \
{" ".join(self.chapters)} ''' {" ".join(self.chapters)} '''
return subprocess.Popen(shlex.split(command), cwd=self.output) return subprocess.Popen(shlex.split(command), cwd=self.output).returncode
def save_page(self, url): def save_page(self, url):
# https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing # https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
@@ -82,7 +99,6 @@ class GBConvert():
def run(self): def run(self):
#TODO include images flag #TODO include images flag
self.parse_meta()
# download all files in toc (chapters) # download all files in toc (chapters)
for item in (tqdm(self.toc) if self.standalone else self.toc): for item in (tqdm(self.toc) if self.standalone else self.toc):
item_url = self.parse_toc_entry(item) item_url = self.parse_toc_entry(item)