refactor: split parsing and logic

This commit is contained in:
eneller
2025-03-03 21:34:04 +01:00
parent bde605cc90
commit 4d8cd00298
2 changed files with 27 additions and 24 deletions

View File

@@ -27,27 +27,22 @@ class GBConvert():
self.url = urlparse(self.root)
self.output = self.url.netloc + self.url.path
self.standalone = standalone
self.chapters = []
def get_meta(self):
def parse_meta(self):
    """Fetch the book's root page and populate author, title and toc.

    Sets ``self.author`` and ``self.title`` from the page's ``<meta>``
    tags and ``self.toc`` to the anchor tags of the first ``<ul>``
    (the table of contents). Raises ``requests.HTTPError`` on a bad
    response and ``TypeError`` if an expected meta tag is absent.
    """
    page = requests.get(self.root)
    page.raise_for_status()
    dom = BeautifulSoup(page.content, 'html.parser')
    author_tag = dom.find('meta', {'name': 'author'})
    title_tag = dom.find('meta', {'name': 'title'})
    self.author = author_tag['content']
    self.title = title_tag['content']
    # The first <ul> on the page is assumed to be the table of contents
    # — TODO confirm against the site's markup.
    self.toc = dom.find('ul').find_all('a')
def save_page(self, url):
    """Mirror one page (HTML plus its page requisites) into the CWD.

    Parameters
    ----------
    url : str
        Absolute URL of the page to download with ``wget``.
    """
    # https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
    import subprocess  # local import: the file's top-level imports are not visible here

    # Passing an argument list (shell=False) instead of os.system(f'...')
    # avoids shell injection and quoting bugs when the URL contains
    # shell metacharacters (&, ?, spaces, quotes, ...).
    subprocess.run(
        [
            'wget',
            '--timestamping',
            '--page-requisites',
            '--convert-links',
            '--tries=5',
            '--quiet',
            url,
        ],
        check=False,  # match os.system semantics: a failed download does not raise
    )
def parse_toc_entry(self, entry):
    """Download the page behind one table-of-contents anchor.

    Parameters
    ----------
    entry : bs4.Tag
        An ``<a>`` tag from the table of contents; its ``href`` is
        resolved against ``self.root``.

    Returns
    -------
    str
        The URL that was downloaded.
    """
    # NOTE(review): os.path.join is not URL-aware — an absolute href
    # (starting with '/') silently discards self.root, and the result is
    # platform-dependent. urllib.parse.urljoin would be the URL-correct
    # choice, but it resolves relative hrefs differently; confirm before
    # switching.
    target = os.path.join(self.root, entry['href'])
    self.save_page(target)
    return target
def clean_page(self,file_path):
def parse_page(self,file_path):
f = open(file_path, 'r').read()
soup = BeautifulSoup(f, 'html.parser')
for blocker in self.blocklist:
@@ -68,24 +63,31 @@ class GBConvert():
{" ".join(self.chapters)} '''#TODO --epub-cover-image
os.system(command)
def save_page(self, url):
    """Mirror one page (HTML plus its page requisites) into the CWD.

    Parameters
    ----------
    url : str
        Absolute URL of the page to download with ``wget``.
    """
    # https://superuser.com/questions/970323/using-wget-to-copy-website-with-proper-layout-for-offline-browsing
    import subprocess  # local import: the file's top-level imports are not visible here

    # Passing an argument list (shell=False) instead of os.system(f'...')
    # avoids shell injection and quoting bugs when the URL contains
    # shell metacharacters (&, ?, spaces, quotes, ...).
    subprocess.run(
        [
            'wget',
            '--timestamping',
            '--page-requisites',
            '--convert-links',
            '--tries=5',
            '--quiet',
            url,
        ],
        check=False,  # match os.system semantics: a failed download does not raise
    )
def run(self):
    """Top-level pipeline: fetch metadata, download every chapter, build the epub."""
# NOTE(review): this span is a rendered diff with pre- and post-refactor
# lines interleaved (get_meta vs parse_meta, inline save_page vs
# parse_toc_entry, clean_page vs parse_page) — as shown it is not a
# single runnable version of the method.
#TODO include images flag
# Pre-refactor line (removed by this commit):
self.get_meta()
# Pre-refactor line (removed). NOTE(review): map() is lazy in Python 3,
# so this line as written never consumed its iterator and downloaded
# nothing — the explicit for-loop below is the fix.
map(lambda x: self.save_page(os.path.join(self.root, x['href'])), self.toc)
# Post-refactor lines: reset chapter list, then populate metadata/toc.
self.chapters = []
self.parse_meta()
# download all files in toc (chapters)
# tqdm progress bar only in standalone (CLI) mode.
for item in (tqdm(self.toc) if self.standalone else self.toc):
# NOTE(review): item_title is assigned but never used below.
item_title= item.get_text()
# Pre-refactor lines (removed): inline URL resolution + download.
item_url = os.path.join(self.root, item['href'])
self.save_page(url=item_url)
# Post-refactor line: download delegated to parse_toc_entry.
item_url = self.parse_toc_entry(item)
# wget saves under netloc/path relative to the CWD, so rebuild that
# local file path from the URL.
parsed_url = urlparse(item_url)
filepath = parsed_url.netloc + parsed_url.path
# Pre-refactor pair (removed):
self.clean_page(filepath)
self.chapters.append(item['href'])
# Post-refactor pair:
self.parse_page(filepath)
self.chapters.append(os.path.basename(item_url))
self.create_epub(f'{self.title} - {self.author}.epub')
# Methods used to get a list of all books for interactive selection or scraping
def get_all_books() -> list:
response = requests.get(allbooks_url)
response.raise_for_status()
@@ -122,6 +124,7 @@ def get_all_book_tags ()-> ResultSet:
books = soup.find('dl').find_all('a')
return books
# run main cli
def main():
sys.argv.pop(0)
# non-interactive mode

2
uv.lock generated
View File

@@ -68,7 +68,7 @@ wheels = [
[[package]]
name = "epub2go"
version = "1.0"
version = "1.2"
source = { editable = "." }
dependencies = [
{ name = "beautifulsoup4" },