From a0e463bc58b1b6a5be95c08775cf354eae4321b7 Mon Sep 17 00:00:00 2001 From: BasioMeusPuga Date: Thu, 14 Jun 2018 16:10:27 -0400 Subject: [PATCH] Speed up file addition Improve fb2 parser Fix extension checking --- TODO | 5 ++-- lector/parsers/epub.py | 1 + lector/parsers/fb2.py | 9 ++----- lector/readers/read_epub.py | 1 - lector/readers/read_fb2.py | 54 ++++++++++++++++++++++++++----------- lector/sorter.py | 18 ++++++++----- 6 files changed, 56 insertions(+), 32 deletions(-) diff --git a/TODO b/TODO index 9c09326..9dfd4cb 100644 --- a/TODO +++ b/TODO @@ -77,6 +77,8 @@ TODO ✓ mobi, azw support Limit the extra files produced by KindleUnpack Have them save to memory + ✓ fb2 support + Images need to show up in their placeholders Other: ✓ Define every widget in code Bugs: @@ -84,7 +86,6 @@ TODO Clean up 'switch' page layout Colors aren't loaded properly for annotation previews Cover page shouldn't be scolled midway - It's possible the addition function is also parsing the whole book. Secondary: Graphical themes @@ -98,7 +99,7 @@ TODO Use embedded fonts + CSS Scrolling: Smooth / By Line Shift to logging instead of print statements - txt, doc, chm, djvu, fb2 support + txt, doc, chm, djvu support Include icons for filetype emblems Comic view modes Continuous paging diff --git a/lector/parsers/epub.py b/lector/parsers/epub.py index 39da0d9..58eaa8c 100644 --- a/lector/parsers/epub.py +++ b/lector/parsers/epub.py @@ -58,6 +58,7 @@ class ParseEPUB: def get_contents(self): zipfile.ZipFile(self.filename).extractall(self.extract_path) + self.book_ref.parse_toc() self.book_ref.parse_chapters(temp_dir=self.extract_path) file_settings = { 'images_only': False} diff --git a/lector/parsers/fb2.py b/lector/parsers/fb2.py index f777a6b..ffb5a8f 100644 --- a/lector/parsers/fb2.py +++ b/lector/parsers/fb2.py @@ -15,7 +15,6 @@ # along with this program. If not, see . import os -import zipfile from lector.readers.read_fb2 import FB2 @@ -56,12 +55,8 @@ class ParseFB2: return self.book['tags'] def get_contents(self): - # TODO - # Make this save images to the temp path - # Relative file paths should then point there - # zipfile.ZipFile(self.filename).extractall(self.extract_path) - - # self.book_ref.parse_chapters(temp_dir=self.extract_path) + os.makedirs(self.extract_path, exist_ok=True) # Manual creation is required here + self.book_ref.parse_chapters(temp_dir=self.extract_path) file_settings = { 'images_only': False} return self.book['book_list'], file_settings diff --git a/lector/readers/read_epub.py b/lector/readers/read_epub.py index 537d6a1..62b7113 100644 --- a/lector/readers/read_epub.py +++ b/lector/readers/read_epub.py @@ -40,7 +40,6 @@ class EPUB: return False # No (valid) opf was found so processing cannot continue self.generate_book_metadata(contents_path) - self.parse_toc() except: # Not specifying an exception type here may be justified return False diff --git a/lector/readers/read_fb2.py b/lector/readers/read_fb2.py index e54a7ea..62a4d44 100644 --- a/lector/readers/read_fb2.py +++ b/lector/readers/read_fb2.py @@ -48,25 +48,29 @@ class FB2: return True def generate_book_metadata(self): - self.book['title'] = os.path.splitext( - os.path.basename(self.filename))[0] - self.book['author'] = 'Unknown' self.book['isbn'] = None self.book['tags'] = None self.book['cover'] = None - self.book['year'] = 9999 self.book['book_list'] = [] - # TODO - # Look for other components of book metadata here - for i in self.xml.find_all(): + # All metadata can be parsed in one pass + all_tags = self.xml.find('description') - if i.name == 'section': - for j in i: - if j.name == 'title': - this_title = j.text - self.book['book_list'].append( - (this_title, str(i))) + self.book['title'] = all_tags.find('book-title').text + if self.book['title'] == '' or self.book['title'] is None: + self.book['title'] = os.path.splitext( + os.path.basename(self.filename))[0] + + self.book['author'] = all_tags.find('author').getText(separator=' ').replace('\n', ' ') + if self.book['author'] == '' or self.book['author'] is None: + self.book['author'] = 'Unknown' + + # TODO + # Account for other date formats + try: + self.book['year'] = int(all_tags.find('date').text) + except ValueError: + self.book['year'] = 9999 # Cover Image cover_image_xml = self.xml.find('coverpage') @@ -75,8 +79,26 @@ class FB2: cover_image_data = self.xml.find_all('binary') for i in cover_image_data: - - # TODO - # Account for other images as well if cover_image_name.endswith(i.get('id')): self.book['cover'] = base64.decodebytes(i.text.encode()) + + def parse_chapters(self, temp_dir): + # There's no need to parse the TOC separately because + # everything is linear + for i in self.xml.find_all('section'): + for j in i: + if j.name == 'title': + this_title = j.getText(separator=' ') + self.book['book_list'].append( + (this_title, str(i))) + + # Extract all images to the temp_dir + for i in self.xml.find_all('binary'): + this_image_name = i.get('id') + this_image_path = os.path.join(temp_dir, this_image_name) + try: + this_image_data = base64.decodebytes(i.text.encode()) + with open(this_image_path, 'wb') as outimage: + outimage.write(this_image_data) + except AttributeError: + pass diff --git a/lector/sorter.py b/lector/sorter.py index b016798..077344b 100644 --- a/lector/sorter.py +++ b/lector/sorter.py @@ -175,15 +175,21 @@ class BookSorter: print(f'{os.path.basename(filename)} is already in database') return - # Using os.extsep like so allows for file extensions with multiple dots - file_extension = os.path.basename(filename).split(os.extsep, 1)[1] - try: - # Get the requisite parser from the sorter dict - book_ref = sorter[file_extension](filename, self.temp_dir, file_md5) - except KeyError: + # This allows for eliminating issues with filenames that have + # a dot in them. All hail the roundabout fix. + valid_extension = False + for i in sorter: + if os.path.basename(filename).endswith(i): + file_extension = i + valid_extension = True + break + + if not valid_extension: print(filename + ' has an unsupported extension') return + book_ref = sorter[file_extension](filename, self.temp_dir, file_md5) + # Everything following this is standard # None values are accounted for here book_ref.read_book()