From c6e30b67adc90e8fbab4c838aa7793f0bc202ac9 Mon Sep 17 00:00:00 2001 From: BasioMeusPuga Date: Sun, 10 Feb 2019 06:47:51 +0530 Subject: [PATCH] Improve EPUB parser compatibility and speed Completely break MOBI parser --- TODO | 3 + lector/parsers/epub.py | 8 +- lector/parsers/mobi.py | 9 +- lector/readers/read_epub.py | 166 ++++++++++++++++++++++++++---------- 4 files changed, 135 insertions(+), 51 deletions(-) diff --git a/TODO b/TODO index b39684f..63e9598 100644 --- a/TODO +++ b/TODO @@ -96,8 +96,11 @@ TODO Bookmark name for a page that's not on the TOC or has nothing before Screen position still keeps jumping when inside a paragraph Better recursion needed for fb2 toc + Initial sort by author in tableview + Last column not filling up tableview Secondary: + Tab tooltip Additional Settings: Disable progressbar - 20% book addition speed improvement Disable cover loading when reading - Saves ~2M / book diff --git a/lector/parsers/epub.py b/lector/parsers/epub.py index 69a3309..c8db06a 100644 --- a/lector/parsers/epub.py +++ b/lector/parsers/epub.py @@ -63,5 +63,11 @@ class ParseEPUB: self.book_ref.generate_toc() self.book_ref.generate_content() + toc = [] + content = [] + for count, i in enumerate(self.book['content']): + toc.append((i[0], i[1], count + 1)) + content.append(i[2]) + # Return toc, content, images_only - return self.book['toc'], self.book['content'], False + return toc, content, False diff --git a/lector/parsers/mobi.py b/lector/parsers/mobi.py index ee2c59a..6bb8d17 100644 --- a/lector/parsers/mobi.py +++ b/lector/parsers/mobi.py @@ -55,10 +55,10 @@ class ParseMOBI: self.epub_filepath = shutil.make_archive(zip_file, 'zip', zip_dir) self.split_large_xml = True - self.book_ref = EPUB(self.epub_filepath) - contents_found = self.book_ref.read_epub() - if not contents_found: - return False + self.book_ref = EPUB(self.epub_filepath, self.temp_dir) + self.book_ref.generate_metadata() + self.book_ref.generate_toc() + self.book_ref.generate_content() self.book = self.book_ref.book return True @@ -81,6 +81,7 @@ class ParseMOBI: return self.book['tags'] def get_contents(self): + return extract_path = os.path.join(self.extract_dir) zipfile.ZipFile(self.epub_filepath).extractall(extract_path) diff --git a/lector/readers/read_epub.py b/lector/readers/read_epub.py index 3ba2eba..0050ea9 100644 --- a/lector/readers/read_epub.py +++ b/lector/readers/read_epub.py @@ -16,7 +16,6 @@ # TODO # See if inserting chapters not in the toc.ncx can be avoided -# Missing file order is messed up # Account for stylesheets... eventually # Everything needs logging # Mobipocket files @@ -25,8 +24,10 @@ import os import zipfile import logging import collections +from urllib.parse import unquote import xmltodict +from PyQt5 import QtGui from bs4 import BeautifulSoup logger = logging.getLogger(__name__) @@ -44,11 +45,14 @@ class EPUB: self.generate_references() def find_file(self, filename): + # Get rid of special characters + filename = unquote(filename) + # First, look for the file in the root of the book if filename in self.file_list: return filename - # Then, search for it elsewhere + # Then search for it elsewhere else: file_basename = os.path.basename(filename) for i in self.file_list: @@ -56,7 +60,7 @@ class EPUB: return i # If the file isn't found - logger.error(filename + ' not found') + logging.error(filename + ' not found in ' + self.book_filename) return False def generate_references(self): @@ -75,7 +79,7 @@ class EPUB: container_dict = xmltodict.parse(container_xml) packagefile = container_dict['container']['rootfiles']['rootfile']['@full-path'] else: - presumptive_names = ('content.opf', 'package.opf') + presumptive_names = ('content.opf', 'package.opf', 'volume.opf') for i in presumptive_names: packagefile = self.find_file(i) if packagefile: @@ -85,11 +89,42 @@ class EPUB: self.opf_dict = xmltodict.parse(packagefile_data) def generate_toc(self): - self.book['toc'] = [] + self.book['content'] = [] - # I'm currently going with the file always being named toc.ncx - # But this is epub. The wild west of ebook formats. - tocfile = self.find_file('toc.ncx') + def find_alternative_toc(): + toc_filename = None + toc_filename_alternative = None + manifest = self.opf_dict['package']['manifest']['item'] + + for i in manifest: + # Behold the burning hoops we're jumping through + if i['@id'] == 'ncx': + toc_filename = i['@href'] + if ('ncx' in i['@id']) or ('toc' in i['@id']): + toc_filename_alternative = i['@href'] + if toc_filename and toc_filename_alternative: + break + + if not toc_filename: + if not toc_filename_alternative: + logger.error('No ToC found for: ' + self.book_filename) + else: + toc_filename = toc_filename_alternative + + logger.info('Using alternate ToC for: ' + self.book_filename) + return toc_filename + + # Find the toc.ncx file from the manifest + # EPUBs will name literally anything, anything so try + # a less stringent approach if the first one doesn't work + # The idea is to prioritize 'toc.ncx' since this should work + # for the vast majority of books + toc_filename = 'toc.ncx' + does_toc_exist = self.find_file(toc_filename) + if not does_toc_exist: + toc_filename = find_alternative_toc() + + tocfile = self.find_file(toc_filename) tocfile_data = self.zip_file.read(tocfile) toc_dict = xmltodict.parse(tocfile_data) @@ -99,21 +134,30 @@ class EPUB: level + 1, i['navLabel']['text'], i['content']['@src']] for i in nav_node] - self.book['toc'].extend(these_contents) + self.book['content'].extend(these_contents) return if 'navPoint' in nav_node.keys(): recursor(level, nav_node['navPoint']) else: - self.book['toc'].append([ + self.book['content'].append([ level + 1, nav_node['navLabel']['text'], nav_node['content']['@src']]) navpoints = toc_dict['ncx']['navMap']['navPoint'] for top_level_nav in navpoints: - self.book['toc'].append([ + # Just one chapter + if isinstance(top_level_nav, str): + self.book['content'].append([ + 1, + navpoints['navLabel']['text'], + navpoints['content']['@src']]) + break + + # Multiple chapters + self.book['content'].append([ 1, top_level_nav['navLabel']['text'], top_level_nav['content']['@src']]) @@ -124,10 +168,19 @@ class EPUB: def get_chapter_content(self, chapter_file): this_file = self.find_file(chapter_file) if this_file: - return self.zip_file.read(this_file).decode() + chapter_content = self.zip_file.read(this_file).decode() + + # Generate a None return for a blank chapter + # These will be removed from the contents later + contentDocument = QtGui.QTextDocument(None) + contentDocument.setHtml(chapter_content) + contentText = contentDocument.toPlainText().replace('\n', '') + if contentText == '': + chapter_content = None + + return chapter_content else: - print('Not found: ' + chapter_file) - return chapter_file + return 'Possible parse error: ' + chapter_file def parse_split_chapters(self, chapters_with_split_content): self.book['split_chapters'] = {} @@ -149,10 +202,13 @@ class EPUB: markup_split = str(soup).split(str(this_tag)) soup = BeautifulSoup(markup_split[0], 'lxml') - this_markup = BeautifulSoup( - str(this_tag) + markup_split[1], 'lxml') - self.book['split_chapters'][chapter_file][this_anchor] = str(this_markup) + # If the tag is None, it probably means the content is overlapping + # Skipping the insert is the way forward + if this_tag: + this_markup = BeautifulSoup( + str(this_tag).strip() + markup_split[1], 'lxml') + self.book['split_chapters'][chapter_file][this_anchor] = str(this_markup) # Remaining markup is assigned here self.book['split_chapters'][chapter_file]['top_level'] = str(soup) @@ -180,12 +236,11 @@ class EPUB: except KeyError: pass - # TODO - # Check what happens in case missing chapters are either - # at the beginning or the end of the book chapter_title = 1 - toc_chapters = [i[2] for i in self.book['toc']] - last_valid_index = 0 + toc_chapters = [ + unquote(i[2].split('#')[0]) for i in self.book['content']] + + last_valid_index = -2 # Yes, but why? for i in spine_final: if not i in toc_chapters: previous_chapter = spine_final[spine_final.index(i) - 1] @@ -195,7 +250,8 @@ class EPUB: last_valid_index = previous_chapter_toc_index except ValueError: last_valid_index += 1 - self.book['toc'].insert( + + self.book['content'].insert( last_valid_index + 1, [1, str(chapter_title), i]) chapter_title += 1 @@ -203,7 +259,7 @@ class EPUB: # Parse split chapters as below # They can be picked up during the iteration through the toc chapters_with_split_content = {} - for i in self.book['toc']: + for i in self.book['content']: if '#' in i[2]: this_split = i[2].split('#') chapter = this_split[0] @@ -222,13 +278,8 @@ class EPUB: # In case a split chapter is encountered, get its content # from the split_chapters dictionary # What could possibly go wrong? - - # The content list is separated from the toc list because - # the mupdf library returns its own toc a certain way and - # this keeps things uniform split_chapters = self.book['split_chapters'] - toc_copy = self.book['toc'][:] - self.book['content'] = [] + toc_copy = self.book['content'][:] # Put the book into the book for count, i in enumerate(toc_copy): @@ -263,9 +314,11 @@ class EPUB: else: chapter_content = self.get_chapter_content(chapter_file) - # The count + 2 is an adjustment due to the cover being inserted below - self.book['toc'][count][2] = count + 2 - self.book['content'].append(chapter_content) + self.book['content'][count][2] = chapter_content + + # Cleanup content by removing null chapters + self.book['content'] = [ + i for i in self.book['content'] if i[2]] self.generate_book_cover() if self.book['cover']: @@ -274,51 +327,72 @@ class EPUB: with open(cover_path, 'wb') as cover_temp: cover_temp.write(self.book['cover']) - self.book['toc'].insert(0, (1, 'Cover', 1)) + # There's probably some rationale to doing an insert here + # But a replacement seems... neater self.book['content'].insert( - 0, (f'
Cover
')) + 0, (1, 'Cover', f'
Cover
')) def generate_metadata(self): metadata = self.opf_dict['package']['metadata'] + def flattener(this_object): + if isinstance(this_object, collections.OrderedDict): + return this_object['#text'] + + if isinstance(this_object, list): + if isinstance(this_object[0], collections.OrderedDict): + return this_object[0]['#text'] + else: + return this_object[0] + + if isinstance(this_object, str): + return this_object + # There are no exception types specified below # This is on purpose and makes me long for the days # of simpler, happier things. # Book title try: - self.book['title'] = metadata['dc:title'] - if isinstance(self.book['title'], collections.OrderedDict): - self.book['title'] = metadata['dc:title']['#text'] + self.book['title'] = flattener(metadata['dc:title']) except: - print('Title parse error') self.book['title'] = os.path.splitext( os.path.basename(self.book_filename))[0] # Book author try: - self.book['author'] = metadata['dc:creator']['#text'] + self.book['author'] = flattener(metadata['dc:creator']) except: self.book['author'] = 'Unknown' # Book year try: - self.book['year'] = int(metadata['dc:date'][:4]) + self.book['year'] = int(flattener(metadata['dc:date'])[:4]) except: self.book['year'] = 9999 # Book isbn + # Both one and multiple schema self.book['isbn'] = None try: - for i in metadata['dc:identifier']: - if i['@opf:scheme'].lower() == 'isbn': - self.book['isbn'] = i['#text'] - except: - pass + scheme = metadata['dc:identifier']['@opf:scheme'].lower() + if scheme.lower() == 'isbn': + self.book['isbn'] = metadata['dc:identifier']['#text'] + + except (TypeError, KeyError): + try: + for i in metadata['dc:identifier']: + if i['@opf:scheme'].lower() == 'isbn': + self.book['isbn'] = i['#text'] + break + except: + pass # Book tags try: self.book['tags'] = metadata['dc:subject'] + if isinstance(self.book['tags'], str): + self.book['tags'] = [self.book['tags']] except: self.book['tags'] = []