From 3cd75807f92bc7a6786282de4e5366cbc6666d13 Mon Sep 17 00:00:00 2001 From: BasioMeusPuga Date: Sun, 10 Feb 2019 17:58:35 +0530 Subject: [PATCH] Fix MOBI parser Update Kindleunpack Discover new and exciting bugs --- TODO | 4 +- lector/KindleUnpack/kindleunpack.py | 8 ++-- lector/KindleUnpack/mobi_k8proc.py | 2 + lector/KindleUnpack/mobiml2xhtml.py | 16 +++---- lector/parsers/mobi.py | 67 +++++++++++------------------ lector/readers/read_epub.py | 49 +++++++++++++++------ lector/readers/read_fb2.py | 2 +- lector/sorter.py | 19 +++++--- requirements.txt | 5 ++- 9 files changed, 96 insertions(+), 76 deletions(-) diff --git a/TODO b/TODO index bddc2ae..08ebecc 100644 --- a/TODO +++ b/TODO @@ -93,12 +93,14 @@ TODO Deselecting all directories in the settings dialog also filters out manually added books Last line in QTextBrowser should never be cut off Does image alignment need to be centered? - Bookmark name for a page that's not on the TOC or has nothing before + Bookmark name for a page that's not on the TOC and has nothing before Screen position still keeps jumping when inside a paragraph Better recursion needed for fb2 toc Initial sort by author in tableview Last column not filling up tableview Comic view mode changing does not work for newly added books + Ctrl + A reports 10 times the number of books selected for deletion + Ordering for non TOC chapters is beyond borked Secondary: Tab tooltip diff --git a/lector/KindleUnpack/kindleunpack.py b/lector/KindleUnpack/kindleunpack.py index 41fa97d..628f256 100644 --- a/lector/KindleUnpack/kindleunpack.py +++ b/lector/KindleUnpack/kindleunpack.py @@ -6,7 +6,7 @@ from __future__ import unicode_literals, division, absolute_import, print_functi import os -__path__ = ["lib", os.path.dirname(__file__), "kindleunpack"] +__path__ = ["lib", os.path.dirname(os.path.realpath(__file__)), "kindleunpack"] import sys import codecs @@ -140,6 +140,8 @@ if PY2: # 0.76 pre-release version only fix name related issues in opf by not using 
original file name in mobi7 # 0.77 bug fix for unpacking HDImages with included Fonts # 0.80 converted to work with both python 2.7 and Python 3.3 and later +# 0.81 various fixes +# 0.82 Handle calibre-generated mobis that can have skeletons with no fragments DUMP = False """ Set to True to dump all possible information. """ @@ -847,7 +849,7 @@ def process_all_mobi_headers(files, apnxfile, sect, mhlst, K8Boundary, k8only=Fa return -def unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=True, dodump=False, dowriteraw=False, dosplitcombos=False): +def unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=False, dodump=False, dowriteraw=False, dosplitcombos=False): global DUMP global WRITE_RAW_DATA global SPLIT_COMBO_MOBIS @@ -949,7 +951,7 @@ def main(argv=unicode_argv()): global WRITE_RAW_DATA global SPLIT_COMBO_MOBIS - print("KindleUnpack v0.80") + print("KindleUnpack v0.82") print(" Based on initial mobipocket version Copyright © 2009 Charles M. Hannum ") print(" Extensive Extensions and Improvements Copyright © 2009-2014 ") print(" by: P. Durrant, K. Hendricks, S. 
Siebert, fandrieu, DiapDealer, nickredding, tkeo.") diff --git a/lector/KindleUnpack/mobi_k8proc.py b/lector/KindleUnpack/mobi_k8proc.py index 04ec028..5b8274e 100644 --- a/lector/KindleUnpack/mobi_k8proc.py +++ b/lector/KindleUnpack/mobi_k8proc.py @@ -180,9 +180,11 @@ class K8Processor: fragptr = 0 baseptr = 0 cnt = 0 + filename = 'part%04d.xhtml' % cnt for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl: baseptr = skelpos + skellen skeleton = text[skelpos: baseptr] + aidtext = "0" for i in range(fragcnt): [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[fragptr] aidtext = idtext[12:-2] diff --git a/lector/KindleUnpack/mobiml2xhtml.py b/lector/KindleUnpack/mobiml2xhtml.py index 7d89159..85be8ba 100755 --- a/lector/KindleUnpack/mobiml2xhtml.py +++ b/lector/KindleUnpack/mobiml2xhtml.py @@ -246,13 +246,13 @@ class MobiMLConverter(object): # handle case of end tag with no beginning by injecting empty begin tag taginfo = ('begin', tname, None) htmlstr += self.processtag(taginfo) - print(" - fixed by injecting empty start tag ", tname) + print " - fixed by injecting empty start tag ", tname self.path.append(tname) elif len(self.path) > 1 and tname == self.path[-2]: # handle case of dangling missing end taginfo = ('end', self.path[-1], None) htmlstr += self.processtag(taginfo) - print(" - fixed by injecting end tag ", self.path[-1]) + print " - fixed by injecting end tag ", self.path[-1] self.path.pop() self.path.pop() @@ -504,18 +504,18 @@ def main(argv=sys.argv): infile = argv[1] try: - print('Converting Mobi Markup Language to XHTML') + print 'Converting Mobi Markup Language to XHTML' mlc = MobiMLConverter(infile) - print('Processing ...') + print 'Processing ...' 
htmlstr, css, cssname = mlc.processml() outname = infile.rsplit('.',1)[0] + '_converted.html' file(outname, 'wb').write(htmlstr) file(cssname, 'wb').write(css) - print('Completed') - print('XHTML version of book can be found at: ', outname) + print 'Completed' + print 'XHTML version of book can be found at: ' + outname - except ValueError as e: - print("Error: %s" % e) + except ValueError, e: + print "Error: %s" % e return 1 return 0 diff --git a/lector/parsers/mobi.py b/lector/parsers/mobi.py index 6bb8d17..98ee3dc 100644 --- a/lector/parsers/mobi.py +++ b/lector/parsers/mobi.py @@ -14,8 +14,9 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. -# This module parses Amazon ebooks using KindleUnpack to first create an -# epub that is then read the usual way +# TODO +# See if it's possible to just feed the +# unzipped mobi7 file into the EPUB parser module import os import sys @@ -30,73 +31,53 @@ logger = logging.getLogger(__name__) class ParseMOBI: + # This module parses Amazon ebooks using KindleUnpack to first create an + # epub and then read the usual way + def __init__(self, filename, temp_dir, file_md5): - self.book_ref = None self.book = None self.filename = filename self.epub_filepath = None - self.split_large_xml = False self.temp_dir = temp_dir - self.extract_dir = os.path.join(temp_dir, file_md5) + self.extract_path = os.path.join(temp_dir, file_md5) def read_book(self): with HidePrinting(): - KindleUnpack.unpackBook(self.filename, self.extract_dir) + KindleUnpack.unpackBook(self.filename, self.extract_path) epub_filename = os.path.splitext( os.path.basename(self.filename))[0] + '.epub' - self.epub_filepath = os.path.join( - self.extract_dir, 'mobi8', epub_filename) + self.extract_path, 'mobi8', epub_filename) + if not os.path.exists(self.epub_filepath): - zip_dir = os.path.join(self.extract_dir, 'mobi7') + zip_dir = os.path.join(self.extract_path, 'mobi7') zip_file = os.path.join( - 
self.extract_dir, epub_filename) + self.extract_path, epub_filename) self.epub_filepath = shutil.make_archive(zip_file, 'zip', zip_dir) - self.split_large_xml = True - self.book_ref = EPUB(self.epub_filepath, self.temp_dir) - self.book_ref.generate_metadata() - self.book_ref.generate_toc() - self.book_ref.generate_content() - self.book = self.book_ref.book - return True + self.book = EPUB(self.epub_filepath, self.temp_dir) - def get_title(self): - return self.book['title'] + def generate_metadata(self): + self.book.generate_metadata() + return self.book.metadata - def get_author(self): - return self.book['author'] + def generate_content(self): + zipfile.ZipFile(self.epub_filepath).extractall(self.extract_path) - def get_year(self): - return self.book['year'] - - def get_cover_image(self): - return self.book['cover'] - - def get_isbn(self): - return self.book['isbn'] - - def get_tags(self): - return self.book['tags'] - - def get_contents(self): - return - extract_path = os.path.join(self.extract_dir) - zipfile.ZipFile(self.epub_filepath).extractall(extract_path) - - self.book_ref.parse_chapters( - temp_dir=self.temp_dir, split_large_xml=self.split_large_xml) + self.book.generate_toc() + self.book.generate_content() toc = [] content = [] - for count, i in enumerate(self.book['book_list']): - toc.append((1, i[0], count + 1)) - content.append(i[1]) + for count, i in enumerate(self.book.content): + toc.append((1, i[1], count + 1)) + content.append(i[2]) # Return toc, content, images_only return toc, content, False + class HidePrinting: def __enter__(self): self._original_stdout = sys.stdout diff --git a/lector/readers/read_epub.py b/lector/readers/read_epub.py index 9f42ad4..0dddf45 100644 --- a/lector/readers/read_epub.py +++ b/lector/readers/read_epub.py @@ -18,7 +18,6 @@ # See if inserting chapters not in the toc.ncx can be avoided # Account for stylesheets... 
eventually # Everything needs logging -# Mobipocket files import os import zipfile @@ -68,6 +67,7 @@ class EPUB: for i in presumptive_names: packagefile = self.find_file(i) if packagefile: + logger.info('Using presumptive package file: ' + self.book_filename) break packagefile_data = self.zip_file.read(packagefile) @@ -218,11 +218,19 @@ class EPUB: # These are simply ids that correspond to the actual item # as mentioned in the manifest - which is a comprehensive # list of files - chapters_in_spine = [ - i['@idref'] - for i in self.opf_dict['package']['spine']['itemref']] + try: + # Multiple chapters + chapters_in_spine = [ + i['@idref'] + for i in self.opf_dict['package']['spine']['itemref']] + except TypeError: + # Single chapter - Large xml + chapters_in_spine = [ + self.opf_dict['package']['spine']['itemref']['@idref']] # Next, find items and ids from the manifest + # This might error out in case there's only one item in + # the manifest. Remember that for later. chapters_from_manifest = { i['@id']: i['@href'] for i in self.opf_dict['package']['manifest']['item']} @@ -236,10 +244,12 @@ class EPUB: except KeyError: pass - chapter_title = 1 toc_chapters = [ unquote(i[2].split('#')[0]) for i in self.content] + # TODO + # This totally borks the order + last_valid_index = -2 # Yes, but why? 
for i in spine_final: if not i in toc_chapters: @@ -251,10 +261,11 @@ class EPUB: except ValueError: last_valid_index += 1 + # Chapters are currently named None + # Blank chapters will later be removed + # and the None will be replaced by a number self.content.insert( - last_valid_index + 1, - [1, str(chapter_title), i]) - chapter_title += 1 + last_valid_index + 1, [1, None, i]) # Parse split chapters as below # They can be picked up during the iteration through the toc @@ -316,13 +327,28 @@ class EPUB: self.content[count][2] = chapter_content # Cleanup content by removing null chapters - self.content = [ - i for i in self.content if i[2]] + unnamed_chapter_title = 1 + content_copy = [] + for i in self.content: + if i[2]: + chapter_title = i[1] + if not chapter_title: + chapter_title = unnamed_chapter_title + unnamed_chapter_title += 1 + content_copy.append(( + i[0], str(chapter_title), i[2])) + self.content = content_copy + # TODO + # This can probably be circumvented by shifting the extraction + # to this module and simply getting the path to the cover + + # Get cover image and put it in its place + # I imagine this involves saying nasty things to it cover_image = self.generate_book_cover() if cover_image: cover_path = os.path.join( - self.temp_dir, os.path.basename(self.book_filename)) + '- cover' + self.temp_dir, os.path.basename(self.book_filename)) + ' - cover' with open(cover_path, 'wb') as cover_temp: cover_temp.write(cover_image) @@ -389,7 +415,6 @@ class EPUB: break except: logger.warning('ISBN not found: ' + self.book_filename) - pass # Book tags try: diff --git a/lector/readers/read_fb2.py b/lector/readers/read_fb2.py index 5e9bbfe..36c26c2 100644 --- a/lector/readers/read_fb2.py +++ b/lector/readers/read_fb2.py @@ -148,7 +148,7 @@ class FB2: cover_image = self.generate_book_cover() if cover_image: cover_path = os.path.join( - temp_dir, os.path.basename(self.filename)) + '- cover' + temp_dir, os.path.basename(self.filename)) + ' - cover' with 
open(cover_path, 'wb') as cover_temp: cover_temp.write(cover_image) diff --git a/lector/sorter.py b/lector/sorter.py index c058aee..d63cb31 100644 --- a/lector/sorter.py +++ b/lector/sorter.py @@ -15,7 +15,8 @@ # along with this program. If not, see <http://www.gnu.org/licenses/>. # INSTRUCTIONS -# Every parser is supposed to have the following methods. None returns are not allowed. +# Every parser is supposed to have the following methods. +# Exceptions will be caught - but that's just bad practice # read_book() - Initialize book # generate_metadata() - For addition # generate_content() - For reading @@ -67,6 +68,7 @@ else: # python-lxml - Required for everything except comics lxml_check = importlib.util.find_spec('lxml') +xmltodict_check = importlib.util.find_spec('xmltodict') if lxml_check: lxml_dependent = { 'epub': ParseEPUB, @@ -79,7 +81,7 @@ if lxml_check: 'fb2.zip': ParseFB2} sorter.update(lxml_dependent) else: - critical_sting = 'python-lxml is not installed. Only comics will load.' + critical_sting = 'python-lxml / xmltodict is not installed. Only comics will load.' 
print(critical_sting) logger.critical(critical_sting) @@ -122,8 +124,8 @@ class BookSorter: self.queue = Manager().Queue() self.processed_books = [] - if self.work_mode == 'addition': - progress_object_generator() + # if self.work_mode == 'addition': + progress_object_generator() def database_hashes(self): all_hashes_and_paths = database.DatabaseFunctions( @@ -134,7 +136,6 @@ class BookSorter: 'LIKE') if all_hashes_and_paths: - # self.hashes = [i[0] for i in all_hashes] self.hashes_and_paths = { i[0]: i[1] for i in all_hashes_and_paths} @@ -205,6 +206,12 @@ class BookSorter: book_ref = sorter[file_extension](filename, self.temp_dir, file_md5) + # None of the following have an exception type specified + # This will keep everything from crashing, but will make + # troubleshooting difficult + # TODO + # In application notifications + try: book_ref.read_book() except: @@ -248,7 +255,7 @@ class BookSorter: if self.work_mode == 'reading': try: book_breakdown = book_ref.generate_content() - except: + except KeyboardInterrupt: logger.error('Content generation error: ' + filename) return diff --git a/requirements.txt b/requirements.txt index 7ad59c7..3afba53 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ beautifulsoup4==4.7.1 -lxml==4.3.0 -PyMuPDF==1.14.7 +lxml==4.3.1 +PyMuPDF==1.14.8 PyQt5==5.11.3 PyQt5-sip==4.19.13 soupsieve==1.7.3 +xmltodict==0.11.0