Fix MOBI parser

Update KindleUnpack. Discover new and exciting bugs.
2019-02-10 17:58:35 +05:30
parent f6f9d01060
commit 3cd75807f9
9 changed files with 96 additions and 76 deletions
--- a/4
+++ b/4
@@ -93,12 +93,14 @@ TODO
        Deselecting all directories in the settings dialog also filters out manually added books
        Last line in QTextBrowser should never be cut off
        Does image alignment need to be centered?
-        Bookmark name for a page that's not on the TOC or has nothing before
+        Bookmark name for a page that's not on the TOC and has nothing before
        Screen position still keeps jumping when inside a paragraph
        Better recursion needed for fb2 toc
        Initial sort by author in tableview
        Last column not filling up tableview
        Comic view mode changing does not work for newly added books
        Ctrl + A reports 10 times the number of books selected for deletion
        Ordering for non TOC chapters is beyond borked
    Secondary:
        Tab tooltip
--- a/lector/KindleUnpack/kindleunpack.py
+++ b/lector/KindleUnpack/kindleunpack.py
@@ -6,7 +6,7 @@ from __future__ import unicode_literals, division, absolute_import, print_functi
 import os
-__path__ = ["lib", os.path.dirname(__file__), "kindleunpack"]
+__path__ = ["lib", os.path.dirname(os.path.realpath(__file__)), "kindleunpack"]
 import sys
 import codecs
@@ -140,6 +140,8 @@ if PY2:
 #  0.76   pre-release version only fix name related issues in opf by not using original file name in mobi7
 #  0.77   bug fix for unpacking HDImages with included Fonts
 #  0.80   converted to work with both python 2.7 and Python 3.3 and later
 #  0.81   various fixes
 #  0.82   Handle calibre-generated mobis that can have skeletons with no fragments
 DUMP = False
 """ Set to True to dump all possible information. """
@@ -847,7 +849,7 @@ def process_all_mobi_headers(files, apnxfile, sect, mhlst, K8Boundary, k8only=Fa
    return
-def unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=True, dodump=False, dowriteraw=False, dosplitcombos=False):
+def unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=False, dodump=False, dowriteraw=False, dosplitcombos=False):
    global DUMP
    global WRITE_RAW_DATA
    global SPLIT_COMBO_MOBIS
@@ -949,7 +951,7 @@ def main(argv=unicode_argv()):
    global WRITE_RAW_DATA
    global SPLIT_COMBO_MOBIS
-    print("KindleUnpack v0.80")
+    print("KindleUnpack v0.82")
    print("   Based on initial mobipocket version Copyright © 2009 Charles M. Hannum <root@ihack.net>")
    print("   Extensive Extensions and Improvements Copyright © 2009-2014 ")
    print("       by:  P. Durrant, K. Hendricks, S. Siebert, fandrieu, DiapDealer, nickredding, tkeo.")
--- a/lector/KindleUnpack/mobi_k8proc.py
+++ b/lector/KindleUnpack/mobi_k8proc.py
@@ -180,9 +180,11 @@ class K8Processor:
        fragptr = 0
        baseptr = 0
        cnt = 0
        filename = 'part%04d.xhtml' % cnt
        for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl:
            baseptr = skelpos + skellen
            skeleton = text[skelpos: baseptr]
            aidtext = "0"
            for i in range(fragcnt):
                [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[fragptr]
                aidtext = idtext[12:-2]
--- a/lector/KindleUnpack/mobiml2xhtml.py
+++ b/lector/KindleUnpack/mobiml2xhtml.py
@@ -246,13 +246,13 @@ class MobiMLConverter(object):
                            # handle case of end tag with no beginning by injecting empty begin tag
                            taginfo = ('begin', tname, None)
                            htmlstr += self.processtag(taginfo)
-                            print("     - fixed by injecting empty start tag ", tname)
+                            print "     - fixed by injecting empty start tag ", tname
                            self.path.append(tname)
                        elif len(self.path) >  1 and tname == self.path[-2]:
                            # handle case of dangling missing end
                            taginfo = ('end', self.path[-1], None)
                            htmlstr += self.processtag(taginfo)
-                            print("     - fixed by injecting end tag ", self.path[-1])
+                            print "     - fixed by injecting end tag ", self.path[-1]
                            self.path.pop()
                    self.path.pop()
@@ -504,18 +504,18 @@ def main(argv=sys.argv):
        infile = argv[1]
    try:
-        print('Converting Mobi Markup Language to XHTML')
+        print 'Converting Mobi Markup Language to XHTML'
        mlc = MobiMLConverter(infile)
-        print('Processing ...')
+        print 'Processing ...'
        htmlstr, css, cssname = mlc.processml()
        outname = infile.rsplit('.',1)[0] + '_converted.html'
        file(outname, 'wb').write(htmlstr)
        file(cssname, 'wb').write(css)
-        print('Completed')
+        print 'Completed'
-        print('XHTML version of book can be found at: ', outname)
+        print 'XHTML version of book can be found at: ' + outname
-    except ValueError as e:
+    except ValueError, e:
-        print("Error: %s" % e)
+        print "Error: %s" % e
        return 1
    return 0
--- a/lector/parsers/mobi.py
+++ b/lector/parsers/mobi.py
@@ -14,8 +14,9 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
-# This module parses Amazon ebooks using KindleUnpack to first create an
+# TODO
-# epub that is then read the usual way
+# See if it's possible to just feed the
 # unzipped mobi7 file into the EPUB parser module
 import os
 import sys
@@ -30,73 +31,53 @@ logger = logging.getLogger(__name__)
 class ParseMOBI:
    # This module parses Amazon ebooks using KindleUnpack to first create an
    # epub, which is then read the usual way
    def __init__(self, filename, temp_dir, file_md5):
        self.book_ref = None
        self.book = None
        self.filename = filename
        self.epub_filepath = None
        self.split_large_xml = False
        self.temp_dir = temp_dir
-        self.extract_dir = os.path.join(temp_dir, file_md5)
+        self.extract_path = os.path.join(temp_dir, file_md5)
    def read_book(self):
        with HidePrinting():
-            KindleUnpack.unpackBook(self.filename, self.extract_dir)
+            KindleUnpack.unpackBook(self.filename, self.extract_path)
        epub_filename = os.path.splitext(
            os.path.basename(self.filename))[0] + '.epub'
        self.epub_filepath = os.path.join(
-            self.extract_dir, 'mobi8', epub_filename)
+            self.extract_path, 'mobi8', epub_filename)
        if not os.path.exists(self.epub_filepath):
-            zip_dir = os.path.join(self.extract_dir, 'mobi7')
+            zip_dir = os.path.join(self.extract_path, 'mobi7')
            zip_file = os.path.join(
-                self.extract_dir, epub_filename)
+                self.extract_path, epub_filename)
            self.epub_filepath = shutil.make_archive(zip_file, 'zip', zip_dir)
            self.split_large_xml = True
-        self.book_ref = EPUB(self.epub_filepath, self.temp_dir)
+        self.book = EPUB(self.epub_filepath, self.temp_dir)
        self.book_ref.generate_metadata()
        self.book_ref.generate_toc()
        self.book_ref.generate_content()
        self.book = self.book_ref.book
        return True
-    def get_title(self):
+    def generate_metadata(self):
-        return self.book['title']
+        self.book.generate_metadata()
        return self.book.metadata
-    def get_author(self):
+    def generate_content(self):
-        return self.book['author']
+        zipfile.ZipFile(self.epub_filepath).extractall(self.extract_path)
-    def get_year(self):
+        self.book.generate_toc()
-        return self.book['year']
+        self.book.generate_content()
    def get_cover_image(self):
        return self.book['cover']
    def get_isbn(self):
        return self.book['isbn']
    def get_tags(self):
        return self.book['tags']
    def get_contents(self):
        return
        extract_path = os.path.join(self.extract_dir)
        zipfile.ZipFile(self.epub_filepath).extractall(extract_path)
        self.book_ref.parse_chapters(
            temp_dir=self.temp_dir, split_large_xml=self.split_large_xml)
        toc = []
        content = []
-        for count, i in enumerate(self.book['book_list']):
+        for count, i in enumerate(self.book.content):
-            toc.append((1, i[0], count + 1))
+            toc.append((1, i[1], count + 1))
-            content.append(i[1])
+            content.append(i[2])
        # Return toc, content, images_only
        return toc, content, False
 class HidePrinting:
    def __enter__(self):
        self._original_stdout = sys.stdout
--- a/lector/readers/read_epub.py
+++ b/lector/readers/read_epub.py
@@ -18,7 +18,6 @@
 # See if inserting chapters not in the toc.ncx can be avoided
 # Account for stylesheets... eventually
 # Everything needs logging
 # Mobipocket files
 import os
 import zipfile
@@ -68,6 +67,7 @@ class EPUB:
            for i in presumptive_names:
                packagefile = self.find_file(i)
                if packagefile:
                    logger.info('Using presumptive package file: ' + self.book_filename)
                    break
        packagefile_data = self.zip_file.read(packagefile)
@@ -218,11 +218,19 @@ class EPUB:
        # These are simply ids that correspond to the actual item
        # as mentioned in the manifest - which is a comprehensive
        # list of files
        try:
            # Multiple chapters
            chapters_in_spine = [
                i['@idref']
                for i in self.opf_dict['package']['spine']['itemref']]
        except TypeError:
            # Single chapter - Large xml
            chapters_in_spine = [
                self.opf_dict['package']['spine']['itemref']['@idref']]
        # Next, find items and ids from the manifest
        # This might error out in case there's only one item in
        # the manifest. Remember that for later.
        chapters_from_manifest = {
            i['@id']: i['@href']
            for i in self.opf_dict['package']['manifest']['item']}
@@ -236,10 +244,12 @@ class EPUB:
            except KeyError:
                pass
        chapter_title = 1
        toc_chapters = [
            unquote(i[2].split('#')[0]) for i in self.content]
        # TODO
        # This totally borks the order
        last_valid_index = -2  # Yes, but why?
        for i in spine_final:
            if not i in toc_chapters:
@@ -251,10 +261,11 @@ class EPUB:
                except ValueError:
                    last_valid_index += 1
                # Chapters are currently named None
                # Blank chapters will later be removed
                # and the None will be replaced by a number
                self.content.insert(
-                    last_valid_index + 1,
+                    last_valid_index + 1, [1, None, i])
                    [1, str(chapter_title), i])
                chapter_title += 1
        # Parse split chapters as below
        # They can be picked up during the iteration through the toc
@@ -316,9 +327,24 @@ class EPUB:
            self.content[count][2] = chapter_content
        # Cleanup content by removing null chapters
-        self.content = [
+        unnamed_chapter_title = 1
-            i for i in self.content if i[2]]
+        content_copy = []
        for i in self.content:
            if i[2]:
                chapter_title = i[1]
                if not chapter_title:
                    chapter_title = unnamed_chapter_title
                    unnamed_chapter_title += 1
                content_copy.append((
                    i[0], str(chapter_title), i[2]))
        self.content = content_copy
        # TODO
        # This can probably be circumvented by shifting the extraction
        # to this module and simply getting the path to the cover
        # Get cover image and put it in its place
        # I imagine this involves saying nasty things to it
        cover_image = self.generate_book_cover()
        if cover_image:
            cover_path = os.path.join(
@@ -389,7 +415,6 @@ class EPUB:
                    break
            except:
                logger.warning('ISBN not found: ' + self.book_filename)
                pass
        # Book tags
        try:
--- a/lector/sorter.py
+++ b/lector/sorter.py
@@ -15,7 +15,8 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 # INSTRUCTIONS
-# Every parser is supposed to have the following methods. None returns are not allowed.
+# Every parser is supposed to have the following methods.
 # Exceptions will be caught - but that's just bad practice
 # read_book() - Initialize book
 # generate_metadata() - For addition
 # generate_content() - For reading
@@ -67,6 +68,7 @@ else:
 # python-lxml - Required for everything except comics
 lxml_check = importlib.util.find_spec('lxml')
 xmltodict_check = importlib.util.find_spec('xmltodict')
 if lxml_check:
    lxml_dependent = {
        'epub': ParseEPUB,
@@ -79,7 +81,7 @@ if lxml_check:
        'fb2.zip': ParseFB2}
    sorter.update(lxml_dependent)
 else:
-    critical_sting = 'python-lxml is not installed. Only comics will load.'
+    critical_sting = 'python-lxml / xmltodict is not installed. Only comics will load.'
    print(critical_sting)
    logger.critical(critical_sting)
@@ -122,7 +124,7 @@ class BookSorter:
        self.queue = Manager().Queue()
        self.processed_books = []
-        if self.work_mode == 'addition':
+        # if self.work_mode == 'addition':
        progress_object_generator()
    def database_hashes(self):
@@ -134,7 +136,6 @@ class BookSorter:
                'LIKE')
        if all_hashes_and_paths:
            # self.hashes = [i[0] for i in all_hashes]
            self.hashes_and_paths = {
                i[0]: i[1] for i in all_hashes_and_paths}
@@ -205,6 +206,12 @@ class BookSorter:
        book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)
        # None of the following have an exception type specified
        # This will keep everything from crashing, but will make
        # troubleshooting difficult
        # TODO
        # In application notifications
        try:
            book_ref.read_book()
        except:
@@ -248,7 +255,7 @@ class BookSorter:
        if self.work_mode == 'reading':
            try:
                book_breakdown = book_ref.generate_content()
-            except:
+            except KeyboardInterrupt:
                logger.error('Content generation error: ' + filename)
                return
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 beautifulsoup4==4.7.1
-lxml==4.3.0
+lxml==4.3.1
-PyMuPDF==1.14.7
+PyMuPDF==1.14.8
 PyQt5==5.11.3
 PyQt5-sip==4.19.13
 soupsieve==1.7.3
 xmltodict==0.11.0