Fix MOBI parser

Update KindleUnpack. Discover new and exciting bugs.
2019-02-10 17:58:35 +05:30
parent f6f9d01060
commit 3cd75807f9
9 changed files with 96 additions and 76 deletions
--- a/4
+++ b/4
@@ -93,12 +93,14 @@ TODO
        Deselecting all directories in the settings dialog also filters out manually added books
        Last line in QTextBrowser should never be cut off
        Does image alignment need to be centered?
-        Bookmark name for a page that's not on the TOC or has nothing before
+        Bookmark name for a page that's not on the TOC and has nothing before
        Screen position still keeps jumping when inside a paragraph
        Better recursion needed for fb2 toc
        Initial sort by author in tableview
        Last column not filling up tableview
        Comic view mode changing does not work for newly added books
+        Ctrl + A reports 10 times the number of books selected for deletion
+        Ordering for non TOC chapters is beyond borked

    Secondary:
        Tab tooltip
--- a/lector/KindleUnpack/kindleunpack.py
+++ b/lector/KindleUnpack/kindleunpack.py
@@ -6,7 +6,7 @@ from __future__ import unicode_literals, division, absolute_import, print_functi

 import os

-__path__ = ["lib", os.path.dirname(__file__), "kindleunpack"]
+__path__ = ["lib", os.path.dirname(os.path.realpath(__file__)), "kindleunpack"]

 import sys
 import codecs
@@ -140,6 +140,8 @@ if PY2:
 #  0.76   pre-release version only fix name related issues in opf by not using original file name in mobi7
 #  0.77   bug fix for unpacking HDImages with included Fonts
 #  0.80   converted to work with both python 2.7 and Python 3.3 and later
+#  0.81   various fixes
+#  0.82   Handle calibre-generated mobis that can have skeletons with no fragments

 DUMP = False
 """ Set to True to dump all possible information. """
@@ -847,7 +849,7 @@ def process_all_mobi_headers(files, apnxfile, sect, mhlst, K8Boundary, k8only=Fa
    return


-def unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=True, dodump=False, dowriteraw=False, dosplitcombos=False):
+def unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=False, dodump=False, dowriteraw=False, dosplitcombos=False):
    global DUMP
    global WRITE_RAW_DATA
    global SPLIT_COMBO_MOBIS
@@ -949,7 +951,7 @@ def main(argv=unicode_argv()):
    global WRITE_RAW_DATA
    global SPLIT_COMBO_MOBIS

-    print("KindleUnpack v0.80")
+    print("KindleUnpack v0.82")
    print("   Based on initial mobipocket version Copyright © 2009 Charles M. Hannum <root@ihack.net>")
    print("   Extensive Extensions and Improvements Copyright © 2009-2014 ")
    print("       by:  P. Durrant, K. Hendricks, S. Siebert, fandrieu, DiapDealer, nickredding, tkeo.")
--- a/lector/KindleUnpack/mobi_k8proc.py
+++ b/lector/KindleUnpack/mobi_k8proc.py
@@ -180,9 +180,11 @@ class K8Processor:
        fragptr = 0
        baseptr = 0
        cnt = 0
+        filename = 'part%04d.xhtml' % cnt
        for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl:
            baseptr = skelpos + skellen
            skeleton = text[skelpos: baseptr]
+            aidtext = "0"
            for i in range(fragcnt):
                [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[fragptr]
                aidtext = idtext[12:-2]
--- a/lector/KindleUnpack/mobiml2xhtml.py
+++ b/lector/KindleUnpack/mobiml2xhtml.py
@@ -246,13 +246,13 @@ class MobiMLConverter(object):
                            # handle case of end tag with no beginning by injecting empty begin tag
                            taginfo = ('begin', tname, None)
                            htmlstr += self.processtag(taginfo)
-                            print("     - fixed by injecting empty start tag ", tname)
+                            print "     - fixed by injecting empty start tag ", tname
                            self.path.append(tname)
                        elif len(self.path) >  1 and tname == self.path[-2]:
                            # handle case of dangling missing end
                            taginfo = ('end', self.path[-1], None)
                            htmlstr += self.processtag(taginfo)
-                            print("     - fixed by injecting end tag ", self.path[-1])
+                            print "     - fixed by injecting end tag ", self.path[-1]
                            self.path.pop()
                    self.path.pop()

@@ -504,18 +504,18 @@ def main(argv=sys.argv):
        infile = argv[1]

    try:
-        print('Converting Mobi Markup Language to XHTML')
+        print 'Converting Mobi Markup Language to XHTML'
        mlc = MobiMLConverter(infile)
-        print('Processing ...')
+        print 'Processing ...'
        htmlstr, css, cssname = mlc.processml()
        outname = infile.rsplit('.',1)[0] + '_converted.html'
        file(outname, 'wb').write(htmlstr)
        file(cssname, 'wb').write(css)
-        print('Completed')
-        print('XHTML version of book can be found at: ', outname)
+        print 'Completed'
+        print 'XHTML version of book can be found at: ' + outname

-    except ValueError as e:
-        print("Error: %s" % e)
+    except ValueError, e:
+        print "Error: %s" % e
        return 1

    return 0
--- a/lector/parsers/mobi.py
+++ b/lector/parsers/mobi.py
@@ -14,8 +14,9 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.

-# This module parses Amazon ebooks using KindleUnpack to first create an
-# epub that is then read the usual way
+# TODO
+# See if it's possible to just feed the
+# unzipped mobi7 file into the EPUB parser module

 import os
 import sys
@@ -30,73 +31,53 @@ logger = logging.getLogger(__name__)


 class ParseMOBI:
+    # This class parses Amazon ebooks by using KindleUnpack to first create an
+    # epub, which is then read the usual way
+
    def __init__(self, filename, temp_dir, file_md5):
-        self.book_ref = None
        self.book = None
        self.filename = filename
        self.epub_filepath = None
-        self.split_large_xml = False
        self.temp_dir = temp_dir
-        self.extract_dir = os.path.join(temp_dir, file_md5)
+        self.extract_path = os.path.join(temp_dir, file_md5)

    def read_book(self):
        with HidePrinting():
-            KindleUnpack.unpackBook(self.filename, self.extract_dir)
+            KindleUnpack.unpackBook(self.filename, self.extract_path)

        epub_filename = os.path.splitext(
            os.path.basename(self.filename))[0] + '.epub'
-
        self.epub_filepath = os.path.join(
-            self.extract_dir, 'mobi8', epub_filename)
+            self.extract_path, 'mobi8', epub_filename)
+
        if not os.path.exists(self.epub_filepath):
-            zip_dir = os.path.join(self.extract_dir, 'mobi7')
+            zip_dir = os.path.join(self.extract_path, 'mobi7')
            zip_file = os.path.join(
-                self.extract_dir, epub_filename)
+                self.extract_path, epub_filename)
            self.epub_filepath = shutil.make_archive(zip_file, 'zip', zip_dir)
-            self.split_large_xml = True

-        self.book_ref = EPUB(self.epub_filepath, self.temp_dir)
-        self.book_ref.generate_metadata()
-        self.book_ref.generate_toc()
-        self.book_ref.generate_content()
-        self.book = self.book_ref.book
-        return True
+        self.book = EPUB(self.epub_filepath, self.temp_dir)

-    def get_title(self):
-        return self.book['title']
+    def generate_metadata(self):
+        self.book.generate_metadata()
+        return self.book.metadata

-    def get_author(self):
-        return self.book['author']
+    def generate_content(self):
+        zipfile.ZipFile(self.epub_filepath).extractall(self.extract_path)

-    def get_year(self):
-        return self.book['year']
-
-    def get_cover_image(self):
-        return self.book['cover']
-
-    def get_isbn(self):
-        return self.book['isbn']
-
-    def get_tags(self):
-        return self.book['tags']
-
-    def get_contents(self):
-        return
-        extract_path = os.path.join(self.extract_dir)
-        zipfile.ZipFile(self.epub_filepath).extractall(extract_path)
-
-        self.book_ref.parse_chapters(
-            temp_dir=self.temp_dir, split_large_xml=self.split_large_xml)
+        self.book.generate_toc()
+        self.book.generate_content()

        toc = []
        content = []
-        for count, i in enumerate(self.book['book_list']):
-            toc.append((1, i[0], count + 1))
-            content.append(i[1])
+        for count, i in enumerate(self.book.content):
+            toc.append((1, i[1], count + 1))
+            content.append(i[2])

        # Return toc, content, images_only
        return toc, content, False

+
 class HidePrinting:
    def __enter__(self):
        self._original_stdout = sys.stdout
--- a/lector/readers/read_epub.py
+++ b/lector/readers/read_epub.py
@@ -18,7 +18,6 @@
 # See if inserting chapters not in the toc.ncx can be avoided
 # Account for stylesheets... eventually
 # Everything needs logging
-# Mobipocket files

 import os
 import zipfile
@@ -68,6 +67,7 @@ class EPUB:
            for i in presumptive_names:
                packagefile = self.find_file(i)
                if packagefile:
+                    logger.info('Using presumptive package file: ' + self.book_filename)
                    break

        packagefile_data = self.zip_file.read(packagefile)
@@ -218,11 +218,19 @@ class EPUB:
        # These are simply ids that correspond to the actual item
        # as mentioned in the manifest - which is a comprehensive
        # list of files
+        try:
+            # Multiple chapters
            chapters_in_spine = [
                i['@idref']
                for i in self.opf_dict['package']['spine']['itemref']]
+        except TypeError:
+            # Single chapter - Large xml
+            chapters_in_spine = [
+                self.opf_dict['package']['spine']['itemref']['@idref']]

        # Next, find items and ids from the manifest
+        # This might error out in case there's only one item in
+        # the manifest. Remember that for later.
        chapters_from_manifest = {
            i['@id']: i['@href']
            for i in self.opf_dict['package']['manifest']['item']}
@@ -236,10 +244,12 @@ class EPUB:
            except KeyError:
                pass

-        chapter_title = 1
        toc_chapters = [
            unquote(i[2].split('#')[0]) for i in self.content]

+        # TODO
+        # This totally borks the order
+
        last_valid_index = -2  # Yes, but why?
        for i in spine_final:
            if not i in toc_chapters:
@@ -251,10 +261,11 @@ class EPUB:
                except ValueError:
                    last_valid_index += 1

+                # Chapters are currently named None
+                # Blank chapters will later be removed
+                # and the None will be replaced by a number
                self.content.insert(
-                    last_valid_index + 1,
-                    [1, str(chapter_title), i])
-                chapter_title += 1
+                    last_valid_index + 1, [1, None, i])

        # Parse split chapters as below
        # They can be picked up during the iteration through the toc
@@ -316,13 +327,28 @@ class EPUB:
            self.content[count][2] = chapter_content

        # Cleanup content by removing null chapters
-        self.content = [
-            i for i in self.content if i[2]]
+        unnamed_chapter_title = 1
+        content_copy = []
+        for i in self.content:
+            if i[2]:
+                chapter_title = i[1]
+                if not chapter_title:
+                    chapter_title = unnamed_chapter_title
+                    unnamed_chapter_title += 1
+                content_copy.append((
+                    i[0], str(chapter_title), i[2]))
+        self.content = content_copy

+        # TODO
+        # This can probably be circumvented by shifting the extraction
+        # to this module and simply getting the path to the cover
+
+        # Get cover image and put it in its place
+        # I imagine this involves saying nasty things to it
        cover_image = self.generate_book_cover()
        if cover_image:
            cover_path = os.path.join(
-                self.temp_dir, os.path.basename(self.book_filename)) + '- cover'
+                self.temp_dir, os.path.basename(self.book_filename)) + ' - cover'
            with open(cover_path, 'wb') as cover_temp:
                cover_temp.write(cover_image)

@@ -389,7 +415,6 @@ class EPUB:
                    break
            except:
                logger.warning('ISBN not found: ' + self.book_filename)
-                pass

        # Book tags
        try:
--- a/lector/readers/read_fb2.py
+++ b/lector/readers/read_fb2.py
@@ -148,7 +148,7 @@ class FB2:
        cover_image = self.generate_book_cover()
        if cover_image:
            cover_path = os.path.join(
-                temp_dir, os.path.basename(self.filename)) + '- cover'
+                temp_dir, os.path.basename(self.filename)) + ' - cover'
            with open(cover_path, 'wb') as cover_temp:
                cover_temp.write(cover_image)

--- a/lector/sorter.py
+++ b/lector/sorter.py
@@ -15,7 +15,8 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.

 # INSTRUCTIONS
-# Every parser is supposed to have the following methods. None returns are not allowed.
+# Every parser is supposed to have the following methods.
+# Exceptions will be caught - but that's just bad practice
 # read_book() - Initialize book
 # generate_metadata() - For addition
 # generate_content() - For reading
@@ -67,6 +68,7 @@ else:

 # python-lxml - Required for everything except comics
 lxml_check = importlib.util.find_spec('lxml')
+xmltodict_check = importlib.util.find_spec('xmltodict')
 if lxml_check:
    lxml_dependent = {
        'epub': ParseEPUB,
@@ -79,7 +81,7 @@ if lxml_check:
        'fb2.zip': ParseFB2}
    sorter.update(lxml_dependent)
 else:
-    critical_sting = 'python-lxml is not installed. Only comics will load.'
+    critical_sting = 'python-lxml / xmltodict is not installed. Only comics will load.'
    print(critical_sting)
    logger.critical(critical_sting)

@@ -122,7 +124,7 @@ class BookSorter:
        self.queue = Manager().Queue()
        self.processed_books = []

-        if self.work_mode == 'addition':
+        # if self.work_mode == 'addition':
        progress_object_generator()

    def database_hashes(self):
@@ -134,7 +136,6 @@ class BookSorter:
                'LIKE')

        if all_hashes_and_paths:
-            # self.hashes = [i[0] for i in all_hashes]
            self.hashes_and_paths = {
                i[0]: i[1] for i in all_hashes_and_paths}

@@ -205,6 +206,12 @@ class BookSorter:

        book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)

+        # None of the following have an exception type specified.
+        # This will keep everything from crashing, but will make
+        # troubleshooting difficult.
+        # TODO
+        # In-application notifications
+
        try:
            book_ref.read_book()
        except:
@@ -248,7 +255,7 @@ class BookSorter:
        if self.work_mode == 'reading':
            try:
                book_breakdown = book_ref.generate_content()
-            except:
+            except KeyboardInterrupt:
                logger.error('Content generation error: ' + filename)
                return

--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 beautifulsoup4==4.7.1
-lxml==4.3.0
-PyMuPDF==1.14.7
+lxml==4.3.1
+PyMuPDF==1.14.8
 PyQt5==5.11.3
 PyQt5-sip==4.19.13
 soupsieve==1.7.3
+xmltodict==0.11.0