From 3cd75807f92bc7a6786282de4e5366cbc6666d13 Mon Sep 17 00:00:00 2001
From: BasioMeusPuga <disgruntled.mob@gmail.com>
Date: Sun, 10 Feb 2019 17:58:35 +0530
Subject: [PATCH] Fix MOBI parser; update KindleUnpack; discover new and
 exciting bugs

---
 TODO                                |  4 +-
 lector/KindleUnpack/kindleunpack.py |  8 ++--
 lector/KindleUnpack/mobi_k8proc.py  |  2 +
 lector/KindleUnpack/mobiml2xhtml.py | 16 +++----
 lector/parsers/mobi.py              | 67 +++++++++++------------------
 lector/readers/read_epub.py         | 49 +++++++++++++++------
 lector/readers/read_fb2.py          |  2 +-
 lector/sorter.py                    | 19 +++++---
 requirements.txt                    |  5 ++-
 9 files changed, 96 insertions(+), 76 deletions(-)

diff --git a/TODO b/TODO
index bddc2ae..08ebecc 100644
--- a/TODO
+++ b/TODO
@@ -93,12 +93,14 @@ TODO
         Deselecting all directories in the settings dialog also filters out manually added books
         Last line in QTextBrowser should never be cut off
         Does image alignment need to be centered?
-        Bookmark name for a page that's not on the TOC or has nothing before
+        Bookmark name for a page that's not on the TOC and has nothing before
         Screen position still keeps jumping when inside a paragraph
         Better recursion needed for fb2 toc
         Initial sort by author in tableview
         Last column not filling up tableview
         Comic view mode changing does not work for newly added books
+        Ctrl + A reports 10 times the number of books selected for deletion
+        Ordering for non TOC chapters is beyond borked
 
     Secondary:
         Tab tooltip
diff --git a/lector/KindleUnpack/kindleunpack.py b/lector/KindleUnpack/kindleunpack.py
index 41fa97d..628f256 100644
--- a/lector/KindleUnpack/kindleunpack.py
+++ b/lector/KindleUnpack/kindleunpack.py
@@ -6,7 +6,7 @@ from __future__ import unicode_literals, division, absolute_import, print_functi
 
 import os
 
-__path__ = ["lib", os.path.dirname(__file__), "kindleunpack"]
+__path__ = ["lib", os.path.dirname(os.path.realpath(__file__)), "kindleunpack"]
 
 import sys
 import codecs
@@ -140,6 +140,8 @@ if PY2:
 #  0.76   pre-release version only fix name related issues in opf by not using original file name in mobi7
 #  0.77   bug fix for unpacking HDImages with included Fonts
 #  0.80   converted to work with both python 2.7 and Python 3.3 and later
+#  0.81   various fixes
+#  0.82   Handle calibre-generated mobis that can have skeletons with no fragments
 
 DUMP = False
 """ Set to True to dump all possible information. """
@@ -847,7 +849,7 @@ def process_all_mobi_headers(files, apnxfile, sect, mhlst, K8Boundary, k8only=Fa
     return
 
 
-def unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=True, dodump=False, dowriteraw=False, dosplitcombos=False):
+def unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=False, dodump=False, dowriteraw=False, dosplitcombos=False):
     global DUMP
     global WRITE_RAW_DATA
     global SPLIT_COMBO_MOBIS
@@ -949,7 +951,7 @@ def main(argv=unicode_argv()):
     global WRITE_RAW_DATA
     global SPLIT_COMBO_MOBIS
 
-    print("KindleUnpack v0.80")
+    print("KindleUnpack v0.82")
     print("   Based on initial mobipocket version Copyright © 2009 Charles M. Hannum <root@ihack.net>")
     print("   Extensive Extensions and Improvements Copyright © 2009-2014 ")
     print("       by:  P. Durrant, K. Hendricks, S. Siebert, fandrieu, DiapDealer, nickredding, tkeo.")
diff --git a/lector/KindleUnpack/mobi_k8proc.py b/lector/KindleUnpack/mobi_k8proc.py
index 04ec028..5b8274e 100644
--- a/lector/KindleUnpack/mobi_k8proc.py
+++ b/lector/KindleUnpack/mobi_k8proc.py
@@ -180,9 +180,11 @@ class K8Processor:
         fragptr = 0
         baseptr = 0
         cnt = 0
+        filename = 'part%04d.xhtml' % cnt
         for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl:
             baseptr = skelpos + skellen
             skeleton = text[skelpos: baseptr]
+            aidtext = "0"
             for i in range(fragcnt):
                 [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[fragptr]
                 aidtext = idtext[12:-2]
diff --git a/lector/KindleUnpack/mobiml2xhtml.py b/lector/KindleUnpack/mobiml2xhtml.py
index 7d89159..85be8ba 100755
--- a/lector/KindleUnpack/mobiml2xhtml.py
+++ b/lector/KindleUnpack/mobiml2xhtml.py
@@ -246,13 +246,13 @@ class MobiMLConverter(object):
                             # handle case of end tag with no beginning by injecting empty begin tag
                             taginfo = ('begin', tname, None)
                             htmlstr += self.processtag(taginfo)
-                            print("     - fixed by injecting empty start tag ", tname)
+                            print "     - fixed by injecting empty start tag ", tname
                             self.path.append(tname)
                         elif len(self.path) >  1 and tname == self.path[-2]:
                             # handle case of dangling missing end
                             taginfo = ('end', self.path[-1], None)
                             htmlstr += self.processtag(taginfo)
-                            print("     - fixed by injecting end tag ", self.path[-1])
+                            print "     - fixed by injecting end tag ", self.path[-1]
                             self.path.pop()
                     self.path.pop()
 
@@ -504,18 +504,18 @@ def main(argv=sys.argv):
         infile = argv[1]
 
     try:
-        print('Converting Mobi Markup Language to XHTML')
+        print 'Converting Mobi Markup Language to XHTML'
         mlc = MobiMLConverter(infile)
-        print('Processing ...')
+        print 'Processing ...'
         htmlstr, css, cssname = mlc.processml()
         outname = infile.rsplit('.',1)[0] + '_converted.html'
         file(outname, 'wb').write(htmlstr)
         file(cssname, 'wb').write(css)
-        print('Completed')
-        print('XHTML version of book can be found at: ', outname)
+        print 'Completed'
+        print 'XHTML version of book can be found at: ' + outname
 
-    except ValueError as e:
-        print("Error: %s" % e)
+    except ValueError, e:
+        print "Error: %s" % e
         return 1
 
     return 0
diff --git a/lector/parsers/mobi.py b/lector/parsers/mobi.py
index 6bb8d17..98ee3dc 100644
--- a/lector/parsers/mobi.py
+++ b/lector/parsers/mobi.py
@@ -14,8 +14,9 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-# This module parses Amazon ebooks using KindleUnpack to first create an
-# epub that is then read the usual way
+# TODO
+# See if it's possible to just feed the
+# unzipped mobi7 file into the EPUB parser module
 
 import os
 import sys
@@ -30,73 +31,53 @@ logger = logging.getLogger(__name__)
 
 
 class ParseMOBI:
+    # This class parses Amazon ebooks using KindleUnpack to first create an
+    # epub that is then read the usual way
+
     def __init__(self, filename, temp_dir, file_md5):
-        self.book_ref = None
         self.book = None
         self.filename = filename
         self.epub_filepath = None
-        self.split_large_xml = False
         self.temp_dir = temp_dir
-        self.extract_dir = os.path.join(temp_dir, file_md5)
+        self.extract_path = os.path.join(temp_dir, file_md5)
 
     def read_book(self):
         with HidePrinting():
-            KindleUnpack.unpackBook(self.filename, self.extract_dir)
+            KindleUnpack.unpackBook(self.filename, self.extract_path)
 
         epub_filename = os.path.splitext(
             os.path.basename(self.filename))[0] + '.epub'
-
         self.epub_filepath = os.path.join(
-            self.extract_dir, 'mobi8', epub_filename)
+            self.extract_path, 'mobi8', epub_filename)
+
         if not os.path.exists(self.epub_filepath):
-            zip_dir = os.path.join(self.extract_dir, 'mobi7')
+            zip_dir = os.path.join(self.extract_path, 'mobi7')
             zip_file = os.path.join(
-                self.extract_dir, epub_filename)
+                self.extract_path, epub_filename)
             self.epub_filepath = shutil.make_archive(zip_file, 'zip', zip_dir)
-            self.split_large_xml = True
 
-        self.book_ref = EPUB(self.epub_filepath, self.temp_dir)
-        self.book_ref.generate_metadata()
-        self.book_ref.generate_toc()
-        self.book_ref.generate_content()
-        self.book = self.book_ref.book
-        return True
+        self.book = EPUB(self.epub_filepath, self.temp_dir)
 
-    def get_title(self):
-        return self.book['title']
+    def generate_metadata(self):
+        self.book.generate_metadata()
+        return self.book.metadata
 
-    def get_author(self):
-        return self.book['author']
+    def generate_content(self):
+        zipfile.ZipFile(self.epub_filepath).extractall(self.extract_path)
 
-    def get_year(self):
-        return self.book['year']
-
-    def get_cover_image(self):
-        return self.book['cover']
-
-    def get_isbn(self):
-        return self.book['isbn']
-
-    def get_tags(self):
-        return self.book['tags']
-
-    def get_contents(self):
-        return
-        extract_path = os.path.join(self.extract_dir)
-        zipfile.ZipFile(self.epub_filepath).extractall(extract_path)
-
-        self.book_ref.parse_chapters(
-            temp_dir=self.temp_dir, split_large_xml=self.split_large_xml)
+        self.book.generate_toc()
+        self.book.generate_content()
 
         toc = []
         content = []
-        for count, i in enumerate(self.book['book_list']):
-            toc.append((1, i[0], count + 1))
-            content.append(i[1])
+        for count, i in enumerate(self.book.content):
+            toc.append((1, i[1], count + 1))
+            content.append(i[2])
 
         # Return toc, content, images_only
         return toc, content, False
 
+
 class HidePrinting:
     def __enter__(self):
         self._original_stdout = sys.stdout
diff --git a/lector/readers/read_epub.py b/lector/readers/read_epub.py
index 9f42ad4..0dddf45 100644
--- a/lector/readers/read_epub.py
+++ b/lector/readers/read_epub.py
@@ -18,7 +18,6 @@
 # See if inserting chapters not in the toc.ncx can be avoided
 # Account for stylesheets... eventually
 # Everything needs logging
-# Mobipocket files
 
 import os
 import zipfile
@@ -68,6 +67,7 @@ class EPUB:
             for i in presumptive_names:
                 packagefile = self.find_file(i)
                 if packagefile:
+                    logger.info('Using presumptive package file: ' + self.book_filename)
                     break
 
         packagefile_data = self.zip_file.read(packagefile)
@@ -218,11 +218,19 @@ class EPUB:
         # These are simply ids that correspond to the actual item
         # as mentioned in the manifest - which is a comprehensive
         # list of files
-        chapters_in_spine = [
-            i['@idref']
-            for i in self.opf_dict['package']['spine']['itemref']]
+        try:
+            # Multiple chapters
+            chapters_in_spine = [
+                i['@idref']
+                for i in self.opf_dict['package']['spine']['itemref']]
+        except TypeError:
+            # Single chapter - Large xml
+            chapters_in_spine = [
+                self.opf_dict['package']['spine']['itemref']['@idref']]
 
         # Next, find items and ids from the manifest
+        # This might error out in case there's only one item in
+        # the manifest. Remember that for later.
         chapters_from_manifest = {
             i['@id']: i['@href']
             for i in self.opf_dict['package']['manifest']['item']}
@@ -236,10 +244,12 @@ class EPUB:
             except KeyError:
                 pass
 
-        chapter_title = 1
         toc_chapters = [
             unquote(i[2].split('#')[0]) for i in self.content]
 
+        # TODO
+        # This insertion totally borks the order of chapters that are not in the TOC
+
         last_valid_index = -2  # Yes, but why?
         for i in spine_final:
             if not i in toc_chapters:
@@ -251,10 +261,11 @@ class EPUB:
                 except ValueError:
                     last_valid_index += 1
 
+                # Chapters inserted here are named None for now
+                # Blank chapters will later be removed
+                # and the None will be replaced by a number
                 self.content.insert(
-                    last_valid_index + 1,
-                    [1, str(chapter_title), i])
-                chapter_title += 1
+                    last_valid_index + 1, [1, None, i])
 
         # Parse split chapters as below
         # They can be picked up during the iteration through the toc
@@ -316,13 +327,28 @@ class EPUB:
             self.content[count][2] = chapter_content
 
         # Cleanup content by removing null chapters
-        self.content = [
-            i for i in self.content if i[2]]
+        unnamed_chapter_title = 1
+        content_copy = []
+        for i in self.content:
+            if i[2]:
+                chapter_title = i[1]
+                if not chapter_title:
+                    chapter_title = unnamed_chapter_title
+                    unnamed_chapter_title += 1
+                content_copy.append((
+                    i[0], str(chapter_title), i[2]))
+        self.content = content_copy
 
+        # TODO
+        # This can probably be circumvented by shifting the extraction
+        # to this module and simply getting the path to the cover
+
+        # Get cover image and put it in its place
+        # I imagine this involves saying nasty things to it
         cover_image = self.generate_book_cover()
         if cover_image:
             cover_path = os.path.join(
-                self.temp_dir, os.path.basename(self.book_filename)) + '- cover'
+                self.temp_dir, os.path.basename(self.book_filename)) + ' - cover'
             with open(cover_path, 'wb') as cover_temp:
                 cover_temp.write(cover_image)
 
@@ -389,7 +415,6 @@ class EPUB:
                     break
             except:
                 logger.warning('ISBN not found: ' + self.book_filename)
-                pass
 
         # Book tags
         try:
diff --git a/lector/readers/read_fb2.py b/lector/readers/read_fb2.py
index 5e9bbfe..36c26c2 100644
--- a/lector/readers/read_fb2.py
+++ b/lector/readers/read_fb2.py
@@ -148,7 +148,7 @@ class FB2:
         cover_image = self.generate_book_cover()
         if cover_image:
             cover_path = os.path.join(
-                temp_dir, os.path.basename(self.filename)) + '- cover'
+                temp_dir, os.path.basename(self.filename)) + ' - cover'
             with open(cover_path, 'wb') as cover_temp:
                 cover_temp.write(cover_image)
 
diff --git a/lector/sorter.py b/lector/sorter.py
index c058aee..d63cb31 100644
--- a/lector/sorter.py
+++ b/lector/sorter.py
@@ -15,7 +15,8 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 # INSTRUCTIONS
-# Every parser is supposed to have the following methods. None returns are not allowed.
+# Every parser is supposed to have the following methods.
+# Exceptions will be caught - but that's just bad practice
 # read_book() - Initialize book
 # generate_metadata() - For addition
 # generate_content() - For reading
@@ -67,6 +68,7 @@ else:
 
 # python-lxml - Required for everything except comics
 lxml_check = importlib.util.find_spec('lxml')
+xmltodict_check = importlib.util.find_spec('xmltodict')
 if lxml_check:
     lxml_dependent = {
         'epub': ParseEPUB,
@@ -79,7 +81,7 @@ if lxml_check:
         'fb2.zip': ParseFB2}
     sorter.update(lxml_dependent)
 else:
-    critical_sting = 'python-lxml is not installed. Only comics will load.'
+    critical_sting = 'python-lxml / xmltodict is not installed. Only comics will load.'
     print(critical_sting)
     logger.critical(critical_sting)
 
@@ -122,8 +124,8 @@ class BookSorter:
         self.queue = Manager().Queue()
         self.processed_books = []
 
-        if self.work_mode == 'addition':
-            progress_object_generator()
+        # if self.work_mode == 'addition':
+        progress_object_generator()
 
     def database_hashes(self):
         all_hashes_and_paths = database.DatabaseFunctions(
@@ -134,7 +136,6 @@ class BookSorter:
                 'LIKE')
 
         if all_hashes_and_paths:
-            # self.hashes = [i[0] for i in all_hashes]
             self.hashes_and_paths = {
                 i[0]: i[1] for i in all_hashes_and_paths}
 
@@ -205,6 +206,12 @@ class BookSorter:
 
         book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)
 
+        # None of the following have an exception type specified
+        # This will keep everything from crashing, but will make
+        # troubleshooting difficult
+        # TODO
+        # In application notifications
+
         try:
             book_ref.read_book()
         except:
@@ -248,7 +255,7 @@ class BookSorter:
         if self.work_mode == 'reading':
             try:
                 book_breakdown = book_ref.generate_content()
-            except:
+            except KeyboardInterrupt:
                 logger.error('Content generation error: ' + filename)
                 return
 
diff --git a/requirements.txt b/requirements.txt
index 7ad59c7..3afba53 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 beautifulsoup4==4.7.1
-lxml==4.3.0
-PyMuPDF==1.14.7
+lxml==4.3.1
+PyMuPDF==1.14.8
 PyQt5==5.11.3
 PyQt5-sip==4.19.13
 soupsieve==1.7.3
+xmltodict==0.11.0