Fix MOBI parser

Update Kindleunpack
Discover new and exciting bugs
This commit is contained in:
BasioMeusPuga
2019-02-10 17:58:35 +05:30
parent f6f9d01060
commit 3cd75807f9
9 changed files with 96 additions and 76 deletions

4
TODO
View File

@@ -93,12 +93,14 @@ TODO
Deselecting all directories in the settings dialog also filters out manually added books Deselecting all directories in the settings dialog also filters out manually added books
Last line in QTextBrowser should never be cut off Last line in QTextBrowser should never be cut off
Does image alignment need to be centered? Does image alignment need to be centered?
Bookmark name for a page that's not on the TOC or has nothing before Bookmark name for a page that's not on the TOC and has nothing before
Screen position still keeps jumping when inside a paragraph Screen position still keeps jumping when inside a paragraph
Better recursion needed for fb2 toc Better recursion needed for fb2 toc
Initial sort by author in tableview Initial sort by author in tableview
Last column not filling up tableview Last column not filling up tableview
Comic view mode changing does not work for newly added books Comic view mode changing does not work for newly added books
Ctrl + A reports 10 times the number of books selected for deletion
Ordering for non-TOC chapters is beyond borked
Secondary: Secondary:
Tab tooltip Tab tooltip

View File

@@ -6,7 +6,7 @@ from __future__ import unicode_literals, division, absolute_import, print_functi
import os import os
__path__ = ["lib", os.path.dirname(__file__), "kindleunpack"] __path__ = ["lib", os.path.dirname(os.path.realpath(__file__)), "kindleunpack"]
import sys import sys
import codecs import codecs
@@ -140,6 +140,8 @@ if PY2:
# 0.76 pre-release version only fix name related issues in opf by not using original file name in mobi7 # 0.76 pre-release version only fix name related issues in opf by not using original file name in mobi7
# 0.77 bug fix for unpacking HDImages with included Fonts # 0.77 bug fix for unpacking HDImages with included Fonts
# 0.80 converted to work with both python 2.7 and Python 3.3 and later # 0.80 converted to work with both python 2.7 and Python 3.3 and later
# 0.81 various fixes
# 0.82 Handle calibre-generated mobis that can have skeletons with no fragments
DUMP = False DUMP = False
""" Set to True to dump all possible information. """ """ Set to True to dump all possible information. """
@@ -847,7 +849,7 @@ def process_all_mobi_headers(files, apnxfile, sect, mhlst, K8Boundary, k8only=Fa
return return
def unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=True, dodump=False, dowriteraw=False, dosplitcombos=False): def unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=False, dodump=False, dowriteraw=False, dosplitcombos=False):
global DUMP global DUMP
global WRITE_RAW_DATA global WRITE_RAW_DATA
global SPLIT_COMBO_MOBIS global SPLIT_COMBO_MOBIS
@@ -949,7 +951,7 @@ def main(argv=unicode_argv()):
global WRITE_RAW_DATA global WRITE_RAW_DATA
global SPLIT_COMBO_MOBIS global SPLIT_COMBO_MOBIS
print("KindleUnpack v0.80") print("KindleUnpack v0.82")
print(" Based on initial mobipocket version Copyright © 2009 Charles M. Hannum <root@ihack.net>") print(" Based on initial mobipocket version Copyright © 2009 Charles M. Hannum <root@ihack.net>")
print(" Extensive Extensions and Improvements Copyright © 2009-2014 ") print(" Extensive Extensions and Improvements Copyright © 2009-2014 ")
print(" by: P. Durrant, K. Hendricks, S. Siebert, fandrieu, DiapDealer, nickredding, tkeo.") print(" by: P. Durrant, K. Hendricks, S. Siebert, fandrieu, DiapDealer, nickredding, tkeo.")

View File

@@ -180,9 +180,11 @@ class K8Processor:
fragptr = 0 fragptr = 0
baseptr = 0 baseptr = 0
cnt = 0 cnt = 0
filename = 'part%04d.xhtml' % cnt
for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl: for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl:
baseptr = skelpos + skellen baseptr = skelpos + skellen
skeleton = text[skelpos: baseptr] skeleton = text[skelpos: baseptr]
aidtext = "0"
for i in range(fragcnt): for i in range(fragcnt):
[insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[fragptr] [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[fragptr]
aidtext = idtext[12:-2] aidtext = idtext[12:-2]

View File

@@ -246,13 +246,13 @@ class MobiMLConverter(object):
# handle case of end tag with no beginning by injecting empty begin tag # handle case of end tag with no beginning by injecting empty begin tag
taginfo = ('begin', tname, None) taginfo = ('begin', tname, None)
htmlstr += self.processtag(taginfo) htmlstr += self.processtag(taginfo)
print(" - fixed by injecting empty start tag ", tname) print " - fixed by injecting empty start tag ", tname
self.path.append(tname) self.path.append(tname)
elif len(self.path) > 1 and tname == self.path[-2]: elif len(self.path) > 1 and tname == self.path[-2]:
# handle case of dangling missing end # handle case of dangling missing end
taginfo = ('end', self.path[-1], None) taginfo = ('end', self.path[-1], None)
htmlstr += self.processtag(taginfo) htmlstr += self.processtag(taginfo)
print(" - fixed by injecting end tag ", self.path[-1]) print " - fixed by injecting end tag ", self.path[-1]
self.path.pop() self.path.pop()
self.path.pop() self.path.pop()
@@ -504,18 +504,18 @@ def main(argv=sys.argv):
infile = argv[1] infile = argv[1]
try: try:
print('Converting Mobi Markup Language to XHTML') print 'Converting Mobi Markup Language to XHTML'
mlc = MobiMLConverter(infile) mlc = MobiMLConverter(infile)
print('Processing ...') print 'Processing ...'
htmlstr, css, cssname = mlc.processml() htmlstr, css, cssname = mlc.processml()
outname = infile.rsplit('.',1)[0] + '_converted.html' outname = infile.rsplit('.',1)[0] + '_converted.html'
file(outname, 'wb').write(htmlstr) file(outname, 'wb').write(htmlstr)
file(cssname, 'wb').write(css) file(cssname, 'wb').write(css)
print('Completed') print 'Completed'
print('XHTML version of book can be found at: ', outname) print 'XHTML version of book can be found at: ' + outname
except ValueError as e: except ValueError, e:
print("Error: %s" % e) print "Error: %s" % e
return 1 return 1
return 0 return 0

View File

@@ -14,8 +14,9 @@
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
# This module parses Amazon ebooks using KindleUnpack to first create an # TODO
# epub that is then read the usual way # See if it's possible to just feed the
# unzipped mobi7 file into the EPUB parser module
import os import os
import sys import sys
@@ -30,73 +31,53 @@ logger = logging.getLogger(__name__)
class ParseMOBI: class ParseMOBI:
# This class parses Amazon ebooks by using KindleUnpack to first create an
# epub, which is then read the usual way
def __init__(self, filename, temp_dir, file_md5): def __init__(self, filename, temp_dir, file_md5):
self.book_ref = None
self.book = None self.book = None
self.filename = filename self.filename = filename
self.epub_filepath = None self.epub_filepath = None
self.split_large_xml = False
self.temp_dir = temp_dir self.temp_dir = temp_dir
self.extract_dir = os.path.join(temp_dir, file_md5) self.extract_path = os.path.join(temp_dir, file_md5)
def read_book(self): def read_book(self):
with HidePrinting(): with HidePrinting():
KindleUnpack.unpackBook(self.filename, self.extract_dir) KindleUnpack.unpackBook(self.filename, self.extract_path)
epub_filename = os.path.splitext( epub_filename = os.path.splitext(
os.path.basename(self.filename))[0] + '.epub' os.path.basename(self.filename))[0] + '.epub'
self.epub_filepath = os.path.join( self.epub_filepath = os.path.join(
self.extract_dir, 'mobi8', epub_filename) self.extract_path, 'mobi8', epub_filename)
if not os.path.exists(self.epub_filepath): if not os.path.exists(self.epub_filepath):
zip_dir = os.path.join(self.extract_dir, 'mobi7') zip_dir = os.path.join(self.extract_path, 'mobi7')
zip_file = os.path.join( zip_file = os.path.join(
self.extract_dir, epub_filename) self.extract_path, epub_filename)
self.epub_filepath = shutil.make_archive(zip_file, 'zip', zip_dir) self.epub_filepath = shutil.make_archive(zip_file, 'zip', zip_dir)
self.split_large_xml = True
self.book_ref = EPUB(self.epub_filepath, self.temp_dir) self.book = EPUB(self.epub_filepath, self.temp_dir)
self.book_ref.generate_metadata()
self.book_ref.generate_toc()
self.book_ref.generate_content()
self.book = self.book_ref.book
return True
def get_title(self): def generate_metadata(self):
return self.book['title'] self.book.generate_metadata()
return self.book.metadata
def get_author(self): def generate_content(self):
return self.book['author'] zipfile.ZipFile(self.epub_filepath).extractall(self.extract_path)
def get_year(self): self.book.generate_toc()
return self.book['year'] self.book.generate_content()
def get_cover_image(self):
return self.book['cover']
def get_isbn(self):
return self.book['isbn']
def get_tags(self):
return self.book['tags']
def get_contents(self):
return
extract_path = os.path.join(self.extract_dir)
zipfile.ZipFile(self.epub_filepath).extractall(extract_path)
self.book_ref.parse_chapters(
temp_dir=self.temp_dir, split_large_xml=self.split_large_xml)
toc = [] toc = []
content = [] content = []
for count, i in enumerate(self.book['book_list']): for count, i in enumerate(self.book.content):
toc.append((1, i[0], count + 1)) toc.append((1, i[1], count + 1))
content.append(i[1]) content.append(i[2])
# Return toc, content, images_only # Return toc, content, images_only
return toc, content, False return toc, content, False
class HidePrinting: class HidePrinting:
def __enter__(self): def __enter__(self):
self._original_stdout = sys.stdout self._original_stdout = sys.stdout

View File

@@ -18,7 +18,6 @@
# See if inserting chapters not in the toc.ncx can be avoided # See if inserting chapters not in the toc.ncx can be avoided
# Account for stylesheets... eventually # Account for stylesheets... eventually
# Everything needs logging # Everything needs logging
# Mobipocket files
import os import os
import zipfile import zipfile
@@ -68,6 +67,7 @@ class EPUB:
for i in presumptive_names: for i in presumptive_names:
packagefile = self.find_file(i) packagefile = self.find_file(i)
if packagefile: if packagefile:
logger.info('Using presumptive package file: ' + self.book_filename)
break break
packagefile_data = self.zip_file.read(packagefile) packagefile_data = self.zip_file.read(packagefile)
@@ -218,11 +218,19 @@ class EPUB:
# These are simply ids that correspond to the actual item # These are simply ids that correspond to the actual item
# as mentioned in the manifest - which is a comprehensive # as mentioned in the manifest - which is a comprehensive
# list of files # list of files
try:
# Multiple chapters
chapters_in_spine = [ chapters_in_spine = [
i['@idref'] i['@idref']
for i in self.opf_dict['package']['spine']['itemref']] for i in self.opf_dict['package']['spine']['itemref']]
except TypeError:
# Single chapter - Large xml
chapters_in_spine = [
self.opf_dict['package']['spine']['itemref']['@idref']]
# Next, find items and ids from the manifest # Next, find items and ids from the manifest
# This will probably raise a TypeError (as the spine lookup above can) in
# case there's only one item in the manifest. Remember that for later.
chapters_from_manifest = { chapters_from_manifest = {
i['@id']: i['@href'] i['@id']: i['@href']
for i in self.opf_dict['package']['manifest']['item']} for i in self.opf_dict['package']['manifest']['item']}
@@ -236,10 +244,12 @@ class EPUB:
except KeyError: except KeyError:
pass pass
chapter_title = 1
toc_chapters = [ toc_chapters = [
unquote(i[2].split('#')[0]) for i in self.content] unquote(i[2].split('#')[0]) for i in self.content]
# TODO
# This totally borks the order
last_valid_index = -2 # Yes, but why? last_valid_index = -2 # Yes, but why?
for i in spine_final: for i in spine_final:
if not i in toc_chapters: if not i in toc_chapters:
@@ -251,10 +261,11 @@ class EPUB:
except ValueError: except ValueError:
last_valid_index += 1 last_valid_index += 1
# Chapters inserted here are given the placeholder title None for now.
# Blank chapters will be removed later, and the remaining None titles
# will then be replaced by a number
self.content.insert( self.content.insert(
last_valid_index + 1, last_valid_index + 1, [1, None, i])
[1, str(chapter_title), i])
chapter_title += 1
# Parse split chapters as below # Parse split chapters as below
# They can be picked up during the iteration through the toc # They can be picked up during the iteration through the toc
@@ -316,9 +327,24 @@ class EPUB:
self.content[count][2] = chapter_content self.content[count][2] = chapter_content
# Cleanup content by removing null chapters # Cleanup content by removing null chapters
self.content = [ unnamed_chapter_title = 1
i for i in self.content if i[2]] content_copy = []
for i in self.content:
if i[2]:
chapter_title = i[1]
if not chapter_title:
chapter_title = unnamed_chapter_title
unnamed_chapter_title += 1
content_copy.append((
i[0], str(chapter_title), i[2]))
self.content = content_copy
# TODO
# This can probably be circumvented by shifting the extraction
# to this module and simply getting the path to the cover
# Get cover image and put it in its place
# I imagine this involves saying nasty things to it
cover_image = self.generate_book_cover() cover_image = self.generate_book_cover()
if cover_image: if cover_image:
cover_path = os.path.join( cover_path = os.path.join(
@@ -389,7 +415,6 @@ class EPUB:
break break
except: except:
logger.warning('ISBN not found: ' + self.book_filename) logger.warning('ISBN not found: ' + self.book_filename)
pass
# Book tags # Book tags
try: try:

View File

@@ -15,7 +15,8 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
# INSTRUCTIONS # INSTRUCTIONS
# Every parser is supposed to have the following methods. None returns are not allowed. # Every parser is supposed to have the following methods.
# Exceptions raised by them will be caught - but relying on that is just bad practice
# read_book() - Initialize book # read_book() - Initialize book
# generate_metadata() - For addition # generate_metadata() - For addition
# generate_content() - For reading # generate_content() - For reading
@@ -67,6 +68,7 @@ else:
# python-lxml - Required for everything except comics # python-lxml - Required for everything except comics
lxml_check = importlib.util.find_spec('lxml') lxml_check = importlib.util.find_spec('lxml')
xmltodict_check = importlib.util.find_spec('xmltodict')
if lxml_check: if lxml_check:
lxml_dependent = { lxml_dependent = {
'epub': ParseEPUB, 'epub': ParseEPUB,
@@ -79,7 +81,7 @@ if lxml_check:
'fb2.zip': ParseFB2} 'fb2.zip': ParseFB2}
sorter.update(lxml_dependent) sorter.update(lxml_dependent)
else: else:
critical_sting = 'python-lxml is not installed. Only comics will load.' critical_sting = 'python-lxml / xmltodict is not installed. Only comics will load.'
print(critical_sting) print(critical_sting)
logger.critical(critical_sting) logger.critical(critical_sting)
@@ -122,7 +124,7 @@ class BookSorter:
self.queue = Manager().Queue() self.queue = Manager().Queue()
self.processed_books = [] self.processed_books = []
if self.work_mode == 'addition': # if self.work_mode == 'addition':
progress_object_generator() progress_object_generator()
def database_hashes(self): def database_hashes(self):
@@ -134,7 +136,6 @@ class BookSorter:
'LIKE') 'LIKE')
if all_hashes_and_paths: if all_hashes_and_paths:
# self.hashes = [i[0] for i in all_hashes]
self.hashes_and_paths = { self.hashes_and_paths = {
i[0]: i[1] for i in all_hashes_and_paths} i[0]: i[1] for i in all_hashes_and_paths}
@@ -205,6 +206,12 @@ class BookSorter:
book_ref = sorter[file_extension](filename, self.temp_dir, file_md5) book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)
# None of the following have an exception type specified
# This will keep everything from crashing, but will make
# troubleshooting difficult
# TODO
# In-application notifications
try: try:
book_ref.read_book() book_ref.read_book()
except: except:
@@ -248,7 +255,7 @@ class BookSorter:
if self.work_mode == 'reading': if self.work_mode == 'reading':
try: try:
book_breakdown = book_ref.generate_content() book_breakdown = book_ref.generate_content()
except: except KeyboardInterrupt:
logger.error('Content generation error: ' + filename) logger.error('Content generation error: ' + filename)
return return

View File

@@ -1,6 +1,7 @@
beautifulsoup4==4.7.1 beautifulsoup4==4.7.1
lxml==4.3.0 lxml==4.3.1
PyMuPDF==1.14.7 PyMuPDF==1.14.8
PyQt5==5.11.3 PyQt5==5.11.3
PyQt5-sip==4.19.13 PyQt5-sip==4.19.13
soupsieve==1.7.3 soupsieve==1.7.3
xmltodict==0.11.0