Fix MOBI parser

Update KindleUnpack
Discover new and exciting bugs
This commit is contained in:
BasioMeusPuga
2019-02-10 17:58:35 +05:30
parent f6f9d01060
commit 3cd75807f9
9 changed files with 96 additions and 76 deletions

View File

@@ -6,7 +6,7 @@ from __future__ import unicode_literals, division, absolute_import, print_functi
import os
__path__ = ["lib", os.path.dirname(__file__), "kindleunpack"]
__path__ = ["lib", os.path.dirname(os.path.realpath(__file__)), "kindleunpack"]
import sys
import codecs
@@ -140,6 +140,8 @@ if PY2:
# 0.76 pre-release version - only fixes name-related issues in the opf by not using the original file name in mobi7
# 0.77 bug fix for unpacking HDImages with included Fonts
# 0.80 converted to work with both python 2.7 and Python 3.3 and later
# 0.81 various fixes
# 0.82 Handle calibre-generated mobis that can have skeletons with no fragments
DUMP = False
""" Set to True to dump all possible information. """
@@ -847,7 +849,7 @@ def process_all_mobi_headers(files, apnxfile, sect, mhlst, K8Boundary, k8only=Fa
return
def unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=True, dodump=False, dowriteraw=False, dosplitcombos=False):
def unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=False, dodump=False, dowriteraw=False, dosplitcombos=False):
global DUMP
global WRITE_RAW_DATA
global SPLIT_COMBO_MOBIS
@@ -949,7 +951,7 @@ def main(argv=unicode_argv()):
global WRITE_RAW_DATA
global SPLIT_COMBO_MOBIS
print("KindleUnpack v0.80")
print("KindleUnpack v0.82")
print(" Based on initial mobipocket version Copyright © 2009 Charles M. Hannum <root@ihack.net>")
print(" Extensive Extensions and Improvements Copyright © 2009-2014 ")
print(" by: P. Durrant, K. Hendricks, S. Siebert, fandrieu, DiapDealer, nickredding, tkeo.")

View File

@@ -180,9 +180,11 @@ class K8Processor:
fragptr = 0
baseptr = 0
cnt = 0
filename = 'part%04d.xhtml' % cnt
for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl:
baseptr = skelpos + skellen
skeleton = text[skelpos: baseptr]
aidtext = "0"
for i in range(fragcnt):
[insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[fragptr]
aidtext = idtext[12:-2]

View File

@@ -246,13 +246,13 @@ class MobiMLConverter(object):
# handle case of end tag with no beginning by injecting empty begin tag
taginfo = ('begin', tname, None)
htmlstr += self.processtag(taginfo)
print(" - fixed by injecting empty start tag ", tname)
print " - fixed by injecting empty start tag ", tname
self.path.append(tname)
elif len(self.path) > 1 and tname == self.path[-2]:
# handle case of dangling missing end
taginfo = ('end', self.path[-1], None)
htmlstr += self.processtag(taginfo)
print(" - fixed by injecting end tag ", self.path[-1])
print " - fixed by injecting end tag ", self.path[-1]
self.path.pop()
self.path.pop()
@@ -504,18 +504,18 @@ def main(argv=sys.argv):
infile = argv[1]
try:
print('Converting Mobi Markup Language to XHTML')
print 'Converting Mobi Markup Language to XHTML'
mlc = MobiMLConverter(infile)
print('Processing ...')
print 'Processing ...'
htmlstr, css, cssname = mlc.processml()
outname = infile.rsplit('.',1)[0] + '_converted.html'
file(outname, 'wb').write(htmlstr)
file(cssname, 'wb').write(css)
print('Completed')
print('XHTML version of book can be found at: ', outname)
print 'Completed'
print 'XHTML version of book can be found at: ' + outname
except ValueError as e:
print("Error: %s" % e)
except ValueError, e:
print "Error: %s" % e
return 1
return 0

View File

@@ -14,8 +14,9 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# This module parses Amazon ebooks using KindleUnpack to first create an
# epub that is then read the usual way
# TODO
# See if it's possible to just feed the
# unzipped mobi7 file into the EPUB parser module
import os
import sys
@@ -30,73 +31,53 @@ logger = logging.getLogger(__name__)
class ParseMOBI:
# This module parses Amazon ebooks by using KindleUnpack to first create an
# epub, which is then read the usual way
def __init__(self, filename, temp_dir, file_md5):
self.book_ref = None
self.book = None
self.filename = filename
self.epub_filepath = None
self.split_large_xml = False
self.temp_dir = temp_dir
self.extract_dir = os.path.join(temp_dir, file_md5)
self.extract_path = os.path.join(temp_dir, file_md5)
def read_book(self):
with HidePrinting():
KindleUnpack.unpackBook(self.filename, self.extract_dir)
KindleUnpack.unpackBook(self.filename, self.extract_path)
epub_filename = os.path.splitext(
os.path.basename(self.filename))[0] + '.epub'
self.epub_filepath = os.path.join(
self.extract_dir, 'mobi8', epub_filename)
self.extract_path, 'mobi8', epub_filename)
if not os.path.exists(self.epub_filepath):
zip_dir = os.path.join(self.extract_dir, 'mobi7')
zip_dir = os.path.join(self.extract_path, 'mobi7')
zip_file = os.path.join(
self.extract_dir, epub_filename)
self.extract_path, epub_filename)
self.epub_filepath = shutil.make_archive(zip_file, 'zip', zip_dir)
self.split_large_xml = True
self.book_ref = EPUB(self.epub_filepath, self.temp_dir)
self.book_ref.generate_metadata()
self.book_ref.generate_toc()
self.book_ref.generate_content()
self.book = self.book_ref.book
return True
self.book = EPUB(self.epub_filepath, self.temp_dir)
def get_title(self):
return self.book['title']
def generate_metadata(self):
self.book.generate_metadata()
return self.book.metadata
def get_author(self):
return self.book['author']
def generate_content(self):
zipfile.ZipFile(self.epub_filepath).extractall(self.extract_path)
def get_year(self):
return self.book['year']
def get_cover_image(self):
return self.book['cover']
def get_isbn(self):
return self.book['isbn']
def get_tags(self):
return self.book['tags']
def get_contents(self):
return
extract_path = os.path.join(self.extract_dir)
zipfile.ZipFile(self.epub_filepath).extractall(extract_path)
self.book_ref.parse_chapters(
temp_dir=self.temp_dir, split_large_xml=self.split_large_xml)
self.book.generate_toc()
self.book.generate_content()
toc = []
content = []
for count, i in enumerate(self.book['book_list']):
toc.append((1, i[0], count + 1))
content.append(i[1])
for count, i in enumerate(self.book.content):
toc.append((1, i[1], count + 1))
content.append(i[2])
# Return toc, content, images_only
return toc, content, False
class HidePrinting:
def __enter__(self):
self._original_stdout = sys.stdout

View File

@@ -18,7 +18,6 @@
# See if inserting chapters not in the toc.ncx can be avoided
# Account for stylesheets... eventually
# Everything needs logging
# Mobipocket files
import os
import zipfile
@@ -68,6 +67,7 @@ class EPUB:
for i in presumptive_names:
packagefile = self.find_file(i)
if packagefile:
logger.info('Using presumptive package file: ' + self.book_filename)
break
packagefile_data = self.zip_file.read(packagefile)
@@ -218,11 +218,19 @@ class EPUB:
# These are simply ids that correspond to the actual item
# as mentioned in the manifest - which is a comprehensive
# list of files
chapters_in_spine = [
i['@idref']
for i in self.opf_dict['package']['spine']['itemref']]
try:
# Multiple chapters
chapters_in_spine = [
i['@idref']
for i in self.opf_dict['package']['spine']['itemref']]
except TypeError:
# Single chapter - Large xml
chapters_in_spine = [
self.opf_dict['package']['spine']['itemref']['@idref']]
# Next, find items and ids from the manifest
# This might error out in case there's only one item in
# the manifest. Remember that for later.
chapters_from_manifest = {
i['@id']: i['@href']
for i in self.opf_dict['package']['manifest']['item']}
@@ -236,10 +244,12 @@ class EPUB:
except KeyError:
pass
chapter_title = 1
toc_chapters = [
unquote(i[2].split('#')[0]) for i in self.content]
# TODO
# This totally borks the order
last_valid_index = -2 # Yes, but why?
for i in spine_final:
if not i in toc_chapters:
@@ -251,10 +261,11 @@ class EPUB:
except ValueError:
last_valid_index += 1
# Chapters are currently named None
# Blank chapters will later be removed
# and the None will be replaced by a number
self.content.insert(
last_valid_index + 1,
[1, str(chapter_title), i])
chapter_title += 1
last_valid_index + 1, [1, None, i])
# Parse split chapters as below
# They can be picked up during the iteration through the toc
@@ -316,13 +327,28 @@ class EPUB:
self.content[count][2] = chapter_content
# Cleanup content by removing null chapters
self.content = [
i for i in self.content if i[2]]
unnamed_chapter_title = 1
content_copy = []
for i in self.content:
if i[2]:
chapter_title = i[1]
if not chapter_title:
chapter_title = unnamed_chapter_title
unnamed_chapter_title += 1
content_copy.append((
i[0], str(chapter_title), i[2]))
self.content = content_copy
# TODO
# This can probably be circumvented by shifting the extraction
# to this module and simply getting the path to the cover
# Get cover image and put it in its place
# I imagine this involves saying nasty things to it
cover_image = self.generate_book_cover()
if cover_image:
cover_path = os.path.join(
self.temp_dir, os.path.basename(self.book_filename)) + '- cover'
self.temp_dir, os.path.basename(self.book_filename)) + ' - cover'
with open(cover_path, 'wb') as cover_temp:
cover_temp.write(cover_image)
@@ -389,7 +415,6 @@ class EPUB:
break
except:
logger.warning('ISBN not found: ' + self.book_filename)
pass
# Book tags
try:

View File

@@ -148,7 +148,7 @@ class FB2:
cover_image = self.generate_book_cover()
if cover_image:
cover_path = os.path.join(
temp_dir, os.path.basename(self.filename)) + '- cover'
temp_dir, os.path.basename(self.filename)) + ' - cover'
with open(cover_path, 'wb') as cover_temp:
cover_temp.write(cover_image)

View File

@@ -15,7 +15,8 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# INSTRUCTIONS
# Every parser is supposed to have the following methods. None returns are not allowed.
# Every parser is supposed to have the following methods.
# Exceptions will be caught - but that's just bad practice
# read_book() - Initialize book
# generate_metadata() - For addition
# generate_content() - For reading
@@ -67,6 +68,7 @@ else:
# python-lxml - Required for everything except comics
lxml_check = importlib.util.find_spec('lxml')
xmltodict_check = importlib.util.find_spec('xmltodict')
if lxml_check:
lxml_dependent = {
'epub': ParseEPUB,
@@ -79,7 +81,7 @@ if lxml_check:
'fb2.zip': ParseFB2}
sorter.update(lxml_dependent)
else:
critical_sting = 'python-lxml is not installed. Only comics will load.'
critical_sting = 'python-lxml / xmltodict is not installed. Only comics will load.'
print(critical_sting)
logger.critical(critical_sting)
@@ -122,8 +124,8 @@ class BookSorter:
self.queue = Manager().Queue()
self.processed_books = []
if self.work_mode == 'addition':
progress_object_generator()
# if self.work_mode == 'addition':
progress_object_generator()
def database_hashes(self):
all_hashes_and_paths = database.DatabaseFunctions(
@@ -134,7 +136,6 @@ class BookSorter:
'LIKE')
if all_hashes_and_paths:
# self.hashes = [i[0] for i in all_hashes]
self.hashes_and_paths = {
i[0]: i[1] for i in all_hashes_and_paths}
@@ -205,6 +206,12 @@ class BookSorter:
book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)
# None of the following have an exception type specified
# This will keep everything from crashing, but will make
# troubleshooting difficult
# TODO
# In-application notifications
try:
book_ref.read_book()
except:
@@ -248,7 +255,7 @@ class BookSorter:
if self.work_mode == 'reading':
try:
book_breakdown = book_ref.generate_content()
except:
except KeyboardInterrupt:
logger.error('Content generation error: ' + filename)
return