Fix MOBI parser

Update Kindleunpack
Discover new and exciting bugs
This commit is contained in:
BasioMeusPuga
2019-02-10 17:58:35 +05:30
parent f6f9d01060
commit 3cd75807f9
9 changed files with 96 additions and 76 deletions

4
TODO
View File

@@ -93,12 +93,14 @@ TODO
Deselecting all directories in the settings dialog also filters out manually added books
Last line in QTextBrowser should never be cut off
Does image alignment need to be centered?
Bookmark name for a page that's not on the TOC or has nothing before
Bookmark name for a page that's not on the TOC and has nothing before
Screen position still keeps jumping when inside a paragraph
Better recursion needed for fb2 toc
Initial sort by author in tableview
Last column not filling up tableview
Comic view mode changing does not work for newly added books
Ctrl + A reports 10 times the number of books selected for deletion
Ordering for non TOC chapters is beyond borked
Secondary:
Tab tooltip

View File

@@ -6,7 +6,7 @@ from __future__ import unicode_literals, division, absolute_import, print_functi
import os
__path__ = ["lib", os.path.dirname(__file__), "kindleunpack"]
__path__ = ["lib", os.path.dirname(os.path.realpath(__file__)), "kindleunpack"]
import sys
import codecs
@@ -140,6 +140,8 @@ if PY2:
# 0.76 pre-release version only fix name related issues in opf by not using original file name in mobi7
# 0.77 bug fix for unpacking HDImages with included Fonts
# 0.80 converted to work with both python 2.7 and Python 3.3 and later
# 0.81 various fixes
# 0.82 Handle calibre-generated mobis that can have skeletons with no fragments
DUMP = False
""" Set to True to dump all possible information. """
@@ -847,7 +849,7 @@ def process_all_mobi_headers(files, apnxfile, sect, mhlst, K8Boundary, k8only=Fa
return
def unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=True, dodump=False, dowriteraw=False, dosplitcombos=False):
def unpackBook(infile, outdir, apnxfile=None, epubver='2', use_hd=False, dodump=False, dowriteraw=False, dosplitcombos=False):
global DUMP
global WRITE_RAW_DATA
global SPLIT_COMBO_MOBIS
@@ -949,7 +951,7 @@ def main(argv=unicode_argv()):
global WRITE_RAW_DATA
global SPLIT_COMBO_MOBIS
print("KindleUnpack v0.80")
print("KindleUnpack v0.82")
print(" Based on initial mobipocket version Copyright © 2009 Charles M. Hannum <root@ihack.net>")
print(" Extensive Extensions and Improvements Copyright © 2009-2014 ")
print(" by: P. Durrant, K. Hendricks, S. Siebert, fandrieu, DiapDealer, nickredding, tkeo.")

View File

@@ -180,9 +180,11 @@ class K8Processor:
fragptr = 0
baseptr = 0
cnt = 0
filename = 'part%04d.xhtml' % cnt
for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl:
baseptr = skelpos + skellen
skeleton = text[skelpos: baseptr]
aidtext = "0"
for i in range(fragcnt):
[insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[fragptr]
aidtext = idtext[12:-2]

View File

@@ -246,13 +246,13 @@ class MobiMLConverter(object):
# handle case of end tag with no beginning by injecting empty begin tag
taginfo = ('begin', tname, None)
htmlstr += self.processtag(taginfo)
print(" - fixed by injecting empty start tag ", tname)
print " - fixed by injecting empty start tag ", tname
self.path.append(tname)
elif len(self.path) > 1 and tname == self.path[-2]:
# handle case of dangling missing end
taginfo = ('end', self.path[-1], None)
htmlstr += self.processtag(taginfo)
print(" - fixed by injecting end tag ", self.path[-1])
print " - fixed by injecting end tag ", self.path[-1]
self.path.pop()
self.path.pop()
@@ -504,18 +504,18 @@ def main(argv=sys.argv):
infile = argv[1]
try:
print('Converting Mobi Markup Language to XHTML')
print 'Converting Mobi Markup Language to XHTML'
mlc = MobiMLConverter(infile)
print('Processing ...')
print 'Processing ...'
htmlstr, css, cssname = mlc.processml()
outname = infile.rsplit('.',1)[0] + '_converted.html'
file(outname, 'wb').write(htmlstr)
file(cssname, 'wb').write(css)
print('Completed')
print('XHTML version of book can be found at: ', outname)
print 'Completed'
print 'XHTML version of book can be found at: ' + outname
except ValueError as e:
print("Error: %s" % e)
except ValueError, e:
print "Error: %s" % e
return 1
return 0

View File

@@ -14,8 +14,9 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# This module parses Amazon ebooks using KindleUnpack to first create an
# epub that is then read the usual way
# TODO
# See if it's possible to just feed the
# unzipped mobi7 file into the EPUB parser module
import os
import sys
@@ -30,73 +31,53 @@ logger = logging.getLogger(__name__)
class ParseMOBI:
# This module parses Amazon ebooks using KindleUnpack to first create an
# epub and then read the usual way
def __init__(self, filename, temp_dir, file_md5):
self.book_ref = None
self.book = None
self.filename = filename
self.epub_filepath = None
self.split_large_xml = False
self.temp_dir = temp_dir
self.extract_dir = os.path.join(temp_dir, file_md5)
self.extract_path = os.path.join(temp_dir, file_md5)
def read_book(self):
with HidePrinting():
KindleUnpack.unpackBook(self.filename, self.extract_dir)
KindleUnpack.unpackBook(self.filename, self.extract_path)
epub_filename = os.path.splitext(
os.path.basename(self.filename))[0] + '.epub'
self.epub_filepath = os.path.join(
self.extract_dir, 'mobi8', epub_filename)
self.extract_path, 'mobi8', epub_filename)
if not os.path.exists(self.epub_filepath):
zip_dir = os.path.join(self.extract_dir, 'mobi7')
zip_dir = os.path.join(self.extract_path, 'mobi7')
zip_file = os.path.join(
self.extract_dir, epub_filename)
self.extract_path, epub_filename)
self.epub_filepath = shutil.make_archive(zip_file, 'zip', zip_dir)
self.split_large_xml = True
self.book_ref = EPUB(self.epub_filepath, self.temp_dir)
self.book_ref.generate_metadata()
self.book_ref.generate_toc()
self.book_ref.generate_content()
self.book = self.book_ref.book
return True
self.book = EPUB(self.epub_filepath, self.temp_dir)
def get_title(self):
return self.book['title']
def generate_metadata(self):
self.book.generate_metadata()
return self.book.metadata
def get_author(self):
return self.book['author']
def generate_content(self):
zipfile.ZipFile(self.epub_filepath).extractall(self.extract_path)
def get_year(self):
return self.book['year']
def get_cover_image(self):
return self.book['cover']
def get_isbn(self):
return self.book['isbn']
def get_tags(self):
return self.book['tags']
def get_contents(self):
return
extract_path = os.path.join(self.extract_dir)
zipfile.ZipFile(self.epub_filepath).extractall(extract_path)
self.book_ref.parse_chapters(
temp_dir=self.temp_dir, split_large_xml=self.split_large_xml)
self.book.generate_toc()
self.book.generate_content()
toc = []
content = []
for count, i in enumerate(self.book['book_list']):
toc.append((1, i[0], count + 1))
content.append(i[1])
for count, i in enumerate(self.book.content):
toc.append((1, i[1], count + 1))
content.append(i[2])
# Return toc, content, images_only
return toc, content, False
class HidePrinting:
def __enter__(self):
self._original_stdout = sys.stdout

View File

@@ -18,7 +18,6 @@
# See if inserting chapters not in the toc.ncx can be avoided
# Account for stylesheets... eventually
# Everything needs logging
# Mobipocket files
import os
import zipfile
@@ -68,6 +67,7 @@ class EPUB:
for i in presumptive_names:
packagefile = self.find_file(i)
if packagefile:
logger.info('Using presumptive package file: ' + self.book_filename)
break
packagefile_data = self.zip_file.read(packagefile)
@@ -218,11 +218,19 @@ class EPUB:
# These are simply ids that correspond to the actual item
# as mentioned in the manifest - which is a comprehensive
# list of files
try:
# Multiple chapters
chapters_in_spine = [
i['@idref']
for i in self.opf_dict['package']['spine']['itemref']]
except TypeError:
# Single chapter - Large xml
chapters_in_spine = [
self.opf_dict['package']['spine']['itemref']['@idref']]
# Next, find items and ids from the manifest
# This might error out in case there's only one item in
# the manifest. Remember that for later.
chapters_from_manifest = {
i['@id']: i['@href']
for i in self.opf_dict['package']['manifest']['item']}
@@ -236,10 +244,12 @@ class EPUB:
except KeyError:
pass
chapter_title = 1
toc_chapters = [
unquote(i[2].split('#')[0]) for i in self.content]
# TODO
# This totally borks the order
last_valid_index = -2 # Yes, but why?
for i in spine_final:
if not i in toc_chapters:
@@ -251,10 +261,11 @@ class EPUB:
except ValueError:
last_valid_index += 1
# Chapters are currently named None
# Blank chapters will later be removed
# and the None will be replaced by a number
self.content.insert(
last_valid_index + 1,
[1, str(chapter_title), i])
chapter_title += 1
last_valid_index + 1, [1, None, i])
# Parse split chapters as below
# They can be picked up during the iteration through the toc
@@ -316,13 +327,28 @@ class EPUB:
self.content[count][2] = chapter_content
# Cleanup content by removing null chapters
self.content = [
i for i in self.content if i[2]]
unnamed_chapter_title = 1
content_copy = []
for i in self.content:
if i[2]:
chapter_title = i[1]
if not chapter_title:
chapter_title = unnamed_chapter_title
unnamed_chapter_title += 1
content_copy.append((
i[0], str(chapter_title), i[2]))
self.content = content_copy
# TODO
# This can probably be circumvented by shifting the extraction
# to this module and simply getting the path to the cover
# Get cover image and put it in its place
# I imagine this involves saying nasty things to it
cover_image = self.generate_book_cover()
if cover_image:
cover_path = os.path.join(
self.temp_dir, os.path.basename(self.book_filename)) + '- cover'
self.temp_dir, os.path.basename(self.book_filename)) + ' - cover'
with open(cover_path, 'wb') as cover_temp:
cover_temp.write(cover_image)
@@ -389,7 +415,6 @@ class EPUB:
break
except:
logger.warning('ISBN not found: ' + self.book_filename)
pass
# Book tags
try:

View File

@@ -148,7 +148,7 @@ class FB2:
cover_image = self.generate_book_cover()
if cover_image:
cover_path = os.path.join(
temp_dir, os.path.basename(self.filename)) + '- cover'
temp_dir, os.path.basename(self.filename)) + ' - cover'
with open(cover_path, 'wb') as cover_temp:
cover_temp.write(cover_image)

View File

@@ -15,7 +15,8 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# INSTRUCTIONS
# Every parser is supposed to have the following methods. None returns are not allowed.
# Every parser is supposed to have the following methods.
# Exceptions will be caught - but that's just bad practice
# read_book() - Initialize book
# generate_metadata() - For addition
# generate_content() - For reading
@@ -67,6 +68,7 @@ else:
# python-lxml - Required for everything except comics
lxml_check = importlib.util.find_spec('lxml')
xmltodict_check = importlib.util.find_spec('xmltodict')
if lxml_check:
lxml_dependent = {
'epub': ParseEPUB,
@@ -79,7 +81,7 @@ if lxml_check:
'fb2.zip': ParseFB2}
sorter.update(lxml_dependent)
else:
critical_sting = 'python-lxml is not installed. Only comics will load.'
critical_sting = 'python-lxml / xmltodict is not installed. Only comics will load.'
print(critical_sting)
logger.critical(critical_sting)
@@ -122,7 +124,7 @@ class BookSorter:
self.queue = Manager().Queue()
self.processed_books = []
if self.work_mode == 'addition':
# if self.work_mode == 'addition':
progress_object_generator()
def database_hashes(self):
@@ -134,7 +136,6 @@ class BookSorter:
'LIKE')
if all_hashes_and_paths:
# self.hashes = [i[0] for i in all_hashes]
self.hashes_and_paths = {
i[0]: i[1] for i in all_hashes_and_paths}
@@ -205,6 +206,12 @@ class BookSorter:
book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)
# None of the following have an exception type specified
# This will keep everything from crashing, but will make
# troubleshooting difficult
# TODO
# In application notifications
try:
book_ref.read_book()
except:
@@ -248,7 +255,7 @@ class BookSorter:
if self.work_mode == 'reading':
try:
book_breakdown = book_ref.generate_content()
except:
except KeyboardInterrupt:
logger.error('Content generation error: ' + filename)
return

View File

@@ -1,6 +1,7 @@
beautifulsoup4==4.7.1
lxml==4.3.0
PyMuPDF==1.14.7
lxml==4.3.1
PyMuPDF==1.14.8
PyQt5==5.11.3
PyQt5-sip==4.19.13
soupsieve==1.7.3
xmltodict==0.11.0