The GUI elements have been removed
This commit is contained in:
BasioMeusPuga
2018-03-09 14:24:05 +05:30
parent d2910188d6
commit 2cbd2df9a5
21 changed files with 7286 additions and 0 deletions

2
KindleUnpack/__init__.py Normal file

@@ -0,0 +1,2 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

278
KindleUnpack/compatibility_utils.py Normal file

@@ -0,0 +1,278 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice, this list
# of conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from __future__ import unicode_literals, division, absolute_import, print_function
import sys
import codecs
PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3
iswindows = sys.platform.startswith('win')
try:
from urllib.parse import unquote
except ImportError:
from urllib import unquote
if PY2:
from HTMLParser import HTMLParser
_h = HTMLParser()
elif sys.version_info[1] < 4:
import html.parser
_h = html.parser.HTMLParser()
else:
import html as _h
if PY3:
text_type = str
binary_type = bytes
# if you will be printing arbitrary binary data to stdout on python 3
# sys.stdin = sys.stdin.detach()
# sys.stdout = sys.stdout.detach()
# sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
else:
range = xrange
text_type = unicode
binary_type = str
# if you will be printing unicode under python 2 you need to protect
# against sys.stdout.encoding being None, which stupidly forces ascii encoding of unicode
# sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
# alternatively set environment variable as follows **before** launching python: export PYTHONIOENCODING=UTF-8
# NOTE: Python 3 is completely broken when accessing single bytes in bytes strings
# (and they amazingly claim it is by design and not a bug!)
# To illustrate: this works for unicode in Python 3 and for all Python 2.X for both bytestrings and unicode
# >>> o = '123456789'
# >>> o[-3]
# '7'
# >>> type(o[-3])
# <class 'str'>
# >>> type(o)
# <class 'str'>
# Unfortunately, this is what Python 3 does for no sane reason and only for bytestrings
# >>> o = b'123456789'
# >>> o[-3]
# 55
# >>> type(o[-3])
# <class 'int'>
# >>> type(o)
# <class 'bytes'>
# This mind boggling behaviour also happens when indexing a bytestring and/or
# iterating over a bytestring. In other words it will return an int but not
# the byte itself!
# The only way to access a single byte as a byte in bytestring and get the byte in both
# Python 2 and Python 3 is to use a slice
# This problem is so common there are horrible hacks floating around the net to **try**
# to work around it, so that code that works on both Python 2 and Python 3 is possible.
# So in order to write code that works on both Python 2 and Python 3
# if you index or access a single byte and want its ord() then use the bord() function.
# If instead you want it as a single character byte use the bchar() function
# both of which are defined below.
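# For example (an illustrative session using the helpers defined below):
# >>> data = b'ABC'
# >>> bord(data[0])       # 65 on both Python 2 and Python 3
# >>> bchar(data[0])      # a one-byte bytestring on both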
if PY3:
# Also note: if you decode a bytestring using 'latin-1' (or any other full range 0-255 encoding)
# in place of ascii you get a one-to-one mapping from byte values to integer
# code points (in the 0 - 255 range)
def bchr(s):
return bytes([s])
def bstr(s):
if isinstance(s, str):
return bytes(s, 'latin-1')
else:
return bytes(s)
def bord(s):
return s
def bchar(s):
return bytes([s])
else:
def bchr(s):
return chr(s)
def bstr(s):
return str(s)
def bord(s):
return ord(s)
def bchar(s):
return s
if PY3:
# list-producing versions of the major Python iterating functions
def lrange(*args, **kwargs):
return list(range(*args, **kwargs))
def lzip(*args, **kwargs):
return list(zip(*args, **kwargs))
def lmap(*args, **kwargs):
return list(map(*args, **kwargs))
def lfilter(*args, **kwargs):
return list(filter(*args, **kwargs))
else:
import __builtin__
# the Python 2 builtins range, zip, map and filter already produce lists
lrange = __builtin__.range
lzip = __builtin__.zip
lmap = __builtin__.map
lfilter = __builtin__.filter
# In Python 3 you can no longer use .encode('hex') on a bytestring
# instead use the following on both platforms
import binascii
def hexlify(bdata):
return (binascii.hexlify(bdata)).decode('ascii')
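# For example (illustrative): hexlify(b'MOBI') returns the unicode string
# '4d4f4249' on both Python 2 and Python 3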
# If you: import struct
# Note: struct pack, unpack, unpack_from all *require* bytestring format
# data all the way up to at least Python 2.7.5, Python 3 is okay with either
# If you: import re
# note: Python 3 "re" requires the pattern to be the exact same type as the data to be
# searched ... but u"" is not allowed for the pattern itself only b""
# Python 2.X allows the pattern to be any type and converts it to match the data
# and returns the same type as the data
# convert string to be utf-8 encoded
def utf8_str(p, enc='utf-8'):
if p is None:
return None
if isinstance(p, text_type):
return p.encode('utf-8')
if enc != 'utf-8':
return p.decode(enc).encode('utf-8')
return p
# convert string to be unicode encoded
def unicode_str(p, enc='utf-8'):
if p is None:
return None
if isinstance(p, text_type):
return p
return p.decode(enc)
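# For example (illustrative): utf8_str(u'caf\xe9') returns b'caf\xc3\xa9' and
# unicode_str(b'caf\xc3\xa9') returns u'caf\xe9' on both Python 2 and Python 3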
ASCII_CHARS = set(chr(x) for x in range(128))
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
'abcdefghijklmnopqrstuvwxyz'
'0123456789' '#' '_.-/~')
IRI_UNSAFE = ASCII_CHARS - URL_SAFE
# returns a quoted IRI (not a URI)
def quoteurl(href):
if isinstance(href,binary_type):
href = href.decode('utf-8')
result = []
for char in href:
if char in IRI_UNSAFE:
char = "%%%02x" % ord(char)
result.append(char)
return ''.join(result)
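# For example (illustrative): quoteurl('Text/my page.xhtml') returns
# 'Text/my%20page.xhtml' -- the space is ASCII but not URL_SAFE so it is
# percent-encoded, while non-ASCII IRI characters pass through untouched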
# unquotes url/iri
def unquoteurl(href):
if isinstance(href,binary_type):
href = href.decode('utf-8')
href = unquote(href)
return href
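# For example (illustrative): unquoteurl('Text/my%20page.xhtml') returns 'Text/my page.xhtml'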
# unescape html
def unescapeit(sval):
return _h.unescape(sval)
# Python 2.X commandline parsing under Windows has been horribly broken for years!
# Use the following code to emulate full unicode commandline parsing on Python 2
# i.e. to get sys.argv arguments and properly decode them as unicode
def unicode_argv():
global iswindows
global PY3
if PY3:
return sys.argv
if iswindows:
# Versions 2.x of Python don't support Unicode in sys.argv on
# Windows, with the underlying Windows API instead replacing multi-byte
# characters with '?'. So use shell32.GetCommandLineArgvW to get sys.argv
# as a list of Unicode strings
from ctypes import POINTER, byref, cdll, c_int, windll
from ctypes.wintypes import LPCWSTR, LPWSTR
GetCommandLineW = cdll.kernel32.GetCommandLineW
GetCommandLineW.argtypes = []
GetCommandLineW.restype = LPCWSTR
CommandLineToArgvW = windll.shell32.CommandLineToArgvW
CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)]
CommandLineToArgvW.restype = POINTER(LPWSTR)
cmd = GetCommandLineW()
argc = c_int(0)
argv = CommandLineToArgvW(cmd, byref(argc))
if argc.value > 0:
# Remove Python executable and commands if present
start = argc.value - len(sys.argv)
return [argv[i] for i in
range(start, argc.value)]
# this should never happen
return None
else:
argv = []
argvencoding = sys.stdin.encoding
if argvencoding is None:
argvencoding = sys.getfilesystemencoding()
if argvencoding is None:
argvencoding = 'utf-8'
for arg in sys.argv:
if isinstance(arg, text_type):
argv.append(arg)
else:
argv.append(arg.decode(argvencoding))
return argv
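# Typical use (illustrative): assign sys.argv = unicode_argv() once at startup so the
# rest of the program can treat every command line argument as unicode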
# Python 2.X is broken in that it does not recognize CP65001 as UTF-8
def add_cp65001_codec():
if PY2:
try:
codecs.lookup('cp65001')
except LookupError:
codecs.register(
lambda name: name == 'cp65001' and codecs.lookup('utf-8') or None)
return
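# After calling add_cp65001_codec(), codecs.lookup('cp65001') resolves to the utf-8
# codec on Python 2 instead of raising LookupError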

1018
KindleUnpack/kindleunpack.py Normal file

File diff suppressed because it is too large

238
KindleUnpack/mobi_cover.py Normal file

@@ -0,0 +1,238 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import unicode_literals, division, absolute_import, print_function
from .compatibility_utils import unicode_str
from .unipath import pathof
import os
import imghdr
import struct
# note: struct pack, unpack, unpack_from all require bytestring format
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
USE_SVG_WRAPPER = True
""" Set to True to use svg wrapper for default. """
FORCE_DEFAULT_TITLE = False
""" Set to True to force to use the default title. """
COVER_PAGE_FINENAME = 'cover_page.xhtml'
""" The name for the cover page. """
DEFAULT_TITLE = 'Cover'
""" The default title for the cover page. """
MAX_WIDTH = 4096
""" The max width for the svg cover page. """
MAX_HEIGHT = 4096
""" The max height for the svg cover page. """
def get_image_type(imgname, imgdata=None):
imgtype = unicode_str(imghdr.what(pathof(imgname), imgdata))
# imghdr only checks for JFIF or Exif JPEG files. Apparently, there are some
# with only the magic JPEG bytes out there...
# ImageMagick handles those, so do it too.
if imgtype is None:
if imgdata is None:
with open(pathof(imgname), 'rb') as f:
imgdata = f.read()
if imgdata[0:2] == b'\xFF\xD8':
# Get last non-null bytes
last = len(imgdata)
while (imgdata[last-1:last] == b'\x00'):
last-=1
# Be extra safe, check the trailing bytes, too.
if imgdata[last-2:last] == b'\xFF\xD9':
imgtype = "jpeg"
return imgtype
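# For example (illustrative): get_image_type('cover.jpg') returns 'jpeg' even when the
# file lacks a JFIF/Exif marker, provided it starts with FF D8 and ends with FF D9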
def get_image_size(imgname, imgdata=None):
'''Determine the image type of imgname (or imgdata) and return its size.
Originally: "Determine the image type of fhandle and return its size", from draco.'''
if imgdata is None:
fhandle = open(pathof(imgname), 'rb')
head = fhandle.read(24)
else:
head = imgdata[0:24]
if len(head) != 24:
return
imgtype = get_image_type(imgname, imgdata)
if imgtype == 'png':
check = struct.unpack(b'>i', head[4:8])[0]
if check != 0x0d0a1a0a:
return
width, height = struct.unpack(b'>ii', head[16:24])
elif imgtype == 'gif':
width, height = struct.unpack(b'<HH', head[6:10])
elif imgtype == 'jpeg' and imgdata is None:
try:
fhandle.seek(0) # Read 0xff next
size = 2
ftype = 0
while not 0xc0 <= ftype <= 0xcf:
fhandle.seek(size, 1)
byte = fhandle.read(1)
while ord(byte) == 0xff:
byte = fhandle.read(1)
ftype = ord(byte)
size = struct.unpack(b'>H', fhandle.read(2))[0] - 2
# We are at a SOFn block
fhandle.seek(1, 1) # Skip `precision' byte.
height, width = struct.unpack(b'>HH', fhandle.read(4))
except Exception: # IGNORE:W0703
return
elif imgtype == 'jpeg' and imgdata is not None:
try:
pos = 0
size = 2
ftype = 0
while not 0xc0 <= ftype <= 0xcf:
pos += size
byte = imgdata[pos:pos+1]
pos += 1
while ord(byte) == 0xff:
byte = imgdata[pos:pos+1]
pos += 1
ftype = ord(byte)
size = struct.unpack(b'>H', imgdata[pos:pos+2])[0] - 2
pos += 2
# We are at a SOFn block
pos += 1 # Skip `precision' byte.
height, width = struct.unpack(b'>HH', imgdata[pos:pos+4])
pos += 4
except Exception: # IGNORE:W0703
return
else:
return
return width, height
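# For example (illustrative): get_image_size(None, imgdata) returns a (width, height)
# tuple such as (600, 800) for png, gif or jpeg data, or None if the type is unknown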
# XXX experimental
class CoverProcessor(object):
"""Create a cover page.
"""
def __init__(self, files, metadata, rscnames, imgname=None, imgdata=None):
self.files = files
self.metadata = metadata
self.rscnames = rscnames
self.cover_page = COVER_PAGE_FINENAME
self.use_svg = USE_SVG_WRAPPER # Use svg wrapper.
self.lang = metadata.get('Language', ['en'])[0]
# This should ensure that if the methods to find the cover image's
# dimensions should fail for any reason, the SVG routine will not be used.
[self.width, self.height] = (-1,-1)
if FORCE_DEFAULT_TITLE:
self.title = DEFAULT_TITLE
else:
self.title = metadata.get('Title', [DEFAULT_TITLE])[0]
self.cover_image = None
if imgname is not None:
self.cover_image = imgname
elif 'CoverOffset' in metadata:
imageNumber = int(metadata['CoverOffset'][0])
cover_image = self.rscnames[imageNumber]
if cover_image is not None:
self.cover_image = cover_image
else:
print('Warning: Cannot identify the cover image.')
if self.use_svg:
try:
if imgdata is None:
fname = os.path.join(files.imgdir, self.cover_image)
[self.width, self.height] = get_image_size(fname)
else:
[self.width, self.height] = get_image_size(None, imgdata)
except:
self.use_svg = False
width = self.width
height = self.height
if width < 0 or height < 0 or width > MAX_WIDTH or height > MAX_HEIGHT:
self.use_svg = False
return
def getImageName(self):
return self.cover_image
def getXHTMLName(self):
return self.cover_page
def buildXHTML(self):
print('Building a cover page.')
files = self.files
cover_image = self.cover_image
title = self.title
lang = self.lang
image_dir = os.path.normpath(os.path.relpath(files.k8images, files.k8text))
image_path = os.path.join(image_dir, cover_image).replace('\\', '/')
if not self.use_svg:
data = ''
data += '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>'
data += '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"'
data += ' xml:lang="{:s}">\n'.format(lang)
data += '<head>\n<title>{:s}</title>\n'.format(title)
data += '<style type="text/css">\n'
data += 'body {\n margin: 0;\n padding: 0;\n text-align: center;\n}\n'
data += 'div {\n height: 100%;\n width: 100%;\n text-align: center;\n page-break-inside: avoid;\n}\n'
data += 'img {\n display: inline-block;\n height: 100%;\n margin: 0 auto;\n}\n'
data += '</style>\n</head>\n'
data += '<body><div>\n'
data += ' <img src="{:s}" alt=""/>\n'.format(image_path)
data += '</div></body>\n</html>'
else:
width = self.width
height = self.height
viewBox = "0 0 {0:d} {1:d}".format(width, height)
data = ''
data += '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>'
data += '<html xmlns="http://www.w3.org/1999/xhtml"'
data += ' xml:lang="{:s}">\n'.format(lang)
data += '<head>\n <title>{:s}</title>\n'.format(title)
data += '<style type="text/css">\n'
data += 'svg {padding: 0pt; margin:0pt}\n'
data += 'body { text-align: center; padding:0pt; margin: 0pt; }\n'
data += '</style>\n</head>\n'
data += '<body>\n <div>\n'
data += ' <svg xmlns="http://www.w3.org/2000/svg" height="100%" preserveAspectRatio="xMidYMid meet"'
data += ' version="1.1" viewBox="{0:s}" width="100%" xmlns:xlink="http://www.w3.org/1999/xlink">\n'.format(viewBox)
data += ' <image height="{0}" width="{1}" xlink:href="{2}"/>\n'.format(height, width, image_path)
data += ' </svg>\n'
data += ' </div>\n</body>\n</html>'
return data
def writeXHTML(self):
files = self.files
cover_page = self.cover_page
data = self.buildXHTML()
outfile = os.path.join(files.k8text, cover_page)
if os.path.exists(pathof(outfile)):
print('Warning: {:s} already exists.'.format(cover_page))
os.remove(pathof(outfile))
with open(pathof(outfile), 'wb') as f:
f.write(data.encode('utf-8'))
return
def guide_toxml(self):
files = self.files
text_dir = os.path.relpath(files.k8text, files.k8oebps)
data = '<reference type="cover" title="Cover" href="{:s}/{:s}" />\n'.format(
text_dir, self.cover_page)
return data
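# Illustrative output, assuming the K8 text directory is named 'Text':
# <reference type="cover" title="Cover" href="Text/cover_page.xhtml" />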

377
KindleUnpack/mobi_dict.py Normal file

@@ -0,0 +1,377 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import unicode_literals, division, absolute_import, print_function
from .compatibility_utils import PY2, PY3, utf8_str, bstr, bchr
if PY2:
range = xrange
array_format = b'B'
if PY3:
unichr = chr
array_format = "B"
import array
import struct
# note: struct pack, unpack, unpack_from all require bytestring format
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
from .mobi_index import getVariableWidthValue, readTagSection, getTagMap
from .mobi_utils import toHex
DEBUG_DICT = False
class InflectionData(object):
def __init__(self, infldatas):
self.infldatas = infldatas
self.starts = []
self.counts = []
for idata in self.infldatas:
start, = struct.unpack_from(b'>L', idata, 0x14)
count, = struct.unpack_from(b'>L', idata, 0x18)
self.starts.append(start)
self.counts.append(count)
def lookup(self, lookupvalue):
i = 0
rvalue = lookupvalue
while rvalue >= self.counts[i]:
rvalue = rvalue - self.counts[i]
i += 1
if i == len(self.counts):
print("Error: Problem with multiple inflections data sections")
return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0]
return rvalue, self.starts[i], self.counts[i], self.infldatas[i]
def offsets(self, value):
rvalue, start, count, data = self.lookup(value)
offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue))
if rvalue + 1 < count:
nextOffset, = struct.unpack_from(b'>H',data, start + 4 + (2 * (rvalue + 1)))
else:
nextOffset = None
return offset, nextOffset, data
class dictSupport(object):
def __init__(self, mh, sect):
self.mh = mh
self.header = mh.header
self.sect = sect
self.metaOrthIndex = mh.metaOrthIndex
self.metaInflIndex = mh.metaInflIndex
def parseHeader(self, data):
"read INDX header"
if not data[:4] == b'INDX':
print("Warning: index section is not INDX")
return False
words = (
'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc'
)
num = len(words)
values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)])
header = {}
for n in range(num):
header[words[n]] = values[n]
ordt1 = None
ordt2 = None
otype, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4)
header['otype'] = otype
header['oentries'] = oentries
if DEBUG_DICT:
print("otype %d, oentries %d, op1 %d, op2 %d, otagx %d" % (otype, oentries, op1, op2, otagx))
if header['code'] == 0xfdea or oentries > 0:
# some dictionaries seem to be codepage 65002 (0xFDEA) which seems
# to be some sort of strange EBCDIC utf-8 or utf-16 encoded strings
# So we need to look for them and store them away to process leading text
# ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
# we only ever seem to use the second but ...
#
# if otype = 0, ORDT table uses 16 bit values as offsets into the table
# if otype = 1, ORDT table uses 8 bit values as offsets into the table
assert(data[op1:op1+4] == b'ORDT')
assert(data[op2:op2+4] == b'ORDT')
ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4)
ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4)
if DEBUG_DICT:
print("parsed INDX header:")
for key in header:
print(key, "%x" % header[key],)
print("\n")
return header, ordt1, ordt2
def getPositionMap(self):
sect = self.sect
positionMap = {}
metaOrthIndex = self.metaOrthIndex
metaInflIndex = self.metaInflIndex
decodeInflection = True
if metaOrthIndex != 0xFFFFFFFF:
print("Info: Document contains orthographic index, handle as dictionary")
if metaInflIndex == 0xFFFFFFFF:
decodeInflection = False
else:
metaInflIndexData = sect.loadSection(metaInflIndex)
print("\nParsing metaInflIndexData")
midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData)
metaIndexCount = midxhdr['count']
idatas = []
for j in range(metaIndexCount):
idatas.append(sect.loadSection(metaInflIndex + 1 + j))
dinfl = InflectionData(idatas)
inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)
tagSectionStart = midxhdr['len']
inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData)
if DEBUG_DICT:
print("inflectionTagTable: %s" % inflectionTagTable)
if self.hasTag(inflectionTagTable, 0x07):
print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported")
decodeInflection = False
data = sect.loadSection(metaOrthIndex)
print("\nParsing metaOrthIndex")
idxhdr, hordt1, hordt2 = self.parseHeader(data)
tagSectionStart = idxhdr['len']
controlByteCount, tagTable = readTagSection(tagSectionStart, data)
orthIndexCount = idxhdr['count']
print("orthIndexCount is", orthIndexCount)
if DEBUG_DICT:
print("orthTagTable: %s" % tagTable)
if hordt2 is not None:
print("orth entry uses ordt2 lookup table of type ", idxhdr['otype'])
hasEntryLength = self.hasTag(tagTable, 0x02)
if not hasEntryLength:
print("Info: Index doesn't contain entry length tags")
print("Read dictionary index data")
for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
data = sect.loadSection(i)
hdrinfo, ordt1, ordt2 = self.parseHeader(data)
idxtPos = hdrinfo['start']
entryCount = hdrinfo['count']
idxPositions = []
for j in range(entryCount):
pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j))
idxPositions.append(pos)
# The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
idxPositions.append(idxtPos)
for j in range(entryCount):
startPos = idxPositions[j]
endPos = idxPositions[j+1]
textLength = ord(data[startPos:startPos+1])
text = data[startPos+1:startPos+1+textLength]
if hordt2 is not None:
utext = u""
if idxhdr['otype'] == 0:
pattern = b'>H'
inc = 2
else:
pattern = b'>B'
inc = 1
pos = 0
while pos < textLength:
off, = struct.unpack_from(pattern, text, pos)
if off < len(hordt2):
utext += unichr(hordt2[off])
else:
utext += unichr(off)
pos += inc
text = utext.encode('utf-8')
tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos)
if 0x01 in tagMap:
if decodeInflection and 0x2a in tagMap:
inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable,
dinfl, inflNameData, tagMap[0x2a])
else:
inflectionGroups = b''
assert len(tagMap[0x01]) == 1
entryStartPosition = tagMap[0x01][0]
if hasEntryLength:
# The idx:entry attribute "scriptable" must be present to create entry length tags.
ml = b'<idx:entry scriptable="yes"><idx:orth value="' + text + b'">' + inflectionGroups + b'</idx:orth>'
if entryStartPosition in positionMap:
positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml
else:
positionMap[entryStartPosition] = ml
assert len(tagMap[0x02]) == 1
entryEndPosition = entryStartPosition + tagMap[0x02][0]
if entryEndPosition in positionMap:
positionMap[entryEndPosition] = b"</idx:entry>" + positionMap[entryEndPosition]
else:
positionMap[entryEndPosition] = b"</idx:entry>"
else:
indexTags = b'<idx:entry>\n<idx:orth value="' + text + b'">\n' + inflectionGroups + b'</idx:entry>\n'
if entryStartPosition in positionMap:
positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags
else:
positionMap[entryStartPosition] = indexTags
return positionMap
def hasTag(self, tagTable, tag):
'''
Test if tag table contains given tag.
@param tagTable: The tag table.
@param tag: The tag to search.
@return: True if tag table contains given tag; False otherwise.
'''
for currentTag, _, _, _ in tagTable:
if currentTag == tag:
return True
return False
def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList):
'''
Create string which contains the inflection groups with inflection rules as mobipocket tags.
@param mainEntry: The word to inflect.
@param controlByteCount: The number of control bytes.
@param tagTable: The tag table.
@param data: The Inflection data object to properly select the right inflection data section to use
@param inflectionNames: The inflection rule name data.
@param groupList: The list of inflection groups to process.
@return: String with inflection groups and rules or empty string if required tags are not available.
'''
result = b""
for value in groupList:
offset, nextOffset, data = dinfl.offsets(value)
# First byte seems to be always 0x00 and must be skipped.
assert ord(data[offset:offset+1]) == 0x00
tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset)
# Make sure that the required tags are available.
if 0x05 not in tagMap:
print("Error: Required tag 0x05 not found in tagMap")
return ""
if 0x1a not in tagMap:
print("Error: Required tag 0x1a not found in tagMap")
return b''
result += b'<idx:infl>'
for i in range(len(tagMap[0x05])):
# Get name of inflection rule.
value = tagMap[0x05][i]
consumed, textLength = getVariableWidthValue(inflectionNames, value)
inflectionName = inflectionNames[value+consumed:value+consumed+textLength]
# Get and apply inflection rule across possibly multiple inflection data sections
value = tagMap[0x1a][i]
rvalue, start, count, data = dinfl.lookup(value)
offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue))
textLength = ord(data[offset:offset+1])
inflection = self.applyInflectionRule(mainEntry, data, offset+1, offset+1+textLength)
if inflection is not None:
result += b' <idx:iform name="' + inflectionName + b'" value="' + inflection + b'"/>'
result += b'</idx:infl>'
return result
def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end):
'''
Apply inflection rule.
@param mainEntry: The word to inflect.
@param inflectionRuleData: The inflection rules.
@param start: The start position of the inflection rule to use.
@param end: The end position of the inflection rule to use.
@return: The string with the inflected word or None if an error occurs.
'''
mode = -1
byteArray = array.array(array_format, mainEntry)
position = len(byteArray)
for charOffset in range(start, end):
char = inflectionRuleData[charOffset:charOffset+1]
abyte = ord(char)
if abyte >= 0x0a and abyte <= 0x13:
# Move cursor backwards
offset = abyte - 0x0a
if mode not in [0x02, 0x03]:
mode = 0x02
position = len(byteArray)
position -= offset
elif abyte > 0x13:
if mode == -1:
print("Error: Unexpected first byte %i of inflection rule" % abyte)
return None
elif position == -1:
print("Error: Unexpected first byte %i of inflection rule" % abyte)
return None
else:
if mode == 0x01:
# Insert at word start
byteArray.insert(position, abyte)
position += 1
elif mode == 0x02:
# Insert at word end
byteArray.insert(position, abyte)
elif mode == 0x03:
# Delete at word end
position -= 1
deleted = byteArray.pop(position)
if bchr(deleted) != char:
if DEBUG_DICT:
print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
print("Error: Delete operation of inflection rule failed")
return None
elif mode == 0x04:
# Delete at word start
deleted = byteArray.pop(position)
if bchr(deleted) != char:
if DEBUG_DICT:
print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
print("Error: Delete operation of inflection rule failed")
return None
else:
print("Error: Inflection rule mode %x is not implemented" % mode)
return None
elif abyte == 0x01:
# Insert at word start
if mode not in [0x01, 0x04]:
position = 0
mode = abyte
elif abyte == 0x02:
# Insert at word end
if mode not in [0x02, 0x03]:
position = len(byteArray)
mode = abyte
elif abyte == 0x03:
# Delete at word end
if mode not in [0x02, 0x03]:
position = len(byteArray)
mode = abyte
elif abyte == 0x04:
# Delete at word start
if mode not in [0x01, 0x04]:
position = 0
# Delete at word start
mode = abyte
else:
print("Error: Inflection rule mode %x is not implemented" % abyte)
return None
return utf8_str(byteArray.tostring())
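# For example (illustrative): applying the rule bytes b'\x02s' to the main entry b'cat'
# selects mode 0x02 (insert at word end) and appends b's', returning b'cats'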

934
KindleUnpack/mobi_header.py Normal file

@@ -0,0 +1,934 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import unicode_literals, division, absolute_import, print_function
DEBUG_USE_ORDERED_DICTIONARY = False # OrderedDict is supported >= python 2.7.
""" Set to True to use OrderedDict for MobiHeader.metadata."""
if DEBUG_USE_ORDERED_DICTIONARY:
from collections import OrderedDict as dict_
else:
dict_ = dict
from .compatibility_utils import PY2, unicode_str, hexlify, bord
if PY2:
range = xrange
import struct
import uuid
# import the mobiunpack support libraries
from .mobi_utils import getLanguage
from .mobi_uncompress import HuffcdicReader, PalmdocReader, UncompressedReader
class unpackException(Exception):
pass
def sortedHeaderKeys(mheader):
hdrkeys = sorted(list(mheader.keys()), key=lambda akey: mheader[akey][0])
return hdrkeys
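# For example (illustrative): sortedHeaderKeys(MobiHeader.palmdoc_header) yields the keys
# ordered by their offset field: 'compression_type', 'fill0', 'text_length', ...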
# HD Containers have their own headers and their own EXTH
# this is just guesswork so far, making big assumption that
# metavalue key numbers remain the same in the CONT EXTH
# Note: The layout of the CONT Header is still unknown
# so just deal with their EXTH sections for now
def dump_contexth(cpage, extheader):
# determine text encoding
codec = 'windows-1252'
codec_map = {
1252 : 'windows-1252',
65001: 'utf-8',
}
if cpage in codec_map:
codec = codec_map[cpage]
if extheader == b'':
return
id_map_strings = {
1 : 'Drm Server Id',
2 : 'Drm Commerce Id',
3 : 'Drm Ebookbase Book Id',
4 : 'Drm Ebookbase Dep Id',
100 : 'Creator',
101 : 'Publisher',
102 : 'Imprint',
103 : 'Description',
104 : 'ISBN',
105 : 'Subject',
106 : 'Published',
107 : 'Review',
108 : 'Contributor',
109 : 'Rights',
110 : 'SubjectCode',
111 : 'Type',
112 : 'Source',
113 : 'ASIN',
114 : 'versionNumber',
117 : 'Adult',
118 : 'Retail-Price',
119 : 'Retail-Currency',
120 : 'TSC',
122 : 'fixed-layout',
123 : 'book-type',
124 : 'orientation-lock',
126 : 'original-resolution',
127 : 'zero-gutter',
128 : 'zero-margin',
129 : 'MetadataResourceURI',
132 : 'RegionMagnification',
150 : 'LendingEnabled',
200 : 'DictShortName',
501 : 'cdeType',
502 : 'last_update_time',
503 : 'Updated_Title',
504 : 'CDEContentKey',
505 : 'AmazonContentReference',
506 : 'Title-Language',
507 : 'Title-Display-Direction',
508 : 'Title-Pronunciation',
509 : 'Title-Collation',
510 : 'Secondary-Title',
511 : 'Secondary-Title-Language',
512 : 'Secondary-Title-Direction',
513 : 'Secondary-Title-Pronunciation',
514 : 'Secondary-Title-Collation',
515 : 'Author-Language',
516 : 'Author-Display-Direction',
517 : 'Author-Pronunciation',
518 : 'Author-Collation',
519 : 'Author-Type',
520 : 'Publisher-Language',
521 : 'Publisher-Display-Direction',
522 : 'Publisher-Pronunciation',
523 : 'Publisher-Collation',
524 : 'Content-Language-Tag',
525 : 'primary-writing-mode',
526 : 'NCX-Ingested-By-Software',
527 : 'page-progression-direction',
528 : 'override-kindle-fonts',
529 : 'Compression-Upgraded',
530 : 'Soft-Hyphens-In-Content',
531 : 'Dictionary_In_Langague',
532 : 'Dictionary_Out_Language',
533 : 'Font_Converted',
534 : 'Amazon_Creator_Info',
535 : 'Creator-Build-Tag',
536 : 'HD-Media-Containers-Info', # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?)
538 : 'Resource-Container-Fidelity',
539 : 'HD-Container-Mimetype',
540 : 'Sample-For_Special-Purpose',
541 : 'Kindletool-Operation-Information',
542 : 'Container_Id',
543 : 'Asset-Type', # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER
544 : 'Unknown_544',
}
id_map_values = {
115 : 'sample',
116 : 'StartOffset',
121 : 'Mobi8-Boundary-Section',
125 : 'Embedded-Record-Count',
130 : 'Offline-Sample',
131 : 'Metadata-Record-Offset',
201 : 'CoverOffset',
202 : 'ThumbOffset',
203 : 'HasFakeCover',
204 : 'Creator-Software',
205 : 'Creator-Major-Version',
206 : 'Creator-Minor-Version',
207 : 'Creator-Build-Number',
401 : 'Clipping-Limit',
402 : 'Publisher-Limit',
404 : 'Text-to-Speech-Disabled',
406 : 'Rental-Expiration-Time',
}
id_map_hexstrings = {
208 : 'Watermark_(hex)',
209 : 'Tamper-Proof-Keys_(hex)',
300 : 'Font-Signature_(hex)',
403 : 'Unknown_(403)_(hex)',
405 : 'Ownership-Type_(hex)',
407 : 'Unknown_(407)_(hex)',
420 : 'Multimedia-Content-Reference_(hex)',
450 : 'Locations_Match_(hex)',
451 : 'Full-Story-Length_(hex)',
452 : 'Sample-Start_Location_(hex)',
453 : 'Sample-End-Location_(hex)',
}
_length, num_items = struct.unpack(b'>LL', extheader[4:12])
extheader = extheader[12:]
pos = 0
for _ in range(num_items):
id, size = struct.unpack(b'>LL', extheader[pos:pos+8])
content = extheader[pos + 8: pos + size]
if id in id_map_strings:
name = id_map_strings[id]
print('\n Key: "%s"\n Value: "%s"' % (name, content.decode(codec, errors='replace')))
elif id in id_map_values:
name = id_map_values[id]
if size == 9:
value, = struct.unpack(b'B',content)
print('\n Key: "%s"\n Value: 0x%01x' % (name, value))
elif size == 10:
value, = struct.unpack(b'>H',content)
print('\n Key: "%s"\n Value: 0x%02x' % (name, value))
elif size == 12:
value, = struct.unpack(b'>L',content)
print('\n Key: "%s"\n Value: 0x%04x' % (name, value))
else:
print("\nError: Value for %s has unexpected size of %s" % (name, size))
elif id in id_map_hexstrings:
name = id_map_hexstrings[id]
print('\n Key: "%s"\n Value: 0x%s' % (name, hexlify(content)))
else:
print("\nWarning: Unknown metadata with id %s found" % id)
name = str(id) + ' (hex)'
print(' Key: "%s"\n Value: 0x%s' % (name, hexlify(content)))
pos += size
return
class MobiHeader:
# all values are packed in big endian format
palmdoc_header = {
'compression_type' : (0x00, b'>H', 2),
'fill0' : (0x02, b'>H', 2),
'text_length' : (0x04, b'>L', 4),
'text_records' : (0x08, b'>H', 2),
'max_section_size' : (0x0a, b'>H', 2),
'read_pos ' : (0x0c, b'>L', 4),
}
mobi6_header = {
'compression_type' : (0x00, b'>H', 2),
'fill0' : (0x02, b'>H', 2),
'text_length' : (0x04, b'>L', 4),
'text_records' : (0x08, b'>H', 2),
'max_section_size' : (0x0a, b'>H', 2),
'crypto_type' : (0x0c, b'>H', 2),
'fill1' : (0x0e, b'>H', 2),
'magic' : (0x10, b'4s', 4),
'header_length (from MOBI)' : (0x14, b'>L', 4),
'type' : (0x18, b'>L', 4),
'codepage' : (0x1c, b'>L', 4),
'unique_id' : (0x20, b'>L', 4),
'version' : (0x24, b'>L', 4),
'metaorthindex' : (0x28, b'>L', 4),
'metainflindex' : (0x2c, b'>L', 4),
'index_names' : (0x30, b'>L', 4),
'index_keys' : (0x34, b'>L', 4),
'extra_index0' : (0x38, b'>L', 4),
'extra_index1' : (0x3c, b'>L', 4),
'extra_index2' : (0x40, b'>L', 4),
'extra_index3' : (0x44, b'>L', 4),
'extra_index4' : (0x48, b'>L', 4),
'extra_index5' : (0x4c, b'>L', 4),
'first_nontext' : (0x50, b'>L', 4),
'title_offset' : (0x54, b'>L', 4),
'title_length' : (0x58, b'>L', 4),
'language_code' : (0x5c, b'>L', 4),
'dict_in_lang' : (0x60, b'>L', 4),
'dict_out_lang' : (0x64, b'>L', 4),
'min_version' : (0x68, b'>L', 4),
'first_resc_offset' : (0x6c, b'>L', 4),
'huff_offset' : (0x70, b'>L', 4),
'huff_num' : (0x74, b'>L', 4),
'huff_tbl_offset' : (0x78, b'>L', 4),
'huff_tbl_len' : (0x7c, b'>L', 4),
'exth_flags' : (0x80, b'>L', 4),
'fill3_a' : (0x84, b'>L', 4),
'fill3_b' : (0x88, b'>L', 4),
'fill3_c' : (0x8c, b'>L', 4),
'fill3_d' : (0x90, b'>L', 4),
'fill3_e' : (0x94, b'>L', 4),
'fill3_f' : (0x98, b'>L', 4),
'fill3_g' : (0x9c, b'>L', 4),
'fill3_h' : (0xa0, b'>L', 4),
'unknown0' : (0xa4, b'>L', 4),
'drm_offset' : (0xa8, b'>L', 4),
'drm_count' : (0xac, b'>L', 4),
'drm_size' : (0xb0, b'>L', 4),
'drm_flags' : (0xb4, b'>L', 4),
'fill4_a' : (0xb8, b'>L', 4),
'fill4_b' : (0xbc, b'>L', 4),
'first_content' : (0xc0, b'>H', 2),
'last_content' : (0xc2, b'>H', 2),
'unknown0' : (0xc4, b'>L', 4),
'fcis_offset' : (0xc8, b'>L', 4),
'fcis_count' : (0xcc, b'>L', 4),
'flis_offset' : (0xd0, b'>L', 4),
'flis_count' : (0xd4, b'>L', 4),
'unknown1' : (0xd8, b'>L', 4),
'unknown2' : (0xdc, b'>L', 4),
'srcs_offset' : (0xe0, b'>L', 4),
'srcs_count' : (0xe4, b'>L', 4),
'unknown3' : (0xe8, b'>L', 4),
'unknown4' : (0xec, b'>L', 4),
'fill5' : (0xf0, b'>H', 2),
'traildata_flags' : (0xf2, b'>H', 2),
'ncx_index' : (0xf4, b'>L', 4),
'unknown5' : (0xf8, b'>L', 4),
'unknown6' : (0xfc, b'>L', 4),
'datp_offset' : (0x100, b'>L', 4),
'unknown7' : (0x104, b'>L', 4),
'Unknown ' : (0x108, b'>L', 4),
'Unknown ' : (0x10C, b'>L', 4),
'Unknown ' : (0x110, b'>L', 4),
'Unknown ' : (0x114, b'>L', 4),
'Unknown ' : (0x118, b'>L', 4),
'Unknown ' : (0x11C, b'>L', 4),
'Unknown ' : (0x120, b'>L', 4),
'Unknown ' : (0x124, b'>L', 4),
'Unknown ' : (0x128, b'>L', 4),
'Unknown ' : (0x12C, b'>L', 4),
'Unknown ' : (0x130, b'>L', 4),
'Unknown ' : (0x134, b'>L', 4),
'Unknown ' : (0x138, b'>L', 4),
'Unknown ' : (0x11C, b'>L', 4),
}
mobi8_header = {
'compression_type' : (0x00, b'>H', 2),
'fill0' : (0x02, b'>H', 2),
'text_length' : (0x04, b'>L', 4),
'text_records' : (0x08, b'>H', 2),
'max_section_size' : (0x0a, b'>H', 2),
'crypto_type' : (0x0c, b'>H', 2),
'fill1' : (0x0e, b'>H', 2),
'magic' : (0x10, b'4s', 4),
'header_length (from MOBI)' : (0x14, b'>L', 4),
'type' : (0x18, b'>L', 4),
'codepage' : (0x1c, b'>L', 4),
'unique_id' : (0x20, b'>L', 4),
'version' : (0x24, b'>L', 4),
'metaorthindex' : (0x28, b'>L', 4),
'metainflindex' : (0x2c, b'>L', 4),
'index_names' : (0x30, b'>L', 4),
'index_keys' : (0x34, b'>L', 4),
'extra_index0' : (0x38, b'>L', 4),
'extra_index1' : (0x3c, b'>L', 4),
'extra_index2' : (0x40, b'>L', 4),
'extra_index3' : (0x44, b'>L', 4),
'extra_index4' : (0x48, b'>L', 4),
'extra_index5' : (0x4c, b'>L', 4),
'first_nontext' : (0x50, b'>L', 4),
'title_offset' : (0x54, b'>L', 4),
'title_length' : (0x58, b'>L', 4),
'language_code' : (0x5c, b'>L', 4),
'dict_in_lang' : (0x60, b'>L', 4),
'dict_out_lang' : (0x64, b'>L', 4),
'min_version' : (0x68, b'>L', 4),
'first_resc_offset' : (0x6c, b'>L', 4),
'huff_offset' : (0x70, b'>L', 4),
'huff_num' : (0x74, b'>L', 4),
'huff_tbl_offset' : (0x78, b'>L', 4),
'huff_tbl_len' : (0x7c, b'>L', 4),
'exth_flags' : (0x80, b'>L', 4),
'fill3_a' : (0x84, b'>L', 4),
'fill3_b' : (0x88, b'>L', 4),
'fill3_c' : (0x8c, b'>L', 4),
'fill3_d' : (0x90, b'>L', 4),
'fill3_e' : (0x94, b'>L', 4),
'fill3_f' : (0x98, b'>L', 4),
'fill3_g' : (0x9c, b'>L', 4),
'fill3_h' : (0xa0, b'>L', 4),
'unknown0' : (0xa4, b'>L', 4),
'drm_offset' : (0xa8, b'>L', 4),
'drm_count' : (0xac, b'>L', 4),
'drm_size' : (0xb0, b'>L', 4),
'drm_flags' : (0xb4, b'>L', 4),
'fill4_a' : (0xb8, b'>L', 4),
'fill4_b' : (0xbc, b'>L', 4),
'fdst_offset' : (0xc0, b'>L', 4),
'fdst_flow_count' : (0xc4, b'>L', 4),
'fcis_offset' : (0xc8, b'>L', 4),
'fcis_count' : (0xcc, b'>L', 4),
'flis_offset' : (0xd0, b'>L', 4),
'flis_count' : (0xd4, b'>L', 4),
'unknown1' : (0xd8, b'>L', 4),
'unknown2' : (0xdc, b'>L', 4),
'srcs_offset' : (0xe0, b'>L', 4),
'srcs_count' : (0xe4, b'>L', 4),
'unknown3' : (0xe8, b'>L', 4),
'unknown4' : (0xec, b'>L', 4),
'fill5' : (0xf0, b'>H', 2),
'traildata_flags' : (0xf2, b'>H', 2),
'ncx_index' : (0xf4, b'>L', 4),
'fragment_index' : (0xf8, b'>L', 4),
'skeleton_index' : (0xfc, b'>L', 4),
'datp_offset' : (0x100, b'>L', 4),
'guide_index' : (0x104, b'>L', 4),
'Unknown ' : (0x108, b'>L', 4),
'Unknown ' : (0x10C, b'>L', 4),
'Unknown ' : (0x110, b'>L', 4),
'Unknown ' : (0x114, b'>L', 4),
'Unknown ' : (0x118, b'>L', 4),
'Unknown ' : (0x11C, b'>L', 4),
'Unknown ' : (0x120, b'>L', 4),
'Unknown ' : (0x124, b'>L', 4),
'Unknown ' : (0x128, b'>L', 4),
'Unknown ' : (0x12C, b'>L', 4),
'Unknown ' : (0x130, b'>L', 4),
'Unknown ' : (0x134, b'>L', 4),
'Unknown ' : (0x138, b'>L', 4),
'Unknown ' : (0x11C, b'>L', 4),
}
palmdoc_header_sorted_keys = sortedHeaderKeys(palmdoc_header)
mobi6_header_sorted_keys = sortedHeaderKeys(mobi6_header)
mobi8_header_sorted_keys = sortedHeaderKeys(mobi8_header)
id_map_strings = {
1 : 'Drm Server Id',
2 : 'Drm Commerce Id',
3 : 'Drm Ebookbase Book Id',
4 : 'Drm Ebookbase Dep Id',
100 : 'Creator',
101 : 'Publisher',
102 : 'Imprint',
103 : 'Description',
104 : 'ISBN',
105 : 'Subject',
106 : 'Published',
107 : 'Review',
108 : 'Contributor',
109 : 'Rights',
110 : 'SubjectCode',
111 : 'Type',
112 : 'Source',
113 : 'ASIN',
114 : 'versionNumber',
117 : 'Adult',
118 : 'Retail-Price',
119 : 'Retail-Currency',
120 : 'TSC',
122 : 'fixed-layout',
123 : 'book-type',
124 : 'orientation-lock',
126 : 'original-resolution',
127 : 'zero-gutter',
128 : 'zero-margin',
129 : 'MetadataResourceURI',
132 : 'RegionMagnification',
150 : 'LendingEnabled',
200 : 'DictShortName',
501 : 'cdeType',
502 : 'last_update_time',
503 : 'Updated_Title',
504 : 'CDEContentKey',
505 : 'AmazonContentReference',
506 : 'Title-Language',
507 : 'Title-Display-Direction',
508 : 'Title-Pronunciation',
509 : 'Title-Collation',
510 : 'Secondary-Title',
511 : 'Secondary-Title-Language',
512 : 'Secondary-Title-Direction',
513 : 'Secondary-Title-Pronunciation',
514 : 'Secondary-Title-Collation',
515 : 'Author-Language',
516 : 'Author-Display-Direction',
517 : 'Author-Pronunciation',
518 : 'Author-Collation',
519 : 'Author-Type',
520 : 'Publisher-Language',
521 : 'Publisher-Display-Direction',
522 : 'Publisher-Pronunciation',
523 : 'Publisher-Collation',
524 : 'Content-Language-Tag',
525 : 'primary-writing-mode',
526 : 'NCX-Ingested-By-Software',
527 : 'page-progression-direction',
528 : 'override-kindle-fonts',
529 : 'Compression-Upgraded',
530 : 'Soft-Hyphens-In-Content',
531 : 'Dictionary_In_Langague',
532 : 'Dictionary_Out_Language',
533 : 'Font_Converted',
534 : 'Amazon_Creator_Info',
535 : 'Creator-Build-Tag',
536 : 'HD-Media-Containers-Info', # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?)
538 : 'Resource-Container-Fidelity',
539 : 'HD-Container-Mimetype',
540 : 'Sample-For_Special-Purpose',
541 : 'Kindletool-Operation-Information',
542 : 'Container_Id',
543 : 'Asset-Type', # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER
544 : 'Unknown_544',
}
id_map_values = {
115 : 'sample',
116 : 'StartOffset',
121 : 'Mobi8-Boundary-Section',
125 : 'Embedded-Record-Count',
130 : 'Offline-Sample',
131 : 'Metadata-Record-Offset',
201 : 'CoverOffset',
202 : 'ThumbOffset',
203 : 'HasFakeCover',
204 : 'Creator-Software',
205 : 'Creator-Major-Version',
206 : 'Creator-Minor-Version',
207 : 'Creator-Build-Number',
401 : 'Clipping-Limit',
402 : 'Publisher-Limit',
404 : 'Text-to-Speech-Disabled',
406 : 'Rental-Expiration-Time',
}
id_map_hexstrings = {
208 : 'Watermark_(hex)',
209 : 'Tamper-Proof-Keys_(hex)',
300 : 'Font-Signature_(hex)',
403 : 'Unknown_(403)_(hex)',
405 : 'Ownership-Type_(hex)',
407 : 'Unknown_(407)_(hex)',
420 : 'Multimedia-Content-Reference_(hex)',
450 : 'Locations_Match_(hex)',
451 : 'Full-Story-Length_(hex)',
452 : 'Sample-Start_Location_(hex)',
453 : 'Sample-End-Location_(hex)',
}
def __init__(self, sect, sectNumber):
self.sect = sect
self.start = sectNumber
self.header = self.sect.loadSection(self.start)
if len(self.header)>20 and self.header[16:20] == b'MOBI':
self.sect.setsectiondescription(0,"Mobipocket Header")
self.palm = False
elif self.sect.ident == b'TEXtREAd':
self.sect.setsectiondescription(0, "PalmDOC Header")
self.palm = True
else:
raise unpackException('Unknown File Format')
self.records, = struct.unpack_from(b'>H', self.header, 0x8)
# set defaults in case this is a PalmDOC
self.title = self.sect.palmname.decode('latin-1', errors='replace')
self.length = len(self.header)-16
self.type = 3
self.codepage = 1252
self.codec = 'windows-1252'
self.unique_id = 0
self.version = 0
self.hasExth = False
self.exth = b''
self.exth_offset = self.length + 16
self.exth_length = 0
self.crypto_type = 0
self.firstnontext = self.start+self.records + 1
self.firstresource = self.start+self.records + 1
self.ncxidx = 0xffffffff
self.metaOrthIndex = 0xffffffff
self.metaInflIndex = 0xffffffff
self.skelidx = 0xffffffff
self.fragidx = 0xffffffff
self.guideidx = 0xffffffff
self.fdst = 0xffffffff
self.mlstart = self.sect.loadSection(self.start+1)[:4]
self.rawSize = 0
self.metadata = dict_()
# set up for decompression/unpacking
self.compression, = struct.unpack_from(b'>H', self.header, 0x0)
if self.compression == 0x4448:
reader = HuffcdicReader()
huffoff, huffnum = struct.unpack_from(b'>LL', self.header, 0x70)
huffoff = huffoff + self.start
self.sect.setsectiondescription(huffoff,"Huffman Compression Seed")
reader.loadHuff(self.sect.loadSection(huffoff))
for i in range(1, huffnum):
self.sect.setsectiondescription(huffoff+i,"Huffman CDIC Compression Seed %d" % i)
reader.loadCdic(self.sect.loadSection(huffoff+i))
self.unpack = reader.unpack
elif self.compression == 2:
self.unpack = PalmdocReader().unpack
elif self.compression == 1:
self.unpack = UncompressedReader().unpack
else:
raise unpackException('invalid compression type: 0x%4x' % self.compression)
if self.palm:
return
self.length, self.type, self.codepage, self.unique_id, self.version = struct.unpack(b'>LLLLL', self.header[20:40])
codec_map = {
1252 : 'windows-1252',
65001: 'utf-8',
}
if self.codepage in codec_map:
self.codec = codec_map[self.codepage]
# title
toff, tlen = struct.unpack(b'>II', self.header[0x54:0x5c])
tend = toff + tlen
self.title=self.header[toff:tend].decode(self.codec, errors='replace')
exth_flag, = struct.unpack(b'>L', self.header[0x80:0x84])
self.hasExth = exth_flag & 0x40
self.exth_offset = self.length + 16
self.exth_length = 0
if self.hasExth:
self.exth_length, = struct.unpack_from(b'>L', self.header, self.exth_offset+4)
self.exth_length = ((self.exth_length + 3)>>2)<<2 # round to next 4 byte boundary
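# e.g. (illustrative) a raw EXTH length of 0x3a is rounded up to 0x3c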
self.exth = self.header[self.exth_offset:self.exth_offset+self.exth_length]
# parse the exth / metadata
self.parseMetaData()
# self.mlstart = self.sect.loadSection(self.start+1)
# self.mlstart = self.mlstart[0:4]
self.crypto_type, = struct.unpack_from(b'>H', self.header, 0xC)
# Start sector for additional files such as images, fonts, resources, etc
# Can be missing so fall back to default set previously
ofst, = struct.unpack_from(b'>L', self.header, 0x6C)
if ofst != 0xffffffff:
self.firstresource = ofst + self.start
ofst, = struct.unpack_from(b'>L', self.header, 0x50)
if ofst != 0xffffffff:
self.firstnontext = ofst + self.start
if self.isPrintReplica():
return
if self.version < 8:
# Dictionary metaOrthIndex
self.metaOrthIndex, = struct.unpack_from(b'>L', self.header, 0x28)
if self.metaOrthIndex != 0xffffffff:
self.metaOrthIndex += self.start
# Dictionary metaInflIndex
self.metaInflIndex, = struct.unpack_from(b'>L', self.header, 0x2C)
if self.metaInflIndex != 0xffffffff:
self.metaInflIndex += self.start
# handle older headers without any ncxindex info and later
# specifically 0xe4 headers
if self.length + 16 < 0xf8:
return
# NCX Index
self.ncxidx, = struct.unpack(b'>L', self.header[0xf4:0xf8])
if self.ncxidx != 0xffffffff:
self.ncxidx += self.start
# K8 specific Indexes
if self.start != 0 or self.version == 8:
# Index into <xml> file skeletons in RawML
self.skelidx, = struct.unpack_from(b'>L', self.header, 0xfc)
if self.skelidx != 0xffffffff:
self.skelidx += self.start
# Index into <div> sections in RawML
self.fragidx, = struct.unpack_from(b'>L', self.header, 0xf8)
if self.fragidx != 0xffffffff:
self.fragidx += self.start
# Index into Other files
self.guideidx, = struct.unpack_from(b'>L', self.header, 0x104)
if self.guideidx != 0xffffffff:
self.guideidx += self.start
# dictionaries do not seem to use the same approach in K8's
# so disable them
self.metaOrthIndex = 0xffffffff
self.metaInflIndex = 0xffffffff
# need to use the FDST record to find out how to properly unpack
# the rawML into pieces
# it is simply a table of start and end locations for each flow piece
self.fdst, = struct.unpack_from(b'>L', self.header, 0xc0)
self.fdstcnt, = struct.unpack_from(b'>L', self.header, 0xc4)
# if cnt is 1 or less, fdst section number can be garbage
if self.fdstcnt <= 1:
self.fdst = 0xffffffff
if self.fdst != 0xffffffff:
self.fdst += self.start
# setting of fdst section description properly handled in mobi_kf8proc
def dump_exth(self):
# determine text encoding
codec=self.codec
if (not self.hasExth) or (self.exth_length) == 0 or (self.exth == b''):
return
num_items, = struct.unpack(b'>L', self.exth[8:12])
pos = 12
print("Key Size Decription Value")
for _ in range(num_items):
id, size = struct.unpack(b'>LL', self.exth[pos:pos+8])
contentsize = size-8
content = self.exth[pos + 8: pos + size]
if id in MobiHeader.id_map_strings:
exth_name = MobiHeader.id_map_strings[id]
print('{0: >3d} {1: >4d} {2: <30s} {3:s}'.format(id, contentsize, exth_name, content.decode(codec, errors='replace')))
elif id in MobiHeader.id_map_values:
exth_name = MobiHeader.id_map_values[id]
if size == 9:
value, = struct.unpack(b'B',content)
print('{0:3d} byte {1:<30s} {2:d}'.format(id, exth_name, value))
elif size == 10:
value, = struct.unpack(b'>H',content)
print('{0:3d} word {1:<30s} 0x{2:0>4X} ({2:d})'.format(id, exth_name, value))
elif size == 12:
value, = struct.unpack(b'>L',content)
print('{0:3d} long {1:<30s} 0x{2:0>8X} ({2:d})'.format(id, exth_name, value))
else:
print('{0: >3d} {1: >4d} {2: <30s} (0x{3:s})'.format(id, contentsize, "Bad size for "+exth_name, hexlify(content)))
elif id in MobiHeader.id_map_hexstrings:
exth_name = MobiHeader.id_map_hexstrings[id]
print('{0:3d} {1:4d} {2:<30s} 0x{3:s}'.format(id, contentsize, exth_name, hexlify(content)))
else:
exth_name = "Unknown EXTH ID {0:d}".format(id)
print("{0: >3d} {1: >4d} {2: <30s} 0x{3:s}".format(id, contentsize, exth_name, hexlify(content)))
pos += size
return
def dumpheader(self):
# first 16 bytes are not part of the official mobiheader
# but we will treat them as such
# so section 0 is 16 (decimal) + self.length in total == at least 0x108 bytes for Mobi 8 headers
print("Dumping section %d, Mobipocket Header version: %d, total length %d" % (self.start,self.version, self.length+16))
self.hdr = {}
# set it up for the proper header version
if self.version == 0:
self.mobi_header = MobiHeader.palmdoc_header
self.mobi_header_sorted_keys = MobiHeader.palmdoc_header_sorted_keys
elif self.version < 8:
self.mobi_header = MobiHeader.mobi6_header
self.mobi_header_sorted_keys = MobiHeader.mobi6_header_sorted_keys
else:
self.mobi_header = MobiHeader.mobi8_header
self.mobi_header_sorted_keys = MobiHeader.mobi8_header_sorted_keys
# parse the header information
for key in self.mobi_header_sorted_keys:
(pos, format, tot_len) = self.mobi_header[key]
if pos < (self.length + 16):
val, = struct.unpack_from(format, self.header, pos)
self.hdr[key] = val
if 'title_offset' in self.hdr:
title_offset = self.hdr['title_offset']
title_length = self.hdr['title_length']
else:
title_offset = 0
title_length = 0
if title_offset == 0:
title_offset = len(self.header)
title_length = 0
self.title = self.sect.palmname.decode('latin-1', errors='replace')
else:
self.title = self.header[title_offset:title_offset+title_length].decode(self.codec, errors='replace')
# title record always padded with two nul bytes and then padded with nuls to next 4 byte boundary
title_length = ((title_length+2+3)>>2)<<2
self.extra1 = self.header[self.exth_offset+self.exth_length:title_offset]
self.extra2 = self.header[title_offset+title_length:]
print("Mobipocket header from section %d" % self.start)
print(" Offset Value Hex Dec Description")
for key in self.mobi_header_sorted_keys:
(pos, format, tot_len) = self.mobi_header[key]
if pos < (self.length + 16):
if key != 'magic':
fmt_string = "0x{0:0>3X} ({0:3d}){1: >" + str(9-2*tot_len) +"s}0x{2:0>" + str(2*tot_len) + "X} {2:10d} {3:s}"
else:
self.hdr[key] = unicode_str(self.hdr[key])
fmt_string = "0x{0:0>3X} ({0:3d}){2:>11s} {3:s}"
print(fmt_string.format(pos, " ",self.hdr[key], key))
print("")
if self.exth_length > 0:
print("EXTH metadata, offset %d, padded length %d" % (self.exth_offset,self.exth_length))
self.dump_exth()
print("")
if len(self.extra1) > 0:
print("Extra data between EXTH and Title, length %d" % len(self.extra1))
print(hexlify(self.extra1))
print("")
if title_length > 0:
print("Title in header at offset %d, padded length %d: '%s'" %(title_offset,title_length,self.title))
print("")
if len(self.extra2) > 0:
print("Extra data between Title and end of header, length %d" % len(self.extra2))
print(hexlify(self.extra2))
print("")
def isPrintReplica(self):
return self.mlstart[0:4] == b"%MOP"
def isK8(self):
return self.start != 0 or self.version == 8
def isEncrypted(self):
return self.crypto_type != 0
def hasNCX(self):
return self.ncxidx != 0xffffffff
def isDictionary(self):
return self.metaOrthIndex != 0xffffffff
def getncxIndex(self):
return self.ncxidx
def decompress(self, data):
return self.unpack(data)
def Language(self):
langcode = struct.unpack(b'!L', self.header[0x5c:0x60])[0]
langid = langcode & 0xFF
sublangid = (langcode >> 10) & 0xFF
return getLanguage(langid, sublangid)
def DictInLanguage(self):
if self.isDictionary():
langcode = struct.unpack(b'!L', self.header[0x60:0x64])[0]
langid = langcode & 0xFF
sublangid = (langcode >> 10) & 0xFF
if langid != 0:
return getLanguage(langid, sublangid)
return False
def DictOutLanguage(self):
if self.isDictionary():
langcode = struct.unpack(b'!L', self.header[0x64:0x68])[0]
langid = langcode & 0xFF
sublangid = (langcode >> 10) & 0xFF
if langid != 0:
return getLanguage(langid, sublangid)
return False
def getRawML(self):
def getSizeOfTrailingDataEntry(data):
num = 0
for v in data[-4:]:
if bord(v) & 0x80:
num = 0
num = (num << 7) | (bord(v) & 0x7f)
return num
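# e.g. (illustrative) a record ending in b'\x81\x05' decodes to (1 << 7) | 5 == 133,
# since the byte with the high bit set (0x81) marks the start of the backward-stored value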
def trimTrailingDataEntries(data):
for _ in range(trailers):
num = getSizeOfTrailingDataEntry(data)
data = data[:-num]
if multibyte:
num = (ord(data[-1:]) & 3) + 1
data = data[:-num]
return data
multibyte = 0
trailers = 0
if self.sect.ident == b'BOOKMOBI':
mobi_length, = struct.unpack_from(b'>L', self.header, 0x14)
mobi_version, = struct.unpack_from(b'>L', self.header, 0x68)
if (mobi_length >= 0xE4) and (mobi_version >= 5):
flags, = struct.unpack_from(b'>H', self.header, 0xF2)
multibyte = flags & 1
while flags > 1:
if flags & 2:
trailers += 1
flags = flags >> 1
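# e.g. (illustrative) flags == 0x03 means one trailing data entry per text record plus
# multibyte overlap bytes; flags == 0x06 means two trailing entries and no multibyte bytes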
# get raw mobi markup language
print("Unpacking raw markup language")
dataList = []
# offset = 0
for i in range(1, self.records+1):
data = trimTrailingDataEntries(self.sect.loadSection(self.start + i))
dataList.append(self.unpack(data))
if self.isK8():
self.sect.setsectiondescription(self.start + i,"KF8 Text Section {0:d}".format(i))
elif self.version == 0:
self.sect.setsectiondescription(self.start + i,"PalmDOC Text Section {0:d}".format(i))
else:
self.sect.setsectiondescription(self.start + i,"Mobipocket Text Section {0:d}".format(i))
rawML = b''.join(dataList)
self.rawSize = len(rawML)
return rawML
# all metadata is stored in a dictionary keyed by name, and each key maps to a *list* of values
# a list is used to allow for multiple creators, multiple contributors, etc
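# e.g. (illustrative values) metadata.get('Creator') might return
# ['First Author', 'Second Author'] while single-valued keys return one-element lists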
def parseMetaData(self):
def addValue(name, value):
if name not in self.metadata:
self.metadata[name] = [value]
else:
self.metadata[name].append(value)
codec=self.codec
if self.hasExth:
extheader=self.exth
_length, num_items = struct.unpack(b'>LL', extheader[4:12])
extheader = extheader[12:]
pos = 0
for _ in range(num_items):
id, size = struct.unpack(b'>LL', extheader[pos:pos+8])
content = extheader[pos + 8: pos + size]
if id in MobiHeader.id_map_strings:
name = MobiHeader.id_map_strings[id]
addValue(name, content.decode(codec, errors='replace'))
elif id in MobiHeader.id_map_values:
name = MobiHeader.id_map_values[id]
if size == 9:
value, = struct.unpack(b'B',content)
addValue(name, unicode_str(str(value)))
elif size == 10:
value, = struct.unpack(b'>H',content)
addValue(name, unicode_str(str(value)))
elif size == 12:
value, = struct.unpack(b'>L',content)
# handle special case of missing CoverOffset or missing ThumbOffset
if id == 201 or id == 202:
if value != 0xffffffff:
addValue(name, unicode_str(str(value)))
else:
addValue(name, unicode_str(str(value)))
else:
print("Warning: Bad key, size, value combination detected in EXTH ", id, size, hexlify(content))
addValue(name, hexlify(content))
elif id in MobiHeader.id_map_hexstrings:
name = MobiHeader.id_map_hexstrings[id]
addValue(name, hexlify(content))
else:
name = unicode_str(str(id)) + ' (hex)'
addValue(name, hexlify(content))
pos += size
# add the basics to the metadata each as a list element
self.metadata['Language'] = [self.Language()]
self.metadata['Title'] = [unicode_str(self.title,self.codec)]
self.metadata['Codec'] = [self.codec]
self.metadata['UniqueID'] = [unicode_str(str(self.unique_id))]
# if no asin create one using a uuid
if 'ASIN' not in self.metadata:
self.metadata['ASIN'] = [unicode_str(str(uuid.uuid4()))]
# if no cdeType set it to "EBOK"
if 'cdeType' not in self.metadata:
self.metadata['cdeType'] = ['EBOK']
def getMetaData(self):
return self.metadata
def describeHeader(self, DUMP):
print("Mobi Version:", self.version)
print("Codec:", self.codec)
print("Title:", self.title)
if 'Updated_Title' in self.metadata:
print("EXTH Title:", self.metadata['Updated_Title'][0])
if self.compression == 0x4448:
print("Huffdic compression")
elif self.compression == 2:
print("Palmdoc compression")
elif self.compression == 1:
print("No compression")
if DUMP:
self.dumpheader()

439
KindleUnpack/mobi_html.py Normal file

@@ -0,0 +1,439 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import unicode_literals, division, absolute_import, print_function
from .compatibility_utils import PY2, utf8_str
if PY2:
range = xrange
import re
# note: re requires the pattern to be the exact same type as the data to be searched in python3,
# so when searching bytestrings the pattern itself must be b"" and not u""
from .mobi_utils import fromBase32
class HTMLProcessor:
def __init__(self, files, metadata, rscnames):
self.files = files
self.metadata = metadata
self.rscnames = rscnames
# for original style mobis, default to including all image files in the opf manifest
self.used = {}
for name in rscnames:
self.used[name] = 'used'
def findAnchors(self, rawtext, indx_data, positionMap):
# process the raw text
# find anchors...
print("Find link anchors")
link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''', re.IGNORECASE)
# TEST NCX: merge in filepos from indx
pos_links = [int(m.group(1)) for m in link_pattern.finditer(rawtext)]
if indx_data:
pos_indx = [e['pos'] for e in indx_data if e['pos']>0]
pos_links = list(set(pos_links + pos_indx))
for position in pos_links:
if position in positionMap:
positionMap[position] = positionMap[position] + utf8_str('<a id="filepos%d" />' % position)
else:
positionMap[position] = utf8_str('<a id="filepos%d" />' % position)
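# e.g. once filepos target 1234 is seen, positionMap[1234] holds b'<a id="filepos1234" />'
# ready to be spliced into the raw text at offset 1234 below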
# apply dictionary metadata and anchors
print("Insert data into html")
pos = 0
lastPos = len(rawtext)
dataList = []
for end in sorted(positionMap.keys()):
if end == 0 or end > lastPos:
continue  # something's up - can't put a tag outside <html>...</html>
dataList.append(rawtext[pos:end])
dataList.append(positionMap[end])
pos = end
dataList.append(rawtext[pos:])
srctext = b"".join(dataList)
rawtext = None
dataList = None
self.srctext = srctext
self.indx_data = indx_data
return srctext
def insertHREFS(self):
srctext = self.srctext
rscnames = self.rscnames
metadata = self.metadata
# put in the hrefs
print("Insert hrefs into html")
# There doesn't seem to be a standard, so search as best we can
link_pattern = re.compile(br'''<a([^>]*?)filepos=['"]{0,1}0*(\d+)['"]{0,1}([^>]*?)>''', re.IGNORECASE)
srctext = link_pattern.sub(br'''<a\1href="#filepos\2"\3>''', srctext)
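# e.g. a hypothetical <a filepos="0000001234"> becomes <a href="#filepos1234">
# (the 0* in the pattern drops leading zeros so the href matches the anchor ids inserted earlier)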
# remove empty anchors
print("Remove empty anchors from html")
srctext = re.sub(br"<a\s*/>",br"", srctext)
srctext = re.sub(br"<a\s*>\s*</a>",br"", srctext)
# convert image references
print("Insert image references into html")
# split string into image tag pieces and other pieces
image_pattern = re.compile(br'''(<img.*?>)''', re.IGNORECASE)
image_index_pattern = re.compile(br'''recindex=['"]{0,1}([0-9]+)['"]{0,1}''', re.IGNORECASE)
srcpieces = image_pattern.split(srctext)
srctext = self.srctext = None
# all odd pieces are image tags (even pieces hold the text between them and may be null strings)
for i in range(1, len(srcpieces), 2):
tag = srcpieces[i]
for m in image_index_pattern.finditer(tag):
imageNumber = int(m.group(1))
imageName = rscnames[imageNumber-1]
if imageName is None:
print("Error: Referenced image %s was not recognized as a valid image" % imageNumber)
else:
replacement = b'src="Images/' + utf8_str(imageName) + b'"'
tag = image_index_pattern.sub(replacement, tag, 1)
srcpieces[i] = tag
srctext = b"".join(srcpieces)
# add in character set meta into the html header if needed
if 'Codec' in metadata:
srctext = srctext[0:12]+b'<meta http-equiv="content-type" content="text/html; charset='+utf8_str(metadata.get('Codec')[0])+b'" />'+srctext[12:]
return srctext, self.used
class XHTMLK8Processor:
def __init__(self, rscnames, k8proc):
self.rscnames = rscnames
self.k8proc = k8proc
self.used = {}
def buildXHTML(self):
# first need to update all links that are internal which
# are based on positions within the xhtml files **BEFORE**
# cutting and pasting any pieces into the xhtml text files
# kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal links within the xhtml)
# XXXX is the record offset into divtbl
# YYYYYYYYYY is a base32 number added to the divtbl insert position to get the final position
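# e.g. a hypothetical href="kindle:pos:fid:0004:off:000000001A" targets fragment record
# fromBase32(b'0004') == 4 at offset fromBase32(b'000000001A') == 42 past its insert position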
# pos:fid pattern
posfid_pattern = re.compile(br'''(<a.*?href=.*?>)''', re.IGNORECASE)
posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''')
parts = []
print("Building proper xhtml for each file")
for i in range(self.k8proc.getNumberOfParts()):
part = self.k8proc.getPart(i)
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.getPartInfo(i)
# internal links
srcpieces = posfid_pattern.split(part)
for j in range(1, len(srcpieces),2):
tag = srcpieces[j]
if tag.startswith(b'<'):
for m in posfid_index_pattern.finditer(tag):
posfid = m.group(1)
offset = m.group(2)
filename, idtag = self.k8proc.getIDTagByPosFid(posfid, offset)
if idtag == b'':
replacement= b'"' + utf8_str(filename) + b'"'
else:
replacement = b'"' + utf8_str(filename) + b'#' + idtag + b'"'
tag = posfid_index_pattern.sub(replacement, tag, 1)
srcpieces[j] = tag
part = b"".join(srcpieces)
parts.append(part)
# we are free to cut and paste as we see fit
# we can safely remove all of the Kindlegen generated aid tags
# change aid ids that are in k8proc.linked_aids to xhtml ids
find_tag_with_aid_pattern = re.compile(br'''(<[^>]*\said\s*=[^>]*>)''', re.IGNORECASE)
within_tag_aid_position_pattern = re.compile(br'''\said\s*=['"]([^'"]*)['"]''')
for i in range(len(parts)):
part = parts[i]
srcpieces = find_tag_with_aid_pattern.split(part)
for j in range(len(srcpieces)):
tag = srcpieces[j]
if tag.startswith(b'<'):
for m in within_tag_aid_position_pattern.finditer(tag):
try:
aid = m.group(1)
except IndexError:
aid = None
replacement = b''
if aid in self.k8proc.linked_aids:
replacement = b' id="aid-' + aid + b'"'
tag = within_tag_aid_position_pattern.sub(replacement, tag, 1)
srcpieces[j] = tag
part = b"".join(srcpieces)
parts[i] = part
# we can safely replace all of the Kindlegen generated data-AmznPageBreak tags
# with page-break-after style patterns
find_tag_with_AmznPageBreak_pattern = re.compile(br'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
within_tag_AmznPageBreak_position_pattern = re.compile(br'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')
for i in range(len(parts)):
part = parts[i]
srcpieces = find_tag_with_AmznPageBreak_pattern.split(part)
for j in range(len(srcpieces)):
tag = srcpieces[j]
if tag.startswith(b'<'):
srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub(
lambda m:b' style="page-break-after:' + m.group(1) + b'"', tag)
part = b"".join(srcpieces)
parts[i] = part
# we have to handle substitutions for the flows pieces first as they may
# be inlined into the xhtml text
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
# kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
# kindle:embed:XXXX (used for fonts)
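# e.g. a hypothetical url("kindle:flow:0002?mime=text/css") reference is repointed at
# ../Styles/style0002.css (flow fromBase32(b'0002') == 2), while kindle:embed:0003 in an
# image tag resolves to rscnames[2] under ../Images/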
flows = []
flows.append(None)
flowinfo = []
flowinfo.append([None, None, None, None])
# regular expression search patterns
# note: this character class matches more than just <img and <image tags;
# the startswith(b'<im') test in the loop below does the real filtering
img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)
tag_pattern = re.compile(br'''(<[^>]*>)''')
flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
url_pattern = re.compile(br'''(url\(.*?\))''', re.IGNORECASE)
url_img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*["')]''', re.IGNORECASE)
font_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)["')]''', re.IGNORECASE)
url_css_index_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)
url_svg_image_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=image/svg\+xml[^\)]*''', re.IGNORECASE)
for i in range(1, self.k8proc.getNumberOfFlows()):
[ftype, format, dir, filename] = self.k8proc.getFlowInfo(i)
flowpart = self.k8proc.getFlow(i)
# links to raster image files from image tags
# image_pattern
srcpieces = img_pattern.split(flowpart)
for j in range(1, len(srcpieces),2):
tag = srcpieces[j]
if tag.startswith(b'<im'):
for m in img_index_pattern.finditer(tag):
imageNumber = fromBase32(m.group(1))
imageName = self.rscnames[imageNumber-1]
if imageName is not None:
replacement = b'"../Images/' + utf8_str(imageName) + b'"'
self.used[imageName] = 'used'
tag = img_index_pattern.sub(replacement, tag, 1)
else:
print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag))
srcpieces[j] = tag
flowpart = b"".join(srcpieces)
# replacements inside css url():
srcpieces = url_pattern.split(flowpart)
for j in range(1, len(srcpieces),2):
tag = srcpieces[j]
# process links to raster image files
for m in url_img_index_pattern.finditer(tag):
imageNumber = fromBase32(m.group(1))
imageName = self.rscnames[imageNumber-1]
osep = m.group()[0:1]
csep = m.group()[-1:]
if imageName is not None:
replacement = osep + b'../Images/' + utf8_str(imageName) + csep
self.used[imageName] = 'used'
tag = url_img_index_pattern.sub(replacement, tag, 1)
else:
print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag))
# process links to fonts
for m in font_index_pattern.finditer(tag):
fontNumber = fromBase32(m.group(1))
fontName = self.rscnames[fontNumber-1]
osep = m.group()[0:1]
csep = m.group()[-1:]
if fontName is None:
print("Error: Referenced font %s was not recognized as a valid font in %s" % (fontNumber, tag))
else:
replacement = osep + b'../Fonts/' + utf8_str(fontName) + csep
tag = font_index_pattern.sub(replacement, tag, 1)
self.used[fontName] = 'used'
# process links to other css pieces
for m in url_css_index_pattern.finditer(tag):
num = fromBase32(m.group(1))
[typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
tag = url_css_index_pattern.sub(replacement, tag, 1)
self.used[fnm] = 'used'
# process links to svg images
for m in url_svg_image_pattern.finditer(tag):
num = fromBase32(m.group(1))
[typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
tag = url_svg_image_pattern.sub(replacement, tag, 1)
self.used[fnm] = 'used'
srcpieces[j] = tag
flowpart = b"".join(srcpieces)
# store away in our own copy
flows.append(flowpart)
# I do not think this case exists, and even if it does, it needs to be done in a separate
# pass to prevent inlining a flow piece into another flow piece before the inserted one or
# the target one has been fully processed,
# but keep it around in case it turns out we do need it
# flow pattern not inside url()
# srcpieces = tag_pattern.split(flowpart)
# for j in range(1, len(srcpieces),2):
# tag = srcpieces[j]
# if tag.startswith(b'<'):
# for m in flow_pattern.finditer(tag):
# num = fromBase32(m.group(1))
# [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
# flowtext = self.k8proc.getFlow(num)
# if fmt == b'inline':
# tag = flowtext
# else:
# replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
# tag = flow_pattern.sub(replacement, tag, 1)
# self.used[fnm] = 'used'
# srcpieces[j] = tag
# flowpart = b"".join(srcpieces)
# now handle the main text xhtml parts
# Handle the flow items in the XHTML text pieces
# kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
tag_pattern = re.compile(br'''(<[^>]*>)''')
flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
for i in range(len(parts)):
part = parts[i]
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
# flow pattern
srcpieces = tag_pattern.split(part)
for j in range(1, len(srcpieces),2):
tag = srcpieces[j]
if tag.startswith(b'<'):
for m in flow_pattern.finditer(tag):
num = fromBase32(m.group(1))
if num > 0 and num < len(self.k8proc.flowinfo):
[typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
flowpart = flows[num]
if fmt == b'inline':
tag = flowpart
else:
replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
tag = flow_pattern.sub(replacement, tag, 1)
self.used[fnm] = 'used'
else:
print("warning: ignoring non-existent flow link", tag, " value 0x%x" % num)
srcpieces[j] = tag
part = b''.join(srcpieces)
# store away modified version
parts[i] = part
# Handle any embedded raster images links in style= attributes urls
style_pattern = re.compile(br'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''', re.IGNORECASE)
img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)
for i in range(len(parts)):
part = parts[i]
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
# replace urls in style attributes
srcpieces = style_pattern.split(part)
for j in range(1, len(srcpieces),2):
tag = srcpieces[j]
if b'kindle:embed' in tag:
for m in img_index_pattern.finditer(tag):
imageNumber = fromBase32(m.group(1))
imageName = self.rscnames[imageNumber-1]
osep = m.group()[0:1]
csep = m.group()[-1:]
if imageName is not None:
replacement = osep + b'../Images/'+ utf8_str(imageName) + csep
self.used[imageName] = 'used'
tag = img_index_pattern.sub(replacement, tag, 1)
else:
print("Error: Referenced image %s in style url was not recognized in %s" % (imageNumber, tag))
srcpieces[j] = tag
part = b"".join(srcpieces)
# store away modified version
parts[i] = part
# Handle any embedded raster images links in the xhtml text
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
img_index_pattern = re.compile(br'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''')
for i in range(len(parts)):
part = parts[i]
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
# links to raster image files
# image_pattern
srcpieces = img_pattern.split(part)
for j in range(1, len(srcpieces),2):
tag = srcpieces[j]
if tag.startswith(b'<im'):
for m in img_index_pattern.finditer(tag):
imageNumber = fromBase32(m.group(1))
imageName = self.rscnames[imageNumber-1]
if imageName is not None:
replacement = b'"../Images/' + utf8_str(imageName) + b'"'
self.used[imageName] = 'used'
tag = img_index_pattern.sub(replacement, tag, 1)
else:
print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag))
srcpieces[j] = tag
part = b"".join(srcpieces)
# store away modified version
parts[i] = part
# finally perform any general cleanups needed to make valid XHTML
# these include:
# in svg tags replace "preserveaspectratio" attributes with "preserveAspectRatio"
# in svg tags replace "viewbox" attributes with "viewBox"
# in <li> remove value="XX" attributes since these are illegal
tag_pattern = re.compile(br'''(<[^>]*>)''')
li_value_pattern = re.compile(br'''\svalue\s*=\s*['"][^'"]*['"]''', re.IGNORECASE)
for i in range(len(parts)):
part = parts[i]
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
# tag pattern
srcpieces = tag_pattern.split(part)
for j in range(1, len(srcpieces),2):
tag = srcpieces[j]
if tag.startswith(b'<svg') or tag.startswith(b'<SVG'):
tag = tag.replace(b'preserveaspectratio',b'preserveAspectRatio')
tag = tag.replace(b'viewbox',b'viewBox')
elif tag.startswith(b'<li ') or tag.startswith(b'<LI '):
tagpieces = li_value_pattern.split(tag)
tag = b"".join(tagpieces)
srcpieces[j] = tag
part = b"".join(srcpieces)
# store away modified version
parts[i] = part
self.k8proc.setFlows(flows)
self.k8proc.setParts(parts)
return self.used

276
KindleUnpack/mobi_index.py Normal file
View File

@@ -0,0 +1,276 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import unicode_literals, division, absolute_import, print_function
from .compatibility_utils import PY2, bchr, bstr, bord
if PY2:
range = xrange
import struct
# note: struct pack, unpack, unpack_from all require bytestring format
# data all the way up to at least python 2.7.5; python 3 is okay with bytestrings
from .mobi_utils import toHex
class MobiIndex:
def __init__(self, sect, DEBUG=False):
self.sect = sect
self.DEBUG = DEBUG
def getIndexData(self, idx, label="Unknown"):
sect = self.sect
outtbl = []
ctoc_text = {}
if idx != 0xffffffff:
sect.setsectiondescription(idx,"{0} Main INDX section".format(label))
data = sect.loadSection(idx)
idxhdr, hordt1, hordt2 = self.parseINDXHeader(data)
IndexCount = idxhdr['count']
# handle the case of multiple sections used for CTOC
rec_off = 0
off = idx + IndexCount + 1
for j in range(idxhdr['nctoc']):
cdata = sect.loadSection(off + j)
sect.setsectiondescription(off+j, label + ' CTOC Data ' + str(j))
ctocdict = self.readCTOC(cdata)
for k in ctocdict:
ctoc_text[k + rec_off] = ctocdict[k]
rec_off += 0x10000
tagSectionStart = idxhdr['len']
controlByteCount, tagTable = readTagSection(tagSectionStart, data)
if self.DEBUG:
print("ControlByteCount is", controlByteCount)
print("IndexCount is", IndexCount)
print("TagTable: %s" % tagTable)
for i in range(idx + 1, idx + 1 + IndexCount):
sect.setsectiondescription(i,"{0} Extra {1:d} INDX section".format(label,i-idx))
data = sect.loadSection(i)
hdrinfo, ordt1, ordt2 = self.parseINDXHeader(data)
idxtPos = hdrinfo['start']
entryCount = hdrinfo['count']
if self.DEBUG:
print(idxtPos, entryCount)
# loop through to build up the IDXT position starts
idxPositions = []
for j in range(entryCount):
pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j))
idxPositions.append(pos)
# The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
idxPositions.append(idxtPos)
# for each entry in the IDXT build up the tagMap and any associated text
for j in range(entryCount):
startPos = idxPositions[j]
endPos = idxPositions[j+1]
textLength = ord(data[startPos:startPos+1])
text = data[startPos+1:startPos+1+textLength]
if hordt2 is not None:
text = b''.join(bchr(hordt2[bord(x)]) for x in text)
tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos)
outtbl.append([text, tagMap])
if self.DEBUG:
print(tagMap)
print(text)
return outtbl, ctoc_text
def parseINDXHeader(self, data):
"read INDX header"
if data[:4] != b'INDX':
print("Warning: index section is not INDX")
return False
words = (
'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc'
)
num = len(words)
values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)])
header = {}
for n in range(num):
header[words[n]] = values[n]
ordt1 = None
ordt2 = None
ocnt, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4)
if header['code'] == 0xfdea or ocnt != 0 or oentries > 0:
# horribly hacked up ESP (sample) mobi books use two ORDT sections but never specify
# them in the proper place in the header. They seem to be codepage 65002 which seems
# to be some sort of strange EBCDIC utf-8 or 16 encoded strings
# so we need to look for them and store them away to process leading text
# ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
# we only ever seem to use the second but ...
assert(ocnt == 1)
assert(data[op1:op1+4] == b'ORDT')
assert(data[op2:op2+4] == b'ORDT')
ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4)
ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4)
if self.DEBUG:
print("parsed INDX header:")
for n in words:
print(n, "%X" % header[n],)
print("")
return header, ordt1, ordt2
def readCTOC(self, txtdata):
# read all blocks from CTOC
ctoc_data = {}
offset = 0
while offset<len(txtdata):
if PY2:
if txtdata[offset] == b'\0':
break
else:
if txtdata[offset] == 0:
break
idx_offs = offset
# first n bytes: name len as vwi
pos, ilen = getVariableWidthValue(txtdata, offset)
offset += pos
# <len> next bytes: name
name = txtdata[offset:offset+ilen]
offset += ilen
if self.DEBUG:
print("name length is ", ilen)
print(idx_offs, name)
ctoc_data[idx_offs] = name
return ctoc_data
def getVariableWidthValue(data, offset):
'''
Decode variable width value from given bytes.
@param data: The bytes to decode.
@param offset: The start offset into data.
@return: Tuple of consumed bytes count and decoded value.
'''
value = 0
consumed = 0
finished = False
while not finished:
v = data[offset + consumed: offset + consumed + 1]
consumed += 1
if ord(v) & 0x80:
finished = True
value = (value << 7) | (ord(v) & 0x7f)
return consumed, value
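# a couple of worked examples (the final byte has its high bit set):
# >>> getVariableWidthValue(b'\x81', 0)
# (1, 1)
# >>> getVariableWidthValue(b'\x0b\x83', 0)
# (2, 1411)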
def readTagSection(start, data):
'''
Read tag section from given data.
@param start: The start position in the data.
@param data: The data to process.
@return: Tuple of control byte count and list of tag tuples.
'''
controlByteCount = 0
tags = []
if data[start:start+4] == b"TAGX":
firstEntryOffset, = struct.unpack_from(b'>L', data, start + 0x04)
controlByteCount, = struct.unpack_from(b'>L', data, start + 0x08)
# Skip the first 12 bytes already read above.
for i in range(12, firstEntryOffset, 4):
pos = start + i
tags.append((ord(data[pos:pos+1]), ord(data[pos+1:pos+2]), ord(data[pos+2:pos+3]), ord(data[pos+3:pos+4])))
return controlByteCount, tags
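# e.g. a minimal hypothetical TAGX block with one entry (tag 1, 1 value per entry, mask 0x01, end flag 0):
# >>> readTagSection(0, b'TAGX\x00\x00\x00\x10\x00\x00\x00\x01\x01\x01\x01\x00')
# (1, [(1, 1, 1, 0)])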
def countSetBits(value, bits=8):
'''
Count the set bits in the given value.
@param value: Integer value.
@param bits: The number of bits of the input value (defaults to 8).
@return: Number of set bits.
'''
count = 0
for _ in range(bits):
if value & 0x01 == 0x01:
count += 1
value = value >> 1
return count
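# >>> countSetBits(0x0b)   # binary 00001011
# 3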
def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos):
'''
Create a map of tags and values from the given byte section.
@param controlByteCount: The number of control bytes.
@param tagTable: The tag table.
@param entryData: The data to process.
@param startPos: The starting position in entryData.
@param endPos: The end position in entryData or None if it is unknown.
@return: Hashmap of tag and list of values.
'''
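# worked example (hypothetical): a tagTable entry (tag=3, valuesPerEntry=1, mask=0x0c, endFlag=0)
# against a control byte of 0x04 gives a masked value of 0x04 != mask, which shifts down to a
# valueCount of 1, so a single variable width value is then read from the data for tag 3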
tags = []
tagHashMap = {}
controlByteIndex = 0
dataStart = startPos + controlByteCount
for tag, valuesPerEntry, mask, endFlag in tagTable:
if endFlag == 0x01:
controlByteIndex += 1
continue
cbyte = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1])
if 0:
print("Control Byte Index %0x , Control Byte Value %0x" % (controlByteIndex, cbyte))
value = cbyte & mask
if value != 0:
if value == mask:
if countSetBits(mask) > 1:
# If all bits of masked value are set and the mask has more than one bit, a variable width value
# will follow after the control bytes which defines the length of bytes (NOT the value count!)
# which will contain the corresponding variable width values.
consumed, value = getVariableWidthValue(entryData, dataStart)
dataStart += consumed
tags.append((tag, None, value, valuesPerEntry))
else:
tags.append((tag, 1, None, valuesPerEntry))
else:
# Shift bits to get the masked value.
while mask & 0x01 == 0:
mask = mask >> 1
value = value >> 1
tags.append((tag, value, None, valuesPerEntry))
for tag, valueCount, valueBytes, valuesPerEntry in tags:
values = []
if valueCount is not None:
# Read valueCount * valuesPerEntry variable width values.
for _ in range(valueCount):
for _ in range(valuesPerEntry):
consumed, data = getVariableWidthValue(entryData, dataStart)
dataStart += consumed
values.append(data)
else:
# Convert valueBytes to variable width values.
totalConsumed = 0
while totalConsumed < valueBytes:
# Does this work for valuesPerEntry != 1?
consumed, data = getVariableWidthValue(entryData, dataStart)
dataStart += consumed
totalConsumed += consumed
values.append(data)
if totalConsumed != valueBytes:
print("Error: Should consume %s bytes, but consumed %s" % (valueBytes, totalConsumed))
tagHashMap[tag] = values
# Test that all bytes have been processed if endPos is given.
if endPos is not None and dataStart != endPos:
# The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
for char in entryData[dataStart:endPos]:
if bord(char) != 0:
print("Warning: There are unprocessed index bytes left: %s" % toHex(entryData[dataStart:endPos]))
if 0:
print("controlByteCount: %s" % controlByteCount)
print("tagTable: %s" % tagTable)
print("data: %s" % toHex(entryData[startPos:endPos]))
print("tagHashMap: %s" % tagHashMap)
break
return tagHashMap

494
KindleUnpack/mobi_k8proc.py Normal file
View File

@@ -0,0 +1,494 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import unicode_literals, division, absolute_import, print_function
from .compatibility_utils import PY2, bstr, utf8_str
if PY2:
range = xrange
import os
import struct
# note: struct pack, unpack, unpack_from all require bytestring format
# data all the way up to at least python 2.7.5; python 3 is okay with bytestrings
import re
# note: re requires the pattern to be the exact same type as the data to be searched in python3
# but u"" is not allowed for the pattern itself, only b""
from .mobi_index import MobiIndex
from .mobi_utils import fromBase32
from .unipath import pathof
_guide_types = [b'cover',b'title-page',b'toc',b'index',b'glossary',b'acknowledgements',
b'bibliography',b'colophon',b'copyright-page',b'dedication',
b'epigraph',b'foreward',b'loi',b'lot',b'notes',b'preface',b'text']
# locate beginning and ending positions of tag with specific aid attribute
def locate_beg_end_of_tag(ml, aid):
pattern = utf8_str(r'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % aid)
aid_pattern = re.compile(pattern,re.IGNORECASE)
for m in re.finditer(aid_pattern, ml):
plt = m.start()
pgt = ml.find(b'>',plt+1)
return plt, pgt
return 0, 0
# iterate over all tags in block in reverse order, i.e. last tag to first tag
def reverse_tag_iter(block):
end = len(block)
while True:
pgt = block.rfind(b'>', 0, end)
if pgt == -1:
break
plt = block.rfind(b'<', 0, pgt)
if plt == -1:
break
yield block[plt:pgt+1]
end = plt
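# >>> list(reverse_tag_iter(b'<p><b>x</b></p>'))
# [b'</p>', b'</b>', b'<b>', b'<p>']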
class K8Processor:
def __init__(self, mh, sect, files, debug=False):
self.sect = sect
self.files = files
self.mi = MobiIndex(sect)
self.mh = mh
self.skelidx = mh.skelidx
self.fragidx = mh.fragidx
self.guideidx = mh.guideidx
self.fdst = mh.fdst
self.flowmap = {}
self.flows = None
self.flowinfo = []
self.parts = None
self.partinfo = []
self.linked_aids = set()
self.fdsttbl= [0,0xffffffff]
self.DEBUG = debug
# read in and parse the FDST info which is very similar in format to the Palm DB section
# parsing except it provides offsets into rawML file and not the Palm DB file
# this is needed to split up the final css, svg, etc flow section
# that can exist at the end of the rawML file
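# e.g. a hypothetical FDST describing pieces [0, 50000) and [50000, 52000) unpacks to
# (0, 50000, 50000, 52000); keeping every other offset and appending rawSize gives
# fdsttbl == (0, 50000, rawSize)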
if self.fdst != 0xffffffff:
header = self.sect.loadSection(self.fdst)
if header[0:4] == b"FDST":
num_sections, = struct.unpack_from(b'>L', header, 0x08)
self.fdsttbl = struct.unpack_from(bstr('>%dL' % (num_sections*2)), header, 12)[::2] + (mh.rawSize, )
sect.setsectiondescription(self.fdst,"KF8 FDST INDX")
if self.DEBUG:
print("\nFDST Section Map: %d sections" % num_sections)
for j in range(num_sections):
print("Section %d: 0x%08X - 0x%08X" % (j, self.fdsttbl[j],self.fdsttbl[j+1]))
else:
print("\nError: K8 Mobi with Missing FDST info")
# read/process skeleton index info to create the skeleton table
skeltbl = []
if self.skelidx != 0xffffffff:
# for i in range(2):
# fname = 'skel%04d.dat' % i
# data = self.sect.loadSection(self.skelidx + i)
# with open(pathof(fname), 'wb') as f:
# f.write(data)
outtbl, ctoc_text = self.mi.getIndexData(self.skelidx, "KF8 Skeleton")
fileptr = 0
for [text, tagMap] in outtbl:
# file number, skeleton name, fragtbl record count, start position, length
skeltbl.append([fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]])
fileptr += 1
self.skeltbl = skeltbl
if self.DEBUG:
print("\nSkel Table: %d entries" % len(self.skeltbl))
print("table: filenum, skeleton name, frag tbl record count, start position, length")
for j in range(len(self.skeltbl)):
print(self.skeltbl[j])
# read/process the fragment index to create the fragment table
fragtbl = []
if self.fragidx != 0xffffffff:
# for i in range(3):
# fname = 'frag%04d.dat' % i
# data = self.sect.loadSection(self.fragidx + i)
# with open(pathof(fname), 'wb') as f:
# f.write(data)
outtbl, ctoc_text = self.mi.getIndexData(self.fragidx, "KF8 Fragment")
for [text, tagMap] in outtbl:
# insert position, ctoc offset (aidtext), file number, sequence number, start position, length
ctocoffset = tagMap[2][0]
ctocdata = ctoc_text[ctocoffset]
fragtbl.append([int(text), ctocdata, tagMap[3][0], tagMap[4][0], tagMap[6][0], tagMap[6][1]])
self.fragtbl = fragtbl
if self.DEBUG:
print("\nFragment Table: %d entries" % len(self.fragtbl))
print("table: file position, link id text, file num, sequence number, start position, length")
for j in range(len(self.fragtbl)):
print(self.fragtbl[j])
# read / process guide index for guide elements of opf
guidetbl = []
if self.guideidx != 0xffffffff:
# for i in range(3):
# fname = 'guide%04d.dat' % i
# data = self.sect.loadSection(self.guideidx + i)
# with open(pathof(fname), 'wb') as f:
# f.write(data)
outtbl, ctoc_text = self.mi.getIndexData(self.guideidx, "KF8 Guide elements")
for [text, tagMap] in outtbl:
# ref_type, ref_title, frag number
ctocoffset = tagMap[1][0]
ref_title = ctoc_text[ctocoffset]
ref_type = text
fileno = None
if 3 in tagMap:
fileno = tagMap[3][0]
if 6 in tagMap:
fileno = tagMap[6][0]
guidetbl.append([ref_type, ref_title, fileno])
self.guidetbl = guidetbl
if self.DEBUG:
print("\nGuide Table: %d entries" % len(self.guidetbl))
print("table: ref_type, ref_title, fragtbl entry number")
for j in range(len(self.guidetbl)):
print(self.guidetbl[j])
def buildParts(self, rawML):
# now split the rawML into its flow pieces
self.flows = []
for j in range(0, len(self.fdsttbl)-1):
start = self.fdsttbl[j]
end = self.fdsttbl[j+1]
self.flows.append(rawML[start:end])
# the first piece represents the xhtml text
text = self.flows[0]
self.flows[0] = b''
# walk the skeleton and fragment tables to build the original source xhtml files
# *without* destroying any file position information needed for later href processing
# and create final list of file separation start: stop points and etc in partinfo
if self.DEBUG:
print("\nRebuilding flow piece 0: the main body of the ebook")
self.parts = []
self.partinfo = []
fragptr = 0
baseptr = 0
cnt = 0
for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl:
baseptr = skelpos + skellen
skeleton = text[skelpos: baseptr]
for i in range(fragcnt):
[insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[fragptr]
aidtext = idtext[12:-2]
if i == 0:
filename = 'part%04d.xhtml' % filenum
slice = text[baseptr: baseptr + length]
insertpos = insertpos - skelpos
head = skeleton[:insertpos]
tail = skeleton[insertpos:]
actual_inspos = insertpos
if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') < head.rfind(b'<')):
# There is an incomplete tag in either the head or tail.
# This can happen for some badly formed KF8 files
print('The fragment table for %s has incorrect insert position. Calculating manually.' % skelname)
bp, ep = locate_beg_end_of_tag(skeleton, aidtext)
if bp != ep:
actual_inspos = ep + 1 + startpos
if insertpos != actual_inspos:
print("fixed corrupt fragment table insert position", insertpos+skelpos, actual_inspos+skelpos)
insertpos = actual_inspos
self.fragtbl[fragptr][0] = actual_inspos + skelpos
skeleton = skeleton[0:insertpos] + slice + skeleton[insertpos:]
baseptr = baseptr + length
fragptr += 1
cnt += 1
self.parts.append(skeleton)
self.partinfo.append([skelnum, 'Text', filename, skelpos, baseptr, aidtext])
assembled_text = b''.join(self.parts)
if self.DEBUG:
outassembled = os.path.join(self.files.k8dir, 'assembled_text.dat')
with open(pathof(outassembled),'wb') as f:
f.write(assembled_text)
# The primary css style sheet is typically stored next followed by any
# snippets of code that were previously inlined in the
# original xhtml but have been stripped out and placed here.
# This can include local CDATA snippets and svg sections.
# The problem is that for most browsers and ereaders, you can not
# use <img src="imageXXXX.svg" /> to import any svg image that itself
# properly uses an <image/> tag to import some raster image - it
# should work according to the spec but does not for almost all browsers
# and ereaders and causes epub validation issues because those raster
# images are in the manifest but not in the xhtml text - since they are only
# referenced from an svg image
# So we need to check the remaining flow pieces to see if they are css
# or svg images. if svg images, we must check if they have an <image />
# and if so inline them into the xhtml text pieces.
# there may be other sorts of pieces stored here but until we see one
# in the wild to reverse engineer we won't be able to tell
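# e.g. (hypothetical outcomes of the checks below) a flow holding <svg ...><image .../></svg>
# is inlined into the xhtml, an svg without an <image/> becomes Images/svgimgNNNN.svg,
# and plain css rules without CDATA become Styles/styleNNNN.css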
self.flowinfo.append([None, None, None, None])
svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE)
image_tag_pattern = re.compile(br'''(<image[^>]*>)''', re.IGNORECASE)
for j in range(1,len(self.flows)):
flowpart = self.flows[j]
nstr = '%04d' % j
m = re.search(svg_tag_pattern, flowpart)
if m is not None:
# svg
ptype = b'svg'
start = m.start()
m2 = re.search(image_tag_pattern, flowpart)
if m2 is not None:
pformat = b'inline'
pdir = None
fname = None
# strip off anything before <svg if inlining
flowpart = flowpart[start:]
else:
pformat = b'file'
pdir = "Images"
fname = 'svgimg' + nstr + '.svg'
else:
# search for CDATA and if it exists inline it
if flowpart.find(b'[CDATA[') >= 0:
ptype = b'css'
flowpart = b'<style type="text/css">\n' + flowpart + b'\n</style>\n'
pformat = b'inline'
pdir = None
fname = None
else:
# css - assume a standalone css file
ptype = b'css'
pformat = b'file'
pdir = "Styles"
fname = 'style' + nstr + '.css'
self.flows[j] = flowpart
self.flowinfo.append([ptype, pformat, pdir, fname])
if self.DEBUG:
print("\nFlow Map: %d entries" % len(self.flowinfo))
for fi in self.flowinfo:
print(fi)
print("\n")
print("\nXHTML File Part Position Information: %d entries" % len(self.partinfo))
for pi in self.partinfo:
print(pi)
if False: # self.Debug:
# dump all of the locations of the aid tags used in TEXT
# find id links only inside of tags
# inside any < > pair find all "aid=' and return whatever is inside the quotes
# [^>]* means match any amount of chars except for '>' char
# [^'"] match any amount of chars except for the quote character
# \s* means match any amount of whitespace
print("\npositions of all aid= pieces")
id_pattern = re.compile(br'''<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>''',re.IGNORECASE)
for m in re.finditer(id_pattern, rawML):
[filename, partnum, start, end] = self.getFileInfo(m.start())
[seqnum, idtext] = self.getFragTblInfo(m.start())
value = fromBase32(m.group(1))
print(" aid: %s value: %d at: %d -> part: %d, start: %d, end: %d" % (m.group(1), value, m.start(), partnum, start, end))
print(" %s fragtbl entry %d" % (idtext, seqnum))
return
# get information about the fragment table entry at pos
def getFragTblInfo(self, pos):
for j in range(len(self.fragtbl)):
[insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[j]
if pos >= insertpos and pos < (insertpos + length):
# why are the "in: " and "before: " prefixes added here?
return seqnum, b'in: ' + idtext
if pos < insertpos:
return seqnum, b'before: ' + idtext
return None, None
# get information about the part (file) that exists at pos in original rawML
def getFileInfo(self, pos):
for [partnum, pdir, filename, start, end, aidtext] in self.partinfo:
if pos >= start and pos < end:
return filename, partnum, start, end
return None, None, None, None
# accessor functions to properly protect the internal structure
def getNumberOfParts(self):
return len(self.parts)
def getPart(self,i):
if i >= 0 and i < len(self.parts):
return self.parts[i]
return None
def getPartInfo(self, i):
if i >= 0 and i < len(self.partinfo):
return self.partinfo[i]
return None
def getNumberOfFlows(self):
return len(self.flows)
def getFlow(self,i):
# note flows[0] is empty - it was all of the original text
if i > 0 and i < len(self.flows):
return self.flows[i]
return None
def getFlowInfo(self,i):
# note flowinfo[0] is empty - it was all of the original text
if i > 0 and i < len(self.flowinfo):
return self.flowinfo[i]
return None
def getIDTagByPosFid(self, posfid, offset):
# first convert kindle:pos:fid and offset info to position in file
# (fromBase32 can handle both string types on input)
row = fromBase32(posfid)
off = fromBase32(offset)
[insertpos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[row]
pos = insertpos + off
fname, pn, skelpos, skelend = self.getFileInfo(pos)
if fname is None:
# pos does not exist
# default to skeleton pos instead
print("Link To Position", pos, "does not exist, retargeting to top of target")
pos = self.skeltbl[filenum][3]
fname, pn, skelpos, skelend = self.getFileInfo(pos)
# an existing "id=" or "name=" attribute must exist in original xhtml otherwise it would not have worked for linking.
# Amazon seems to have added its own additional "aid=" inside tags whose contents seem to represent
# some position information encoded into Base32 name.
# so find the closest "id=" before position the file by actually searching in that file
idtext = self.getIDTag(pos)
return fname, idtext
def getIDTag(self, pos):
# find the first tag with a named anchor (name or id attribute) before pos
fname, pn, skelpos, skelend = self.getFileInfo(pos)
if pn is None and skelpos is None:
print("Error: getIDTag - no file contains ", pos)
textblock = self.parts[pn]
npos = pos - skelpos
# if npos is inside a tag then search all text before its end-of-tag marker
pgt = textblock.find(b'>',npos)
plt = textblock.find(b'<',npos)
if plt == npos or pgt < plt:
npos = pgt + 1
# find id and name attributes only inside of tags
# use a reverse tag search since that is faster
# inside any < > pair find "id=" and "name=" attributes return it
# [^>]* means match any amount of chars except for '>' char
# [^'"] match any amount of chars except for the quote character
# \s* means match any amount of whitespace
textblock = textblock[0:npos]
id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
name_pattern = re.compile(br'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
aid_pattern = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')
for tag in reverse_tag_iter(textblock):
# any ids in the body should default to top of file
if tag[0:6] == b'<body ':
return b''
if tag[0:6] != b'<meta ':
m = id_pattern.match(tag) or name_pattern.match(tag)
if m is not None:
return m.group(1)
m = aid_pattern.match(tag)
if m is not None:
self.linked_aids.add(m.group(1))
return b'aid-' + m.group(1)
return b''
# do we need to do deep copying
def setParts(self, parts):
assert(len(parts) == len(self.parts))
for i in range(len(parts)):
self.parts[i] = parts[i]
# do we need to do deep copying
def setFlows(self, flows):
assert(len(flows) == len(self.flows))
for i in range(len(flows)):
self.flows[i] = flows[i]
# get information about the part (file) that exists at pos in original rawML
def getSkelInfo(self, pos):
for [partnum, pdir, filename, start, end, aidtext] in self.partinfo:
if pos >= start and pos < end:
return [partnum, pdir, filename, start, end, aidtext]
return [None, None, None, None, None, None]
# fileno is actually a reference into fragtbl (a fragment)
def getGuideText(self):
guidetext = b''
for [ref_type, ref_title, fileno] in self.guidetbl:
if ref_type == b'thumbimagestandard':
continue
if ref_type not in _guide_types and not ref_type.startswith(b'other.'):
if ref_type == b'start':
ref_type = b'text'
else:
ref_type = b'other.' + ref_type
[pos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[fileno]
[pn, pdir, filename, skelpos, skelend, aidtext] = self.getSkelInfo(pos)
idtext = self.getIDTag(pos)
linktgt = filename.encode('utf-8')
if idtext != b'':
linktgt += b'#' + idtext
guidetext += b'<reference type="'+ref_type+b'" title="'+ref_title+b'" href="'+utf8_str(pdir)+b'/'+linktgt+b'" />\n'
# opf is encoded utf-8 so must convert any titles properly
guidetext = (guidetext.decode(self.mh.codec)).encode("utf-8")
return guidetext
def getPageIDTag(self, pos):
# find the first tag with a named anchor (name or id attribute) before pos
# but page map offsets need a little more leeway, so if the offset points
# into a tag, look for the next ending tag "/>" or "</" and start the search from there.
fname, pn, skelpos, skelend = self.getFileInfo(pos)
if pn is None and skelpos is None:
print("Error: getIDTag - no file contains ", pos)
textblock = self.parts[pn]
npos = pos - skelpos
# if npos inside a tag then search all text before next ending tag
pgt = textblock.find(b'>',npos)
plt = textblock.find(b'<',npos)
if plt == npos or pgt < plt:
# we are in a tag
# so find first ending tag
pend1 = textblock.find(b'/>', npos)
pend2 = textblock.find(b'</', npos)
if pend1 != -1 and pend2 != -1:
pend = min(pend1, pend2)
else:
pend = max(pend1, pend2)
if pend != -1:
npos = pend
else:
npos = pgt + 1
# find id and name attributes only inside of tags
# use a reverse tag search since that is faster
# inside any < > pair find "id=" and "name=" attributes return it
# [^>]* means match any amount of chars except for '>' char
# [^'"] match any amount of chars except for the quote character
# \s* means match any amount of whitespace
textblock = textblock[0:npos]
id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
name_pattern = re.compile(br'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
for tag in reverse_tag_iter(textblock):
# any ids in the body should default to top of file
if tag[0:6] == b'<body ':
return b''
if tag[0:6] != b'<meta ':
m = id_pattern.match(tag) or name_pattern.match(tag)
if m is not None:
return m.group(1)
return b''

268
KindleUnpack/mobi_k8resc.py Normal file
View File

@@ -0,0 +1,268 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import unicode_literals, division, absolute_import, print_function
DEBUG_USE_ORDERED_DICTIONARY = False  # OrderedDict is supported on python >= 2.7.
""" set to True to use OrderedDict for K8RESCProcessor.parsetag.tattr."""
if DEBUG_USE_ORDERED_DICTIONARY:
from collections import OrderedDict as dict_
else:
dict_ = dict
from .compatibility_utils import unicode_str
from .mobi_utils import fromBase32
_OPF_PARENT_TAGS = ['xml', 'package', 'metadata', 'dc-metadata',
'x-metadata', 'manifest', 'spine', 'tours', 'guide']
class K8RESCProcessor(object):
def __init__(self, data, debug=False):
self._debug = debug
self.resc = None
self.opos = 0
self.extrameta = []
self.cover_name = None
self.spine_idrefs = {}
self.spine_order = []
self.spine_pageattributes = {}
self.spine_ppd = None
# need3 indicates the book has fields which require epub3.
# but the estimation of the source epub version from the fields is difficult.
self.need3 = False
self.package_ver = None
self.extra_metadata = []
self.refines_metadata = []
self.extra_attributes = []
# get header
start_pos = data.find(b'<')
self.resc_header = data[:start_pos]
# get resc data length
start = self.resc_header.find(b'=') + 1
end = self.resc_header.find(b'&', start)
resc_size = 0
if end > 0:
resc_size = fromBase32(self.resc_header[start:end])
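# e.g. a hypothetical header whose first '=' field is b'size=1FG&' gives
# resc_size == fromBase32(b'1FG') == 1520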
resc_rawbytes = len(data) - start_pos
if resc_rawbytes == resc_size:
self.resc_length = resc_size
else:
# Most RESC sections have a nul string at the tail but some do not.
end_pos = data.find(b'\x00', start_pos)
if end_pos < 0:
self.resc_length = resc_rawbytes
else:
self.resc_length = end_pos - start_pos
if self.resc_length != resc_size:
print("Warning: RESC section length({:d}bytes) does not match its size({:d}bytes).".format(self.resc_length, resc_size))
# now parse RESC after converting it to unicode from utf-8
self.resc = unicode_str(data[start_pos:start_pos+self.resc_length])
self.parseData()
def prepend_to_spine(self, key, idref, linear, properties):
self.spine_order = [key] + self.spine_order
self.spine_idrefs[key] = idref
attributes = {}
if linear is not None:
attributes['linear'] = linear
if properties is not None:
attributes['properties'] = properties
self.spine_pageattributes[key] = attributes
# RESC tag iterator
def resc_tag_iter(self):
tcontent = last_tattr = None
prefix = ['']
while True:
text, tag = self.parseresc()
if text is None and tag is None:
break
if text is not None:
tcontent = text.rstrip(' \r\n')
else: # we have a tag
ttype, tname, tattr = self.parsetag(tag)
if ttype == 'begin':
tcontent = None
prefix.append(tname + '.')
if tname in _OPF_PARENT_TAGS:
yield ''.join(prefix), tname, tattr, tcontent
else:
last_tattr = tattr
else: # single or end
if ttype == 'end':
prefix.pop()
tattr = last_tattr
last_tattr = None
if tname in _OPF_PARENT_TAGS:
tname += '-end'
yield ''.join(prefix), tname, tattr, tcontent
tcontent = None
# now parse the RESC to extract spine and extra metadata info
def parseData(self):
for prefix, tname, tattr, tcontent in self.resc_tag_iter():
if self._debug:
print(" Parsing RESC: ", prefix, tname, tattr, tcontent)
if tname == 'package':
self.package_ver = tattr.get('version', '2.0')
package_prefix = tattr.get('prefix','')
if self.package_ver.startswith('3') or package_prefix.startswith('rendition'):
self.need3 = True
if tname == 'spine':
self.spine_ppd = tattr.get('page-progression-direction', None)
if self.spine_ppd == 'rtl':
self.need3 = True
if tname == 'itemref':
skelid = tattr.pop('skelid', None)
if skelid is None and len(self.spine_order) == 0:
# assume it was the removed initial coverpage
skelid = 'coverpage'
tattr['linear'] = 'no'
self.spine_order.append(skelid)
idref = tattr.pop('idref', None)
if idref is not None:
idref = 'x_' + idref
self.spine_idrefs[skelid] = idref
if 'id' in tattr:
del tattr['id']
# tattr["id"] = 'x_' + tattr["id"]
if 'properties' in tattr:
self.need3 = True
self.spine_pageattributes[skelid] = tattr
if tname == 'meta' or tname.startswith('dc:'):
if 'refines' in tattr or 'property' in tattr:
self.need3 = True
if tattr.get('name','') == 'cover':
cover_name = tattr.get('content',None)
if cover_name is not None:
cover_name = 'x_' + cover_name
self.cover_name = cover_name
else:
self.extrameta.append([tname, tattr, tcontent])
# parse and return either leading text or the next tag
def parseresc(self):
p = self.opos
if p >= len(self.resc):
return None, None
if self.resc[p] != '<':
res = self.resc.find('<',p)
if res == -1 :
res = len(self.resc)
self.opos = res
return self.resc[p:res], None
# handle comment as a special case
if self.resc[p:p+4] == '<!--':
te = self.resc.find('-->',p+1)
if te != -1:
te = te+2
else:
te = self.resc.find('>',p+1)
ntb = self.resc.find('<',p+1)
if ntb != -1 and ntb < te:
self.opos = ntb
return self.resc[p:ntb], None
self.opos = te + 1
return None, self.resc[p:te+1]
# parses tag to identify: [ttype, tname, tattr]
# ttype: tag type ('begin', 'end' or 'single')
# tname: tag name
# tattr: dictionary of tag attributes
def parsetag(self, s):
p = 1
tname = None
ttype = None
tattr = dict_()
while s[p:p+1] == ' ' :
p += 1
if s[p:p+1] == '/':
ttype = 'end'
p += 1
while s[p:p+1] == ' ' :
p += 1
b = p
while s[p:p+1] not in ('>', '/', ' ', '"', "'",'\r','\n') :
p += 1
tname=s[b:p].lower()
# some special cases
if tname == '?xml':
tname = 'xml'
if tname == '!--':
ttype = 'single'
comment = s[p:-3].strip()
tattr['comment'] = comment
if ttype is None:
# parse any attributes of begin or single tags
while s.find('=',p) != -1 :
while s[p:p+1] == ' ' :
p += 1
b = p
while s[p:p+1] != '=' :
p += 1
aname = s[b:p].lower()
aname = aname.rstrip(' ')
p += 1
while s[p:p+1] == ' ' :
p += 1
if s[p:p+1] in ('"', "'") :
p = p + 1
b = p
while s[p:p+1] not in ('"', "'"):
p += 1
val = s[b:p]
p += 1
else :
b = p
while s[p:p+1] not in ('>', '/', ' ') :
p += 1
val = s[b:p]
tattr[aname] = val
if ttype is None:
ttype = 'begin'
if s.find('/',p) >= 0:
ttype = 'single'
return ttype, tname, tattr
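# e.g. a hypothetical parsetag('<meta name="cover" content="my-cover-id"/>') returns
# ('single', 'meta', {'name': 'cover', 'content': 'my-cover-id'})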
def taginfo_toxml(self, taginfo):
res = []
tname, tattr, tcontent = taginfo
res.append('<' + tname)
if tattr is not None:
for key in tattr:
res.append(' ' + key + '="'+tattr[key]+'"')
if tcontent is not None:
res.append('>' + tcontent + '</' + tname + '>\n')
else:
res.append('/>\n')
return "".join(res)
def hasSpine(self):
return len(self.spine_order) > 0
def needEPUB3(self):
return self.need3
def hasRefines(self):
for [tname, tattr, tcontent] in self.extrameta:
if 'refines' in tattr:
return True
return False
def createMetadata(self, epubver):
for taginfo in self.extrameta:
tname, tattr, tcontent = taginfo
if 'refines' in tattr:
if epubver == 'F' and 'property' in tattr:
attr = ' id="%s" opf:%s="%s"\n' % (tattr['refines'], tattr['property'], tcontent)
self.extra_attributes.append(attr)
else:
tag = self.taginfo_toxml(taginfo)
self.refines_metadata.append(tag)
else:
tag = self.taginfo_toxml(taginfo)
self.extra_metadata.append(tag)

186
KindleUnpack/mobi_nav.py Normal file
View File

@@ -0,0 +1,186 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import unicode_literals, division, absolute_import, print_function
from .compatibility_utils import unicode_str
import os
from .unipath import pathof
import re
# note: re requires the pattern to be the exact same type as the data to be searched in python3
# but u"" is not allowed for the pattern itself, only b""
DEBUG_NAV = False
FORCE_DEFAULT_TITLE = False
""" Set to True to force to use the default title. """
NAVIGATION_FILENAME = 'nav.xhtml'
""" The name for the navigation document. """
DEFAULT_TITLE = 'Navigation'
""" The default title for the navigation document. """
class NAVProcessor(object):
def __init__(self, files):
self.files = files
self.navname = NAVIGATION_FILENAME
def buildLandmarks(self, guidetext):
header = ''
header += ' <nav epub:type="landmarks" id="landmarks" hidden="">\n'
header += ' <h2>Guide</h2>\n'
header += ' <ol>\n'
element = ' <li><a epub:type="{:s}" href="{:s}">{:s}</a></li>\n'
footer = ''
footer += ' </ol>\n'
footer += ' </nav>\n'
type_map = {
'cover' : 'cover',
'title-page' : 'title-page',
# ?: 'frontmatter',
'text' : 'bodymatter',
# ?: 'backmatter',
'toc' : 'toc',
'loi' : 'loi',
'lot' : 'lot',
'preface' : 'preface',
'bibliography' : 'bibliography',
'index' : 'index',
'glossary' : 'glossary',
'acknowledgements' : 'acknowledgements',
'colophon' : None,
'copyright-page' : None,
'dedication' : None,
'epigraph' : None,
'foreword' : None,
'notes' : None
}
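# e.g. a hypothetical guide reference of type "toc" titled "Contents" linking to part0004.xhtml
# renders as: <li><a epub:type="toc" href="part0004.xhtml">Contents</a></li>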
re_type = re.compile(r'\s+type\s*=\s*"(.*?)"', re.I)
re_title = re.compile(r'\s+title\s*=\s*"(.*?)"', re.I)
re_link = re.compile(r'\s+href\s*=\s*"(.*?)"', re.I)
dir_ = os.path.relpath(self.files.k8text, self.files.k8oebps).replace('\\', '/')
data = ''
references = re.findall(r'<reference\s+.*?>', unicode_str(guidetext), re.I)
for reference in references:
mo_type = re_type.search(reference)
mo_title = re_title.search(reference)
mo_link = re_link.search(reference)
if mo_type is not None:
type_ = type_map.get(mo_type.group(1), None)
else:
type_ = None
if mo_title is not None:
title = mo_title.group(1)
else:
title = None
if mo_link is not None:
link = mo_link.group(1)
else:
link = None
if type_ is not None and title is not None and link is not None:
link = os.path.relpath(link, dir_).replace('\\', '/')
data += element.format(type_, link, title)
if len(data) > 0:
return header + data + footer
else:
return ''
def buildTOC(self, indx_data):
header = ''
header += ' <nav epub:type="toc" id="toc">\n'
header += ' <h1>Table of contents</h1>\n'
footer = ' </nav>\n'
# recursive part
def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
if start>len(indx_data) or end>len(indx_data):
print("Warning (in buildTOC): missing INDX child entries", start, end, len(indx_data))
return '', max_lvl, num
if DEBUG_NAV:
print("recursINDX (in buildTOC) lvl %d from %d to %d" % (lvl, start, end))
xhtml = ''
if start <= 0:
start = 0
if end <= 0:
end = len(indx_data)
if lvl > max_lvl:
max_lvl = lvl
indent1 = ' ' * (2 + lvl * 2)
indent2 = ' ' * (3 + lvl * 2)
xhtml += indent1 + '<ol>\n'
for i in range(start, end):
e = indx_data[i]
htmlfile = e['filename']
desttag = e['idtag']
text = e['text']
if not e['hlvl'] == lvl:
continue
num += 1
if desttag == '':
link = htmlfile
else:
link = '{:s}#{:s}'.format(htmlfile, desttag)
xhtml += indent2 + '<li>'
entry = '<a href="{:}">{:s}</a>'.format(link, text)
xhtml += entry
# recurse
if e['child1'] >= 0:
xhtml += '\n'
xhtmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
e['child1'], e['childn'] + 1)
xhtml += xhtmlrec
xhtml += indent2
# close entry
xhtml += '</li>\n'
xhtml += indent1 + '</ol>\n'
return xhtml, max_lvl, num
data, max_lvl, num = recursINDX()
if not len(indx_data) == num:
print("Warning (in buildTOC): different number of entries in NCX", len(indx_data), num)
return header + data + footer
def buildNAV(self, ncx_data, guidetext, title, lang):
print("Building Navigation Document.")
if FORCE_DEFAULT_TITLE:
title = DEFAULT_TITLE
nav_header = ''
nav_header += '<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE html>'
nav_header += '<html xmlns="http://www.w3.org/1999/xhtml"'
nav_header += ' xmlns:epub="http://www.idpf.org/2007/ops"'
nav_header += ' lang="{0:s}" xml:lang="{0:s}">\n'.format(lang)
nav_header += '<head>\n<title>{:s}</title>\n'.format(title)
nav_header += '<meta charset="UTF-8" />\n'
nav_header += '<style type="text/css">\n'
nav_header += 'nav#landmarks { display:none; }\n'
nav_header += '</style>\n</head>\n<body>\n'
nav_footer = '</body>\n</html>\n'
landmarks = self.buildLandmarks(guidetext)
toc = self.buildTOC(ncx_data)
data = nav_header
data += landmarks
data += toc
data += nav_footer
return data
def getNAVName(self):
return self.navname
def writeNAV(self, ncx_data, guidetext, metadata):
# build the xhtml
# print("Write Navigation Document.")
xhtml = self.buildNAV(ncx_data, guidetext, metadata.get('Title')[0], metadata.get('Language')[0])
fname = os.path.join(self.files.k8text, self.navname)
with open(pathof(fname), 'wb') as f:
f.write(xhtml.encode('utf-8'))

272
KindleUnpack/mobi_ncx.py Normal file
View File

@@ -0,0 +1,272 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import unicode_literals, division, absolute_import, print_function
import os
from .unipath import pathof
import re
# note: re requires the pattern to be the exact same type as the data to be searched in python3
# but u"" is not allowed for the pattern itself, only b""
from .mobi_utils import toBase32
from .mobi_index import MobiIndex
DEBUG_NCX = False
class ncxExtract:
def __init__(self, mh, files):
self.mh = mh
self.sect = self.mh.sect
self.files = files
self.isNCX = False
self.mi = MobiIndex(self.sect)
self.ncxidx = self.mh.ncxidx
self.indx_data = None
def parseNCX(self):
indx_data = []
tag_fieldname_map = {
1: ['pos',0],
2: ['len',0],
3: ['noffs',0],
4: ['hlvl',0],
5: ['koffs',0],
6: ['pos_fid',0],
21: ['parent',0],
22: ['child1',0],
23: ['childn',0]
}
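# e.g. a hypothetical tagMap of {1: [1234], 3: [56], 4: [0]} describes an entry at
# filepos 1234 with heading level 0 whose display text lives at CTOC offset 56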
if self.ncxidx != 0xffffffff:
outtbl, ctoc_text = self.mi.getIndexData(self.ncxidx, "NCX")
if DEBUG_NCX:
print(ctoc_text)
print(outtbl)
num = 0
for [text, tagMap] in outtbl:
tmp = {
'name': text.decode('utf-8'),
'pos': -1,
'len': 0,
'noffs': -1,
'text' : "Unknown Text",
'hlvl' : -1,
'kind' : "Unknown Kind",
'pos_fid' : None,
'parent' : -1,
'child1' : -1,
'childn' : -1,
'num' : num
}
for tag in tag_fieldname_map:
[fieldname, i] = tag_fieldname_map[tag]
if tag in tagMap:
fieldvalue = tagMap[tag][i]
if tag == 6:
pos_fid = toBase32(fieldvalue,4).decode('utf-8')
fieldvalue2 = tagMap[tag][i+1]
pos_off = toBase32(fieldvalue2,10).decode('utf-8')
fieldvalue = 'kindle:pos:fid:%s:off:%s' % (pos_fid, pos_off)
tmp[fieldname] = fieldvalue
if tag == 3:
toctext = ctoc_text.get(fieldvalue, 'Unknown Text')
toctext = toctext.decode(self.mh.codec)
tmp['text'] = toctext
if tag == 5:
kindtext = ctoc_text.get(fieldvalue, 'Unknown Kind')
kindtext = kindtext.decode(self.mh.codec)
tmp['kind'] = kindtext
indx_data.append(tmp)
if DEBUG_NCX:
print("record number: ", num)
print("name: ", tmp['name'],)
print("position", tmp['pos']," length: ", tmp['len'])
print("text: ", tmp['text'])
print("kind: ", tmp['kind'])
print("heading level: ", tmp['hlvl'])
print("parent:", tmp['parent'])
print("first child: ",tmp['child1']," last child: ", tmp['childn'])
print("pos_fid is ", tmp['pos_fid'])
print("\n\n")
num += 1
self.indx_data = indx_data
return indx_data
def buildNCX(self, htmlfile, title, ident, lang):
indx_data = self.indx_data
ncx_header = \
'''<?xml version='1.0' encoding='utf-8'?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="%s">
<head>
<meta content="%s" name="dtb:uid"/>
<meta content="%d" name="dtb:depth"/>
<meta content="mobiunpack.py" name="dtb:generator"/>
<meta content="0" name="dtb:totalPageCount"/>
<meta content="0" name="dtb:maxPageNumber"/>
</head>
<docTitle>
<text>%s</text>
</docTitle>
<navMap>
'''
ncx_footer = \
''' </navMap>
</ncx>
'''
ncx_entry = \
'''<navPoint id="%s" playOrder="%d">
<navLabel>
<text>%s</text>
</navLabel>
<content src="%s"/>'''
# recursive part
def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
if start>len(indx_data) or end>len(indx_data):
print("Warning: missing INDX child entries", start, end, len(indx_data))
return '', max_lvl, num
if DEBUG_NCX:
print("recursINDX lvl %d from %d to %d" % (lvl, start, end))
xml = ''
if start <= 0:
start = 0
if end <= 0:
end = len(indx_data)
if lvl > max_lvl:
max_lvl = lvl
indent = ' ' * (2 + lvl)
for i in range(start, end):
e = indx_data[i]
if not e['hlvl'] == lvl:
continue
# open entry
num += 1
link = '%s#filepos%d' % (htmlfile, e['pos'])
tagid = 'np_%d' % num
entry = ncx_entry % (tagid, num, e['text'], link)
entry = re.sub(re.compile('^', re.M), indent, entry, 0)
xml += entry + '\n'
# recurse
if e['child1']>=0:
xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
e['child1'], e['childn'] + 1)
xml += xmlrec
# close entry
xml += indent + '</navPoint>\n'
return xml, max_lvl, num
body, max_lvl, num = recursINDX()
header = ncx_header % (lang, ident, max_lvl + 1, title)
ncx = header + body + ncx_footer
if len(indx_data) != num:
print("Warning: different number of entries in NCX", len(indx_data), num)
return ncx
def writeNCX(self, metadata):
# build the xml
self.isNCX = True
print("Write ncx")
# htmlname = os.path.basename(self.files.outbase)
# htmlname += '.html'
htmlname = 'book.html'
# 'en' is an assumed fallback so a missing Language EXTH does not crash on None
xml = self.buildNCX(htmlname, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language', ['en'])[0])
# write the ncx file
# ncxname = os.path.join(self.files.mobi7dir, self.files.getInputFileBasename() + '.ncx')
ncxname = os.path.join(self.files.mobi7dir, 'toc.ncx')
with open(pathof(ncxname), 'wb') as f:
f.write(xml.encode('utf-8'))
def buildK8NCX(self, indx_data, title, ident, lang):
ncx_header = \
'''<?xml version='1.0' encoding='utf-8'?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="%s">
<head>
<meta content="%s" name="dtb:uid"/>
<meta content="%d" name="dtb:depth"/>
<meta content="mobiunpack.py" name="dtb:generator"/>
<meta content="0" name="dtb:totalPageCount"/>
<meta content="0" name="dtb:maxPageNumber"/>
</head>
<docTitle>
<text>%s</text>
</docTitle>
<navMap>
'''
ncx_footer = \
''' </navMap>
</ncx>
'''
ncx_entry = \
'''<navPoint id="%s" playOrder="%d">
<navLabel>
<text>%s</text>
</navLabel>
<content src="%s"/>'''
# recursive part
def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
if start>len(indx_data) or end>len(indx_data):
print("Warning: missing INDX child entries", start, end, len(indx_data))
# keep the (xml, max_lvl, num) shape the callers unpack
return '', max_lvl, num
if DEBUG_NCX:
print("recursINDX lvl %d from %d to %d" % (lvl, start, end))
xml = ''
if start <= 0:
start = 0
if end <= 0:
end = len(indx_data)
if lvl > max_lvl:
max_lvl = lvl
indent = ' ' * (2 + lvl)
for i in range(start, end):
e = indx_data[i]
htmlfile = e['filename']
desttag = e['idtag']
if e['hlvl'] != lvl:
continue
# open entry
num += 1
if desttag == '':
link = 'Text/%s' % htmlfile
else:
link = 'Text/%s#%s' % (htmlfile, desttag)
tagid = 'np_%d' % num
entry = ncx_entry % (tagid, num, e['text'], link)
entry = re.sub(re.compile('^', re.M), indent, entry, 0)
xml += entry + '\n'
# recurs
if e['child1']>=0:
xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
e['child1'], e['childn'] + 1)
xml += xmlrec
# close entry
xml += indent + '</navPoint>\n'
return xml, max_lvl, num
body, max_lvl, num = recursINDX()
header = ncx_header % (lang, ident, max_lvl + 1, title)
ncx = header + body + ncx_footer
if len(indx_data) != num:
print("Warning: different number of entries in NCX", len(indx_data), num)
return ncx
def writeK8NCX(self, ncx_data, metadata):
# build the xml
self.isNCX = True
print("Write K8 ncx")
# 'en' is an assumed fallback so a missing Language EXTH does not crash on None
xml = self.buildK8NCX(ncx_data, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language', ['en'])[0])
bname = 'toc.ncx'
ncxname = os.path.join(self.files.k8oebps,bname)
with open(pathof(ncxname), 'wb') as f:
f.write(xml.encode('utf-8'))

681
KindleUnpack/mobi_opf.py Normal file

@@ -0,0 +1,681 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import unicode_literals, division, absolute_import, print_function
from .compatibility_utils import unicode_str, unescapeit
from .compatibility_utils import lzip
from .unipath import pathof
from xml.sax.saxutils import escape as xmlescape
import os
import uuid
from datetime import datetime
# In EPUB3, the NCX and <guide> MAY exist in the OPF, although the NCX is superseded
# by the Navigation Document and the <guide> is deprecated. Currently, EPUB3_WITH_NCX
# and EPUB3_WITH_GUIDE are set to True for compatibility with epub2 reading systems.
# They might be changed to False in the future.
EPUB3_WITH_NCX = True # Do not set to False except for debug.
""" Set to True to create a toc.ncx when converting to epub3. """
EPUB3_WITH_GUIDE = True # Do not set to False except for debug.
""" Set to True to create a guide element in an opf when converting to epub3. """
EPUB_OPF = 'content.opf'
""" The name for the OPF of EPUB. """
TOC_NCX = 'toc.ncx'
""" The name for the TOC of EPUB2. """
NAVIGATION_DOCUMENT = 'nav.xhtml'
""" The name for the navigation document of EPUB3. """
BEGIN_INFO_ONLY = '<!-- BEGIN INFORMATION ONLY '
""" The comment to indicate the beginning of metadata which will be ignored by kindlegen. """
END_INFO_ONLY = 'END INFORMATION ONLY -->'
""" The comment to indicate the end of metadata which will be ignored by kindlegen. """
EXTH_TITLE_FURIGANA = 'Title-Pronunciation'
""" The name for Title Furigana (similar to file-as) set by KDP. """
EXTH_CREATOR_FURIGANA = 'Author-Pronunciation'
""" The name for Creator Furigana (similar to file-as) set by KDP. """
EXTH_PUBLISHER_FURIGANA = 'Publisher-Pronunciation'
""" The name for Publisher Furigana (similar to file-as) set by KDP. """
EXTRA_ENTITIES = {'"': '&quot;', "'": "&apos;"}
class OPFProcessor(object):
def __init__(self, files, metadata, fileinfo, rscnames, hasNCX, mh, usedmap, pagemapxml='', guidetext='', k8resc=None, epubver='2'):
self.files = files
self.metadata = metadata
self.fileinfo = fileinfo
self.rscnames = rscnames
self.has_ncx = hasNCX
self.codec = mh.codec
self.isK8 = mh.isK8()
self.printReplica = mh.isPrintReplica()
self.guidetext = unicode_str(guidetext)
self.used = usedmap
self.k8resc = k8resc
self.covername = None
self.cover_id = 'cover_img'
if self.k8resc is not None and self.k8resc.cover_name is not None:
# update cover id info from RESC if available
self.cover_id = self.k8resc.cover_name
# Create a unique urn uuid
self.BookId = unicode_str(str(uuid.uuid4()))
self.pagemap = pagemapxml
self.ncxname = None
self.navname = None
# page-progression-direction is only set in spine
self.page_progression_direction = metadata.pop('page-progression-direction', [None])[0]
if 'rl' in metadata.get('primary-writing-mode', [''])[0]:
self.page_progression_direction = 'rtl'
self.epubver = epubver # the epub version set by user
self.target_epubver = epubver # the epub version set by user or detected automatically
if self.epubver == 'A':
self.target_epubver = self.autodetectEPUBVersion()
elif self.epubver == 'F':
self.target_epubver = '2'
elif self.epubver != '2' and self.epubver != '3':
self.target_epubver = '2'
# ids for refines attributes
self.title_id = {}
self.creator_id = {}
self.publisher_id = {}
# extra attributes
self.title_attrib = {}
self.creator_attrib = {}
self.publisher_attrib = {}
self.extra_attributes = [] # for force epub2 option
# Create epub3 metadata from EXTH.
self.exth_solved_refines_metadata = []
self.exth_refines_metadata = []
self.exth_fixedlayout_metadata = []
self.defineRefinesID()
self.processRefinesMetadata()
if self.k8resc is not None:
# Create metadata in RESC section.
self.k8resc.createMetadata(epubver)
if self.target_epubver == "3":
self.createMetadataForFixedlayout()
def escapeit(self, sval, EXTRAS=None):
# note, xmlescape and unescape do not work with utf-8 bytestrings
sval = unicode_str(sval)
if EXTRAS:
res = xmlescape(unescapeit(sval), EXTRAS)
else:
res = xmlescape(unescapeit(sval))
return res
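# Illustrative behaviour (values assumed, opf being an OPFProcessor instance):
# unescaping first keeps already-escaped input from being double-escaped.
#   opf.escapeit('AT&amp;T')                  -> 'AT&amp;T'
#   opf.escapeit('say "hi"', EXTRA_ENTITIES)  -> 'say &quot;hi&quot;'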
def createMetaTag(self, data, property, content, refid=''):
refines = ''
if refid:
refines = ' refines="#%s"' % refid
data.append('<meta property="%s"%s>%s</meta>\n' % (property, refines, content))
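# e.g. createMetaTag(data, 'file-as', 'Tolstoy, Leo', 'creator01') appends
# (values assumed):
#   <meta property="file-as" refines="#creator01">Tolstoy, Leo</meta>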
def buildOPFMetadata(self, start_tag, has_obfuscated_fonts=False):
# convert from EXTH metadata format to target epub version metadata
# epub 3 will ignore <meta name="xxxx" content="yyyy" /> style metatags
# but allows them to be present for backwards compatibility
# instead the new format is
# <meta property="xxxx" id="iiii" ... > property_value</meta>
# and DCMES elements such as:
# <dc:blah id="iiii">value</dc:blah>
metadata = self.metadata
k8resc = self.k8resc
META_TAGS = ['Drm Server Id', 'Drm Commerce Id', 'Drm Ebookbase Book Id', 'ASIN', 'ThumbOffset', 'Fake Cover',
'Creator Software', 'Creator Major Version', 'Creator Minor Version', 'Creator Build Number',
'Watermark', 'Clipping Limit', 'Publisher Limit', 'Text to Speech Disabled', 'CDE Type',
'Updated Title', 'Font Signature (hex)', 'Tamper Proof Keys (hex)',]
# def handleTag(data, metadata, key, tag, ids={}):
def handleTag(data, metadata, key, tag, attrib={}):
'''Format metadata values.
@param data: List of formatted metadata entries.
@param metadata: The metadata dictionary.
@param key: The key of the metadata value to handle.
@param tag: The opf tag corresponds to the metadata value.
###@param ids: The ids in tags for refines property of epub3.
@param attrib: The extra attribute for refines or opf prefixes.
'''
if key in metadata:
for i, value in enumerate(metadata[key]):
closingTag = tag.split(" ")[0]
res = '<%s%s>%s</%s>\n' % (tag, attrib.get(i, ''), self.escapeit(value), closingTag)
data.append(res)
del metadata[key]
# these are allowed but ignored by epub3
def handleMetaPairs(data, metadata, key, name):
if key in metadata:
for value in metadata[key]:
res = '<meta name="%s" content="%s" />\n' % (name, self.escapeit(value, EXTRA_ENTITIES))
data.append(res)
del metadata[key]
data = []
data.append(start_tag + '\n')
# Handle standard metadata
if 'Title' in metadata:
handleTag(data, metadata, 'Title', 'dc:title', self.title_attrib)
else:
data.append('<dc:title>Untitled</dc:title>\n')
handleTag(data, metadata, 'Language', 'dc:language')
if 'UniqueID' in metadata:
handleTag(data, metadata, 'UniqueID', 'dc:identifier id="uid"')
else:
# No unique ID in original, give it a generic one.
data.append('<dc:identifier id="uid">0</dc:identifier>\n')
if self.target_epubver == '3':
# epub version 3 minimal metadata requires a dcterms:modified date tag
self.createMetaTag(data, 'dcterms:modified', datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"))
if self.isK8 and has_obfuscated_fonts:
# Use the randomly generated urn:uuid so obfuscated fonts work.
# It doesn't need to be _THE_ unique identifier to work as a key
# for obfuscated fonts in Sigil, ADE and calibre. It just has
# to use the opf:scheme="UUID" and have the urn:uuid: prefix.
if self.target_epubver == '3':
data.append('<dc:identifier>urn:uuid:'+self.BookId+'</dc:identifier>\n')
else:
data.append('<dc:identifier opf:scheme="UUID">urn:uuid:'+self.BookId+'</dc:identifier>\n')
handleTag(data, metadata, 'Creator', 'dc:creator', self.creator_attrib)
handleTag(data, metadata, 'Contributor', 'dc:contributor')
handleTag(data, metadata, 'Publisher', 'dc:publisher', self.publisher_attrib)
handleTag(data, metadata, 'Source', 'dc:source')
handleTag(data, metadata, 'Type', 'dc:type')
if self.target_epubver == '3':
if 'ISBN' in metadata:
for i, value in enumerate(metadata['ISBN']):
res = '<dc:identifier>urn:isbn:%s</dc:identifier>\n' % self.escapeit(value)
data.append(res)
else:
handleTag(data, metadata, 'ISBN', 'dc:identifier opf:scheme="ISBN"')
if 'Subject' in metadata:
if 'SubjectCode' in metadata:
codeList = metadata['SubjectCode']
del metadata['SubjectCode']
else:
codeList = None
for i in range(len(metadata['Subject'])):
if codeList and i < len(codeList):
data.append('<dc:subject BASICCode="'+codeList[i]+'">')
else:
data.append('<dc:subject>')
data.append(self.escapeit(metadata['Subject'][i])+'</dc:subject>\n')
del metadata['Subject']
handleTag(data, metadata, 'Description', 'dc:description')
if self.target_epubver == '3':
if 'Published' in metadata:
for i, value in enumerate(metadata['Published']):
res = '<dc:date>%s</dc:date>\n' % self.escapeit(value)
data.append(res)
else:
handleTag(data, metadata, 'Published', 'dc:date opf:event="publication"')
handleTag(data, metadata, 'Rights', 'dc:rights')
if self.epubver == 'F':
if self.extra_attributes or k8resc is not None and k8resc.extra_attributes:
data.append('<!-- THE FOLLOWINGS ARE REQUIRED TO INSERT INTO <dc:xxx> MANUALLY\n')
if self.extra_attributes:
data += self.extra_attributes
if k8resc is not None and k8resc.extra_attributes:
data += k8resc.extra_attributes
data.append('-->\n')
else:
# Append refines metadata.
if self.exth_solved_refines_metadata:
data.append('<!-- Refines MetaData from EXTH -->\n')
data += self.exth_solved_refines_metadata
if self.exth_refines_metadata or k8resc is not None and k8resc.refines_metadata:
data.append('<!-- THE FOLLOWINGS ARE REQUIRED TO EDIT IDS MANUALLY\n')
if self.exth_refines_metadata:
data += self.exth_refines_metadata
if k8resc is not None and k8resc.refines_metadata:
data += k8resc.refines_metadata
data.append('-->\n')
# Append metadata in RESC section.
if k8resc is not None and k8resc.extra_metadata:
data.append('<!-- Extra MetaData from RESC\n')
data += k8resc.extra_metadata
data.append('-->\n')
if 'CoverOffset' in metadata:
imageNumber = int(metadata['CoverOffset'][0])
self.covername = self.rscnames[imageNumber]
if self.covername is None:
print("Error: Cover image %s was not recognized as a valid image" % imageNumber)
else:
# <meta name="cover"> is obsoleted in EPUB3, but kindlegen v2.9 requires it.
data.append('<meta name="cover" content="' + self.cover_id + '" />\n')
self.used[self.covername] = 'used'
del metadata['CoverOffset']
handleMetaPairs(data, metadata, 'Codec', 'output encoding')
# handle kindlegen specific tags
handleTag(data, metadata, 'DictInLanguage', 'DictionaryInLanguage')
handleTag(data, metadata, 'DictOutLanguage', 'DictionaryOutLanguage')
handleMetaPairs(data, metadata, 'RegionMagnification', 'RegionMagnification')
handleMetaPairs(data, metadata, 'book-type', 'book-type')
handleMetaPairs(data, metadata, 'zero-gutter', 'zero-gutter')
handleMetaPairs(data, metadata, 'zero-margin', 'zero-margin')
handleMetaPairs(data, metadata, 'primary-writing-mode', 'primary-writing-mode')
handleMetaPairs(data, metadata, 'fixed-layout', 'fixed-layout')
handleMetaPairs(data, metadata, 'orientation-lock', 'orientation-lock')
handleMetaPairs(data, metadata, 'original-resolution', 'original-resolution')
# these are not allowed in epub2 or 3 so convert them to meta name content pairs
# perhaps these could better be mapped into the dcterms namespace instead
handleMetaPairs(data, metadata, 'Review', 'review')
handleMetaPairs(data, metadata, 'Imprint', 'imprint')
handleMetaPairs(data, metadata, 'Adult', 'adult')
handleMetaPairs(data, metadata, 'DictShortName', 'DictionaryVeryShortName')
# these are needed by kobo books upon submission but not sure if legal metadata in epub2 or epub3
if 'Price' in metadata and 'Currency' in metadata:
priceList = metadata['Price']
currencyList = metadata['Currency']
if len(priceList) != len(currencyList):
print("Error: found %s price entries, but %s currency entries.")
else:
for i in range(len(priceList)):
data.append('<SRP Currency="'+currencyList[i]+'">'+priceList[i]+'</SRP>\n')
del metadata['Price']
del metadata['Currency']
if self.target_epubver == '3':
# Append metadata for EPUB3.
if self.exth_fixedlayout_metadata:
data.append('<!-- EPUB3 MetaData converted from EXTH -->\n')
data += self.exth_fixedlayout_metadata
# all that remains is extra EXTH info we will store inside a comment inside meta name/content pairs
# so it cannot impact anything and will be automatically stripped out if found again in a RESC section
data.append(BEGIN_INFO_ONLY + '\n')
if 'ThumbOffset' in metadata:
imageNumber = int(metadata['ThumbOffset'][0])
imageName = self.rscnames[imageNumber]
if imageName is None:
print("Error: Cover Thumbnail image %s was not recognized as a valid image" % imageNumber)
else:
data.append('<meta name="Cover ThumbNail Image" content="'+ 'Images/'+imageName+'" />\n')
# self.used[imageName] = 'used' # thumbnail image is always generated by Kindlegen, so don't include in manifest
self.used[imageName] = 'not used'
del metadata['ThumbOffset']
for metaName in META_TAGS:
if metaName in metadata:
for value in metadata[metaName]:
data.append('<meta name="'+metaName+'" content="'+self.escapeit(value, EXTRA_ENTITIES)+'" />\n')
del metadata[metaName]
for key in list(metadata.keys()):
for value in metadata[key]:
data.append('<meta name="'+key+'" content="'+self.escapeit(value, EXTRA_ENTITIES)+'" />\n')
del metadata[key]
data.append(END_INFO_ONLY + '\n')
data.append('</metadata>\n')
return data
def buildOPFManifest(self, ncxname, navname=None):
# buildManifest for mobi7, azw4, epub2 and epub3.
k8resc = self.k8resc
cover_id = self.cover_id
hasK8RescSpine = k8resc is not None and k8resc.hasSpine()
self.ncxname = ncxname
self.navname = navname
data = []
data.append('<manifest>\n')
media_map = {
'.jpg' : 'image/jpeg',
'.jpeg' : 'image/jpeg',
'.png' : 'image/png',
'.gif' : 'image/gif',
'.svg' : 'image/svg+xml',
'.xhtml': 'application/xhtml+xml',
'.html' : 'text/html', # for mobi7
'.pdf' : 'application/pdf', # for azw4(print replica textbook)
'.ttf' : 'application/x-font-ttf',
'.otf' : 'application/x-font-opentype', # replaced?
'.css' : 'text/css',
# '.html' : 'text/x-oeb1-document', # for mobi7
# '.otf' : 'application/vnd.ms-opentype', # [OpenType] OpenType fonts
# '.woff' : 'application/font-woff', # [WOFF] WOFF fonts
# '.smil' : 'application/smil+xml', # [MediaOverlays301] EPUB Media Overlay documents
# '.pls' : 'application/pls+xml', # [PLS] Text-to-Speech (TTS) Pronunciation lexicons
# '.mp3' : 'audio/mpeg',
# '.mp4' : 'video/mp4',
# '.js' : 'text/javascript', # not supported in K8
}
spinerefs = []
idcnt = 0
for [key,dir,fname] in self.fileinfo:
name, ext = os.path.splitext(fname)
ext = ext.lower()
media = media_map.get(ext)
ref = "item%d" % idcnt
if hasK8RescSpine:
if key is not None and key in k8resc.spine_idrefs:
ref = k8resc.spine_idrefs[key]
properties = ''
if dir != '':
fpath = dir + '/' + fname
else:
fpath = fname
data.append('<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(ref, media, fpath, properties))
if ext in ['.xhtml', '.html']:
spinerefs.append(ref)
idcnt += 1
for fname in self.rscnames:
if fname is not None:
if self.used.get(fname,'not used') == 'not used':
continue
name, ext = os.path.splitext(fname)
ext = ext.lower()
media = media_map.get(ext,ext[1:])
properties = ''
if fname == self.covername:
ref = cover_id
if self.target_epubver == '3':
properties = 'properties="cover-image"'
else:
ref = "item%d" % idcnt
if ext == '.ttf' or ext == '.otf':
if self.isK8: # fonts are only used in Mobi 8
fpath = 'Fonts/' + fname
data.append('<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(ref, media, fpath, properties))
else:
fpath = 'Images/' + fname
data.append('<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(ref, media, fpath, properties))
idcnt += 1
if self.target_epubver == '3' and navname is not None:
data.append('<item id="nav" media-type="application/xhtml+xml" href="Text/' + navname + '" properties="nav"/>\n')
if self.has_ncx and ncxname is not None:
data.append('<item id="ncx" media-type="application/x-dtbncx+xml" href="' + ncxname +'" />\n')
if self.pagemap != '':
data.append('<item id="map" media-type="application/oebs-page-map+xml" href="page-map.xml" />\n')
data.append('</manifest>\n')
return [data, spinerefs]
def buildOPFSpine(self, spinerefs, isNCX):
# build spine
k8resc = self.k8resc
hasK8RescSpine = k8resc is not None and k8resc.hasSpine()
data = []
ppd = ''
if self.isK8 and self.page_progression_direction is not None:
ppd = ' page-progression-direction="{:s}"'.format(self.page_progression_direction)
ncx = ''
if isNCX:
ncx = ' toc="ncx"'
map=''
if self.pagemap != '':
map = ' page-map="map"'
if self.epubver == 'F':
if ppd:
ppd = '<!--' + ppd + ' -->'
spine_start_tag = '<spine{1:s}{2:s}>{0:s}\n'.format(ppd, map, ncx)
else:
spine_start_tag = '<spine{0:s}{1:s}{2:s}>\n'.format(ppd, map, ncx)
data.append(spine_start_tag)
if hasK8RescSpine:
for key in k8resc.spine_order:
idref = k8resc.spine_idrefs[key]
attribs = k8resc.spine_pageattributes[key]
tag = '<itemref idref="%s"' % idref
for aname, val in list(attribs.items()):
if self.epubver == 'F' and aname == 'properties':
continue
if val is not None:
tag += ' %s="%s"' % (aname, val)
tag += '/>'
if self.epubver == 'F' and 'properties' in attribs:
val = attribs['properties']
if val is not None:
tag += '<!-- properties="%s" -->' % val
tag += '\n'
data.append(tag)
else:
start = 0
# special case the created coverpage if need be
[key, dir, fname] = self.fileinfo[0]
if key is not None and key == "coverpage":
entry = spinerefs[start]
data.append('<itemref idref="%s" linear="no"/>\n' % entry)
start += 1
for entry in spinerefs[start:]:
data.append('<itemref idref="' + entry + '"/>\n')
data.append('</spine>\n')
return data
def buildMobi7OPF(self):
# Build an OPF for mobi7 and azw4.
print("Building an opf for mobi7/azw4.")
data = []
data.append('<?xml version="1.0" encoding="utf-8"?>\n')
data.append('<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="uid">\n')
metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">'
opf_metadata = self.buildOPFMetadata(metadata_tag)
data += opf_metadata
if self.has_ncx:
# ncxname = self.files.getInputFileBasename() + '.ncx'
ncxname = 'toc.ncx'
else:
ncxname = None
[opf_manifest, spinerefs] = self.buildOPFManifest(ncxname)
data += opf_manifest
opf_spine = self.buildOPFSpine(spinerefs, self.has_ncx)
data += opf_spine
data.append('<tours>\n</tours>\n')
if not self.printReplica:
guide ='<guide>\n' + self.guidetext + '</guide>\n'
data.append(guide)
data.append('</package>\n')
return ''.join(data)
def buildEPUBOPF(self, has_obfuscated_fonts=False):
print("Building an opf for mobi8 using epub version: ", self.target_epubver)
if self.target_epubver == '2':
has_ncx = self.has_ncx
has_guide = True
ncxname = TOC_NCX
navname = None
package = '<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="uid">\n'
tours = '<tours>\n</tours>\n'
metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">'
else:
has_ncx = EPUB3_WITH_NCX
has_guide = EPUB3_WITH_GUIDE
ncxname = None
if has_ncx:
ncxname = TOC_NCX
navname = NAVIGATION_DOCUMENT
package = '<package version="3.0" xmlns="http://www.idpf.org/2007/opf" prefix="rendition: http://www.idpf.org/vocab/rendition/#" unique-identifier="uid">\n'
tours = ''
metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">'
data = []
data.append('<?xml version="1.0" encoding="utf-8"?>\n')
data.append(package)
opf_metadata = self.buildOPFMetadata(metadata_tag, has_obfuscated_fonts)
data += opf_metadata
[opf_manifest, spinerefs] = self.buildOPFManifest(ncxname, navname)
data += opf_manifest
opf_spine = self.buildOPFSpine(spinerefs, has_ncx)
data += opf_spine
data.append(tours)
if has_guide:
guide ='<guide>\n' + self.guidetext + '</guide>\n'
data.append(guide)
data.append('</package>\n')
return ''.join(data)
def writeOPF(self, has_obfuscated_fonts=False):
if self.isK8:
data = self.buildEPUBOPF(has_obfuscated_fonts)
outopf = os.path.join(self.files.k8oebps, EPUB_OPF)
with open(pathof(outopf), 'wb') as f:
f.write(data.encode('utf-8'))
return self.BookId
else:
data = self.buildMobi7OPF()
outopf = os.path.join(self.files.mobi7dir, 'content.opf')
with open(pathof(outopf), 'wb') as f:
f.write(data.encode('utf-8'))
return 0
def getBookId(self):
return self.BookId
def getNCXName(self):
return self.ncxname
def getNAVName(self):
return self.navname
def getEPUBVersion(self):
return self.target_epubver
def hasNCX(self):
return self.ncxname is not None and self.has_ncx
def hasNAV(self):
return self.navname is not None
def autodetectEPUBVersion(self):
# Determine EPUB version from metadata and RESC.
metadata = self.metadata
k8resc = self.k8resc
epubver = '2'
if 'true' == metadata.get('fixed-layout', [''])[0].lower():
epubver = '3'
elif metadata.get('orientation-lock', [''])[0].lower() in ['portrait', 'landscape']:
epubver = '3'
elif self.page_progression_direction == 'rtl':
epubver = '3'
elif EXTH_TITLE_FURIGANA in metadata:
epubver = '3'
elif EXTH_CREATOR_FURIGANA in metadata:
epubver = '3'
elif EXTH_PUBLISHER_FURIGANA in metadata:
epubver = '3'
elif k8resc is not None and k8resc.needEPUB3():
epubver = '3'
return epubver
def defineRefinesID(self):
# the following EXTH are set by KDP.
# 'Title_Furigana_(508)'
# 'Creator_Furigana_(517)',
# 'Publisher_Furigana_(522)'
# It is difficult to find correspondence between Title, Creator, Publisher
# and EXTH 508, 517, 522 if they have more than two values, since KDP does not seem to preserve the order of EXTH 508, 517 and 522.
# It is also difficult to find correspondence between them and tags which have refine attributes in RESC.
# So editing manually is required.
metadata = self.metadata
needRefinesId = False
if self.k8resc is not None:
needRefinesId = self.k8resc.hasRefines()
# Create ids for refines attributes
if (needRefinesId or EXTH_TITLE_FURIGANA in metadata) and 'Title' in metadata:
for i in range(len(metadata.get('Title'))):
self.title_id[i] = 'title%02d' % (i+1)
if (needRefinesId or EXTH_CREATOR_FURIGANA in metadata) and 'Creator' in metadata:
for i in range(len(metadata.get('Creator'))):
self.creator_id[i] = 'creator%02d' % (i+1)
if (needRefinesId or EXTH_PUBLISHER_FURIGANA in metadata) and 'Publisher' in metadata:
for i in range(len(metadata.get('Publisher'))):
self.publisher_id[i] = 'publisher%02d' % (i+1)
def processRefinesMetadata(self):
# create refines metadata defined in epub3 or convert refines property to opf: attributes for epub2.
metadata = self.metadata
refines_list = [
[EXTH_TITLE_FURIGANA, self.title_id, self.title_attrib, 'title00'],
[EXTH_CREATOR_FURIGANA, self.creator_id, self.creator_attrib, 'creator00'],
[EXTH_PUBLISHER_FURIGANA, self.publisher_id, self.publisher_attrib, 'publisher00']
]
create_refines_metadata = False
for EXTH in lzip(*refines_list)[0]:
if EXTH in metadata:
create_refines_metadata = True
break
if create_refines_metadata:
for [EXTH, id, attrib, defaultid] in refines_list:
if self.target_epubver == '3':
for i, value in list(id.items()):
attrib[i] = ' id="%s"' % value
if EXTH in metadata:
if len(metadata[EXTH]) == 1 and len(id) == 1:
self.createMetaTag(self.exth_solved_refines_metadata, 'file-as', metadata[EXTH][0], id[0])
else:
for i, value in enumerate(metadata[EXTH]):
self.createMetaTag(self.exth_refines_metadata, 'file-as', value, id.get(i, defaultid))
else:
if EXTH in metadata:
if len(metadata[EXTH]) == 1 and len(id) == 1:
attr = ' opf:file-as="%s"' % metadata[EXTH][0]
attrib[0] = attr
else:
for i, value in enumerate(metadata[EXTH]):
attr = ' id="#%s" opf:file-as="%s"\n' % (id.get(i, defaultid), value)
self.extra_attributes.append(attr)
def createMetadataForFixedlayout(self):
# convert fixed layout to epub3 format if needed.
metadata = self.metadata
if 'fixed-layout' in metadata:
fixedlayout = metadata['fixed-layout'][0]
content = {'true' : 'pre-paginated'}.get(fixedlayout.lower(), 'reflowable')
self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:layout', content)
if 'orientation-lock' in metadata:
content = metadata['orientation-lock'][0].lower()
if content == 'portrait' or content == 'landscape':
self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:orientation', content)
# according to epub3 spec about correspondence with Amazon
# if 'original-resolution' is provided it needs to be converted to
# meta viewport property tag stored in the <head></head> of **each**
# xhtml page - so this tag would need to be handled by editing each part
# before reaching this routine
# we need to add support for this to the k8html routine
# if 'original-resolution' in metadata.keys():
# resolution = metadata['original-resolution'][0].lower()
# width, height = resolution.split('x')
# if width.isdigit() and int(width) > 0 and height.isdigit() and int(height) > 0:
# viewport = 'width=%s, height=%s' % (width, height)
# self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:viewport', viewport)
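# A minimal sketch (an assumption, not project code) of the conversion the
# comment above describes: turning an EXTH 'original-resolution' value such as
# '1024x768' into the viewport <meta> each fixed-layout xhtml page would need
# in its <head>.
def _resolution_to_viewport(resolution):
    # returns None when the value is not a pair of positive integers
    width, height = resolution.lower().split('x')
    if width.isdigit() and int(width) > 0 and height.isdigit() and int(height) > 0:
        return '<meta name="viewport" content="width=%s, height=%s"/>' % (width, height)
    return None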


@@ -0,0 +1,158 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import unicode_literals, division, absolute_import, print_function
from .compatibility_utils import PY2, unicode_str
if PY2:
range = xrange
import struct
# note: struct pack, unpack, unpack_from all require bytestring format
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
import re
# note: re requires the pattern to be the exact same type as the data to be searched in python3
# but u"" is not allowed for the pattern itself, only b""
_TABLE = [('m', 1000), ('cm', 900), ('d', 500), ('cd', 400), ('c', 100), ('xc', 90), ('l', 50), ('xl', 40), ('x', 10), ('ix', 9), ('v', 5), ('iv', 4), ('i', 1)]
def int_to_roman(i):
parts = []
num = i
for letter, value in _TABLE:
while value <= num:
num -= value
parts.append(letter)
return ''.join(parts)
def roman_to_int(s):
result = 0
rnstr = s
for letter, value in _TABLE:
while rnstr.startswith(letter):
result += value
rnstr = rnstr[len(letter):]
return result
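# e.g. int_to_roman(1944) == 'mcmxliv' and roman_to_int('mcmxliv') == 1944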
_pattern = r'''\(([^\)]*)\)'''
_tup_pattern = re.compile(_pattern,re.IGNORECASE)
def _parseNames(numpages, data):
data = unicode_str(data)
pagenames = []
pageMap = ''
for i in range(numpages):
pagenames.append(None)
for m in re.finditer(_tup_pattern, data):
tup = m.group(1)
if pageMap != '':
pageMap += ','
pageMap += '(' + tup + ')'
spos, nametype, svalue = tup.split(",")
# print(spos, nametype, svalue)
if nametype == 'a' or nametype == 'r':
svalue = int(svalue)
spos = int(spos)
for i in range(spos - 1, numpages):
if nametype == 'r':
pname = int_to_roman(svalue)
svalue += 1
elif nametype == 'a':
pname = "%s" % svalue
svalue += 1
elif nametype == 'c':
sp = svalue.find('|')
if sp == -1:
pname = svalue
else:
pname = svalue[0:sp]
svalue = svalue[sp+1:]
else:
# unknown numbering type: stop here rather than crash on an unset pname below
print("Error: unknown page numbering type", nametype)
break
pagenames[i] = pname
return pagenames, pageMap
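# Illustrative call (tuple string assumed): _parseNames(5, '(1,r,1),(3,a,1)')
# returns (['i', 'ii', '1', '2', '3'], '(1,r,1),(3,a,1)') -- pages 1-2 get
# roman names, pages 3-5 arabic; a 'c' tuple supplies literal names split on '|'.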
class PageMapProcessor:
def __init__(self, mh, data):
self.mh = mh
self.data = data
self.pagenames = []
self.pageoffsets = []
self.pageMap = ''
self.pm_len = 0
self.pm_nn = 0
self.pm_bits = 0
self.pmoff = None
self.pmstr = ''
print("Extracting Page Map Information")
rev_len, = struct.unpack_from(b'>L', self.data, 0x10)
# skip over header, revision string length data, and revision string
ptr = 0x14 + rev_len
pm_1, self.pm_len, self.pm_nn, self.pm_bits = struct.unpack_from(b'>4H', self.data, ptr)
# print(pm_1, self.pm_len, self.pm_nn, self.pm_bits)
self.pmstr = self.data[ptr+8:ptr+8+self.pm_len]
self.pmoff = self.data[ptr+8+self.pm_len:]
offsize = b">L"
offwidth = 4
if self.pm_bits == 16:
offsize = b">H"
offwidth = 2
ptr = 0
for i in range(self.pm_nn):
od, = struct.unpack_from(offsize, self.pmoff, ptr)
ptr += offwidth
self.pageoffsets.append(od)
self.pagenames, self.pageMap = _parseNames(self.pm_nn, self.pmstr)
def getPageMap(self):
return self.pageMap
def getNames(self):
return self.pagenames
def getOffsets(self):
return self.pageoffsets
# page-map.xml will be unicode but encoded to utf-8 immediately before being written to a file
def generateKF8PageMapXML(self, k8proc):
pagemapxml = '<page-map xmlns="http://www.idpf.org/2007/opf">\n'
for i in range(len(self.pagenames)):
pos = self.pageoffsets[i]
name = self.pagenames[i]
if name is not None and name != "":
[pn, dir, filename, skelpos, skelend, aidtext] = k8proc.getSkelInfo(pos)
idtext = unicode_str(k8proc.getPageIDTag(pos))
linktgt = unicode_str(filename)
if idtext != '':
linktgt += '#' + idtext
pagemapxml += '<page name="%s" href="%s/%s" />\n' % (name, dir, linktgt)
pagemapxml += "</page-map>\n"
return pagemapxml
def generateAPNX(self, apnx_meta):
if apnx_meta['format'] == 'MOBI_8':
content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","format":"%(format)s","fileRevisionId":"1","acr":"%(acr)s"}' %apnx_meta
else:
content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","fileRevisionId":"1"}' % apnx_meta
content_header = content_header.encode('utf-8')
page_header = '{"asin":"%(asin)s","pageMap":"%(pageMap)s"}' % apnx_meta
page_header = page_header.encode('utf-8')
apnx = struct.pack(b'>H',1) + struct.pack(b'>H',1)
apnx += struct.pack(b'>I', 12 + len(content_header))
apnx += struct.pack(b'>I', len(content_header))
apnx += content_header
apnx += struct.pack(b'>H', 1)
apnx += struct.pack(b'>H', len(page_header))
apnx += struct.pack(b'>H', self.pm_nn)
apnx += struct.pack(b'>H', 32)
apnx += page_header
for page in self.pageoffsets:
apnx += struct.pack(b'>L', page)
return apnx
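# Byte layout of the APNX assembled above (derived from the packing code, all
# values big-endian):
#   H H      : two version words (1, 1)
#   I        : offset to the page-map header = 12 + len(content_header)
#   I        : content header length, followed by the content header JSON
#   H H H H  : 1, page header length, page count (pm_nn), 32
#   page header JSON, then one L page offset per page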


@@ -0,0 +1,120 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import unicode_literals, division, absolute_import, print_function
from .compatibility_utils import PY2, hexlify, bstr, bord, bchar
import datetime
if PY2:
range = xrange
# note: struct pack, unpack, unpack_from all require bytestring format
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
import struct
from .unipath import pathof
DUMP = False
""" Set to True to dump all possible information. """
class unpackException(Exception):
pass
def describe(data):
txtans = ''
hexans = hexlify(data)
for i in data:
if bord(i) < 32 or bord(i) > 127:
txtans += '?'
else:
txtans += bchar(i).decode('latin-1')
return '"' + txtans + '"' + ' 0x'+ hexans
def datetimefrompalmtime(palmtime):
if palmtime > 0x7FFFFFFF:
pythondatetime = datetime.datetime(year=1904,month=1,day=1)+datetime.timedelta(seconds=palmtime)
else:
pythondatetime = datetime.datetime(year=1970,month=1,day=1)+datetime.timedelta(seconds=palmtime)
return pythondatetime
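# e.g. datetimefrompalmtime(86400) -> 1970-01-02 00:00:00 (already a unix time),
# while values past 0x7FFFFFFF are treated as unsigned seconds since 1904-01-01.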
class Sectionizer:
def __init__(self, filename):
self.data = b''
with open(pathof(filename), 'rb') as f:
self.data = f.read()
self.palmheader = self.data[:78]
self.palmname = self.data[:32]
self.ident = self.palmheader[0x3C:0x3C+8]
self.num_sections, = struct.unpack_from(b'>H', self.palmheader, 76)
self.filelength = len(self.data)
sectionsdata = struct.unpack_from(bstr('>%dL' % (self.num_sections*2)), self.data, 78) + (self.filelength, 0)
self.sectionoffsets = sectionsdata[::2]
self.sectionattributes = sectionsdata[1::2]
self.sectiondescriptions = ["" for x in range(self.num_sections+1)]
self.sectiondescriptions[-1] = "File Length Only"
return
def dumpsectionsinfo(self):
print("Section Offset Length UID Attribs Description")
for i in range(self.num_sections):
print("%3d %3X 0x%07X 0x%05X % 8d % 7d %s" % (i,i, self.sectionoffsets[i], self.sectionoffsets[
i+1] - self.sectionoffsets[i], self.sectionattributes[i]&0xFFFFFF, (self.sectionattributes[i]>>24)&0xFF, self.sectiondescriptions[i]))
print("%3d %3X 0x%07X %s" %
(self.num_sections,self.num_sections, self.sectionoffsets[self.num_sections], self.sectiondescriptions[self.num_sections]))
def setsectiondescription(self, section, description):
if section < len(self.sectiondescriptions):
self.sectiondescriptions[section] = description
else:
print("Section out of range: %d, description %s" % (section,description))
def dumppalmheader(self):
print("Palm Database Header")
print("Database name: " + repr(self.palmheader[:32]))
dbattributes, = struct.unpack_from(b'>H', self.palmheader, 32)
print("Bitfield attributes: 0x%0X" % dbattributes,)
if dbattributes != 0:
print(" (",)
if (dbattributes & 2):
print("Read-only; ",)
if (dbattributes & 4):
print("Dirty AppInfoArea; ",)
if (dbattributes & 8):
print("Needs to be backed up; ",)
if (dbattributes & 16):
print("OK to install over newer; ",)
if (dbattributes & 32):
print("Reset after installation; ",)
if (dbattributes & 64):
print("No copying by PalmPilot beaming; ",)
print(")")
else:
print("")
print("File version: %d" % struct.unpack_from(b'>H', self.palmheader, 34)[0])
dbcreation, = struct.unpack_from(b'>L', self.palmheader, 36)
print("Creation Date: " + str(datetimefrompalmtime(dbcreation))+ (" (0x%0X)" % dbcreation))
dbmodification, = struct.unpack_from(b'>L', self.palmheader, 40)
print("Modification Date: " + str(datetimefrompalmtime(dbmodification))+ (" (0x%0X)" % dbmodification))
dbbackup, = struct.unpack_from(b'>L', self.palmheader, 44)
if dbbackup != 0:
print("Backup Date: " + str(datetimefrompalmtime(dbbackup))+ (" (0x%0X)" % dbbackup))
print("Modification No.: %d" % struct.unpack_from(b'>L', self.palmheader, 48)[0])
print("App Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 52)[0])
print("Sort Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 56)[0])
print("Type/Creator: %s/%s" % (repr(self.palmheader[60:64]), repr(self.palmheader[64:68])))
print("Unique seed: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 68)[0])
expectedzero, = struct.unpack_from(b'>L', self.palmheader, 72)
if expectedzero != 0:
print("Should be zero but isn't: %d" % expectedzero)
print("Number of sections: %d" % struct.unpack_from(b'>H', self.palmheader, 76)[0])
return
def loadSection(self, section):
before, after = self.sectionoffsets[section:section+2]
return self.data[before:after]
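# Hedged usage sketch (filename assumed):
#   sect = Sectionizer('book.mobi')
#   rec0 = sect.loadSection(0)       # record 0 holds the palmdoc/mobi headers
#   sect.setsectiondescription(0, "Mobi header")
#   if DUMP:
#       sect.dumpsectionsinfo()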

438
KindleUnpack/mobi_split.py Executable file

@@ -0,0 +1,438 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import unicode_literals, division, absolute_import, print_function
import struct
# note: struct pack, unpack, unpack_from all require bytestring format
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
from .unipath import pathof
# important pdb header offsets
unique_id_seed = 68
number_of_pdb_records = 76
# important palmdoc header offsets
book_length = 4
book_record_count = 8
first_pdb_record = 78
# important rec0 offsets
length_of_book = 4
mobi_header_base = 16
mobi_header_length = 20
mobi_type = 24
mobi_version = 36
first_non_text = 80
title_offset = 84
first_resc_record = 108
first_content_index = 192
last_content_index = 194
kf8_fdst_index = 192 # for KF8 mobi headers
fcis_index = 200
flis_index = 208
srcs_index = 224
srcs_count = 228
primary_index = 244
datp_index = 256
huffoff = 112
hufftbloff = 120
def getint(datain,ofs,sz=b'L'):
i, = struct.unpack_from(b'>'+sz,datain,ofs)
return i
def writeint(datain,ofs,n,len=b'L'):
if len==b'L':
return datain[:ofs]+struct.pack(b'>L',n)+datain[ofs+4:]
else:
return datain[:ofs]+struct.pack(b'>H',n)+datain[ofs+2:]
def getsecaddr(datain,secno):
nsec = getint(datain,number_of_pdb_records,b'H')
# chained comparison; the old 'secno>=0 & secno<nsec' parsed as 'secno >= (0 & secno) < nsec'
assert 0 <= secno < nsec, 'secno %d out of range (nsec=%d)' % (secno,nsec)
secstart = getint(datain,first_pdb_record+secno*8)
if secno == nsec-1:
secend = len(datain)
else:
secend = getint(datain,first_pdb_record+(secno+1)*8)
return secstart,secend
def readsection(datain,secno):
secstart, secend = getsecaddr(datain,secno)
return datain[secstart:secend]
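# The PDB record table these helpers walk (derived from the offsets above):
#   bytes 76-77      : record count (>H, number_of_pdb_records)
#   bytes 78 + 8*i   : record i offset (>L) followed by attributes/uid (>L)
#   record i payload : datain[offset_i:offset_{i+1}]; the last record runs to EOF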
def writesection(datain,secno,secdata): # overwrite, accounting for different length
# dataout = deletesectionrange(datain,secno, secno)
# return insertsection(dataout, secno, secdata)
datalst = []
nsec = getint(datain,number_of_pdb_records,b'H')
zerosecstart,zerosecend = getsecaddr(datain,0)
secstart,secend = getsecaddr(datain,secno)
dif = len(secdata) - (secend - secstart)
datalst.append(datain[:unique_id_seed])
datalst.append(struct.pack(b'>L',2*nsec+1))
datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
datalst.append(struct.pack(b'>H',nsec))
newstart = zerosecstart
for i in range(0,secno):
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
datalst.append(struct.pack(b'>L', secstart) + struct.pack(b'>L', (2*secno)))
for i in range(secno+1,nsec):
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
ofs = ofs + dif
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
lpad = newstart - (first_pdb_record + 8*nsec)
if lpad > 0:
datalst.append(b'\0' * lpad)
datalst.append(datain[zerosecstart:secstart])
datalst.append(secdata)
datalst.append(datain[secend:])
dataout = b''.join(datalst)
return dataout
def nullsection(datain,secno): # make it zero-length without deleting it
datalst = []
nsec = getint(datain,number_of_pdb_records,b'H')
secstart, secend = getsecaddr(datain,secno)
zerosecstart, zerosecend = getsecaddr(datain, 0)
dif = secend-secstart
datalst.append(datain[:first_pdb_record])
for i in range(0,secno+1):
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
for i in range(secno+1, nsec):
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
ofs = ofs - dif
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
lpad = zerosecstart - (first_pdb_record + 8*nsec)
if lpad > 0:
datalst.append(b'\0' * lpad)
datalst.append(datain[zerosecstart: secstart])
datalst.append(datain[secend:])
dataout = b''.join(datalst)
return dataout
def deletesectionrange(datain,firstsec,lastsec): # delete a range of sections
datalst = []
firstsecstart,firstsecend = getsecaddr(datain,firstsec)
lastsecstart,lastsecend = getsecaddr(datain,lastsec)
zerosecstart, zerosecend = getsecaddr(datain, 0)
dif = lastsecend - firstsecstart + 8*(lastsec-firstsec+1)
nsec = getint(datain,number_of_pdb_records,b'H')
datalst.append(datain[:unique_id_seed])
datalst.append(struct.pack(b'>L',2*(nsec-(lastsec-firstsec+1))+1))
datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
datalst.append(struct.pack(b'>H',nsec-(lastsec-firstsec+1)))
newstart = zerosecstart - 8*(lastsec-firstsec+1)
for i in range(0,firstsec):
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
ofs = ofs-8*(lastsec-firstsec+1)
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
for i in range(lastsec+1,nsec):
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
ofs = ofs - dif
flgval = 2*(i-(lastsec-firstsec+1))
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
lpad = newstart - (first_pdb_record + 8*(nsec - (lastsec - firstsec + 1)))
if lpad > 0:
datalst.append(b'\0' * lpad)
datalst.append(datain[zerosecstart:firstsecstart])
datalst.append(datain[lastsecend:])
dataout = b''.join(datalst)
return dataout
def insertsection(datain,secno,secdata): # insert a new section
datalst = []
nsec = getint(datain,number_of_pdb_records,b'H')
# print("inserting secno" , secno, "into" ,nsec, "sections")
secstart,secend = getsecaddr(datain,secno)
zerosecstart,zerosecend = getsecaddr(datain,0)
dif = len(secdata)
datalst.append(datain[:unique_id_seed])
datalst.append(struct.pack(b'>L',2*(nsec+1)+1))
datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
datalst.append(struct.pack(b'>H',nsec+1))
newstart = zerosecstart + 8
for i in range(0,secno):
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
ofs += 8
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
datalst.append(struct.pack(b'>L', secstart + 8) + struct.pack(b'>L', (2*secno)))
for i in range(secno,nsec):
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
ofs = ofs + dif + 8
flgval = 2*(i+1)
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
lpad = newstart - (first_pdb_record + 8*(nsec + 1))
if lpad > 0:
datalst.append(b'\0' * lpad)
datalst.append(datain[zerosecstart:secstart])
datalst.append(secdata)
datalst.append(datain[secstart:])
dataout = b''.join(datalst)
return dataout
def insertsectionrange(sectionsource,firstsec,lastsec,sectiontarget,targetsec): # insert a range of sections
# print("inserting secno" , firstsec, "to", lastsec, "into" ,targetsec, "sections")
# dataout = sectiontarget
# for idx in range(lastsec,firstsec-1,-1):
# dataout = insertsection(dataout,targetsec,readsection(sectionsource,idx))
# return dataout
datalst = []
nsec = getint(sectiontarget,number_of_pdb_records,b'H')
zerosecstart, zerosecend = getsecaddr(sectiontarget,0)
insstart, nul = getsecaddr(sectiontarget,targetsec)
nins = lastsec - firstsec + 1
srcstart, nul = getsecaddr(sectionsource,firstsec)
nul, srcend = getsecaddr(sectionsource,lastsec)
newstart = zerosecstart + 8*nins
datalst.append(sectiontarget[:unique_id_seed])
datalst.append(struct.pack(b'>L',2*(nsec+nins)+1))
datalst.append(sectiontarget[unique_id_seed+4:number_of_pdb_records])
datalst.append(struct.pack(b'>H',nsec+nins))
for i in range(0,targetsec):
ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8)
ofsnew = ofs + 8*nins
flgvalnew = flgval
datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew))
# print(ofsnew, flgvalnew, ofs, flgval)
srcstart0, nul = getsecaddr(sectionsource,firstsec)
for i in range(nins):
isrcstart, nul = getsecaddr(sectionsource,firstsec+i)
ofsnew = insstart + (isrcstart-srcstart0) + 8*nins
flgvalnew = 2*(targetsec+i)
datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew))
# print(ofsnew, flgvalnew)
dif = srcend - srcstart
for i in range(targetsec,nsec):
ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8)
ofsnew = ofs + dif + 8*nins
flgvalnew = 2*(i+nins)
datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L',flgvalnew))
# print(ofsnew, flgvalnew, ofs, flgval)
lpad = newstart - (first_pdb_record + 8*(nsec + nins))
if lpad > 0:
datalst.append(b'\0' * lpad)
datalst.append(sectiontarget[zerosecstart:insstart])
datalst.append(sectionsource[srcstart:srcend])
datalst.append(sectiontarget[insstart:])
dataout = b''.join(datalst)
return dataout
def get_exth_params(rec0):
ebase = mobi_header_base + getint(rec0,mobi_header_length)
elen = getint(rec0,ebase+4)
enum = getint(rec0,ebase+8)
return ebase,elen,enum
def add_exth(rec0,exth_num,exth_bytes):
ebase,elen,enum = get_exth_params(rec0)
newrecsize = 8+len(exth_bytes)
newrec0 = rec0[0:ebase+4]+struct.pack(b'>L',elen+newrecsize)+struct.pack(b'>L',enum+1)+\
struct.pack(b'>L',exth_num)+struct.pack(b'>L',newrecsize)+exth_bytes+rec0[ebase+12:]
newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+newrecsize)
return newrec0
def read_exth(rec0,exth_num):
exth_values = []
ebase,elen,enum = get_exth_params(rec0)
ebase = ebase+12
while enum>0:
exth_id = getint(rec0,ebase)
if exth_id == exth_num:
# We might have multiple exths, so build a list.
exth_values.append(rec0[ebase+8:ebase+getint(rec0,ebase+4)])
enum = enum-1
ebase = ebase+getint(rec0,ebase+4)
return exth_values
def write_exth(rec0,exth_num,exth_bytes):
ebase,elen,enum = get_exth_params(rec0)
ebase_idx = ebase+12
enum_idx = enum
while enum_idx>0:
exth_id = getint(rec0,ebase_idx)
if exth_id == exth_num:
dif = len(exth_bytes)+8-getint(rec0,ebase_idx+4)
newrec0 = rec0
if dif != 0:
newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+dif)
return newrec0[:ebase+4]+struct.pack(b'>L',elen+len(exth_bytes)+8-getint(rec0,ebase_idx+4))+\
struct.pack(b'>L',enum)+rec0[ebase+12:ebase_idx+4]+\
struct.pack(b'>L',len(exth_bytes)+8)+exth_bytes+\
rec0[ebase_idx+getint(rec0,ebase_idx+4):]
enum_idx = enum_idx-1
ebase_idx = ebase_idx+getint(rec0,ebase_idx+4)
return rec0
def del_exth(rec0,exth_num):
ebase,elen,enum = get_exth_params(rec0)
ebase_idx = ebase+12
enum_idx = 0
while enum_idx < enum:
exth_id = getint(rec0,ebase_idx)
exth_size = getint(rec0,ebase_idx+4)
if exth_id == exth_num:
newrec0 = rec0
newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)-exth_size)
newrec0 = newrec0[:ebase_idx]+newrec0[ebase_idx+exth_size:]
newrec0 = newrec0[0:ebase+4]+struct.pack(b'>L',elen-exth_size)+struct.pack(b'>L',enum-1)+newrec0[ebase+12:]
return newrec0
enum_idx += 1
ebase_idx = ebase_idx+exth_size
return rec0
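# EXTH block layout assumed by the helpers above (offsets relative to ebase):
#   +0  'EXTH' magic    +4  total length (>L)    +8  record count (>L)
#   +12 first record; each record is id (>L), size (>L), then size-8 payload bytes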
class mobi_split:
def __init__(self, infile):
datain = b''
with open(pathof(infile), 'rb') as f:
datain = f.read()
datain_rec0 = readsection(datain,0)
ver = getint(datain_rec0,mobi_version)
self.combo = (ver!=8)
if not self.combo:
return
exth121 = read_exth(datain_rec0,121)
if len(exth121) == 0:
self.combo = False
return
else:
# only pay attention to first exth121
# (there should only be one)
datain_kf8, = struct.unpack_from(b'>L',exth121[0],0)
if datain_kf8 == 0xffffffff:
self.combo = False
return
datain_kfrec0 = readsection(datain,datain_kf8)
# create the standalone mobi7
num_sec = getint(datain,number_of_pdb_records,b'H')
# remove BOUNDARY up to but not including ELF record
self.result_file7 = deletesectionrange(datain,datain_kf8-1,num_sec-2)
# check if there are SRCS records and delete them
srcs = getint(datain_rec0,srcs_index)
num_srcs = getint(datain_rec0,srcs_count)
if srcs != 0xffffffff and num_srcs > 0:
self.result_file7 = deletesectionrange(self.result_file7,srcs,srcs+num_srcs-1)
datain_rec0 = writeint(datain_rec0,srcs_index,0xffffffff)
datain_rec0 = writeint(datain_rec0,srcs_count,0)
# reset the EXTH 121 KF8 Boundary meta data to 0xffffffff
datain_rec0 = write_exth(datain_rec0,121, struct.pack(b'>L', 0xffffffff))
# datain_rec0 = del_exth(datain_rec0,121)
# datain_rec0 = del_exth(datain_rec0,534)
# don't remove the EXTH 125 KF8 Count of Resources, seems to be present in mobi6 files as well
# set the EXTH 129 KF8 Masthead / Cover Image string to the null string
datain_rec0 = write_exth(datain_rec0,129, b'')
# don't remove the EXTH 131 KF8 Unidentified Count, seems to be present in mobi6 files as well
# need to reset flags stored in 0x80-0x83
# old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
# Bit Flags
# 0x1000 = Bit 12 indicates if embedded fonts are used or not
# 0x0800 = means this Header points to *shared* images/resource/fonts ??
# 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
# 0x0040 = exth exists
# 0x0010 = Not sure but this is always set so far
fval, = struct.unpack_from(b'>L',datain_rec0, 0x80)
# need to remove flag 0x0800 for KindlePreviewer 2.8 and unset Bit 12 for embedded fonts
fval = fval & 0x07FF
datain_rec0 = datain_rec0[:0x80] + struct.pack(b'>L',fval) + datain_rec0[0x84:]
self.result_file7 = writesection(self.result_file7,0,datain_rec0)
# no need to replace kf8 style fcis with mobi 7 one
# fcis_secnum, = struct.unpack_from(b'>L',datain_rec0, 0xc8)
# if fcis_secnum != 0xffffffff:
# fcis_info = readsection(datain, fcis_secnum)
# text_len, = struct.unpack_from(b'>L', fcis_info, 0x14)
# new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
# new_fcis += struct.pack(b'>L',text_len)
# new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
# self.result_file7 = writesection(self.result_file7, fcis_secnum, new_fcis)
firstimage = getint(datain_rec0,first_resc_record)
lastimage = getint(datain_rec0,last_content_index,b'H')
# print("Old First Image, last Image", firstimage,lastimage)
if lastimage == 0xffff:
# find the lowest of the next sections and copy up to that.
ofs_list = [(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')]
for ofs,sz in ofs_list:
n = getint(datain_rec0,ofs,sz)
# print("n",n)
if n > 0 and n < lastimage:
lastimage = n-1
print("First Image, last Image", firstimage,lastimage)
# Try to null out FONT and RES, but leave the (empty) PDB record so image refs remain valid
for i in range(firstimage,lastimage):
imgsec = readsection(self.result_file7,i)
if imgsec[0:4] in [b'RESC',b'FONT']:
self.result_file7 = nullsection(self.result_file7,i)
# mobi7 finished
# create standalone mobi8
self.result_file8 = deletesectionrange(datain,0,datain_kf8-1)
target = getint(datain_kfrec0,first_resc_record)
self.result_file8 = insertsectionrange(datain,firstimage,lastimage,self.result_file8,target)
datain_kfrec0 = readsection(self.result_file8,0)
# Only keep the correct EXTH 116 StartOffset, KG 2.5 carries over the one from the mobi7 part, which then points at garbage in the mobi8 part, and confuses FW 3.4
kf8starts = read_exth(datain_kfrec0,116)
# If we have multiple StartOffset, keep only the last one
kf8start_count = len(kf8starts)
while kf8start_count > 1:
kf8start_count -= 1
datain_kfrec0 = del_exth(datain_kfrec0,116)
# update the EXTH 125 KF8 Count of Images/Fonts/Resources
datain_kfrec0 = write_exth(datain_kfrec0,125,struct.pack(b'>L',lastimage-firstimage+1))
# need to reset flags stored in 0x80-0x83
# old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
# standalone mobi8 with exth: 0x0050
# Bit Flags
# 0x1000 = Bit 12 indicates if embedded fonts are used or not
# 0x0800 = means this Header points to *shared* images/resource/fonts ??
# 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
# 0x0040 = exth exists
# 0x0010 = Not sure but this is always set so far
fval, = struct.unpack_from(b'>L',datain_kfrec0, 0x80)
fval = fval & 0x1FFF
fval |= 0x0800
datain_kfrec0 = datain_kfrec0[:0x80] + struct.pack(b'>L',fval) + datain_kfrec0[0x84:]
# properly update other index pointers that have been shifted by the insertion of images
ofs_list = [(kf8_fdst_index,b'L'),(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')]
for ofs,sz in ofs_list:
n = getint(datain_kfrec0,ofs,sz)
if n != 0xffffffff:
datain_kfrec0 = writeint(datain_kfrec0,ofs,n+lastimage-firstimage+1,sz)
self.result_file8 = writesection(self.result_file8,0,datain_kfrec0)
# no need to replace kf8 style fcis with mobi 7 one
# fcis_secnum, = struct.unpack_from(b'>L',datain_kfrec0, 0xc8)
# if fcis_secnum != 0xffffffff:
# fcis_info = readsection(self.result_file8, fcis_secnum)
# text_len, = struct.unpack_from(b'>L', fcis_info, 0x14)
# new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
# new_fcis += struct.pack(b'>L',text_len)
# new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
# self.result_file8 = writesection(self.result_file8, fcis_secnum, new_fcis)
# mobi8 finished
def getResult8(self):
return self.result_file8
def getResult7(self):
return self.result_file7


@@ -0,0 +1,131 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import unicode_literals, division, absolute_import, print_function
from .compatibility_utils import PY2, bchr, lmap, bstr
if PY2:
range = xrange
import struct
# note: struct pack, unpack, unpack_from all require bytestring format
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
class unpackException(Exception):
pass
class UncompressedReader:
def unpack(self, data):
return data
class PalmdocReader:
def unpack(self, i):
o, p = b'', 0
while p < len(i):
# for python 3 must use slice since i[p] returns int while slice returns character
c = ord(i[p:p+1])
p += 1
if (c >= 1 and c <= 8):
o += i[p:p+c]
p += c
elif (c < 128):
o += bchr(c)
elif (c >= 192):
o += b' ' + bchr(c ^ 128)
else:
if p < len(i):
c = (c << 8) | ord(i[p:p+1])
p += 1
m = (c >> 3) & 0x07ff
n = (c & 7) + 3
if (m > n):
o += o[-m:n-m]
else:
for _ in range(n):
# because of a completely ass-backwards decision by the python maintainers for python 3
# we must use a slice for bytes as i[p] returns an int while a slice returns a character
if m == 1:
o += o[-m:]
else:
o += o[-m:-m+1]
return o
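# Worked example (input bytes constructed by hand): 0x8018 packs distance m=3,
# length n=3, so 'llo' is copied from 3 bytes back:
#   PalmdocReader().unpack(b'hello\x80\x18') == b'hellollo'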
class HuffcdicReader:
q = struct.Struct(b'>Q').unpack_from
def loadHuff(self, huff):
if huff[0:8] != b'HUFF\x00\x00\x00\x18':
raise unpackException('invalid huff header')
off1, off2 = struct.unpack_from(b'>LL', huff, 8)
def dict1_unpack(v):
codelen, term, maxcode = v&0x1f, v&0x80, v>>8
assert codelen != 0
if codelen <= 8:
assert term
maxcode = ((maxcode + 1) << (32 - codelen)) - 1
return (codelen, term, maxcode)
self.dict1 = lmap(dict1_unpack, struct.unpack_from(b'>256L', huff, off1))
dict2 = struct.unpack_from(b'>64L', huff, off2)
self.mincode, self.maxcode = (), ()
for codelen, mincode in enumerate((0,) + dict2[0::2]):
self.mincode += (mincode << (32 - codelen), )
for codelen, maxcode in enumerate((0,) + dict2[1::2]):
self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1, )
self.dictionary = []
def loadCdic(self, cdic):
if cdic[0:8] != b'CDIC\x00\x00\x00\x10':
raise unpackException('invalid cdic header')
phrases, bits = struct.unpack_from(b'>LL', cdic, 8)
n = min(1<<bits, phrases-len(self.dictionary))
h = struct.Struct(b'>H').unpack_from
def getslice(off):
blen, = h(cdic, 16+off)
slice = cdic[18+off:18+off+(blen&0x7fff)]
return (slice, blen&0x8000)
self.dictionary += lmap(getslice, struct.unpack_from(bstr('>%dH' % n), cdic, 16))
def unpack(self, data):
q = HuffcdicReader.q
bitsleft = len(data) * 8
data += b"\x00\x00\x00\x00\x00\x00\x00\x00"
pos = 0
x, = q(data, pos)
n = 32
s = b''
while True:
if n <= 0:
pos += 4
x, = q(data, pos)
n += 32
code = (x >> n) & ((1 << 32) - 1)
codelen, term, maxcode = self.dict1[code >> 24]
if not term:
while code < self.mincode[codelen]:
codelen += 1
maxcode = self.maxcode[codelen]
n -= codelen
bitsleft -= codelen
if bitsleft < 0:
break
r = (maxcode - code) >> (32 - codelen)
slice, flag = self.dictionary[r]
if not flag:
self.dictionary[r] = None
slice = self.unpack(slice)
self.dictionary[r] = (slice, 1)
s += slice
return s

191
KindleUnpack/mobi_utils.py Normal file

@@ -0,0 +1,191 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# flake8: noqa
from __future__ import unicode_literals, division, absolute_import, print_function
from .compatibility_utils import PY2, text_type, bchr, bord
import binascii
if PY2:
range = xrange
from itertools import cycle
def getLanguage(langID, sublangID):
mobilangdict = {
54 : {0 : 'af'}, # Afrikaans
28 : {0 : 'sq'}, # Albanian
1 : {0 : 'ar' , 5 : 'ar-dz' , 15 : 'ar-bh' , 3 : 'ar-eg' , 2 : 'ar-iq', 11 : 'ar-jo' , 13 : 'ar-kw' , 12 : 'ar-lb' , 4: 'ar-ly',
6 : 'ar-ma' , 8 : 'ar-om' , 16 : 'ar-qa' , 1 : 'ar-sa' , 10 : 'ar-sy' , 7 : 'ar-tn' , 14 : 'ar-ae' , 9 : 'ar-ye'},
# Arabic, Arabic (Algeria), Arabic (Bahrain), Arabic (Egypt), Arabic
# (Iraq), Arabic (Jordan), Arabic (Kuwait), Arabic (Lebanon), Arabic
# (Libya), Arabic (Morocco), Arabic (Oman), Arabic (Qatar), Arabic
# (Saudi Arabia), Arabic (Syria), Arabic (Tunisia), Arabic (United Arab
# Emirates), Arabic (Yemen)
43 : {0 : 'hy'}, # Armenian
77 : {0 : 'as'}, # Assamese
44 : {0 : 'az'}, # "Azeri" (IANA: Azerbaijani)
45 : {0 : 'eu'}, # Basque
35 : {0 : 'be'}, # Belarusian
69 : {0 : 'bn'}, # Bengali
2 : {0 : 'bg'}, # Bulgarian
3 : {0 : 'ca'}, # Catalan
4 : {0 : 'zh' , 3 : 'zh-hk' , 2 : 'zh-cn' , 4 : 'zh-sg' , 1 : 'zh-tw'},
# Chinese, Chinese (Hong Kong), Chinese (PRC), Chinese (Singapore), Chinese (Taiwan)
26 : {0 : 'hr', 3 : 'sr'}, # Croatian, Serbian
5 : {0 : 'cs'}, # Czech
6 : {0 : 'da'}, # Danish
19 : {0: 'nl', 1 : 'nl' , 2 : 'nl-be'}, # Dutch / Flemish, Dutch (Belgium)
9 : {0: 'en', 3 : 'en-au' , 40 : 'en-bz' , 4 : 'en-ca' , 6 : 'en-ie' , 8 : 'en-jm' , 5 : 'en-nz' , 13 : 'en-ph' ,
7 : 'en-za' , 11 : 'en-tt' , 2 : 'en-gb', 1 : 'en-us' , 12 : 'en-zw'},
# English, English (Australia), English (Belize), English (Canada),
# English (Ireland), English (Jamaica), English (New Zealand), English
# (Philippines), English (South Africa), English (Trinidad), English
# (United Kingdom), English (United States), English (Zimbabwe)
37 : {0 : 'et'}, # Estonian
56 : {0 : 'fo'}, # Faroese
41 : {0 : 'fa'}, # Farsi / Persian
11 : {0 : 'fi'}, # Finnish
12 : {0 : 'fr', 1 : 'fr' , 2 : 'fr-be' , 3 : 'fr-ca' , 5 : 'fr-lu' , 6 : 'fr-mc' , 4 : 'fr-ch'},
# French, French (Belgium), French (Canada), French (Luxembourg), French (Monaco), French (Switzerland)
55 : {0 : 'ka'}, # Georgian
7 : {0 : 'de', 1 : 'de' , 3 : 'de-at' , 5 : 'de-li' , 4 : 'de-lu' , 2 : 'de-ch'},
# German, German (Austria), German (Liechtenstein), German (Luxembourg), German (Switzerland)
8 : {0 : 'el'}, # Greek, Modern (1453-)
71 : {0 : 'gu'}, # Gujarati
13 : {0 : 'he'}, # Hebrew (also code 'iw'?)
57 : {0 : 'hi'}, # Hindi
14 : {0 : 'hu'}, # Hungarian
15 : {0 : 'is'}, # Icelandic
33 : {0 : 'id'}, # Indonesian
16 : {0 : 'it', 1 : 'it' , 2 : 'it-ch'}, # Italian, Italian (Switzerland)
17 : {0 : 'ja'}, # Japanese
75 : {0 : 'kn'}, # Kannada
63 : {0 : 'kk'}, # Kazakh
87 : {0 : 'x-kok'}, # Konkani (real language code is 'kok'?)
18 : {0 : 'ko'}, # Korean
38 : {0 : 'lv'}, # Latvian
39 : {0 : 'lt'}, # Lithuanian
47 : {0 : 'mk'}, # Macedonian
62 : {0 : 'ms'}, # Malay
76 : {0 : 'ml'}, # Malayalam
58 : {0 : 'mt'}, # Maltese
78 : {0 : 'mr'}, # Marathi
97 : {0 : 'ne'}, # Nepali
20 : {0 : 'no'}, # Norwegian
72 : {0 : 'or'}, # Oriya
21 : {0 : 'pl'}, # Polish
22 : {0 : 'pt', 2 : 'pt' , 1 : 'pt-br'}, # Portuguese, Portuguese (Brazil)
70 : {0 : 'pa'}, # Punjabi
23 : {0 : 'rm'}, # "Rhaeto-Romanic" (IANA: Romansh)
24 : {0 : 'ro'}, # Romanian
25 : {0 : 'ru'}, # Russian
59 : {0 : 'sz'}, # "Sami (Lappish)" (not an IANA language code)
# IANA code for "Northern Sami" is 'se'
# 'SZ' is the IANA region code for Swaziland
79 : {0 : 'sa'}, # Sanskrit
27 : {0 : 'sk'}, # Slovak
36 : {0 : 'sl'}, # Slovenian
46 : {0 : 'sb'}, # "Sorbian" (not an IANA language code)
# 'SB' is IANA region code for 'Solomon Islands'
# Lower Sorbian = 'dsb'
# Upper Sorbian = 'hsb'
# Sorbian Languages = 'wen'
10 : {0 : 'es' , 4 : 'es' , 44 : 'es-ar' , 64 : 'es-bo' , 52 : 'es-cl' , 36 : 'es-co' , 20 : 'es-cr' , 28 : 'es-do' ,
48 : 'es-ec' , 68 : 'es-sv' , 16 : 'es-gt' , 72 : 'es-hn' , 8 : 'es-mx' , 76 : 'es-ni' , 24 : 'es-pa' ,
60 : 'es-py' , 40 : 'es-pe' , 80 : 'es-pr' , 56 : 'es-uy' , 32 : 'es-ve'},
# Spanish, Spanish (Mobipocket bug?), Spanish (Argentina), Spanish
# (Bolivia), Spanish (Chile), Spanish (Colombia), Spanish (Costa Rica),
# Spanish (Dominican Republic), Spanish (Ecuador), Spanish (El
# Salvador), Spanish (Guatemala), Spanish (Honduras), Spanish (Mexico),
# Spanish (Nicaragua), Spanish (Panama), Spanish (Paraguay), Spanish
# (Peru), Spanish (Puerto Rico), Spanish (Uruguay), Spanish (Venezuela)
48 : {0 : 'sx'}, # "Sutu" (not an IANA language code)
# "Sutu" is another name for "Southern Sotho"?
# IANA code for "Southern Sotho" is 'st'
65 : {0 : 'sw'}, # Swahili
29 : {0 : 'sv' , 1 : 'sv' , 8 : 'sv-fi'}, # Swedish, Swedish (Finland)
73 : {0 : 'ta'}, # Tamil
68 : {0 : 'tt'}, # Tatar
74 : {0 : 'te'}, # Telugu
30 : {0 : 'th'}, # Thai
49 : {0 : 'ts'}, # Tsonga
50 : {0 : 'tn'}, # Tswana
31 : {0 : 'tr'}, # Turkish
34 : {0 : 'uk'}, # Ukrainian
32 : {0 : 'ur'}, # Urdu
67 : {0 : 'uz', 2 : 'uz'}, # Uzbek
42 : {0 : 'vi'}, # Vietnamese
52 : {0 : 'xh'}, # Xhosa
53 : {0 : 'zu'}, # Zulu
}
lang = "en"
if langID in mobilangdict:
subdict = mobilangdict[langID]
lang = subdict[0]
if sublangID in subdict:
lang = subdict[sublangID]
return lang
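# examples: getLanguage(9, 2) == 'en-gb', getLanguage(12, 3) == 'fr-ca',
# and any unknown langID falls back to plain 'en'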
def toHex(byteList):
return binascii.hexlify(byteList)
# returns base32 bytestring
def toBase32(value, npad=4):
digits = b'0123456789ABCDEFGHIJKLMNOPQRSTUV'
num_string=b''
current = value
while current != 0:
next, remainder = divmod(current, 32)
rem_string = digits[remainder:remainder+1]
num_string = rem_string + num_string
current=next
if num_string == b'':
num_string = b'0'
pad = npad - len(num_string)
if pad > 0:
num_string = b'0' * pad + num_string
return num_string
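# examples: toBase32(1) == b'0001' and toBase32(32) == b'0010' (npad pads with b'0')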
# converts base32 string to value
def fromBase32(str_num):
if isinstance(str_num, text_type):
str_num = str_num.encode('latin-1')
scalelst = [1,32,1024,32768,1048576,33554432,1073741824,34359738368]
value = 0
j = 0
n = len(str_num)
scale = 0
for i in range(n):
c = str_num[n-i-1:n-i]
if c in b'0123456789':
v = ord(c) - ord(b'0')
else:
v = ord(c) - ord(b'A') + 10
if j < len(scalelst):
scale = scalelst[j]
else:
scale = scale * 32
j += 1
if v != 0:
value = value + (v * scale)
return value
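# example: fromBase32(b'0010') == 32, so fromBase32(toBase32(n)) round-trips n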
# note: decoding a bytestring with 'latin-1' (or any other full 0-255 encoding)
# in place of ascii gives a one-to-one mapping of byte values 0-255 to code points
def mangle_fonts(encryption_key, data):
if isinstance(encryption_key, text_type):
encryption_key = encryption_key.encode('latin-1')
crypt = data[:1024]
key = cycle(iter(map(bord, encryption_key)))
# encrypt = ''.join([chr(ord(x)^key.next()) for x in crypt])
encrypt = b''.join([bchr(bord(x)^next(key)) for x in crypt])
return encrypt + data[1024:]
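# since XOR is its own inverse and only the first 1024 bytes are touched,
# mangle_fonts(key, mangle_fonts(key, data)) == data, so the same routine
# can both obfuscate and deobfuscate a font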

525
KindleUnpack/mobiml2xhtml.py Executable file
View File

@@ -0,0 +1,525 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# this program works in concert with the output from KindleUnpack
'''
Convert from Mobi ML to XHTML
'''
from __future__ import print_function
import os
import sys
import re
SPECIAL_HANDLING_TAGS = {
'?xml' : ('xmlheader', -1),
'!--' : ('comment', -3),
'!DOCTYPE' : ('doctype', -1),
}
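# the negative offset trims the tag's closing delimiter when its contents are
# stashed away in parsetag below: -1 drops the trailing '>', -3 drops '-->'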
SPECIAL_HANDLING_TYPES = ['xmlheader', 'doctype', 'comment']
SELF_CLOSING_TAGS = ['br' , 'hr', 'input', 'img', 'image', 'meta', 'spacer', 'link', 'frame', 'base', 'col', 'reference']
class MobiMLConverter(object):
PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
def __init__(self, filename):
self.base_css_rules = 'blockquote { margin: 0em 0em 0em 1.25em }\n'
self.base_css_rules += 'p { margin: 0em }\n'
self.base_css_rules += '.bold { font-weight: bold }\n'
self.base_css_rules += '.italic { font-style: italic }\n'
self.base_css_rules += '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n'
self.tag_css_rules = {}
self.tag_css_rule_cnt = 0
self.path = []
self.filename = filename
self.wipml = open(self.filename, 'rb').read()
self.pos = 0
self.opfname = self.filename.rsplit('.',1)[0] + '.opf'
self.opos = 0
self.meta = ''
self.cssname = os.path.join(os.path.dirname(self.filename),'styles.css')
self.current_font_size = 3
self.font_history = []
def cleanup_html(self):
self.wipml = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.wipml)
self.wipml = self.wipml.replace('\r\n', '\n')
self.wipml = self.wipml.replace('> <', '>\n<')
self.wipml = self.wipml.replace('<mbp: ', '<mbp:')
# self.wipml = re.sub(r'<?xml[^>]*>', '', self.wipml)
self.wipml = self.wipml.replace('<br></br>','<br/>')
def replace_page_breaks(self):
self.wipml = self.PAGE_BREAK_PAT.sub(
'<div class="mbp_pagebreak" />',
self.wipml)
# parse leading text of ml and tag
def parseml(self):
p = self.pos
if p >= len(self.wipml):
return None
if self.wipml[p] != '<':
res = self.wipml.find('<',p)
if res == -1 :
res = len(self.wipml)
self.pos = res
return self.wipml[p:res], None
# handle comment as a special case to deal with multi-line comments
if self.wipml[p:p+4] == '<!--':
te = self.wipml.find('-->',p+1)
if te != -1:
te = te+2
else :
te = self.wipml.find('>',p+1)
ntb = self.wipml.find('<',p+1)
if ntb != -1 and ntb < te:
self.pos = ntb
return self.wipml[p:ntb], None
self.pos = te + 1
return None, self.wipml[p:te+1]
# parses the string version of a tag to identify its name,
# its type ('begin', 'end' or 'single'),
# and to build a hashtable of its attributes
# the code is written to handle the possibility of very poor formatting
def parsetag(self, s):
p = 1
# get the tag name
tname = None
ttype = None
tattr = {}
while s[p:p+1] == ' ' :
p += 1
if s[p:p+1] == '/':
ttype = 'end'
p += 1
while s[p:p+1] == ' ' :
p += 1
b = p
while s[p:p+1] not in ('>', '/', ' ', '"', "'", "\r", "\n") :
p += 1
tname=s[b:p].lower()
if tname == '!doctype':
tname = '!DOCTYPE'
# special cases
if tname in SPECIAL_HANDLING_TAGS.keys():
ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
tattr['special'] = s[p:backstep]
if ttype is None:
# parse any attributes
while s.find('=',p) != -1 :
while s[p:p+1] == ' ' :
p += 1
b = p
while s[p:p+1] != '=' :
p += 1
aname = s[b:p].lower()
aname = aname.rstrip(' ')
p += 1
while s[p:p+1] == ' ' :
p += 1
if s[p:p+1] in ('"', "'") :
p = p + 1
b = p
while s[p:p+1] not in ('"', "'") :
p += 1
val = s[b:p]
p += 1
else :
b = p
while s[p:p+1] not in ('>', '/', ' ') :
p += 1
val = s[b:p]
tattr[aname] = val
# label beginning and single tags
if ttype is None:
ttype = 'begin'
if s.find(' /',p) >= 0:
ttype = 'single_ext'
elif s.find('/',p) >= 0:
ttype = 'single'
return ttype, tname, tattr
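# illustrative results (the inputs are examples only):
#   parsetag('<a href="#filepos1234">')  ->  ('begin', 'a', {'href': '#filepos1234'})
#   parsetag('<br/>')                    ->  ('single', 'br', {})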
# main routine to convert from mobi markup language to html
def processml(self):
# are these really needed
html_done = False
head_done = False
body_done = False
skip = False
htmlstr = ''
self.replace_page_breaks()
self.cleanup_html()
# now parse the cleaned up ml into standard xhtml
while True:
r = self.parseml()
if not r:
break
text, tag = r
if text:
if not skip:
htmlstr += text
if tag:
ttype, tname, tattr = self.parsetag(tag)
# If we run into a DTD or xml declarations inside the body ... bail.
if tname in SPECIAL_HANDLING_TAGS.keys() and ttype != 'comment' and body_done:
htmlstr += '\n</body></html>'
break
# make sure self-closing tags actually self-close
if ttype == 'begin' and tname in SELF_CLOSING_TAGS:
ttype = 'single'
# make sure any end tags of self-closing tags are discarded
if ttype == 'end' and tname in SELF_CLOSING_TAGS:
continue
# remove embedded guide and references from old mobis
if tname in ('guide', 'ncx', 'reference') and ttype in ('begin', 'single', 'single_ext'):
tname = 'removeme:{0}'.format(tname)
tattr = None
if tname in ('guide', 'ncx', 'reference', 'font', 'span') and ttype == 'end':
if self.path[-1] == 'removeme:{0}'.format(tname):
tname = 'removeme:{0}'.format(tname)
tattr = None
# Get rid of font tags that only have a color attribute.
if tname == 'font' and ttype in ('begin', 'single', 'single_ext'):
if 'color' in tattr.keys() and len(tattr.keys()) == 1:
tname = 'removeme:{0}'.format(tname)
tattr = None
# Get rid of empty spans in the markup.
if tname == 'span' and ttype in ('begin', 'single', 'single_ext') and not len(tattr):
tname = 'removeme:{0}'.format(tname)
# need to handle fonts outside of the normal methods
# so font tags won't be added to self.path since we keep track
# of font tags separately with self.font_history
if tname == 'font' and ttype == 'begin':
# check for nested font start tags
if len(self.font_history) > 0 :
# inject a font end tag
taginfo = ('end', 'font', None)
htmlstr += self.processtag(taginfo)
self.font_history.append((ttype, tname, tattr))
# handle the current font start tag
taginfo = (ttype, tname, tattr)
htmlstr += self.processtag(taginfo)
continue
# check for nested font tags and unnest them
if tname == 'font' and ttype == 'end':
self.font_history.pop()
# handle this font end tag
taginfo = ('end', 'font', None)
htmlstr += self.processtag(taginfo)
# check if we were nested
if len(self.font_history) > 0:
# inject a copy of the most recent font start tag from history
taginfo = self.font_history[-1]
htmlstr += self.processtag(taginfo)
continue
# keep track of nesting path
if ttype == 'begin':
self.path.append(tname)
elif ttype == 'end':
if tname != self.path[-1]:
print ('improper nesting: ', self.path, tname, ttype)
if tname not in self.path:
# handle case of end tag with no beginning by injecting empty begin tag
taginfo = ('begin', tname, None)
htmlstr += self.processtag(taginfo)
print " - fixed by injecting empty start tag ", tname
self.path.append(tname)
elif len(self.path) > 1 and tname == self.path[-2]:
# handle case of dangling missing end
taginfo = ('end', self.path[-1], None)
htmlstr += self.processtag(taginfo)
print " - fixed by injecting end tag ", self.path[-1]
self.path.pop()
self.path.pop()
# drop removed tags; only guide/ncx/reference subtrees also skip their text content
if tname in ('removeme:guide', 'removeme:ncx', 'removeme:reference'):
if ttype == 'begin':
skip = True
elif ttype == 'end':
skip = False
elif tname.startswith('removeme'):
pass
else:
taginfo = (ttype, tname, tattr)
htmlstr += self.processtag(taginfo)
# handle potential issue of multiple html, head, and body sections
if tname == 'html' and ttype == 'begin' and not html_done:
htmlstr += '\n'
html_done = True
if tname == 'head' and ttype == 'begin' and not head_done:
htmlstr += '\n'
# also add in metadata and style link tags
htmlstr += self.meta
htmlstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
head_done = True
if tname == 'body' and ttype == 'begin' and not body_done:
htmlstr += '\n'
body_done = True
# handle issue of possibly missing html, head, and body tags
# I have not seen this but the original did something like this so ...
if not body_done:
htmlstr = '<body>\n' + htmlstr + '</body>\n'
if not head_done:
headstr = '<head>\n'
headstr += self.meta
headstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
headstr += '</head>\n'
htmlstr = headstr + htmlstr
if not html_done:
htmlstr = '<html>\n' + htmlstr + '</html>\n'
# finally add DOCTYPE info
htmlstr = '<?xml version="1.0"?>\n<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n' + htmlstr
css = self.base_css_rules
for cls, rule in self.tag_css_rules.items():
css += '.%s { %s }\n' % (cls, rule)
return (htmlstr, css, self.cssname)
def ensure_unit(self, raw, unit='px'):
if re.search(r'\d+$', raw) is not None:
raw += unit
return raw
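# e.g. ensure_unit('10') -> '10px' while ensure_unit('1.5em') is returned unchanged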
# flatten possibly modified tag back to string
def taginfo_tostring(self, taginfo):
(ttype, tname, tattr) = taginfo
if ttype is None or tname is None:
return ''
if ttype == 'end':
return '</%s>' % tname
if ttype in SPECIAL_HANDLING_TYPES and tattr is not None and 'special' in tattr.keys():
info = tattr['special']
if ttype == 'comment':
return '<%s %s-->' % (tname, info)
else:
return '<%s %s>' % (tname, info)
res = []
res.append('<%s' % tname)
if tattr is not None:
for key in tattr.keys():
res.append(' %s="%s"' % (key, tattr[key]))
if ttype == 'single':
res.append('/>')
elif ttype == 'single_ext':
res.append(' />')
else :
res.append('>')
return "".join(res)
# routines to convert mobi ml tag attributes into xhtml attributes and styles
def processtag(self, taginfo):
# Converting mobi font sizes to numerics
size_map = {
'xx-small': '1',
'x-small': '2',
'small': '3',
'medium': '4',
'large': '5',
'x-large': '6',
'xx-large': '7',
}
size_to_em_map = {
'1': '.65em',
'2': '.75em',
'3': '1em',
'4': '1.125em',
'5': '1.25em',
'6': '1.5em',
'7': '2em',
}
# current tag to work on
(ttype, tname, tattr) = taginfo
if not tattr:
tattr = {}
styles = []
if tname is None or tname.startswith('removeme'):
return ''
# have not seen an example of this yet so keep it here to be safe
# until this is better understood
if tname in ('country-region', 'place', 'placetype', 'placename',
'state', 'city', 'street', 'address', 'content'):
tname = 'div' if tname == 'content' else 'span'
tattr.clear()
# handle general case of style, height, width, bgcolor in any tag
if 'style' in tattr.keys():
style = tattr.pop('style').strip()
if style:
styles.append(style)
if 'align' in tattr.keys():
align = tattr.pop('align').strip()
if align:
if tname in ('table', 'td', 'tr'):
pass
else:
styles.append('text-align: %s' % align)
if 'height' in tattr.keys():
height = tattr.pop('height').strip()
if height and '<' not in height and '>' not in height and re.search(r'\d+', height):
if tname in ('table', 'td', 'tr'):
pass
elif tname == 'img':
tattr['height'] = height
else:
styles.append('margin-top: %s' % self.ensure_unit(height))
if 'width' in tattr.keys():
width = tattr.pop('width').strip()
if width and re.search(r'\d+', width):
if tname in ('table', 'td', 'tr'):
pass
elif tname == 'img':
tattr['width'] = width
else:
styles.append('text-indent: %s' % self.ensure_unit(width))
if width.startswith('-'):
styles.append('margin-left: %s' % self.ensure_unit(width[1:]))
if 'bgcolor' in tattr.keys():
# no proprietary html allowed
if tname == 'div':
del tattr['bgcolor']
elif tname == 'font':
# Change font tags to span tags
tname = 'span'
if ttype in ('begin', 'single', 'single_ext'):
# move the face attribute to css font-family
if 'face' in tattr.keys():
face = tattr.pop('face').strip()
styles.append('font-family: "%s"' % face)
# Monitor the constantly changing font sizes, change them to ems and move
# them to css. The following will work for 'flat' font tags, but nested font tags
# will cause things to go wonky. Need to revert to the parent font tag's size
# when a closing tag is encountered.
if 'size' in tattr.keys():
sz = tattr.pop('size').strip().lower()
try:
float(sz)
except ValueError:
if sz in size_map.keys():
sz = size_map[sz]
else:
if sz.startswith('-') or sz.startswith('+'):
sz = self.current_font_size + float(sz)
if sz > 7:
sz = 7
elif sz < 1:
sz = 1
sz = str(int(sz))
styles.append('font-size: %s' % size_to_em_map[sz])
self.current_font_size = int(sz)
elif tname == 'img':
for attr in ('width', 'height'):
if attr in tattr:
val = tattr[attr]
if val.lower().endswith('em'):
try:
nval = float(val[:-2])
nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile
tattr[attr] = "%dpx"%int(nval)
except ValueError:
del tattr[attr]
elif val.lower().endswith('%'):
del tattr[attr]
# convert the anchor tags
if 'filepos-id' in tattr:
tattr['id'] = tattr.pop('filepos-id')
if 'name' in tattr and tattr['name'] != tattr['id']:
tattr['name'] = tattr['id']
if 'filepos' in tattr:
filepos = tattr.pop('filepos')
try:
tattr['href'] = "#filepos%d" % int(filepos)
except ValueError:
pass
if styles:
ncls = None
rule = '; '.join(styles)
for sel, srule in self.tag_css_rules.items():
if srule == rule:
ncls = sel
break
if ncls is None:
self.tag_css_rule_cnt += 1
ncls = 'rule_%d' % self.tag_css_rule_cnt
self.tag_css_rules[ncls] = rule
cls = tattr.get('class', '')
cls = cls + (' ' if cls else '') + ncls
tattr['class'] = cls
# convert updated tag back to string representation
if len(tattr) == 0:
tattr = None
taginfo = (ttype, tname, tattr)
return self.taginfo_tostring(taginfo)
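# worked example (on a fresh converter, so the rule counter starts at zero):
# processtag(('begin', 'font', {'size': '+1'})) bumps the current font size
# from 3 to 4, records the css rule 'font-size: 1.125em' as class rule_1
# and returns '<span class="rule_1">'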
''' main only left in for testing outside of plugin '''
def main(argv=sys.argv):
if len(argv) != 2:
return 1
else:
infile = argv[1]
try:
print('Converting Mobi Markup Language to XHTML')
mlc = MobiMLConverter(infile)
print('Processing ...')
htmlstr, css, cssname = mlc.processml()
outname = infile.rsplit('.',1)[0] + '_converted.html'
open(outname, 'wb').write(htmlstr)
open(cssname, 'wb').write(css)
print('Completed')
print('XHTML version of book can be found at: ' + outname)
except ValueError as e:
print("Error: %s" % e)
return 1
return 0
if __name__ == "__main__":
sys.exit(main())

93
KindleUnpack/unipath.py Executable file
View File

@@ -0,0 +1,93 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice, this list
# of conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from __future__ import unicode_literals, division, absolute_import, print_function
from .compatibility_utils import PY2, text_type, binary_type
import sys
import os
# utility routines to convert all paths to be full unicode
# Under Python 2, if given a bytestring, try to convert it to unicode using sys.getfilesystemencoding()
# Under Python 3, if given bytes, try to decode it to unicode with that same filesystem encoding
# Mac OS X and Windows will happily support full unicode paths
# Linux can support full unicode paths but allows arbitrary byte paths which may be inconsistent with unicode
fsencoding = sys.getfilesystemencoding()
def pathof(s, enc=fsencoding):
if s is None:
return None
if isinstance(s, text_type):
return s
if isinstance(s, binary_type):
try:
return s.decode(enc)
except:
pass
return s
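# e.g. on a utf-8 filesystem pathof(b'book.mobi') == 'book.mobi', while unicode
# input (and anything that fails to decode) is passed back unchanged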
def exists(s):
return os.path.exists(pathof(s))
def isfile(s):
return os.path.isfile(pathof(s))
def isdir(s):
return os.path.isdir(pathof(s))
def mkdir(s):
return os.mkdir(pathof(s))
def listdir(s):
rv = []
for file in os.listdir(pathof(s)):
rv.append(pathof(file))
return rv
def getcwd():
if PY2:
return os.getcwdu()
return os.getcwd()
def walk(top):
top = pathof(top)
rv = []
for base, dnames, names in os.walk(top):
base = pathof(base)
for name in names:
name = pathof(name)
rv.append(relpath(os.path.join(base, name), top))
return rv
def relpath(path, start=None):
return os.path.relpath(pathof(path) , pathof(start))
def abspath(path):
return os.path.abspath(pathof(path))

View File

@@ -0,0 +1,167 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import unicode_literals, division, absolute_import, print_function
from .compatibility_utils import text_type
from . import unipath
from .unipath import pathof
DUMP = False
""" Set to True to dump all possible information. """
import os
import re
# note: in python 3, re requires the pattern to be the same type as the data
# being searched, so bytes data needs a b"" pattern rather than a u"" one
import zipfile
import binascii
from .mobi_utils import mangle_fonts
class unpackException(Exception):
pass
class ZipInfo(zipfile.ZipInfo):
def __init__(self, *args, **kwargs):
compress_type = kwargs.pop('compress_type', None)
super(ZipInfo, self).__init__(*args, **kwargs)
if compress_type is not None:
self.compress_type = compress_type
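# the stdlib ZipInfo.__init__ accepts no compress_type keyword, so it is peeled
# off here and applied after construction; used below to store the epub
# mimetype entry uncompressed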
class fileNames:
def __init__(self, infile, outdir):
self.infile = infile
self.outdir = outdir
if not unipath.exists(self.outdir):
unipath.mkdir(self.outdir)
self.mobi7dir = os.path.join(self.outdir,'mobi7')
if not unipath.exists(self.mobi7dir):
unipath.mkdir(self.mobi7dir)
self.imgdir = os.path.join(self.mobi7dir, 'Images')
if not unipath.exists(self.imgdir):
unipath.mkdir(self.imgdir)
self.hdimgdir = os.path.join(self.outdir,'HDImages')
if not unipath.exists(self.hdimgdir):
unipath.mkdir(self.hdimgdir)
self.outbase = os.path.join(self.outdir, os.path.splitext(os.path.split(infile)[1])[0])
def getInputFileBasename(self):
return os.path.splitext(os.path.basename(self.infile))[0]
def makeK8Struct(self):
self.k8dir = os.path.join(self.outdir,'mobi8')
if not unipath.exists(self.k8dir):
unipath.mkdir(self.k8dir)
self.k8metainf = os.path.join(self.k8dir,'META-INF')
if not unipath.exists(self.k8metainf):
unipath.mkdir(self.k8metainf)
self.k8oebps = os.path.join(self.k8dir,'OEBPS')
if not unipath.exists(self.k8oebps):
unipath.mkdir(self.k8oebps)
self.k8images = os.path.join(self.k8oebps,'Images')
if not unipath.exists(self.k8images):
unipath.mkdir(self.k8images)
self.k8fonts = os.path.join(self.k8oebps,'Fonts')
if not unipath.exists(self.k8fonts):
unipath.mkdir(self.k8fonts)
self.k8styles = os.path.join(self.k8oebps,'Styles')
if not unipath.exists(self.k8styles):
unipath.mkdir(self.k8styles)
self.k8text = os.path.join(self.k8oebps,'Text')
if not unipath.exists(self.k8text):
unipath.mkdir(self.k8text)
# recursive zip creation support routine
def zipUpDir(self, myzip, tdir, localname):
currentdir = tdir
if localname != "":
currentdir = os.path.join(currentdir,localname)
for afilename in unipath.listdir(currentdir):
localfilePath = os.path.join(localname, afilename)
realfilePath = os.path.join(currentdir, afilename)
if unipath.isfile(realfilePath):
myzip.write(pathof(realfilePath), pathof(localfilePath), zipfile.ZIP_DEFLATED)
elif unipath.isdir(realfilePath):
self.zipUpDir(myzip, tdir, localfilePath)
def makeEPUB(self, usedmap, obfuscate_data, uid):
bname = os.path.join(self.k8dir, self.getInputFileBasename() + '.epub')
# Create an encryption key for Adobe font obfuscation
# based on the epub's uid
if isinstance(uid,text_type):
uid = uid.encode('ascii')
if obfuscate_data:
key = re.sub(br'[^a-fA-F0-9]', b'', uid)
key = binascii.unhexlify((key + key)[:32])
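# e.g. a uid such as 'urn:uuid:1234abcd-...' (hypothetical) is stripped to its
# hex digits; doubling is presumably so that even a short uid still yields a
# full 32 hex characters, i.e. a 16-byte XOR key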
# copy over all images and fonts that are actually used in the ebook
# and remove all font files from mobi7 since not supported
imgnames = unipath.listdir(self.imgdir)
for name in imgnames:
if usedmap.get(name,'not used') == 'used':
filein = os.path.join(self.imgdir,name)
if name.endswith(".ttf"):
fileout = os.path.join(self.k8fonts,name)
elif name.endswith(".otf"):
fileout = os.path.join(self.k8fonts,name)
elif name.endswith(".failed"):
fileout = os.path.join(self.k8fonts,name)
else:
fileout = os.path.join(self.k8images,name)
data = b''
with open(pathof(filein),'rb') as f:
data = f.read()
if obfuscate_data:
if name in obfuscate_data:
data = mangle_fonts(key, data)
open(pathof(fileout),'wb').write(data)
if name.endswith(".ttf") or name.endswith(".otf"):
os.remove(pathof(filein))
# opf file name hard coded to "content.opf"
container = '<?xml version="1.0" encoding="UTF-8"?>\n'
container += '<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n'
container += ' <rootfiles>\n'
container += '    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>\n'
container += ' </rootfiles>\n</container>\n'
fileout = os.path.join(self.k8metainf,'container.xml')
with open(pathof(fileout),'wb') as f:
f.write(container.encode('utf-8'))
if obfuscate_data:
encryption = '<encryption xmlns="urn:oasis:names:tc:opendocument:xmlns:container" \
xmlns:enc="http://www.w3.org/2001/04/xmlenc#" xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">\n'
for font in obfuscate_data:
encryption += ' <enc:EncryptedData>\n'
encryption += ' <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>\n'
encryption += ' <enc:CipherData>\n'
encryption += ' <enc:CipherReference URI="OEBPS/Fonts/' + font + '"/>\n'
encryption += ' </enc:CipherData>\n'
encryption += ' </enc:EncryptedData>\n'
encryption += '</encryption>\n'
fileout = os.path.join(self.k8metainf,'encryption.xml')
with open(pathof(fileout),'wb') as f:
f.write(encryption.encode('utf-8'))
# ready to build epub
self.outzip = zipfile.ZipFile(pathof(bname), 'w')
# add the mimetype file uncompressed
mimetype = b'application/epub+zip'
fileout = os.path.join(self.k8dir,'mimetype')
with open(pathof(fileout),'wb') as f:
f.write(mimetype)
nzinfo = ZipInfo('mimetype', compress_type=zipfile.ZIP_STORED)
nzinfo.external_attr = 0o600 << 16 # make this a normal file
self.outzip.writestr(nzinfo, mimetype)
self.zipUpDir(self.outzip,self.k8dir,'META-INF')
self.zipUpDir(self.outzip,self.k8dir,'OEBPS')
self.outzip.close()
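# typical driver flow (the names and paths are illustrative, not part of this module):
#   files = fileNames('book.mobi', 'out')
#   files.makeK8Struct()
#   ... unpack the book's parts into the mobi7/mobi8 trees ...
#   files.makeEPUB(usedmap, obfuscate_data, uid)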