Incorporate KindleUnpack from https://github.com/kevinhendricks/KindleUnpack
The GUI elements have been removed
This commit is contained in:
2
KindleUnpack/__init__.py
Normal file
2
KindleUnpack/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
278
KindleUnpack/compatibility_utils.py
Executable file
278
KindleUnpack/compatibility_utils.py
Executable file
@@ -0,0 +1,278 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
# are permitted provided that the following conditions are met:
|
||||||
|
#
|
||||||
|
# 1. Redistributions of source code must retain the above copyright notice, this list of
|
||||||
|
# conditions and the following disclaimer.
|
||||||
|
#
|
||||||
|
# 2. Redistributions in binary form must reproduce the above copyright notice, this list
|
||||||
|
# of conditions and the following disclaimer in the documentation and/or other materials
|
||||||
|
# provided with the distribution.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
|
||||||
|
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
|
||||||
|
# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||||
|
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||||
|
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||||
|
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
|
||||||
|
# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import codecs
|
||||||
|
|
||||||
|
PY2 = sys.version_info[0] == 2
|
||||||
|
PY3 = sys.version_info[0] == 3
|
||||||
|
|
||||||
|
iswindows = sys.platform.startswith('win')
|
||||||
|
|
||||||
|
try:
|
||||||
|
from urllib.parse import unquote
|
||||||
|
except ImportError:
|
||||||
|
from urllib import unquote
|
||||||
|
|
||||||
|
if PY2:
|
||||||
|
from HTMLParser import HTMLParser
|
||||||
|
_h = HTMLParser()
|
||||||
|
elif sys.version_info[1] < 4:
|
||||||
|
import html.parser
|
||||||
|
_h = html.parser.HTMLParser()
|
||||||
|
else:
|
||||||
|
import html as _h
|
||||||
|
|
||||||
|
if PY3:
|
||||||
|
text_type = str
|
||||||
|
binary_type = bytes
|
||||||
|
# if will be printing arbitraty binary data to stdout on python 3
|
||||||
|
# sys.stdin = sys.stdin.detach()
|
||||||
|
# sys.stdout = sys.stdout.detach()
|
||||||
|
# sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
|
||||||
|
else:
|
||||||
|
range = xrange
|
||||||
|
text_type = unicode
|
||||||
|
binary_type = str
|
||||||
|
# if will be printing unicode under python 2 need to protect
|
||||||
|
# against sys.stdout.encoding being None stupidly forcing forcing ascii encoding of unicode
|
||||||
|
# sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
|
||||||
|
# alternatively set environment variable as follows **before** launching python: export PYTHONIOENCODING=UTF-8
|
||||||
|
|
||||||
|
# NOTE: Python 3 is completely broken when accessing single bytes in bytes strings
|
||||||
|
# (and they amazingly claim by design and no bug!)
|
||||||
|
|
||||||
|
# To illustrate: this works for unicode in Python 3 and for all Python 2.X for both bytestrings and unicode
|
||||||
|
# >>> o = '123456789'
|
||||||
|
# >>> o[-3]
|
||||||
|
# '7'
|
||||||
|
# >>> type(o[-3])
|
||||||
|
# <class 'str'>
|
||||||
|
# >>> type(o)
|
||||||
|
# <class 'str'>
|
||||||
|
|
||||||
|
# Unfortunately, this is what Python 3 does for no sane reason and only for bytestrings
|
||||||
|
# >>> o = b'123456789'
|
||||||
|
# >>> o[-3]
|
||||||
|
# 55
|
||||||
|
# >>> type(o[-3])
|
||||||
|
# <class 'int'>
|
||||||
|
# >>> type(o)
|
||||||
|
# <class 'bytes'>
|
||||||
|
|
||||||
|
# This mind boggling behaviour also happens when indexing a bytestring and/or
|
||||||
|
# iteratoring over a bytestring. In other words it will return an int but not
|
||||||
|
# the byte itself!!!!!!!
|
||||||
|
|
||||||
|
# The only way to access a single byte as a byte in bytestring and get the byte in both
|
||||||
|
# Python 2 and Python 3 is to use a slice
|
||||||
|
|
||||||
|
# This problem is so common there are horrible hacks floating around the net to **try**
|
||||||
|
# to work around it, so that code that works on both Python 2 and Python 3 is possible.
|
||||||
|
|
||||||
|
# So in order to write code that works on both Python 2 and Python 3
|
||||||
|
# if you index or access a single byte and want its ord() then use the bord() function.
|
||||||
|
# If instead you want it as a single character byte use the bchar() function
|
||||||
|
# both of which are defined below.
|
||||||
|
|
||||||
|
if PY3:
|
||||||
|
# Also Note: if decode a bytestring using 'latin-1' (or any other full range 0-255 encoding)
|
||||||
|
# in place of ascii you will get a byte value to half-word or integer value
|
||||||
|
# one-to-one mapping (in the 0 - 255 range)
|
||||||
|
|
||||||
|
def bchr(s):
|
||||||
|
return bytes([s])
|
||||||
|
|
||||||
|
def bstr(s):
|
||||||
|
if isinstance(s, str):
|
||||||
|
return bytes(s, 'latin-1')
|
||||||
|
else:
|
||||||
|
return bytes(s)
|
||||||
|
|
||||||
|
def bord(s):
|
||||||
|
return s
|
||||||
|
|
||||||
|
def bchar(s):
|
||||||
|
return bytes([s])
|
||||||
|
|
||||||
|
else:
|
||||||
|
def bchr(s):
|
||||||
|
return chr(s)
|
||||||
|
|
||||||
|
def bstr(s):
|
||||||
|
return str(s)
|
||||||
|
|
||||||
|
def bord(s):
|
||||||
|
return ord(s)
|
||||||
|
|
||||||
|
def bchar(s):
|
||||||
|
return s
|
||||||
|
|
||||||
|
if PY3:
|
||||||
|
# list-producing versions of the major Python iterating functions
|
||||||
|
def lrange(*args, **kwargs):
|
||||||
|
return list(range(*args, **kwargs))
|
||||||
|
|
||||||
|
def lzip(*args, **kwargs):
|
||||||
|
return list(zip(*args, **kwargs))
|
||||||
|
|
||||||
|
def lmap(*args, **kwargs):
|
||||||
|
return list(map(*args, **kwargs))
|
||||||
|
|
||||||
|
def lfilter(*args, **kwargs):
|
||||||
|
return list(filter(*args, **kwargs))
|
||||||
|
else:
|
||||||
|
import __builtin__
|
||||||
|
# Python 2-builtin ranges produce lists
|
||||||
|
lrange = __builtin__.range
|
||||||
|
lzip = __builtin__.zip
|
||||||
|
lmap = __builtin__.map
|
||||||
|
lfilter = __builtin__.filter
|
||||||
|
|
||||||
|
# In Python 3 you can no longer use .encode('hex') on a bytestring
|
||||||
|
# instead use the following on both platforms
|
||||||
|
import binascii
|
||||||
|
def hexlify(bdata):
|
||||||
|
return (binascii.hexlify(bdata)).decode('ascii')
|
||||||
|
|
||||||
|
# If you: import struct
|
||||||
|
# Note: struct pack, unpack, unpack_from all *require* bytestring format
|
||||||
|
# data all the way up to at least Python 2.7.5, Python 3 is okay with either
|
||||||
|
|
||||||
|
# If you: import re
|
||||||
|
# note: Python 3 "re" requires the pattern to be the exact same type as the data to be
|
||||||
|
# searched ... but u"" is not allowed for the pattern itself only b""
|
||||||
|
# Python 2.X allows the pattern to be any type and converts it to match the data
|
||||||
|
# and returns the same type as the data
|
||||||
|
|
||||||
|
# convert string to be utf-8 encoded
|
||||||
|
def utf8_str(p, enc='utf-8'):
|
||||||
|
if p is None:
|
||||||
|
return None
|
||||||
|
if isinstance(p, text_type):
|
||||||
|
return p.encode('utf-8')
|
||||||
|
if enc != 'utf-8':
|
||||||
|
return p.decode(enc).encode('utf-8')
|
||||||
|
return p
|
||||||
|
|
||||||
|
# convert string to be unicode encoded
|
||||||
|
def unicode_str(p, enc='utf-8'):
|
||||||
|
if p is None:
|
||||||
|
return None
|
||||||
|
if isinstance(p, text_type):
|
||||||
|
return p
|
||||||
|
return p.decode(enc)
|
||||||
|
|
||||||
|
ASCII_CHARS = set(chr(x) for x in range(128))
|
||||||
|
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
||||||
|
'abcdefghijklmnopqrstuvwxyz'
|
||||||
|
'0123456789' '#' '_.-/~')
|
||||||
|
IRI_UNSAFE = ASCII_CHARS - URL_SAFE
|
||||||
|
|
||||||
|
# returns a quoted IRI (not a URI)
|
||||||
|
def quoteurl(href):
|
||||||
|
if isinstance(href,binary_type):
|
||||||
|
href = href.decode('utf-8')
|
||||||
|
result = []
|
||||||
|
for char in href:
|
||||||
|
if char in IRI_UNSAFE:
|
||||||
|
char = "%%%02x" % ord(char)
|
||||||
|
result.append(char)
|
||||||
|
return ''.join(result)
|
||||||
|
|
||||||
|
# unquotes url/iri
|
||||||
|
def unquoteurl(href):
|
||||||
|
if isinstance(href,binary_type):
|
||||||
|
href = href.decode('utf-8')
|
||||||
|
href = unquote(href)
|
||||||
|
return href
|
||||||
|
|
||||||
|
# unescape html
|
||||||
|
def unescapeit(sval):
|
||||||
|
return _h.unescape(sval)
|
||||||
|
|
||||||
|
# Python 2.X commandline parsing under Windows has been horribly broken for years!
|
||||||
|
# Use the following code to emulate full unicode commandline parsing on Python 2
|
||||||
|
# ie. To get sys.argv arguments and properly encode them as unicode
|
||||||
|
|
||||||
|
def unicode_argv():
|
||||||
|
global iswindows
|
||||||
|
global PY3
|
||||||
|
if PY3:
|
||||||
|
return sys.argv
|
||||||
|
if iswindows:
|
||||||
|
# Versions 2.x of Python don't support Unicode in sys.argv on
|
||||||
|
# Windows, with the underlying Windows API instead replacing multi-byte
|
||||||
|
# characters with '?'. So use shell32.GetCommandLineArgvW to get sys.argv
|
||||||
|
# as a list of Unicode strings
|
||||||
|
from ctypes import POINTER, byref, cdll, c_int, windll
|
||||||
|
from ctypes.wintypes import LPCWSTR, LPWSTR
|
||||||
|
|
||||||
|
GetCommandLineW = cdll.kernel32.GetCommandLineW
|
||||||
|
GetCommandLineW.argtypes = []
|
||||||
|
GetCommandLineW.restype = LPCWSTR
|
||||||
|
|
||||||
|
CommandLineToArgvW = windll.shell32.CommandLineToArgvW
|
||||||
|
CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)]
|
||||||
|
CommandLineToArgvW.restype = POINTER(LPWSTR)
|
||||||
|
|
||||||
|
cmd = GetCommandLineW()
|
||||||
|
argc = c_int(0)
|
||||||
|
argv = CommandLineToArgvW(cmd, byref(argc))
|
||||||
|
if argc.value > 0:
|
||||||
|
# Remove Python executable and commands if present
|
||||||
|
start = argc.value - len(sys.argv)
|
||||||
|
return [argv[i] for i in
|
||||||
|
range(start, argc.value)]
|
||||||
|
# this should never happen
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
argv = []
|
||||||
|
argvencoding = sys.stdin.encoding
|
||||||
|
if argvencoding is None:
|
||||||
|
argvencoding = sys.getfilesystemencoding()
|
||||||
|
if argvencoding is None:
|
||||||
|
argvencoding = 'utf-8'
|
||||||
|
for arg in sys.argv:
|
||||||
|
if isinstance(arg, text_type):
|
||||||
|
argv.append(arg)
|
||||||
|
else:
|
||||||
|
argv.append(arg.decode(argvencoding))
|
||||||
|
return argv
|
||||||
|
|
||||||
|
|
||||||
|
# Python 2.X is broken in that it does not recognize CP65001 as UTF-8
|
||||||
|
def add_cp65001_codec():
|
||||||
|
if PY2:
|
||||||
|
try:
|
||||||
|
codecs.lookup('cp65001')
|
||||||
|
except LookupError:
|
||||||
|
codecs.register(
|
||||||
|
lambda name: name == 'cp65001' and codecs.lookup('utf-8') or None)
|
||||||
|
return
|
1018
KindleUnpack/kindleunpack.py
Normal file
1018
KindleUnpack/kindleunpack.py
Normal file
File diff suppressed because it is too large
Load Diff
238
KindleUnpack/mobi_cover.py
Normal file
238
KindleUnpack/mobi_cover.py
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import unicode_str
|
||||||
|
|
||||||
|
from .unipath import pathof
|
||||||
|
import os
|
||||||
|
import imghdr
|
||||||
|
|
||||||
|
import struct
|
||||||
|
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||||
|
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||||
|
|
||||||
|
USE_SVG_WRAPPER = True
|
||||||
|
""" Set to True to use svg wrapper for default. """
|
||||||
|
|
||||||
|
FORCE_DEFAULT_TITLE = False
|
||||||
|
""" Set to True to force to use the default title. """
|
||||||
|
|
||||||
|
COVER_PAGE_FINENAME = 'cover_page.xhtml'
|
||||||
|
""" The name for the cover page. """
|
||||||
|
|
||||||
|
DEFAULT_TITLE = 'Cover'
|
||||||
|
""" The default title for the cover page. """
|
||||||
|
|
||||||
|
MAX_WIDTH = 4096
|
||||||
|
""" The max width for the svg cover page. """
|
||||||
|
|
||||||
|
MAX_HEIGHT = 4096
|
||||||
|
""" The max height for the svg cover page. """
|
||||||
|
|
||||||
|
|
||||||
|
def get_image_type(imgname, imgdata=None):
|
||||||
|
imgtype = unicode_str(imghdr.what(pathof(imgname), imgdata))
|
||||||
|
|
||||||
|
# imghdr only checks for JFIF or Exif JPEG files. Apparently, there are some
|
||||||
|
# with only the magic JPEG bytes out there...
|
||||||
|
# ImageMagick handles those, so, do it too.
|
||||||
|
if imgtype is None:
|
||||||
|
if imgdata is None:
|
||||||
|
with open(pathof(imgname), 'rb') as f:
|
||||||
|
imgdata = f.read()
|
||||||
|
if imgdata[0:2] == b'\xFF\xD8':
|
||||||
|
# Get last non-null bytes
|
||||||
|
last = len(imgdata)
|
||||||
|
while (imgdata[last-1:last] == b'\x00'):
|
||||||
|
last-=1
|
||||||
|
# Be extra safe, check the trailing bytes, too.
|
||||||
|
if imgdata[last-2:last] == b'\xFF\xD9':
|
||||||
|
imgtype = "jpeg"
|
||||||
|
return imgtype
|
||||||
|
|
||||||
|
|
||||||
|
def get_image_size(imgname, imgdata=None):
|
||||||
|
'''Determine the image type of imgname (or imgdata) and return its size.
|
||||||
|
|
||||||
|
Originally,
|
||||||
|
Determine the image type of fhandle and return its size.
|
||||||
|
from draco'''
|
||||||
|
if imgdata is None:
|
||||||
|
fhandle = open(pathof(imgname), 'rb')
|
||||||
|
head = fhandle.read(24)
|
||||||
|
else:
|
||||||
|
head = imgdata[0:24]
|
||||||
|
if len(head) != 24:
|
||||||
|
return
|
||||||
|
|
||||||
|
imgtype = get_image_type(imgname, imgdata)
|
||||||
|
if imgtype == 'png':
|
||||||
|
check = struct.unpack(b'>i', head[4:8])[0]
|
||||||
|
if check != 0x0d0a1a0a:
|
||||||
|
return
|
||||||
|
width, height = struct.unpack(b'>ii', head[16:24])
|
||||||
|
elif imgtype == 'gif':
|
||||||
|
width, height = struct.unpack(b'<HH', head[6:10])
|
||||||
|
elif imgtype == 'jpeg' and imgdata is None:
|
||||||
|
try:
|
||||||
|
fhandle.seek(0) # Read 0xff next
|
||||||
|
size = 2
|
||||||
|
ftype = 0
|
||||||
|
while not 0xc0 <= ftype <= 0xcf:
|
||||||
|
fhandle.seek(size, 1)
|
||||||
|
byte = fhandle.read(1)
|
||||||
|
while ord(byte) == 0xff:
|
||||||
|
byte = fhandle.read(1)
|
||||||
|
ftype = ord(byte)
|
||||||
|
size = struct.unpack(b'>H', fhandle.read(2))[0] - 2
|
||||||
|
# We are at a SOFn block
|
||||||
|
fhandle.seek(1, 1) # Skip `precision' byte.
|
||||||
|
height, width = struct.unpack(b'>HH', fhandle.read(4))
|
||||||
|
except Exception: # IGNORE:W0703
|
||||||
|
return
|
||||||
|
elif imgtype == 'jpeg' and imgdata is not None:
|
||||||
|
try:
|
||||||
|
pos = 0
|
||||||
|
size = 2
|
||||||
|
ftype = 0
|
||||||
|
while not 0xc0 <= ftype <= 0xcf:
|
||||||
|
pos += size
|
||||||
|
byte = imgdata[pos:pos+1]
|
||||||
|
pos += 1
|
||||||
|
while ord(byte) == 0xff:
|
||||||
|
byte = imgdata[pos:pos+1]
|
||||||
|
pos += 1
|
||||||
|
ftype = ord(byte)
|
||||||
|
size = struct.unpack(b'>H', imgdata[pos:pos+2])[0] - 2
|
||||||
|
pos += 2
|
||||||
|
# We are at a SOFn block
|
||||||
|
pos += 1 # Skip `precision' byte.
|
||||||
|
height, width = struct.unpack(b'>HH', imgdata[pos:pos+4])
|
||||||
|
pos += 4
|
||||||
|
except Exception: # IGNORE:W0703
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
return
|
||||||
|
return width, height
|
||||||
|
|
||||||
|
# XXX experimental
|
||||||
|
class CoverProcessor(object):
|
||||||
|
|
||||||
|
"""Create a cover page.
|
||||||
|
|
||||||
|
"""
|
||||||
|
def __init__(self, files, metadata, rscnames, imgname=None, imgdata=None):
|
||||||
|
self.files = files
|
||||||
|
self.metadata = metadata
|
||||||
|
self.rscnames = rscnames
|
||||||
|
self.cover_page = COVER_PAGE_FINENAME
|
||||||
|
self.use_svg = USE_SVG_WRAPPER # Use svg wrapper.
|
||||||
|
self.lang = metadata.get('Language', ['en'])[0]
|
||||||
|
# This should ensure that if the methods to find the cover image's
|
||||||
|
# dimensions should fail for any reason, the SVG routine will not be used.
|
||||||
|
[self.width, self.height] = (-1,-1)
|
||||||
|
if FORCE_DEFAULT_TITLE:
|
||||||
|
self.title = DEFAULT_TITLE
|
||||||
|
else:
|
||||||
|
self.title = metadata.get('Title', [DEFAULT_TITLE])[0]
|
||||||
|
|
||||||
|
self.cover_image = None
|
||||||
|
if imgname is not None:
|
||||||
|
self.cover_image = imgname
|
||||||
|
elif 'CoverOffset' in metadata:
|
||||||
|
imageNumber = int(metadata['CoverOffset'][0])
|
||||||
|
cover_image = self.rscnames[imageNumber]
|
||||||
|
if cover_image is not None:
|
||||||
|
self.cover_image = cover_image
|
||||||
|
else:
|
||||||
|
print('Warning: Cannot identify the cover image.')
|
||||||
|
if self.use_svg:
|
||||||
|
try:
|
||||||
|
if imgdata is None:
|
||||||
|
fname = os.path.join(files.imgdir, self.cover_image)
|
||||||
|
[self.width, self.height] = get_image_size(fname)
|
||||||
|
else:
|
||||||
|
[self.width, self.height] = get_image_size(None, imgdata)
|
||||||
|
except:
|
||||||
|
self.use_svg = False
|
||||||
|
width = self.width
|
||||||
|
height = self.height
|
||||||
|
if width < 0 or height < 0 or width > MAX_WIDTH or height > MAX_HEIGHT:
|
||||||
|
self.use_svg = False
|
||||||
|
return
|
||||||
|
|
||||||
|
def getImageName(self):
|
||||||
|
return self.cover_image
|
||||||
|
|
||||||
|
def getXHTMLName(self):
|
||||||
|
return self.cover_page
|
||||||
|
|
||||||
|
def buildXHTML(self):
|
||||||
|
print('Building a cover page.')
|
||||||
|
files = self.files
|
||||||
|
cover_image = self.cover_image
|
||||||
|
title = self.title
|
||||||
|
lang = self.lang
|
||||||
|
|
||||||
|
image_dir = os.path.normpath(os.path.relpath(files.k8images, files.k8text))
|
||||||
|
image_path = os.path.join(image_dir, cover_image).replace('\\', '/')
|
||||||
|
|
||||||
|
if not self.use_svg:
|
||||||
|
data = ''
|
||||||
|
data += '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>'
|
||||||
|
data += '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"'
|
||||||
|
data += ' xml:lang="{:s}">\n'.format(lang)
|
||||||
|
data += '<head>\n<title>{:s}</title>\n'.format(title)
|
||||||
|
data += '<style type="text/css">\n'
|
||||||
|
data += 'body {\n margin: 0;\n padding: 0;\n text-align: center;\n}\n'
|
||||||
|
data += 'div {\n height: 100%;\n width: 100%;\n text-align: center;\n page-break-inside: avoid;\n}\n'
|
||||||
|
data += 'img {\n display: inline-block;\n height: 100%;\n margin: 0 auto;\n}\n'
|
||||||
|
data += '</style>\n</head>\n'
|
||||||
|
data += '<body><div>\n'
|
||||||
|
data += ' <img src="{:s}" alt=""/>\n'.format(image_path)
|
||||||
|
data += '</div></body>\n</html>'
|
||||||
|
else:
|
||||||
|
width = self.width
|
||||||
|
height = self.height
|
||||||
|
viewBox = "0 0 {0:d} {1:d}".format(width, height)
|
||||||
|
|
||||||
|
data = ''
|
||||||
|
data += '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>'
|
||||||
|
data += '<html xmlns="http://www.w3.org/1999/xhtml"'
|
||||||
|
data += ' xml:lang="{:s}">\n'.format(lang)
|
||||||
|
data += '<head>\n <title>{:s}</title>\n'.format(title)
|
||||||
|
data += '<style type="text/css">\n'
|
||||||
|
data += 'svg {padding: 0pt; margin:0pt}\n'
|
||||||
|
data += 'body { text-align: center; padding:0pt; margin: 0pt; }\n'
|
||||||
|
data += '</style>\n</head>\n'
|
||||||
|
data += '<body>\n <div>\n'
|
||||||
|
data += ' <svg xmlns="http://www.w3.org/2000/svg" height="100%" preserveAspectRatio="xMidYMid meet"'
|
||||||
|
data += ' version="1.1" viewBox="{0:s}" width="100%" xmlns:xlink="http://www.w3.org/1999/xlink">\n'.format(viewBox)
|
||||||
|
data += ' <image height="{0}" width="{1}" xlink:href="{2}"/>\n'.format(height, width, image_path)
|
||||||
|
data += ' </svg>\n'
|
||||||
|
data += ' </div>\n</body>\n</html>'
|
||||||
|
return data
|
||||||
|
|
||||||
|
def writeXHTML(self):
|
||||||
|
files = self.files
|
||||||
|
cover_page = self.cover_page
|
||||||
|
|
||||||
|
data = self.buildXHTML()
|
||||||
|
|
||||||
|
outfile = os.path.join(files.k8text, cover_page)
|
||||||
|
if os.path.exists(pathof(outfile)):
|
||||||
|
print('Warning: {:s} already exists.'.format(cover_page))
|
||||||
|
os.remove(pathof(outfile))
|
||||||
|
with open(pathof(outfile), 'wb') as f:
|
||||||
|
f.write(data.encode('utf-8'))
|
||||||
|
return
|
||||||
|
|
||||||
|
def guide_toxml(self):
|
||||||
|
files = self.files
|
||||||
|
text_dir = os.path.relpath(files.k8text, files.k8oebps)
|
||||||
|
data = '<reference type="cover" title="Cover" href="{:s}/{:s}" />\n'.format(
|
||||||
|
text_dir, self.cover_page)
|
||||||
|
return data
|
377
KindleUnpack/mobi_dict.py
Normal file
377
KindleUnpack/mobi_dict.py
Normal file
@@ -0,0 +1,377 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import PY2, PY3, utf8_str, bstr, bchr
|
||||||
|
|
||||||
|
if PY2:
|
||||||
|
range = xrange
|
||||||
|
array_format = b'B'
|
||||||
|
if PY3:
|
||||||
|
unichr = chr
|
||||||
|
array_format = "B"
|
||||||
|
|
||||||
|
import array
|
||||||
|
|
||||||
|
import struct
|
||||||
|
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||||
|
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||||
|
|
||||||
|
from .mobi_index import getVariableWidthValue, readTagSection, getTagMap
|
||||||
|
from .mobi_utils import toHex
|
||||||
|
|
||||||
|
DEBUG_DICT = False
|
||||||
|
|
||||||
|
class InflectionData(object):
|
||||||
|
|
||||||
|
def __init__(self, infldatas):
|
||||||
|
self.infldatas = infldatas
|
||||||
|
self.starts = []
|
||||||
|
self.counts = []
|
||||||
|
for idata in self.infldatas:
|
||||||
|
start, = struct.unpack_from(b'>L', idata, 0x14)
|
||||||
|
count, = struct.unpack_from(b'>L', idata, 0x18)
|
||||||
|
self.starts.append(start)
|
||||||
|
self.counts.append(count)
|
||||||
|
|
||||||
|
def lookup(self, lookupvalue):
|
||||||
|
i = 0
|
||||||
|
rvalue = lookupvalue
|
||||||
|
while rvalue >= self.counts[i]:
|
||||||
|
rvalue = rvalue - self.counts[i]
|
||||||
|
i += 1
|
||||||
|
if i == len(self.counts):
|
||||||
|
print("Error: Problem with multiple inflections data sections")
|
||||||
|
return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0]
|
||||||
|
return rvalue, self.starts[i], self.counts[i], self.infldatas[i]
|
||||||
|
|
||||||
|
def offsets(self, value):
|
||||||
|
rvalue, start, count, data = self.lookup(value)
|
||||||
|
offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue))
|
||||||
|
if rvalue + 1 < count:
|
||||||
|
nextOffset, = struct.unpack_from(b'>H',data, start + 4 + (2 * (rvalue + 1)))
|
||||||
|
else:
|
||||||
|
nextOffset = None
|
||||||
|
return offset, nextOffset, data
|
||||||
|
|
||||||
|
|
||||||
|
class dictSupport(object):
|
||||||
|
|
||||||
|
def __init__(self, mh, sect):
|
||||||
|
self.mh = mh
|
||||||
|
self.header = mh.header
|
||||||
|
self.sect = sect
|
||||||
|
self.metaOrthIndex = mh.metaOrthIndex
|
||||||
|
self.metaInflIndex = mh.metaInflIndex
|
||||||
|
|
||||||
|
def parseHeader(self, data):
|
||||||
|
"read INDX header"
|
||||||
|
if not data[:4] == b'INDX':
|
||||||
|
print("Warning: index section is not INDX")
|
||||||
|
return False
|
||||||
|
words = (
|
||||||
|
'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
|
||||||
|
'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc'
|
||||||
|
)
|
||||||
|
num = len(words)
|
||||||
|
values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)])
|
||||||
|
header = {}
|
||||||
|
for n in range(num):
|
||||||
|
header[words[n]] = values[n]
|
||||||
|
|
||||||
|
ordt1 = None
|
||||||
|
ordt2 = None
|
||||||
|
|
||||||
|
otype, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4)
|
||||||
|
header['otype'] = otype
|
||||||
|
header['oentries'] = oentries
|
||||||
|
|
||||||
|
if DEBUG_DICT:
|
||||||
|
print("otype %d, oentries %d, op1 %d, op2 %d, otagx %d" % (otype, oentries, op1, op2, otagx))
|
||||||
|
|
||||||
|
if header['code'] == 0xfdea or oentries > 0:
|
||||||
|
# some dictionaries seem to be codepage 65002 (0xFDEA) which seems
|
||||||
|
# to be some sort of strange EBCDIC utf-8 or 16 encoded strings
|
||||||
|
# So we need to look for them and store them away to process leading text
|
||||||
|
# ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
|
||||||
|
# we only ever seem to use the second but ...
|
||||||
|
#
|
||||||
|
# if otype = 0, ORDT table uses 16 bit values as offsets into the table
|
||||||
|
# if otype = 1, ORDT table uses 8 bit values as offsets inot the table
|
||||||
|
|
||||||
|
assert(data[op1:op1+4] == b'ORDT')
|
||||||
|
assert(data[op2:op2+4] == b'ORDT')
|
||||||
|
ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4)
|
||||||
|
ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4)
|
||||||
|
|
||||||
|
if DEBUG_DICT:
|
||||||
|
print("parsed INDX header:")
|
||||||
|
for key in header:
|
||||||
|
print(key, "%x" % header[key],)
|
||||||
|
print("\n")
|
||||||
|
return header, ordt1, ordt2
|
||||||
|
|
||||||
|
def getPositionMap(self):
|
||||||
|
sect = self.sect
|
||||||
|
|
||||||
|
positionMap = {}
|
||||||
|
|
||||||
|
metaOrthIndex = self.metaOrthIndex
|
||||||
|
metaInflIndex = self.metaInflIndex
|
||||||
|
|
||||||
|
decodeInflection = True
|
||||||
|
if metaOrthIndex != 0xFFFFFFFF:
|
||||||
|
print("Info: Document contains orthographic index, handle as dictionary")
|
||||||
|
if metaInflIndex == 0xFFFFFFFF:
|
||||||
|
decodeInflection = False
|
||||||
|
else:
|
||||||
|
metaInflIndexData = sect.loadSection(metaInflIndex)
|
||||||
|
|
||||||
|
print("\nParsing metaInflIndexData")
|
||||||
|
midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData)
|
||||||
|
|
||||||
|
metaIndexCount = midxhdr['count']
|
||||||
|
idatas = []
|
||||||
|
for j in range(metaIndexCount):
|
||||||
|
idatas.append(sect.loadSection(metaInflIndex + 1 + j))
|
||||||
|
dinfl = InflectionData(idatas)
|
||||||
|
|
||||||
|
inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)
|
||||||
|
tagSectionStart = midxhdr['len']
|
||||||
|
inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData)
|
||||||
|
if DEBUG_DICT:
|
||||||
|
print("inflectionTagTable: %s" % inflectionTagTable)
|
||||||
|
if self.hasTag(inflectionTagTable, 0x07):
|
||||||
|
print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported")
|
||||||
|
decodeInflection = False
|
||||||
|
|
||||||
|
data = sect.loadSection(metaOrthIndex)
|
||||||
|
|
||||||
|
print("\nParsing metaOrthIndex")
|
||||||
|
idxhdr, hordt1, hordt2 = self.parseHeader(data)
|
||||||
|
|
||||||
|
tagSectionStart = idxhdr['len']
|
||||||
|
controlByteCount, tagTable = readTagSection(tagSectionStart, data)
|
||||||
|
orthIndexCount = idxhdr['count']
|
||||||
|
print("orthIndexCount is", orthIndexCount)
|
||||||
|
if DEBUG_DICT:
|
||||||
|
print("orthTagTable: %s" % tagTable)
|
||||||
|
if hordt2 is not None:
|
||||||
|
print("orth entry uses ordt2 lookup table of type ", idxhdr['otype'])
|
||||||
|
hasEntryLength = self.hasTag(tagTable, 0x02)
|
||||||
|
if not hasEntryLength:
|
||||||
|
print("Info: Index doesn't contain entry length tags")
|
||||||
|
|
||||||
|
print("Read dictionary index data")
|
||||||
|
for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
|
||||||
|
data = sect.loadSection(i)
|
||||||
|
hdrinfo, ordt1, ordt2 = self.parseHeader(data)
|
||||||
|
idxtPos = hdrinfo['start']
|
||||||
|
entryCount = hdrinfo['count']
|
||||||
|
idxPositions = []
|
||||||
|
for j in range(entryCount):
|
||||||
|
pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j))
|
||||||
|
idxPositions.append(pos)
|
||||||
|
# The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
|
||||||
|
idxPositions.append(idxtPos)
|
||||||
|
for j in range(entryCount):
|
||||||
|
startPos = idxPositions[j]
|
||||||
|
endPos = idxPositions[j+1]
|
||||||
|
textLength = ord(data[startPos:startPos+1])
|
||||||
|
text = data[startPos+1:startPos+1+textLength]
|
||||||
|
if hordt2 is not None:
|
||||||
|
utext = u""
|
||||||
|
if idxhdr['otype'] == 0:
|
||||||
|
pattern = b'>H'
|
||||||
|
inc = 2
|
||||||
|
else:
|
||||||
|
pattern = b'>B'
|
||||||
|
inc = 1
|
||||||
|
pos = 0
|
||||||
|
while pos < textLength:
|
||||||
|
off, = struct.unpack_from(pattern, text, pos)
|
||||||
|
if off < len(hordt2):
|
||||||
|
utext += unichr(hordt2[off])
|
||||||
|
else:
|
||||||
|
utext += unichr(off)
|
||||||
|
pos += inc
|
||||||
|
text = utext.encode('utf-8')
|
||||||
|
|
||||||
|
tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos)
|
||||||
|
if 0x01 in tagMap:
|
||||||
|
if decodeInflection and 0x2a in tagMap:
|
||||||
|
inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable,
|
||||||
|
dinfl, inflNameData, tagMap[0x2a])
|
||||||
|
else:
|
||||||
|
inflectionGroups = b''
|
||||||
|
assert len(tagMap[0x01]) == 1
|
||||||
|
entryStartPosition = tagMap[0x01][0]
|
||||||
|
if hasEntryLength:
|
||||||
|
# The idx:entry attribute "scriptable" must be present to create entry length tags.
|
||||||
|
ml = b'<idx:entry scriptable="yes"><idx:orth value="' + text + b'">' + inflectionGroups + b'</idx:orth>'
|
||||||
|
if entryStartPosition in positionMap:
|
||||||
|
positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml
|
||||||
|
else:
|
||||||
|
positionMap[entryStartPosition] = ml
|
||||||
|
assert len(tagMap[0x02]) == 1
|
||||||
|
entryEndPosition = entryStartPosition + tagMap[0x02][0]
|
||||||
|
if entryEndPosition in positionMap:
|
||||||
|
positionMap[entryEndPosition] = b"</idx:entry>" + positionMap[entryEndPosition]
|
||||||
|
else:
|
||||||
|
positionMap[entryEndPosition] = b"</idx:entry>"
|
||||||
|
|
||||||
|
else:
|
||||||
|
indexTags = b'<idx:entry>\n<idx:orth value="' + text + b'">\n' + inflectionGroups + b'</idx:entry>\n'
|
||||||
|
if entryStartPosition in positionMap:
|
||||||
|
positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags
|
||||||
|
else:
|
||||||
|
positionMap[entryStartPosition] = indexTags
|
||||||
|
return positionMap
|
||||||
|
|
||||||
|
def hasTag(self, tagTable, tag):
|
||||||
|
'''
|
||||||
|
Test if tag table contains given tag.
|
||||||
|
|
||||||
|
@param tagTable: The tag table.
|
||||||
|
@param tag: The tag to search.
|
||||||
|
@return: True if tag table contains given tag; False otherwise.
|
||||||
|
'''
|
||||||
|
for currentTag, _, _, _ in tagTable:
|
||||||
|
if currentTag == tag:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList):
|
||||||
|
'''
|
||||||
|
Create string which contains the inflection groups with inflection rules as mobipocket tags.
|
||||||
|
|
||||||
|
@param mainEntry: The word to inflect.
|
||||||
|
@param controlByteCount: The number of control bytes.
|
||||||
|
@param tagTable: The tag table.
|
||||||
|
@param data: The Inflection data object to properly select the right inflection data section to use
|
||||||
|
@param inflectionNames: The inflection rule name data.
|
||||||
|
@param groupList: The list of inflection groups to process.
|
||||||
|
@return: String with inflection groups and rules or empty string if required tags are not available.
|
||||||
|
'''
|
||||||
|
result = b""
|
||||||
|
for value in groupList:
|
||||||
|
offset, nextOffset, data = dinfl.offsets(value)
|
||||||
|
|
||||||
|
# First byte seems to be always 0x00 and must be skipped.
|
||||||
|
assert ord(data[offset:offset+1]) == 0x00
|
||||||
|
tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset)
|
||||||
|
|
||||||
|
# Make sure that the required tags are available.
|
||||||
|
if 0x05 not in tagMap:
|
||||||
|
print("Error: Required tag 0x05 not found in tagMap")
|
||||||
|
return ""
|
||||||
|
if 0x1a not in tagMap:
|
||||||
|
print("Error: Required tag 0x1a not found in tagMap")
|
||||||
|
return b''
|
||||||
|
|
||||||
|
result += b'<idx:infl>'
|
||||||
|
|
||||||
|
for i in range(len(tagMap[0x05])):
|
||||||
|
|
||||||
|
# Get name of inflection rule.
|
||||||
|
value = tagMap[0x05][i]
|
||||||
|
consumed, textLength = getVariableWidthValue(inflectionNames, value)
|
||||||
|
inflectionName = inflectionNames[value+consumed:value+consumed+textLength]
|
||||||
|
|
||||||
|
# Get and apply inflection rule across possibly multiple inflection data sections
|
||||||
|
value = tagMap[0x1a][i]
|
||||||
|
rvalue, start, count, data = dinfl.lookup(value)
|
||||||
|
offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue))
|
||||||
|
textLength = ord(data[offset:offset+1])
|
||||||
|
inflection = self.applyInflectionRule(mainEntry, data, offset+1, offset+1+textLength)
|
||||||
|
if inflection is not None:
|
||||||
|
result += b' <idx:iform name="' + inflectionName + b'" value="' + inflection + b'"/>'
|
||||||
|
|
||||||
|
result += b'</idx:infl>'
|
||||||
|
return result
|
||||||
|
|
||||||
|
def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end):
|
||||||
|
'''
|
||||||
|
Apply inflection rule.
|
||||||
|
|
||||||
|
@param mainEntry: The word to inflect.
|
||||||
|
@param inflectionRuleData: The inflection rules.
|
||||||
|
@param start: The start position of the inflection rule to use.
|
||||||
|
@param end: The end position of the inflection rule to use.
|
||||||
|
@return: The string with the inflected word or None if an error occurs.
|
||||||
|
'''
|
||||||
|
mode = -1
|
||||||
|
byteArray = array.array(array_format, mainEntry)
|
||||||
|
position = len(byteArray)
|
||||||
|
for charOffset in range(start, end):
|
||||||
|
char = inflectionRuleData[charOffset:charOffset+1]
|
||||||
|
abyte = ord(char)
|
||||||
|
if abyte >= 0x0a and abyte <= 0x13:
|
||||||
|
# Move cursor backwards
|
||||||
|
offset = abyte - 0x0a
|
||||||
|
if mode not in [0x02, 0x03]:
|
||||||
|
mode = 0x02
|
||||||
|
position = len(byteArray)
|
||||||
|
position -= offset
|
||||||
|
elif abyte > 0x13:
|
||||||
|
if mode == -1:
|
||||||
|
print("Error: Unexpected first byte %i of inflection rule" % abyte)
|
||||||
|
return None
|
||||||
|
elif position == -1:
|
||||||
|
print("Error: Unexpected first byte %i of inflection rule" % abyte)
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
if mode == 0x01:
|
||||||
|
# Insert at word start
|
||||||
|
byteArray.insert(position, abyte)
|
||||||
|
position += 1
|
||||||
|
elif mode == 0x02:
|
||||||
|
# Insert at word end
|
||||||
|
byteArray.insert(position, abyte)
|
||||||
|
elif mode == 0x03:
|
||||||
|
# Delete at word end
|
||||||
|
position -= 1
|
||||||
|
deleted = byteArray.pop(position)
|
||||||
|
if bchr(deleted) != char:
|
||||||
|
if DEBUG_DICT:
|
||||||
|
print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
|
||||||
|
print("Error: Delete operation of inflection rule failed")
|
||||||
|
return None
|
||||||
|
elif mode == 0x04:
|
||||||
|
# Delete at word start
|
||||||
|
deleted = byteArray.pop(position)
|
||||||
|
if bchr(deleted) != char:
|
||||||
|
if DEBUG_DICT:
|
||||||
|
print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
|
||||||
|
print("Error: Delete operation of inflection rule failed")
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
print("Error: Inflection rule mode %x is not implemented" % mode)
|
||||||
|
return None
|
||||||
|
elif abyte == 0x01:
|
||||||
|
# Insert at word start
|
||||||
|
if mode not in [0x01, 0x04]:
|
||||||
|
position = 0
|
||||||
|
mode = abyte
|
||||||
|
elif abyte == 0x02:
|
||||||
|
# Insert at word end
|
||||||
|
if mode not in [0x02, 0x03]:
|
||||||
|
position = len(byteArray)
|
||||||
|
mode = abyte
|
||||||
|
elif abyte == 0x03:
|
||||||
|
# Delete at word end
|
||||||
|
if mode not in [0x02, 0x03]:
|
||||||
|
position = len(byteArray)
|
||||||
|
mode = abyte
|
||||||
|
elif abyte == 0x04:
|
||||||
|
# Delete at word start
|
||||||
|
if mode not in [0x01, 0x04]:
|
||||||
|
position = 0
|
||||||
|
# Delete at word start
|
||||||
|
mode = abyte
|
||||||
|
else:
|
||||||
|
print("Error: Inflection rule mode %x is not implemented" % abyte)
|
||||||
|
return None
|
||||||
|
return utf8_str(byteArray.tostring())
|
934
KindleUnpack/mobi_header.py
Normal file
934
KindleUnpack/mobi_header.py
Normal file
@@ -0,0 +1,934 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
DEBUG_USE_ORDERED_DICTIONARY = False # OrderedDict is supoorted >= python 2.7.
|
||||||
|
""" set to True to use OrderedDict for MobiHeader.metadata."""
|
||||||
|
|
||||||
|
if DEBUG_USE_ORDERED_DICTIONARY:
|
||||||
|
from collections import OrderedDict as dict_
|
||||||
|
else:
|
||||||
|
dict_ = dict
|
||||||
|
|
||||||
|
from .compatibility_utils import PY2, unicode_str, hexlify, bord
|
||||||
|
|
||||||
|
if PY2:
|
||||||
|
range = xrange
|
||||||
|
|
||||||
|
import struct
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
# import the mobiunpack support libraries
|
||||||
|
from .mobi_utils import getLanguage
|
||||||
|
from .mobi_uncompress import HuffcdicReader, PalmdocReader, UncompressedReader
|
||||||
|
|
||||||
|
class unpackException(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def sortedHeaderKeys(mheader):
|
||||||
|
hdrkeys = sorted(list(mheader.keys()), key=lambda akey: mheader[akey][0])
|
||||||
|
return hdrkeys
|
||||||
|
|
||||||
|
|
||||||
|
# HD Containers have their own headers and their own EXTH
|
||||||
|
# this is just guesswork so far, making big assumption that
|
||||||
|
# metavalue key numbers remain the same in the CONT EXTH
|
||||||
|
|
||||||
|
# Note: The layout of the CONT Header is still unknown
|
||||||
|
# so just deal with their EXTH sections for now
|
||||||
|
|
||||||
|
def dump_contexth(cpage, extheader):
|
||||||
|
# determine text encoding
|
||||||
|
codec = 'windows-1252'
|
||||||
|
codec_map = {
|
||||||
|
1252 : 'windows-1252',
|
||||||
|
65001: 'utf-8',
|
||||||
|
}
|
||||||
|
if cpage in codec_map:
|
||||||
|
codec = codec_map[cpage]
|
||||||
|
if extheader == b'':
|
||||||
|
return
|
||||||
|
id_map_strings = {
|
||||||
|
1 : 'Drm Server Id',
|
||||||
|
2 : 'Drm Commerce Id',
|
||||||
|
3 : 'Drm Ebookbase Book Id',
|
||||||
|
4 : 'Drm Ebookbase Dep Id',
|
||||||
|
100 : 'Creator',
|
||||||
|
101 : 'Publisher',
|
||||||
|
102 : 'Imprint',
|
||||||
|
103 : 'Description',
|
||||||
|
104 : 'ISBN',
|
||||||
|
105 : 'Subject',
|
||||||
|
106 : 'Published',
|
||||||
|
107 : 'Review',
|
||||||
|
108 : 'Contributor',
|
||||||
|
109 : 'Rights',
|
||||||
|
110 : 'SubjectCode',
|
||||||
|
111 : 'Type',
|
||||||
|
112 : 'Source',
|
||||||
|
113 : 'ASIN',
|
||||||
|
114 : 'versionNumber',
|
||||||
|
117 : 'Adult',
|
||||||
|
118 : 'Retail-Price',
|
||||||
|
119 : 'Retail-Currency',
|
||||||
|
120 : 'TSC',
|
||||||
|
122 : 'fixed-layout',
|
||||||
|
123 : 'book-type',
|
||||||
|
124 : 'orientation-lock',
|
||||||
|
126 : 'original-resolution',
|
||||||
|
127 : 'zero-gutter',
|
||||||
|
128 : 'zero-margin',
|
||||||
|
129 : 'MetadataResourceURI',
|
||||||
|
132 : 'RegionMagnification',
|
||||||
|
150 : 'LendingEnabled',
|
||||||
|
200 : 'DictShortName',
|
||||||
|
501 : 'cdeType',
|
||||||
|
502 : 'last_update_time',
|
||||||
|
503 : 'Updated_Title',
|
||||||
|
504 : 'CDEContentKey',
|
||||||
|
505 : 'AmazonContentReference',
|
||||||
|
506 : 'Title-Language',
|
||||||
|
507 : 'Title-Display-Direction',
|
||||||
|
508 : 'Title-Pronunciation',
|
||||||
|
509 : 'Title-Collation',
|
||||||
|
510 : 'Secondary-Title',
|
||||||
|
511 : 'Secondary-Title-Language',
|
||||||
|
512 : 'Secondary-Title-Direction',
|
||||||
|
513 : 'Secondary-Title-Pronunciation',
|
||||||
|
514 : 'Secondary-Title-Collation',
|
||||||
|
515 : 'Author-Language',
|
||||||
|
516 : 'Author-Display-Direction',
|
||||||
|
517 : 'Author-Pronunciation',
|
||||||
|
518 : 'Author-Collation',
|
||||||
|
519 : 'Author-Type',
|
||||||
|
520 : 'Publisher-Language',
|
||||||
|
521 : 'Publisher-Display-Direction',
|
||||||
|
522 : 'Publisher-Pronunciation',
|
||||||
|
523 : 'Publisher-Collation',
|
||||||
|
524 : 'Content-Language-Tag',
|
||||||
|
525 : 'primary-writing-mode',
|
||||||
|
526 : 'NCX-Ingested-By-Software',
|
||||||
|
527 : 'page-progression-direction',
|
||||||
|
528 : 'override-kindle-fonts',
|
||||||
|
529 : 'Compression-Upgraded',
|
||||||
|
530 : 'Soft-Hyphens-In-Content',
|
||||||
|
531 : 'Dictionary_In_Langague',
|
||||||
|
532 : 'Dictionary_Out_Language',
|
||||||
|
533 : 'Font_Converted',
|
||||||
|
534 : 'Amazon_Creator_Info',
|
||||||
|
535 : 'Creator-Build-Tag',
|
||||||
|
536 : 'HD-Media-Containers-Info', # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?)
|
||||||
|
538 : 'Resource-Container-Fidelity',
|
||||||
|
539 : 'HD-Container-Mimetype',
|
||||||
|
540 : 'Sample-For_Special-Purpose',
|
||||||
|
541 : 'Kindletool-Operation-Information',
|
||||||
|
542 : 'Container_Id',
|
||||||
|
543 : 'Asset-Type', # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER
|
||||||
|
544 : 'Unknown_544',
|
||||||
|
}
|
||||||
|
id_map_values = {
|
||||||
|
115 : 'sample',
|
||||||
|
116 : 'StartOffset',
|
||||||
|
121 : 'Mobi8-Boundary-Section',
|
||||||
|
125 : 'Embedded-Record-Count',
|
||||||
|
130 : 'Offline-Sample',
|
||||||
|
131 : 'Metadata-Record-Offset',
|
||||||
|
201 : 'CoverOffset',
|
||||||
|
202 : 'ThumbOffset',
|
||||||
|
203 : 'HasFakeCover',
|
||||||
|
204 : 'Creator-Software',
|
||||||
|
205 : 'Creator-Major-Version',
|
||||||
|
206 : 'Creator-Minor-Version',
|
||||||
|
207 : 'Creator-Build-Number',
|
||||||
|
401 : 'Clipping-Limit',
|
||||||
|
402 : 'Publisher-Limit',
|
||||||
|
404 : 'Text-to-Speech-Disabled',
|
||||||
|
406 : 'Rental-Expiration-Time',
|
||||||
|
}
|
||||||
|
id_map_hexstrings = {
|
||||||
|
208 : 'Watermark_(hex)',
|
||||||
|
209 : 'Tamper-Proof-Keys_(hex)',
|
||||||
|
300 : 'Font-Signature_(hex)',
|
||||||
|
403 : 'Unknown_(403)_(hex)',
|
||||||
|
405 : 'Ownership-Type_(hex)',
|
||||||
|
407 : 'Unknown_(407)_(hex)',
|
||||||
|
420 : 'Multimedia-Content-Reference_(hex)',
|
||||||
|
450 : 'Locations_Match_(hex)',
|
||||||
|
451 : 'Full-Story-Length_(hex)',
|
||||||
|
452 : 'Sample-Start_Location_(hex)',
|
||||||
|
453 : 'Sample-End-Location_(hex)',
|
||||||
|
}
|
||||||
|
_length, num_items = struct.unpack(b'>LL', extheader[4:12])
|
||||||
|
extheader = extheader[12:]
|
||||||
|
pos = 0
|
||||||
|
for _ in range(num_items):
|
||||||
|
id, size = struct.unpack(b'>LL', extheader[pos:pos+8])
|
||||||
|
content = extheader[pos + 8: pos + size]
|
||||||
|
if id in id_map_strings:
|
||||||
|
name = id_map_strings[id]
|
||||||
|
print('\n Key: "%s"\n Value: "%s"' % (name, content.decode(codec, errors='replace')))
|
||||||
|
elif id in id_map_values:
|
||||||
|
name = id_map_values[id]
|
||||||
|
if size == 9:
|
||||||
|
value, = struct.unpack(b'B',content)
|
||||||
|
print('\n Key: "%s"\n Value: 0x%01x' % (name, value))
|
||||||
|
elif size == 10:
|
||||||
|
value, = struct.unpack(b'>H',content)
|
||||||
|
print('\n Key: "%s"\n Value: 0x%02x' % (name, value))
|
||||||
|
elif size == 12:
|
||||||
|
value, = struct.unpack(b'>L',content)
|
||||||
|
print('\n Key: "%s"\n Value: 0x%04x' % (name, value))
|
||||||
|
else:
|
||||||
|
print("\nError: Value for %s has unexpected size of %s" % (name, size))
|
||||||
|
elif id in id_map_hexstrings:
|
||||||
|
name = id_map_hexstrings[id]
|
||||||
|
print('\n Key: "%s"\n Value: 0x%s' % (name, hexlify(content)))
|
||||||
|
else:
|
||||||
|
print("\nWarning: Unknown metadata with id %s found" % id)
|
||||||
|
name = str(id) + ' (hex)'
|
||||||
|
print(' Key: "%s"\n Value: 0x%s' % (name, hexlify(content)))
|
||||||
|
pos += size
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
class MobiHeader:
|
||||||
|
# all values are packed in big endian format
|
||||||
|
palmdoc_header = {
|
||||||
|
'compression_type' : (0x00, b'>H', 2),
|
||||||
|
'fill0' : (0x02, b'>H', 2),
|
||||||
|
'text_length' : (0x04, b'>L', 4),
|
||||||
|
'text_records' : (0x08, b'>H', 2),
|
||||||
|
'max_section_size' : (0x0a, b'>H', 2),
|
||||||
|
'read_pos ' : (0x0c, b'>L', 4),
|
||||||
|
}
|
||||||
|
|
||||||
|
mobi6_header = {
|
||||||
|
'compression_type' : (0x00, b'>H', 2),
|
||||||
|
'fill0' : (0x02, b'>H', 2),
|
||||||
|
'text_length' : (0x04, b'>L', 4),
|
||||||
|
'text_records' : (0x08, b'>H', 2),
|
||||||
|
'max_section_size' : (0x0a, b'>H', 2),
|
||||||
|
'crypto_type' : (0x0c, b'>H', 2),
|
||||||
|
'fill1' : (0x0e, b'>H', 2),
|
||||||
|
'magic' : (0x10, b'4s', 4),
|
||||||
|
'header_length (from MOBI)' : (0x14, b'>L', 4),
|
||||||
|
'type' : (0x18, b'>L', 4),
|
||||||
|
'codepage' : (0x1c, b'>L', 4),
|
||||||
|
'unique_id' : (0x20, b'>L', 4),
|
||||||
|
'version' : (0x24, b'>L', 4),
|
||||||
|
'metaorthindex' : (0x28, b'>L', 4),
|
||||||
|
'metainflindex' : (0x2c, b'>L', 4),
|
||||||
|
'index_names' : (0x30, b'>L', 4),
|
||||||
|
'index_keys' : (0x34, b'>L', 4),
|
||||||
|
'extra_index0' : (0x38, b'>L', 4),
|
||||||
|
'extra_index1' : (0x3c, b'>L', 4),
|
||||||
|
'extra_index2' : (0x40, b'>L', 4),
|
||||||
|
'extra_index3' : (0x44, b'>L', 4),
|
||||||
|
'extra_index4' : (0x48, b'>L', 4),
|
||||||
|
'extra_index5' : (0x4c, b'>L', 4),
|
||||||
|
'first_nontext' : (0x50, b'>L', 4),
|
||||||
|
'title_offset' : (0x54, b'>L', 4),
|
||||||
|
'title_length' : (0x58, b'>L', 4),
|
||||||
|
'language_code' : (0x5c, b'>L', 4),
|
||||||
|
'dict_in_lang' : (0x60, b'>L', 4),
|
||||||
|
'dict_out_lang' : (0x64, b'>L', 4),
|
||||||
|
'min_version' : (0x68, b'>L', 4),
|
||||||
|
'first_resc_offset' : (0x6c, b'>L', 4),
|
||||||
|
'huff_offset' : (0x70, b'>L', 4),
|
||||||
|
'huff_num' : (0x74, b'>L', 4),
|
||||||
|
'huff_tbl_offset' : (0x78, b'>L', 4),
|
||||||
|
'huff_tbl_len' : (0x7c, b'>L', 4),
|
||||||
|
'exth_flags' : (0x80, b'>L', 4),
|
||||||
|
'fill3_a' : (0x84, b'>L', 4),
|
||||||
|
'fill3_b' : (0x88, b'>L', 4),
|
||||||
|
'fill3_c' : (0x8c, b'>L', 4),
|
||||||
|
'fill3_d' : (0x90, b'>L', 4),
|
||||||
|
'fill3_e' : (0x94, b'>L', 4),
|
||||||
|
'fill3_f' : (0x98, b'>L', 4),
|
||||||
|
'fill3_g' : (0x9c, b'>L', 4),
|
||||||
|
'fill3_h' : (0xa0, b'>L', 4),
|
||||||
|
'unknown0' : (0xa4, b'>L', 4),
|
||||||
|
'drm_offset' : (0xa8, b'>L', 4),
|
||||||
|
'drm_count' : (0xac, b'>L', 4),
|
||||||
|
'drm_size' : (0xb0, b'>L', 4),
|
||||||
|
'drm_flags' : (0xb4, b'>L', 4),
|
||||||
|
'fill4_a' : (0xb8, b'>L', 4),
|
||||||
|
'fill4_b' : (0xbc, b'>L', 4),
|
||||||
|
'first_content' : (0xc0, b'>H', 2),
|
||||||
|
'last_content' : (0xc2, b'>H', 2),
|
||||||
|
'unknown0' : (0xc4, b'>L', 4),
|
||||||
|
'fcis_offset' : (0xc8, b'>L', 4),
|
||||||
|
'fcis_count' : (0xcc, b'>L', 4),
|
||||||
|
'flis_offset' : (0xd0, b'>L', 4),
|
||||||
|
'flis_count' : (0xd4, b'>L', 4),
|
||||||
|
'unknown1' : (0xd8, b'>L', 4),
|
||||||
|
'unknown2' : (0xdc, b'>L', 4),
|
||||||
|
'srcs_offset' : (0xe0, b'>L', 4),
|
||||||
|
'srcs_count' : (0xe4, b'>L', 4),
|
||||||
|
'unknown3' : (0xe8, b'>L', 4),
|
||||||
|
'unknown4' : (0xec, b'>L', 4),
|
||||||
|
'fill5' : (0xf0, b'>H', 2),
|
||||||
|
'traildata_flags' : (0xf2, b'>H', 2),
|
||||||
|
'ncx_index' : (0xf4, b'>L', 4),
|
||||||
|
'unknown5' : (0xf8, b'>L', 4),
|
||||||
|
'unknown6' : (0xfc, b'>L', 4),
|
||||||
|
'datp_offset' : (0x100, b'>L', 4),
|
||||||
|
'unknown7' : (0x104, b'>L', 4),
|
||||||
|
'Unknown ' : (0x108, b'>L', 4),
|
||||||
|
'Unknown ' : (0x10C, b'>L', 4),
|
||||||
|
'Unknown ' : (0x110, b'>L', 4),
|
||||||
|
'Unknown ' : (0x114, b'>L', 4),
|
||||||
|
'Unknown ' : (0x118, b'>L', 4),
|
||||||
|
'Unknown ' : (0x11C, b'>L', 4),
|
||||||
|
'Unknown ' : (0x120, b'>L', 4),
|
||||||
|
'Unknown ' : (0x124, b'>L', 4),
|
||||||
|
'Unknown ' : (0x128, b'>L', 4),
|
||||||
|
'Unknown ' : (0x12C, b'>L', 4),
|
||||||
|
'Unknown ' : (0x130, b'>L', 4),
|
||||||
|
'Unknown ' : (0x134, b'>L', 4),
|
||||||
|
'Unknown ' : (0x138, b'>L', 4),
|
||||||
|
'Unknown ' : (0x11C, b'>L', 4),
|
||||||
|
}
|
||||||
|
|
||||||
|
mobi8_header = {
|
||||||
|
'compression_type' : (0x00, b'>H', 2),
|
||||||
|
'fill0' : (0x02, b'>H', 2),
|
||||||
|
'text_length' : (0x04, b'>L', 4),
|
||||||
|
'text_records' : (0x08, b'>H', 2),
|
||||||
|
'max_section_size' : (0x0a, b'>H', 2),
|
||||||
|
'crypto_type' : (0x0c, b'>H', 2),
|
||||||
|
'fill1' : (0x0e, b'>H', 2),
|
||||||
|
'magic' : (0x10, b'4s', 4),
|
||||||
|
'header_length (from MOBI)' : (0x14, b'>L', 4),
|
||||||
|
'type' : (0x18, b'>L', 4),
|
||||||
|
'codepage' : (0x1c, b'>L', 4),
|
||||||
|
'unique_id' : (0x20, b'>L', 4),
|
||||||
|
'version' : (0x24, b'>L', 4),
|
||||||
|
'metaorthindex' : (0x28, b'>L', 4),
|
||||||
|
'metainflindex' : (0x2c, b'>L', 4),
|
||||||
|
'index_names' : (0x30, b'>L', 4),
|
||||||
|
'index_keys' : (0x34, b'>L', 4),
|
||||||
|
'extra_index0' : (0x38, b'>L', 4),
|
||||||
|
'extra_index1' : (0x3c, b'>L', 4),
|
||||||
|
'extra_index2' : (0x40, b'>L', 4),
|
||||||
|
'extra_index3' : (0x44, b'>L', 4),
|
||||||
|
'extra_index4' : (0x48, b'>L', 4),
|
||||||
|
'extra_index5' : (0x4c, b'>L', 4),
|
||||||
|
'first_nontext' : (0x50, b'>L', 4),
|
||||||
|
'title_offset' : (0x54, b'>L', 4),
|
||||||
|
'title_length' : (0x58, b'>L', 4),
|
||||||
|
'language_code' : (0x5c, b'>L', 4),
|
||||||
|
'dict_in_lang' : (0x60, b'>L', 4),
|
||||||
|
'dict_out_lang' : (0x64, b'>L', 4),
|
||||||
|
'min_version' : (0x68, b'>L', 4),
|
||||||
|
'first_resc_offset' : (0x6c, b'>L', 4),
|
||||||
|
'huff_offset' : (0x70, b'>L', 4),
|
||||||
|
'huff_num' : (0x74, b'>L', 4),
|
||||||
|
'huff_tbl_offset' : (0x78, b'>L', 4),
|
||||||
|
'huff_tbl_len' : (0x7c, b'>L', 4),
|
||||||
|
'exth_flags' : (0x80, b'>L', 4),
|
||||||
|
'fill3_a' : (0x84, b'>L', 4),
|
||||||
|
'fill3_b' : (0x88, b'>L', 4),
|
||||||
|
'fill3_c' : (0x8c, b'>L', 4),
|
||||||
|
'fill3_d' : (0x90, b'>L', 4),
|
||||||
|
'fill3_e' : (0x94, b'>L', 4),
|
||||||
|
'fill3_f' : (0x98, b'>L', 4),
|
||||||
|
'fill3_g' : (0x9c, b'>L', 4),
|
||||||
|
'fill3_h' : (0xa0, b'>L', 4),
|
||||||
|
'unknown0' : (0xa4, b'>L', 4),
|
||||||
|
'drm_offset' : (0xa8, b'>L', 4),
|
||||||
|
'drm_count' : (0xac, b'>L', 4),
|
||||||
|
'drm_size' : (0xb0, b'>L', 4),
|
||||||
|
'drm_flags' : (0xb4, b'>L', 4),
|
||||||
|
'fill4_a' : (0xb8, b'>L', 4),
|
||||||
|
'fill4_b' : (0xbc, b'>L', 4),
|
||||||
|
'fdst_offset' : (0xc0, b'>L', 4),
|
||||||
|
'fdst_flow_count' : (0xc4, b'>L', 4),
|
||||||
|
'fcis_offset' : (0xc8, b'>L', 4),
|
||||||
|
'fcis_count' : (0xcc, b'>L', 4),
|
||||||
|
'flis_offset' : (0xd0, b'>L', 4),
|
||||||
|
'flis_count' : (0xd4, b'>L', 4),
|
||||||
|
'unknown1' : (0xd8, b'>L', 4),
|
||||||
|
'unknown2' : (0xdc, b'>L', 4),
|
||||||
|
'srcs_offset' : (0xe0, b'>L', 4),
|
||||||
|
'srcs_count' : (0xe4, b'>L', 4),
|
||||||
|
'unknown3' : (0xe8, b'>L', 4),
|
||||||
|
'unknown4' : (0xec, b'>L', 4),
|
||||||
|
'fill5' : (0xf0, b'>H', 2),
|
||||||
|
'traildata_flags' : (0xf2, b'>H', 2),
|
||||||
|
'ncx_index' : (0xf4, b'>L', 4),
|
||||||
|
'fragment_index' : (0xf8, b'>L', 4),
|
||||||
|
'skeleton_index' : (0xfc, b'>L', 4),
|
||||||
|
'datp_offset' : (0x100, b'>L', 4),
|
||||||
|
'guide_index' : (0x104, b'>L', 4),
|
||||||
|
'Unknown ' : (0x108, b'>L', 4),
|
||||||
|
'Unknown ' : (0x10C, b'>L', 4),
|
||||||
|
'Unknown ' : (0x110, b'>L', 4),
|
||||||
|
'Unknown ' : (0x114, b'>L', 4),
|
||||||
|
'Unknown ' : (0x118, b'>L', 4),
|
||||||
|
'Unknown ' : (0x11C, b'>L', 4),
|
||||||
|
'Unknown ' : (0x120, b'>L', 4),
|
||||||
|
'Unknown ' : (0x124, b'>L', 4),
|
||||||
|
'Unknown ' : (0x128, b'>L', 4),
|
||||||
|
'Unknown ' : (0x12C, b'>L', 4),
|
||||||
|
'Unknown ' : (0x130, b'>L', 4),
|
||||||
|
'Unknown ' : (0x134, b'>L', 4),
|
||||||
|
'Unknown ' : (0x138, b'>L', 4),
|
||||||
|
'Unknown ' : (0x11C, b'>L', 4),
|
||||||
|
}
|
||||||
|
|
||||||
|
palmdoc_header_sorted_keys = sortedHeaderKeys(palmdoc_header)
|
||||||
|
mobi6_header_sorted_keys = sortedHeaderKeys(mobi6_header)
|
||||||
|
mobi8_header_sorted_keys = sortedHeaderKeys(mobi8_header)

id_map_strings = {
1 : 'Drm Server Id',
2 : 'Drm Commerce Id',
3 : 'Drm Ebookbase Book Id',
4 : 'Drm Ebookbase Dep Id',
100 : 'Creator',
101 : 'Publisher',
102 : 'Imprint',
103 : 'Description',
104 : 'ISBN',
105 : 'Subject',
106 : 'Published',
107 : 'Review',
108 : 'Contributor',
109 : 'Rights',
110 : 'SubjectCode',
111 : 'Type',
112 : 'Source',
113 : 'ASIN',
114 : 'versionNumber',
117 : 'Adult',
118 : 'Retail-Price',
119 : 'Retail-Currency',
120 : 'TSC',
122 : 'fixed-layout',
123 : 'book-type',
124 : 'orientation-lock',
126 : 'original-resolution',
127 : 'zero-gutter',
128 : 'zero-margin',
129 : 'MetadataResourceURI',
132 : 'RegionMagnification',
150 : 'LendingEnabled',
200 : 'DictShortName',
501 : 'cdeType',
502 : 'last_update_time',
503 : 'Updated_Title',
504 : 'CDEContentKey',
505 : 'AmazonContentReference',
506 : 'Title-Language',
507 : 'Title-Display-Direction',
508 : 'Title-Pronunciation',
509 : 'Title-Collation',
510 : 'Secondary-Title',
511 : 'Secondary-Title-Language',
512 : 'Secondary-Title-Direction',
513 : 'Secondary-Title-Pronunciation',
514 : 'Secondary-Title-Collation',
515 : 'Author-Language',
516 : 'Author-Display-Direction',
517 : 'Author-Pronunciation',
518 : 'Author-Collation',
519 : 'Author-Type',
520 : 'Publisher-Language',
521 : 'Publisher-Display-Direction',
522 : 'Publisher-Pronunciation',
523 : 'Publisher-Collation',
524 : 'Content-Language-Tag',
525 : 'primary-writing-mode',
526 : 'NCX-Ingested-By-Software',
527 : 'page-progression-direction',
528 : 'override-kindle-fonts',
529 : 'Compression-Upgraded',
530 : 'Soft-Hyphens-In-Content',
531 : 'Dictionary_In_Langague',
532 : 'Dictionary_Out_Language',
533 : 'Font_Converted',
534 : 'Amazon_Creator_Info',
535 : 'Creator-Build-Tag',
536 : 'HD-Media-Containers-Info', # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?)
538 : 'Resource-Container-Fidelity',
539 : 'HD-Container-Mimetype',
540 : 'Sample-For_Special-Purpose',
541 : 'Kindletool-Operation-Information',
542 : 'Container_Id',
543 : 'Asset-Type', # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER
544 : 'Unknown_544',
}

id_map_values = {
115 : 'sample',
116 : 'StartOffset',
121 : 'Mobi8-Boundary-Section',
125 : 'Embedded-Record-Count',
130 : 'Offline-Sample',
131 : 'Metadata-Record-Offset',
201 : 'CoverOffset',
202 : 'ThumbOffset',
203 : 'HasFakeCover',
204 : 'Creator-Software',
205 : 'Creator-Major-Version',
206 : 'Creator-Minor-Version',
207 : 'Creator-Build-Number',
401 : 'Clipping-Limit',
402 : 'Publisher-Limit',
404 : 'Text-to-Speech-Disabled',
406 : 'Rental-Expiration-Time',
}

id_map_hexstrings = {
208 : 'Watermark_(hex)',
209 : 'Tamper-Proof-Keys_(hex)',
300 : 'Font-Signature_(hex)',
403 : 'Unknown_(403)_(hex)',
405 : 'Ownership-Type_(hex)',
407 : 'Unknown_(407)_(hex)',
420 : 'Multimedia-Content-Reference_(hex)',
450 : 'Locations_Match_(hex)',
451 : 'Full-Story-Length_(hex)',
452 : 'Sample-Start_Location_(hex)',
453 : 'Sample-End-Location_(hex)',
}
|
||||||
|
|
||||||
|
def __init__(self, sect, sectNumber):
|
||||||
|
self.sect = sect
|
||||||
|
self.start = sectNumber
|
||||||
|
self.header = self.sect.loadSection(self.start)
|
||||||
|
if len(self.header)>20 and self.header[16:20] == b'MOBI':
|
||||||
|
self.sect.setsectiondescription(0,"Mobipocket Header")
|
||||||
|
self.palm = False
|
||||||
|
elif self.sect.ident == b'TEXtREAd':
|
||||||
|
self.sect.setsectiondescription(0, "PalmDOC Header")
|
||||||
|
self.palm = True
|
||||||
|
else:
|
||||||
|
raise unpackException('Unknown File Format')
|
||||||
|
|
||||||
|
self.records, = struct.unpack_from(b'>H', self.header, 0x8)
|
||||||
|
|
||||||
|
# set defaults in case this is a PalmDOC
|
||||||
|
self.title = self.sect.palmname.decode('latin-1', errors='replace')
|
||||||
|
self.length = len(self.header)-16
|
||||||
|
self.type = 3
|
||||||
|
self.codepage = 1252
|
||||||
|
self.codec = 'windows-1252'
|
||||||
|
self.unique_id = 0
|
||||||
|
self.version = 0
|
||||||
|
self.hasExth = False
|
||||||
|
self.exth = b''
|
||||||
|
self.exth_offset = self.length + 16
|
||||||
|
self.exth_length = 0
|
||||||
|
self.crypto_type = 0
|
||||||
|
self.firstnontext = self.start+self.records + 1
|
||||||
|
self.firstresource = self.start+self.records + 1
|
||||||
|
self.ncxidx = 0xffffffff
|
||||||
|
self.metaOrthIndex = 0xffffffff
|
||||||
|
self.metaInflIndex = 0xffffffff
|
||||||
|
self.skelidx = 0xffffffff
|
||||||
|
self.fragidx = 0xffffffff
|
||||||
|
self.guideidx = 0xffffffff
|
||||||
|
self.fdst = 0xffffffff
|
||||||
|
self.mlstart = self.sect.loadSection(self.start+1)[:4]
|
||||||
|
self.rawSize = 0
|
||||||
|
self.metadata = dict_()
|
||||||
|
|
||||||
|
# set up for decompression/unpacking
|
||||||
|
self.compression, = struct.unpack_from(b'>H', self.header, 0x0)
|
||||||
|
if self.compression == 0x4448:
|
||||||
|
reader = HuffcdicReader()
|
||||||
|
huffoff, huffnum = struct.unpack_from(b'>LL', self.header, 0x70)
|
||||||
|
huffoff = huffoff + self.start
|
||||||
|
self.sect.setsectiondescription(huffoff,"Huffman Compression Seed")
|
||||||
|
reader.loadHuff(self.sect.loadSection(huffoff))
|
||||||
|
for i in range(1, huffnum):
|
||||||
|
self.sect.setsectiondescription(huffoff+i,"Huffman CDIC Compression Seed %d" % i)
|
||||||
|
reader.loadCdic(self.sect.loadSection(huffoff+i))
|
||||||
|
self.unpack = reader.unpack
|
||||||
|
elif self.compression == 2:
|
||||||
|
self.unpack = PalmdocReader().unpack
|
||||||
|
elif self.compression == 1:
|
||||||
|
self.unpack = UncompressedReader().unpack
|
||||||
|
else:
|
||||||
|
raise unpackException('invalid compression type: 0x%4x' % self.compression)
|
||||||
|
|
||||||
|
if self.palm:
|
||||||
|
return
|
||||||
|
|
||||||
|
self.length, self.type, self.codepage, self.unique_id, self.version = struct.unpack(b'>LLLLL', self.header[20:40])
|
||||||
|
codec_map = {
|
||||||
|
1252 : 'windows-1252',
|
||||||
|
65001: 'utf-8',
|
||||||
|
}
|
||||||
|
if self.codepage in codec_map:
|
||||||
|
self.codec = codec_map[self.codepage]
|
||||||
|
|
||||||
|
# title
|
||||||
|
toff, tlen = struct.unpack(b'>II', self.header[0x54:0x5c])
|
||||||
|
tend = toff + tlen
|
||||||
|
self.title=self.header[toff:tend].decode(self.codec, errors='replace')
|
||||||
|
|
||||||
|
exth_flag, = struct.unpack(b'>L', self.header[0x80:0x84])
|
||||||
|
self.hasExth = exth_flag & 0x40
|
||||||
|
self.exth_offset = self.length + 16
|
||||||
|
self.exth_length = 0
|
||||||
|
if self.hasExth:
|
||||||
|
self.exth_length, = struct.unpack_from(b'>L', self.header, self.exth_offset+4)
|
||||||
|
self.exth_length = ((self.exth_length + 3)>>2)<<2 # round to next 4 byte boundary
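# Worked example (value chosen for illustration): a reported EXTH length of
# 0x1FE is rounded up to 0x200 here, ((0x1FE + 3) >> 2) << 2 == 0x200, so the
# slice below always ends on a 4-byte boundary.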
|
||||||
|
self.exth = self.header[self.exth_offset:self.exth_offset+self.exth_length]
|
||||||
|
|
||||||
|
# parse the exth / metadata
|
||||||
|
self.parseMetaData()
|
||||||
|
|
||||||
|
# self.mlstart = self.sect.loadSection(self.start+1)
|
||||||
|
# self.mlstart = self.mlstart[0:4]
|
||||||
|
self.crypto_type, = struct.unpack_from(b'>H', self.header, 0xC)
|
||||||
|
|
||||||
|
# Start sector for additional files such as images, fonts, resources, etc
|
||||||
|
# Can be missing so fall back to default set previously
|
||||||
|
ofst, = struct.unpack_from(b'>L', self.header, 0x6C)
|
||||||
|
if ofst != 0xffffffff:
|
||||||
|
self.firstresource = ofst + self.start
|
||||||
|
ofst, = struct.unpack_from(b'>L', self.header, 0x50)
|
||||||
|
if ofst != 0xffffffff:
|
||||||
|
self.firstnontext = ofst + self.start
|
||||||
|
|
||||||
|
if self.isPrintReplica():
|
||||||
|
return
|
||||||
|
|
||||||
|
if self.version < 8:
|
||||||
|
# Dictionary metaOrthIndex
|
||||||
|
self.metaOrthIndex, = struct.unpack_from(b'>L', self.header, 0x28)
|
||||||
|
if self.metaOrthIndex != 0xffffffff:
|
||||||
|
self.metaOrthIndex += self.start
|
||||||
|
|
||||||
|
# Dictionary metaInflIndex
|
||||||
|
self.metaInflIndex, = struct.unpack_from(b'>L', self.header, 0x2C)
|
||||||
|
if self.metaInflIndex != 0xffffffff:
|
||||||
|
self.metaInflIndex += self.start
|
||||||
|
|
||||||
|
# handle older headers without any ncxindex info and later
|
||||||
|
# specifically 0xe4 headers
|
||||||
|
if self.length + 16 < 0xf8:
|
||||||
|
return
|
||||||
|
|
||||||
|
# NCX Index
|
||||||
|
self.ncxidx, = struct.unpack(b'>L', self.header[0xf4:0xf8])
|
||||||
|
if self.ncxidx != 0xffffffff:
|
||||||
|
self.ncxidx += self.start
|
||||||
|
|
||||||
|
# K8 specific Indexes
|
||||||
|
if self.start != 0 or self.version == 8:
|
||||||
|
# Index into <xml> file skeletons in RawML
|
||||||
|
self.skelidx, = struct.unpack_from(b'>L', self.header, 0xfc)
|
||||||
|
if self.skelidx != 0xffffffff:
|
||||||
|
self.skelidx += self.start
|
||||||
|
|
||||||
|
# Index into <div> sections in RawML
|
||||||
|
self.fragidx, = struct.unpack_from(b'>L', self.header, 0xf8)
|
||||||
|
if self.fragidx != 0xffffffff:
|
||||||
|
self.fragidx += self.start
|
||||||
|
|
||||||
|
# Index into Other files
|
||||||
|
self.guideidx, = struct.unpack_from(b'>L', self.header, 0x104)
|
||||||
|
if self.guideidx != 0xffffffff:
|
||||||
|
self.guideidx += self.start
|
||||||
|
|
||||||
|
# dictionaries do not seem to use the same approach in K8's
|
||||||
|
# so disable them
|
||||||
|
self.metaOrthIndex = 0xffffffff
|
||||||
|
self.metaInflIndex = 0xffffffff
|
||||||
|
|
||||||
|
# need to use the FDST record to find out how to properly unpack
|
||||||
|
# the rawML into pieces
|
||||||
|
# it is simply a table of start and end locations for each flow piece
|
||||||
|
self.fdst, = struct.unpack_from(b'>L', self.header, 0xc0)
|
||||||
|
self.fdstcnt, = struct.unpack_from(b'>L', self.header, 0xc4)
|
||||||
|
# if cnt is 1 or less, fdst section number can be garbage
|
||||||
|
if self.fdstcnt <= 1:
|
||||||
|
self.fdst = 0xffffffff
|
||||||
|
if self.fdst != 0xffffffff:
|
||||||
|
self.fdst += self.start
|
||||||
|
# setting of fdst section description properly handled in mobi_kf8proc
|
||||||
|
|
||||||
|
def dump_exth(self):
|
||||||
|
# determine text encoding
|
||||||
|
codec=self.codec
|
||||||
|
if (not self.hasExth) or (self.exth_length) == 0 or (self.exth == b''):
|
||||||
|
return
|
||||||
|
num_items, = struct.unpack(b'>L', self.exth[8:12])
|
||||||
|
pos = 12
|
||||||
|
print("Key Size Description Value")
|
||||||
|
for _ in range(num_items):
|
||||||
|
id, size = struct.unpack(b'>LL', self.exth[pos:pos+8])
|
||||||
|
contentsize = size-8
|
||||||
|
content = self.exth[pos + 8: pos + size]
|
||||||
|
if id in MobiHeader.id_map_strings:
|
||||||
|
exth_name = MobiHeader.id_map_strings[id]
|
||||||
|
print('{0: >3d} {1: >4d} {2: <30s} {3:s}'.format(id, contentsize, exth_name, content.decode(codec, errors='replace')))
|
||||||
|
elif id in MobiHeader.id_map_values:
|
||||||
|
exth_name = MobiHeader.id_map_values[id]
|
||||||
|
if size == 9:
|
||||||
|
value, = struct.unpack(b'B',content)
|
||||||
|
print('{0:3d} byte {1:<30s} {2:d}'.format(id, exth_name, value))
|
||||||
|
elif size == 10:
|
||||||
|
value, = struct.unpack(b'>H',content)
|
||||||
|
print('{0:3d} word {1:<30s} 0x{2:0>4X} ({2:d})'.format(id, exth_name, value))
|
||||||
|
elif size == 12:
|
||||||
|
value, = struct.unpack(b'>L',content)
|
||||||
|
print('{0:3d} long {1:<30s} 0x{2:0>8X} ({2:d})'.format(id, exth_name, value))
|
||||||
|
else:
|
||||||
|
print('{0: >3d} {1: >4d} {2: <30s} (0x{3:s})'.format(id, contentsize, "Bad size for "+exth_name, hexlify(content)))
|
||||||
|
elif id in MobiHeader.id_map_hexstrings:
|
||||||
|
exth_name = MobiHeader.id_map_hexstrings[id]
|
||||||
|
print('{0:3d} {1:4d} {2:<30s} 0x{3:s}'.format(id, contentsize, exth_name, hexlify(content)))
|
||||||
|
else:
|
||||||
|
exth_name = "Unknown EXTH ID {0:d}".format(id)
|
||||||
|
print("{0: >3d} {1: >4d} {2: <30s} 0x{3:s}".format(id, contentsize, exth_name, hexlify(content)))
|
||||||
|
pos += size
|
||||||
|
return
|
||||||
|
|
||||||
|
def dumpheader(self):
|
||||||
|
# first 16 bytes are not part of the official mobiheader
|
||||||
|
# but we will treat it as such
|
||||||
|
# so section 0 is 16 (decimal) + self.length in total == at least 0x108 bytes for Mobi 8 headers
|
||||||
|
print("Dumping section %d, Mobipocket Header version: %d, total length %d" % (self.start,self.version, self.length+16))
|
||||||
|
self.hdr = {}
|
||||||
|
# set it up for the proper header version
|
||||||
|
if self.version == 0:
|
||||||
|
self.mobi_header = MobiHeader.palmdoc_header
|
||||||
|
self.mobi_header_sorted_keys = MobiHeader.palmdoc_header_sorted_keys
|
||||||
|
elif self.version < 8:
|
||||||
|
self.mobi_header = MobiHeader.mobi6_header
|
||||||
|
self.mobi_header_sorted_keys = MobiHeader.mobi6_header_sorted_keys
|
||||||
|
else:
|
||||||
|
self.mobi_header = MobiHeader.mobi8_header
|
||||||
|
self.mobi_header_sorted_keys = MobiHeader.mobi8_header_sorted_keys
|
||||||
|
|
||||||
|
# parse the header information
|
||||||
|
for key in self.mobi_header_sorted_keys:
|
||||||
|
(pos, format, tot_len) = self.mobi_header[key]
|
||||||
|
if pos < (self.length + 16):
|
||||||
|
val, = struct.unpack_from(format, self.header, pos)
|
||||||
|
self.hdr[key] = val
|
||||||
|
|
||||||
|
if 'title_offset' in self.hdr:
|
||||||
|
title_offset = self.hdr['title_offset']
|
||||||
|
title_length = self.hdr['title_length']
|
||||||
|
else:
|
||||||
|
title_offset = 0
|
||||||
|
title_length = 0
|
||||||
|
if title_offset == 0:
|
||||||
|
title_offset = len(self.header)
|
||||||
|
title_length = 0
|
||||||
|
self.title = self.sect.palmname.decode('latin-1', errors='replace')
|
||||||
|
else:
|
||||||
|
self.title = self.header[title_offset:title_offset+title_length].decode(self.codec, errors='replace')
|
||||||
|
# title record always padded with two nul bytes and then padded with nuls to next 4 byte boundary
|
||||||
|
title_length = ((title_length+2+3)>>2)<<2
|
||||||
|
|
||||||
|
self.extra1 = self.header[self.exth_offset+self.exth_length:title_offset]
|
||||||
|
self.extra2 = self.header[title_offset+title_length:]
|
||||||
|
|
||||||
|
print("Mobipocket header from section %d" % self.start)
|
||||||
|
print(" Offset Value Hex Dec Description")
|
||||||
|
for key in self.mobi_header_sorted_keys:
|
||||||
|
(pos, format, tot_len) = self.mobi_header[key]
|
||||||
|
if pos < (self.length + 16):
|
||||||
|
if key != 'magic':
|
||||||
|
fmt_string = "0x{0:0>3X} ({0:3d}){1: >" + str(9-2*tot_len) +"s}0x{2:0>" + str(2*tot_len) + "X} {2:10d} {3:s}"
|
||||||
|
else:
|
||||||
|
self.hdr[key] = unicode_str(self.hdr[key])
|
||||||
|
fmt_string = "0x{0:0>3X} ({0:3d}){2:>11s} {3:s}"
|
||||||
|
print(fmt_string.format(pos, " ",self.hdr[key], key))
|
||||||
|
print("")
|
||||||
|
|
||||||
|
if self.exth_length > 0:
|
||||||
|
print("EXTH metadata, offset %d, padded length %d" % (self.exth_offset,self.exth_length))
|
||||||
|
self.dump_exth()
|
||||||
|
print("")
|
||||||
|
|
||||||
|
if len(self.extra1) > 0:
|
||||||
|
print("Extra data between EXTH and Title, length %d" % len(self.extra1))
|
||||||
|
print(hexlify(self.extra1))
|
||||||
|
print("")
|
||||||
|
|
||||||
|
if title_length > 0:
|
||||||
|
print("Title in header at offset %d, padded length %d: '%s'" %(title_offset,title_length,self.title))
|
||||||
|
print("")
|
||||||
|
|
||||||
|
if len(self.extra2) > 0:
|
||||||
|
print("Extra data between Title and end of header, length %d" % len(self.extra2))
|
||||||
|
print(hexlify(self.extra2))
|
||||||
|
print("")
|
||||||
|
|
||||||
|
def isPrintReplica(self):
|
||||||
|
return self.mlstart[0:4] == b"%MOP"
|
||||||
|
|
||||||
|
def isK8(self):
|
||||||
|
return self.start != 0 or self.version == 8
|
||||||
|
|
||||||
|
def isEncrypted(self):
|
||||||
|
return self.crypto_type != 0
|
||||||
|
|
||||||
|
def hasNCX(self):
|
||||||
|
return self.ncxidx != 0xffffffff
|
||||||
|
|
||||||
|
def isDictionary(self):
|
||||||
|
return self.metaOrthIndex != 0xffffffff
|
||||||
|
|
||||||
|
def getncxIndex(self):
|
||||||
|
return self.ncxidx
|
||||||
|
|
||||||
|
def decompress(self, data):
|
||||||
|
return self.unpack(data)
|
||||||
|
|
||||||
|
def Language(self):
|
||||||
|
langcode = struct.unpack(b'!L', self.header[0x5c:0x60])[0]
|
||||||
|
langid = langcode & 0xFF
|
||||||
|
sublangid = (langcode >> 8) & 0xFF
|
||||||
|
return getLanguage(langid, sublangid)
|
||||||
|
|
||||||
|
def DictInLanguage(self):
|
||||||
|
if self.isDictionary():
|
||||||
|
langcode = struct.unpack(b'!L', self.header[0x60:0x64])[0]
|
||||||
|
langid = langcode & 0xFF
|
||||||
|
sublangid = (langcode >> 10) & 0xFF
|
||||||
|
if langid != 0:
|
||||||
|
return getLanguage(langid, sublangid)
|
||||||
|
return False
|
||||||
|
|
||||||
|
def DictOutLanguage(self):
|
||||||
|
if self.isDictionary():
|
||||||
|
langcode = struct.unpack(b'!L', self.header[0x64:0x68])[0]
|
||||||
|
langid = langcode & 0xFF
|
||||||
|
sublangid = (langcode >> 10) & 0xFF
|
||||||
|
if langid != 0:
|
||||||
|
return getLanguage(langid, sublangid)
|
||||||
|
return False
|
||||||
|
|
||||||
|
def getRawML(self):
|
||||||
|
def getSizeOfTrailingDataEntry(data):
|
||||||
|
num = 0
|
||||||
|
for v in data[-4:]:
|
||||||
|
if bord(v) & 0x80:
|
||||||
|
num = 0
|
||||||
|
num = (num << 7) | (bord(v) & 0x7f)
|
||||||
|
return num
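# Worked example (bytes chosen for illustration): for trailing bytes
# b'\x00\x00\x81\x05' the accumulator restarts at 0x81 (high bit set) and the
# function returns (0x01 << 7) | 0x05 == 133.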
|
||||||
|
def trimTrailingDataEntries(data):
|
||||||
|
for _ in range(trailers):
|
||||||
|
num = getSizeOfTrailingDataEntry(data)
|
||||||
|
data = data[:-num]
|
||||||
|
if multibyte:
|
||||||
|
num = (ord(data[-1:]) & 3) + 1
|
||||||
|
data = data[:-num]
|
||||||
|
return data
|
||||||
|
multibyte = 0
|
||||||
|
trailers = 0
|
||||||
|
if self.sect.ident == b'BOOKMOBI':
|
||||||
|
mobi_length, = struct.unpack_from(b'>L', self.header, 0x14)
|
||||||
|
mobi_version, = struct.unpack_from(b'>L', self.header, 0x68)
|
||||||
|
if (mobi_length >= 0xE4) and (mobi_version >= 5):
|
||||||
|
flags, = struct.unpack_from(b'>H', self.header, 0xF2)
|
||||||
|
multibyte = flags & 1
|
||||||
|
while flags > 1:
|
||||||
|
if flags & 2:
|
||||||
|
trailers += 1
|
||||||
|
flags = flags >> 1
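# Worked example (flag value chosen for illustration): flags == 0b0101 gives
# multibyte == 1 from bit 0, and the loop counts one trailer because only one
# of the higher bits (bit 2) is set.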
|
||||||
|
# get raw mobi markup language
|
||||||
|
print("Unpacking raw markup language")
|
||||||
|
dataList = []
|
||||||
|
# offset = 0
|
||||||
|
for i in range(1, self.records+1):
|
||||||
|
data = trimTrailingDataEntries(self.sect.loadSection(self.start + i))
|
||||||
|
dataList.append(self.unpack(data))
|
||||||
|
if self.isK8():
|
||||||
|
self.sect.setsectiondescription(self.start + i,"KF8 Text Section {0:d}".format(i))
|
||||||
|
elif self.version == 0:
|
||||||
|
self.sect.setsectiondescription(self.start + i,"PalmDOC Text Section {0:d}".format(i))
|
||||||
|
else:
|
||||||
|
self.sect.setsectiondescription(self.start + i,"Mobipocket Text Section {0:d}".format(i))
|
||||||
|
rawML = b''.join(dataList)
|
||||||
|
self.rawSize = len(rawML)
|
||||||
|
return rawML
|
||||||
|
|
||||||
|
# all metadata is stored in a dictionary with key and returns a *list* of values
|
||||||
|
# a list is used to allow for multiple creators, multiple contributors, etc
|
||||||
|
def parseMetaData(self):
|
||||||
|
def addValue(name, value):
|
||||||
|
if name not in self.metadata:
|
||||||
|
self.metadata[name] = [value]
|
||||||
|
else:
|
||||||
|
self.metadata[name].append(value)
|
||||||
|
|
||||||
|
codec=self.codec
|
||||||
|
if self.hasExth:
|
||||||
|
extheader=self.exth
|
||||||
|
_length, num_items = struct.unpack(b'>LL', extheader[4:12])
|
||||||
|
extheader = extheader[12:]
|
||||||
|
pos = 0
|
||||||
|
for _ in range(num_items):
|
||||||
|
id, size = struct.unpack(b'>LL', extheader[pos:pos+8])
|
||||||
|
content = extheader[pos + 8: pos + size]
|
||||||
|
if id in MobiHeader.id_map_strings:
|
||||||
|
name = MobiHeader.id_map_strings[id]
|
||||||
|
addValue(name, content.decode(codec, errors='replace'))
|
||||||
|
elif id in MobiHeader.id_map_values:
|
||||||
|
name = MobiHeader.id_map_values[id]
|
||||||
|
if size == 9:
|
||||||
|
value, = struct.unpack(b'B',content)
|
||||||
|
addValue(name, unicode_str(str(value)))
|
||||||
|
elif size == 10:
|
||||||
|
value, = struct.unpack(b'>H',content)
|
||||||
|
addValue(name, unicode_str(str(value)))
|
||||||
|
elif size == 12:
|
||||||
|
value, = struct.unpack(b'>L',content)
|
||||||
|
# handle special case of missing CoverOffset or missing ThumbOffset
|
||||||
|
if id == 201 or id == 202:
|
||||||
|
if value != 0xffffffff:
|
||||||
|
addValue(name, unicode_str(str(value)))
|
||||||
|
else:
|
||||||
|
addValue(name, unicode_str(str(value)))
|
||||||
|
else:
|
||||||
|
print("Warning: Bad key, size, value combination detected in EXTH ", id, size, hexlify(content))
|
||||||
|
addValue(name, hexlify(content))
|
||||||
|
elif id in MobiHeader.id_map_hexstrings:
|
||||||
|
name = MobiHeader.id_map_hexstrings[id]
|
||||||
|
addValue(name, hexlify(content))
|
||||||
|
else:
|
||||||
|
name = unicode_str(str(id)) + ' (hex)'
|
||||||
|
addValue(name, hexlify(content))
|
||||||
|
pos += size
|
||||||
|
|
||||||
|
# add the basics to the metadata each as a list element
|
||||||
|
self.metadata['Language'] = [self.Language()]
|
||||||
|
self.metadata['Title'] = [unicode_str(self.title,self.codec)]
|
||||||
|
self.metadata['Codec'] = [self.codec]
|
||||||
|
self.metadata['UniqueID'] = [unicode_str(str(self.unique_id))]
|
||||||
|
# if no asin create one using a uuid
|
||||||
|
if 'ASIN' not in self.metadata:
|
||||||
|
self.metadata['ASIN'] = [unicode_str(str(uuid.uuid4()))]
|
||||||
|
# if no cdeType set it to "EBOK"
|
||||||
|
if 'cdeType' not in self.metadata:
|
||||||
|
self.metadata['cdeType'] = ['EBOK']
|
||||||
|
|
||||||
|
def getMetaData(self):
|
||||||
|
return self.metadata
|
||||||
|
|
||||||
|
def describeHeader(self, DUMP):
|
||||||
|
print("Mobi Version:", self.version)
|
||||||
|
print("Codec:", self.codec)
|
||||||
|
print("Title:", self.title)
|
||||||
|
if 'Updated_Title' in self.metadata:
|
||||||
|
print("EXTH Title:", self.metadata['Updated_Title'][0])
|
||||||
|
if self.compression == 0x4448:
|
||||||
|
print("Huffdic compression")
|
||||||
|
elif self.compression == 2:
|
||||||
|
print("Palmdoc compression")
|
||||||
|
elif self.compression == 1:
|
||||||
|
print("No compression")
|
||||||
|
if DUMP:
|
||||||
|
self.dumpheader()
|
439
KindleUnpack/mobi_html.py
Normal file
@@ -0,0 +1,439 @@
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import PY2, utf8_str
|
||||||
|
|
||||||
|
if PY2:
|
||||||
|
range = xrange
|
||||||
|
|
||||||
|
import re
|
||||||
|
# note: re requires the pattern to be the exact same type as the data to be searched in python3
|
||||||
|
# but u"" is not allowed for the pattern itself only b""
|
||||||
|
|
||||||
|
from .mobi_utils import fromBase32
|
||||||
|
|
||||||
|
class HTMLProcessor:
|
||||||
|
|
||||||
|
def __init__(self, files, metadata, rscnames):
|
||||||
|
self.files = files
|
||||||
|
self.metadata = metadata
|
||||||
|
self.rscnames = rscnames
|
||||||
|
# for original style mobis, default to including all image files in the opf manifest
|
||||||
|
self.used = {}
|
||||||
|
for name in rscnames:
|
||||||
|
self.used[name] = 'used'
|
||||||
|
|
||||||
|
def findAnchors(self, rawtext, indx_data, positionMap):
|
||||||
|
# process the raw text
|
||||||
|
# find anchors...
|
||||||
|
print("Find link anchors")
|
||||||
|
link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''', re.IGNORECASE)
|
||||||
|
# TEST NCX: merge in filepos from indx
|
||||||
|
pos_links = [int(m.group(1)) for m in link_pattern.finditer(rawtext)]
|
||||||
|
if indx_data:
|
||||||
|
pos_indx = [e['pos'] for e in indx_data if e['pos']>0]
|
||||||
|
pos_links = list(set(pos_links + pos_indx))
|
||||||
|
|
||||||
|
for position in pos_links:
|
||||||
|
if position in positionMap:
|
||||||
|
positionMap[position] = positionMap[position] + utf8_str('<a id="filepos%d" />' % position)
|
||||||
|
else:
|
||||||
|
positionMap[position] = utf8_str('<a id="filepos%d" />' % position)
|
||||||
|
|
||||||
|
# apply dictionary metadata and anchors
|
||||||
|
print("Insert data into html")
|
||||||
|
pos = 0
|
||||||
|
lastPos = len(rawtext)
|
||||||
|
dataList = []
|
||||||
|
for end in sorted(positionMap.keys()):
|
||||||
|
if end == 0 or end > lastPos:
|
||||||
|
continue # something's up - can't put a tag in outside <html>...</html>
|
||||||
|
dataList.append(rawtext[pos:end])
|
||||||
|
dataList.append(positionMap[end])
|
||||||
|
pos = end
|
||||||
|
dataList.append(rawtext[pos:])
|
||||||
|
srctext = b"".join(dataList)
|
||||||
|
rawtext = None
|
||||||
|
dataList = None
|
||||||
|
self.srctext = srctext
|
||||||
|
self.indx_data = indx_data
|
||||||
|
return srctext
|
||||||
|
|
||||||
|
def insertHREFS(self):
|
||||||
|
srctext = self.srctext
|
||||||
|
rscnames = self.rscnames
|
||||||
|
metadata = self.metadata
|
||||||
|
|
||||||
|
# put in the hrefs
|
||||||
|
print("Insert hrefs into html")
|
||||||
|
# There doesn't seem to be a standard, so search as best as we can
|
||||||
|
|
||||||
|
link_pattern = re.compile(br'''<a([^>]*?)filepos=['"]{0,1}0*(\d+)['"]{0,1}([^>]*?)>''', re.IGNORECASE)
|
||||||
|
srctext = link_pattern.sub(br'''<a\1href="#filepos\2"\3>''', srctext)
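# Worked example (tag chosen for illustration): <a filepos=0000012345> becomes
# <a href="#filepos12345">, matching the <a id="filepos12345" /> anchors
# inserted by findAnchors above.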
|
||||||
|
|
||||||
|
# remove empty anchors
|
||||||
|
print("Remove empty anchors from html")
|
||||||
|
srctext = re.sub(br"<a\s*/>",br"", srctext)
|
||||||
|
srctext = re.sub(br"<a\s*>\s*</a>",br"", srctext)
|
||||||
|
|
||||||
|
# convert image references
|
||||||
|
print("Insert image references into html")
|
||||||
|
# split string into image tag pieces and other pieces
|
||||||
|
image_pattern = re.compile(br'''(<img.*?>)''', re.IGNORECASE)
|
||||||
|
image_index_pattern = re.compile(br'''recindex=['"]{0,1}([0-9]+)['"]{0,1}''', re.IGNORECASE)
|
||||||
|
srcpieces = image_pattern.split(srctext)
|
||||||
|
srctext = self.srctext = None
|
||||||
|
|
||||||
|
# all odd pieces are image tags (null strings on even pieces if no space between them in srctext)
|
||||||
|
for i in range(1, len(srcpieces), 2):
|
||||||
|
tag = srcpieces[i]
|
||||||
|
for m in image_index_pattern.finditer(tag):
|
||||||
|
imageNumber = int(m.group(1))
|
||||||
|
imageName = rscnames[imageNumber-1]
|
||||||
|
if imageName is None:
|
||||||
|
print("Error: Referenced image %s was not recognized as a valid image" % imageNumber)
|
||||||
|
else:
|
||||||
|
replacement = b'src="Images/' + utf8_str(imageName) + b'"'
|
||||||
|
tag = image_index_pattern.sub(replacement, tag, 1)
|
||||||
|
srcpieces[i] = tag
|
||||||
|
srctext = b"".join(srcpieces)
|
||||||
|
|
||||||
|
# add in character set meta into the html header if needed
|
||||||
|
if 'Codec' in metadata:
|
||||||
|
srctext = srctext[0:12]+b'<meta http-equiv="content-type" content="text/html; charset='+utf8_str(metadata.get('Codec')[0])+b'" />'+srctext[12:]
|
||||||
|
return srctext, self.used
|
||||||
|
|
||||||
|
|
||||||
|
class XHTMLK8Processor:
|
||||||
|
|
||||||
|
def __init__(self, rscnames, k8proc):
|
||||||
|
self.rscnames = rscnames
|
||||||
|
self.k8proc = k8proc
|
||||||
|
self.used = {}
|
||||||
|
|
||||||
|
def buildXHTML(self):
|
||||||
|
|
||||||
|
# first need to update all links that are internal which
|
||||||
|
# are based on positions within the xhtml files **BEFORE**
|
||||||
|
# cutting and pasting any pieces into the xhtml text files
|
||||||
|
|
||||||
|
# kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal link within xhtml)
|
||||||
|
# XXXX is the offset in records into divtbl
|
||||||
|
# YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position
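# Worked example (link chosen for illustration): kindle:pos:fid:0001:off:000000000A
# refers to fragment-table entry 1 with a base32 offset of 10 added to that
# entry's insert position.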
|
||||||
|
|
||||||
|
# pos:fid pattern
|
||||||
|
posfid_pattern = re.compile(br'''(<a.*?href=.*?>)''', re.IGNORECASE)
|
||||||
|
posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''')
|
||||||
|
|
||||||
|
parts = []
|
||||||
|
print("Building proper xhtml for each file")
|
||||||
|
for i in range(self.k8proc.getNumberOfParts()):
|
||||||
|
part = self.k8proc.getPart(i)
|
||||||
|
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.getPartInfo(i)
|
||||||
|
|
||||||
|
# internal links
|
||||||
|
srcpieces = posfid_pattern.split(part)
|
||||||
|
for j in range(1, len(srcpieces),2):
|
||||||
|
tag = srcpieces[j]
|
||||||
|
if tag.startswith(b'<'):
|
||||||
|
for m in posfid_index_pattern.finditer(tag):
|
||||||
|
posfid = m.group(1)
|
||||||
|
offset = m.group(2)
|
||||||
|
filename, idtag = self.k8proc.getIDTagByPosFid(posfid, offset)
|
||||||
|
if idtag == b'':
|
||||||
|
replacement= b'"' + utf8_str(filename) + b'"'
|
||||||
|
else:
|
||||||
|
replacement = b'"' + utf8_str(filename) + b'#' + idtag + b'"'
|
||||||
|
tag = posfid_index_pattern.sub(replacement, tag, 1)
|
||||||
|
srcpieces[j] = tag
|
||||||
|
part = b"".join(srcpieces)
|
||||||
|
parts.append(part)
|
||||||
|
|
||||||
|
# we are free to cut and paste as we see fit
|
||||||
|
# we can safely remove all of the Kindlegen generated aid tags
|
||||||
|
# change aid ids that are in k8proc.linked_aids to xhtml ids
|
||||||
|
find_tag_with_aid_pattern = re.compile(br'''(<[^>]*\said\s*=[^>]*>)''', re.IGNORECASE)
|
||||||
|
within_tag_aid_position_pattern = re.compile(br'''\said\s*=['"]([^'"]*)['"]''')
|
||||||
|
for i in range(len(parts)):
|
||||||
|
part = parts[i]
|
||||||
|
srcpieces = find_tag_with_aid_pattern.split(part)
|
||||||
|
for j in range(len(srcpieces)):
|
||||||
|
tag = srcpieces[j]
|
||||||
|
if tag.startswith(b'<'):
|
||||||
|
for m in within_tag_aid_position_pattern.finditer(tag):
|
||||||
|
try:
|
||||||
|
aid = m.group(1)
|
||||||
|
except IndexError:
|
||||||
|
aid = None
|
||||||
|
replacement = b''
|
||||||
|
if aid in self.k8proc.linked_aids:
|
||||||
|
replacement = b' id="aid-' + aid + b'"'
|
||||||
|
tag = within_tag_aid_position_pattern.sub(replacement, tag, 1)
|
||||||
|
srcpieces[j] = tag
|
||||||
|
part = b"".join(srcpieces)
|
||||||
|
parts[i] = part
|
||||||
|
|
||||||
|
# we can safely replace all of the Kindlegen generated data-AmznPageBreak tags
|
||||||
|
# with page-break-after style patterns
|
||||||
|
find_tag_with_AmznPageBreak_pattern = re.compile(br'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
|
||||||
|
within_tag_AmznPageBreak_position_pattern = re.compile(br'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')
|
||||||
|
for i in range(len(parts)):
|
||||||
|
part = parts[i]
|
||||||
|
srcpieces = find_tag_with_AmznPageBreak_pattern.split(part)
|
||||||
|
for j in range(len(srcpieces)):
|
||||||
|
tag = srcpieces[j]
|
||||||
|
if tag.startswith(b'<'):
|
||||||
|
srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub(
|
||||||
|
lambda m:b' style="page-break-after:' + m.group(1) + b'"', tag)
|
||||||
|
part = b"".join(srcpieces)
|
||||||
|
parts[i] = part
|
||||||
|
|
||||||
|
# we have to handle substitutions for the flows pieces first as they may
|
||||||
|
# be inlined into the xhtml text
|
||||||
|
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
|
||||||
|
# kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
|
||||||
|
# kindle:embed:XXXX (used for fonts)
|
||||||
|
|
||||||
|
flows = []
|
||||||
|
flows.append(None)
|
||||||
|
flowinfo = []
|
||||||
|
flowinfo.append([None, None, None, None])
|
||||||
|
|
||||||
|
# regular expression search patterns
|
||||||
|
img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
|
||||||
|
img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)
|
||||||
|
|
||||||
|
tag_pattern = re.compile(br'''(<[^>]*>)''')
|
||||||
|
flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
|
||||||
|
|
||||||
|
url_pattern = re.compile(br'''(url\(.*?\))''', re.IGNORECASE)
|
||||||
|
url_img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*["')]''', re.IGNORECASE)
|
||||||
|
font_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)["')]''', re.IGNORECASE)
|
||||||
|
url_css_index_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)
|
||||||
|
url_svg_image_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=image/svg\+xml[^\)]*''', re.IGNORECASE)
|
||||||
|
|
||||||
|
for i in range(1, self.k8proc.getNumberOfFlows()):
|
||||||
|
[ftype, format, dir, filename] = self.k8proc.getFlowInfo(i)
|
||||||
|
flowpart = self.k8proc.getFlow(i)
|
||||||
|
|
||||||
|
# links to raster image files from image tags
|
||||||
|
# image_pattern
|
||||||
|
srcpieces = img_pattern.split(flowpart)
|
||||||
|
for j in range(1, len(srcpieces),2):
|
||||||
|
tag = srcpieces[j]
|
||||||
|
if tag.startswith(b'<im'):
|
||||||
|
for m in img_index_pattern.finditer(tag):
|
||||||
|
imageNumber = fromBase32(m.group(1))
|
||||||
|
imageName = self.rscnames[imageNumber-1]
|
||||||
|
if imageName is not None:
|
||||||
|
replacement = b'"../Images/' + utf8_str(imageName) + b'"'
|
||||||
|
self.used[imageName] = 'used'
|
||||||
|
tag = img_index_pattern.sub(replacement, tag, 1)
|
||||||
|
else:
|
||||||
|
print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag))
|
||||||
|
srcpieces[j] = tag
|
||||||
|
flowpart = b"".join(srcpieces)
|
||||||
|
|
||||||
|
# replacements inside css url():
|
||||||
|
srcpieces = url_pattern.split(flowpart)
|
||||||
|
for j in range(1, len(srcpieces),2):
|
||||||
|
tag = srcpieces[j]
|
||||||
|
|
||||||
|
# process links to raster image files
|
||||||
|
for m in url_img_index_pattern.finditer(tag):
|
||||||
|
imageNumber = fromBase32(m.group(1))
|
||||||
|
imageName = self.rscnames[imageNumber-1]
|
||||||
|
osep = m.group()[0:1]
|
||||||
|
csep = m.group()[-1:]
|
||||||
|
if imageName is not None:
|
||||||
|
replacement = osep + b'../Images/' + utf8_str(imageName) + csep
|
||||||
|
self.used[imageName] = 'used'
|
||||||
|
tag = url_img_index_pattern.sub(replacement, tag, 1)
|
||||||
|
else:
|
||||||
|
print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag))
|
||||||
|
|
||||||
|
# process links to fonts
|
||||||
|
for m in font_index_pattern.finditer(tag):
|
||||||
|
fontNumber = fromBase32(m.group(1))
|
||||||
|
fontName = self.rscnames[fontNumber-1]
|
||||||
|
osep = m.group()[0:1]
|
||||||
|
csep = m.group()[-1:]
|
||||||
|
if fontName is None:
|
||||||
|
print("Error: Referenced font %s was not recognized as a valid font in %s" % (fontNumber, tag))
|
||||||
|
else:
|
||||||
|
replacement = osep + b'../Fonts/' + utf8_str(fontName) + csep
|
||||||
|
tag = font_index_pattern.sub(replacement, tag, 1)
|
||||||
|
self.used[fontName] = 'used'
|
||||||
|
|
||||||
|
# process links to other css pieces
|
||||||
|
for m in url_css_index_pattern.finditer(tag):
|
||||||
|
num = fromBase32(m.group(1))
|
||||||
|
[typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
|
||||||
|
replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
|
||||||
|
tag = url_css_index_pattern.sub(replacement, tag, 1)
|
||||||
|
self.used[fnm] = 'used'
|
||||||
|
|
||||||
|
# process links to svg images
|
||||||
|
for m in url_svg_image_pattern.finditer(tag):
|
||||||
|
num = fromBase32(m.group(1))
|
||||||
|
[typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
|
||||||
|
replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
|
||||||
|
tag = url_svg_image_pattern.sub(replacement, tag, 1)
|
||||||
|
self.used[fnm] = 'used'
|
||||||
|
|
||||||
|
srcpieces[j] = tag
|
||||||
|
flowpart = b"".join(srcpieces)
|
||||||
|
|
||||||
|
# store away in our own copy
|
||||||
|
flows.append(flowpart)
|
||||||
|
|
||||||
|
# I do not think this case exists and even if it does exist, it needs to be done in a separate
|
||||||
|
# pass to prevent inlining a flow piece into another flow piece before the inserted one or the
|
||||||
|
# target one has been fully processed
|
||||||
|
|
||||||
|
# but keep it around in case it turns out we do need it
|
||||||
|
|
||||||
|
# flow pattern not inside url()
|
||||||
|
# srcpieces = tag_pattern.split(flowpart)
|
||||||
|
# for j in range(1, len(srcpieces),2):
|
||||||
|
# tag = srcpieces[j]
|
||||||
|
# if tag.startswith(b'<'):
|
||||||
|
# for m in flow_pattern.finditer(tag):
|
||||||
|
# num = fromBase32(m.group(1))
|
||||||
|
# [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
|
||||||
|
# flowtext = self.k8proc.getFlow(num)
|
||||||
|
# if fmt == b'inline':
|
||||||
|
# tag = flowtext
|
||||||
|
# else:
|
||||||
|
# replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
|
||||||
|
# tag = flow_pattern.sub(replacement, tag, 1)
|
||||||
|
# self.used[fnm] = 'used'
|
||||||
|
# srcpieces[j] = tag
|
||||||
|
# flowpart = b"".join(srcpieces)
|
||||||
|
|
||||||
|
# now handle the main text xhtml parts
|
||||||
|
|
||||||
|
# Handle the flow items in the XHTML text pieces
|
||||||
|
# kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
|
||||||
|
tag_pattern = re.compile(br'''(<[^>]*>)''')
|
||||||
|
flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
|
||||||
|
for i in range(len(parts)):
|
||||||
|
part = parts[i]
|
||||||
|
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
|
||||||
|
# flow pattern
|
||||||
|
srcpieces = tag_pattern.split(part)
|
||||||
|
for j in range(1, len(srcpieces),2):
|
||||||
|
tag = srcpieces[j]
|
||||||
|
if tag.startswith(b'<'):
|
||||||
|
for m in flow_pattern.finditer(tag):
|
||||||
|
num = fromBase32(m.group(1))
|
||||||
|
if num > 0 and num < len(self.k8proc.flowinfo):
|
||||||
|
[typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
|
||||||
|
flowpart = flows[num]
|
||||||
|
if fmt == b'inline':
|
||||||
|
tag = flowpart
|
||||||
|
else:
|
||||||
|
replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
|
||||||
|
tag = flow_pattern.sub(replacement, tag, 1)
|
||||||
|
self.used[fnm] = 'used'
|
||||||
|
else:
|
||||||
|
print("warning: ignoring non-existent flow link", tag, " value 0x%x" % num)
|
||||||
|
srcpieces[j] = tag
|
||||||
|
part = b''.join(srcpieces)
|
||||||
|
|
||||||
|
# store away modified version
|
||||||
|
parts[i] = part
|
||||||
|
|
||||||
|
# Handle any embedded raster images links in style= attributes urls
|
||||||
|
style_pattern = re.compile(br'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''', re.IGNORECASE)
|
||||||
|
img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)
|
||||||
|
|
||||||
|
for i in range(len(parts)):
|
||||||
|
part = parts[i]
|
||||||
|
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
|
||||||
|
|
||||||
|
# replace urls in style attributes
|
||||||
|
srcpieces = style_pattern.split(part)
|
||||||
|
for j in range(1, len(srcpieces),2):
|
||||||
|
tag = srcpieces[j]
|
||||||
|
if b'kindle:embed' in tag:
|
||||||
|
for m in img_index_pattern.finditer(tag):
|
||||||
|
imageNumber = fromBase32(m.group(1))
|
||||||
|
imageName = self.rscnames[imageNumber-1]
|
||||||
|
osep = m.group()[0:1]
|
||||||
|
csep = m.group()[-1:]
|
||||||
|
if imageName is not None:
|
||||||
|
replacement = osep + b'../Images/'+ utf8_str(imageName) + csep
|
||||||
|
self.used[imageName] = 'used'
|
||||||
|
tag = img_index_pattern.sub(replacement, tag, 1)
|
||||||
|
else:
|
||||||
|
print("Error: Referenced image %s in style url was not recognized in %s" % (imageNumber, tag))
|
||||||
|
srcpieces[j] = tag
|
||||||
|
part = b"".join(srcpieces)
|
||||||
|
|
||||||
|
# store away modified version
|
||||||
|
parts[i] = part
|
||||||
|
|
||||||
|
# Handle any embedded raster images links in the xhtml text
|
||||||
|
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
|
||||||
|
img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
|
||||||
|
img_index_pattern = re.compile(br'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''')
|
||||||
|
|
||||||
|
for i in range(len(parts)):
|
||||||
|
part = parts[i]
|
||||||
|
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
|
||||||
|
|
||||||
|
# links to raster image files
|
||||||
|
# image_pattern
|
||||||
|
srcpieces = img_pattern.split(part)
|
||||||
|
for j in range(1, len(srcpieces),2):
|
||||||
|
tag = srcpieces[j]
|
||||||
|
if tag.startswith(b'<im'):
|
||||||
|
for m in img_index_pattern.finditer(tag):
|
||||||
|
imageNumber = fromBase32(m.group(1))
|
||||||
|
imageName = self.rscnames[imageNumber-1]
|
||||||
|
if imageName is not None:
|
||||||
|
replacement = b'"../Images/' + utf8_str(imageName) + b'"'
|
||||||
|
self.used[imageName] = 'used'
|
||||||
|
tag = img_index_pattern.sub(replacement, tag, 1)
|
||||||
|
else:
|
||||||
|
print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag))
|
||||||
|
srcpieces[j] = tag
|
||||||
|
part = b"".join(srcpieces)
|
||||||
|
# store away modified version
|
||||||
|
parts[i] = part
|
||||||
|
|
||||||
|
# finally perform any general cleanups needed to make valid XHTML
|
||||||
|
# these include:
|
||||||
|
# in svg tags replace "preserveaspectratio" attributes with "preserveAspectRatio"
|
||||||
|
# in svg tags replace "viewbox" attributes with "viewBox"
|
||||||
|
# in <li> remove value="XX" attributes since these are illegal
|
||||||
|
tag_pattern = re.compile(br'''(<[^>]*>)''')
|
||||||
|
li_value_pattern = re.compile(br'''\svalue\s*=\s*['"][^'"]*['"]''', re.IGNORECASE)
|
||||||
|
|
||||||
|
for i in range(len(parts)):
|
||||||
|
part = parts[i]
|
||||||
|
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
|
||||||
|
|
||||||
|
# tag pattern
|
||||||
|
srcpieces = tag_pattern.split(part)
|
||||||
|
for j in range(1, len(srcpieces),2):
|
||||||
|
tag = srcpieces[j]
|
||||||
|
if tag.startswith(b'<svg') or tag.startswith(b'<SVG'):
|
||||||
|
tag = tag.replace(b'preserveaspectratio',b'preserveAspectRatio')
|
||||||
|
tag = tag.replace(b'viewbox',b'viewBox')
|
||||||
|
elif tag.startswith(b'<li ') or tag.startswith(b'<LI '):
|
||||||
|
tagpieces = li_value_pattern.split(tag)
|
||||||
|
tag = b"".join(tagpieces)
|
||||||
|
srcpieces[j] = tag
|
||||||
|
part = b"".join(srcpieces)
|
||||||
|
# store away modified version
|
||||||
|
parts[i] = part
|
||||||
|
|
||||||
|
self.k8proc.setFlows(flows)
|
||||||
|
self.k8proc.setParts(parts)
|
||||||
|
|
||||||
|
return self.used
|
276
KindleUnpack/mobi_index.py
Normal file
@@ -0,0 +1,276 @@
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import PY2, bchr, bstr, bord
|
||||||
|
if PY2:
|
||||||
|
range = xrange
|
||||||
|
|
||||||
|
import struct
|
||||||
|
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||||
|
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||||
|
|
||||||
|
from .mobi_utils import toHex
|
||||||
|
|
||||||
|
class MobiIndex:
|
||||||
|
|
||||||
|
def __init__(self, sect, DEBUG=False):
|
||||||
|
self.sect = sect
|
||||||
|
self.DEBUG = DEBUG
|
||||||
|
|
||||||
|
def getIndexData(self, idx, label="Unknown"):
|
||||||
|
sect = self.sect
|
||||||
|
outtbl = []
|
||||||
|
ctoc_text = {}
|
||||||
|
if idx != 0xffffffff:
|
||||||
|
sect.setsectiondescription(idx,"{0} Main INDX section".format(label))
|
||||||
|
data = sect.loadSection(idx)
|
||||||
|
idxhdr, hordt1, hordt2 = self.parseINDXHeader(data)
|
||||||
|
IndexCount = idxhdr['count']
|
||||||
|
# handle the case of multiple sections used for CTOC
|
||||||
|
rec_off = 0
|
||||||
|
off = idx + IndexCount + 1
|
||||||
|
for j in range(idxhdr['nctoc']):
|
||||||
|
cdata = sect.loadSection(off + j)
|
||||||
|
sect.setsectiondescription(off+j, label + ' CTOC Data ' + str(j))
|
||||||
|
ctocdict = self.readCTOC(cdata)
|
||||||
|
for k in ctocdict:
|
||||||
|
ctoc_text[k + rec_off] = ctocdict[k]
|
||||||
|
rec_off += 0x10000
|
||||||
|
tagSectionStart = idxhdr['len']
|
||||||
|
controlByteCount, tagTable = readTagSection(tagSectionStart, data)
|
||||||
|
if self.DEBUG:
|
||||||
|
print("ControlByteCount is", controlByteCount)
|
||||||
|
print("IndexCount is", IndexCount)
|
||||||
|
print("TagTable: %s" % tagTable)
|
||||||
|
for i in range(idx + 1, idx + 1 + IndexCount):
|
||||||
|
sect.setsectiondescription(i,"{0} Extra {1:d} INDX section".format(label,i-idx))
|
||||||
|
data = sect.loadSection(i)
|
||||||
|
hdrinfo, ordt1, ordt2 = self.parseINDXHeader(data)
|
||||||
|
idxtPos = hdrinfo['start']
|
||||||
|
entryCount = hdrinfo['count']
|
||||||
|
if self.DEBUG:
|
||||||
|
print(idxtPos, entryCount)
|
||||||
|
# loop through to build up the IDXT position starts
|
||||||
|
idxPositions = []
|
||||||
|
for j in range(entryCount):
|
||||||
|
pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j))
|
||||||
|
idxPositions.append(pos)
|
||||||
|
# The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
|
||||||
|
idxPositions.append(idxtPos)
|
||||||
|
# for each entry in the IDXT build up the tagMap and any associated text
|
||||||
|
for j in range(entryCount):
|
||||||
|
startPos = idxPositions[j]
|
||||||
|
endPos = idxPositions[j+1]
|
||||||
|
textLength = ord(data[startPos:startPos+1])
|
||||||
|
text = data[startPos+1:startPos+1+textLength]
|
||||||
|
if hordt2 is not None:
|
||||||
|
text = b''.join(bchr(hordt2[bord(x)]) for x in text)
|
||||||
|
tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos)
|
||||||
|
outtbl.append([text, tagMap])
|
||||||
|
if self.DEBUG:
|
||||||
|
print(tagMap)
|
||||||
|
print(text)
|
||||||
|
return outtbl, ctoc_text
|
||||||
|
|
||||||
|
def parseINDXHeader(self, data):
|
||||||
|
"read INDX header"
|
||||||
|
if not data[:4] == b'INDX':
|
||||||
|
print("Warning: index section is not INDX")
|
||||||
|
return False
|
||||||
|
words = (
|
||||||
|
'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
|
||||||
|
'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc'
|
||||||
|
)
|
||||||
|
num = len(words)
|
||||||
|
values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)])
|
||||||
|
header = {}
|
||||||
|
for n in range(num):
|
||||||
|
header[words[n]] = values[n]
|
||||||
|
|
||||||
|
ordt1 = None
|
||||||
|
ordt2 = None
|
||||||
|
|
||||||
|
ocnt, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4)
|
||||||
|
if header['code'] == 0xfdea or ocnt != 0 or oentries > 0:
|
||||||
|
# horribly hacked up ESP (sample) mobi books use two ORDT sections but never specify
|
||||||
|
# them in the proper place in the header. They seem to be codepage 65002 which seems
|
||||||
|
# to be some sort of strange EBCDIC utf-8 or 16 encoded strings
|
||||||
|
|
||||||
|
# so we need to look for them and store them away to process leading text
|
||||||
|
# ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
|
||||||
|
# we only ever seem to use the second one but ...
|
||||||
|
assert(ocnt == 1)
|
||||||
|
assert(data[op1:op1+4] == b'ORDT')
|
||||||
|
assert(data[op2:op2+4] == b'ORDT')
|
||||||
|
ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4)
|
||||||
|
ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4)
|
||||||
|
|
||||||
|
if self.DEBUG:
|
||||||
|
print("parsed INDX header:")
|
||||||
|
for n in words:
|
||||||
|
print(n, "%X" % header[n],)
|
||||||
|
print("")
|
||||||
|
return header, ordt1, ordt2
|
||||||
|
|
||||||
|
def readCTOC(self, txtdata):
|
||||||
|
# read all blocks from CTOC
|
||||||
|
ctoc_data = {}
|
||||||
|
offset = 0
|
||||||
|
while offset<len(txtdata):
|
||||||
|
if PY2:
|
||||||
|
if txtdata[offset] == b'\0':
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
if txtdata[offset] == 0:
|
||||||
|
break
|
||||||
|
idx_offs = offset
|
||||||
|
# first n bytes: name len as vwi
|
||||||
|
pos, ilen = getVariableWidthValue(txtdata, offset)
|
||||||
|
offset += pos
|
||||||
|
# <len> next bytes: name
|
||||||
|
name = txtdata[offset:offset+ilen]
|
||||||
|
offset += ilen
|
||||||
|
if self.DEBUG:
|
||||||
|
print("name length is ", ilen)
|
||||||
|
print(idx_offs, name)
|
||||||
|
ctoc_data[idx_offs] = name
|
||||||
|
return ctoc_data
|
||||||
|
|
||||||
|
|
||||||
|
def getVariableWidthValue(data, offset):
|
||||||
|
'''
|
||||||
|
Decode variable width value from given bytes.
|
||||||
|
|
||||||
|
@param data: The bytes to decode.
|
||||||
|
@param offset: The start offset into data.
|
||||||
|
@return: Tuple of consumed bytes count and decoded value.
|
||||||
|
'''
|
||||||
|
value = 0
|
||||||
|
consumed = 0
|
||||||
|
finished = False
|
||||||
|
while not finished:
|
||||||
|
v = data[offset + consumed: offset + consumed + 1]
|
||||||
|
consumed += 1
|
||||||
|
if ord(v) & 0x80:
|
||||||
|
finished = True
|
||||||
|
value = (value << 7) | (ord(v) & 0x7f)
|
||||||
|
return consumed, value
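# Worked example (bytes chosen for illustration): getVariableWidthValue(b'\x0b\x85', 0)
# consumes two bytes and returns (2, 1413), since (0x0b << 7) | 0x05 == 1413 and
# the high bit of 0x85 terminates the value.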
|
||||||
|
|
||||||
|
|
||||||
|
def readTagSection(start, data):
|
||||||
|
'''
|
||||||
|
Read tag section from given data.
|
||||||
|
|
||||||
|
@param start: The start position in the data.
|
||||||
|
@param data: The data to process.
|
||||||
|
@return: Tuple of control byte count and list of tag tuples.
|
||||||
|
'''
|
||||||
|
controlByteCount = 0
|
||||||
|
tags = []
|
||||||
|
if data[start:start+4] == b"TAGX":
|
||||||
|
firstEntryOffset, = struct.unpack_from(b'>L', data, start + 0x04)
|
||||||
|
controlByteCount, = struct.unpack_from(b'>L', data, start + 0x08)
|
||||||
|
|
||||||
|
# Skip the first 12 bytes already read above.
|
||||||
|
for i in range(12, firstEntryOffset, 4):
|
||||||
|
pos = start + i
|
||||||
|
tags.append((ord(data[pos:pos+1]), ord(data[pos+1:pos+2]), ord(data[pos+2:pos+3]), ord(data[pos+3:pos+4])))
|
||||||
|
return controlByteCount, tags
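# Worked example (entry chosen for illustration): a four-byte TAGX entry
# b'\x01\x01\x01\x00' is stored as (tag=1, valuesPerEntry=1, mask=0x01, endFlag=0),
# the order getTagMap below unpacks.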
|
||||||
|
|
||||||
|
|
||||||
|
def countSetBits(value, bits=8):
|
||||||
|
'''
|
||||||
|
Count the set bits in the given value.
|
||||||
|
|
||||||
|
@param value: Integer value.
|
||||||
|
@param bits: The number of bits of the input value (defaults to 8).
|
||||||
|
@return: Number of set bits.
|
||||||
|
'''
|
||||||
|
count = 0
|
||||||
|
for _ in range(bits):
|
||||||
|
if value & 0x01 == 0x01:
|
||||||
|
count += 1
|
||||||
|
value = value >> 1
|
||||||
|
return count
|
||||||
|
|
||||||
|
|
||||||
|
def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos):
|
||||||
|
'''
|
||||||
|
Create a map of tags and values from the given byte section.
|
||||||
|
|
||||||
|
@param controlByteCount: The number of control bytes.
|
||||||
|
@param tagTable: The tag table.
|
||||||
|
@param entryData: The data to process.
|
||||||
|
@param startPos: The starting position in entryData.
|
||||||
|
@param endPos: The end position in entryData or None if it is unknown.
|
||||||
|
@return: Hashmap of tag and list of values.
|
||||||
|
'''
|
||||||
|
tags = []
|
||||||
|
tagHashMap = {}
|
||||||
|
controlByteIndex = 0
|
||||||
|
dataStart = startPos + controlByteCount
|
||||||
|
|
||||||
|
for tag, valuesPerEntry, mask, endFlag in tagTable:
|
||||||
|
if endFlag == 0x01:
|
||||||
|
controlByteIndex += 1
|
||||||
|
continue
|
||||||
|
cbyte = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1])
|
||||||
|
if 0:
|
||||||
|
print("Control Byte Index %0x , Control Byte Value %0x" % (controlByteIndex, cbyte))
|
||||||
|
|
||||||
|
value = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1]) & mask
|
||||||
|
if value != 0:
|
||||||
|
if value == mask:
|
||||||
|
if countSetBits(mask) > 1:
|
||||||
|
# If all bits of masked value are set and the mask has more than one bit, a variable width value
|
||||||
|
# will follow after the control bytes which defines the length of bytes (NOT the value count!)
|
||||||
|
# which will contain the corresponding variable width values.
|
||||||
|
consumed, value = getVariableWidthValue(entryData, dataStart)
|
||||||
|
dataStart += consumed
|
||||||
|
tags.append((tag, None, value, valuesPerEntry))
|
||||||
|
else:
|
||||||
|
tags.append((tag, 1, None, valuesPerEntry))
|
||||||
|
else:
|
||||||
|
# Shift bits to get the masked value.
|
||||||
|
while mask & 0x01 == 0:
|
||||||
|
mask = mask >> 1
|
||||||
|
value = value >> 1
|
||||||
|
tags.append((tag, value, None, valuesPerEntry))
|
||||||
|
for tag, valueCount, valueBytes, valuesPerEntry in tags:
|
||||||
|
values = []
|
||||||
|
if valueCount is not None:
|
||||||
|
# Read valueCount * valuesPerEntry variable width values.
|
||||||
|
for _ in range(valueCount):
|
||||||
|
for _ in range(valuesPerEntry):
|
||||||
|
consumed, data = getVariableWidthValue(entryData, dataStart)
|
||||||
|
dataStart += consumed
|
||||||
|
values.append(data)
|
||||||
|
else:
|
||||||
|
# Convert valueBytes to variable width values.
|
||||||
|
totalConsumed = 0
|
||||||
|
while totalConsumed < valueBytes:
|
||||||
|
# Does this work for valuesPerEntry != 1?
|
||||||
|
consumed, data = getVariableWidthValue(entryData, dataStart)
|
||||||
|
dataStart += consumed
|
||||||
|
totalConsumed += consumed
|
||||||
|
values.append(data)
|
||||||
|
if totalConsumed != valueBytes:
|
||||||
|
print("Error: Should consume %s bytes, but consumed %s" % (valueBytes, totalConsumed))
|
||||||
|
tagHashMap[tag] = values
|
||||||
|
# Test that all bytes have been processed if endPos is given.
|
||||||
|
if endPos is not None and dataStart != endPos:
|
||||||
|
# The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
|
||||||
|
for char in entryData[dataStart:endPos]:
|
||||||
|
if bord(char) != 0:
|
||||||
|
print("Warning: There are unprocessed index bytes left: %s" % toHex(entryData[dataStart:endPos]))
|
||||||
|
if 0:
|
||||||
|
print("controlByteCount: %s" % controlByteCount)
|
||||||
|
print("tagTable: %s" % tagTable)
|
||||||
|
print("data: %s" % toHex(entryData[startPos:endPos]))
|
||||||
|
print("tagHashMap: %s" % tagHashMap)
|
||||||
|
break
|
||||||
|
|
||||||
|
return tagHashMap
|
494
KindleUnpack/mobi_k8proc.py
Normal file
@@ -0,0 +1,494 @@
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import PY2, bstr, utf8_str
|
||||||
|
|
||||||
|
if PY2:
|
||||||
|
range = xrange
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
import struct
|
||||||
|
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||||
|
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||||
|
|
||||||
|
import re
|
||||||
|
# note: re requires the pattern to be the exact same type as the data to be searched in python3
|
||||||
|
# but u"" is not allowed for the pattern itself only b""
|
||||||
|
|
||||||
|
from .mobi_index import MobiIndex
|
||||||
|
from .mobi_utils import fromBase32
|
||||||
|
from .unipath import pathof
|
||||||
|
|
||||||
|
_guide_types = [b'cover',b'title-page',b'toc',b'index',b'glossary',b'acknowledgements',
|
||||||
|
b'bibliography',b'colophon',b'copyright-page',b'dedication',
|
||||||
|
b'epigraph',b'foreward',b'loi',b'lot',b'notes',b'preface',b'text']
|
||||||
|
|
||||||
|
# locate beginning and ending positions of tag with specific aid attribute
|
||||||
|
def locate_beg_end_of_tag(ml, aid):
|
||||||
|
pattern = utf8_str(r'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % aid)
|
||||||
|
aid_pattern = re.compile(pattern,re.IGNORECASE)
|
||||||
|
for m in re.finditer(aid_pattern, ml):
|
||||||
|
plt = m.start()
|
||||||
|
pgt = ml.find(b'>',plt+1)
|
||||||
|
return plt, pgt
|
||||||
|
return 0, 0
|
||||||
|
|
||||||
|
|
||||||
|
# iterate over all tags in block in reverse order, i.e. last tag to first tag
|
||||||
|
def reverse_tag_iter(block):
|
||||||
|
end = len(block)
|
||||||
|
while True:
|
||||||
|
pgt = block.rfind(b'>', 0, end)
|
||||||
|
if pgt == -1:
|
||||||
|
break
|
||||||
|
plt = block.rfind(b'<', 0, pgt)
|
||||||
|
if plt == -1:
|
||||||
|
break
|
||||||
|
yield block[plt:pgt+1]
|
||||||
|
end = plt
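# Worked example (markup chosen for illustration): reverse_tag_iter(b'<p><b>x</b></p>')
# yields b'</p>', b'</b>', b'<b>' and b'<p>' in that order; text between tags is
# skipped.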
|
||||||
|
|
||||||
|
|
||||||
|
class K8Processor:
|
||||||
|
|
||||||
|
def __init__(self, mh, sect, files, debug=False):
|
||||||
|
self.sect = sect
|
||||||
|
self.files = files
|
||||||
|
self.mi = MobiIndex(sect)
|
||||||
|
self.mh = mh
|
||||||
|
self.skelidx = mh.skelidx
|
||||||
|
self.fragidx = mh.fragidx
|
||||||
|
self.guideidx = mh.guideidx
|
||||||
|
self.fdst = mh.fdst
|
||||||
|
self.flowmap = {}
|
||||||
|
self.flows = None
|
||||||
|
self.flowinfo = []
|
||||||
|
self.parts = None
|
||||||
|
self.partinfo = []
|
||||||
|
self.linked_aids = set()
|
||||||
|
self.fdsttbl= [0,0xffffffff]
|
||||||
|
self.DEBUG = debug
|
||||||
|
|
||||||
|
# read in and parse the FDST info which is very similar in format to the Palm DB section
|
||||||
|
# parsing except it provides offsets into rawML file and not the Palm DB file
|
||||||
|
# this is needed to split up the final css, svg, etc flow section
|
||||||
|
# that can exist at the end of the rawML file
|
||||||
|
if self.fdst != 0xffffffff:
|
||||||
|
header = self.sect.loadSection(self.fdst)
|
||||||
|
if header[0:4] == b"FDST":
|
||||||
|
num_sections, = struct.unpack_from(b'>L', header, 0x08)
|
||||||
|
self.fdsttbl = struct.unpack_from(bstr('>%dL' % (num_sections*2)), header, 12)[::2] + (mh.rawSize, )
|
||||||
|
sect.setsectiondescription(self.fdst,"KF8 FDST INDX")
|
||||||
|
if self.DEBUG:
|
||||||
|
print("\nFDST Section Map: %d sections" % num_sections)
|
||||||
|
for j in range(num_sections):
|
||||||
|
print("Section %d: 0x%08X - 0x%08X" % (j, self.fdsttbl[j],self.fdsttbl[j+1]))
|
||||||
|
else:
|
||||||
|
print("\nError: K8 Mobi with Missing FDST info")
|
||||||
|
|
||||||
|
# read/process skeleton index info to create the skeleton table
|
||||||
|
skeltbl = []
|
||||||
|
if self.skelidx != 0xffffffff:
|
||||||
|
# for i in range(2):
|
||||||
|
# fname = 'skel%04d.dat' % i
|
||||||
|
# data = self.sect.loadSection(self.skelidx + i)
|
||||||
|
# with open(pathof(fname), 'wb') as f:
|
||||||
|
# f.write(data)
|
||||||
|
outtbl, ctoc_text = self.mi.getIndexData(self.skelidx, "KF8 Skeleton")
|
||||||
|
fileptr = 0
|
||||||
|
for [text, tagMap] in outtbl:
|
||||||
|
# file number, skeleton name, fragtbl record count, start position, length
|
||||||
|
skeltbl.append([fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]])
|
||||||
|
fileptr += 1
|
||||||
|
self.skeltbl = skeltbl
|
||||||
|
if self.DEBUG:
|
||||||
|
print("\nSkel Table: %d entries" % len(self.skeltbl))
|
||||||
|
print("table: filenum, skeleton name, frag tbl record count, start position, length")
|
||||||
|
for j in range(len(self.skeltbl)):
|
||||||
|
print(self.skeltbl[j])
|
||||||
|
|
||||||
|
# read/process the fragment index to create the fragment table
|
||||||
|
fragtbl = []
|
||||||
|
if self.fragidx != 0xffffffff:
|
||||||
|
# for i in range(3):
|
||||||
|
# fname = 'frag%04d.dat' % i
|
||||||
|
# data = self.sect.loadSection(self.fragidx + i)
|
||||||
|
# with open(pathof(fname), 'wb') as f:
|
||||||
|
# f.write(data)
|
||||||
|
outtbl, ctoc_text = self.mi.getIndexData(self.fragidx, "KF8 Fragment")
|
||||||
|
for [text, tagMap] in outtbl:
|
||||||
|
# insert position, ctoc offset (aidtext), file number, sequence number, start position, length
|
||||||
|
ctocoffset = tagMap[2][0]
|
||||||
|
ctocdata = ctoc_text[ctocoffset]
|
||||||
|
fragtbl.append([int(text), ctocdata, tagMap[3][0], tagMap[4][0], tagMap[6][0], tagMap[6][1]])
|
||||||
|
self.fragtbl = fragtbl
|
||||||
|
if self.DEBUG:
|
||||||
|
print("\nFragment Table: %d entries" % len(self.fragtbl))
|
||||||
|
print("table: file position, link id text, file num, sequence number, start position, length")
|
||||||
|
for j in range(len(self.fragtbl)):
|
||||||
|
print(self.fragtbl[j])
|
||||||
|
|
||||||
|
# read / process guide index for guide elements of opf
|
||||||
|
guidetbl = []
|
||||||
|
if self.guideidx != 0xffffffff:
|
||||||
|
# for i in range(3):
|
||||||
|
# fname = 'guide%04d.dat' % i
|
||||||
|
# data = self.sect.loadSection(self.guideidx + i)
|
||||||
|
# with open(pathof(fname), 'wb') as f:
|
||||||
|
# f.write(data)
|
||||||
|
outtbl, ctoc_text = self.mi.getIndexData(self.guideidx, "KF8 Guide elements)")
|
||||||
|
for [text, tagMap] in outtbl:
|
||||||
|
# ref_type, ref_title, frag number
|
||||||
|
ctocoffset = tagMap[1][0]
|
||||||
|
ref_title = ctoc_text[ctocoffset]
|
||||||
|
ref_type = text
|
||||||
|
fileno = None
|
||||||
|
if 3 in tagMap:
|
||||||
|
fileno = tagMap[3][0]
|
||||||
|
if 6 in tagMap:
|
||||||
|
fileno = tagMap[6][0]
|
||||||
|
guidetbl.append([ref_type, ref_title, fileno])
|
||||||
|
self.guidetbl = guidetbl
|
||||||
|
if self.DEBUG:
|
||||||
|
print("\nGuide Table: %d entries" % len(self.guidetbl))
|
||||||
|
print("table: ref_type, ref_title, fragtbl entry number")
|
||||||
|
for j in range(len(self.guidetbl)):
|
||||||
|
print(self.guidetbl[j])
|
||||||
|
|
||||||
|
def buildParts(self, rawML):
|
||||||
|
# now split the rawML into its flow pieces
|
||||||
|
self.flows = []
|
||||||
|
for j in range(0, len(self.fdsttbl)-1):
|
||||||
|
start = self.fdsttbl[j]
|
||||||
|
end = self.fdsttbl[j+1]
|
||||||
|
self.flows.append(rawML[start:end])
|
||||||
|
|
||||||
|
# the first piece represents the xhtml text
|
||||||
|
text = self.flows[0]
|
||||||
|
self.flows[0] = b''
|
||||||
|
|
||||||
|
# walk the <skeleton> and fragment tables to build original source xhtml files
|
||||||
|
# *without* destroying any file position information needed for later href processing
|
||||||
|
# and create final list of file separation start: stop points and etc in partinfo
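# (sketch of the loop below: each skeleton is the shell of one output xhtml file;
#  its fragcnt fragments follow it contiguously in the raw text flow, and each one
#  is spliced back into the skeleton at insertpos (relative to the skeleton start),
#  while baseptr tracks how far into the raw text flow has been consumed so far)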
|
||||||
|
if self.DEBUG:
|
||||||
|
print("\nRebuilding flow piece 0: the main body of the ebook")
|
||||||
|
self.parts = []
|
||||||
|
self.partinfo = []
|
||||||
|
fragptr = 0
|
||||||
|
baseptr = 0
|
||||||
|
cnt = 0
|
||||||
|
for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl:
|
||||||
|
baseptr = skelpos + skellen
|
||||||
|
skeleton = text[skelpos: baseptr]
|
||||||
|
for i in range(fragcnt):
|
||||||
|
[insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[fragptr]
|
||||||
|
aidtext = idtext[12:-2]
|
||||||
|
if i == 0:
|
||||||
|
filename = 'part%04d.xhtml' % filenum
|
||||||
|
slice = text[baseptr: baseptr + length]
|
||||||
|
insertpos = insertpos - skelpos
|
||||||
|
head = skeleton[:insertpos]
|
||||||
|
tail = skeleton[insertpos:]
|
||||||
|
actual_inspos = insertpos
|
||||||
|
if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') < head.rfind(b'<')):
|
||||||
|
# There is an incomplete tag in either the head or tail.
|
||||||
|
# This can happen for some badly formed KF8 files
|
||||||
|
print('The fragment table for %s has incorrect insert position. Calculating manually.' % skelname)
|
||||||
|
bp, ep = locate_beg_end_of_tag(skeleton, aidtext)
|
||||||
|
if bp != ep:
|
||||||
|
actual_inspos = ep + 1 + startpos
|
||||||
|
if insertpos != actual_inspos:
|
||||||
|
print("fixed corrupt fragment table insert position", insertpos+skelpos, actual_inspos+skelpos)
|
||||||
|
insertpos = actual_inspos
|
||||||
|
self.fragtbl[fragptr][0] = actual_inspos + skelpos
|
||||||
|
skeleton = skeleton[0:insertpos] + slice + skeleton[insertpos:]
|
||||||
|
baseptr = baseptr + length
|
||||||
|
fragptr += 1
|
||||||
|
cnt += 1
|
||||||
|
self.parts.append(skeleton)
|
||||||
|
self.partinfo.append([skelnum, 'Text', filename, skelpos, baseptr, aidtext])
|
||||||
|
|
||||||
|
assembled_text = b''.join(self.parts)
|
||||||
|
if self.DEBUG:
|
||||||
|
outassembled = os.path.join(self.files.k8dir, 'assembled_text.dat')
|
||||||
|
with open(pathof(outassembled),'wb') as f:
|
||||||
|
f.write(assembled_text)
|
||||||
|
|
||||||
|
# The primary css style sheet is typically stored next, followed by any
# snippets of code that were previously inlined in the
# original xhtml but have been stripped out and placed here.
# This can include local CDATA snippets and svg sections.

# The problem is that for most browsers and ereaders, you cannot
# use <img src="imageXXXX.svg" /> to import an svg image that itself
# properly uses an <image/> tag to import some raster image - it
# should work according to the spec but does not for almost all browsers
# and ereaders, and it causes epub validation issues because those raster
# images are in the manifest but not in the xhtml text - since they are only
# referenced from an svg image

# So we need to check the remaining flow pieces to see if they are css
# or svg images. If svg images, we must check if they have an <image />
# and if so inline them into the xhtml text pieces.

# there may be other sorts of pieces stored here but until we see one
# in the wild to reverse engineer we won't be able to tell
|
||||||
|
self.flowinfo.append([None, None, None, None])
|
||||||
|
svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE)
|
||||||
|
image_tag_pattern = re.compile(br'''(<image[^>]*>)''', re.IGNORECASE)
|
||||||
|
for j in range(1,len(self.flows)):
|
||||||
|
flowpart = self.flows[j]
|
||||||
|
nstr = '%04d' % j
|
||||||
|
m = re.search(svg_tag_pattern, flowpart)
|
||||||
|
if m is not None:
|
||||||
|
# svg
|
||||||
|
ptype = b'svg'
|
||||||
|
start = m.start()
|
||||||
|
m2 = re.search(image_tag_pattern, flowpart)
|
||||||
|
if m2 is not None:
|
||||||
|
pformat = b'inline'
|
||||||
|
pdir = None
|
||||||
|
fname = None
|
||||||
|
# strip off anything before <svg if inlining
|
||||||
|
flowpart = flowpart[start:]
|
||||||
|
else:
|
||||||
|
pformat = b'file'
|
||||||
|
pdir = "Images"
|
||||||
|
fname = 'svgimg' + nstr + '.svg'
|
||||||
|
else:
|
||||||
|
# search for CDATA and if exists inline it
|
||||||
|
if flowpart.find(b'[CDATA[') >= 0:
|
||||||
|
ptype = b'css'
|
||||||
|
flowpart = b'<style type="text/css">\n' + flowpart + b'\n</style>\n'
|
||||||
|
pformat = b'inline'
|
||||||
|
pdir = None
|
||||||
|
fname = None
|
||||||
|
else:
|
||||||
|
# css - assume as standalone css file
|
||||||
|
ptype = b'css'
|
||||||
|
pformat = b'file'
|
||||||
|
pdir = "Styles"
|
||||||
|
fname = 'style' + nstr + '.css'
|
||||||
|
|
||||||
|
self.flows[j] = flowpart
|
||||||
|
self.flowinfo.append([ptype, pformat, pdir, fname])
|
||||||
|
|
||||||
|
if self.DEBUG:
|
||||||
|
print("\nFlow Map: %d entries" % len(self.flowinfo))
|
||||||
|
for fi in self.flowinfo:
|
||||||
|
print(fi)
|
||||||
|
print("\n")
|
||||||
|
|
||||||
|
print("\nXHTML File Part Position Information: %d entries" % len(self.partinfo))
|
||||||
|
for pi in self.partinfo:
|
||||||
|
print(pi)
|
||||||
|
|
||||||
|
if False: # self.Debug:
|
||||||
|
# dump all of the locations of the aid tags used in TEXT
|
||||||
|
# find id links only inside of tags
|
||||||
|
# inside any < > pair find all "aid=" attributes and return whatever is inside the quotes
|
||||||
|
# [^>]* means match any amount of chars except for '>' char
|
||||||
|
# [^'"] match any amount of chars except for the quote character
|
||||||
|
# \s* means match any amount of whitespace
|
||||||
|
print("\npositions of all aid= pieces")
|
||||||
|
id_pattern = re.compile(br'''<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>''',re.IGNORECASE)
|
||||||
|
for m in re.finditer(id_pattern, rawML):
|
||||||
|
[filename, partnum, start, end] = self.getFileInfo(m.start())
|
||||||
|
[seqnum, idtext] = self.getFragTblInfo(m.start())
|
||||||
|
value = fromBase32(m.group(1))
|
||||||
|
print(" aid: %s value: %d at: %d -> part: %d, start: %d, end: %d" % (m.group(1), value, m.start(), partnum, start, end))
|
||||||
|
print(" %s fragtbl entry %d" % (idtext, seqnum))
|
||||||
|
|
||||||
|
return
|
||||||
|
|
||||||
|
# get information fragment table entry by pos
|
||||||
|
def getFragTblInfo(self, pos):
|
||||||
|
for j in range(len(self.fragtbl)):
|
||||||
|
[insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[j]
|
||||||
|
if pos >= insertpos and pos < (insertpos + length):
|
||||||
|
# the "in: " and "before: " prefixes are only used in debug output
|
||||||
|
return seqnum, b'in: ' + idtext
|
||||||
|
if pos < insertpos:
|
||||||
|
return seqnum, b'before: ' + idtext
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
# get information about the part (file) that exists at pos in original rawML
|
||||||
|
def getFileInfo(self, pos):
|
||||||
|
for [partnum, pdir, filename, start, end, aidtext] in self.partinfo:
|
||||||
|
if pos >= start and pos < end:
|
||||||
|
return filename, partnum, start, end
|
||||||
|
return None, None, None, None
|
||||||
|
|
||||||
|
# accessor functions to properly protect the internal structure
|
||||||
|
def getNumberOfParts(self):
|
||||||
|
return len(self.parts)
|
||||||
|
|
||||||
|
def getPart(self,i):
|
||||||
|
if i >= 0 and i < len(self.parts):
|
||||||
|
return self.parts[i]
|
||||||
|
return None
|
||||||
|
|
||||||
|
def getPartInfo(self, i):
|
||||||
|
if i >= 0 and i < len(self.partinfo):
|
||||||
|
return self.partinfo[i]
|
||||||
|
return None
|
||||||
|
|
||||||
|
def getNumberOfFlows(self):
|
||||||
|
return len(self.flows)
|
||||||
|
|
||||||
|
def getFlow(self,i):
|
||||||
|
# note flows[0] is empty - it was all of the original text
|
||||||
|
if i > 0 and i < len(self.flows):
|
||||||
|
return self.flows[i]
|
||||||
|
return None
|
||||||
|
|
||||||
|
def getFlowInfo(self,i):
|
||||||
|
# note flowinfo[0] is empty - it was all of the original text
|
||||||
|
if i > 0 and i < len(self.flowinfo):
|
||||||
|
return self.flowinfo[i]
|
||||||
|
return None
|
||||||
|
|
||||||
|
def getIDTagByPosFid(self, posfid, offset):
|
||||||
|
# first convert kindle:pos:fid and offset info to position in file
|
||||||
|
# (fromBase32 can handle both string types on input)
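# e.g. for a link like kindle:pos:fid:XXXX:off:YYYYYYYYYY, XXXX base32-decodes to a
# row in fragtbl and YYYYYYYYYY to a byte offset within that fragment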
|
||||||
|
row = fromBase32(posfid)
|
||||||
|
off = fromBase32(offset)
|
||||||
|
[insertpos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[row]
|
||||||
|
pos = insertpos + off
|
||||||
|
fname, pn, skelpos, skelend = self.getFileInfo(pos)
|
||||||
|
if fname is None:
|
||||||
|
# pos does not exist
|
||||||
|
# default to skeleton pos instead
|
||||||
|
print("Link To Position", pos, "does not exist, retargeting to top of target")
|
||||||
|
pos = self.skeltbl[filenum][3]
|
||||||
|
fname, pn, skelpos, skelend = self.getFileInfo(pos)
|
||||||
|
# an existing "id=" or "name=" attribute must exist in the original xhtml, otherwise it would not have worked for linking.
# Amazon seems to have added its own additional "aid=" attributes inside tags, whose values seem to represent
# some position information encoded into a Base32 name.
# so find the closest "id=" before that position by actually searching in that file
|
||||||
|
idtext = self.getIDTag(pos)
|
||||||
|
return fname, idtext
|
||||||
|
|
||||||
|
def getIDTag(self, pos):
|
||||||
|
# find the first tag with a named anchor (name or id attribute) before pos
|
||||||
|
fname, pn, skelpos, skelend = self.getFileInfo(pos)
|
||||||
|
if pn is None and skelpos is None:
|
||||||
|
print("Error: getIDTag - no file contains ", pos)
|
||||||
|
textblock = self.parts[pn]
|
||||||
|
npos = pos - skelpos
|
||||||
|
# if npos is inside a tag then search all text before its end-of-tag marker
|
||||||
|
pgt = textblock.find(b'>',npos)
|
||||||
|
plt = textblock.find(b'<',npos)
|
||||||
|
if plt == npos or pgt < plt:
|
||||||
|
npos = pgt + 1
|
||||||
|
# find id and name attributes only inside of tags
|
||||||
|
# use a reverse tag search since that is faster
|
||||||
|
# inside any < > pair find "id=" and "name=" attributes return it
|
||||||
|
# [^>]* means match any amount of chars except for '>' char
|
||||||
|
# [^'"] match any amount of chars except for the quote character
|
||||||
|
# \s* means match any amount of whitespace
|
||||||
|
textblock = textblock[0:npos]
|
||||||
|
id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
|
||||||
|
name_pattern = re.compile(br'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
|
||||||
|
aid_pattern = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')
|
||||||
|
for tag in reverse_tag_iter(textblock):
|
||||||
|
# any ids in the body should default to top of file
|
||||||
|
if tag[0:6] == b'<body ':
|
||||||
|
return b''
|
||||||
|
if tag[0:6] != b'<meta ':
|
||||||
|
m = id_pattern.match(tag) or name_pattern.match(tag)
|
||||||
|
if m is not None:
|
||||||
|
return m.group(1)
|
||||||
|
m = aid_pattern.match(tag)
|
||||||
|
if m is not None:
|
||||||
|
self.linked_aids.add(m.group(1))
|
||||||
|
return b'aid-' + m.group(1)
|
||||||
|
return b''
|
||||||
|
|
||||||
|
# do we need to do deep copying
|
||||||
|
def setParts(self, parts):
|
||||||
|
assert(len(parts) == len(self.parts))
|
||||||
|
for i in range(len(parts)):
|
||||||
|
self.parts[i] = parts[i]
|
||||||
|
|
||||||
|
# do we need to do deep copying
|
||||||
|
def setFlows(self, flows):
|
||||||
|
assert(len(flows) == len(self.flows))
|
||||||
|
for i in range(len(flows)):
|
||||||
|
self.flows[i] = flows[i]
|
||||||
|
|
||||||
|
# get information about the part (file) that exists at pos in original rawML
|
||||||
|
def getSkelInfo(self, pos):
|
||||||
|
for [partnum, pdir, filename, start, end, aidtext] in self.partinfo:
|
||||||
|
if pos >= start and pos < end:
|
||||||
|
return [partnum, pdir, filename, start, end, aidtext]
|
||||||
|
return [None, None, None, None, None, None]
|
||||||
|
|
||||||
|
# fileno is actually a reference into fragtbl (a fragment)
|
||||||
|
def getGuideText(self):
|
||||||
|
guidetext = b''
|
||||||
|
for [ref_type, ref_title, fileno] in self.guidetbl:
|
||||||
|
if ref_type == b'thumbimagestandard':
|
||||||
|
continue
|
||||||
|
if ref_type not in _guide_types and not ref_type.startswith(b'other.'):
|
||||||
|
if ref_type == b'start':
|
||||||
|
ref_type = b'text'
|
||||||
|
else:
|
||||||
|
ref_type = b'other.' + ref_type
|
||||||
|
[pos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[fileno]
|
||||||
|
[pn, pdir, filename, skelpos, skelend, aidtext] = self.getSkelInfo(pos)
|
||||||
|
idtext = self.getIDTag(pos)
|
||||||
|
linktgt = filename.encode('utf-8')
|
||||||
|
if idtext != b'':
|
||||||
|
linktgt += b'#' + idtext
|
||||||
|
guidetext += b'<reference type="'+ref_type+b'" title="'+ref_title+b'" href="'+utf8_str(pdir)+b'/'+linktgt+b'" />\n'
|
||||||
|
# opf is encoded utf-8 so must convert any titles properly
|
||||||
|
guidetext = (guidetext.decode(self.mh.codec)).encode("utf-8")
|
||||||
|
return guidetext
|
||||||
|
|
||||||
|
def getPageIDTag(self, pos):
|
||||||
|
# find the first tag with a named anchor (name or id attribute) before pos
|
||||||
|
# but page map offsets need a little more leeway, so if the offset points
# into a tag look for the next ending tag "/>" or "</" and start the search from there.
|
||||||
|
fname, pn, skelpos, skelend = self.getFileInfo(pos)
|
||||||
|
if pn is None and skelpos is None:
|
||||||
|
print("Error: getIDTag - no file contains ", pos)
|
||||||
|
textblock = self.parts[pn]
|
||||||
|
npos = pos - skelpos
|
||||||
|
# if npos inside a tag then search all text before next ending tag
|
||||||
|
pgt = textblock.find(b'>',npos)
|
||||||
|
plt = textblock.find(b'<',npos)
|
||||||
|
if plt == npos or pgt < plt:
|
||||||
|
# we are in a tag
|
||||||
|
# so find first ending tag
|
||||||
|
pend1 = textblock.find(b'/>', npos)
|
||||||
|
pend2 = textblock.find(b'</', npos)
|
||||||
|
if pend1 != -1 and pend2 != -1:
|
||||||
|
pend = min(pend1, pend2)
|
||||||
|
else:
|
||||||
|
pend = max(pend1, pend2)
|
||||||
|
if pend != -1:
|
||||||
|
npos = pend
|
||||||
|
else:
|
||||||
|
npos = pgt + 1
|
||||||
|
# find id and name attributes only inside of tags
|
||||||
|
# use a reverse tag search since that is faster
|
||||||
|
# inside any < > pair find "id=" and "name=" attributes return it
|
||||||
|
# [^>]* means match any amount of chars except for '>' char
|
||||||
|
# [^'"] match any amount of chars except for the quote character
|
||||||
|
# \s* means match any amount of whitespace
|
||||||
|
textblock = textblock[0:npos]
|
||||||
|
id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
|
||||||
|
name_pattern = re.compile(br'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
|
||||||
|
for tag in reverse_tag_iter(textblock):
|
||||||
|
# any ids in the body should default to top of file
|
||||||
|
if tag[0:6] == b'<body ':
|
||||||
|
return b''
|
||||||
|
if tag[0:6] != b'<meta ':
|
||||||
|
m = id_pattern.match(tag) or name_pattern.match(tag)
|
||||||
|
if m is not None:
|
||||||
|
return m.group(1)
|
||||||
|
return b''
|
268
KindleUnpack/mobi_k8resc.py
Normal file
@@ -0,0 +1,268 @@
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
DEBUG_USE_ORDERED_DICTIONARY = False # OrderedDict is supported on python >= 2.7.
|
||||||
|
""" set to True to use OrderedDict for K8RESCProcessor.parsetag.tattr."""
|
||||||
|
|
||||||
|
if DEBUG_USE_ORDERED_DICTIONARY:
|
||||||
|
from collections import OrderedDict as dict_
|
||||||
|
else:
|
||||||
|
dict_ = dict
|
||||||
|
|
||||||
|
from .compatibility_utils import unicode_str
|
||||||
|
|
||||||
|
from .mobi_utils import fromBase32
|
||||||
|
|
||||||
|
_OPF_PARENT_TAGS = ['xml', 'package', 'metadata', 'dc-metadata',
|
||||||
|
'x-metadata', 'manifest', 'spine', 'tours', 'guide']
|
||||||
|
|
||||||
|
class K8RESCProcessor(object):
|
||||||
|
|
||||||
|
def __init__(self, data, debug=False):
|
||||||
|
self._debug = debug
|
||||||
|
self.resc = None
|
||||||
|
self.opos = 0
|
||||||
|
self.extrameta = []
|
||||||
|
self.cover_name = None
|
||||||
|
self.spine_idrefs = {}
|
||||||
|
self.spine_order = []
|
||||||
|
self.spine_pageattributes = {}
|
||||||
|
self.spine_ppd = None
|
||||||
|
# need3 indicates the book has fields which require epub3,
# but estimating the source epub version from those fields is difficult.
|
||||||
|
self.need3 = False
|
||||||
|
self.package_ver = None
|
||||||
|
self.extra_metadata = []
|
||||||
|
self.refines_metadata = []
|
||||||
|
self.extra_attributes = []
|
||||||
|
# get header
|
||||||
|
start_pos = data.find(b'<')
|
||||||
|
self.resc_header = data[:start_pos]
|
||||||
|
# get resc data length
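# (the bytes before the first '<' appear to look like b'...=SIZE&...' where SIZE is a
#  base32-encoded length of the XML payload - an assumption based on the parsing below)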
|
||||||
|
start = self.resc_header.find(b'=') + 1
|
||||||
|
end = self.resc_header.find(b'&', start)
|
||||||
|
resc_size = 0
|
||||||
|
if end > 0:
|
||||||
|
resc_size = fromBase32(self.resc_header[start:end])
|
||||||
|
resc_rawbytes = len(data) - start_pos
|
||||||
|
if resc_rawbytes == resc_size:
|
||||||
|
self.resc_length = resc_size
|
||||||
|
else:
|
||||||
|
# Most RESC sections have a nul terminator at the end, but some do not.
|
||||||
|
end_pos = data.find(b'\x00', start_pos)
|
||||||
|
if end_pos < 0:
|
||||||
|
self.resc_length = resc_rawbytes
|
||||||
|
else:
|
||||||
|
self.resc_length = end_pos - start_pos
|
||||||
|
if self.resc_length != resc_size:
|
||||||
|
print("Warning: RESC section length({:d}bytes) does not match its size({:d}bytes).".format(self.resc_length, resc_size))
|
||||||
|
# now parse RESC after converting it to unicode from utf-8
|
||||||
|
self.resc = unicode_str(data[start_pos:start_pos+self.resc_length])
|
||||||
|
self.parseData()
|
||||||
|
|
||||||
|
def prepend_to_spine(self, key, idref, linear, properties):
|
||||||
|
self.spine_order = [key] + self.spine_order
|
||||||
|
self.spine_idrefs[key] = idref
|
||||||
|
attributes = {}
|
||||||
|
if linear is not None:
|
||||||
|
attributes['linear'] = linear
|
||||||
|
if properties is not None:
|
||||||
|
attributes['properties'] = properties
|
||||||
|
self.spine_pageattributes[key] = attributes
|
||||||
|
|
||||||
|
# RESC tag iterator
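# yields (prefix, tname, tattr, tcontent) tuples, where prefix is the dot-joined chain
# of currently open parent tags (e.g. 'xml.package.spine.'), so parseData can tell
# where each itemref or metadata tag sits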
|
||||||
|
def resc_tag_iter(self):
|
||||||
|
tcontent = last_tattr = None
|
||||||
|
prefix = ['']
|
||||||
|
while True:
|
||||||
|
text, tag = self.parseresc()
|
||||||
|
if text is None and tag is None:
|
||||||
|
break
|
||||||
|
if text is not None:
|
||||||
|
tcontent = text.rstrip(' \r\n')
|
||||||
|
else: # we have a tag
|
||||||
|
ttype, tname, tattr = self.parsetag(tag)
|
||||||
|
if ttype == 'begin':
|
||||||
|
tcontent = None
|
||||||
|
prefix.append(tname + '.')
|
||||||
|
if tname in _OPF_PARENT_TAGS:
|
||||||
|
yield ''.join(prefix), tname, tattr, tcontent
|
||||||
|
else:
|
||||||
|
last_tattr = tattr
|
||||||
|
else: # single or end
|
||||||
|
if ttype == 'end':
|
||||||
|
prefix.pop()
|
||||||
|
tattr = last_tattr
|
||||||
|
last_tattr = None
|
||||||
|
if tname in _OPF_PARENT_TAGS:
|
||||||
|
tname += '-end'
|
||||||
|
yield ''.join(prefix), tname, tattr, tcontent
|
||||||
|
tcontent = None
|
||||||
|
|
||||||
|
# now parse the RESC to extract spine and extra metadata info
|
||||||
|
def parseData(self):
|
||||||
|
for prefix, tname, tattr, tcontent in self.resc_tag_iter():
|
||||||
|
if self._debug:
|
||||||
|
print(" Parsing RESC: ", prefix, tname, tattr, tcontent)
|
||||||
|
if tname == 'package':
|
||||||
|
self.package_ver = tattr.get('version', '2.0')
|
||||||
|
package_prefix = tattr.get('prefix','')
|
||||||
|
if self.package_ver.startswith('3') or package_prefix.startswith('rendition'):
|
||||||
|
self.need3 = True
|
||||||
|
if tname == 'spine':
|
||||||
|
self.spine_ppd = tattr.get('page-progression-direction', None)
|
||||||
|
if self.spine_ppd is not None and self.spine_ppd == 'rtl':
|
||||||
|
self.need3 = True
|
||||||
|
if tname == 'itemref':
|
||||||
|
skelid = tattr.pop('skelid', None)
|
||||||
|
if skelid is None and len(self.spine_order) == 0:
|
||||||
|
# assume it was the removed initial coverpage
|
||||||
|
skelid = 'coverpage'
|
||||||
|
tattr['linear'] = 'no'
|
||||||
|
self.spine_order.append(skelid)
|
||||||
|
idref = tattr.pop('idref', None)
|
||||||
|
if idref is not None:
|
||||||
|
idref = 'x_' + idref
|
||||||
|
self.spine_idrefs[skelid] = idref
|
||||||
|
if 'id' in tattr:
|
||||||
|
del tattr['id']
|
||||||
|
# tattr["id"] = 'x_' + tattr["id"]
|
||||||
|
if 'properties' in tattr:
|
||||||
|
self.need3 = True
|
||||||
|
self.spine_pageattributes[skelid] = tattr
|
||||||
|
if tname == 'meta' or tname.startswith('dc:'):
|
||||||
|
if 'refines' in tattr or 'property' in tattr:
|
||||||
|
self.need3 = True
|
||||||
|
if tattr.get('name','') == 'cover':
|
||||||
|
cover_name = tattr.get('content',None)
|
||||||
|
if cover_name is not None:
|
||||||
|
cover_name = 'x_' + cover_name
|
||||||
|
self.cover_name = cover_name
|
||||||
|
else:
|
||||||
|
self.extrameta.append([tname, tattr, tcontent])
|
||||||
|
|
||||||
|
# parse and return either leading text or the next tag
|
||||||
|
def parseresc(self):
|
||||||
|
p = self.opos
|
||||||
|
if p >= len(self.resc):
|
||||||
|
return None, None
|
||||||
|
if self.resc[p] != '<':
|
||||||
|
res = self.resc.find('<',p)
|
||||||
|
if res == -1 :
|
||||||
|
res = len(self.resc)
|
||||||
|
self.opos = res
|
||||||
|
return self.resc[p:res], None
|
||||||
|
# handle comment as a special case
|
||||||
|
if self.resc[p:p+4] == '<!--':
|
||||||
|
te = self.resc.find('-->',p+1)
|
||||||
|
if te != -1:
|
||||||
|
te = te+2
|
||||||
|
else:
|
||||||
|
te = self.resc.find('>',p+1)
|
||||||
|
ntb = self.resc.find('<',p+1)
|
||||||
|
if ntb != -1 and ntb < te:
|
||||||
|
self.opos = ntb
|
||||||
|
return self.resc[p:ntb], None
|
||||||
|
self.opos = te + 1
|
||||||
|
return None, self.resc[p:te+1]
|
||||||
|
|
||||||
|
# parse a tag and return (ttype, tname, tattr)
# ttype: tag type ('begin', 'end' or 'single')
# tname: tag name
# tattr: dictionary of tag attributes
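# e.g. parsetag('<itemref idref="item1" linear="no"/>') returns
#      ('single', 'itemref', {'idref': 'item1', 'linear': 'no'})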
|
||||||
|
def parsetag(self, s):
|
||||||
|
p = 1
|
||||||
|
tname = None
|
||||||
|
ttype = None
|
||||||
|
tattr = dict_()
|
||||||
|
while s[p:p+1] == ' ' :
|
||||||
|
p += 1
|
||||||
|
if s[p:p+1] == '/':
|
||||||
|
ttype = 'end'
|
||||||
|
p += 1
|
||||||
|
while s[p:p+1] == ' ' :
|
||||||
|
p += 1
|
||||||
|
b = p
|
||||||
|
while s[p:p+1] not in ('>', '/', ' ', '"', "'",'\r','\n') :
|
||||||
|
p += 1
|
||||||
|
tname=s[b:p].lower()
|
||||||
|
# some special cases
|
||||||
|
if tname == '?xml':
|
||||||
|
tname = 'xml'
|
||||||
|
if tname == '!--':
|
||||||
|
ttype = 'single'
|
||||||
|
comment = s[p:-3].strip()
|
||||||
|
tattr['comment'] = comment
|
||||||
|
if ttype is None:
|
||||||
|
# parse any attributes of begin or single tags
|
||||||
|
while s.find('=',p) != -1 :
|
||||||
|
while s[p:p+1] == ' ' :
|
||||||
|
p += 1
|
||||||
|
b = p
|
||||||
|
while s[p:p+1] != '=' :
|
||||||
|
p += 1
|
||||||
|
aname = s[b:p].lower()
|
||||||
|
aname = aname.rstrip(' ')
|
||||||
|
p += 1
|
||||||
|
while s[p:p+1] == ' ' :
|
||||||
|
p += 1
|
||||||
|
if s[p:p+1] in ('"', "'") :
|
||||||
|
p = p + 1
|
||||||
|
b = p
|
||||||
|
while s[p:p+1] not in ('"', "'"):
|
||||||
|
p += 1
|
||||||
|
val = s[b:p]
|
||||||
|
p += 1
|
||||||
|
else :
|
||||||
|
b = p
|
||||||
|
while s[p:p+1] not in ('>', '/', ' ') :
|
||||||
|
p += 1
|
||||||
|
val = s[b:p]
|
||||||
|
tattr[aname] = val
|
||||||
|
if ttype is None:
|
||||||
|
ttype = 'begin'
|
||||||
|
if s.find('/',p) >= 0:
|
||||||
|
ttype = 'single'
|
||||||
|
return ttype, tname, tattr
|
||||||
|
|
||||||
|
def taginfo_toxml(self, taginfo):
|
||||||
|
res = []
|
||||||
|
tname, tattr, tcontent = taginfo
|
||||||
|
res.append('<' + tname)
|
||||||
|
if tattr is not None:
|
||||||
|
for key in tattr:
|
||||||
|
res.append(' ' + key + '="'+tattr[key]+'"')
|
||||||
|
if tcontent is not None:
|
||||||
|
res.append('>' + tcontent + '</' + tname + '>\n')
|
||||||
|
else:
|
||||||
|
res.append('/>\n')
|
||||||
|
return "".join(res)
|
||||||
|
|
||||||
|
def hasSpine(self):
|
||||||
|
return len(self.spine_order) > 0
|
||||||
|
|
||||||
|
def needEPUB3(self):
|
||||||
|
return self.need3
|
||||||
|
|
||||||
|
def hasRefines(self):
|
||||||
|
for [tname, tattr, tcontent] in self.extrameta:
|
||||||
|
if 'refines' in tattr:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def createMetadata(self, epubver):
|
||||||
|
for taginfo in self.extrameta:
|
||||||
|
tname, tattr, tcontent = taginfo
|
||||||
|
if 'refines' in tattr:
|
||||||
|
if epubver == 'F' and 'property' in tattr:
|
||||||
|
attr = ' id="%s" opf:%s="%s"\n' % (tattr['refines'], tattr['property'], tcontent)
|
||||||
|
self.extra_attributes.append(attr)
|
||||||
|
else:
|
||||||
|
tag = self.taginfo_toxml(taginfo)
|
||||||
|
self.refines_metadata.append(tag)
|
||||||
|
else:
|
||||||
|
tag = self.taginfo_toxml(taginfo)
|
||||||
|
self.extra_metadata.append(tag)
|
186
KindleUnpack/mobi_nav.py
Normal file
@@ -0,0 +1,186 @@
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import unicode_str
|
||||||
|
import os
|
||||||
|
from .unipath import pathof
|
||||||
|
|
||||||
|
import re
|
||||||
|
# note: re requires the pattern to be the exact same type as the data to be searched in python3
# but u"" is not allowed for the pattern itself, only b""
|
||||||
|
|
||||||
|
DEBUG_NAV = False
|
||||||
|
|
||||||
|
FORCE_DEFAULT_TITLE = False
|
||||||
|
""" Set to True to force to use the default title. """
|
||||||
|
|
||||||
|
NAVIGATION_FINENAME = 'nav.xhtml'
|
||||||
|
""" The name for the navigation document. """
|
||||||
|
|
||||||
|
DEFAULT_TITLE = 'Navigation'
|
||||||
|
""" The default title for the navigation document. """
|
||||||
|
|
||||||
|
class NAVProcessor(object):
|
||||||
|
|
||||||
|
def __init__(self, files):
|
||||||
|
self.files = files
|
||||||
|
self.navname = NAVIGATION_FINENAME
|
||||||
|
|
||||||
|
def buildLandmarks(self, guidetext):
|
||||||
|
header = ''
|
||||||
|
header += ' <nav epub:type="landmarks" id="landmarks" hidden="">\n'
|
||||||
|
header += ' <h2>Guide</h2>\n'
|
||||||
|
header += ' <ol>\n'
|
||||||
|
element = ' <li><a epub:type="{:s}" href="{:s}">{:s}</a></li>\n'
|
||||||
|
footer = ''
|
||||||
|
footer += ' </ol>\n'
|
||||||
|
footer += ' </nav>\n'
|
||||||
|
|
||||||
|
type_map = {
|
||||||
|
'cover' : 'cover',
|
||||||
|
'title-page' : 'title-page',
|
||||||
|
# ?: 'frontmatter',
|
||||||
|
'text' : 'bodymatter',
|
||||||
|
# ?: 'backmatter',
|
||||||
|
'toc' : 'toc',
|
||||||
|
'loi' : 'loi',
|
||||||
|
'lot' : 'lot',
|
||||||
|
'preface' : 'preface',
|
||||||
|
'bibliography' : 'bibliography',
|
||||||
|
'index' : 'index',
|
||||||
|
'glossary' : 'glossary',
|
||||||
|
'acknowledgements' : 'acknowledgements',
|
||||||
|
'colophon' : None,
|
||||||
|
'copyright-page' : None,
|
||||||
|
'dedication' : None,
|
||||||
|
'epigraph' : None,
|
||||||
|
'foreword' : None,
|
||||||
|
'notes' : None
|
||||||
|
}
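# e.g. a guide entry <reference type="toc" title="Table of Contents"
# href="Text/part0000.xhtml#toc"/> becomes
# <li><a epub:type="toc" href="part0000.xhtml#toc">Table of Contents</a></li>
# (illustrative filenames; the href is re-rooted relative to the Text directory below)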
|
||||||
|
|
||||||
|
re_type = re.compile(r'\s+type\s*=\s*"(.*?)"', re.I)
|
||||||
|
re_title = re.compile(r'\s+title\s*=\s*"(.*?)"', re.I)
|
||||||
|
re_link = re.compile(r'\s+href\s*=\s*"(.*?)"', re.I)
|
||||||
|
dir_ = os.path.relpath(self.files.k8text, self.files.k8oebps).replace('\\', '/')
|
||||||
|
|
||||||
|
data = ''
|
||||||
|
references = re.findall(r'<reference\s+.*?>', unicode_str(guidetext), re.I)
|
||||||
|
for reference in references:
|
||||||
|
mo_type = re_type.search(reference)
|
||||||
|
mo_title = re_title.search(reference)
|
||||||
|
mo_link = re_link.search(reference)
|
||||||
|
if mo_type is not None:
|
||||||
|
type_ = type_map.get(mo_type.group(1), None)
|
||||||
|
else:
|
||||||
|
type_ = None
|
||||||
|
if mo_title is not None:
|
||||||
|
title = mo_title.group(1)
|
||||||
|
else:
|
||||||
|
title = None
|
||||||
|
if mo_link is not None:
|
||||||
|
link = mo_link.group(1)
|
||||||
|
else:
|
||||||
|
link = None
|
||||||
|
|
||||||
|
if type_ is not None and title is not None and link is not None:
|
||||||
|
link = os.path.relpath(link, dir_).replace('\\', '/')
|
||||||
|
data += element.format(type_, link, title)
|
||||||
|
if len(data) > 0:
|
||||||
|
return header + data + footer
|
||||||
|
else:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def buildTOC(self, indx_data):
|
||||||
|
header = ''
|
||||||
|
header += ' <nav epub:type="toc" id="toc">\n'
|
||||||
|
header += ' <h1>Table of contents</h1>\n'
|
||||||
|
footer = ' </nav>\n'
|
||||||
|
|
||||||
|
# recursive part
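# recursINDX emits one <ol> per heading level: entries whose 'hlvl' matches the
# current level become <li> items, and an entry with child indices (child1/childn)
# recurses one level deeper to build the nested list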
|
||||||
|
def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
|
||||||
|
if start>len(indx_data) or end>len(indx_data):
|
||||||
|
print("Warning (in buildTOC): missing INDX child entries", start, end, len(indx_data))
|
||||||
|
return ''
|
||||||
|
if DEBUG_NAV:
|
||||||
|
print("recursINDX (in buildTOC) lvl %d from %d to %d" % (lvl, start, end))
|
||||||
|
xhtml = ''
|
||||||
|
if start <= 0:
|
||||||
|
start = 0
|
||||||
|
if end <= 0:
|
||||||
|
end = len(indx_data)
|
||||||
|
if lvl > max_lvl:
|
||||||
|
max_lvl = lvl
|
||||||
|
|
||||||
|
indent1 = ' ' * (2 + lvl * 2)
|
||||||
|
indent2 = ' ' * (3 + lvl * 2)
|
||||||
|
xhtml += indent1 + '<ol>\n'
|
||||||
|
for i in range(start, end):
|
||||||
|
e = indx_data[i]
|
||||||
|
htmlfile = e['filename']
|
||||||
|
desttag = e['idtag']
|
||||||
|
text = e['text']
|
||||||
|
if not e['hlvl'] == lvl:
|
||||||
|
continue
|
||||||
|
num += 1
|
||||||
|
if desttag == '':
|
||||||
|
link = htmlfile
|
||||||
|
else:
|
||||||
|
link = '{:s}#{:s}'.format(htmlfile, desttag)
|
||||||
|
xhtml += indent2 + '<li>'
|
||||||
|
entry = '<a href="{:}">{:s}</a>'.format(link, text)
|
||||||
|
xhtml += entry
|
||||||
|
# recurse into children
|
||||||
|
if e['child1'] >= 0:
|
||||||
|
xhtml += '\n'
|
||||||
|
xhtmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
|
||||||
|
e['child1'], e['childn'] + 1)
|
||||||
|
xhtml += xhtmlrec
|
||||||
|
xhtml += indent2
|
||||||
|
# close entry
|
||||||
|
xhtml += '</li>\n'
|
||||||
|
xhtml += indent1 + '</ol>\n'
|
||||||
|
return xhtml, max_lvl, num
|
||||||
|
|
||||||
|
data, max_lvl, num = recursINDX()
|
||||||
|
if not len(indx_data) == num:
|
||||||
|
print("Warning (in buildTOC): different number of entries in NCX", len(indx_data), num)
|
||||||
|
return header + data + footer
|
||||||
|
|
||||||
|
def buildNAV(self, ncx_data, guidetext, title, lang):
|
||||||
|
print("Building Navigation Document.")
|
||||||
|
if FORCE_DEFAULT_TITLE:
|
||||||
|
title = DEFAULT_TITLE
|
||||||
|
nav_header = ''
|
||||||
|
nav_header += '<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE html>'
|
||||||
|
nav_header += '<html xmlns="http://www.w3.org/1999/xhtml"'
|
||||||
|
nav_header += ' xmlns:epub="http://www.idpf.org/2007/ops"'
|
||||||
|
nav_header += ' lang="{0:s}" xml:lang="{0:s}">\n'.format(lang)
|
||||||
|
nav_header += '<head>\n<title>{:s}</title>\n'.format(title)
|
||||||
|
nav_header += '<meta charset="UTF-8" />\n'
|
||||||
|
nav_header += '<style type="text/css">\n'
|
||||||
|
nav_header += 'nav#landmarks { display:none; }\n'
|
||||||
|
nav_header += '</style>\n</head>\n<body>\n'
|
||||||
|
nav_footer = '</body>\n</html>\n'
|
||||||
|
|
||||||
|
landmarks = self.buildLandmarks(guidetext)
|
||||||
|
toc = self.buildTOC(ncx_data)
|
||||||
|
|
||||||
|
data = nav_header
|
||||||
|
data += landmarks
|
||||||
|
data += toc
|
||||||
|
data += nav_footer
|
||||||
|
return data
|
||||||
|
|
||||||
|
def getNAVName(self):
|
||||||
|
return self.navname
|
||||||
|
|
||||||
|
def writeNAV(self, ncx_data, guidetext, metadata):
|
||||||
|
# build the xhtml
|
||||||
|
# print("Write Navigation Document.")
|
||||||
|
xhtml = self.buildNAV(ncx_data, guidetext, metadata.get('Title')[0], metadata.get('Language')[0])
|
||||||
|
fname = os.path.join(self.files.k8text, self.navname)
|
||||||
|
with open(pathof(fname), 'wb') as f:
|
||||||
|
f.write(xhtml.encode('utf-8'))
|
272
KindleUnpack/mobi_ncx.py
Normal file
@@ -0,0 +1,272 @@
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
import os
|
||||||
|
from .unipath import pathof
|
||||||
|
|
||||||
|
|
||||||
|
import re
|
||||||
|
# note: re requires the pattern to be the exact same type as the data to be searched in python3
# but u"" is not allowed for the pattern itself, only b""
|
||||||
|
|
||||||
|
from .mobi_utils import toBase32
|
||||||
|
from .mobi_index import MobiIndex
|
||||||
|
|
||||||
|
DEBUG_NCX = False
|
||||||
|
|
||||||
|
class ncxExtract:
|
||||||
|
|
||||||
|
def __init__(self, mh, files):
|
||||||
|
self.mh = mh
|
||||||
|
self.sect = self.mh.sect
|
||||||
|
self.files = files
|
||||||
|
self.isNCX = False
|
||||||
|
self.mi = MobiIndex(self.sect)
|
||||||
|
self.ncxidx = self.mh.ncxidx
|
||||||
|
self.indx_data = None
|
||||||
|
|
||||||
|
def parseNCX(self):
|
||||||
|
indx_data = []
|
||||||
|
tag_fieldname_map = {
|
||||||
|
1: ['pos',0],
|
||||||
|
2: ['len',0],
|
||||||
|
3: ['noffs',0],
|
||||||
|
4: ['hlvl',0],
|
||||||
|
5: ['koffs',0],
|
||||||
|
6: ['pos_fid',0],
|
||||||
|
21: ['parent',0],
|
||||||
|
22: ['child1',0],
|
||||||
|
23: ['childn',0]
|
||||||
|
}
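# (the numeric keys above are MOBI INDX tag ids: e.g. tag 3 is the ctoc offset of the
#  entry's display text, tag 5 the ctoc offset of its kind, tag 6 the kindle:pos:fid
#  target, and tags 21-23 the parent/first-child/last-child links used for nesting)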
|
||||||
|
if self.ncxidx != 0xffffffff:
|
||||||
|
outtbl, ctoc_text = self.mi.getIndexData(self.ncxidx, "NCX")
|
||||||
|
if DEBUG_NCX:
|
||||||
|
print(ctoc_text)
|
||||||
|
print(outtbl)
|
||||||
|
num = 0
|
||||||
|
for [text, tagMap] in outtbl:
|
||||||
|
tmp = {
|
||||||
|
'name': text.decode('utf-8'),
|
||||||
|
'pos': -1,
|
||||||
|
'len': 0,
|
||||||
|
'noffs': -1,
|
||||||
|
'text' : "Unknown Text",
|
||||||
|
'hlvl' : -1,
|
||||||
|
'kind' : "Unknown Kind",
|
||||||
|
'pos_fid' : None,
|
||||||
|
'parent' : -1,
|
||||||
|
'child1' : -1,
|
||||||
|
'childn' : -1,
|
||||||
|
'num' : num
|
||||||
|
}
|
||||||
|
for tag in tag_fieldname_map:
|
||||||
|
[fieldname, i] = tag_fieldname_map[tag]
|
||||||
|
if tag in tagMap:
|
||||||
|
fieldvalue = tagMap[tag][i]
|
||||||
|
if tag == 6:
|
||||||
|
pos_fid = toBase32(fieldvalue,4).decode('utf-8')
|
||||||
|
fieldvalue2 = tagMap[tag][i+1]
|
||||||
|
pos_off = toBase32(fieldvalue2,10).decode('utf-8')
|
||||||
|
fieldvalue = 'kindle:pos:fid:%s:off:%s' % (pos_fid, pos_off)
|
||||||
|
tmp[fieldname] = fieldvalue
|
||||||
|
if tag == 3:
|
||||||
|
toctext = ctoc_text.get(fieldvalue, 'Unknown Text')
|
||||||
|
toctext = toctext.decode(self.mh.codec)
|
||||||
|
tmp['text'] = toctext
|
||||||
|
if tag == 5:
|
||||||
|
kindtext = ctoc_text.get(fieldvalue, 'Unknown Kind')
|
||||||
|
kindtext = kindtext.decode(self.mh.codec)
|
||||||
|
tmp['kind'] = kindtext
|
||||||
|
indx_data.append(tmp)
|
||||||
|
if DEBUG_NCX:
|
||||||
|
print("record number: ", num)
|
||||||
|
print("name: ", tmp['name'],)
|
||||||
|
print("position", tmp['pos']," length: ", tmp['len'])
|
||||||
|
print("text: ", tmp['text'])
|
||||||
|
print("kind: ", tmp['kind'])
|
||||||
|
print("heading level: ", tmp['hlvl'])
|
||||||
|
print("parent:", tmp['parent'])
|
||||||
|
print("first child: ",tmp['child1']," last child: ", tmp['childn'])
|
||||||
|
print("pos_fid is ", tmp['pos_fid'])
|
||||||
|
print("\n\n")
|
||||||
|
num += 1
|
||||||
|
self.indx_data = indx_data
|
||||||
|
return indx_data
|
||||||
|
|
||||||
|
def buildNCX(self, htmlfile, title, ident, lang):
|
||||||
|
indx_data = self.indx_data
|
||||||
|
|
||||||
|
ncx_header = \
|
||||||
|
'''<?xml version='1.0' encoding='utf-8'?>
|
||||||
|
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="%s">
|
||||||
|
<head>
|
||||||
|
<meta content="%s" name="dtb:uid"/>
|
||||||
|
<meta content="%d" name="dtb:depth"/>
|
||||||
|
<meta content="mobiunpack.py" name="dtb:generator"/>
|
||||||
|
<meta content="0" name="dtb:totalPageCount"/>
|
||||||
|
<meta content="0" name="dtb:maxPageNumber"/>
|
||||||
|
</head>
|
||||||
|
<docTitle>
|
||||||
|
<text>%s</text>
|
||||||
|
</docTitle>
|
||||||
|
<navMap>
|
||||||
|
'''
|
||||||
|
|
||||||
|
ncx_footer = \
|
||||||
|
''' </navMap>
|
||||||
|
</ncx>
|
||||||
|
'''
|
||||||
|
|
||||||
|
ncx_entry = \
|
||||||
|
'''<navPoint id="%s" playOrder="%d">
|
||||||
|
<navLabel>
|
||||||
|
<text>%s</text>
|
||||||
|
</navLabel>
|
||||||
|
<content src="%s"/>'''
|
||||||
|
|
||||||
|
# recursive part
|
||||||
|
def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
|
||||||
|
if start>len(indx_data) or end>len(indx_data):
|
||||||
|
print("Warning: missing INDX child entries", start, end, len(indx_data))
|
||||||
|
return ''
|
||||||
|
if DEBUG_NCX:
|
||||||
|
print("recursINDX lvl %d from %d to %d" % (lvl, start, end))
|
||||||
|
xml = ''
|
||||||
|
if start <= 0:
|
||||||
|
start = 0
|
||||||
|
if end <= 0:
|
||||||
|
end = len(indx_data)
|
||||||
|
if lvl > max_lvl:
|
||||||
|
max_lvl = lvl
|
||||||
|
indent = ' ' * (2 + lvl)
|
||||||
|
|
||||||
|
for i in range(start, end):
|
||||||
|
e = indx_data[i]
|
||||||
|
if not e['hlvl'] == lvl:
|
||||||
|
continue
|
||||||
|
# open entry
|
||||||
|
num += 1
|
||||||
|
link = '%s#filepos%d' % (htmlfile, e['pos'])
|
||||||
|
tagid = 'np_%d' % num
|
||||||
|
entry = ncx_entry % (tagid, num, e['text'], link)
|
||||||
|
entry = re.sub(re.compile('^', re.M), indent, entry, 0)
|
||||||
|
xml += entry + '\n'
|
||||||
|
# recurse into children
|
||||||
|
if e['child1']>=0:
|
||||||
|
xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
|
||||||
|
e['child1'], e['childn'] + 1)
|
||||||
|
xml += xmlrec
|
||||||
|
# close entry
|
||||||
|
xml += indent + '</navPoint>\n'
|
||||||
|
return xml, max_lvl, num
|
||||||
|
|
||||||
|
body, max_lvl, num = recursINDX()
|
||||||
|
header = ncx_header % (lang, ident, max_lvl + 1, title)
|
||||||
|
ncx = header + body + ncx_footer
|
||||||
|
if not len(indx_data) == num:
|
||||||
|
print("Warning: different number of entries in NCX", len(indx_data), num)
|
||||||
|
return ncx
|
||||||
|
|
||||||
|
def writeNCX(self, metadata):
|
||||||
|
# build the xml
|
||||||
|
self.isNCX = True
|
||||||
|
print("Write ncx")
|
||||||
|
# htmlname = os.path.basename(self.files.outbase)
|
||||||
|
# htmlname += '.html'
|
||||||
|
htmlname = 'book.html'
|
||||||
|
xml = self.buildNCX(htmlname, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0])
|
||||||
|
# write the ncx file
|
||||||
|
# ncxname = os.path.join(self.files.mobi7dir, self.files.getInputFileBasename() + '.ncx')
|
||||||
|
ncxname = os.path.join(self.files.mobi7dir, 'toc.ncx')
|
||||||
|
with open(pathof(ncxname), 'wb') as f:
|
||||||
|
f.write(xml.encode('utf-8'))
|
||||||
|
|
||||||
|
def buildK8NCX(self, indx_data, title, ident, lang):
|
||||||
|
ncx_header = \
|
||||||
|
'''<?xml version='1.0' encoding='utf-8'?>
|
||||||
|
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="%s">
|
||||||
|
<head>
|
||||||
|
<meta content="%s" name="dtb:uid"/>
|
||||||
|
<meta content="%d" name="dtb:depth"/>
|
||||||
|
<meta content="mobiunpack.py" name="dtb:generator"/>
|
||||||
|
<meta content="0" name="dtb:totalPageCount"/>
|
||||||
|
<meta content="0" name="dtb:maxPageNumber"/>
|
||||||
|
</head>
|
||||||
|
<docTitle>
|
||||||
|
<text>%s</text>
|
||||||
|
</docTitle>
|
||||||
|
<navMap>
|
||||||
|
'''
|
||||||
|
|
||||||
|
ncx_footer = \
|
||||||
|
''' </navMap>
|
||||||
|
</ncx>
|
||||||
|
'''
|
||||||
|
|
||||||
|
ncx_entry = \
|
||||||
|
'''<navPoint id="%s" playOrder="%d">
|
||||||
|
<navLabel>
|
||||||
|
<text>%s</text>
|
||||||
|
</navLabel>
|
||||||
|
<content src="%s"/>'''
|
||||||
|
|
||||||
|
# recursive part
|
||||||
|
def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
|
||||||
|
if start>len(indx_data) or end>len(indx_data):
|
||||||
|
print("Warning: missing INDX child entries", start, end, len(indx_data))
|
||||||
|
return ''
|
||||||
|
if DEBUG_NCX:
|
||||||
|
print("recursINDX lvl %d from %d to %d" % (lvl, start, end))
|
||||||
|
xml = ''
|
||||||
|
if start <= 0:
|
||||||
|
start = 0
|
||||||
|
if end <= 0:
|
||||||
|
end = len(indx_data)
|
||||||
|
if lvl > max_lvl:
|
||||||
|
max_lvl = lvl
|
||||||
|
indent = ' ' * (2 + lvl)
|
||||||
|
|
||||||
|
for i in range(start, end):
|
||||||
|
e = indx_data[i]
|
||||||
|
htmlfile = e['filename']
|
||||||
|
desttag = e['idtag']
|
||||||
|
if not e['hlvl'] == lvl:
|
||||||
|
continue
|
||||||
|
# open entry
|
||||||
|
num += 1
|
||||||
|
if desttag == '':
|
||||||
|
link = 'Text/%s' % htmlfile
|
||||||
|
else:
|
||||||
|
link = 'Text/%s#%s' % (htmlfile, desttag)
|
||||||
|
tagid = 'np_%d' % num
|
||||||
|
entry = ncx_entry % (tagid, num, e['text'], link)
|
||||||
|
entry = re.sub(re.compile('^', re.M), indent, entry, 0)
|
||||||
|
xml += entry + '\n'
|
||||||
|
# recurse into children
|
||||||
|
if e['child1']>=0:
|
||||||
|
xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
|
||||||
|
e['child1'], e['childn'] + 1)
|
||||||
|
xml += xmlrec
|
||||||
|
# close entry
|
||||||
|
xml += indent + '</navPoint>\n'
|
||||||
|
return xml, max_lvl, num
|
||||||
|
|
||||||
|
body, max_lvl, num = recursINDX()
|
||||||
|
header = ncx_header % (lang, ident, max_lvl + 1, title)
|
||||||
|
ncx = header + body + ncx_footer
|
||||||
|
if not len(indx_data) == num:
|
||||||
|
print("Warning: different number of entries in NCX", len(indx_data), num)
|
||||||
|
return ncx
|
||||||
|
|
||||||
|
def writeK8NCX(self, ncx_data, metadata):
|
||||||
|
# build the xml
|
||||||
|
self.isNCX = True
|
||||||
|
print("Write K8 ncx")
|
||||||
|
xml = self.buildK8NCX(ncx_data, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0])
|
||||||
|
bname = 'toc.ncx'
|
||||||
|
ncxname = os.path.join(self.files.k8oebps,bname)
|
||||||
|
with open(pathof(ncxname), 'wb') as f:
|
||||||
|
f.write(xml.encode('utf-8'))
|
681
KindleUnpack/mobi_opf.py
Normal file
@@ -0,0 +1,681 @@
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import unicode_str, unescapeit
|
||||||
|
from .compatibility_utils import lzip
|
||||||
|
|
||||||
|
from .unipath import pathof
|
||||||
|
|
||||||
|
from xml.sax.saxutils import escape as xmlescape
|
||||||
|
|
||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# In EPUB3, NCX and <guide> MAY exist in OPF, although the NCX is superseded
|
||||||
|
# by the Navigation Document and the <guide> is deprecated. Currently, EPUB3_WITH_NCX
|
||||||
|
# and EPUB3_WITH_GUIDE are set to True due to compatibility with epub2 reading systems.
|
||||||
|
# They might be changed to default to False in the future.
|
||||||
|
|
||||||
|
EPUB3_WITH_NCX = True # Do not set to False except for debug.
|
||||||
|
""" Set to True to create a toc.ncx when converting to epub3. """
|
||||||
|
|
||||||
|
EPUB3_WITH_GUIDE = True # Do not set to False except for debug.
|
||||||
|
""" Set to True to create a guide element in an opf when converting to epub3. """
|
||||||
|
|
||||||
|
EPUB_OPF = 'content.opf'
|
||||||
|
""" The name for the OPF of EPUB. """
|
||||||
|
|
||||||
|
TOC_NCX = 'toc.ncx'
|
||||||
|
""" The name for the TOC of EPUB2. """
|
||||||
|
|
||||||
|
NAVIGATION_DOCUMENT = 'nav.xhtml'
|
||||||
|
""" The name for the navigation document of EPUB3. """
|
||||||
|
|
||||||
|
BEGIN_INFO_ONLY = '<!-- BEGIN INFORMATION ONLY '
|
||||||
|
""" The comment to indicate the beginning of metadata which will be ignored by kindlegen. """
|
||||||
|
|
||||||
|
END_INFO_ONLY = 'END INFORMATION ONLY -->'
|
||||||
|
""" The comment to indicate the end of metadata which will be ignored by kindlegen. """
|
||||||
|
|
||||||
|
EXTH_TITLE_FURIGANA = 'Title-Pronunciation'
|
||||||
|
""" The name for Title Furigana(similar to file-as) set by KDP. """
|
||||||
|
|
||||||
|
EXTH_CREATOR_FURIGANA = 'Author-Pronunciation'
|
||||||
|
""" The name for Creator Furigana(similar to file-as) set by KDP. """
|
||||||
|
|
||||||
|
EXTH_PUBLISHER_FURIGANA = 'Publisher-Pronunciation'
|
||||||
|
""" The name for Publisher Furigana(similar to file-as) set by KDP. """
|
||||||
|
|
||||||
|
EXTRA_ENTITIES = {'"': '"', "'": "'"}
|
||||||
|
|
||||||
|
class OPFProcessor(object):
|
||||||
|
|
||||||
|
def __init__(self, files, metadata, fileinfo, rscnames, hasNCX, mh, usedmap, pagemapxml='', guidetext='', k8resc=None, epubver='2'):
|
||||||
|
self.files = files
|
||||||
|
self.metadata = metadata
|
||||||
|
self.fileinfo = fileinfo
|
||||||
|
self.rscnames = rscnames
|
||||||
|
self.has_ncx = hasNCX
|
||||||
|
self.codec = mh.codec
|
||||||
|
self.isK8 = mh.isK8()
|
||||||
|
self.printReplica = mh.isPrintReplica()
|
||||||
|
self.guidetext = unicode_str(guidetext)
|
||||||
|
self.used = usedmap
|
||||||
|
self.k8resc = k8resc
|
||||||
|
self.covername = None
|
||||||
|
self.cover_id = 'cover_img'
|
||||||
|
if self.k8resc is not None and self.k8resc.cover_name is not None:
|
||||||
|
# update cover id info from RESC if available
|
||||||
|
self.cover_id = self.k8resc.cover_name
|
||||||
|
# Create a unique urn uuid
|
||||||
|
self.BookId = unicode_str(str(uuid.uuid4()))
|
||||||
|
self.pagemap = pagemapxml
|
||||||
|
|
||||||
|
self.ncxname = None
|
||||||
|
self.navname = None
|
||||||
|
|
||||||
|
# page-progression-direction is only set in spine
|
||||||
|
self.page_progression_direction = metadata.pop('page-progression-direction', [None])[0]
|
||||||
|
if 'rl' in metadata.get('primary-writing-mode', [''])[0]:
|
||||||
|
self.page_progression_direction = 'rtl'
|
||||||
|
self.epubver = epubver # the epub version set by user
|
||||||
|
self.target_epubver = epubver # the epub version set by the user or detected automatically
|
||||||
|
if self.epubver == 'A':
|
||||||
|
self.target_epubver = self.autodetectEPUBVersion()
|
||||||
|
elif self.epubver == 'F':
|
||||||
|
self.target_epubver = '2'
|
||||||
|
elif self.epubver != '2' and self.epubver != '3':
|
||||||
|
self.target_epubver = '2'
|
||||||
|
|
||||||
|
# ids for refines attributes
|
||||||
|
self.title_id = {}
|
||||||
|
self.creator_id = {}
|
||||||
|
self.publisher_id = {}
|
||||||
|
# extra attributes
|
||||||
|
self.title_attrib = {}
|
||||||
|
self.creator_attrib = {}
|
||||||
|
self.publisher_attrib = {}
|
||||||
|
self.extra_attributes = [] # for force epub2 option
|
||||||
|
# Create epub3 metadata from EXTH.
|
||||||
|
self.exth_solved_refines_metadata = []
|
||||||
|
self.exth_refines_metadata = []
|
||||||
|
self.exth_fixedlayout_metadata = []
|
||||||
|
|
||||||
|
self.defineRefinesID()
|
||||||
|
self.processRefinesMetadata()
|
||||||
|
if self.k8resc is not None:
|
||||||
|
# Create metadata in RESC section.
|
||||||
|
self.k8resc.createMetadata(epubver)
|
||||||
|
if self.target_epubver == "3":
|
||||||
|
self.createMetadataForFixedlayout()
|
||||||
|
|
||||||
|
def escapeit(self, sval, EXTRAS=None):
|
||||||
|
# note, xmlescape and unescape do not work with utf-8 bytestrings
|
||||||
|
sval = unicode_str(sval)
|
||||||
|
if EXTRAS:
|
||||||
|
res = xmlescape(unescapeit(sval), EXTRAS)
|
||||||
|
else:
|
||||||
|
res = xmlescape(unescapeit(sval))
|
||||||
|
return res
|
||||||
|
|
||||||
|
def createMetaTag(self, data, property, content, refid=''):
|
||||||
|
refines = ''
|
||||||
|
if refid:
|
||||||
|
refines = ' refines="#%s"' % refid
|
||||||
|
data.append('<meta property="%s"%s>%s</meta>\n' % (property, refines, content))
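# e.g. createMetaTag(data, 'dcterms:modified', '2014-01-01T00:00:00Z') appends
# '<meta property="dcterms:modified">2014-01-01T00:00:00Z</meta>\n'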
|
||||||
|
|
||||||
|
def buildOPFMetadata(self, start_tag, has_obfuscated_fonts=False):
|
||||||
|
# convert from EXTH metadata format to target epub version metadata
|
||||||
|
# epub 3 will ignore <meta name="xxxx" content="yyyy" /> style metatags
|
||||||
|
# but allows them to be present for backwards compatibility
|
||||||
|
# instead the new format is
|
||||||
|
# <meta property="xxxx" id="iiii" ... > property_value</meta>
|
||||||
|
# and DCMES elements such as:
|
||||||
|
# <dc:blah id="iiii">value</dc:blah>
|
||||||
|
|
||||||
|
metadata = self.metadata
|
||||||
|
k8resc = self.k8resc
|
||||||
|
|
||||||
|
META_TAGS = ['Drm Server Id', 'Drm Commerce Id', 'Drm Ebookbase Book Id', 'ASIN', 'ThumbOffset', 'Fake Cover',
|
||||||
|
'Creator Software', 'Creator Major Version', 'Creator Minor Version', 'Creator Build Number',
|
||||||
|
'Watermark', 'Clipping Limit', 'Publisher Limit', 'Text to Speech Disabled', 'CDE Type',
|
||||||
|
'Updated Title', 'Font Signature (hex)', 'Tamper Proof Keys (hex)',]
|
||||||
|
|
||||||
|
# def handleTag(data, metadata, key, tag, ids={}):
|
||||||
|
def handleTag(data, metadata, key, tag, attrib={}):
|
||||||
|
'''Format metadata values.
|
||||||
|
|
||||||
|
@param data: List of formatted metadata entries.
|
||||||
|
@param metadata: The metadata dictionary.
|
||||||
|
@param key: The key of the metadata value to handle.
|
||||||
|
@param tag: The opf tag corresponds to the metadata value.
|
||||||
|
###@param ids: The ids in tags for refines property of epub3.
|
||||||
|
@param attrib: The extra attributes for refines or opf prefixes.
|
||||||
|
'''
|
||||||
|
if key in metadata:
|
||||||
|
for i, value in enumerate(metadata[key]):
|
||||||
|
closingTag = tag.split(" ")[0]
|
||||||
|
res = '<%s%s>%s</%s>\n' % (tag, attrib.get(i, ''), self.escapeit(value), closingTag)
|
||||||
|
data.append(res)
|
||||||
|
del metadata[key]
|
||||||
|
|
||||||
|
# these are allowed but ignored by epub3
|
||||||
|
def handleMetaPairs(data, metadata, key, name):
|
||||||
|
if key in metadata:
|
||||||
|
for value in metadata[key]:
|
||||||
|
res = '<meta name="%s" content="%s" />\n' % (name, self.escapeit(value, EXTRA_ENTITIES))
|
||||||
|
data.append(res)
|
||||||
|
del metadata[key]
|
||||||
|
|
||||||
|
data = []
|
||||||
|
data.append(start_tag + '\n')
|
||||||
|
# Handle standard metadata
|
||||||
|
if 'Title' in metadata:
|
||||||
|
handleTag(data, metadata, 'Title', 'dc:title', self.title_attrib)
|
||||||
|
else:
|
||||||
|
data.append('<dc:title>Untitled</dc:title>\n')
|
||||||
|
handleTag(data, metadata, 'Language', 'dc:language')
|
||||||
|
if 'UniqueID' in metadata:
|
||||||
|
handleTag(data, metadata, 'UniqueID', 'dc:identifier id="uid"')
|
||||||
|
else:
|
||||||
|
# No unique ID in original, give it a generic one.
|
||||||
|
data.append('<dc:identifier id="uid">0</dc:identifier>\n')
|
||||||
|
|
||||||
|
if self.target_epubver == '3':
|
||||||
|
# epub version 3 minimal metadata requires a dcterms:modifed date tag
|
||||||
|
self.createMetaTag(data, 'dcterms:modified', datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"))
|
||||||
|
|
||||||
|
if self.isK8 and has_obfuscated_fonts:
|
||||||
|
# Use the randomly generated urn:uuid so obfuscated fonts work.
# It doesn't need to be _THE_ unique identifier to work as a key
# for obfuscated fonts in Sigil, ADE and calibre. It just has
# to use the opf:scheme="UUID" and have the urn:uuid: prefix.
|
||||||
|
if self.target_epubver == '3':
|
||||||
|
data.append('<dc:identifier>urn:uuid:'+self.BookId+'</dc:identifier>\n')
|
||||||
|
else:
|
||||||
|
data.append('<dc:identifier opf:scheme="UUID">urn:uuid:'+self.BookId+'</dc:identifier>\n')
|
||||||
|
|
||||||
|
handleTag(data, metadata, 'Creator', 'dc:creator', self.creator_attrib)
|
||||||
|
handleTag(data, metadata, 'Contributor', 'dc:contributor')
|
||||||
|
handleTag(data, metadata, 'Publisher', 'dc:publisher', self.publisher_attrib)
|
||||||
|
handleTag(data, metadata, 'Source', 'dc:source')
|
||||||
|
handleTag(data, metadata, 'Type', 'dc:type')
|
||||||
|
if self.target_epubver == '3':
|
||||||
|
if 'ISBN' in metadata:
|
||||||
|
for i, value in enumerate(metadata['ISBN']):
|
||||||
|
res = '<dc:identifier>urn:isbn:%s</dc:identifier>\n' % self.escapeit(value)
|
||||||
|
data.append(res)
|
||||||
|
else:
|
||||||
|
handleTag(data, metadata, 'ISBN', 'dc:identifier opf:scheme="ISBN"')
|
||||||
|
if 'Subject' in metadata:
|
||||||
|
if 'SubjectCode' in metadata:
|
||||||
|
codeList = metadata['SubjectCode']
|
||||||
|
del metadata['SubjectCode']
|
||||||
|
else:
|
||||||
|
codeList = None
|
||||||
|
for i in range(len(metadata['Subject'])):
|
||||||
|
if codeList and i < len(codeList):
|
||||||
|
data.append('<dc:subject BASICCode="'+codeList[i]+'">')
|
||||||
|
else:
|
||||||
|
data.append('<dc:subject>')
|
||||||
|
data.append(self.escapeit(metadata['Subject'][i])+'</dc:subject>\n')
|
||||||
|
del metadata['Subject']
|
||||||
|
handleTag(data, metadata, 'Description', 'dc:description')
|
||||||
|
if self.target_epubver == '3':
|
||||||
|
if 'Published' in metadata:
|
||||||
|
for i, value in enumerate(metadata['Published']):
|
||||||
|
res = '<dc:date>%s</dc:date>\n' % self.escapeit(value)
|
||||||
|
data.append(res)
|
||||||
|
else:
|
||||||
|
handleTag(data, metadata, 'Published', 'dc:date opf:event="publication"')
|
||||||
|
handleTag(data, metadata, 'Rights', 'dc:rights')
|
||||||
|
|
||||||
|
if self.epubver == 'F':
|
||||||
|
if self.extra_attributes or k8resc is not None and k8resc.extra_attributes:
|
||||||
|
data.append('<!-- THE FOLLOWINGS ARE REQUIRED TO INSERT INTO <dc:xxx> MANUALLY\n')
|
||||||
|
if self.extra_attributes:
|
||||||
|
data += self.extra_attributes
|
||||||
|
if k8resc is not None and k8resc.extra_attributes:
|
||||||
|
data += k8resc.extra_attributes
|
||||||
|
data.append('-->\n')
|
||||||
|
else:
|
||||||
|
# Append refines metadata.
|
||||||
|
if self.exth_solved_refines_metadata:
|
||||||
|
data.append('<!-- Refines MetaData from EXTH -->\n')
|
||||||
|
data += self.exth_solved_refines_metadata
|
||||||
|
if self.exth_refines_metadata or k8resc is not None and k8resc.refines_metadata:
|
||||||
|
data.append('<!-- THE FOLLOWING ARE REQUIRED TO EDIT IDS MANUALLY\n')
|
||||||
|
if self.exth_refines_metadata:
|
||||||
|
data += self.exth_refines_metadata
|
||||||
|
if k8resc is not None and k8resc.refines_metadata:
|
||||||
|
data += k8resc.refines_metadata
|
||||||
|
data.append('-->\n')
|
||||||
|
|
||||||
|
# Append metadata in RESC section.
|
||||||
|
if k8resc is not None and k8resc.extra_metadata:
|
||||||
|
data.append('<!-- Extra MetaData from RESC\n')
|
||||||
|
data += k8resc.extra_metadata
|
||||||
|
data.append('-->\n')
|
||||||
|
|
||||||
|
if 'CoverOffset' in metadata:
|
||||||
|
imageNumber = int(metadata['CoverOffset'][0])
|
||||||
|
self.covername = self.rscnames[imageNumber]
|
||||||
|
if self.covername is None:
|
||||||
|
print("Error: Cover image %s was not recognized as a valid image" % imageNumber)
|
||||||
|
else:
|
||||||
|
# <meta name="cover"> is obsoleted in EPUB3, but kindlegen v2.9 requires it.
|
||||||
|
data.append('<meta name="cover" content="' + self.cover_id + '" />\n')
|
||||||
|
self.used[self.covername] = 'used'
|
||||||
|
del metadata['CoverOffset']
|
||||||
|
|
||||||
|
handleMetaPairs(data, metadata, 'Codec', 'output encoding')
|
||||||
|
# handle kindlegen specific tags
|
||||||
|
handleTag(data, metadata, 'DictInLanguage', 'DictionaryInLanguage')
|
||||||
|
handleTag(data, metadata, 'DictOutLanguage', 'DictionaryOutLanguage')
|
||||||
|
handleMetaPairs(data, metadata, 'RegionMagnification', 'RegionMagnification')
|
||||||
|
handleMetaPairs(data, metadata, 'book-type', 'book-type')
|
||||||
|
handleMetaPairs(data, metadata, 'zero-gutter', 'zero-gutter')
|
||||||
|
handleMetaPairs(data, metadata, 'zero-margin', 'zero-margin')
|
||||||
|
handleMetaPairs(data, metadata, 'primary-writing-mode', 'primary-writing-mode')
|
||||||
|
handleMetaPairs(data, metadata, 'fixed-layout', 'fixed-layout')
|
||||||
|
handleMetaPairs(data, metadata, 'orientation-lock', 'orientation-lock')
|
||||||
|
handleMetaPairs(data, metadata, 'original-resolution', 'original-resolution')
|
||||||
|
|
||||||
|
# these are not allowed in epub2 or 3 so convert them to meta name content pairs
|
||||||
|
# perhaps these could better be mapped into the dcterms namespace instead
|
||||||
|
handleMetaPairs(data, metadata, 'Review', 'review')
|
||||||
|
handleMetaPairs(data, metadata, 'Imprint', 'imprint')
|
||||||
|
handleMetaPairs(data, metadata, 'Adult', 'adult')
|
||||||
|
handleMetaPairs(data, metadata, 'DictShortName', 'DictionaryVeryShortName')
|
||||||
|
|
||||||
|
# these are needed by kobo books upon submission but not sure if legal metadata in epub2 or epub3
|
||||||
|
if 'Price' in metadata and 'Currency' in metadata:
|
||||||
|
priceList = metadata['Price']
|
||||||
|
currencyList = metadata['Currency']
|
||||||
|
if len(priceList) != len(currencyList):
|
||||||
|
print("Error: found %s price entries, but %s currency entries.")
|
||||||
|
else:
|
||||||
|
for i in range(len(priceList)):
|
||||||
|
data.append('<SRP Currency="'+currencyList[i]+'">'+priceList[i]+'</SRP>\n')
|
||||||
|
del metadata['Price']
|
||||||
|
del metadata['Currency']
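# For illustration: a Price of "9.99" with Currency "USD" is emitted as <SRP Currency="USD">9.99</SRP>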
|
||||||
|
|
||||||
|
if self.target_epubver == '3':
|
||||||
|
# Append metadata for EPUB3.
|
||||||
|
if self.exth_fixedlayout_metadata:
|
||||||
|
data.append('<!-- EPUB3 MetaData converted from EXTH -->\n')
|
||||||
|
data += self.exth_fixedlayout_metadata
|
||||||
|
|
||||||
|
# all that remains is extra EXTH info we will store inside a comment inside meta name/content pairs
|
||||||
|
# so it can not impact anything and will be automatically stripped out if found again in a RESC section
|
||||||
|
data.append(BEGIN_INFO_ONLY + '\n')
|
||||||
|
if 'ThumbOffset' in metadata:
|
||||||
|
imageNumber = int(metadata['ThumbOffset'][0])
|
||||||
|
imageName = self.rscnames[imageNumber]
|
||||||
|
if imageName is None:
|
||||||
|
print("Error: Cover Thumbnail image %s was not recognized as a valid image" % imageNumber)
|
||||||
|
else:
|
||||||
|
data.append('<meta name="Cover ThumbNail Image" content="'+ 'Images/'+imageName+'" />\n')
|
||||||
|
# self.used[imageName] = 'used' # thumbnail image is always generated by Kindlegen, so don't include in manifest
|
||||||
|
self.used[imageName] = 'not used'
|
||||||
|
del metadata['ThumbOffset']
|
||||||
|
for metaName in META_TAGS:
|
||||||
|
if metaName in metadata:
|
||||||
|
for value in metadata[metaName]:
|
||||||
|
data.append('<meta name="'+metaName+'" content="'+self.escapeit(value, EXTRA_ENTITIES)+'" />\n')
|
||||||
|
del metadata[metaName]
|
||||||
|
for key in list(metadata.keys()):
|
||||||
|
for value in metadata[key]:
|
||||||
|
data.append('<meta name="'+key+'" content="'+self.escapeit(value, EXTRA_ENTITIES)+'" />\n')
|
||||||
|
del metadata[key]
|
||||||
|
data.append(END_INFO_ONLY + '\n')
|
||||||
|
data.append('</metadata>\n')
|
||||||
|
return data
|
||||||
|
|
||||||
|
def buildOPFManifest(self, ncxname, navname=None):
|
||||||
|
# buildManifest for mobi7, azw4, epub2 and epub3.
|
||||||
|
k8resc = self.k8resc
|
||||||
|
cover_id = self.cover_id
|
||||||
|
hasK8RescSpine = k8resc is not None and k8resc.hasSpine()
|
||||||
|
self.ncxname = ncxname
|
||||||
|
self.navname = navname
|
||||||
|
|
||||||
|
data = []
|
||||||
|
data.append('<manifest>\n')
|
||||||
|
media_map = {
|
||||||
|
'.jpg' : 'image/jpeg',
|
||||||
|
'.jpeg' : 'image/jpeg',
|
||||||
|
'.png' : 'image/png',
|
||||||
|
'.gif' : 'image/gif',
|
||||||
|
'.svg' : 'image/svg+xml',
|
||||||
|
'.xhtml': 'application/xhtml+xml',
|
||||||
|
'.html' : 'text/html', # for mobi7
|
||||||
|
'.pdf' : 'application/pdf', # for azw4(print replica textbook)
|
||||||
|
'.ttf' : 'application/x-font-ttf',
|
||||||
|
'.otf' : 'application/x-font-opentype', # replaced?
|
||||||
|
'.css' : 'text/css',
|
||||||
|
# '.html' : 'text/x-oeb1-document', # for mobi7
|
||||||
|
# '.otf' : 'application/vnd.ms-opentype', # [OpenType] OpenType fonts
|
||||||
|
# '.woff' : 'application/font-woff', # [WOFF] WOFF fonts
|
||||||
|
# '.smil' : 'application/smil+xml', # [MediaOverlays301] EPUB Media Overlay documents
|
||||||
|
# '.pls' : 'application/pls+xml', # [PLS] Text-to-Speech (TTS) Pronunciation lexicons
|
||||||
|
# '.mp3' : 'audio/mpeg',
|
||||||
|
# '.mp4' : 'video/mp4',
|
||||||
|
# '.js' : 'text/javascript', # not supported in K8
|
||||||
|
}
|
||||||
|
spinerefs = []
|
||||||
|
|
||||||
|
idcnt = 0
|
||||||
|
for [key,dir,fname] in self.fileinfo:
|
||||||
|
name, ext = os.path.splitext(fname)
|
||||||
|
ext = ext.lower()
|
||||||
|
media = media_map.get(ext)
|
||||||
|
ref = "item%d" % idcnt
|
||||||
|
if hasK8RescSpine:
|
||||||
|
if key is not None and key in k8resc.spine_idrefs:
|
||||||
|
ref = k8resc.spine_idrefs[key]
|
||||||
|
properties = ''
|
||||||
|
if dir != '':
|
||||||
|
fpath = dir + '/' + fname
|
||||||
|
else:
|
||||||
|
fpath = fname
|
||||||
|
data.append('<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(ref, media, fpath, properties))
|
||||||
|
|
||||||
|
if ext in ['.xhtml', '.html']:
|
||||||
|
spinerefs.append(ref)
|
||||||
|
idcnt += 1
|
||||||
|
|
||||||
|
for fname in self.rscnames:
|
||||||
|
if fname is not None:
|
||||||
|
if self.used.get(fname,'not used') == 'not used':
|
||||||
|
continue
|
||||||
|
name, ext = os.path.splitext(fname)
|
||||||
|
ext = ext.lower()
|
||||||
|
media = media_map.get(ext,ext[1:])
|
||||||
|
properties = ''
|
||||||
|
if fname == self.covername:
|
||||||
|
ref = cover_id
|
||||||
|
if self.target_epubver == '3':
|
||||||
|
properties = 'properties="cover-image"'
|
||||||
|
else:
|
||||||
|
ref = "item%d" % idcnt
|
||||||
|
if ext == '.ttf' or ext == '.otf':
|
||||||
|
if self.isK8: # fonts are only used in Mobi 8
|
||||||
|
fpath = 'Fonts/' + fname
|
||||||
|
data.append('<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(ref, media, fpath, properties))
|
||||||
|
else:
|
||||||
|
fpath = 'Images/' + fname
|
||||||
|
data.append('<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(ref, media, fpath, properties))
|
||||||
|
idcnt += 1
|
||||||
|
|
||||||
|
if self.target_epubver == '3' and navname is not None:
|
||||||
|
data.append('<item id="nav" media-type="application/xhtml+xml" href="Text/' + navname + '" properties="nav"/>\n')
|
||||||
|
if self.has_ncx and ncxname is not None:
|
||||||
|
data.append('<item id="ncx" media-type="application/x-dtbncx+xml" href="' + ncxname +'" />\n')
|
||||||
|
if self.pagemap != '':
|
||||||
|
data.append('<item id="map" media-type="application/oebs-page-map+xml" href="page-map.xml" />\n')
|
||||||
|
data.append('</manifest>\n')
|
||||||
|
return [data, spinerefs]
|
||||||
|
|
||||||
|
def buildOPFSpine(self, spinerefs, isNCX):
|
||||||
|
# build spine
|
||||||
|
k8resc = self.k8resc
|
||||||
|
hasK8RescSpine = k8resc is not None and k8resc.hasSpine()
|
||||||
|
data = []
|
||||||
|
ppd = ''
|
||||||
|
if self.isK8 and self.page_progression_direction is not None:
|
||||||
|
ppd = ' page-progression-direction="{:s}"'.format(self.page_progression_direction)
|
||||||
|
ncx = ''
|
||||||
|
if isNCX:
|
||||||
|
ncx = ' toc="ncx"'
|
||||||
|
map=''
|
||||||
|
if self.pagemap != '':
|
||||||
|
map = ' page-map="map"'
|
||||||
|
if self.epubver == 'F':
|
||||||
|
if ppd:
|
||||||
|
ppd = '<!--' + ppd + ' -->'
|
||||||
|
spine_start_tag = '<spine{1:s}{2:s}>{0:s}\n'.format(ppd, map, ncx)
|
||||||
|
else:
|
||||||
|
spine_start_tag = '<spine{0:s}{1:s}{2:s}>\n'.format(ppd, map, ncx)
|
||||||
|
data.append(spine_start_tag)
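# For illustration: an epub3 right-to-left book with an NCX produces
# <spine page-progression-direction="rtl" toc="ncx">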
|
||||||
|
|
||||||
|
if hasK8RescSpine:
|
||||||
|
for key in k8resc.spine_order:
|
||||||
|
idref = k8resc.spine_idrefs[key]
|
||||||
|
attribs = k8resc.spine_pageattributes[key]
|
||||||
|
tag = '<itemref idref="%s"' % idref
|
||||||
|
for aname, val in list(attribs.items()):
|
||||||
|
if self.epubver == 'F' and aname == 'properties':
|
||||||
|
continue
|
||||||
|
if val is not None:
|
||||||
|
tag += ' %s="%s"' % (aname, val)
|
||||||
|
tag += '/>'
|
||||||
|
if self.epubver == 'F' and 'properties' in attribs:
|
||||||
|
val = attribs['properties']
|
||||||
|
if val is not None:
|
||||||
|
tag += '<!-- properties="%s" -->' % val
|
||||||
|
tag += '\n'
|
||||||
|
data.append(tag)
|
||||||
|
else:
|
||||||
|
start = 0
|
||||||
|
# special case the created coverpage if need be
|
||||||
|
[key, dir, fname] = self.fileinfo[0]
|
||||||
|
if key is not None and key == "coverpage":
|
||||||
|
entry = spinerefs[start]
|
||||||
|
data.append('<itemref idref="%s" linear="no"/>\n' % entry)
|
||||||
|
start += 1
|
||||||
|
for entry in spinerefs[start:]:
|
||||||
|
data.append('<itemref idref="' + entry + '"/>\n')
|
||||||
|
data.append('</spine>\n')
|
||||||
|
return data
|
||||||
|
|
||||||
|
def buildMobi7OPF(self):
|
||||||
|
# Build an OPF for mobi7 and azw4.
|
||||||
|
print("Building an opf for mobi7/azw4.")
|
||||||
|
data = []
|
||||||
|
data.append('<?xml version="1.0" encoding="utf-8"?>\n')
|
||||||
|
data.append('<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="uid">\n')
|
||||||
|
metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">'
|
||||||
|
opf_metadata = self.buildOPFMetadata(metadata_tag)
|
||||||
|
data += opf_metadata
|
||||||
|
if self.has_ncx:
|
||||||
|
# ncxname = self.files.getInputFileBasename() + '.ncx'
|
||||||
|
ncxname = 'toc.ncx'
|
||||||
|
else:
|
||||||
|
ncxname = None
|
||||||
|
[opf_manifest, spinerefs] = self.buildOPFManifest(ncxname)
|
||||||
|
data += opf_manifest
|
||||||
|
opf_spine = self.buildOPFSpine(spinerefs, self.has_ncx)
|
||||||
|
data += opf_spine
|
||||||
|
data.append('<tours>\n</tours>\n')
|
||||||
|
if not self.printReplica:
|
||||||
|
guide ='<guide>\n' + self.guidetext + '</guide>\n'
|
||||||
|
data.append(guide)
|
||||||
|
data.append('</package>\n')
|
||||||
|
return ''.join(data)
|
||||||
|
|
||||||
|
def buildEPUBOPF(self, has_obfuscated_fonts=False):
|
||||||
|
print("Building an opf for mobi8 using epub version: ", self.target_epubver)
|
||||||
|
if self.target_epubver == '2':
|
||||||
|
has_ncx = self.has_ncx
|
||||||
|
has_guide = True
|
||||||
|
ncxname = None
|
||||||
|
ncxname = TOC_NCX
|
||||||
|
navname = None
|
||||||
|
package = '<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="uid">\n'
|
||||||
|
tours = '<tours>\n</tours>\n'
|
||||||
|
metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">'
|
||||||
|
else:
|
||||||
|
has_ncx = EPUB3_WITH_NCX
|
||||||
|
has_guide = EPUB3_WITH_GUIDE
|
||||||
|
ncxname = None
|
||||||
|
if has_ncx:
|
||||||
|
ncxname = TOC_NCX
|
||||||
|
navname = NAVIGATION_DOCUMENT
|
||||||
|
package = '<package version="3.0" xmlns="http://www.idpf.org/2007/opf" prefix="rendition: http://www.idpf.org/vocab/rendition/#" unique-identifier="uid">\n'
|
||||||
|
tours = ''
|
||||||
|
metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">'
|
||||||
|
|
||||||
|
data = []
|
||||||
|
data.append('<?xml version="1.0" encoding="utf-8"?>\n')
|
||||||
|
data.append(package)
|
||||||
|
opf_metadata = self.buildOPFMetadata(metadata_tag, has_obfuscated_fonts)
|
||||||
|
data += opf_metadata
|
||||||
|
[opf_manifest, spinerefs] = self.buildOPFManifest(ncxname, navname)
|
||||||
|
data += opf_manifest
|
||||||
|
opf_spine = self.buildOPFSpine(spinerefs, has_ncx)
|
||||||
|
data += opf_spine
|
||||||
|
data.append(tours)
|
||||||
|
if has_guide:
|
||||||
|
guide ='<guide>\n' + self.guidetext + '</guide>\n'
|
||||||
|
data.append(guide)
|
||||||
|
data.append('</package>\n')
|
||||||
|
return ''.join(data)
|
||||||
|
|
||||||
|
def writeOPF(self, has_obfuscated_fonts=False):
|
||||||
|
if self.isK8:
|
||||||
|
data = self.buildEPUBOPF(has_obfuscated_fonts)
|
||||||
|
outopf = os.path.join(self.files.k8oebps, EPUB_OPF)
|
||||||
|
with open(pathof(outopf), 'wb') as f:
|
||||||
|
f.write(data.encode('utf-8'))
|
||||||
|
return self.BookId
|
||||||
|
else:
|
||||||
|
data = self.buildMobi7OPF()
|
||||||
|
outopf = os.path.join(self.files.mobi7dir, 'content.opf')
|
||||||
|
with open(pathof(outopf), 'wb') as f:
|
||||||
|
f.write(data.encode('utf-8'))
|
||||||
|
return 0
|
||||||
|
|
||||||
|
def getBookId(self):
|
||||||
|
return self.BookId
|
||||||
|
|
||||||
|
def getNCXName(self):
|
||||||
|
return self.ncxname
|
||||||
|
|
||||||
|
def getNAVName(self):
|
||||||
|
return self.navname
|
||||||
|
|
||||||
|
def getEPUBVersion(self):
|
||||||
|
return self.target_epubver
|
||||||
|
|
||||||
|
def hasNCX(self):
|
||||||
|
return self.ncxname is not None and self.has_ncx
|
||||||
|
|
||||||
|
def hasNAV(self):
|
||||||
|
return self.navname is not None
|
||||||
|
|
||||||
|
def autodetectEPUBVersion(self):
|
||||||
|
# Determine EPUB version from metadata and RESC.
|
||||||
|
metadata = self.metadata
|
||||||
|
k8resc = self.k8resc
|
||||||
|
epubver = '2'
|
||||||
|
if 'true' == metadata.get('fixed-layout', [''])[0].lower():
|
||||||
|
epubver = '3'
|
||||||
|
elif metadata.get('orientation-lock', [''])[0].lower() in ['portrait', 'landscape']:
|
||||||
|
epubver = '3'
|
||||||
|
elif self.page_progression_direction == 'rtl':
|
||||||
|
epubver = '3'
|
||||||
|
elif EXTH_TITLE_FURIGANA in metadata:
|
||||||
|
epubver = '3'
|
||||||
|
elif EXTH_CREATOR_FURIGANA in metadata:
|
||||||
|
epubver = '3'
|
||||||
|
elif EXTH_PUBLISHER_FURIGANA in metadata:
|
||||||
|
epubver = '3'
|
||||||
|
elif k8resc is not None and k8resc.needEPUB3():
|
||||||
|
epubver = '3'
|
||||||
|
return epubver
|
||||||
|
|
||||||
|
def defineRefinesID(self):
|
||||||
|
# the following EXTH are set by KDP.
|
||||||
|
# 'Title_Furigana_(508)'
|
||||||
|
# 'Creator_Furigana_(517)',
|
||||||
|
# 'Publisher_Furigana_(522)'
|
||||||
|
# It is difficult to find correspondence between Title, Creator, Publisher
|
||||||
|
# and EXTH 508, 517 and 522 if they have more than two values, since KDP seems not to preserve the order of EXTH 508, 517 and 522.
|
||||||
|
# It is also difficult to find correspondence between them and tags which have refine attributes in RESC.
|
||||||
|
# So editing manually is required.
|
||||||
|
metadata = self.metadata
|
||||||
|
|
||||||
|
needRefinesId = False
|
||||||
|
if self.k8resc is not None:
|
||||||
|
needRefinesId = self.k8resc.hasRefines()
|
||||||
|
# Create ids for refine attributes
|
||||||
|
if (needRefinesId or EXTH_TITLE_FURIGANA in metadata) and 'Title' in metadata:
|
||||||
|
for i in range(len(metadata.get('Title'))):
|
||||||
|
self.title_id[i] = 'title%02d' % (i+1)
|
||||||
|
|
||||||
|
if (needRefinesId or EXTH_CREATOR_FURIGANA in metadata) and 'Creator' in metadata:
|
||||||
|
for i in range(len(metadata.get('Creator'))):
|
||||||
|
self.creator_id[i] = 'creator%02d' % (i+1)
|
||||||
|
|
||||||
|
if (needRefinesId or EXTH_PUBLISHER_FURIGANA in metadata) and 'Publisher' in metadata:
|
||||||
|
for i in range(len(metadata.get('Publisher'))):
|
||||||
|
self.publisher_id[i] = 'publisher%02d' % (i+1)
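# For illustration: two Creator entries get the ids creator01 and creator02 (and similarly for
# Title and Publisher); the refines metadata built in processRefinesMetadata() below refers back to them.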
|
||||||
|
|
||||||
|
def processRefinesMetadata(self):
|
||||||
|
# create refines metadata defined in epub3 or convert refines property to opf: attributes for epub2.
|
||||||
|
metadata = self.metadata
|
||||||
|
|
||||||
|
refines_list = [
|
||||||
|
[EXTH_TITLE_FURIGANA, self.title_id, self.title_attrib, 'title00'],
|
||||||
|
[EXTH_CREATOR_FURIGANA, self.creator_id, self.creator_attrib, 'creator00'],
|
||||||
|
[EXTH_PUBLISHER_FURIGANA, self.publisher_id, self.publisher_attrib, 'publisher00']
|
||||||
|
]
|
||||||
|
|
||||||
|
create_refines_metadata = False
|
||||||
|
for EXTH in lzip(*refines_list)[0]:
|
||||||
|
if EXTH in metadata:
|
||||||
|
create_refines_metadata = True
|
||||||
|
break
|
||||||
|
if create_refines_metadata:
|
||||||
|
for [EXTH, id, attrib, defaultid] in refines_list:
|
||||||
|
if self.target_epubver == '3':
|
||||||
|
for i, value in list(id.items()):
|
||||||
|
attrib[i] = ' id="%s"' % value
|
||||||
|
|
||||||
|
if EXTH in metadata:
|
||||||
|
if len(metadata[EXTH]) == 1 and len(id) == 1:
|
||||||
|
self.createMetaTag(self.exth_solved_refines_metadata, 'file-as', metadata[EXTH][0], id[0])
|
||||||
|
else:
|
||||||
|
for i, value in enumerate(metadata[EXTH]):
|
||||||
|
self.createMetaTag(self.exth_refines_metadata, 'file-as', value, id.get(i, defaultid))
|
||||||
|
else:
|
||||||
|
if EXTH in metadata:
|
||||||
|
if len(metadata[EXTH]) == 1 and len(id) == 1:
|
||||||
|
attr = ' opf:file-as="%s"' % metadata[EXTH][0]
|
||||||
|
attrib[0] = attr
|
||||||
|
else:
|
||||||
|
for i, value in enumerate(metadata[EXTH]):
|
||||||
|
attr = ' id="#%s" opf:file-as="%s"\n' % (id.get(i, defaultid), value)
|
||||||
|
self.extra_attributes.append(attr)
|
||||||
|
|
||||||
|
def createMetadataForFixedlayout(self):
|
||||||
|
# convert fixed layout to epub3 format if needed.
|
||||||
|
metadata = self.metadata
|
||||||
|
|
||||||
|
if 'fixed-layout' in metadata:
|
||||||
|
fixedlayout = metadata['fixed-layout'][0]
|
||||||
|
content = {'true' : 'pre-paginated'}.get(fixedlayout.lower(), 'reflowable')
|
||||||
|
self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:layout', content)
|
||||||
|
|
||||||
|
if 'orientation-lock' in metadata:
|
||||||
|
content = metadata['orientation-lock'][0].lower()
|
||||||
|
if content == 'portrait' or content == 'landscape':
|
||||||
|
self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:orientation', content)
|
||||||
|
|
||||||
|
# according to epub3 spec about correspondence with Amazon
|
||||||
|
# if 'original-resolution' is provided it needs to be converted to
|
||||||
|
# meta viewport property tag stored in the <head></head> of **each**
|
||||||
|
# xhtml page - so this tag would need to be handled by editing each part
|
||||||
|
# before reaching this routine
|
||||||
|
# we need to add support for this to the k8html routine
|
||||||
|
# if 'original-resolution' in metadata.keys():
|
||||||
|
# resolution = metadata['original-resolution'][0].lower()
|
||||||
|
# width, height = resolution.split('x')
|
||||||
|
# if width.isdigit() and int(width) > 0 and height.isdigit() and int(height) > 0:
|
||||||
|
# viewport = 'width=%s, height=%s' % (width, height)
|
||||||
|
# self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:viewport', viewport)
|
158
KindleUnpack/mobi_pagemap.py
Normal file
@@ -0,0 +1,158 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import PY2, unicode_str
|
||||||
|
|
||||||
|
if PY2:
|
||||||
|
range = xrange
|
||||||
|
|
||||||
|
import struct
|
||||||
|
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||||
|
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||||
|
|
||||||
|
import re
|
||||||
|
# note: re requires the pattern to be the exact same type as the data to be searched in python3
|
||||||
|
# but u"" is not allowed for the pattern itself only b""
|
||||||
|
|
||||||
|
|
||||||
|
_TABLE = [('m', 1000), ('cm', 900), ('d', 500), ('cd', 400), ('c', 100), ('xc', 90), ('l', 50), ('xl', 40), ('x', 10), ('ix', 9), ('v', 5), ('iv', 4), ('i', 1)]
|
||||||
|
|
||||||
|
def int_to_roman(i):
|
||||||
|
parts = []
|
||||||
|
num = i
|
||||||
|
for letter, value in _TABLE:
|
||||||
|
while value <= num:
|
||||||
|
num -= value
|
||||||
|
parts.append(letter)
|
||||||
|
return ''.join(parts)
|
||||||
|
|
||||||
|
def roman_to_int(s):
|
||||||
|
result = 0
|
||||||
|
rnstr = s
|
||||||
|
for letter, value in _TABLE:
|
||||||
|
while rnstr.startswith(letter):
|
||||||
|
result += value
|
||||||
|
rnstr = rnstr[len(letter):]
|
||||||
|
return result
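# Illustrative round trip of the two helpers above:
#   int_to_roman(1904)    -> 'mcmiv'
#   roman_to_int('mcmiv') -> 1904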
|
||||||
|
|
||||||
|
_pattern = r'''\(([^\)]*)\)'''
|
||||||
|
_tup_pattern = re.compile(_pattern,re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def _parseNames(numpages, data):
|
||||||
|
data = unicode_str(data)
|
||||||
|
pagenames = []
|
||||||
|
pageMap = ''
|
||||||
|
for i in range(numpages):
|
||||||
|
pagenames.append(None)
|
||||||
|
for m in re.finditer(_tup_pattern, data):
|
||||||
|
tup = m.group(1)
|
||||||
|
if pageMap != '':
|
||||||
|
pageMap += ','
|
||||||
|
pageMap += '(' + tup + ')'
|
||||||
|
spos, nametype, svalue = tup.split(",")
|
||||||
|
# print(spos, nametype, svalue)
|
||||||
|
if nametype == 'a' or nametype == 'r':
|
||||||
|
svalue = int(svalue)
|
||||||
|
spos = int(spos)
|
||||||
|
for i in range(spos - 1, numpages):
|
||||||
|
if nametype == 'r':
|
||||||
|
pname = int_to_roman(svalue)
|
||||||
|
svalue += 1
|
||||||
|
elif nametype == 'a':
|
||||||
|
pname = "%s" % svalue
|
||||||
|
svalue += 1
|
||||||
|
elif nametype == 'c':
|
||||||
|
sp = svalue.find('|')
|
||||||
|
if sp == -1:
|
||||||
|
pname = svalue
|
||||||
|
else:
|
||||||
|
pname = svalue[0:sp]
|
||||||
|
svalue = svalue[sp+1:]
|
||||||
|
else:
|
||||||
|
print("Error: unknown page numbering type", nametype)
|
||||||
|
pagenames[i] = pname
|
||||||
|
return pagenames, pageMap
|
||||||
|
|
||||||
|
|
||||||
|
class PageMapProcessor:
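# Parses the page map data of a Kindle book: a short header, a name-scheme string of
# "(start, type, value)" tuples, and a table of page-start offsets. The result is used
# below to build page-map.xml and APNX page location data.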
|
||||||
|
|
||||||
|
def __init__(self, mh, data):
|
||||||
|
self.mh = mh
|
||||||
|
self.data = data
|
||||||
|
self.pagenames = []
|
||||||
|
self.pageoffsets = []
|
||||||
|
self.pageMap = ''
|
||||||
|
self.pm_len = 0
|
||||||
|
self.pm_nn = 0
|
||||||
|
self.pm_bits = 0
|
||||||
|
self.pmoff = None
|
||||||
|
self.pmstr = ''
|
||||||
|
print("Extracting Page Map Information")
|
||||||
|
rev_len, = struct.unpack_from(b'>L', self.data, 0x10)
|
||||||
|
# skip over header, revision string length data, and revision string
|
||||||
|
ptr = 0x14 + rev_len
|
||||||
|
pm_1, self.pm_len, self.pm_nn, self.pm_bits = struct.unpack_from(b'>4H', self.data, ptr)
|
||||||
|
# print(pm_1, self.pm_len, self.pm_nn, self.pm_bits)
|
||||||
|
self.pmstr = self.data[ptr+8:ptr+8+self.pm_len]
|
||||||
|
self.pmoff = self.data[ptr+8+self.pm_len:]
|
||||||
|
offsize = b">L"
|
||||||
|
offwidth = 4
|
||||||
|
if self.pm_bits == 16:
|
||||||
|
offsize = b">H"
|
||||||
|
offwidth = 2
|
||||||
|
ptr = 0
|
||||||
|
for i in range(self.pm_nn):
|
||||||
|
od, = struct.unpack_from(offsize, self.pmoff, ptr)
|
||||||
|
ptr += offwidth
|
||||||
|
self.pageoffsets.append(od)
|
||||||
|
self.pagenames, self.pageMap = _parseNames(self.pm_nn, self.pmstr)
|
||||||
|
|
||||||
|
def getPageMap(self):
|
||||||
|
return self.pageMap
|
||||||
|
|
||||||
|
def getNames(self):
|
||||||
|
return self.pagenames
|
||||||
|
|
||||||
|
def getOffsets(self):
|
||||||
|
return self.pageoffsets
|
||||||
|
|
||||||
|
# page-map.xml will be unicode but encoded to utf-8 immediately before being written to a file
|
||||||
|
def generateKF8PageMapXML(self, k8proc):
|
||||||
|
pagemapxml = '<page-map xmlns="http://www.idpf.org/2007/opf">\n'
|
||||||
|
for i in range(len(self.pagenames)):
|
||||||
|
pos = self.pageoffsets[i]
|
||||||
|
name = self.pagenames[i]
|
||||||
|
if name is not None and name != "":
|
||||||
|
[pn, dir, filename, skelpos, skelend, aidtext] = k8proc.getSkelInfo(pos)
|
||||||
|
idtext = unicode_str(k8proc.getPageIDTag(pos))
|
||||||
|
linktgt = unicode_str(filename)
|
||||||
|
if idtext != '':
|
||||||
|
linktgt += '#' + idtext
|
||||||
|
pagemapxml += '<page name="%s" href="%s/%s" />\n' % (name, dir, linktgt)
|
||||||
|
pagemapxml += "</page-map>\n"
|
||||||
|
return pagemapxml
|
||||||
|
|
||||||
|
def generateAPNX(self, apnx_meta):
|
||||||
|
if apnx_meta['format'] == 'MOBI_8':
|
||||||
|
content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","format":"%(format)s","fileRevisionId":"1","acr":"%(acr)s"}' %apnx_meta
|
||||||
|
else:
|
||||||
|
content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","fileRevisionId":"1"}' % apnx_meta
|
||||||
|
content_header = content_header.encode('utf-8')
|
||||||
|
page_header = '{"asin":"%(asin)s","pageMap":"%(pageMap)s"}' % apnx_meta
|
||||||
|
page_header = page_header.encode('utf-8')
|
||||||
|
apnx = struct.pack(b'>H',1) + struct.pack(b'>H',1)
|
||||||
|
apnx += struct.pack(b'>I', 12 + len(content_header))
|
||||||
|
apnx += struct.pack(b'>I', len(content_header))
|
||||||
|
apnx += content_header
|
||||||
|
apnx += struct.pack(b'>H', 1)
|
||||||
|
apnx += struct.pack(b'>H', len(page_header))
|
||||||
|
apnx += struct.pack(b'>H', self.pm_nn)
|
||||||
|
apnx += struct.pack(b'>H', 32)
|
||||||
|
apnx += page_header
|
||||||
|
for page in self.pageoffsets:
|
||||||
|
apnx += struct.pack(b'>L', page)
|
||||||
|
return apnx
|
120
KindleUnpack/mobi_sectioner.py
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import PY2, hexlify, bstr, bord, bchar
|
||||||
|
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
if PY2:
|
||||||
|
range = xrange
|
||||||
|
|
||||||
|
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||||
|
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||||
|
import struct
|
||||||
|
|
||||||
|
from .unipath import pathof
|
||||||
|
|
||||||
|
DUMP = False
|
||||||
|
""" Set to True to dump all possible information. """
|
||||||
|
|
||||||
|
class unpackException(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def describe(data):
|
||||||
|
txtans = ''
|
||||||
|
hexans = hexlify(data)
|
||||||
|
for i in data:
|
||||||
|
if bord(i) < 32 or bord(i) > 127:
|
||||||
|
txtans += '?'
|
||||||
|
else:
|
||||||
|
txtans += bchar(i).decode('latin-1')
|
||||||
|
return '"' + txtans + '"' + ' 0x'+ hexans
|
||||||
|
|
||||||
|
def datetimefrompalmtime(palmtime):
|
||||||
|
if palmtime > 0x7FFFFFFF:
|
||||||
|
pythondatetime = datetime.datetime(year=1904,month=1,day=1)+datetime.timedelta(seconds=palmtime)
|
||||||
|
else:
|
||||||
|
pythondatetime = datetime.datetime(year=1970,month=1,day=1)+datetime.timedelta(seconds=palmtime)
|
||||||
|
return pythondatetime
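# Palm timestamps larger than 0x7FFFFFFF are treated as unsigned seconds since 1904-01-01,
# smaller values as seconds since the Unix epoch, so datetimefrompalmtime(0) is 1970-01-01 00:00:00.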
|
||||||
|
|
||||||
|
|
||||||
|
class Sectionizer:
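# Reads a PalmDB container: the 78-byte palm header, the section offset table that follows it,
# and the per-section data exposed through loadSection(); the dump* methods are debugging helpers.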
|
||||||
|
|
||||||
|
def __init__(self, filename):
|
||||||
|
self.data = b''
|
||||||
|
with open(pathof(filename), 'rb') as f:
|
||||||
|
self.data = f.read()
|
||||||
|
self.palmheader = self.data[:78]
|
||||||
|
self.palmname = self.data[:32]
|
||||||
|
self.ident = self.palmheader[0x3C:0x3C+8]
|
||||||
|
self.num_sections, = struct.unpack_from(b'>H', self.palmheader, 76)
|
||||||
|
self.filelength = len(self.data)
|
||||||
|
sectionsdata = struct.unpack_from(bstr('>%dL' % (self.num_sections*2)), self.data, 78) + (self.filelength, 0)
|
||||||
|
self.sectionoffsets = sectionsdata[::2]
|
||||||
|
self.sectionattributes = sectionsdata[1::2]
|
||||||
|
self.sectiondescriptions = ["" for x in range(self.num_sections+1)]
|
||||||
|
self.sectiondescriptions[-1] = "File Length Only"
|
||||||
|
return
|
||||||
|
|
||||||
|
def dumpsectionsinfo(self):
|
||||||
|
print("Section Offset Length UID Attribs Description")
|
||||||
|
for i in range(self.num_sections):
|
||||||
|
print("%3d %3X 0x%07X 0x%05X % 8d % 7d %s" % (i,i, self.sectionoffsets[i], self.sectionoffsets[
|
||||||
|
i+1] - self.sectionoffsets[i], self.sectionattributes[i]&0xFFFFFF, (self.sectionattributes[i]>>24)&0xFF, self.sectiondescriptions[i]))
|
||||||
|
print("%3d %3X 0x%07X %s" %
|
||||||
|
(self.num_sections,self.num_sections, self.sectionoffsets[self.num_sections], self.sectiondescriptions[self.num_sections]))
|
||||||
|
|
||||||
|
def setsectiondescription(self, section, description):
|
||||||
|
if section < len(self.sectiondescriptions):
|
||||||
|
self.sectiondescriptions[section] = description
|
||||||
|
else:
|
||||||
|
print("Section out of range: %d, description %s" % (section,description))
|
||||||
|
|
||||||
|
def dumppalmheader(self):
|
||||||
|
print("Palm Database Header")
|
||||||
|
print("Database name: " + repr(self.palmheader[:32]))
|
||||||
|
dbattributes, = struct.unpack_from(b'>H', self.palmheader, 32)
|
||||||
|
print("Bitfield attributes: 0x%0X" % dbattributes,)
|
||||||
|
if dbattributes != 0:
|
||||||
|
print(" (",)
|
||||||
|
if (dbattributes & 2):
|
||||||
|
print("Read-only; ",)
|
||||||
|
if (dbattributes & 4):
|
||||||
|
print("Dirty AppInfoArea; ",)
|
||||||
|
if (dbattributes & 8):
|
||||||
|
print("Needs to be backed up; ",)
|
||||||
|
if (dbattributes & 16):
|
||||||
|
print("OK to install over newer; ",)
|
||||||
|
if (dbattributes & 32):
|
||||||
|
print("Reset after installation; ",)
|
||||||
|
if (dbattributes & 64):
|
||||||
|
print("No copying by PalmPilot beaming; ",)
|
||||||
|
print(")")
|
||||||
|
else:
|
||||||
|
print("")
|
||||||
|
print("File version: %d" % struct.unpack_from(b'>H', self.palmheader, 34)[0])
|
||||||
|
dbcreation, = struct.unpack_from(b'>L', self.palmheader, 36)
|
||||||
|
print("Creation Date: " + str(datetimefrompalmtime(dbcreation))+ (" (0x%0X)" % dbcreation))
|
||||||
|
dbmodification, = struct.unpack_from(b'>L', self.palmheader, 40)
|
||||||
|
print("Modification Date: " + str(datetimefrompalmtime(dbmodification))+ (" (0x%0X)" % dbmodification))
|
||||||
|
dbbackup, = struct.unpack_from(b'>L', self.palmheader, 44)
|
||||||
|
if dbbackup != 0:
|
||||||
|
print("Backup Date: " + str(datetimefrompalmtime(dbbackup))+ (" (0x%0X)" % dbbackup))
|
||||||
|
print("Modification No.: %d" % struct.unpack_from(b'>L', self.palmheader, 48)[0])
|
||||||
|
print("App Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 52)[0])
|
||||||
|
print("Sort Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 56)[0])
|
||||||
|
print("Type/Creator: %s/%s" % (repr(self.palmheader[60:64]), repr(self.palmheader[64:68])))
|
||||||
|
print("Unique seed: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 68)[0])
|
||||||
|
expectedzero, = struct.unpack_from(b'>L', self.palmheader, 72)
|
||||||
|
if expectedzero != 0:
|
||||||
|
print("Should be zero but isn't: %d" % struct.unpack_from(b'>L', self.palmheader, 72)[0])
|
||||||
|
print("Number of sections: %d" % struct.unpack_from(b'>H', self.palmheader, 76)[0])
|
||||||
|
return
|
||||||
|
|
||||||
|
def loadSection(self, section):
|
||||||
|
before, after = self.sectionoffsets[section:section+2]
|
||||||
|
return self.data[before:after]
|
438
KindleUnpack/mobi_split.py
Executable file
@@ -0,0 +1,438 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
import struct
|
||||||
|
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||||
|
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||||
|
|
||||||
|
from .unipath import pathof
|
||||||
|
|
||||||
|
|
||||||
|
# important pdb header offsets
|
||||||
|
unique_id_seed = 68
|
||||||
|
number_of_pdb_records = 76
|
||||||
|
|
||||||
|
# important palmdoc header offsets
|
||||||
|
book_length = 4
|
||||||
|
book_record_count = 8
|
||||||
|
first_pdb_record = 78
|
||||||
|
|
||||||
|
# important rec0 offsets
|
||||||
|
length_of_book = 4
|
||||||
|
mobi_header_base = 16
|
||||||
|
mobi_header_length = 20
|
||||||
|
mobi_type = 24
|
||||||
|
mobi_version = 36
|
||||||
|
first_non_text = 80
|
||||||
|
title_offset = 84
|
||||||
|
first_resc_record = 108
|
||||||
|
first_content_index = 192
|
||||||
|
last_content_index = 194
|
||||||
|
kf8_fdst_index = 192 # for KF8 mobi headers
|
||||||
|
fcis_index = 200
|
||||||
|
flis_index = 208
|
||||||
|
srcs_index = 224
|
||||||
|
srcs_count = 228
|
||||||
|
primary_index = 244
|
||||||
|
datp_index = 256
|
||||||
|
huffoff = 112
|
||||||
|
hufftbloff = 120
|
||||||
|
|
||||||
|
def getint(datain,ofs,sz=b'L'):
|
||||||
|
i, = struct.unpack_from(b'>'+sz,datain,ofs)
|
||||||
|
return i
|
||||||
|
|
||||||
|
def writeint(datain,ofs,n,len=b'L'):
|
||||||
|
if len==b'L':
|
||||||
|
return datain[:ofs]+struct.pack(b'>L',n)+datain[ofs+4:]
|
||||||
|
else:
|
||||||
|
return datain[:ofs]+struct.pack(b'>H',n)+datain[ofs+2:]
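# getint reads a big-endian 32-bit (b'L') or 16-bit (b'H') field; writeint returns a copy of the
# record with that field overwritten, e.g. writeint(rec0, title_offset, getint(rec0, title_offset) + 8)
# shifts the stored title offset by eight bytes.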
|
||||||
|
|
||||||
|
def getsecaddr(datain,secno):
|
||||||
|
nsec = getint(datain,number_of_pdb_records,b'H')
|
||||||
|
assert 0 <= secno < nsec, 'secno %d out of range (nsec=%d)' % (secno, nsec)
|
||||||
|
secstart = getint(datain,first_pdb_record+secno*8)
|
||||||
|
if secno == nsec-1:
|
||||||
|
secend = len(datain)
|
||||||
|
else:
|
||||||
|
secend = getint(datain,first_pdb_record+(secno+1)*8)
|
||||||
|
return secstart,secend
|
||||||
|
|
||||||
|
def readsection(datain,secno):
|
||||||
|
secstart, secend = getsecaddr(datain,secno)
|
||||||
|
return datain[secstart:secend]
|
||||||
|
|
||||||
|
def writesection(datain,secno,secdata): # overwrite, accounting for different length
|
||||||
|
# dataout = deletesectionrange(datain,secno, secno)
|
||||||
|
# return insertsection(dataout, secno, secdata)
|
||||||
|
datalst = []
|
||||||
|
nsec = getint(datain,number_of_pdb_records,b'H')
|
||||||
|
zerosecstart,zerosecend = getsecaddr(datain,0)
|
||||||
|
secstart,secend = getsecaddr(datain,secno)
|
||||||
|
dif = len(secdata) - (secend - secstart)
|
||||||
|
datalst.append(datain[:unique_id_seed])
|
||||||
|
datalst.append(struct.pack(b'>L',2*nsec+1))
|
||||||
|
datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
|
||||||
|
datalst.append(struct.pack(b'>H',nsec))
|
||||||
|
newstart = zerosecstart
|
||||||
|
for i in range(0,secno):
|
||||||
|
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
|
||||||
|
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
|
||||||
|
datalst.append(struct.pack(b'>L', secstart) + struct.pack(b'>L', (2*secno)))
|
||||||
|
for i in range(secno+1,nsec):
|
||||||
|
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
|
||||||
|
ofs = ofs + dif
|
||||||
|
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
|
||||||
|
lpad = newstart - (first_pdb_record + 8*nsec)
|
||||||
|
if lpad > 0:
|
||||||
|
datalst.append(b'\0' * lpad)
|
||||||
|
datalst.append(datain[zerosecstart:secstart])
|
||||||
|
datalst.append(secdata)
|
||||||
|
datalst.append(datain[secend:])
|
||||||
|
dataout = b''.join(datalst)
|
||||||
|
return dataout
|
||||||
|
|
||||||
|
def nullsection(datain,secno): # make it zero-length without deleting it
|
||||||
|
datalst = []
|
||||||
|
nsec = getint(datain,number_of_pdb_records,b'H')
|
||||||
|
secstart, secend = getsecaddr(datain,secno)
|
||||||
|
zerosecstart, zerosecend = getsecaddr(datain, 0)
|
||||||
|
dif = secend-secstart
|
||||||
|
datalst.append(datain[:first_pdb_record])
|
||||||
|
for i in range(0,secno+1):
|
||||||
|
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
|
||||||
|
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
|
||||||
|
for i in range(secno+1, nsec):
|
||||||
|
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
|
||||||
|
ofs = ofs - dif
|
||||||
|
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
|
||||||
|
lpad = zerosecstart - (first_pdb_record + 8*nsec)
|
||||||
|
if lpad > 0:
|
||||||
|
datalst.append(b'\0' * lpad)
|
||||||
|
datalst.append(datain[zerosecstart: secstart])
|
||||||
|
datalst.append(datain[secend:])
|
||||||
|
dataout = b''.join(datalst)
|
||||||
|
return dataout
|
||||||
|
|
||||||
|
def deletesectionrange(datain,firstsec,lastsec): # delete a range of sections
|
||||||
|
datalst = []
|
||||||
|
firstsecstart,firstsecend = getsecaddr(datain,firstsec)
|
||||||
|
lastsecstart,lastsecend = getsecaddr(datain,lastsec)
|
||||||
|
zerosecstart, zerosecend = getsecaddr(datain, 0)
|
||||||
|
dif = lastsecend - firstsecstart + 8*(lastsec-firstsec+1)
|
||||||
|
nsec = getint(datain,number_of_pdb_records,b'H')
|
||||||
|
datalst.append(datain[:unique_id_seed])
|
||||||
|
datalst.append(struct.pack(b'>L',2*(nsec-(lastsec-firstsec+1))+1))
|
||||||
|
datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
|
||||||
|
datalst.append(struct.pack(b'>H',nsec-(lastsec-firstsec+1)))
|
||||||
|
newstart = zerosecstart - 8*(lastsec-firstsec+1)
|
||||||
|
for i in range(0,firstsec):
|
||||||
|
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
|
||||||
|
ofs = ofs-8*(lastsec-firstsec+1)
|
||||||
|
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
|
||||||
|
for i in range(lastsec+1,nsec):
|
||||||
|
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
|
||||||
|
ofs = ofs - dif
|
||||||
|
flgval = 2*(i-(lastsec-firstsec+1))
|
||||||
|
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
|
||||||
|
lpad = newstart - (first_pdb_record + 8*(nsec - (lastsec - firstsec + 1)))
|
||||||
|
if lpad > 0:
|
||||||
|
datalst.append(b'\0' * lpad)
|
||||||
|
datalst.append(datain[zerosecstart:firstsecstart])
|
||||||
|
datalst.append(datain[lastsecend:])
|
||||||
|
dataout = b''.join(datalst)
|
||||||
|
return dataout
|
||||||
|
|
||||||
|
def insertsection(datain,secno,secdata): # insert a new section
|
||||||
|
datalst = []
|
||||||
|
nsec = getint(datain,number_of_pdb_records,b'H')
|
||||||
|
# print("inserting secno" , secno, "into" ,nsec, "sections")
|
||||||
|
secstart,secend = getsecaddr(datain,secno)
|
||||||
|
zerosecstart,zerosecend = getsecaddr(datain,0)
|
||||||
|
dif = len(secdata)
|
||||||
|
datalst.append(datain[:unique_id_seed])
|
||||||
|
datalst.append(struct.pack(b'>L',2*(nsec+1)+1))
|
||||||
|
datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
|
||||||
|
datalst.append(struct.pack(b'>H',nsec+1))
|
||||||
|
newstart = zerosecstart + 8
|
||||||
|
for i in range(0,secno):
|
||||||
|
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
|
||||||
|
ofs += 8
|
||||||
|
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
|
||||||
|
datalst.append(struct.pack(b'>L', secstart + 8) + struct.pack(b'>L', (2*secno)))
|
||||||
|
for i in range(secno,nsec):
|
||||||
|
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
|
||||||
|
ofs = ofs + dif + 8
|
||||||
|
flgval = 2*(i+1)
|
||||||
|
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
|
||||||
|
lpad = newstart - (first_pdb_record + 8*(nsec + 1))
|
||||||
|
if lpad > 0:
|
||||||
|
datalst.append(b'\0' * lpad)
|
||||||
|
datalst.append(datain[zerosecstart:secstart])
|
||||||
|
datalst.append(secdata)
|
||||||
|
datalst.append(datain[secstart:])
|
||||||
|
dataout = b''.join(datalst)
|
||||||
|
return dataout
|
||||||
|
|
||||||
|
|
||||||
|
def insertsectionrange(sectionsource,firstsec,lastsec,sectiontarget,targetsec): # insert a range of sections
|
||||||
|
# print("inserting secno" , firstsec, "to", lastsec, "into" ,targetsec, "sections")
|
||||||
|
# dataout = sectiontarget
|
||||||
|
# for idx in range(lastsec,firstsec-1,-1):
|
||||||
|
# dataout = insertsection(dataout,targetsec,readsection(sectionsource,idx))
|
||||||
|
# return dataout
|
||||||
|
datalst = []
|
||||||
|
nsec = getint(sectiontarget,number_of_pdb_records,b'H')
|
||||||
|
zerosecstart, zerosecend = getsecaddr(sectiontarget,0)
|
||||||
|
insstart, nul = getsecaddr(sectiontarget,targetsec)
|
||||||
|
nins = lastsec - firstsec + 1
|
||||||
|
srcstart, nul = getsecaddr(sectionsource,firstsec)
|
||||||
|
nul, srcend = getsecaddr(sectionsource,lastsec)
|
||||||
|
newstart = zerosecstart + 8*nins
|
||||||
|
|
||||||
|
datalst.append(sectiontarget[:unique_id_seed])
|
||||||
|
datalst.append(struct.pack(b'>L',2*(nsec+nins)+1))
|
||||||
|
datalst.append(sectiontarget[unique_id_seed+4:number_of_pdb_records])
|
||||||
|
datalst.append(struct.pack(b'>H',nsec+nins))
|
||||||
|
for i in range(0,targetsec):
|
||||||
|
ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8)
|
||||||
|
ofsnew = ofs + 8*nins
|
||||||
|
flgvalnew = flgval
|
||||||
|
datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew))
|
||||||
|
# print(ofsnew, flgvalnew, ofs, flgval)
|
||||||
|
srcstart0, nul = getsecaddr(sectionsource,firstsec)
|
||||||
|
for i in range(nins):
|
||||||
|
isrcstart, nul = getsecaddr(sectionsource,firstsec+i)
|
||||||
|
ofsnew = insstart + (isrcstart-srcstart0) + 8*nins
|
||||||
|
flgvalnew = 2*(targetsec+i)
|
||||||
|
datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew))
|
||||||
|
# print(ofsnew, flgvalnew)
|
||||||
|
dif = srcend - srcstart
|
||||||
|
for i in range(targetsec,nsec):
|
||||||
|
ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8)
|
||||||
|
ofsnew = ofs + dif + 8*nins
|
||||||
|
flgvalnew = 2*(i+nins)
|
||||||
|
datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L',flgvalnew))
|
||||||
|
# print(ofsnew, flgvalnew, ofs, flgval)
|
||||||
|
lpad = newstart - (first_pdb_record + 8*(nsec + nins))
|
||||||
|
if lpad > 0:
|
||||||
|
datalst.append(b'\0' * lpad)
|
||||||
|
datalst.append(sectiontarget[zerosecstart:insstart])
|
||||||
|
datalst.append(sectionsource[srcstart:srcend])
|
||||||
|
datalst.append(sectiontarget[insstart:])
|
||||||
|
dataout = b''.join(datalst)
|
||||||
|
return dataout
|
||||||
|
|
||||||
|
def get_exth_params(rec0):
|
||||||
|
ebase = mobi_header_base + getint(rec0,mobi_header_length)
|
||||||
|
elen = getint(rec0,ebase+4)
|
||||||
|
enum = getint(rec0,ebase+8)
|
||||||
|
return ebase,elen,enum
|
||||||
|
|
||||||
|
def add_exth(rec0,exth_num,exth_bytes):
|
||||||
|
ebase,elen,enum = get_exth_params(rec0)
|
||||||
|
newrecsize = 8+len(exth_bytes)
|
||||||
|
newrec0 = rec0[0:ebase+4]+struct.pack(b'>L',elen+newrecsize)+struct.pack(b'>L',enum+1)+\
|
||||||
|
struct.pack(b'>L',exth_num)+struct.pack(b'>L',newrecsize)+exth_bytes+rec0[ebase+12:]
|
||||||
|
newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+newrecsize)
|
||||||
|
return newrec0
|
||||||
|
|
||||||
|
def read_exth(rec0,exth_num):
|
||||||
|
exth_values = []
|
||||||
|
ebase,elen,enum = get_exth_params(rec0)
|
||||||
|
ebase = ebase+12
|
||||||
|
while enum>0:
|
||||||
|
exth_id = getint(rec0,ebase)
|
||||||
|
if exth_id == exth_num:
|
||||||
|
# We might have multiple exths, so build a list.
|
||||||
|
exth_values.append(rec0[ebase+8:ebase+getint(rec0,ebase+4)])
|
||||||
|
enum = enum-1
|
||||||
|
ebase = ebase+getint(rec0,ebase+4)
|
||||||
|
return exth_values
|
||||||
|
|
||||||
|
def write_exth(rec0,exth_num,exth_bytes):
|
||||||
|
ebase,elen,enum = get_exth_params(rec0)
|
||||||
|
ebase_idx = ebase+12
|
||||||
|
enum_idx = enum
|
||||||
|
while enum_idx>0:
|
||||||
|
exth_id = getint(rec0,ebase_idx)
|
||||||
|
if exth_id == exth_num:
|
||||||
|
dif = len(exth_bytes)+8-getint(rec0,ebase_idx+4)
|
||||||
|
newrec0 = rec0
|
||||||
|
if dif != 0:
|
||||||
|
newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+dif)
|
||||||
|
return newrec0[:ebase+4]+struct.pack(b'>L',elen+len(exth_bytes)+8-getint(rec0,ebase_idx+4))+\
|
||||||
|
struct.pack(b'>L',enum)+rec0[ebase+12:ebase_idx+4]+\
|
||||||
|
struct.pack(b'>L',len(exth_bytes)+8)+exth_bytes+\
|
||||||
|
rec0[ebase_idx+getint(rec0,ebase_idx+4):]
|
||||||
|
enum_idx = enum_idx-1
|
||||||
|
ebase_idx = ebase_idx+getint(rec0,ebase_idx+4)
|
||||||
|
return rec0
|
||||||
|
|
||||||
|
def del_exth(rec0,exth_num):
|
||||||
|
ebase,elen,enum = get_exth_params(rec0)
|
||||||
|
ebase_idx = ebase+12
|
||||||
|
enum_idx = 0
|
||||||
|
while enum_idx < enum:
|
||||||
|
exth_id = getint(rec0,ebase_idx)
|
||||||
|
exth_size = getint(rec0,ebase_idx+4)
|
||||||
|
if exth_id == exth_num:
|
||||||
|
newrec0 = rec0
|
||||||
|
newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)-exth_size)
|
||||||
|
newrec0 = newrec0[:ebase_idx]+newrec0[ebase_idx+exth_size:]
|
||||||
|
newrec0 = newrec0[0:ebase+4]+struct.pack(b'>L',elen-exth_size)+struct.pack(b'>L',enum-1)+newrec0[ebase+12:]
|
||||||
|
return newrec0
|
||||||
|
enum_idx += 1
|
||||||
|
ebase_idx = ebase_idx+exth_size
|
||||||
|
return rec0
|
||||||
|
|
||||||
|
|
||||||
|
class mobi_split:
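# Splits a combined mobi7/KF8 ("dual") file into a standalone mobi7 part and a standalone
# mobi8 (KF8) part; the EXTH 121 record marks the KF8 boundary section, and the shared
# image/font resources are copied into the mobi8 part.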
|
||||||
|
|
||||||
|
def __init__(self, infile):
|
||||||
|
datain = b''
|
||||||
|
with open(pathof(infile), 'rb') as f:
|
||||||
|
datain = f.read()
|
||||||
|
datain_rec0 = readsection(datain,0)
|
||||||
|
ver = getint(datain_rec0,mobi_version)
|
||||||
|
self.combo = (ver!=8)
|
||||||
|
if not self.combo:
|
||||||
|
return
|
||||||
|
exth121 = read_exth(datain_rec0,121)
|
||||||
|
if len(exth121) == 0:
|
||||||
|
self.combo = False
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
# only pay attention to first exth121
|
||||||
|
# (there should only be one)
|
||||||
|
datain_kf8, = struct.unpack_from(b'>L',exth121[0],0)
|
||||||
|
if datain_kf8 == 0xffffffff:
|
||||||
|
self.combo = False
|
||||||
|
return
|
||||||
|
datain_kfrec0 = readsection(datain,datain_kf8)
|
||||||
|
|
||||||
|
# create the standalone mobi7
|
||||||
|
num_sec = getint(datain,number_of_pdb_records,b'H')
|
||||||
|
# remove BOUNDARY up to but not including ELF record
|
||||||
|
self.result_file7 = deletesectionrange(datain,datain_kf8-1,num_sec-2)
|
||||||
|
# check if there are SRCS records and delete them
|
||||||
|
srcs = getint(datain_rec0,srcs_index)
|
||||||
|
num_srcs = getint(datain_rec0,srcs_count)
|
||||||
|
if srcs != 0xffffffff and num_srcs > 0:
|
||||||
|
self.result_file7 = deletesectionrange(self.result_file7,srcs,srcs+num_srcs-1)
|
||||||
|
datain_rec0 = writeint(datain_rec0,srcs_index,0xffffffff)
|
||||||
|
datain_rec0 = writeint(datain_rec0,srcs_count,0)
|
||||||
|
# reset the EXTH 121 KF8 Boundary meta data to 0xffffffff
|
||||||
|
datain_rec0 = write_exth(datain_rec0,121, struct.pack(b'>L', 0xffffffff))
|
||||||
|
# datain_rec0 = del_exth(datain_rec0,121)
|
||||||
|
# datain_rec0 = del_exth(datain_rec0,534)
|
||||||
|
# don't remove the EXTH 125 KF8 Count of Resources, seems to be present in mobi6 files as well
|
||||||
|
# set the EXTH 129 KF8 Masthead / Cover Image string to the null string
|
||||||
|
datain_rec0 = write_exth(datain_rec0,129, b'')
|
||||||
|
# don't remove the EXTH 131 KF8 Unidentified Count, seems to be present in mobi6 files as well
|
||||||
|
|
||||||
|
# need to reset flags stored in 0x80-0x83
|
||||||
|
# old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
|
||||||
|
# Bit Flags
|
||||||
|
# 0x1000 = Bit 12 indicates if embedded fonts are used or not
|
||||||
|
# 0x0800 = means this Header points to *shared* images/resource/fonts ??
|
||||||
|
# 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
|
||||||
|
# 0x0040 = exth exists
|
||||||
|
# 0x0010 = Not sure but this is always set so far
|
||||||
|
fval, = struct.unpack_from(b'>L',datain_rec0, 0x80)
|
||||||
|
# need to remove flag 0x0800 for KindlePreviewer 2.8 and unset Bit 12 for embedded fonts
|
||||||
|
fval = fval & 0x07FF
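# e.g. a mobi7 part with exth flags 0x1850 becomes 0x1850 & 0x07FF = 0x0050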
|
||||||
|
datain_rec0 = datain_rec0[:0x80] + struct.pack(b'>L',fval) + datain_rec0[0x84:]
|
||||||
|
|
||||||
|
self.result_file7 = writesection(self.result_file7,0,datain_rec0)
|
||||||
|
|
||||||
|
# no need to replace kf8 style fcis with mobi 7 one
|
||||||
|
# fcis_secnum, = struct.unpack_from(b'>L',datain_rec0, 0xc8)
|
||||||
|
# if fcis_secnum != 0xffffffff:
|
||||||
|
# fcis_info = readsection(datain, fcis_secnum)
|
||||||
|
# text_len, = struct.unpack_from(b'>L', fcis_info, 0x14)
|
||||||
|
# new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
|
||||||
|
# new_fcis += struct.pack(b'>L',text_len)
|
||||||
|
# new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
|
||||||
|
# self.result_file7 = writesection(self.result_file7, fcis_secnum, new_fcis)
|
||||||
|
|
||||||
|
firstimage = getint(datain_rec0,first_resc_record)
|
||||||
|
lastimage = getint(datain_rec0,last_content_index,b'H')
|
||||||
|
# print("Old First Image, last Image", firstimage,lastimage)
|
||||||
|
if lastimage == 0xffff:
|
||||||
|
# find the lowest of the next sections and copy up to that.
|
||||||
|
ofs_list = [(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')]
|
||||||
|
for ofs,sz in ofs_list:
|
||||||
|
n = getint(datain_rec0,ofs,sz)
|
||||||
|
# print("n",n)
|
||||||
|
if n > 0 and n < lastimage:
|
||||||
|
lastimage = n-1
|
||||||
|
print("First Image, last Image", firstimage,lastimage)
|
||||||
|
|
||||||
|
# Try to null out FONT and RESC records, but leave the (empty) PDB record so image refs remain valid
|
||||||
|
for i in range(firstimage,lastimage):
|
||||||
|
imgsec = readsection(self.result_file7,i)
|
||||||
|
if imgsec[0:4] in [b'RESC',b'FONT']:
|
||||||
|
self.result_file7 = nullsection(self.result_file7,i)
|
||||||
|
|
||||||
|
# mobi7 finished
|
||||||
|
|
||||||
|
# create standalone mobi8
|
||||||
|
self.result_file8 = deletesectionrange(datain,0,datain_kf8-1)
|
||||||
|
target = getint(datain_kfrec0,first_resc_record)
|
||||||
|
self.result_file8 = insertsectionrange(datain,firstimage,lastimage,self.result_file8,target)
|
||||||
|
datain_kfrec0 = readsection(self.result_file8,0)
|
||||||
|
|
||||||
|
# Only keep the correct EXTH 116 StartOffset, KG 2.5 carries over the one from the mobi7 part, which then points at garbage in the mobi8 part, and confuses FW 3.4
|
||||||
|
kf8starts = read_exth(datain_kfrec0,116)
|
||||||
|
# If we have multiple StartOffset, keep only the last one
|
||||||
|
kf8start_count = len(kf8starts)
|
||||||
|
while kf8start_count > 1:
|
||||||
|
kf8start_count -= 1
|
||||||
|
datain_kfrec0 = del_exth(datain_kfrec0,116)
|
||||||
|
|
||||||
|
# update the EXTH 125 KF8 Count of Images/Fonts/Resources
|
||||||
|
datain_kfrec0 = write_exth(datain_kfrec0,125,struct.pack(b'>L',lastimage-firstimage+1))
|
||||||
|
|
||||||
|
# need to reset flags stored in 0x80-0x83
|
||||||
|
# old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
|
||||||
|
# standalone mobi8 with exth: 0x0050
|
||||||
|
# Bit Flags
|
||||||
|
# 0x1000 = Bit 12 indicates if embedded fonts are used or not
|
||||||
|
# 0x0800 = means this Header points to *shared* images/resource/fonts ??
|
||||||
|
# 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
|
||||||
|
# 0x0040 = exth exists
|
||||||
|
# 0x0010 = Not sure but this is always set so far
|
||||||
|
fval, = struct.unpack_from(b'>L',datain_kfrec0, 0x80)
|
||||||
|
fval = fval & 0x1FFF
|
||||||
|
fval |= 0x0800
|
||||||
|
datain_kfrec0 = datain_kfrec0[:0x80] + struct.pack(b'>L',fval) + datain_kfrec0[0x84:]
|
||||||
|
|
||||||
|
# properly update other index pointers that have been shifted by the insertion of images
|
||||||
|
ofs_list = [(kf8_fdst_index,b'L'),(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')]
|
||||||
|
for ofs,sz in ofs_list:
|
||||||
|
n = getint(datain_kfrec0,ofs,sz)
|
||||||
|
if n != 0xffffffff:
|
||||||
|
datain_kfrec0 = writeint(datain_kfrec0,ofs,n+lastimage-firstimage+1,sz)
|
||||||
|
self.result_file8 = writesection(self.result_file8,0,datain_kfrec0)
|
||||||
|
|
||||||
|
# no need to replace kf8 style fcis with mobi 7 one
|
||||||
|
# fcis_secnum, = struct.unpack_from(b'>L',datain_kfrec0, 0xc8)
|
||||||
|
# if fcis_secnum != 0xffffffff:
|
||||||
|
# fcis_info = readsection(self.result_file8, fcis_secnum)
|
||||||
|
# text_len, = struct.unpack_from(b'>L', fcis_info, 0x14)
|
||||||
|
# new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
|
||||||
|
# new_fcis += struct.pack(b'>L',text_len)
|
||||||
|
# new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
|
||||||
|
# self.result_file8 = writesection(self.result_file8, fcis_secnum, new_fcis)
|
||||||
|
|
||||||
|
# mobi8 finished
|
||||||
|
|
||||||
|
def getResult8(self):
|
||||||
|
return self.result_file8
|
||||||
|
|
||||||
|
def getResult7(self):
|
||||||
|
return self.result_file7
|
131
KindleUnpack/mobi_uncompress.py
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import PY2, bchr, lmap, bstr
|
||||||
|
|
||||||
|
if PY2:
|
||||||
|
range = xrange
|
||||||
|
|
||||||
|
import struct
|
||||||
|
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||||
|
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||||
|
|
||||||
|
|
||||||
|
class unpackException(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class UncompressedReader:
|
||||||
|
|
||||||
|
def unpack(self, data):
|
||||||
|
return data
|
||||||
|
|
||||||
|
class PalmdocReader:
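# PalmDoc decompression (an LZ77 variant): control bytes select short literal runs,
# back-references into the already-decoded output, or a space-plus-character pair.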
|
||||||
|
|
||||||
|
def unpack(self, i):
|
||||||
|
o, p = b'', 0
|
||||||
|
while p < len(i):
|
||||||
|
# for python 3 must use slice since i[p] returns int while slice returns character
|
||||||
|
c = ord(i[p:p+1])
|
||||||
|
p += 1
|
||||||
|
if (c >= 1 and c <= 8):
|
||||||
|
o += i[p:p+c]
|
||||||
|
p += c
|
||||||
|
elif (c < 128):
|
||||||
|
o += bchr(c)
|
||||||
|
elif (c >= 192):
|
||||||
|
o += b' ' + bchr(c ^ 128)
|
||||||
|
else:
|
||||||
|
if p < len(i):
|
||||||
|
c = (c << 8) | ord(i[p:p+1])
|
||||||
|
p += 1
|
||||||
|
m = (c >> 3) & 0x07ff
|
||||||
|
n = (c & 7) + 3
|
||||||
|
if (m > n):
|
||||||
|
o += o[-m:n-m]
|
||||||
|
else:
|
||||||
|
for _ in range(n):
|
||||||
|
# because of completely ass-backwards decision by python maintainers for python 3
|
||||||
|
# we must use slice for bytes as i[p] returns int while slice returns character
|
||||||
|
if m == 1:
|
||||||
|
o += o[-m:]
|
||||||
|
else:
|
||||||
|
o += o[-m:-m+1]
|
||||||
|
return o
|
||||||
|
|
||||||
|
class HuffcdicReader:
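# HUFF/CDIC decompression: loadHuff() reads the Huffman code tables, loadCdic() accumulates
# the phrase dictionary, and unpack() decodes a record, recursively expanding dictionary
# entries that have not been expanded yet.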
|
||||||
|
q = struct.Struct(b'>Q').unpack_from
|
||||||
|
|
||||||
|
def loadHuff(self, huff):
|
||||||
|
if huff[0:8] != b'HUFF\x00\x00\x00\x18':
|
||||||
|
raise unpackException('invalid huff header')
|
||||||
|
off1, off2 = struct.unpack_from(b'>LL', huff, 8)
|
||||||
|
|
||||||
|
def dict1_unpack(v):
|
||||||
|
codelen, term, maxcode = v&0x1f, v&0x80, v>>8
|
||||||
|
assert codelen != 0
|
||||||
|
if codelen <= 8:
|
||||||
|
assert term
|
||||||
|
maxcode = ((maxcode + 1) << (32 - codelen)) - 1
|
||||||
|
return (codelen, term, maxcode)
|
||||||
|
self.dict1 = lmap(dict1_unpack, struct.unpack_from(b'>256L', huff, off1))
|
||||||
|
|
||||||
|
dict2 = struct.unpack_from(b'>64L', huff, off2)
|
||||||
|
self.mincode, self.maxcode = (), ()
|
||||||
|
for codelen, mincode in enumerate((0,) + dict2[0::2]):
|
||||||
|
self.mincode += (mincode << (32 - codelen), )
|
||||||
|
for codelen, maxcode in enumerate((0,) + dict2[1::2]):
|
||||||
|
self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1, )
|
||||||
|
|
||||||
|
self.dictionary = []
|
||||||
|
|
||||||
|
def loadCdic(self, cdic):
|
||||||
|
if cdic[0:8] != b'CDIC\x00\x00\x00\x10':
|
||||||
|
raise unpackException('invalid cdic header')
|
||||||
|
phrases, bits = struct.unpack_from(b'>LL', cdic, 8)
|
||||||
|
n = min(1<<bits, phrases-len(self.dictionary))
|
||||||
|
h = struct.Struct(b'>H').unpack_from
|
||||||
|
def getslice(off):
|
||||||
|
blen, = h(cdic, 16+off)
|
||||||
|
slice = cdic[18+off:18+off+(blen&0x7fff)]
|
||||||
|
return (slice, blen&0x8000)
|
||||||
|
self.dictionary += lmap(getslice, struct.unpack_from(bstr('>%dH' % n), cdic, 16))
|
||||||
|
|
||||||
|
def unpack(self, data):
|
||||||
|
q = HuffcdicReader.q
|
||||||
|
|
||||||
|
bitsleft = len(data) * 8
|
||||||
|
data += b"\x00\x00\x00\x00\x00\x00\x00\x00"
|
||||||
|
pos = 0
|
||||||
|
x, = q(data, pos)
|
||||||
|
n = 32
|
||||||
|
|
||||||
|
s = b''
|
||||||
|
while True:
|
||||||
|
if n <= 0:
|
||||||
|
pos += 4
|
||||||
|
x, = q(data, pos)
|
||||||
|
n += 32
|
||||||
|
code = (x >> n) & ((1 << 32) - 1)
|
||||||
|
|
||||||
|
codelen, term, maxcode = self.dict1[code >> 24]
|
||||||
|
if not term:
|
||||||
|
while code < self.mincode[codelen]:
|
||||||
|
codelen += 1
|
||||||
|
maxcode = self.maxcode[codelen]
|
||||||
|
|
||||||
|
n -= codelen
|
||||||
|
bitsleft -= codelen
|
||||||
|
if bitsleft < 0:
|
||||||
|
break
|
||||||
|
|
||||||
|
r = (maxcode - code) >> (32 - codelen)
|
||||||
|
slice, flag = self.dictionary[r]
|
||||||
|
if not flag:
|
||||||
|
self.dictionary[r] = None
|
||||||
|
slice = self.unpack(slice)
|
||||||
|
self.dictionary[r] = (slice, 1)
|
||||||
|
s += slice
|
||||||
|
return s
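For context, a caller normally picks one of these readers from the compression code in the book's PalmDOC header (1 = uncompressed, 2 = PalmDoc, 17480 = HUFF/CDIC). A minimal sketch follows; make_reader and the huff_records packaging are illustrative helpers, not part of this module:

def make_reader(compression, huff_records=None):
    # huff_records: the raw HUFF record followed by its CDIC records (HUFF/CDIC books only)
    if compression == 1:            # no compression
        return UncompressedReader()
    if compression == 2:            # PalmDoc LZ77-style compression
        return PalmdocReader()
    if compression == 17480:        # 0x4448 ('DH') -> HUFF/CDIC
        reader = HuffcdicReader()
        reader.loadHuff(huff_records[0])
        for cdic in huff_records[1:]:
            reader.loadCdic(cdic)
        return reader
    raise unpackException('unknown compression type %d' % compression)

Each raw text record would then be decoded with reader.unpack(record_data).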
|
191
KindleUnpack/mobi_utils.py
Normal file
@@ -0,0 +1,191 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
# flake8: noqa
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import PY2, text_type, bchr, bord
|
||||||
|
|
||||||
|
import binascii
|
||||||
|
|
||||||
|
if PY2:
|
||||||
|
range = xrange
|
||||||
|
|
||||||
|
from itertools import cycle
|
||||||
|
|
||||||
|
def getLanguage(langID, sublangID):
|
||||||
|
mobilangdict = {
|
||||||
|
54 : {0 : 'af'}, # Afrikaans
|
||||||
|
28 : {0 : 'sq'}, # Albanian
|
||||||
|
1 : {0 : 'ar' , 5 : 'ar-dz' , 15 : 'ar-bh' , 3 : 'ar-eg' , 2 : 'ar-iq', 11 : 'ar-jo' , 13 : 'ar-kw' , 12 : 'ar-lb' , 4: 'ar-ly',
|
||||||
|
6 : 'ar-ma' , 8 : 'ar-om' , 16 : 'ar-qa' , 1 : 'ar-sa' , 10 : 'ar-sy' , 7 : 'ar-tn' , 14 : 'ar-ae' , 9 : 'ar-ye'},
|
||||||
|
# Arabic, Arabic (Algeria), Arabic (Bahrain), Arabic (Egypt), Arabic
|
||||||
|
# (Iraq), Arabic (Jordan), Arabic (Kuwait), Arabic (Lebanon), Arabic
|
||||||
|
# (Libya), Arabic (Morocco), Arabic (Oman), Arabic (Qatar), Arabic
|
||||||
|
# (Saudi Arabia), Arabic (Syria), Arabic (Tunisia), Arabic (United Arab
|
||||||
|
# Emirates), Arabic (Yemen)
|
||||||
|
43 : {0 : 'hy'}, # Armenian
|
||||||
|
77 : {0 : 'as'}, # Assamese
|
||||||
|
44 : {0 : 'az'}, # "Azeri" (IANA: Azerbaijani)
|
||||||
|
45 : {0 : 'eu'}, # Basque
|
||||||
|
35 : {0 : 'be'}, # Belarusian
|
||||||
|
69 : {0 : 'bn'}, # Bengali
|
||||||
|
2 : {0 : 'bg'}, # Bulgarian
|
||||||
|
3 : {0 : 'ca'}, # Catalan
|
||||||
|
4 : {0 : 'zh' , 3 : 'zh-hk' , 2 : 'zh-cn' , 4 : 'zh-sg' , 1 : 'zh-tw'},
|
||||||
|
# Chinese, Chinese (Hong Kong), Chinese (PRC), Chinese (Singapore), Chinese (Taiwan)
|
||||||
|
26 : {0 : 'hr', 3 : 'sr'}, # Croatian, Serbian
|
||||||
|
5 : {0 : 'cs'}, # Czech
|
||||||
|
6 : {0 : 'da'}, # Danish
|
||||||
|
19 : {0: 'nl', 1 : 'nl' , 2 : 'nl-be'}, # Dutch / Flemish, Dutch (Belgium)
|
||||||
|
9 : {0: 'en', 1 : 'en' , 3 : 'en-au' , 40 : 'en-bz' , 4 : 'en-ca' , 6 : 'en-ie' , 8 : 'en-jm' , 5 : 'en-nz' , 13 : 'en-ph' ,
|
||||||
|
7 : 'en-za' , 11 : 'en-tt' , 2 : 'en-gb', 1 : 'en-us' , 12 : 'en-zw'},
|
||||||
|
# English, English (Australia), English (Belize), English (Canada),
|
||||||
|
# English (Ireland), English (Jamaica), English (New Zealand), English
|
||||||
|
# (Philippines), English (South Africa), English (Trinidad), English
|
||||||
|
# (United Kingdom), English (United States), English (Zimbabwe)
|
||||||
|
37 : {0 : 'et'}, # Estonian
|
||||||
|
56 : {0 : 'fo'}, # Faroese
|
||||||
|
41 : {0 : 'fa'}, # Farsi / Persian
|
||||||
|
11 : {0 : 'fi'}, # Finnish
|
||||||
|
12 : {0 : 'fr', 1 : 'fr' , 2 : 'fr-be' , 3 : 'fr-ca' , 5 : 'fr-lu' , 6 : 'fr-mc' , 4 : 'fr-ch'},
|
||||||
|
# French, French (Belgium), French (Canada), French (Luxembourg), French (Monaco), French (Switzerland)
|
||||||
|
55 : {0 : 'ka'}, # Georgian
|
||||||
|
7 : {0 : 'de', 1 : 'de' , 3 : 'de-at' , 5 : 'de-li' , 4 : 'de-lu' , 2 : 'de-ch'},
|
||||||
|
# German, German (Austria), German (Liechtenstein), German (Luxembourg), German (Switzerland)
|
||||||
|
8 : {0 : 'el'}, # Greek, Modern (1453-)
|
||||||
|
71 : {0 : 'gu'}, # Gujarati
|
||||||
|
13 : {0 : 'he'}, # Hebrew (also code 'iw'?)
|
||||||
|
57 : {0 : 'hi'}, # Hindi
|
||||||
|
14 : {0 : 'hu'}, # Hungarian
|
||||||
|
15 : {0 : 'is'}, # Icelandic
|
||||||
|
33 : {0 : 'id'}, # Indonesian
|
||||||
|
16 : {0 : 'it', 1 : 'it' , 2 : 'it-ch'}, # Italian, Italian (Switzerland)
|
||||||
|
17 : {0 : 'ja'}, # Japanese
|
||||||
|
75 : {0 : 'kn'}, # Kannada
|
||||||
|
63 : {0 : 'kk'}, # Kazakh
|
||||||
|
87 : {0 : 'x-kok'}, # Konkani (real language code is 'kok'?)
|
||||||
|
18 : {0 : 'ko'}, # Korean
|
||||||
|
38 : {0 : 'lv'}, # Latvian
|
||||||
|
39 : {0 : 'lt'}, # Lithuanian
|
||||||
|
47 : {0 : 'mk'}, # Macedonian
|
||||||
|
62 : {0 : 'ms'}, # Malay
|
||||||
|
76 : {0 : 'ml'}, # Malayalam
|
||||||
|
58 : {0 : 'mt'}, # Maltese
|
||||||
|
78 : {0 : 'mr'}, # Marathi
|
||||||
|
97 : {0 : 'ne'}, # Nepali
|
||||||
|
20 : {0 : 'no'}, # Norwegian
|
||||||
|
72 : {0 : 'or'}, # Oriya
|
||||||
|
21 : {0 : 'pl'}, # Polish
|
||||||
|
22 : {0 : 'pt', 2 : 'pt' , 1 : 'pt-br'}, # Portuguese, Portuguese (Brazil)
|
||||||
|
70 : {0 : 'pa'}, # Punjabi
|
||||||
|
23 : {0 : 'rm'}, # "Rhaeto-Romanic" (IANA: Romansh)
|
||||||
|
24 : {0 : 'ro'}, # Romanian
|
||||||
|
25 : {0 : 'ru'}, # Russian
|
||||||
|
59 : {0 : 'sz'}, # "Sami (Lappish)" (not an IANA language code)
|
||||||
|
# IANA code for "Northern Sami" is 'se'
|
||||||
|
# 'SZ' is the IANA region code for Swaziland
|
||||||
|
79 : {0 : 'sa'}, # Sanskrit
|
||||||
|
27 : {0 : 'sk'}, # Slovak
|
||||||
|
36 : {0 : 'sl'}, # Slovenian
|
||||||
|
46 : {0 : 'sb'}, # "Sorbian" (not an IANA language code)
|
||||||
|
# 'SB' is IANA region code for 'Solomon Islands'
|
||||||
|
# Lower Sorbian = 'dsb'
|
||||||
|
# Upper Sorbian = 'hsb'
|
||||||
|
# Sorbian Languages = 'wen'
|
||||||
|
10 : {0 : 'es' , 4 : 'es' , 44 : 'es-ar' , 64 : 'es-bo' , 52 : 'es-cl' , 36 : 'es-co' , 20 : 'es-cr' , 28 : 'es-do' ,
|
||||||
|
48 : 'es-ec' , 68 : 'es-sv' , 16 : 'es-gt' , 72 : 'es-hn' , 8 : 'es-mx' , 76 : 'es-ni' , 24 : 'es-pa' ,
|
||||||
|
60 : 'es-py' , 40 : 'es-pe' , 80 : 'es-pr' , 56 : 'es-uy' , 32 : 'es-ve'},
|
||||||
|
# Spanish, Spanish (Mobipocket bug?), Spanish (Argentina), Spanish
|
||||||
|
# (Bolivia), Spanish (Chile), Spanish (Colombia), Spanish (Costa Rica),
|
||||||
|
# Spanish (Dominican Republic), Spanish (Ecuador), Spanish (El
|
||||||
|
# Salvador), Spanish (Guatemala), Spanish (Honduras), Spanish (Mexico),
|
||||||
|
# Spanish (Nicaragua), Spanish (Panama), Spanish (Paraguay), Spanish
|
||||||
|
# (Peru), Spanish (Puerto Rico), Spanish (Uruguay), Spanish (Venezuela)
|
||||||
|
48 : {0 : 'sx'}, # "Sutu" (not an IANA language code)
|
||||||
|
# "Sutu" is another name for "Southern Sotho"?
|
||||||
|
# IANA code for "Southern Sotho" is 'st'
|
||||||
|
65 : {0 : 'sw'}, # Swahili
|
||||||
|
29 : {0 : 'sv' , 1 : 'sv' , 8 : 'sv-fi'}, # Swedish, Swedish (Finland)
|
||||||
|
73 : {0 : 'ta'}, # Tamil
|
||||||
|
68 : {0 : 'tt'}, # Tatar
|
||||||
|
74 : {0 : 'te'}, # Telugu
|
||||||
|
30 : {0 : 'th'}, # Thai
|
||||||
|
49 : {0 : 'ts'}, # Tsonga
|
||||||
|
50 : {0 : 'tn'}, # Tswana
|
||||||
|
31 : {0 : 'tr'}, # Turkish
|
||||||
|
34 : {0 : 'uk'}, # Ukrainian
|
||||||
|
32 : {0 : 'ur'}, # Urdu
|
||||||
|
67 : {0 : 'uz', 2 : 'uz'}, # Uzbek
|
||||||
|
42 : {0 : 'vi'}, # Vietnamese
|
||||||
|
52 : {0 : 'xh'}, # Xhosa
|
||||||
|
53 : {0 : 'zu'}, # Zulu
|
||||||
|
}
|
||||||
|
lang = "en"
|
||||||
|
if langID in mobilangdict:
|
||||||
|
subdict = mobilangdict[langID]
|
||||||
|
lang = subdict[0]
|
||||||
|
if sublangID in subdict:
|
||||||
|
lang = subdict[sublangID]
|
||||||
|
return lang
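A couple of sample lookups against the table above (the fallback for unknown IDs is plain 'en'):

print(getLanguage(9, 2))    # 'en-gb'  -> English (United Kingdom)
print(getLanguage(12, 3))   # 'fr-ca'  -> French (Canada)
print(getLanguage(999, 0))  # 'en'     -> unknown langID falls back to English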
|
||||||
|
|
||||||
|
|
||||||
|
def toHex(byteList):
|
||||||
|
return binascii.hexlify(byteList)
|
||||||
|
|
||||||
|
# returns base32 bytestring
|
||||||
|
def toBase32(value, npad=4):
|
||||||
|
digits = b'0123456789ABCDEFGHIJKLMNOPQRSTUV'
|
||||||
|
num_string=b''
|
||||||
|
current = value
|
||||||
|
while current != 0:
|
||||||
|
next, remainder = divmod(current, 32)
|
||||||
|
rem_string = digits[remainder:remainder+1]
|
||||||
|
num_string = rem_string + num_string
|
||||||
|
current=next
|
||||||
|
if num_string == b'':
|
||||||
|
num_string = b'0'
|
||||||
|
pad = npad - len(num_string)
|
||||||
|
if pad > 0:
|
||||||
|
num_string = b'0' * pad + num_string
|
||||||
|
return num_string
|
||||||
|
|
||||||
|
|
||||||
|
# converts base32 string to value
|
||||||
|
def fromBase32(str_num):
|
||||||
|
if isinstance(str_num, text_type):
|
||||||
|
str_num = str_num.encode('latin-1')
|
||||||
|
scalelst = [1,32,1024,32768,1048576,33554432,1073741824,34359738368]
|
||||||
|
value = 0
|
||||||
|
j = 0
|
||||||
|
n = len(str_num)
|
||||||
|
scale = 0
|
||||||
|
for i in range(n):
|
||||||
|
c = str_num[n-i-1:n-i]
|
||||||
|
if c in b'0123456789':
|
||||||
|
v = ord(c) - ord(b'0')
|
||||||
|
else:
|
||||||
|
v = ord(c) - ord(b'A') + 10
|
||||||
|
if j < len(scalelst):
|
||||||
|
scale = scalelst[j]
|
||||||
|
else:
|
||||||
|
scale = scale * 32
|
||||||
|
j += 1
|
||||||
|
if v != 0:
|
||||||
|
value = value + (v * scale)
|
||||||
|
return value
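toBase32 and fromBase32 are inverses of each other; a short round trip with arbitrarily chosen values:

print(toBase32(1))          # b'0001'  (left-padded to npad=4 digits)
print(toBase32(1000))       # b'00V8'
print(fromBase32(b'00V8'))  # 1000
print(fromBase32('V8'))     # 1000    (text input is encoded to bytes first)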
|
||||||
|
|
||||||
|
|
||||||
|
# note: if decode a bytestring using 'latin-1' (or any other 0-255 encoding)
|
||||||
|
# in place of ascii you will get a byte to half-word or integer
|
||||||
|
# one to one mapping of values from 0 - 255
|
||||||
|
|
||||||
|
def mangle_fonts(encryption_key, data):
|
||||||
|
if isinstance(encryption_key, text_type):
|
||||||
|
encryption_key = encryption_key.encode('latin-1')
|
||||||
|
crypt = data[:1024]
|
||||||
|
key = cycle(iter(map(bord, encryption_key)))
|
||||||
|
# encrypt = ''.join([chr(ord(x)^key.next()) for x in crypt])
|
||||||
|
encrypt = b''.join([bchr(bord(x)^next(key)) for x in crypt])
|
||||||
|
return encrypt + data[1024:]
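Since the obfuscation is a plain XOR over the first 1024 bytes, applying mangle_fonts twice with the same key restores the original data; the key and font bytes below are placeholders:

key = '0123456789abcdef0123456789abcdef'     # e.g. the hex digits of the epub uid
font = b'\x00\x01\x00\x00' + b'\x00' * 2048  # stand-in for real font data
obfuscated = mangle_fonts(key, font)
assert mangle_fonts(key, obfuscated) == font  # XOR is its own inverse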
|
525
KindleUnpack/mobiml2xhtml.py
Executable file
@@ -0,0 +1,525 @@
|
|||||||
|
#! /usr/bin/python
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
|
||||||
|
# this program works in concert with the output from KindleUnpack
|
||||||
|
|
||||||
|
'''
|
||||||
|
Convert from Mobi ML to XHTML
|
||||||
|
'''
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
|
||||||
|
SPECIAL_HANDLING_TAGS = {
|
||||||
|
'?xml' : ('xmlheader', -1),
|
||||||
|
'!--' : ('comment', -3),
|
||||||
|
'!DOCTYPE' : ('doctype', -1),
|
||||||
|
}
|
||||||
|
|
||||||
|
SPECIAL_HANDLING_TYPES = ['xmlheader', 'doctype', 'comment']
|
||||||
|
|
||||||
|
SELF_CLOSING_TAGS = ['br' , 'hr', 'input', 'img', 'image', 'meta', 'spacer', 'link', 'frame', 'base', 'col', 'reference']
|
||||||
|
|
||||||
|
class MobiMLConverter(object):
|
||||||
|
|
||||||
|
PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
|
||||||
|
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
|
||||||
|
|
||||||
|
def __init__(self, filename):
|
||||||
|
self.base_css_rules = 'blockquote { margin: 0em 0em 0em 1.25em }\n'
|
||||||
|
self.base_css_rules += 'p { margin: 0em }\n'
|
||||||
|
self.base_css_rules += '.bold { font-weight: bold }\n'
|
||||||
|
self.base_css_rules += '.italic { font-style: italic }\n'
|
||||||
|
self.base_css_rules += '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n'
|
||||||
|
self.tag_css_rules = {}
|
||||||
|
self.tag_css_rule_cnt = 0
|
||||||
|
self.path = []
|
||||||
|
self.filename = filename
|
||||||
|
self.wipml = open(self.filename, 'rb').read()
|
||||||
|
self.pos = 0
|
||||||
|
self.opfname = self.filename.rsplit('.',1)[0] + '.opf'
|
||||||
|
self.opos = 0
|
||||||
|
self.meta = ''
|
||||||
|
self.cssname = os.path.join(os.path.dirname(self.filename),'styles.css')
|
||||||
|
self.current_font_size = 3
|
||||||
|
self.font_history = []
|
||||||
|
|
||||||
|
def cleanup_html(self):
|
||||||
|
self.wipml = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.wipml)
|
||||||
|
self.wipml = self.wipml.replace('\r\n', '\n')
|
||||||
|
self.wipml = self.wipml.replace('> <', '>\n<')
|
||||||
|
self.wipml = self.wipml.replace('<mbp: ', '<mbp:')
|
||||||
|
# self.wipml = re.sub(r'<?xml[^>]*>', '', self.wipml)
|
||||||
|
self.wipml = self.wipml.replace('<br></br>','<br/>')
|
||||||
|
|
||||||
|
def replace_page_breaks(self):
|
||||||
|
self.wipml = self.PAGE_BREAK_PAT.sub(
|
||||||
|
'<div class="mbp_pagebreak" />',
|
||||||
|
self.wipml)
|
||||||
|
|
||||||
|
# parse leading text of ml and tag
|
||||||
|
def parseml(self):
|
||||||
|
p = self.pos
|
||||||
|
if p >= len(self.wipml):
|
||||||
|
return None
|
||||||
|
if self.wipml[p] != '<':
|
||||||
|
res = self.wipml.find('<',p)
|
||||||
|
if res == -1 :
|
||||||
|
res = len(self.wipml)
|
||||||
|
self.pos = res
|
||||||
|
return self.wipml[p:res], None
|
||||||
|
# handle comment as a special case to deal with multi-line comments
|
||||||
|
if self.wipml[p:p+4] == '<!--':
|
||||||
|
te = self.wipml.find('-->',p+1)
|
||||||
|
if te != -1:
|
||||||
|
te = te+2
|
||||||
|
else :
|
||||||
|
te = self.wipml.find('>',p+1)
|
||||||
|
ntb = self.wipml.find('<',p+1)
|
||||||
|
if ntb != -1 and ntb < te:
|
||||||
|
self.pos = ntb
|
||||||
|
return self.wipml[p:ntb], None
|
||||||
|
self.pos = te + 1
|
||||||
|
return None, self.wipml[p:te+1]
|
||||||
|
|
||||||
|
# parses string version of tag to identify its name,
|
||||||
|
# its type 'begin', 'end' or 'single',
|
||||||
|
# plus build a hashtable of its attributes
|
||||||
|
# code is written to handle the possibility of very poor formatting
|
||||||
|
def parsetag(self, s):
|
||||||
|
p = 1
|
||||||
|
# get the tag name
|
||||||
|
tname = None
|
||||||
|
ttype = None
|
||||||
|
tattr = {}
|
||||||
|
while s[p:p+1] == ' ' :
|
||||||
|
p += 1
|
||||||
|
if s[p:p+1] == '/':
|
||||||
|
ttype = 'end'
|
||||||
|
p += 1
|
||||||
|
while s[p:p+1] == ' ' :
|
||||||
|
p += 1
|
||||||
|
b = p
|
||||||
|
while s[p:p+1] not in ('>', '/', ' ', '"', "'", "\r", "\n") :
|
||||||
|
p += 1
|
||||||
|
tname=s[b:p].lower()
|
||||||
|
if tname == '!doctype':
|
||||||
|
tname = '!DOCTYPE'
|
||||||
|
# special cases
|
||||||
|
if tname in SPECIAL_HANDLING_TAGS.keys():
|
||||||
|
ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
|
||||||
|
tattr['special'] = s[p:backstep]
|
||||||
|
if ttype is None:
|
||||||
|
# parse any attributes
|
||||||
|
while s.find('=',p) != -1 :
|
||||||
|
while s[p:p+1] == ' ' :
|
||||||
|
p += 1
|
||||||
|
b = p
|
||||||
|
while s[p:p+1] != '=' :
|
||||||
|
p += 1
|
||||||
|
aname = s[b:p].lower()
|
||||||
|
aname = aname.rstrip(' ')
|
||||||
|
p += 1
|
||||||
|
while s[p:p+1] == ' ' :
|
||||||
|
p += 1
|
||||||
|
if s[p:p+1] in ('"', "'") :
|
||||||
|
p = p + 1
|
||||||
|
b = p
|
||||||
|
while s[p:p+1] not in ('"', "'") :
|
||||||
|
p += 1
|
||||||
|
val = s[b:p]
|
||||||
|
p += 1
|
||||||
|
else :
|
||||||
|
b = p
|
||||||
|
while s[p:p+1] not in ('>', '/', ' ') :
|
||||||
|
p += 1
|
||||||
|
val = s[b:p]
|
||||||
|
tattr[aname] = val
|
||||||
|
# label beginning and single tags
|
||||||
|
if ttype is None:
|
||||||
|
ttype = 'begin'
|
||||||
|
if s.find(' /',p) >= 0:
|
||||||
|
ttype = 'single_ext'
|
||||||
|
elif s.find('/',p) >= 0:
|
||||||
|
ttype = 'single'
|
||||||
|
return ttype, tname, tattr
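parsetag does not touch any converter state, so it can be smoke-tested through any instance; the scratch file below exists only so the constructor has something to read:

with open('scratch.html', 'wb') as f:
    f.write(b'<html></html>')
mlc = MobiMLConverter('scratch.html')
print(mlc.parsetag('<a href="#filepos0000012345" />'))
# expected: ('single_ext', 'a', {'href': '#filepos0000012345'})
print(mlc.parsetag('</p>'))
# expected: ('end', 'p', {})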
|
||||||
|
|
||||||
|
# main routine to convert from mobi markup language to html
|
||||||
|
def processml(self):
|
||||||
|
|
||||||
|
# are these really needed
|
||||||
|
html_done = False
|
||||||
|
head_done = False
|
||||||
|
body_done = False
|
||||||
|
|
||||||
|
skip = False
|
||||||
|
|
||||||
|
htmlstr = ''
|
||||||
|
self.replace_page_breaks()
|
||||||
|
self.cleanup_html()
|
||||||
|
|
||||||
|
# now parse the cleaned up ml into standard xhtml
|
||||||
|
while True:
|
||||||
|
|
||||||
|
r = self.parseml()
|
||||||
|
if not r:
|
||||||
|
break
|
||||||
|
|
||||||
|
text, tag = r
|
||||||
|
|
||||||
|
if text:
|
||||||
|
if not skip:
|
||||||
|
htmlstr += text
|
||||||
|
|
||||||
|
if tag:
|
||||||
|
ttype, tname, tattr = self.parsetag(tag)
|
||||||
|
|
||||||
|
# If we run into a DTD or xml declarations inside the body ... bail.
|
||||||
|
if tname in SPECIAL_HANDLING_TAGS.keys() and tname != 'comment' and body_done:
|
||||||
|
htmlstr += '\n</body></html>'
|
||||||
|
break
|
||||||
|
|
||||||
|
# make sure self-closing tags actually self-close
|
||||||
|
if ttype == 'begin' and tname in SELF_CLOSING_TAGS:
|
||||||
|
ttype = 'single'
|
||||||
|
|
||||||
|
# make sure any end tags of self-closing tags are discarded
|
||||||
|
if ttype == 'end' and tname in SELF_CLOSING_TAGS:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# remove embedded guide and references from old mobis
|
||||||
|
if tname in ('guide', 'ncx', 'reference') and ttype in ('begin', 'single', 'single_ext'):
|
||||||
|
tname = 'removeme:{0}'.format(tname)
|
||||||
|
tattr = None
|
||||||
|
if tname in ('guide', 'ncx', 'reference', 'font', 'span') and ttype == 'end':
|
||||||
|
if self.path[-1] == 'removeme:{0}'.format(tname):
|
||||||
|
tname = 'removeme:{0}'.format(tname)
|
||||||
|
tattr = None
|
||||||
|
|
||||||
|
# Get rid of font tags that only have a color attribute.
|
||||||
|
if tname == 'font' and ttype in ('begin', 'single', 'single_ext'):
|
||||||
|
if 'color' in tattr.keys() and len(tattr.keys()) == 1:
|
||||||
|
tname = 'removeme:{0}'.format(tname)
|
||||||
|
tattr = None
|
||||||
|
|
||||||
|
# Get rid of empty spans in the markup.
|
||||||
|
if tname == 'span' and ttype in ('begin', 'single', 'single_ext') and not len(tattr):
|
||||||
|
tname = 'removeme:{0}'.format(tname)
|
||||||
|
|
||||||
|
# need to handle fonts outside of the normal methods
|
||||||
|
# so fonts tags won't be added to the self.path since we keep track
|
||||||
|
# of font tags separately with self.font_history
|
||||||
|
if tname == 'font' and ttype == 'begin':
|
||||||
|
# check for nested font start tags
|
||||||
|
if len(self.font_history) > 0 :
|
||||||
|
# inject a font end tag
|
||||||
|
taginfo = ('end', 'font', None)
|
||||||
|
htmlstr += self.processtag(taginfo)
|
||||||
|
self.font_history.append((ttype, tname, tattr))
|
||||||
|
# handle the current font start tag
|
||||||
|
taginfo = (ttype, tname, tattr)
|
||||||
|
htmlstr += self.processtag(taginfo)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# check for nested font tags and unnest them
|
||||||
|
if tname == 'font' and ttype == 'end':
|
||||||
|
self.font_history.pop()
|
||||||
|
# handle this font end tag
|
||||||
|
taginfo = ('end', 'font', None)
|
||||||
|
htmlstr += self.processtag(taginfo)
|
||||||
|
# check if we were nested
|
||||||
|
if len(self.font_history) > 0:
|
||||||
|
# inject a copy of the most recent font start tag from history
|
||||||
|
taginfo = self.font_history[-1]
|
||||||
|
htmlstr += self.processtag(taginfo)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# keep track of nesting path
|
||||||
|
if ttype == 'begin':
|
||||||
|
self.path.append(tname)
|
||||||
|
elif ttype == 'end':
|
||||||
|
if tname != self.path[-1]:
|
||||||
|
print ('improper nesting: ', self.path, tname, ttype)
|
||||||
|
if tname not in self.path:
|
||||||
|
# handle case of end tag with no beginning by injecting empty begin tag
|
||||||
|
taginfo = ('begin', tname, None)
|
||||||
|
htmlstr += self.processtag(taginfo)
|
||||||
|
print " - fixed by injecting empty start tag ", tname
|
||||||
|
self.path.append(tname)
|
||||||
|
elif len(self.path) > 1 and tname == self.path[-2]:
|
||||||
|
# handle case of dangling missing end
|
||||||
|
taginfo = ('end', self.path[-1], None)
|
||||||
|
htmlstr += self.processtag(taginfo)
|
||||||
|
print " - fixed by injecting end tag ", self.path[-1]
|
||||||
|
self.path.pop()
|
||||||
|
self.path.pop()
|
||||||
|
|
||||||
|
if tname == 'removeme:{0}'.format(tname):
|
||||||
|
if ttype in ('begin', 'single', 'single_ext'):
|
||||||
|
skip = True
|
||||||
|
else:
|
||||||
|
skip = False
|
||||||
|
else:
|
||||||
|
taginfo = (ttype, tname, tattr)
|
||||||
|
htmlstr += self.processtag(taginfo)
|
||||||
|
|
||||||
|
# handle potential issue of multiple html, head, and body sections
|
||||||
|
if tname == 'html' and ttype == 'begin' and not html_done:
|
||||||
|
htmlstr += '\n'
|
||||||
|
html_done = True
|
||||||
|
|
||||||
|
if tname == 'head' and ttype == 'begin' and not head_done:
|
||||||
|
htmlstr += '\n'
|
||||||
|
# also add in metadata and style link tags
|
||||||
|
htmlstr += self.meta
|
||||||
|
htmlstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
|
||||||
|
head_done = True
|
||||||
|
|
||||||
|
if tname == 'body' and ttype == 'begin' and not body_done:
|
||||||
|
htmlstr += '\n'
|
||||||
|
body_done = True
|
||||||
|
|
||||||
|
# handle issue of possibly missing html, head, and body tags
|
||||||
|
# I have not seen this but the original did something like this so ...
|
||||||
|
if not body_done:
|
||||||
|
htmlstr = '<body>\n' + htmlstr + '</body>\n'
|
||||||
|
if not head_done:
|
||||||
|
headstr = '<head>\n'
|
||||||
|
headstr += self.meta
|
||||||
|
headstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
|
||||||
|
headstr += '</head>\n'
|
||||||
|
htmlstr = headstr + htmlstr
|
||||||
|
if not html_done:
|
||||||
|
htmlstr = '<html>\n' + htmlstr + '</html>\n'
|
||||||
|
|
||||||
|
# finally add DOCTYPE info
|
||||||
|
htmlstr = '<?xml version="1.0"?>\n<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n' + htmlstr
|
||||||
|
|
||||||
|
css = self.base_css_rules
|
||||||
|
for cls, rule in self.tag_css_rules.items():
|
||||||
|
css += '.%s { %s }\n' % (cls, rule)
|
||||||
|
|
||||||
|
return (htmlstr, css, self.cssname)
|
||||||
|
|
||||||
|
def ensure_unit(self, raw, unit='px'):
|
||||||
|
if re.search(r'\d+$', raw) is not None:
|
||||||
|
raw += unit
|
||||||
|
return raw
|
||||||
|
|
||||||
|
# flatten possibly modified tag back to string
|
||||||
|
def taginfo_tostring(self, taginfo):
|
||||||
|
(ttype, tname, tattr) = taginfo
|
||||||
|
if ttype is None or tname is None:
|
||||||
|
return ''
|
||||||
|
if ttype == 'end':
|
||||||
|
return '</%s>' % tname
|
||||||
|
if ttype in SPECIAL_HANDLING_TYPES and tattr is not None and 'special' in tattr.keys():
|
||||||
|
info = tattr['special']
|
||||||
|
if ttype == 'comment':
    return '<%s %s-->' % (tname, info)
else:
    return '<%s %s>' % (tname, info)
|
||||||
|
res = []
|
||||||
|
res.append('<%s' % tname)
|
||||||
|
if tattr is not None:
|
||||||
|
for key in tattr.keys():
|
||||||
|
res.append(' %s="%s"' % (key, tattr[key]))
|
||||||
|
if ttype == 'single':
|
||||||
|
res.append('/>')
|
||||||
|
elif ttype == 'single_ext':
|
||||||
|
res.append(' />')
|
||||||
|
else :
|
||||||
|
res.append('>')
|
||||||
|
return "".join(res)
|
||||||
|
|
||||||
|
# routines to convert mobi ml tag attributes to xhtml attributes and styles
|
||||||
|
def processtag(self, taginfo):
|
||||||
|
# Converting mobi font sizes to numerics
|
||||||
|
size_map = {
|
||||||
|
'xx-small': '1',
|
||||||
|
'x-small': '2',
|
||||||
|
'small': '3',
|
||||||
|
'medium': '4',
|
||||||
|
'large': '5',
|
||||||
|
'x-large': '6',
|
||||||
|
'xx-large': '7',
|
||||||
|
}
|
||||||
|
|
||||||
|
size_to_em_map = {
|
||||||
|
'1': '.65em',
|
||||||
|
'2': '.75em',
|
||||||
|
'3': '1em',
|
||||||
|
'4': '1.125em',
|
||||||
|
'5': '1.25em',
|
||||||
|
'6': '1.5em',
|
||||||
|
'7': '2em',
|
||||||
|
}
|
||||||
|
|
||||||
|
# current tag to work on
|
||||||
|
(ttype, tname, tattr) = taginfo
|
||||||
|
if not tattr:
|
||||||
|
tattr = {}
|
||||||
|
|
||||||
|
styles = []
|
||||||
|
|
||||||
|
if tname is None or tname.startswith('removeme'):
|
||||||
|
return ''
|
||||||
|
|
||||||
|
# have not seen an example of this yet so keep it here to be safe
|
||||||
|
# until this is better understood
|
||||||
|
if tname in ('country-region', 'place', 'placetype', 'placename',
|
||||||
|
'state', 'city', 'street', 'address', 'content'):
|
||||||
|
tname = 'div' if tname == 'content' else 'span'
|
||||||
|
for key in list(tattr.keys()):
    tattr.pop(key)
|
||||||
|
|
||||||
|
# handle general case of style, height, width, bgcolor in any tag
|
||||||
|
if 'style' in tattr.keys():
|
||||||
|
style = tattr.pop('style').strip()
|
||||||
|
if style:
|
||||||
|
styles.append(style)
|
||||||
|
|
||||||
|
if 'align' in tattr.keys():
|
||||||
|
align = tattr.pop('align').strip()
|
||||||
|
if align:
|
||||||
|
if tname in ('table', 'td', 'tr'):
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
styles.append('text-align: %s' % align)
|
||||||
|
|
||||||
|
if 'height' in tattr.keys():
|
||||||
|
height = tattr.pop('height').strip()
|
||||||
|
if height and '<' not in height and '>' not in height and re.search(r'\d+', height):
|
||||||
|
if tname in ('table', 'td', 'tr'):
|
||||||
|
pass
|
||||||
|
elif tname == 'img':
|
||||||
|
tattr['height'] = height
|
||||||
|
else:
|
||||||
|
styles.append('margin-top: %s' % self.ensure_unit(height))
|
||||||
|
|
||||||
|
if 'width' in tattr.keys():
|
||||||
|
width = tattr.pop('width').strip()
|
||||||
|
if width and re.search(r'\d+', width):
|
||||||
|
if tname in ('table', 'td', 'tr'):
|
||||||
|
pass
|
||||||
|
elif tname == 'img':
|
||||||
|
tattr['width'] = width
|
||||||
|
else:
|
||||||
|
styles.append('text-indent: %s' % self.ensure_unit(width))
|
||||||
|
if width.startswith('-'):
|
||||||
|
styles.append('margin-left: %s' % self.ensure_unit(width[1:]))
|
||||||
|
|
||||||
|
if 'bgcolor' in tattr.keys():
|
||||||
|
# no proprietary html allowed
|
||||||
|
if tname == 'div':
|
||||||
|
del tattr['bgcolor']
|
||||||
|
|
||||||
|
elif tname == 'font':
|
||||||
|
# Change font tags to span tags
|
||||||
|
tname = 'span'
|
||||||
|
if ttype in ('begin', 'single', 'single_ext'):
|
||||||
|
# move the face attribute to css font-family
|
||||||
|
if 'face' in tattr.keys():
|
||||||
|
face = tattr.pop('face').strip()
|
||||||
|
styles.append('font-family: "%s"' % face)
|
||||||
|
|
||||||
|
# Monitor the constantly changing font sizes, change them to ems and move
|
||||||
|
# them to css. The following will work for 'flat' font tags, but nested font tags
|
||||||
|
# will cause things to go wonky. Need to revert to the parent font tag's size
|
||||||
|
# when a closing tag is encountered.
|
||||||
|
if 'size' in tattr.keys():
|
||||||
|
sz = tattr.pop('size').strip().lower()
|
||||||
|
try:
|
||||||
|
float(sz)
|
||||||
|
except ValueError:
|
||||||
|
if sz in size_map.keys():
|
||||||
|
sz = size_map[sz]
|
||||||
|
else:
|
||||||
|
if sz.startswith('-') or sz.startswith('+'):
|
||||||
|
sz = self.current_font_size + float(sz)
|
||||||
|
if sz > 7:
|
||||||
|
sz = 7
|
||||||
|
elif sz < 1:
|
||||||
|
sz = 1
|
||||||
|
sz = str(int(sz))
|
||||||
|
styles.append('font-size: %s' % size_to_em_map[sz])
|
||||||
|
self.current_font_size = int(sz)
|
||||||
|
|
||||||
|
elif tname == 'img':
|
||||||
|
for attr in ('width', 'height'):
|
||||||
|
if attr in tattr:
|
||||||
|
val = tattr[attr]
|
||||||
|
if val.lower().endswith('em'):
|
||||||
|
try:
|
||||||
|
nval = float(val[:-2])
|
||||||
|
nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile
|
||||||
|
tattr[attr] = "%dpx"%int(nval)
|
||||||
|
except:
|
||||||
|
del tattr[attr]
|
||||||
|
elif val.lower().endswith('%'):
|
||||||
|
del tattr[attr]
|
||||||
|
|
||||||
|
# convert the anchor tags
|
||||||
|
if 'filepos-id' in tattr:
|
||||||
|
tattr['id'] = tattr.pop('filepos-id')
|
||||||
|
if 'name' in tattr and tattr['name'] != tattr['id']:
|
||||||
|
tattr['name'] = tattr['id']
|
||||||
|
|
||||||
|
if 'filepos' in tattr:
|
||||||
|
filepos = tattr.pop('filepos')
|
||||||
|
try:
|
||||||
|
tattr['href'] = "#filepos%d" % int(filepos)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if styles:
|
||||||
|
ncls = None
|
||||||
|
rule = '; '.join(styles)
|
||||||
|
for sel, srule in self.tag_css_rules.items():
|
||||||
|
if srule == rule:
|
||||||
|
ncls = sel
|
||||||
|
break
|
||||||
|
if ncls is None:
|
||||||
|
self.tag_css_rule_cnt += 1
|
||||||
|
ncls = 'rule_%d' % self.tag_css_rule_cnt
|
||||||
|
self.tag_css_rules[ncls] = rule
|
||||||
|
cls = tattr.get('class', '')
|
||||||
|
cls = cls + (' ' if cls else '') + ncls
|
||||||
|
tattr['class'] = cls
|
||||||
|
|
||||||
|
# convert updated tag back to string representation
|
||||||
|
if len(tattr) == 0:
|
||||||
|
tattr = None
|
||||||
|
taginfo = (ttype, tname, tattr)
|
||||||
|
return self.taginfo_tostring(taginfo)
|
||||||
|
|
||||||
|
''' main only left in for testing outside of plugin '''
|
||||||
|
|
||||||
|
def main(argv=sys.argv):
|
||||||
|
if len(argv) != 2:
|
||||||
|
return 1
|
||||||
|
else:
|
||||||
|
infile = argv[1]
|
||||||
|
|
||||||
|
try:
    print('Converting Mobi Markup Language to XHTML')
    mlc = MobiMLConverter(infile)
    print('Processing ...')
    htmlstr, css, cssname = mlc.processml()
    outname = infile.rsplit('.',1)[0] + '_converted.html'
    open(outname, 'wb').write(htmlstr)
    open(cssname, 'wb').write(css)
    print('Completed')
    print('XHTML version of book can be found at: ' + outname)

except ValueError as e:
    print("Error: %s" % e)
    return 1
|
||||||
|
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
93
KindleUnpack/unipath.py
Executable file
@@ -0,0 +1,93 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
# are permitted provided that the following conditions are met:
|
||||||
|
#
|
||||||
|
# 1. Redistributions of source code must retain the above copyright notice, this list of
|
||||||
|
# conditions and the following disclaimer.
|
||||||
|
#
|
||||||
|
# 2. Redistributions in binary form must reproduce the above copyright notice, this list
|
||||||
|
# of conditions and the following disclaimer in the documentation and/or other materials
|
||||||
|
# provided with the distribution.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
|
||||||
|
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
|
||||||
|
# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||||
|
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||||
|
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||||
|
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
|
||||||
|
# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
from .compatibility_utils import PY2, text_type, binary_type
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
# utility routines to convert all paths to be full unicode
|
||||||
|
|
||||||
|
# Under Python 2, if a bytestring, try to convert it to unicode using sys.getfilesystemencoding
|
||||||
|
# Under Python 3, if bytes, try to convert it to unicode by decoding it with the filesystem encoding
|
||||||
|
|
||||||
|
# Mac OS X and Windows will happily support full unicode paths
|
||||||
|
# Linux can support full unicode paths but allows arbitrary byte paths which may be inconsistent with unicode
|
||||||
|
|
||||||
|
fsencoding = sys.getfilesystemencoding()
|
||||||
|
|
||||||
|
def pathof(s, enc=fsencoding):
|
||||||
|
if s is None:
|
||||||
|
return None
|
||||||
|
if isinstance(s, text_type):
|
||||||
|
return s
|
||||||
|
if isinstance(s, binary_type):
|
||||||
|
try:
|
||||||
|
return s.decode(enc)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return s
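For example (assuming a filesystem encoding that can represent the bytes):

print(pathof(b'KindleUnpack/book.mobi'))  # 'KindleUnpack/book.mobi' as unicode text
print(pathof(u'already_unicode.epub'))    # text input is returned unchanged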
|
||||||
|
|
||||||
|
def exists(s):
|
||||||
|
return os.path.exists(pathof(s))
|
||||||
|
|
||||||
|
def isfile(s):
|
||||||
|
return os.path.isfile(pathof(s))
|
||||||
|
|
||||||
|
def isdir(s):
|
||||||
|
return os.path.isdir(pathof(s))
|
||||||
|
|
||||||
|
def mkdir(s):
|
||||||
|
return os.mkdir(pathof(s))
|
||||||
|
|
||||||
|
def listdir(s):
|
||||||
|
rv = []
|
||||||
|
for file in os.listdir(pathof(s)):
|
||||||
|
rv.append(pathof(file))
|
||||||
|
return rv
|
||||||
|
|
||||||
|
def getcwd():
|
||||||
|
if PY2:
|
||||||
|
return os.getcwdu()
|
||||||
|
return os.getcwd()
|
||||||
|
|
||||||
|
def walk(top):
|
||||||
|
top = pathof(top)
|
||||||
|
rv = []
|
||||||
|
for base, dnames, names in os.walk(top):
|
||||||
|
base = pathof(base)
|
||||||
|
for name in names:
|
||||||
|
name = pathof(name)
|
||||||
|
rv.append(relpath(os.path.join(base, name), top))
|
||||||
|
return rv
|
||||||
|
|
||||||
|
def relpath(path, start=None):
|
||||||
|
return os.path.relpath(pathof(path) , pathof(start))
|
||||||
|
|
||||||
|
def abspath(path):
|
||||||
|
return os.path.abspath(pathof(path))
|
167
KindleUnpack/unpack_structure.py
Normal file
@@ -0,0 +1,167 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import text_type
|
||||||
|
|
||||||
|
from . import unipath
|
||||||
|
from .unipath import pathof
|
||||||
|
|
||||||
|
DUMP = False
|
||||||
|
""" Set to True to dump all possible information. """
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
import re
|
||||||
|
# note: re requires the pattern to be the exact same type as the data to be searched in python3
|
||||||
|
# but u"" is not allowed for the pattern itself only b""
|
||||||
|
|
||||||
|
import zipfile
|
||||||
|
import binascii
|
||||||
|
from .mobi_utils import mangle_fonts
|
||||||
|
|
||||||
|
class unpackException(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class ZipInfo(zipfile.ZipInfo):
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
if 'compress_type' in kwargs:
|
||||||
|
compress_type = kwargs.pop('compress_type')
|
||||||
|
super(ZipInfo, self).__init__(*args, **kwargs)
|
||||||
|
self.compress_type = compress_type
|
||||||
|
|
||||||
|
class fileNames:
|
||||||
|
|
||||||
|
def __init__(self, infile, outdir):
|
||||||
|
self.infile = infile
|
||||||
|
self.outdir = outdir
|
||||||
|
if not unipath.exists(self.outdir):
|
||||||
|
unipath.mkdir(self.outdir)
|
||||||
|
self.mobi7dir = os.path.join(self.outdir,'mobi7')
|
||||||
|
if not unipath.exists(self.mobi7dir):
|
||||||
|
unipath.mkdir(self.mobi7dir)
|
||||||
|
self.imgdir = os.path.join(self.mobi7dir, 'Images')
|
||||||
|
if not unipath.exists(self.imgdir):
|
||||||
|
unipath.mkdir(self.imgdir)
|
||||||
|
self.hdimgdir = os.path.join(self.outdir,'HDImages')
|
||||||
|
if not unipath.exists(self.hdimgdir):
|
||||||
|
unipath.mkdir(self.hdimgdir)
|
||||||
|
self.outbase = os.path.join(self.outdir, os.path.splitext(os.path.split(infile)[1])[0])
|
||||||
|
|
||||||
|
def getInputFileBasename(self):
|
||||||
|
return os.path.splitext(os.path.basename(self.infile))[0]
|
||||||
|
|
||||||
|
def makeK8Struct(self):
|
||||||
|
self.k8dir = os.path.join(self.outdir,'mobi8')
|
||||||
|
if not unipath.exists(self.k8dir):
|
||||||
|
unipath.mkdir(self.k8dir)
|
||||||
|
self.k8metainf = os.path.join(self.k8dir,'META-INF')
|
||||||
|
if not unipath.exists(self.k8metainf):
|
||||||
|
unipath.mkdir(self.k8metainf)
|
||||||
|
self.k8oebps = os.path.join(self.k8dir,'OEBPS')
|
||||||
|
if not unipath.exists(self.k8oebps):
|
||||||
|
unipath.mkdir(self.k8oebps)
|
||||||
|
self.k8images = os.path.join(self.k8oebps,'Images')
|
||||||
|
if not unipath.exists(self.k8images):
|
||||||
|
unipath.mkdir(self.k8images)
|
||||||
|
self.k8fonts = os.path.join(self.k8oebps,'Fonts')
|
||||||
|
if not unipath.exists(self.k8fonts):
|
||||||
|
unipath.mkdir(self.k8fonts)
|
||||||
|
self.k8styles = os.path.join(self.k8oebps,'Styles')
|
||||||
|
if not unipath.exists(self.k8styles):
|
||||||
|
unipath.mkdir(self.k8styles)
|
||||||
|
self.k8text = os.path.join(self.k8oebps,'Text')
|
||||||
|
if not unipath.exists(self.k8text):
|
||||||
|
unipath.mkdir(self.k8text)
|
||||||
|
|
||||||
|
# recursive zip creation support routine
|
||||||
|
def zipUpDir(self, myzip, tdir, localname):
|
||||||
|
currentdir = tdir
|
||||||
|
if localname != "":
|
||||||
|
currentdir = os.path.join(currentdir,localname)
|
||||||
|
list = unipath.listdir(currentdir)
|
||||||
|
for file in list:
|
||||||
|
afilename = file
|
||||||
|
localfilePath = os.path.join(localname, afilename)
|
||||||
|
realfilePath = os.path.join(currentdir,file)
|
||||||
|
if unipath.isfile(realfilePath):
|
||||||
|
myzip.write(pathof(realfilePath), pathof(localfilePath), zipfile.ZIP_DEFLATED)
|
||||||
|
elif unipath.isdir(realfilePath):
|
||||||
|
self.zipUpDir(myzip, tdir, localfilePath)
|
||||||
|
|
||||||
|
def makeEPUB(self, usedmap, obfuscate_data, uid):
|
||||||
|
bname = os.path.join(self.k8dir, self.getInputFileBasename() + '.epub')
|
||||||
|
# Create an encryption key for Adobe font obfuscation
|
||||||
|
# based on the epub's uid
|
||||||
|
if isinstance(uid,text_type):
|
||||||
|
uid = uid.encode('ascii')
|
||||||
|
if obfuscate_data:
|
||||||
|
key = re.sub(br'[^a-fA-F0-9]', b'', uid)
|
||||||
|
key = binascii.unhexlify((key + key)[:32])
|
||||||
|
|
||||||
|
# copy over all images and fonts that are actually used in the ebook
|
||||||
|
# and remove all font files from mobi7 since not supported
|
||||||
|
imgnames = unipath.listdir(self.imgdir)
|
||||||
|
for name in imgnames:
|
||||||
|
if usedmap.get(name,'not used') == 'used':
|
||||||
|
filein = os.path.join(self.imgdir,name)
|
||||||
|
if name.endswith(".ttf"):
|
||||||
|
fileout = os.path.join(self.k8fonts,name)
|
||||||
|
elif name.endswith(".otf"):
|
||||||
|
fileout = os.path.join(self.k8fonts,name)
|
||||||
|
elif name.endswith(".failed"):
|
||||||
|
fileout = os.path.join(self.k8fonts,name)
|
||||||
|
else:
|
||||||
|
fileout = os.path.join(self.k8images,name)
|
||||||
|
data = b''
|
||||||
|
with open(pathof(filein),'rb') as f:
|
||||||
|
data = f.read()
|
||||||
|
if obfuscate_data:
|
||||||
|
if name in obfuscate_data:
|
||||||
|
data = mangle_fonts(key, data)
|
||||||
|
open(pathof(fileout),'wb').write(data)
|
||||||
|
if name.endswith(".ttf") or name.endswith(".otf"):
|
||||||
|
os.remove(pathof(filein))
|
||||||
|
|
||||||
|
# opf file name hard coded to "content.opf"
|
||||||
|
container = '<?xml version="1.0" encoding="UTF-8"?>\n'
|
||||||
|
container += '<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n'
|
||||||
|
container += ' <rootfiles>\n'
|
||||||
|
container += '<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>'
|
||||||
|
container += ' </rootfiles>\n</container>\n'
|
||||||
|
fileout = os.path.join(self.k8metainf,'container.xml')
|
||||||
|
with open(pathof(fileout),'wb') as f:
|
||||||
|
f.write(container.encode('utf-8'))
|
||||||
|
|
||||||
|
if obfuscate_data:
|
||||||
|
encryption = '<encryption xmlns="urn:oasis:names:tc:opendocument:xmlns:container" \
|
||||||
|
xmlns:enc="http://www.w3.org/2001/04/xmlenc#" xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">\n'
|
||||||
|
for font in obfuscate_data:
|
||||||
|
encryption += ' <enc:EncryptedData>\n'
|
||||||
|
encryption += ' <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>\n'
|
||||||
|
encryption += ' <enc:CipherData>\n'
|
||||||
|
encryption += ' <enc:CipherReference URI="OEBPS/Fonts/' + font + '"/>\n'
|
||||||
|
encryption += ' </enc:CipherData>\n'
|
||||||
|
encryption += ' </enc:EncryptedData>\n'
|
||||||
|
encryption += '</encryption>\n'
|
||||||
|
fileout = os.path.join(self.k8metainf,'encryption.xml')
|
||||||
|
with open(pathof(fileout),'wb') as f:
|
||||||
|
f.write(encryption.encode('utf-8'))
|
||||||
|
|
||||||
|
# ready to build epub
|
||||||
|
self.outzip = zipfile.ZipFile(pathof(bname), 'w')
|
||||||
|
|
||||||
|
# add the mimetype file uncompressed
|
||||||
|
mimetype = b'application/epub+zip'
|
||||||
|
fileout = os.path.join(self.k8dir,'mimetype')
|
||||||
|
with open(pathof(fileout),'wb') as f:
|
||||||
|
f.write(mimetype)
|
||||||
|
nzinfo = ZipInfo('mimetype', compress_type=zipfile.ZIP_STORED)
|
||||||
|
nzinfo.external_attr = 0o600 << 16 # make this a normal file
|
||||||
|
self.outzip.writestr(nzinfo, mimetype)
|
||||||
|
self.zipUpDir(self.outzip,self.k8dir,'META-INF')
|
||||||
|
self.zipUpDir(self.outzip,self.k8dir,'OEBPS')
|
||||||
|
self.outzip.close()
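A minimal driver sketch for this class: it creates the mobi8 directory skeleton and then zips a bare-bones epub under out/mobi8/. The paths, the used-image map, the empty obfuscation list, and the uid string are placeholders; in real use they come from the rest of the unpacker:

files = fileNames('book.mobi', 'out')
files.makeK8Struct()
# ... the unpacker would now write images, fonts, text and OEBPS/content.opf
#     under files.k8oebps before zipping ...
usedmap = {'image00001.jpeg': 'used'}
files.makeEPUB(usedmap, [], '0123456789abcdef0123456789abcdef')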
|