#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function

DEBUG_USE_ORDERED_DICTIONARY = False  # OrderedDict is supported on python >= 2.7.
""" set to True to use OrderedDict for MobiHeader.metadata."""

if DEBUG_USE_ORDERED_DICTIONARY:
    from collections import OrderedDict as dict_
else:
    dict_ = dict

from .compatibility_utils import PY2, unicode_str, hexlify, bord

if PY2:
    range = xrange

import struct
import uuid

# import the mobiunpack support libraries
from .mobi_utils import getLanguage
from .mobi_uncompress import HuffcdicReader, PalmdocReader, UncompressedReader

class unpackException(Exception):
    pass


def sortedHeaderKeys(mheader):
    hdrkeys = sorted(list(mheader.keys()), key=lambda akey: mheader[akey][0])
    return hdrkeys


# HD Containers have their own headers and their own EXTH
# this is just guesswork so far, making the big assumption that
# metavalue key numbers remain the same in the CONT EXTH

# Note: The layout of the CONT Header is still unknown
# so just deal with their EXTH sections for now

def dump_contexth(cpage, extheader):
    # determine text encoding
    codec = 'windows-1252'
    codec_map = {
        1252 : 'windows-1252',
        65001: 'utf-8',
    }
    if cpage in codec_map:
        codec = codec_map[cpage]
    if extheader == b'':
        return
    id_map_strings = {
        1 : 'Drm Server Id',
        2 : 'Drm Commerce Id',
        3 : 'Drm Ebookbase Book Id',
        4 : 'Drm Ebookbase Dep Id',
        100 : 'Creator',
        101 : 'Publisher',
        102 : 'Imprint',
        103 : 'Description',
        104 : 'ISBN',
        105 : 'Subject',
        106 : 'Published',
        107 : 'Review',
        108 : 'Contributor',
        109 : 'Rights',
        110 : 'SubjectCode',
        111 : 'Type',
        112 : 'Source',
        113 : 'ASIN',
        114 : 'versionNumber',
        117 : 'Adult',
        118 : 'Retail-Price',
        119 : 'Retail-Currency',
        120 : 'TSC',
        122 : 'fixed-layout',
        123 : 'book-type',
        124 : 'orientation-lock',
        126 : 'original-resolution',
        127 : 'zero-gutter',
        128 : 'zero-margin',
        129 : 'MetadataResourceURI',
        132 : 'RegionMagnification',
        150 : 'LendingEnabled',
        200 : 'DictShortName',
        501 : 'cdeType',
        502 : 'last_update_time',
        503 : 'Updated_Title',
        504 : 'CDEContentKey',
        505 : 'AmazonContentReference',
        506 : 'Title-Language',
        507 : 'Title-Display-Direction',
        508 : 'Title-Pronunciation',
        509 : 'Title-Collation',
        510 : 'Secondary-Title',
        511 : 'Secondary-Title-Language',
        512 : 'Secondary-Title-Direction',
        513 : 'Secondary-Title-Pronunciation',
        514 : 'Secondary-Title-Collation',
        515 : 'Author-Language',
        516 : 'Author-Display-Direction',
        517 : 'Author-Pronunciation',
        518 : 'Author-Collation',
        519 : 'Author-Type',
        520 : 'Publisher-Language',
        521 : 'Publisher-Display-Direction',
        522 : 'Publisher-Pronunciation',
        523 : 'Publisher-Collation',
        524 : 'Content-Language-Tag',
        525 : 'primary-writing-mode',
        526 : 'NCX-Ingested-By-Software',
        527 : 'page-progression-direction',
        528 : 'override-kindle-fonts',
        529 : 'Compression-Upgraded',
        530 : 'Soft-Hyphens-In-Content',
        531 : 'Dictionary_In_Langague',
        532 : 'Dictionary_Out_Language',
        533 : 'Font_Converted',
        534 : 'Amazon_Creator_Info',
        535 : 'Creator-Build-Tag',
        536 : 'HD-Media-Containers-Info',  # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?)
        538 : 'Resource-Container-Fidelity',
        539 : 'HD-Container-Mimetype',
        540 : 'Sample-For_Special-Purpose',
        541 : 'Kindletool-Operation-Information',
        542 : 'Container_Id',
        543 : 'Asset-Type',  # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER
        544 : 'Unknown_544',
    }
    id_map_values = {
        115 : 'sample',
        116 : 'StartOffset',
        121 : 'Mobi8-Boundary-Section',
        125 : 'Embedded-Record-Count',
        130 : 'Offline-Sample',
        131 : 'Metadata-Record-Offset',
        201 : 'CoverOffset',
        202 : 'ThumbOffset',
        203 : 'HasFakeCover',
        204 : 'Creator-Software',
        205 : 'Creator-Major-Version',
        206 : 'Creator-Minor-Version',
        207 : 'Creator-Build-Number',
        401 : 'Clipping-Limit',
        402 : 'Publisher-Limit',
        404 : 'Text-to-Speech-Disabled',
        406 : 'Rental-Expiration-Time',
    }
    id_map_hexstrings = {
        208 : 'Watermark_(hex)',
        209 : 'Tamper-Proof-Keys_(hex)',
        300 : 'Font-Signature_(hex)',
        403 : 'Unknown_(403)_(hex)',
        405 : 'Ownership-Type_(hex)',
        407 : 'Unknown_(407)_(hex)',
        420 : 'Multimedia-Content-Reference_(hex)',
        450 : 'Locations_Match_(hex)',
        451 : 'Full-Story-Length_(hex)',
        452 : 'Sample-Start_Location_(hex)',
        453 : 'Sample-End-Location_(hex)',
    }
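    # EXTH layout, as unpacked below: bytes 0-3 hold the EXTH magic, bytes 4-7 the header
    # length, bytes 8-11 the record count; each record is a 4-byte id and a 4-byte size
    # (which includes the 8-byte record header itself) followed by size - 8 bytes of data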
    _length, num_items = struct.unpack(b'>LL', extheader[4:12])
    extheader = extheader[12:]
    pos = 0
    for _ in range(num_items):
        id, size = struct.unpack(b'>LL', extheader[pos:pos+8])
        content = extheader[pos + 8: pos + size]
        if id in id_map_strings:
            name = id_map_strings[id]
            print('\n Key: "%s"\n Value: "%s"' % (name, content.decode(codec, errors='replace')))
        elif id in id_map_values:
            name = id_map_values[id]
            if size == 9:
                value, = struct.unpack(b'B',content)
                print('\n Key: "%s"\n Value: 0x%01x' % (name, value))
            elif size == 10:
                value, = struct.unpack(b'>H',content)
                print('\n Key: "%s"\n Value: 0x%02x' % (name, value))
            elif size == 12:
                value, = struct.unpack(b'>L',content)
                print('\n Key: "%s"\n Value: 0x%04x' % (name, value))
            else:
                print("\nError: Value for %s has unexpected size of %s" % (name, size))
        elif id in id_map_hexstrings:
            name = id_map_hexstrings[id]
            print('\n Key: "%s"\n Value: 0x%s' % (name, hexlify(content)))
        else:
            print("\nWarning: Unknown metadata with id %s found" % id)
            name = str(id) + ' (hex)'
            print(' Key: "%s"\n Value: 0x%s' % (name, hexlify(content)))
        pos += size
    return


class MobiHeader:
    # all values are packed in big endian format
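    # each entry below maps a field name to (offset into the header, struct format, size in bytes);
    # dumpheader() reads a field with struct.unpack_from(format, self.header, offset)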
    palmdoc_header = {
        'compression_type' : (0x00, b'>H', 2),
        'fill0' : (0x02, b'>H', 2),
        'text_length' : (0x04, b'>L', 4),
        'text_records' : (0x08, b'>H', 2),
        'max_section_size' : (0x0a, b'>H', 2),
        'read_pos ' : (0x0c, b'>L', 4),
    }

    mobi6_header = {
        'compression_type' : (0x00, b'>H', 2),
        'fill0' : (0x02, b'>H', 2),
        'text_length' : (0x04, b'>L', 4),
        'text_records' : (0x08, b'>H', 2),
        'max_section_size' : (0x0a, b'>H', 2),
        'crypto_type' : (0x0c, b'>H', 2),
        'fill1' : (0x0e, b'>H', 2),
        'magic' : (0x10, b'4s', 4),
        'header_length (from MOBI)' : (0x14, b'>L', 4),
        'type' : (0x18, b'>L', 4),
        'codepage' : (0x1c, b'>L', 4),
        'unique_id' : (0x20, b'>L', 4),
        'version' : (0x24, b'>L', 4),
        'metaorthindex' : (0x28, b'>L', 4),
        'metainflindex' : (0x2c, b'>L', 4),
        'index_names' : (0x30, b'>L', 4),
        'index_keys' : (0x34, b'>L', 4),
        'extra_index0' : (0x38, b'>L', 4),
        'extra_index1' : (0x3c, b'>L', 4),
        'extra_index2' : (0x40, b'>L', 4),
        'extra_index3' : (0x44, b'>L', 4),
        'extra_index4' : (0x48, b'>L', 4),
        'extra_index5' : (0x4c, b'>L', 4),
        'first_nontext' : (0x50, b'>L', 4),
        'title_offset' : (0x54, b'>L', 4),
        'title_length' : (0x58, b'>L', 4),
        'language_code' : (0x5c, b'>L', 4),
        'dict_in_lang' : (0x60, b'>L', 4),
        'dict_out_lang' : (0x64, b'>L', 4),
        'min_version' : (0x68, b'>L', 4),
        'first_resc_offset' : (0x6c, b'>L', 4),
        'huff_offset' : (0x70, b'>L', 4),
        'huff_num' : (0x74, b'>L', 4),
        'huff_tbl_offset' : (0x78, b'>L', 4),
        'huff_tbl_len' : (0x7c, b'>L', 4),
        'exth_flags' : (0x80, b'>L', 4),
        'fill3_a' : (0x84, b'>L', 4),
        'fill3_b' : (0x88, b'>L', 4),
        'fill3_c' : (0x8c, b'>L', 4),
        'fill3_d' : (0x90, b'>L', 4),
        'fill3_e' : (0x94, b'>L', 4),
        'fill3_f' : (0x98, b'>L', 4),
        'fill3_g' : (0x9c, b'>L', 4),
        'fill3_h' : (0xa0, b'>L', 4),
        'unknown0' : (0xa4, b'>L', 4),
        'drm_offset' : (0xa8, b'>L', 4),
        'drm_count' : (0xac, b'>L', 4),
        'drm_size' : (0xb0, b'>L', 4),
        'drm_flags' : (0xb4, b'>L', 4),
        'fill4_a' : (0xb8, b'>L', 4),
        'fill4_b' : (0xbc, b'>L', 4),
        'first_content' : (0xc0, b'>H', 2),
        'last_content' : (0xc2, b'>H', 2),
        'unknown0' : (0xc4, b'>L', 4),
        'fcis_offset' : (0xc8, b'>L', 4),
        'fcis_count' : (0xcc, b'>L', 4),
        'flis_offset' : (0xd0, b'>L', 4),
        'flis_count' : (0xd4, b'>L', 4),
        'unknown1' : (0xd8, b'>L', 4),
        'unknown2' : (0xdc, b'>L', 4),
        'srcs_offset' : (0xe0, b'>L', 4),
        'srcs_count' : (0xe4, b'>L', 4),
        'unknown3' : (0xe8, b'>L', 4),
        'unknown4' : (0xec, b'>L', 4),
        'fill5' : (0xf0, b'>H', 2),
        'traildata_flags' : (0xf2, b'>H', 2),
        'ncx_index' : (0xf4, b'>L', 4),
        'unknown5' : (0xf8, b'>L', 4),
        'unknown6' : (0xfc, b'>L', 4),
        'datp_offset' : (0x100, b'>L', 4),
        'unknown7' : (0x104, b'>L', 4),
        'Unknown ' : (0x108, b'>L', 4),
        'Unknown ' : (0x10C, b'>L', 4),
        'Unknown ' : (0x110, b'>L', 4),
        'Unknown ' : (0x114, b'>L', 4),
        'Unknown ' : (0x118, b'>L', 4),
        'Unknown ' : (0x11C, b'>L', 4),
        'Unknown ' : (0x120, b'>L', 4),
        'Unknown ' : (0x124, b'>L', 4),
        'Unknown ' : (0x128, b'>L', 4),
        'Unknown ' : (0x12C, b'>L', 4),
        'Unknown ' : (0x130, b'>L', 4),
        'Unknown ' : (0x134, b'>L', 4),
        'Unknown ' : (0x138, b'>L', 4),
        'Unknown ' : (0x11C, b'>L', 4),
    }

    mobi8_header = {
        'compression_type' : (0x00, b'>H', 2),
        'fill0' : (0x02, b'>H', 2),
        'text_length' : (0x04, b'>L', 4),
        'text_records' : (0x08, b'>H', 2),
        'max_section_size' : (0x0a, b'>H', 2),
        'crypto_type' : (0x0c, b'>H', 2),
        'fill1' : (0x0e, b'>H', 2),
        'magic' : (0x10, b'4s', 4),
        'header_length (from MOBI)' : (0x14, b'>L', 4),
        'type' : (0x18, b'>L', 4),
        'codepage' : (0x1c, b'>L', 4),
        'unique_id' : (0x20, b'>L', 4),
        'version' : (0x24, b'>L', 4),
        'metaorthindex' : (0x28, b'>L', 4),
        'metainflindex' : (0x2c, b'>L', 4),
        'index_names' : (0x30, b'>L', 4),
        'index_keys' : (0x34, b'>L', 4),
        'extra_index0' : (0x38, b'>L', 4),
        'extra_index1' : (0x3c, b'>L', 4),
        'extra_index2' : (0x40, b'>L', 4),
        'extra_index3' : (0x44, b'>L', 4),
        'extra_index4' : (0x48, b'>L', 4),
        'extra_index5' : (0x4c, b'>L', 4),
        'first_nontext' : (0x50, b'>L', 4),
        'title_offset' : (0x54, b'>L', 4),
        'title_length' : (0x58, b'>L', 4),
        'language_code' : (0x5c, b'>L', 4),
        'dict_in_lang' : (0x60, b'>L', 4),
        'dict_out_lang' : (0x64, b'>L', 4),
        'min_version' : (0x68, b'>L', 4),
        'first_resc_offset' : (0x6c, b'>L', 4),
        'huff_offset' : (0x70, b'>L', 4),
        'huff_num' : (0x74, b'>L', 4),
        'huff_tbl_offset' : (0x78, b'>L', 4),
        'huff_tbl_len' : (0x7c, b'>L', 4),
        'exth_flags' : (0x80, b'>L', 4),
        'fill3_a' : (0x84, b'>L', 4),
        'fill3_b' : (0x88, b'>L', 4),
        'fill3_c' : (0x8c, b'>L', 4),
        'fill3_d' : (0x90, b'>L', 4),
        'fill3_e' : (0x94, b'>L', 4),
        'fill3_f' : (0x98, b'>L', 4),
        'fill3_g' : (0x9c, b'>L', 4),
        'fill3_h' : (0xa0, b'>L', 4),
        'unknown0' : (0xa4, b'>L', 4),
        'drm_offset' : (0xa8, b'>L', 4),
        'drm_count' : (0xac, b'>L', 4),
        'drm_size' : (0xb0, b'>L', 4),
        'drm_flags' : (0xb4, b'>L', 4),
        'fill4_a' : (0xb8, b'>L', 4),
        'fill4_b' : (0xbc, b'>L', 4),
        'fdst_offset' : (0xc0, b'>L', 4),
        'fdst_flow_count' : (0xc4, b'>L', 4),
        'fcis_offset' : (0xc8, b'>L', 4),
        'fcis_count' : (0xcc, b'>L', 4),
        'flis_offset' : (0xd0, b'>L', 4),
        'flis_count' : (0xd4, b'>L', 4),
        'unknown1' : (0xd8, b'>L', 4),
        'unknown2' : (0xdc, b'>L', 4),
        'srcs_offset' : (0xe0, b'>L', 4),
        'srcs_count' : (0xe4, b'>L', 4),
        'unknown3' : (0xe8, b'>L', 4),
        'unknown4' : (0xec, b'>L', 4),
        'fill5' : (0xf0, b'>H', 2),
        'traildata_flags' : (0xf2, b'>H', 2),
        'ncx_index' : (0xf4, b'>L', 4),
        'fragment_index' : (0xf8, b'>L', 4),
        'skeleton_index' : (0xfc, b'>L', 4),
        'datp_offset' : (0x100, b'>L', 4),
        'guide_index' : (0x104, b'>L', 4),
        'Unknown ' : (0x108, b'>L', 4),
        'Unknown ' : (0x10C, b'>L', 4),
        'Unknown ' : (0x110, b'>L', 4),
        'Unknown ' : (0x114, b'>L', 4),
        'Unknown ' : (0x118, b'>L', 4),
        'Unknown ' : (0x11C, b'>L', 4),
        'Unknown ' : (0x120, b'>L', 4),
        'Unknown ' : (0x124, b'>L', 4),
        'Unknown ' : (0x128, b'>L', 4),
        'Unknown ' : (0x12C, b'>L', 4),
        'Unknown ' : (0x130, b'>L', 4),
        'Unknown ' : (0x134, b'>L', 4),
        'Unknown ' : (0x138, b'>L', 4),
        'Unknown ' : (0x11C, b'>L', 4),
    }

    palmdoc_header_sorted_keys = sortedHeaderKeys(palmdoc_header)
    mobi6_header_sorted_keys = sortedHeaderKeys(mobi6_header)
    mobi8_header_sorted_keys = sortedHeaderKeys(mobi8_header)

    id_map_strings = {
        1 : 'Drm Server Id',
        2 : 'Drm Commerce Id',
        3 : 'Drm Ebookbase Book Id',
        4 : 'Drm Ebookbase Dep Id',
        100 : 'Creator',
        101 : 'Publisher',
        102 : 'Imprint',
        103 : 'Description',
        104 : 'ISBN',
        105 : 'Subject',
        106 : 'Published',
        107 : 'Review',
        108 : 'Contributor',
        109 : 'Rights',
        110 : 'SubjectCode',
        111 : 'Type',
        112 : 'Source',
        113 : 'ASIN',
        114 : 'versionNumber',
        117 : 'Adult',
        118 : 'Retail-Price',
        119 : 'Retail-Currency',
        120 : 'TSC',
        122 : 'fixed-layout',
        123 : 'book-type',
        124 : 'orientation-lock',
        126 : 'original-resolution',
        127 : 'zero-gutter',
        128 : 'zero-margin',
        129 : 'MetadataResourceURI',
        132 : 'RegionMagnification',
        150 : 'LendingEnabled',
        200 : 'DictShortName',
        501 : 'cdeType',
        502 : 'last_update_time',
        503 : 'Updated_Title',
        504 : 'CDEContentKey',
        505 : 'AmazonContentReference',
        506 : 'Title-Language',
        507 : 'Title-Display-Direction',
        508 : 'Title-Pronunciation',
        509 : 'Title-Collation',
        510 : 'Secondary-Title',
        511 : 'Secondary-Title-Language',
        512 : 'Secondary-Title-Direction',
        513 : 'Secondary-Title-Pronunciation',
        514 : 'Secondary-Title-Collation',
        515 : 'Author-Language',
        516 : 'Author-Display-Direction',
        517 : 'Author-Pronunciation',
        518 : 'Author-Collation',
        519 : 'Author-Type',
        520 : 'Publisher-Language',
        521 : 'Publisher-Display-Direction',
        522 : 'Publisher-Pronunciation',
        523 : 'Publisher-Collation',
        524 : 'Content-Language-Tag',
        525 : 'primary-writing-mode',
        526 : 'NCX-Ingested-By-Software',
        527 : 'page-progression-direction',
        528 : 'override-kindle-fonts',
        529 : 'Compression-Upgraded',
        530 : 'Soft-Hyphens-In-Content',
        531 : 'Dictionary_In_Langague',
        532 : 'Dictionary_Out_Language',
        533 : 'Font_Converted',
        534 : 'Amazon_Creator_Info',
        535 : 'Creator-Build-Tag',
        536 : 'HD-Media-Containers-Info',  # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?)
        538 : 'Resource-Container-Fidelity',
        539 : 'HD-Container-Mimetype',
        540 : 'Sample-For_Special-Purpose',
        541 : 'Kindletool-Operation-Information',
        542 : 'Container_Id',
        543 : 'Asset-Type',  # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER
        544 : 'Unknown_544',
    }
    id_map_values = {
        115 : 'sample',
        116 : 'StartOffset',
        121 : 'Mobi8-Boundary-Section',
        125 : 'Embedded-Record-Count',
        130 : 'Offline-Sample',
        131 : 'Metadata-Record-Offset',
        201 : 'CoverOffset',
        202 : 'ThumbOffset',
        203 : 'HasFakeCover',
        204 : 'Creator-Software',
        205 : 'Creator-Major-Version',
        206 : 'Creator-Minor-Version',
        207 : 'Creator-Build-Number',
        401 : 'Clipping-Limit',
        402 : 'Publisher-Limit',
        404 : 'Text-to-Speech-Disabled',
        406 : 'Rental-Expiration-Time',
    }
    id_map_hexstrings = {
        208 : 'Watermark_(hex)',
        209 : 'Tamper-Proof-Keys_(hex)',
        300 : 'Font-Signature_(hex)',
        403 : 'Unknown_(403)_(hex)',
        405 : 'Ownership-Type_(hex)',
        407 : 'Unknown_(407)_(hex)',
        420 : 'Multimedia-Content-Reference_(hex)',
        450 : 'Locations_Match_(hex)',
        451 : 'Full-Story-Length_(hex)',
        452 : 'Sample-Start_Location_(hex)',
        453 : 'Sample-End-Location_(hex)',
    }

    def __init__(self, sect, sectNumber):
        self.sect = sect
        self.start = sectNumber
        self.header = self.sect.loadSection(self.start)
        if len(self.header)>20 and self.header[16:20] == b'MOBI':
            self.sect.setsectiondescription(0,"Mobipocket Header")
            self.palm = False
        elif self.sect.ident == b'TEXtREAd':
            self.sect.setsectiondescription(0, "PalmDOC Header")
            self.palm = True
        else:
            raise unpackException('Unknown File Format')

        self.records, = struct.unpack_from(b'>H', self.header, 0x8)

        # set defaults in case this is a PalmDOC
        self.title = self.sect.palmname.decode('latin-1', errors='replace')
        self.length = len(self.header)-16
        self.type = 3
        self.codepage = 1252
        self.codec = 'windows-1252'
        self.unique_id = 0
        self.version = 0
        self.hasExth = False
        self.exth = b''
        self.exth_offset = self.length + 16
        self.exth_length = 0
        self.crypto_type = 0
        self.firstnontext = self.start+self.records + 1
        self.firstresource = self.start+self.records + 1
        self.ncxidx = 0xffffffff
        self.metaOrthIndex = 0xffffffff
        self.metaInflIndex = 0xffffffff
        self.skelidx = 0xffffffff
        self.fragidx = 0xffffffff
        self.guideidx = 0xffffffff
        self.fdst = 0xffffffff
        self.mlstart = self.sect.loadSection(self.start+1)[:4]
        self.rawSize = 0
        self.metadata = dict_()

        # set up for decompression/unpacking
        self.compression, = struct.unpack_from(b'>H', self.header, 0x0)
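        # compression type 1 = no compression, 2 = PalmDOC, 0x4448 ('DH') = HUFF/CDIC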
        if self.compression == 0x4448:
            reader = HuffcdicReader()
            huffoff, huffnum = struct.unpack_from(b'>LL', self.header, 0x70)
            huffoff = huffoff + self.start
            self.sect.setsectiondescription(huffoff,"Huffman Compression Seed")
            reader.loadHuff(self.sect.loadSection(huffoff))
            for i in range(1, huffnum):
                self.sect.setsectiondescription(huffoff+i,"Huffman CDIC Compression Seed %d" % i)
                reader.loadCdic(self.sect.loadSection(huffoff+i))
            self.unpack = reader.unpack
        elif self.compression == 2:
            self.unpack = PalmdocReader().unpack
        elif self.compression == 1:
            self.unpack = UncompressedReader().unpack
        else:
            raise unpackException('invalid compression type: 0x%4x' % self.compression)

        if self.palm:
            return

        self.length, self.type, self.codepage, self.unique_id, self.version = struct.unpack(b'>LLLLL', self.header[20:40])
        codec_map = {
            1252 : 'windows-1252',
            65001: 'utf-8',
        }
        if self.codepage in codec_map:
            self.codec = codec_map[self.codepage]

        # title
        toff, tlen = struct.unpack(b'>II', self.header[0x54:0x5c])
        tend = toff + tlen
        self.title=self.header[toff:tend].decode(self.codec, errors='replace')

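        # bit 0x40 of exth_flags signals that an EXTH block follows the MOBI header;
        # its stored length is rounded up to a 4-byte boundary below (e.g. 0x3e -> 0x40)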
        exth_flag, = struct.unpack(b'>L', self.header[0x80:0x84])
        self.hasExth = exth_flag & 0x40
        self.exth_offset = self.length + 16
        self.exth_length = 0
        if self.hasExth:
            self.exth_length, = struct.unpack_from(b'>L', self.header, self.exth_offset+4)
            self.exth_length = ((self.exth_length + 3)>>2)<<2  # round to next 4 byte boundary
            self.exth = self.header[self.exth_offset:self.exth_offset+self.exth_length]

        # parse the exth / metadata
        self.parseMetaData()

        # self.mlstart = self.sect.loadSection(self.start+1)
        # self.mlstart = self.mlstart[0:4]
        self.crypto_type, = struct.unpack_from(b'>H', self.header, 0xC)

        # Start sector for additional files such as images, fonts, resources, etc
        # Can be missing so fall back to default set previously
        ofst, = struct.unpack_from(b'>L', self.header, 0x6C)
        if ofst != 0xffffffff:
            self.firstresource = ofst + self.start
        ofst, = struct.unpack_from(b'>L', self.header, 0x50)
        if ofst != 0xffffffff:
            self.firstnontext = ofst + self.start

        if self.isPrintReplica():
            return

        if self.version < 8:
            # Dictionary metaOrthIndex
            self.metaOrthIndex, = struct.unpack_from(b'>L', self.header, 0x28)
            if self.metaOrthIndex != 0xffffffff:
                self.metaOrthIndex += self.start

            # Dictionary metaInflIndex
            self.metaInflIndex, = struct.unpack_from(b'>L', self.header, 0x2C)
            if self.metaInflIndex != 0xffffffff:
                self.metaInflIndex += self.start

        # handle older headers without any ncxindex info, specifically 0xe4 headers
        if self.length + 16 < 0xf8:
            return

        # NCX Index
        self.ncxidx, = struct.unpack(b'>L', self.header[0xf4:0xf8])
        if self.ncxidx != 0xffffffff:
            self.ncxidx += self.start

        # K8 specific Indexes
        if self.start != 0 or self.version == 8:
            # Index into <xml> file skeletons in RawML
            self.skelidx, = struct.unpack_from(b'>L', self.header, 0xfc)
            if self.skelidx != 0xffffffff:
                self.skelidx += self.start

            # Index into <div> sections in RawML
            self.fragidx, = struct.unpack_from(b'>L', self.header, 0xf8)
            if self.fragidx != 0xffffffff:
                self.fragidx += self.start

            # Index into Other files
            self.guideidx, = struct.unpack_from(b'>L', self.header, 0x104)
            if self.guideidx != 0xffffffff:
                self.guideidx += self.start

            # dictionaries do not seem to use the same approach in K8's
            # so disable them
            self.metaOrthIndex = 0xffffffff
            self.metaInflIndex = 0xffffffff

            # need to use the FDST record to find out how to properly unpack
            # the rawML into pieces
            # it is simply a table of start and end locations for each flow piece
            self.fdst, = struct.unpack_from(b'>L', self.header, 0xc0)
            self.fdstcnt, = struct.unpack_from(b'>L', self.header, 0xc4)
            # if cnt is 1 or less, fdst section number can be garbage
            if self.fdstcnt <= 1:
                self.fdst = 0xffffffff
            if self.fdst != 0xffffffff:
                self.fdst += self.start
                # setting of fdst section description properly handled in mobi_kf8proc

    def dump_exth(self):
        # determine text encoding
        codec=self.codec
        if (not self.hasExth) or (self.exth_length) == 0 or (self.exth == b''):
            return
        num_items, = struct.unpack(b'>L', self.exth[8:12])
        pos = 12
        print("Key Size Description Value")
        for _ in range(num_items):
            id, size = struct.unpack(b'>LL', self.exth[pos:pos+8])
            contentsize = size-8
            content = self.exth[pos + 8: pos + size]
            if id in MobiHeader.id_map_strings:
                exth_name = MobiHeader.id_map_strings[id]
                print('{0: >3d} {1: >4d} {2: <30s} {3:s}'.format(id, contentsize, exth_name, content.decode(codec, errors='replace')))
            elif id in MobiHeader.id_map_values:
                exth_name = MobiHeader.id_map_values[id]
                if size == 9:
                    value, = struct.unpack(b'B',content)
                    print('{0:3d} byte {1:<30s} {2:d}'.format(id, exth_name, value))
                elif size == 10:
                    value, = struct.unpack(b'>H',content)
                    print('{0:3d} word {1:<30s} 0x{2:0>4X} ({2:d})'.format(id, exth_name, value))
                elif size == 12:
                    value, = struct.unpack(b'>L',content)
                    print('{0:3d} long {1:<30s} 0x{2:0>8X} ({2:d})'.format(id, exth_name, value))
                else:
                    print('{0: >3d} {1: >4d} {2: <30s} (0x{3:s})'.format(id, contentsize, "Bad size for "+exth_name, hexlify(content)))
            elif id in MobiHeader.id_map_hexstrings:
                exth_name = MobiHeader.id_map_hexstrings[id]
                print('{0:3d} {1:4d} {2:<30s} 0x{3:s}'.format(id, contentsize, exth_name, hexlify(content)))
            else:
                exth_name = "Unknown EXTH ID {0:d}".format(id)
                print("{0: >3d} {1: >4d} {2: <30s} 0x{3:s}".format(id, contentsize, exth_name, hexlify(content)))
            pos += size
        return

    def dumpheader(self):
        # first 16 bytes are not part of the official mobiheader
        # but we will treat it as such
        # so section 0 is 16 (decimal) + self.length in total == at least 0x108 bytes for Mobi 8 headers
        print("Dumping section %d, Mobipocket Header version: %d, total length %d" % (self.start,self.version, self.length+16))
        self.hdr = {}
        # set it up for the proper header version
        if self.version == 0:
            self.mobi_header = MobiHeader.palmdoc_header
            self.mobi_header_sorted_keys = MobiHeader.palmdoc_header_sorted_keys
        elif self.version < 8:
            self.mobi_header = MobiHeader.mobi6_header
            self.mobi_header_sorted_keys = MobiHeader.mobi6_header_sorted_keys
        else:
            self.mobi_header = MobiHeader.mobi8_header
            self.mobi_header_sorted_keys = MobiHeader.mobi8_header_sorted_keys

        # parse the header information
        for key in self.mobi_header_sorted_keys:
            (pos, format, tot_len) = self.mobi_header[key]
            if pos < (self.length + 16):
                val, = struct.unpack_from(format, self.header, pos)
                self.hdr[key] = val

        if 'title_offset' in self.hdr:
            title_offset = self.hdr['title_offset']
            title_length = self.hdr['title_length']
        else:
            title_offset = 0
            title_length = 0
        if title_offset == 0:
            title_offset = len(self.header)
            title_length = 0
            self.title = self.sect.palmname.decode('latin-1', errors='replace')
        else:
            self.title = self.header[title_offset:title_offset+title_length].decode(self.codec, errors='replace')
            # title record always padded with two nul bytes and then padded with nuls to next 4 byte boundary
            title_length = ((title_length+2+3)>>2)<<2

        self.extra1 = self.header[self.exth_offset+self.exth_length:title_offset]
        self.extra2 = self.header[title_offset+title_length:]

        print("Mobipocket header from section %d" % self.start)
        print(" Offset Value Hex Dec Description")
        for key in self.mobi_header_sorted_keys:
            (pos, format, tot_len) = self.mobi_header[key]
            if pos < (self.length + 16):
                if key != 'magic':
                    fmt_string = "0x{0:0>3X} ({0:3d}){1: >" + str(9-2*tot_len) +"s}0x{2:0>" + str(2*tot_len) + "X} {2:10d} {3:s}"
                else:
                    self.hdr[key] = unicode_str(self.hdr[key])
                    fmt_string = "0x{0:0>3X} ({0:3d}){2:>11s} {3:s}"
                print(fmt_string.format(pos, " ",self.hdr[key], key))
        print("")

        if self.exth_length > 0:
            print("EXTH metadata, offset %d, padded length %d" % (self.exth_offset,self.exth_length))
            self.dump_exth()
            print("")

        if len(self.extra1) > 0:
            print("Extra data between EXTH and Title, length %d" % len(self.extra1))
            print(hexlify(self.extra1))
            print("")

        if title_length > 0:
            print("Title in header at offset %d, padded length %d: '%s'" %(title_offset,title_length,self.title))
            print("")

        if len(self.extra2) > 0:
            print("Extra data between Title and end of header, length %d" % len(self.extra2))
            print(hexlify(self.extra2))
            print("")

    def isPrintReplica(self):
        return self.mlstart[0:4] == b"%MOP"

    def isK8(self):
        return self.start != 0 or self.version == 8

    def isEncrypted(self):
        return self.crypto_type != 0

    def hasNCX(self):
        return self.ncxidx != 0xffffffff

    def isDictionary(self):
        return self.metaOrthIndex != 0xffffffff

    def getncxIndex(self):
        return self.ncxidx

    def decompress(self, data):
        return self.unpack(data)

    def Language(self):
        langcode = struct.unpack(b'!L', self.header[0x5c:0x60])[0]
        langid = langcode & 0xFF
        sublangid = (langcode >> 8) & 0xFF
        return getLanguage(langid, sublangid)

    def DictInLanguage(self):
        if self.isDictionary():
            langcode = struct.unpack(b'!L', self.header[0x60:0x64])[0]
            langid = langcode & 0xFF
            sublangid = (langcode >> 10) & 0xFF
            if langid != 0:
                return getLanguage(langid, sublangid)
        return False

    def DictOutLanguage(self):
        if self.isDictionary():
            langcode = struct.unpack(b'!L', self.header[0x64:0x68])[0]
            langid = langcode & 0xFF
            sublangid = (langcode >> 10) & 0xFF
            if langid != 0:
                return getLanguage(langid, sublangid)
        return False

    def getRawML(self):
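        # each text record may end with optional trailing data entries; the size of an entry is
        # stored in its final bytes, 7 bits per byte, and the byte whose high bit is set marks
        # where the size value starts, which is what getSizeOfTrailingDataEntry() decodes below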
        def getSizeOfTrailingDataEntry(data):
            num = 0
            for v in data[-4:]:
                if bord(v) & 0x80:
                    num = 0
                num = (num << 7) | (bord(v) & 0x7f)
            return num
        def trimTrailingDataEntries(data):
            for _ in range(trailers):
                num = getSizeOfTrailingDataEntry(data)
                data = data[:-num]
            if multibyte:
                num = (ord(data[-1:]) & 3) + 1
                data = data[:-num]
            return data
        multibyte = 0
        trailers = 0
        if self.sect.ident == b'BOOKMOBI':
            mobi_length, = struct.unpack_from(b'>L', self.header, 0x14)
            mobi_version, = struct.unpack_from(b'>L', self.header, 0x68)
            if (mobi_length >= 0xE4) and (mobi_version >= 5):
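                # bit 0 of the trail data flags marks multibyte-overlap bytes at the end of each
                # text record; every other set bit adds one trailing data entry per record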
                flags, = struct.unpack_from(b'>H', self.header, 0xF2)
                multibyte = flags & 1
                while flags > 1:
                    if flags & 2:
                        trailers += 1
                    flags = flags >> 1
        # get raw mobi markup language
        print("Unpacking raw markup language")
        dataList = []
        # offset = 0
        for i in range(1, self.records+1):
            data = trimTrailingDataEntries(self.sect.loadSection(self.start + i))
            dataList.append(self.unpack(data))
            if self.isK8():
                self.sect.setsectiondescription(self.start + i,"KF8 Text Section {0:d}".format(i))
            elif self.version == 0:
                self.sect.setsectiondescription(self.start + i,"PalmDOC Text Section {0:d}".format(i))
            else:
                self.sect.setsectiondescription(self.start + i,"Mobipocket Text Section {0:d}".format(i))
        rawML = b''.join(dataList)
        self.rawSize = len(rawML)
        return rawML

    # all metadata is stored in a dictionary keyed by name; each key maps to a *list* of values
    # a list is used to allow for multiple creators, multiple contributors, etc
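    # e.g. self.metadata might end up looking like
    #     {'Creator': ['First Author', 'Second Author'], 'Title': ['Some Title'], ...}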
    def parseMetaData(self):
        def addValue(name, value):
            if name not in self.metadata:
                self.metadata[name] = [value]
            else:
                self.metadata[name].append(value)

        codec=self.codec
        if self.hasExth:
            extheader=self.exth
            _length, num_items = struct.unpack(b'>LL', extheader[4:12])
            extheader = extheader[12:]
            pos = 0
            for _ in range(num_items):
                id, size = struct.unpack(b'>LL', extheader[pos:pos+8])
                content = extheader[pos + 8: pos + size]
                if id in MobiHeader.id_map_strings:
                    name = MobiHeader.id_map_strings[id]
                    addValue(name, content.decode(codec, errors='replace'))
                elif id in MobiHeader.id_map_values:
                    name = MobiHeader.id_map_values[id]
                    if size == 9:
                        value, = struct.unpack(b'B',content)
                        addValue(name, unicode_str(str(value)))
                    elif size == 10:
                        value, = struct.unpack(b'>H',content)
                        addValue(name, unicode_str(str(value)))
                    elif size == 12:
                        value, = struct.unpack(b'>L',content)
                        # handle special case of missing CoverOffset or missing ThumbOffset
                        if id == 201 or id == 202:
                            if value != 0xffffffff:
                                addValue(name, unicode_str(str(value)))
                        else:
                            addValue(name, unicode_str(str(value)))
                    else:
                        print("Warning: Bad key, size, value combination detected in EXTH ", id, size, hexlify(content))
                        addValue(name, hexlify(content))
                elif id in MobiHeader.id_map_hexstrings:
                    name = MobiHeader.id_map_hexstrings[id]
                    addValue(name, hexlify(content))
                else:
                    name = unicode_str(str(id)) + ' (hex)'
                    addValue(name, hexlify(content))
                pos += size

        # add the basics to the metadata each as a list element
        self.metadata['Language'] = [self.Language()]
        self.metadata['Title'] = [unicode_str(self.title,self.codec)]
        self.metadata['Codec'] = [self.codec]
        self.metadata['UniqueID'] = [unicode_str(str(self.unique_id))]
        # if no asin create one using a uuid
        if 'ASIN' not in self.metadata:
            self.metadata['ASIN'] = [unicode_str(str(uuid.uuid4()))]
        # if no cdeType set it to "EBOK"
        if 'cdeType' not in self.metadata:
            self.metadata['cdeType'] = ['EBOK']

    def getMetaData(self):
        return self.metadata

    def describeHeader(self, DUMP):
        print("Mobi Version:", self.version)
        print("Codec:", self.codec)
        print("Title:", self.title)
        if 'Updated_Title' in self.metadata:
            print("EXTH Title:", self.metadata['Updated_Title'][0])
        if self.compression == 0x4448:
            print("Huffdic compression")
        elif self.compression == 2:
            print("Palmdoc compression")
        elif self.compression == 1:
            print("No compression")
        if DUMP:
            self.dumpheader()
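

# Illustrative sketch only: build a tiny synthetic EXTH block (the id, length and author
# string below are made up purely for demonstration) and walk it with dump_contexth()
# defined above.
if __name__ == '__main__':
    demo_author = b'Example Author'
    # one record: 4-byte id (100 = 'Creator'), 4-byte size that includes this 8-byte record header
    demo_record = struct.pack(b'>LL', 100, 8 + len(demo_author)) + demo_author
    # EXTH header: 4-byte magic, 4-byte total length, 4-byte record count
    demo_exth = b'EXTH' + struct.pack(b'>LL', 12 + len(demo_record), 1) + demo_record
    dump_contexth(65001, demo_exth)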