#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function

from .compatibility_utils import PY2, bchr, bstr, bord
if PY2:
    # On Python 2 use the lazy xrange wherever this module says range.
    range = xrange

import struct
# note:  struct pack, unpack, unpack_from all require bytestring format
# data all the way up to at least python 2.7.5, python 3 okay with bytestring

from .mobi_utils import toHex
class MobiIndex:
    '''
    Reader for the INDX index records held in the sections of a book.

    @param sect: Section container.  It must provide loadSection(n), which
                 returns the raw bytes of section n, and
                 setsectiondescription(n, text).
    @param DEBUG: When true, parsing details are printed to stdout.
    '''

    def __init__(self, sect, DEBUG=False):
        self.sect = sect
        self.DEBUG = DEBUG

    def getIndexData(self, idx, label="Unknown"):
        '''
        Read the index whose main INDX section is section number idx.

        The main section supplies the tag table and the header counts.  The
        header's 'count' sections that follow it hold the index entries and
        the 'nctoc' sections after those hold the CTOC strings.

        @param idx: Section number of the main INDX section; 0xffffffff means
                    there is no index.
        @param label: Name used in the section descriptions that get recorded.
        @return: Tuple (outtbl, ctoc_text).  outtbl is a list of
                 [text, tagMap] pairs, one per index entry.  ctoc_text maps
                 (offset inside a CTOC section + 0x10000 * number of that
                 CTOC section) to the raw bytes of the string stored there.
                 Both are empty when idx is 0xffffffff or the main section is
                 not an INDX record; extra sections that are not INDX records
                 are skipped.
        '''
        sect = self.sect
        outtbl = []
        ctoc_text = {}
        if idx == 0xffffffff:
            return outtbl, ctoc_text

        sect.setsectiondescription(idx, "{0} Main INDX section".format(label))
        data = sect.loadSection(idx)
        parsed = self.parseINDXHeader(data)
        if not parsed:
            # parseINDXHeader returned False and already printed its warning;
            # there is nothing to unpack, so report an empty index.
            return outtbl, ctoc_text
        idxhdr, _, hordt2 = parsed
        IndexCount = idxhdr['count']

        # handle the case of multiple sections used for CTOC
        rec_off = 0
        off = idx + IndexCount + 1
        for j in range(idxhdr['nctoc']):
            cdata = sect.loadSection(off + j)
            sect.setsectiondescription(off + j, label + ' CTOC Data ' + str(j))
            for k, name in self.readCTOC(cdata).items():
                ctoc_text[k + rec_off] = name
            # each further CTOC section gets its own 0x10000 wide key range
            rec_off += 0x10000

        # the header 'len' field is where the tag section starts
        controlByteCount, tagTable = readTagSection(idxhdr['len'], data)
        if self.DEBUG:
            print("ControlByteCount is", controlByteCount)
            print("IndexCount is", IndexCount)
            print("TagTable: %s" % tagTable)

        for i in range(idx + 1, idx + 1 + IndexCount):
            sect.setsectiondescription(i, "{0} Extra {1:d} INDX section".format(label, i - idx))
            data = sect.loadSection(i)
            parsed = self.parseINDXHeader(data)
            if not parsed:
                # warning already printed; skip the unreadable section
                continue
            hdrinfo = parsed[0]
            idxtPos = hdrinfo['start']
            entryCount = hdrinfo['count']
            if self.DEBUG:
                print(idxtPos, entryCount)
            # loop through to build up the IDXT position starts
            # (16 bit offsets stored 4 bytes after idxtPos)
            idxPositions = [
                struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j))[0]
                for j in range(entryCount)
            ]
            # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
            idxPositions.append(idxtPos)
            # for each entry in the IDXT build up the tagMap and any associated text
            for j in range(entryCount):
                startPos = idxPositions[j]
                endPos = idxPositions[j + 1]
                # an entry starts with a one byte text length followed by the text
                textLength = ord(data[startPos:startPos + 1])
                text = data[startPos + 1:startPos + 1 + textLength]
                if hordt2 is not None:
                    # translate every byte through the main header's second ORDT table
                    text = b''.join(bchr(hordt2[bord(x)]) for x in text)
                tagMap = getTagMap(controlByteCount, tagTable, data, startPos + 1 + textLength, endPos)
                outtbl.append([text, tagMap])
                if self.DEBUG:
                    print(tagMap)
                    print(text)
        return outtbl, ctoc_text

    def parseINDXHeader(self, data):
        '''
        Read an INDX header.

        @param data: Raw bytes of the section.
        @return: False (after printing a warning) when data does not start
                 with b'INDX'.  Otherwise a tuple (header, ordt1, ordt2):
                 header maps each name in words to the 32 bit big-endian
                 value read for it, in order, starting 4 bytes into data;
                 ordt1 and ordt2 are tuples of the ORDT table entries, or
                 None when the section has no ORDT tables.
        '''
        if data[:4] != b'INDX':
            print("Warning: index section is not INDX")
            return False
        words = (
                'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
                'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc'
        )
        num = len(words)
        values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)])
        header = dict(zip(words, values))

        ordt1 = None
        ordt2 = None

        ocnt, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL', data, 0xa4)
        if header['code'] == 0xfdea or ocnt != 0 or oentries > 0:
            # horribly hacked up ESP (sample) mobi books use two ORDT sections but never specify
            # them in the proper place in the header.  They seem to be codepage 65002 which seems
            # to be some sort of strange EBCDIC utf-8 or 16 encoded strings

            # so we need to look for them and store them away to process leading text
            # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
            # we only ever seem to use the second but ...
            assert ocnt == 1
            assert data[op1:op1+4] == b'ORDT'
            assert data[op2:op2+4] == b'ORDT'
            ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4)
            ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4)

        if self.DEBUG:
            print("parsed INDX header:")
            for n in words:
                print(n, "%X" % header[n])
            print("")
        return header, ordt1, ordt2

    def readCTOC(self, txtdata):
        '''
        Read all blocks from a CTOC section.

        Each block is a variable width length followed by that many bytes of
        name.  Reading stops at the end of txtdata or at a block that starts
        with a zero byte.

        @param txtdata: Raw bytes of the CTOC section.
        @return: Dict mapping the offset at which each block starts to the
                 raw bytes of its name.
        '''
        ctoc_data = {}
        offset = 0
        while offset < len(txtdata):
            # A one byte slice compares the same way on Python 2 and Python 3,
            # so no per-version branch is needed.
            if txtdata[offset:offset+1] == b'\0':
                break
            idx_offs = offset
            # first n bytes: name len as vwi
            pos, ilen = getVariableWidthValue(txtdata, offset)
            offset += pos
            # <len> next bytes: name
            name = txtdata[offset:offset+ilen]
            offset += ilen
            if self.DEBUG:
                print("name length is ", ilen)
                print(idx_offs, name)
            ctoc_data[idx_offs] = name
        return ctoc_data
def getVariableWidthValue(data, offset):
    '''
    Decode variable width value from given bytes.

    Every byte contributes its low 7 bits, most significant group first.
    The byte that has its high bit (0x80) set is the last byte of the value.

    @param data: The bytes to decode.
    @param offset: The start offset into data.
    @return: Tuple of consumed bytes count and decoded value.
    @raise TypeError: From ord() on an empty slice, when data ends before a
                      byte with the high bit set has been seen.
    '''
    value = 0
    consumed = 0
    while True:
        # slice rather than index so the result is a length-1 string on both
        # Python 2 and Python 3
        byte = ord(data[offset + consumed:offset + consumed + 1])
        consumed += 1
        value = (value << 7) | (byte & 0x7f)
        if byte & 0x80:
            return consumed, value
def readTagSection(start, data):
    '''
    Read tag section from given data.

    When data does not hold b"TAGX" at start, nothing is read and the result
    is (0, []).

    @param start: The start position in the data.
    @param data: The data to process.
    @return: Tuple of control byte count and list of tag tuples.  Each tag
             tuple holds the four byte values of one 4 byte table entry.
    '''
    controlByteCount = 0
    tags = []
    if data[start:start+4] == b"TAGX":
        # Two big-endian longs follow the marker: the offset of the first
        # entry (relative to start) and the control byte count.
        firstEntryOffset, controlByteCount = struct.unpack_from(b'>LL', data, start + 0x04)

        # Skip the first 12 bytes already read above.
        for i in range(12, firstEntryOffset, 4):
            pos = start + i
            tags.append(tuple(ord(data[p:p+1]) for p in range(pos, pos + 4)))
    return controlByteCount, tags
def countSetBits(value, bits=8):
    '''
    Count the set bits in the given value.

    Only the lowest `bits` bits are examined; higher bits are ignored.

    @param value: Integer value.
    @param bits: The number of bits of the input value (defaults to 8).
    @return: Number of set bits.
    '''
    return sum((value >> shift) & 0x01 for shift in range(bits))
def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos):
    '''
    Create a map of tags and values from the given byte section.

    The control bytes at startPos say, through each tag's mask, which tags
    are present in this entry and how many values each one has.  The values
    themselves follow the control bytes as variable width values.

    @param controlByteCount: The number of control bytes.
    @param tagTable: The tag table: tuples of (tag, valuesPerEntry, mask,
                     endFlag).  A tuple with endFlag 0x01 carries no tag; it
                     moves on to the next control byte.
    @param entryData: The data to process.
    @param startPos: The starting position in entryData.
    @param endPos: The end position in entryData or None if it is unknown.
    @return: Hashmap of tag and list of values.
    '''
    tags = []
    tagHashMap = {}
    controlByteIndex = 0
    dataStart = startPos + controlByteCount

    # Pass 1: decide from the control bytes which tags are present.
    for tag, valuesPerEntry, mask, endFlag in tagTable:
        if endFlag == 0x01:
            controlByteIndex += 1
            continue
        cbytePos = startPos + controlByteIndex
        value = ord(entryData[cbytePos:cbytePos + 1]) & mask
        if value == 0:
            # tag not present in this entry
            continue
        if value == mask:
            if countSetBits(mask) > 1:
                # If all bits of masked value are set and the mask has more than one bit, a variable width value
                # will follow after the control bytes which defines the length of bytes (NOT the value count!)
                # which will contain the corresponding variable width values.
                consumed, value = getVariableWidthValue(entryData, dataStart)
                dataStart += consumed
                tags.append((tag, None, value, valuesPerEntry))
            else:
                tags.append((tag, 1, None, valuesPerEntry))
        else:
            # Shift bits to get the masked value.
            while mask & 0x01 == 0:
                mask >>= 1
                value >>= 1
            tags.append((tag, value, None, valuesPerEntry))

    # Pass 2: read the values of every tag found above.
    for tag, valueCount, valueBytes, valuesPerEntry in tags:
        values = []
        if valueCount is not None:
            # Read valueCount * valuesPerEntry variable width values.
            for _ in range(valueCount * valuesPerEntry):
                consumed, data = getVariableWidthValue(entryData, dataStart)
                dataStart += consumed
                values.append(data)
        else:
            # Convert valueBytes to variable width values.
            totalConsumed = 0
            while totalConsumed < valueBytes:
                # Does this work for valuesPerEntry != 1?
                consumed, data = getVariableWidthValue(entryData, dataStart)
                dataStart += consumed
                totalConsumed += consumed
                values.append(data)
            if totalConsumed != valueBytes:
                print("Error: Should consume %s bytes, but consumed %s" % (valueBytes, totalConsumed))
        tagHashMap[tag] = values

    # Test that all bytes have been processed if endPos is given.
    if endPos is not None and dataStart != endPos:
        # The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
        leftover = entryData[dataStart:endPos]
        if any(bord(char) != 0 for char in leftover):
            print("Warning: There are unprocessed index bytes left: %s" % toHex(leftover))

    return tagHashMap