159 lines
		
	
	
		
			5.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			159 lines
		
	
	
		
			5.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #!/usr/bin/env python
 | |
| # -*- coding: utf-8 -*-
 | |
| # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
 | |
| 
 | |
| from __future__ import unicode_literals, division, absolute_import, print_function
 | |
| 
 | |
| from .compatibility_utils import PY2, unicode_str
 | |
| 
 | |
| if PY2:
 | |
|     range = xrange
 | |
| 
 | |
| import struct
 | |
| # note:  struct pack, unpack, unpack_from all require bytestring format
 | |
| # data all the way up to at least python 2.7.5, python 3 okay with bytestring
 | |
| 
 | |
| import re
 | |
| # note: re requites the pattern to be the exact same type as the data to be searched in python3
 | |
| # but u"" is not allowed for the pattern itself only b""
 | |
| 
 | |
| 
 | |
| _TABLE = [('m', 1000), ('cm', 900), ('d', 500), ('cd', 400), ('c', 100), ('xc', 90), ('l', 50), ('xl', 40), ('x', 10), ('ix', 9), ('v', 5), ('iv', 4), ('i', 1)]
 | |
| 
 | |
| def int_to_roman(i):
 | |
|     parts = []
 | |
|     num = i
 | |
|     for letter, value in _TABLE:
 | |
|         while value <= num:
 | |
|             num -= value
 | |
|             parts.append(letter)
 | |
|     return ''.join(parts)
 | |
| 
 | |
| def roman_to_int(s):
 | |
|     result = 0
 | |
|     rnstr = s
 | |
|     for letter, value in _TABLE:
 | |
|         while rnstr.startswith(letter):
 | |
|             result += value
 | |
|             rnstr = rnstr[len(letter):]
 | |
|     return result
 | |
| 
 | |
| _pattern = r'''\(([^\)]*)\)'''
 | |
| _tup_pattern = re.compile(_pattern,re.IGNORECASE)
 | |
| 
 | |
| 
 | |
| def _parseNames(numpages, data):
 | |
|     data = unicode_str(data)
 | |
|     pagenames = []
 | |
|     pageMap = ''
 | |
|     for i in range(numpages):
 | |
|         pagenames.append(None)
 | |
|     for m in re.finditer(_tup_pattern, data):
 | |
|         tup = m.group(1)
 | |
|         if pageMap != '':
 | |
|             pageMap += ','
 | |
|         pageMap += '(' + tup + ')'
 | |
|         spos, nametype, svalue = tup.split(",")
 | |
|         # print(spos, nametype, svalue)
 | |
|         if nametype == 'a' or nametype == 'r':
 | |
|             svalue = int(svalue)
 | |
|         spos = int(spos)
 | |
|         for i in range(spos - 1, numpages):
 | |
|             if nametype == 'r':
 | |
|                 pname = int_to_roman(svalue)
 | |
|                 svalue += 1
 | |
|             elif nametype == 'a':
 | |
|                 pname = "%s" % svalue
 | |
|                 svalue += 1
 | |
|             elif nametype == 'c':
 | |
|                 sp = svalue.find('|')
 | |
|                 if sp == -1:
 | |
|                     pname = svalue
 | |
|                 else:
 | |
|                     pname = svalue[0:sp]
 | |
|                     svalue = svalue[sp+1:]
 | |
|             else:
 | |
|                 print("Error: unknown page numbering type", nametype)
 | |
|             pagenames[i] = pname
 | |
|     return pagenames, pageMap
 | |
| 
 | |
| 
 | |
| class PageMapProcessor:
 | |
| 
 | |
|     def __init__(self, mh, data):
 | |
|         self.mh = mh
 | |
|         self.data = data
 | |
|         self.pagenames = []
 | |
|         self.pageoffsets = []
 | |
|         self.pageMap = ''
 | |
|         self.pm_len = 0
 | |
|         self.pm_nn = 0
 | |
|         self.pn_bits = 0
 | |
|         self.pmoff = None
 | |
|         self.pmstr = ''
 | |
|         print("Extracting Page Map Information")
 | |
|         rev_len, = struct.unpack_from(b'>L', self.data, 0x10)
 | |
|         # skip over header, revision string length data, and revision string
 | |
|         ptr = 0x14 + rev_len
 | |
|         pm_1, self.pm_len, self.pm_nn, self.pm_bits  = struct.unpack_from(b'>4H', self.data, ptr)
 | |
|         # print(pm_1, self.pm_len, self.pm_nn, self.pm_bits)
 | |
|         self.pmstr = self.data[ptr+8:ptr+8+self.pm_len]
 | |
|         self.pmoff = self.data[ptr+8+self.pm_len:]
 | |
|         offsize = b">L"
 | |
|         offwidth = 4
 | |
|         if self.pm_bits == 16:
 | |
|             offsize = b">H"
 | |
|             offwidth = 2
 | |
|         ptr = 0
 | |
|         for i in range(self.pm_nn):
 | |
|             od, = struct.unpack_from(offsize, self.pmoff, ptr)
 | |
|             ptr += offwidth
 | |
|             self.pageoffsets.append(od)
 | |
|         self.pagenames, self.pageMap = _parseNames(self.pm_nn, self.pmstr)
 | |
| 
 | |
|     def getPageMap(self):
 | |
|         return self.pageMap
 | |
| 
 | |
|     def getNames(self):
 | |
|         return self.pagenames
 | |
| 
 | |
|     def getOffsets(self):
 | |
|         return self.pageoffsets
 | |
| 
 | |
|     # page-map.xml will be unicode but encoded to utf-8 immediately before being written to a file
 | |
|     def generateKF8PageMapXML(self, k8proc):
 | |
|         pagemapxml = '<page-map xmlns="http://www.idpf.org/2007/opf">\n'
 | |
|         for i in range(len(self.pagenames)):
 | |
|             pos = self.pageoffsets[i]
 | |
|             name = self.pagenames[i]
 | |
|             if name is not None and name != "":
 | |
|                 [pn, dir, filename, skelpos, skelend, aidtext] = k8proc.getSkelInfo(pos)
 | |
|                 idtext = unicode_str(k8proc.getPageIDTag(pos))
 | |
|                 linktgt = unicode_str(filename)
 | |
|                 if idtext != '':
 | |
|                     linktgt += '#' + idtext
 | |
|                 pagemapxml += '<page name="%s" href="%s/%s" />\n' % (name, dir, linktgt)
 | |
|         pagemapxml += "</page-map>\n"
 | |
|         return pagemapxml
 | |
| 
 | |
|     def generateAPNX(self, apnx_meta):
 | |
|         if apnx_meta['format'] == 'MOBI_8':
 | |
|             content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","format":"%(format)s","fileRevisionId":"1","acr":"%(acr)s"}' %apnx_meta
 | |
|         else:
 | |
|             content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","fileRevisionId":"1"}' % apnx_meta
 | |
|         content_header = content_header.encode('utf-8')
 | |
|         page_header = '{"asin":"%(asin)s","pageMap":"%(pageMap)s"}' % apnx_meta
 | |
|         page_header = page_header.encode('utf-8')
 | |
|         apnx = struct.pack(b'>H',1) + struct.pack(b'>H',1)
 | |
|         apnx += struct.pack(b'>I', 12 + len(content_header))
 | |
|         apnx += struct.pack(b'>I', len(content_header))
 | |
|         apnx += content_header
 | |
|         apnx += struct.pack(b'>H', 1)
 | |
|         apnx += struct.pack(b'>H', len(page_header))
 | |
|         apnx += struct.pack(b'>H', self.pm_nn)
 | |
|         apnx += struct.pack(b'>H', 32)
 | |
|         apnx += page_header
 | |
|         for page in self.pageoffsets:
 | |
|             apnx += struct.pack(b'>L', page)
 | |
|         return apnx
 |