Incorporate KindleUnpack from https://github.com/kevinhendricks/KindleUnpack
The GUI elements have been removed
This commit is contained in:
2
KindleUnpack/__init__.py
Normal file
2
KindleUnpack/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
278
KindleUnpack/compatibility_utils.py
Executable file
278
KindleUnpack/compatibility_utils.py
Executable file
@@ -0,0 +1,278 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
# are permitted provided that the following conditions are met:
|
||||||
|
#
|
||||||
|
# 1. Redistributions of source code must retain the above copyright notice, this list of
|
||||||
|
# conditions and the following disclaimer.
|
||||||
|
#
|
||||||
|
# 2. Redistributions in binary form must reproduce the above copyright notice, this list
|
||||||
|
# of conditions and the following disclaimer in the documentation and/or other materials
|
||||||
|
# provided with the distribution.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
|
||||||
|
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
|
||||||
|
# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||||
|
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||||
|
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||||
|
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
|
||||||
|
# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import codecs
|
||||||
|
|
||||||
|
PY2 = sys.version_info[0] == 2
|
||||||
|
PY3 = sys.version_info[0] == 3
|
||||||
|
|
||||||
|
iswindows = sys.platform.startswith('win')
|
||||||
|
|
||||||
|
try:
|
||||||
|
from urllib.parse import unquote
|
||||||
|
except ImportError:
|
||||||
|
from urllib import unquote
|
||||||
|
|
||||||
|
if PY2:
|
||||||
|
from HTMLParser import HTMLParser
|
||||||
|
_h = HTMLParser()
|
||||||
|
elif sys.version_info[1] < 4:
|
||||||
|
import html.parser
|
||||||
|
_h = html.parser.HTMLParser()
|
||||||
|
else:
|
||||||
|
import html as _h
|
||||||
|
|
||||||
|
if PY3:
|
||||||
|
text_type = str
|
||||||
|
binary_type = bytes
|
||||||
|
# if will be printing arbitraty binary data to stdout on python 3
|
||||||
|
# sys.stdin = sys.stdin.detach()
|
||||||
|
# sys.stdout = sys.stdout.detach()
|
||||||
|
# sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
|
||||||
|
else:
|
||||||
|
range = xrange
|
||||||
|
text_type = unicode
|
||||||
|
binary_type = str
|
||||||
|
# if will be printing unicode under python 2 need to protect
|
||||||
|
# against sys.stdout.encoding being None stupidly forcing forcing ascii encoding of unicode
|
||||||
|
# sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
|
||||||
|
# alternatively set environment variable as follows **before** launching python: export PYTHONIOENCODING=UTF-8
|
||||||
|
|
||||||
|
# NOTE: Python 3 is completely broken when accessing single bytes in bytes strings
|
||||||
|
# (and they amazingly claim by design and no bug!)
|
||||||
|
|
||||||
|
# To illustrate: this works for unicode in Python 3 and for all Python 2.X for both bytestrings and unicode
|
||||||
|
# >>> o = '123456789'
|
||||||
|
# >>> o[-3]
|
||||||
|
# '7'
|
||||||
|
# >>> type(o[-3])
|
||||||
|
# <class 'str'>
|
||||||
|
# >>> type(o)
|
||||||
|
# <class 'str'>
|
||||||
|
|
||||||
|
# Unfortunately, this is what Python 3 does for no sane reason and only for bytestrings
|
||||||
|
# >>> o = b'123456789'
|
||||||
|
# >>> o[-3]
|
||||||
|
# 55
|
||||||
|
# >>> type(o[-3])
|
||||||
|
# <class 'int'>
|
||||||
|
# >>> type(o)
|
||||||
|
# <class 'bytes'>
|
||||||
|
|
||||||
|
# This mind boggling behaviour also happens when indexing a bytestring and/or
|
||||||
|
# iteratoring over a bytestring. In other words it will return an int but not
|
||||||
|
# the byte itself!!!!!!!
|
||||||
|
|
||||||
|
# The only way to access a single byte as a byte in bytestring and get the byte in both
|
||||||
|
# Python 2 and Python 3 is to use a slice
|
||||||
|
|
||||||
|
# This problem is so common there are horrible hacks floating around the net to **try**
|
||||||
|
# to work around it, so that code that works on both Python 2 and Python 3 is possible.
|
||||||
|
|
||||||
|
# So in order to write code that works on both Python 2 and Python 3
|
||||||
|
# if you index or access a single byte and want its ord() then use the bord() function.
|
||||||
|
# If instead you want it as a single character byte use the bchar() function
|
||||||
|
# both of which are defined below.
|
||||||
|
|
||||||
|
if PY3:
|
||||||
|
# Also Note: if decode a bytestring using 'latin-1' (or any other full range 0-255 encoding)
|
||||||
|
# in place of ascii you will get a byte value to half-word or integer value
|
||||||
|
# one-to-one mapping (in the 0 - 255 range)
|
||||||
|
|
||||||
|
def bchr(s):
|
||||||
|
return bytes([s])
|
||||||
|
|
||||||
|
def bstr(s):
|
||||||
|
if isinstance(s, str):
|
||||||
|
return bytes(s, 'latin-1')
|
||||||
|
else:
|
||||||
|
return bytes(s)
|
||||||
|
|
||||||
|
def bord(s):
|
||||||
|
return s
|
||||||
|
|
||||||
|
def bchar(s):
|
||||||
|
return bytes([s])
|
||||||
|
|
||||||
|
else:
|
||||||
|
def bchr(s):
|
||||||
|
return chr(s)
|
||||||
|
|
||||||
|
def bstr(s):
|
||||||
|
return str(s)
|
||||||
|
|
||||||
|
def bord(s):
|
||||||
|
return ord(s)
|
||||||
|
|
||||||
|
def bchar(s):
|
||||||
|
return s
|
||||||
|
|
||||||
|
if PY3:
|
||||||
|
# list-producing versions of the major Python iterating functions
|
||||||
|
def lrange(*args, **kwargs):
|
||||||
|
return list(range(*args, **kwargs))
|
||||||
|
|
||||||
|
def lzip(*args, **kwargs):
|
||||||
|
return list(zip(*args, **kwargs))
|
||||||
|
|
||||||
|
def lmap(*args, **kwargs):
|
||||||
|
return list(map(*args, **kwargs))
|
||||||
|
|
||||||
|
def lfilter(*args, **kwargs):
|
||||||
|
return list(filter(*args, **kwargs))
|
||||||
|
else:
|
||||||
|
import __builtin__
|
||||||
|
# Python 2-builtin ranges produce lists
|
||||||
|
lrange = __builtin__.range
|
||||||
|
lzip = __builtin__.zip
|
||||||
|
lmap = __builtin__.map
|
||||||
|
lfilter = __builtin__.filter
|
||||||
|
|
||||||
|
# In Python 3 you can no longer use .encode('hex') on a bytestring
|
||||||
|
# instead use the following on both platforms
|
||||||
|
import binascii
|
||||||
|
def hexlify(bdata):
|
||||||
|
return (binascii.hexlify(bdata)).decode('ascii')
|
||||||
|
|
||||||
|
# If you: import struct
|
||||||
|
# Note: struct pack, unpack, unpack_from all *require* bytestring format
|
||||||
|
# data all the way up to at least Python 2.7.5, Python 3 is okay with either
|
||||||
|
|
||||||
|
# If you: import re
|
||||||
|
# note: Python 3 "re" requires the pattern to be the exact same type as the data to be
|
||||||
|
# searched ... but u"" is not allowed for the pattern itself only b""
|
||||||
|
# Python 2.X allows the pattern to be any type and converts it to match the data
|
||||||
|
# and returns the same type as the data
|
||||||
|
|
||||||
|
# convert string to be utf-8 encoded
|
||||||
|
def utf8_str(p, enc='utf-8'):
|
||||||
|
if p is None:
|
||||||
|
return None
|
||||||
|
if isinstance(p, text_type):
|
||||||
|
return p.encode('utf-8')
|
||||||
|
if enc != 'utf-8':
|
||||||
|
return p.decode(enc).encode('utf-8')
|
||||||
|
return p
|
||||||
|
|
||||||
|
# convert string to be unicode encoded
|
||||||
|
def unicode_str(p, enc='utf-8'):
|
||||||
|
if p is None:
|
||||||
|
return None
|
||||||
|
if isinstance(p, text_type):
|
||||||
|
return p
|
||||||
|
return p.decode(enc)
|
||||||
|
|
||||||
|
ASCII_CHARS = set(chr(x) for x in range(128))
|
||||||
|
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
||||||
|
'abcdefghijklmnopqrstuvwxyz'
|
||||||
|
'0123456789' '#' '_.-/~')
|
||||||
|
IRI_UNSAFE = ASCII_CHARS - URL_SAFE
|
||||||
|
|
||||||
|
# returns a quoted IRI (not a URI)
|
||||||
|
def quoteurl(href):
|
||||||
|
if isinstance(href,binary_type):
|
||||||
|
href = href.decode('utf-8')
|
||||||
|
result = []
|
||||||
|
for char in href:
|
||||||
|
if char in IRI_UNSAFE:
|
||||||
|
char = "%%%02x" % ord(char)
|
||||||
|
result.append(char)
|
||||||
|
return ''.join(result)
|
||||||
|
|
||||||
|
# unquotes url/iri
|
||||||
|
def unquoteurl(href):
|
||||||
|
if isinstance(href,binary_type):
|
||||||
|
href = href.decode('utf-8')
|
||||||
|
href = unquote(href)
|
||||||
|
return href
|
||||||
|
|
||||||
|
# unescape html
|
||||||
|
def unescapeit(sval):
|
||||||
|
return _h.unescape(sval)
|
||||||
|
|
||||||
|
# Python 2.X commandline parsing under Windows has been horribly broken for years!
|
||||||
|
# Use the following code to emulate full unicode commandline parsing on Python 2
|
||||||
|
# ie. To get sys.argv arguments and properly encode them as unicode
|
||||||
|
|
||||||
|
def unicode_argv():
|
||||||
|
global iswindows
|
||||||
|
global PY3
|
||||||
|
if PY3:
|
||||||
|
return sys.argv
|
||||||
|
if iswindows:
|
||||||
|
# Versions 2.x of Python don't support Unicode in sys.argv on
|
||||||
|
# Windows, with the underlying Windows API instead replacing multi-byte
|
||||||
|
# characters with '?'. So use shell32.GetCommandLineArgvW to get sys.argv
|
||||||
|
# as a list of Unicode strings
|
||||||
|
from ctypes import POINTER, byref, cdll, c_int, windll
|
||||||
|
from ctypes.wintypes import LPCWSTR, LPWSTR
|
||||||
|
|
||||||
|
GetCommandLineW = cdll.kernel32.GetCommandLineW
|
||||||
|
GetCommandLineW.argtypes = []
|
||||||
|
GetCommandLineW.restype = LPCWSTR
|
||||||
|
|
||||||
|
CommandLineToArgvW = windll.shell32.CommandLineToArgvW
|
||||||
|
CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)]
|
||||||
|
CommandLineToArgvW.restype = POINTER(LPWSTR)
|
||||||
|
|
||||||
|
cmd = GetCommandLineW()
|
||||||
|
argc = c_int(0)
|
||||||
|
argv = CommandLineToArgvW(cmd, byref(argc))
|
||||||
|
if argc.value > 0:
|
||||||
|
# Remove Python executable and commands if present
|
||||||
|
start = argc.value - len(sys.argv)
|
||||||
|
return [argv[i] for i in
|
||||||
|
range(start, argc.value)]
|
||||||
|
# this should never happen
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
argv = []
|
||||||
|
argvencoding = sys.stdin.encoding
|
||||||
|
if argvencoding is None:
|
||||||
|
argvencoding = sys.getfilesystemencoding()
|
||||||
|
if argvencoding is None:
|
||||||
|
argvencoding = 'utf-8'
|
||||||
|
for arg in sys.argv:
|
||||||
|
if isinstance(arg, text_type):
|
||||||
|
argv.append(arg)
|
||||||
|
else:
|
||||||
|
argv.append(arg.decode(argvencoding))
|
||||||
|
return argv
|
||||||
|
|
||||||
|
|
||||||
|
# Python 2.X is broken in that it does not recognize CP65001 as UTF-8
|
||||||
|
def add_cp65001_codec():
|
||||||
|
if PY2:
|
||||||
|
try:
|
||||||
|
codecs.lookup('cp65001')
|
||||||
|
except LookupError:
|
||||||
|
codecs.register(
|
||||||
|
lambda name: name == 'cp65001' and codecs.lookup('utf-8') or None)
|
||||||
|
return
|
1018
KindleUnpack/kindleunpack.py
Normal file
1018
KindleUnpack/kindleunpack.py
Normal file
File diff suppressed because it is too large
Load Diff
238
KindleUnpack/mobi_cover.py
Normal file
238
KindleUnpack/mobi_cover.py
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import unicode_str
|
||||||
|
|
||||||
|
from .unipath import pathof
|
||||||
|
import os
|
||||||
|
import imghdr
|
||||||
|
|
||||||
|
import struct
|
||||||
|
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||||
|
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||||
|
|
||||||
|
USE_SVG_WRAPPER = True
|
||||||
|
""" Set to True to use svg wrapper for default. """
|
||||||
|
|
||||||
|
FORCE_DEFAULT_TITLE = False
|
||||||
|
""" Set to True to force to use the default title. """
|
||||||
|
|
||||||
|
COVER_PAGE_FINENAME = 'cover_page.xhtml'
|
||||||
|
""" The name for the cover page. """
|
||||||
|
|
||||||
|
DEFAULT_TITLE = 'Cover'
|
||||||
|
""" The default title for the cover page. """
|
||||||
|
|
||||||
|
MAX_WIDTH = 4096
|
||||||
|
""" The max width for the svg cover page. """
|
||||||
|
|
||||||
|
MAX_HEIGHT = 4096
|
||||||
|
""" The max height for the svg cover page. """
|
||||||
|
|
||||||
|
|
||||||
|
def get_image_type(imgname, imgdata=None):
|
||||||
|
imgtype = unicode_str(imghdr.what(pathof(imgname), imgdata))
|
||||||
|
|
||||||
|
# imghdr only checks for JFIF or Exif JPEG files. Apparently, there are some
|
||||||
|
# with only the magic JPEG bytes out there...
|
||||||
|
# ImageMagick handles those, so, do it too.
|
||||||
|
if imgtype is None:
|
||||||
|
if imgdata is None:
|
||||||
|
with open(pathof(imgname), 'rb') as f:
|
||||||
|
imgdata = f.read()
|
||||||
|
if imgdata[0:2] == b'\xFF\xD8':
|
||||||
|
# Get last non-null bytes
|
||||||
|
last = len(imgdata)
|
||||||
|
while (imgdata[last-1:last] == b'\x00'):
|
||||||
|
last-=1
|
||||||
|
# Be extra safe, check the trailing bytes, too.
|
||||||
|
if imgdata[last-2:last] == b'\xFF\xD9':
|
||||||
|
imgtype = "jpeg"
|
||||||
|
return imgtype
|
||||||
|
|
||||||
|
|
||||||
|
def get_image_size(imgname, imgdata=None):
|
||||||
|
'''Determine the image type of imgname (or imgdata) and return its size.
|
||||||
|
|
||||||
|
Originally,
|
||||||
|
Determine the image type of fhandle and return its size.
|
||||||
|
from draco'''
|
||||||
|
if imgdata is None:
|
||||||
|
fhandle = open(pathof(imgname), 'rb')
|
||||||
|
head = fhandle.read(24)
|
||||||
|
else:
|
||||||
|
head = imgdata[0:24]
|
||||||
|
if len(head) != 24:
|
||||||
|
return
|
||||||
|
|
||||||
|
imgtype = get_image_type(imgname, imgdata)
|
||||||
|
if imgtype == 'png':
|
||||||
|
check = struct.unpack(b'>i', head[4:8])[0]
|
||||||
|
if check != 0x0d0a1a0a:
|
||||||
|
return
|
||||||
|
width, height = struct.unpack(b'>ii', head[16:24])
|
||||||
|
elif imgtype == 'gif':
|
||||||
|
width, height = struct.unpack(b'<HH', head[6:10])
|
||||||
|
elif imgtype == 'jpeg' and imgdata is None:
|
||||||
|
try:
|
||||||
|
fhandle.seek(0) # Read 0xff next
|
||||||
|
size = 2
|
||||||
|
ftype = 0
|
||||||
|
while not 0xc0 <= ftype <= 0xcf:
|
||||||
|
fhandle.seek(size, 1)
|
||||||
|
byte = fhandle.read(1)
|
||||||
|
while ord(byte) == 0xff:
|
||||||
|
byte = fhandle.read(1)
|
||||||
|
ftype = ord(byte)
|
||||||
|
size = struct.unpack(b'>H', fhandle.read(2))[0] - 2
|
||||||
|
# We are at a SOFn block
|
||||||
|
fhandle.seek(1, 1) # Skip `precision' byte.
|
||||||
|
height, width = struct.unpack(b'>HH', fhandle.read(4))
|
||||||
|
except Exception: # IGNORE:W0703
|
||||||
|
return
|
||||||
|
elif imgtype == 'jpeg' and imgdata is not None:
|
||||||
|
try:
|
||||||
|
pos = 0
|
||||||
|
size = 2
|
||||||
|
ftype = 0
|
||||||
|
while not 0xc0 <= ftype <= 0xcf:
|
||||||
|
pos += size
|
||||||
|
byte = imgdata[pos:pos+1]
|
||||||
|
pos += 1
|
||||||
|
while ord(byte) == 0xff:
|
||||||
|
byte = imgdata[pos:pos+1]
|
||||||
|
pos += 1
|
||||||
|
ftype = ord(byte)
|
||||||
|
size = struct.unpack(b'>H', imgdata[pos:pos+2])[0] - 2
|
||||||
|
pos += 2
|
||||||
|
# We are at a SOFn block
|
||||||
|
pos += 1 # Skip `precision' byte.
|
||||||
|
height, width = struct.unpack(b'>HH', imgdata[pos:pos+4])
|
||||||
|
pos += 4
|
||||||
|
except Exception: # IGNORE:W0703
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
return
|
||||||
|
return width, height
|
||||||
|
|
||||||
|
# XXX experimental
|
||||||
|
class CoverProcessor(object):
|
||||||
|
|
||||||
|
"""Create a cover page.
|
||||||
|
|
||||||
|
"""
|
||||||
|
def __init__(self, files, metadata, rscnames, imgname=None, imgdata=None):
|
||||||
|
self.files = files
|
||||||
|
self.metadata = metadata
|
||||||
|
self.rscnames = rscnames
|
||||||
|
self.cover_page = COVER_PAGE_FINENAME
|
||||||
|
self.use_svg = USE_SVG_WRAPPER # Use svg wrapper.
|
||||||
|
self.lang = metadata.get('Language', ['en'])[0]
|
||||||
|
# This should ensure that if the methods to find the cover image's
|
||||||
|
# dimensions should fail for any reason, the SVG routine will not be used.
|
||||||
|
[self.width, self.height] = (-1,-1)
|
||||||
|
if FORCE_DEFAULT_TITLE:
|
||||||
|
self.title = DEFAULT_TITLE
|
||||||
|
else:
|
||||||
|
self.title = metadata.get('Title', [DEFAULT_TITLE])[0]
|
||||||
|
|
||||||
|
self.cover_image = None
|
||||||
|
if imgname is not None:
|
||||||
|
self.cover_image = imgname
|
||||||
|
elif 'CoverOffset' in metadata:
|
||||||
|
imageNumber = int(metadata['CoverOffset'][0])
|
||||||
|
cover_image = self.rscnames[imageNumber]
|
||||||
|
if cover_image is not None:
|
||||||
|
self.cover_image = cover_image
|
||||||
|
else:
|
||||||
|
print('Warning: Cannot identify the cover image.')
|
||||||
|
if self.use_svg:
|
||||||
|
try:
|
||||||
|
if imgdata is None:
|
||||||
|
fname = os.path.join(files.imgdir, self.cover_image)
|
||||||
|
[self.width, self.height] = get_image_size(fname)
|
||||||
|
else:
|
||||||
|
[self.width, self.height] = get_image_size(None, imgdata)
|
||||||
|
except:
|
||||||
|
self.use_svg = False
|
||||||
|
width = self.width
|
||||||
|
height = self.height
|
||||||
|
if width < 0 or height < 0 or width > MAX_WIDTH or height > MAX_HEIGHT:
|
||||||
|
self.use_svg = False
|
||||||
|
return
|
||||||
|
|
||||||
|
def getImageName(self):
|
||||||
|
return self.cover_image
|
||||||
|
|
||||||
|
def getXHTMLName(self):
|
||||||
|
return self.cover_page
|
||||||
|
|
||||||
|
def buildXHTML(self):
|
||||||
|
print('Building a cover page.')
|
||||||
|
files = self.files
|
||||||
|
cover_image = self.cover_image
|
||||||
|
title = self.title
|
||||||
|
lang = self.lang
|
||||||
|
|
||||||
|
image_dir = os.path.normpath(os.path.relpath(files.k8images, files.k8text))
|
||||||
|
image_path = os.path.join(image_dir, cover_image).replace('\\', '/')
|
||||||
|
|
||||||
|
if not self.use_svg:
|
||||||
|
data = ''
|
||||||
|
data += '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>'
|
||||||
|
data += '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"'
|
||||||
|
data += ' xml:lang="{:s}">\n'.format(lang)
|
||||||
|
data += '<head>\n<title>{:s}</title>\n'.format(title)
|
||||||
|
data += '<style type="text/css">\n'
|
||||||
|
data += 'body {\n margin: 0;\n padding: 0;\n text-align: center;\n}\n'
|
||||||
|
data += 'div {\n height: 100%;\n width: 100%;\n text-align: center;\n page-break-inside: avoid;\n}\n'
|
||||||
|
data += 'img {\n display: inline-block;\n height: 100%;\n margin: 0 auto;\n}\n'
|
||||||
|
data += '</style>\n</head>\n'
|
||||||
|
data += '<body><div>\n'
|
||||||
|
data += ' <img src="{:s}" alt=""/>\n'.format(image_path)
|
||||||
|
data += '</div></body>\n</html>'
|
||||||
|
else:
|
||||||
|
width = self.width
|
||||||
|
height = self.height
|
||||||
|
viewBox = "0 0 {0:d} {1:d}".format(width, height)
|
||||||
|
|
||||||
|
data = ''
|
||||||
|
data += '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>'
|
||||||
|
data += '<html xmlns="http://www.w3.org/1999/xhtml"'
|
||||||
|
data += ' xml:lang="{:s}">\n'.format(lang)
|
||||||
|
data += '<head>\n <title>{:s}</title>\n'.format(title)
|
||||||
|
data += '<style type="text/css">\n'
|
||||||
|
data += 'svg {padding: 0pt; margin:0pt}\n'
|
||||||
|
data += 'body { text-align: center; padding:0pt; margin: 0pt; }\n'
|
||||||
|
data += '</style>\n</head>\n'
|
||||||
|
data += '<body>\n <div>\n'
|
||||||
|
data += ' <svg xmlns="http://www.w3.org/2000/svg" height="100%" preserveAspectRatio="xMidYMid meet"'
|
||||||
|
data += ' version="1.1" viewBox="{0:s}" width="100%" xmlns:xlink="http://www.w3.org/1999/xlink">\n'.format(viewBox)
|
||||||
|
data += ' <image height="{0}" width="{1}" xlink:href="{2}"/>\n'.format(height, width, image_path)
|
||||||
|
data += ' </svg>\n'
|
||||||
|
data += ' </div>\n</body>\n</html>'
|
||||||
|
return data
|
||||||
|
|
||||||
|
def writeXHTML(self):
|
||||||
|
files = self.files
|
||||||
|
cover_page = self.cover_page
|
||||||
|
|
||||||
|
data = self.buildXHTML()
|
||||||
|
|
||||||
|
outfile = os.path.join(files.k8text, cover_page)
|
||||||
|
if os.path.exists(pathof(outfile)):
|
||||||
|
print('Warning: {:s} already exists.'.format(cover_page))
|
||||||
|
os.remove(pathof(outfile))
|
||||||
|
with open(pathof(outfile), 'wb') as f:
|
||||||
|
f.write(data.encode('utf-8'))
|
||||||
|
return
|
||||||
|
|
||||||
|
def guide_toxml(self):
|
||||||
|
files = self.files
|
||||||
|
text_dir = os.path.relpath(files.k8text, files.k8oebps)
|
||||||
|
data = '<reference type="cover" title="Cover" href="{:s}/{:s}" />\n'.format(
|
||||||
|
text_dir, self.cover_page)
|
||||||
|
return data
|
377
KindleUnpack/mobi_dict.py
Normal file
377
KindleUnpack/mobi_dict.py
Normal file
@@ -0,0 +1,377 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import PY2, PY3, utf8_str, bstr, bchr
|
||||||
|
|
||||||
|
if PY2:
|
||||||
|
range = xrange
|
||||||
|
array_format = b'B'
|
||||||
|
if PY3:
|
||||||
|
unichr = chr
|
||||||
|
array_format = "B"
|
||||||
|
|
||||||
|
import array
|
||||||
|
|
||||||
|
import struct
|
||||||
|
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||||
|
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||||
|
|
||||||
|
from .mobi_index import getVariableWidthValue, readTagSection, getTagMap
|
||||||
|
from .mobi_utils import toHex
|
||||||
|
|
||||||
|
DEBUG_DICT = False
|
||||||
|
|
||||||
|
class InflectionData(object):
|
||||||
|
|
||||||
|
def __init__(self, infldatas):
|
||||||
|
self.infldatas = infldatas
|
||||||
|
self.starts = []
|
||||||
|
self.counts = []
|
||||||
|
for idata in self.infldatas:
|
||||||
|
start, = struct.unpack_from(b'>L', idata, 0x14)
|
||||||
|
count, = struct.unpack_from(b'>L', idata, 0x18)
|
||||||
|
self.starts.append(start)
|
||||||
|
self.counts.append(count)
|
||||||
|
|
||||||
|
def lookup(self, lookupvalue):
|
||||||
|
i = 0
|
||||||
|
rvalue = lookupvalue
|
||||||
|
while rvalue >= self.counts[i]:
|
||||||
|
rvalue = rvalue - self.counts[i]
|
||||||
|
i += 1
|
||||||
|
if i == len(self.counts):
|
||||||
|
print("Error: Problem with multiple inflections data sections")
|
||||||
|
return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0]
|
||||||
|
return rvalue, self.starts[i], self.counts[i], self.infldatas[i]
|
||||||
|
|
||||||
|
def offsets(self, value):
|
||||||
|
rvalue, start, count, data = self.lookup(value)
|
||||||
|
offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue))
|
||||||
|
if rvalue + 1 < count:
|
||||||
|
nextOffset, = struct.unpack_from(b'>H',data, start + 4 + (2 * (rvalue + 1)))
|
||||||
|
else:
|
||||||
|
nextOffset = None
|
||||||
|
return offset, nextOffset, data
|
||||||
|
|
||||||
|
|
||||||
|
class dictSupport(object):
|
||||||
|
|
||||||
|
def __init__(self, mh, sect):
|
||||||
|
self.mh = mh
|
||||||
|
self.header = mh.header
|
||||||
|
self.sect = sect
|
||||||
|
self.metaOrthIndex = mh.metaOrthIndex
|
||||||
|
self.metaInflIndex = mh.metaInflIndex
|
||||||
|
|
||||||
|
def parseHeader(self, data):
|
||||||
|
"read INDX header"
|
||||||
|
if not data[:4] == b'INDX':
|
||||||
|
print("Warning: index section is not INDX")
|
||||||
|
return False
|
||||||
|
words = (
|
||||||
|
'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
|
||||||
|
'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc'
|
||||||
|
)
|
||||||
|
num = len(words)
|
||||||
|
values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)])
|
||||||
|
header = {}
|
||||||
|
for n in range(num):
|
||||||
|
header[words[n]] = values[n]
|
||||||
|
|
||||||
|
ordt1 = None
|
||||||
|
ordt2 = None
|
||||||
|
|
||||||
|
otype, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4)
|
||||||
|
header['otype'] = otype
|
||||||
|
header['oentries'] = oentries
|
||||||
|
|
||||||
|
if DEBUG_DICT:
|
||||||
|
print("otype %d, oentries %d, op1 %d, op2 %d, otagx %d" % (otype, oentries, op1, op2, otagx))
|
||||||
|
|
||||||
|
if header['code'] == 0xfdea or oentries > 0:
|
||||||
|
# some dictionaries seem to be codepage 65002 (0xFDEA) which seems
|
||||||
|
# to be some sort of strange EBCDIC utf-8 or 16 encoded strings
|
||||||
|
# So we need to look for them and store them away to process leading text
|
||||||
|
# ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
|
||||||
|
# we only ever seem to use the second but ...
|
||||||
|
#
|
||||||
|
# if otype = 0, ORDT table uses 16 bit values as offsets into the table
|
||||||
|
# if otype = 1, ORDT table uses 8 bit values as offsets inot the table
|
||||||
|
|
||||||
|
assert(data[op1:op1+4] == b'ORDT')
|
||||||
|
assert(data[op2:op2+4] == b'ORDT')
|
||||||
|
ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4)
|
||||||
|
ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4)
|
||||||
|
|
||||||
|
if DEBUG_DICT:
|
||||||
|
print("parsed INDX header:")
|
||||||
|
for key in header:
|
||||||
|
print(key, "%x" % header[key],)
|
||||||
|
print("\n")
|
||||||
|
return header, ordt1, ordt2
|
||||||
|
|
||||||
|
def getPositionMap(self):
|
||||||
|
sect = self.sect
|
||||||
|
|
||||||
|
positionMap = {}
|
||||||
|
|
||||||
|
metaOrthIndex = self.metaOrthIndex
|
||||||
|
metaInflIndex = self.metaInflIndex
|
||||||
|
|
||||||
|
decodeInflection = True
|
||||||
|
if metaOrthIndex != 0xFFFFFFFF:
|
||||||
|
print("Info: Document contains orthographic index, handle as dictionary")
|
||||||
|
if metaInflIndex == 0xFFFFFFFF:
|
||||||
|
decodeInflection = False
|
||||||
|
else:
|
||||||
|
metaInflIndexData = sect.loadSection(metaInflIndex)
|
||||||
|
|
||||||
|
print("\nParsing metaInflIndexData")
|
||||||
|
midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData)
|
||||||
|
|
||||||
|
metaIndexCount = midxhdr['count']
|
||||||
|
idatas = []
|
||||||
|
for j in range(metaIndexCount):
|
||||||
|
idatas.append(sect.loadSection(metaInflIndex + 1 + j))
|
||||||
|
dinfl = InflectionData(idatas)
|
||||||
|
|
||||||
|
inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)
|
||||||
|
tagSectionStart = midxhdr['len']
|
||||||
|
inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData)
|
||||||
|
if DEBUG_DICT:
|
||||||
|
print("inflectionTagTable: %s" % inflectionTagTable)
|
||||||
|
if self.hasTag(inflectionTagTable, 0x07):
|
||||||
|
print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported")
|
||||||
|
decodeInflection = False
|
||||||
|
|
||||||
|
data = sect.loadSection(metaOrthIndex)
|
||||||
|
|
||||||
|
print("\nParsing metaOrthIndex")
|
||||||
|
idxhdr, hordt1, hordt2 = self.parseHeader(data)
|
||||||
|
|
||||||
|
tagSectionStart = idxhdr['len']
|
||||||
|
controlByteCount, tagTable = readTagSection(tagSectionStart, data)
|
||||||
|
orthIndexCount = idxhdr['count']
|
||||||
|
print("orthIndexCount is", orthIndexCount)
|
||||||
|
if DEBUG_DICT:
|
||||||
|
print("orthTagTable: %s" % tagTable)
|
||||||
|
if hordt2 is not None:
|
||||||
|
print("orth entry uses ordt2 lookup table of type ", idxhdr['otype'])
|
||||||
|
hasEntryLength = self.hasTag(tagTable, 0x02)
|
||||||
|
if not hasEntryLength:
|
||||||
|
print("Info: Index doesn't contain entry length tags")
|
||||||
|
|
||||||
|
print("Read dictionary index data")
|
||||||
|
for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
|
||||||
|
data = sect.loadSection(i)
|
||||||
|
hdrinfo, ordt1, ordt2 = self.parseHeader(data)
|
||||||
|
idxtPos = hdrinfo['start']
|
||||||
|
entryCount = hdrinfo['count']
|
||||||
|
idxPositions = []
|
||||||
|
for j in range(entryCount):
|
||||||
|
pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j))
|
||||||
|
idxPositions.append(pos)
|
||||||
|
# The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
|
||||||
|
idxPositions.append(idxtPos)
|
||||||
|
for j in range(entryCount):
|
||||||
|
startPos = idxPositions[j]
|
||||||
|
endPos = idxPositions[j+1]
|
||||||
|
textLength = ord(data[startPos:startPos+1])
|
||||||
|
text = data[startPos+1:startPos+1+textLength]
|
||||||
|
if hordt2 is not None:
|
||||||
|
utext = u""
|
||||||
|
if idxhdr['otype'] == 0:
|
||||||
|
pattern = b'>H'
|
||||||
|
inc = 2
|
||||||
|
else:
|
||||||
|
pattern = b'>B'
|
||||||
|
inc = 1
|
||||||
|
pos = 0
|
||||||
|
while pos < textLength:
|
||||||
|
off, = struct.unpack_from(pattern, text, pos)
|
||||||
|
if off < len(hordt2):
|
||||||
|
utext += unichr(hordt2[off])
|
||||||
|
else:
|
||||||
|
utext += unichr(off)
|
||||||
|
pos += inc
|
||||||
|
text = utext.encode('utf-8')
|
||||||
|
|
||||||
|
tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos)
|
||||||
|
if 0x01 in tagMap:
|
||||||
|
if decodeInflection and 0x2a in tagMap:
|
||||||
|
inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable,
|
||||||
|
dinfl, inflNameData, tagMap[0x2a])
|
||||||
|
else:
|
||||||
|
inflectionGroups = b''
|
||||||
|
assert len(tagMap[0x01]) == 1
|
||||||
|
entryStartPosition = tagMap[0x01][0]
|
||||||
|
if hasEntryLength:
|
||||||
|
# The idx:entry attribute "scriptable" must be present to create entry length tags.
|
||||||
|
ml = b'<idx:entry scriptable="yes"><idx:orth value="' + text + b'">' + inflectionGroups + b'</idx:orth>'
|
||||||
|
if entryStartPosition in positionMap:
|
||||||
|
positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml
|
||||||
|
else:
|
||||||
|
positionMap[entryStartPosition] = ml
|
||||||
|
assert len(tagMap[0x02]) == 1
|
||||||
|
entryEndPosition = entryStartPosition + tagMap[0x02][0]
|
||||||
|
if entryEndPosition in positionMap:
|
||||||
|
positionMap[entryEndPosition] = b"</idx:entry>" + positionMap[entryEndPosition]
|
||||||
|
else:
|
||||||
|
positionMap[entryEndPosition] = b"</idx:entry>"
|
||||||
|
|
||||||
|
else:
|
||||||
|
indexTags = b'<idx:entry>\n<idx:orth value="' + text + b'">\n' + inflectionGroups + b'</idx:entry>\n'
|
||||||
|
if entryStartPosition in positionMap:
|
||||||
|
positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags
|
||||||
|
else:
|
||||||
|
positionMap[entryStartPosition] = indexTags
|
||||||
|
return positionMap
|
||||||
|
|
||||||
|
def hasTag(self, tagTable, tag):
|
||||||
|
'''
|
||||||
|
Test if tag table contains given tag.
|
||||||
|
|
||||||
|
@param tagTable: The tag table.
|
||||||
|
@param tag: The tag to search.
|
||||||
|
@return: True if tag table contains given tag; False otherwise.
|
||||||
|
'''
|
||||||
|
for currentTag, _, _, _ in tagTable:
|
||||||
|
if currentTag == tag:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList):
|
||||||
|
'''
|
||||||
|
Create string which contains the inflection groups with inflection rules as mobipocket tags.
|
||||||
|
|
||||||
|
@param mainEntry: The word to inflect.
|
||||||
|
@param controlByteCount: The number of control bytes.
|
||||||
|
@param tagTable: The tag table.
|
||||||
|
@param data: The Inflection data object to properly select the right inflection data section to use
|
||||||
|
@param inflectionNames: The inflection rule name data.
|
||||||
|
@param groupList: The list of inflection groups to process.
|
||||||
|
@return: String with inflection groups and rules or empty string if required tags are not available.
|
||||||
|
'''
|
||||||
|
result = b""
|
||||||
|
for value in groupList:
|
||||||
|
offset, nextOffset, data = dinfl.offsets(value)
|
||||||
|
|
||||||
|
# First byte seems to be always 0x00 and must be skipped.
|
||||||
|
assert ord(data[offset:offset+1]) == 0x00
|
||||||
|
tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset)
|
||||||
|
|
||||||
|
# Make sure that the required tags are available.
|
||||||
|
if 0x05 not in tagMap:
|
||||||
|
print("Error: Required tag 0x05 not found in tagMap")
|
||||||
|
return ""
|
||||||
|
if 0x1a not in tagMap:
|
||||||
|
print("Error: Required tag 0x1a not found in tagMap")
|
||||||
|
return b''
|
||||||
|
|
||||||
|
result += b'<idx:infl>'
|
||||||
|
|
||||||
|
for i in range(len(tagMap[0x05])):
|
||||||
|
|
||||||
|
# Get name of inflection rule.
|
||||||
|
value = tagMap[0x05][i]
|
||||||
|
consumed, textLength = getVariableWidthValue(inflectionNames, value)
|
||||||
|
inflectionName = inflectionNames[value+consumed:value+consumed+textLength]
|
||||||
|
|
||||||
|
# Get and apply inflection rule across possibly multiple inflection data sections
|
||||||
|
value = tagMap[0x1a][i]
|
||||||
|
rvalue, start, count, data = dinfl.lookup(value)
|
||||||
|
offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue))
|
||||||
|
textLength = ord(data[offset:offset+1])
|
||||||
|
inflection = self.applyInflectionRule(mainEntry, data, offset+1, offset+1+textLength)
|
||||||
|
if inflection is not None:
|
||||||
|
result += b' <idx:iform name="' + inflectionName + b'" value="' + inflection + b'"/>'
|
||||||
|
|
||||||
|
result += b'</idx:infl>'
|
||||||
|
return result
|
||||||
|
|
||||||
|
def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end):
|
||||||
|
'''
|
||||||
|
Apply inflection rule.
|
||||||
|
|
||||||
|
@param mainEntry: The word to inflect.
|
||||||
|
@param inflectionRuleData: The inflection rules.
|
||||||
|
@param start: The start position of the inflection rule to use.
|
||||||
|
@param end: The end position of the inflection rule to use.
|
||||||
|
@return: The string with the inflected word or None if an error occurs.
|
||||||
|
'''
|
||||||
|
mode = -1
|
||||||
|
byteArray = array.array(array_format, mainEntry)
|
||||||
|
position = len(byteArray)
|
||||||
|
for charOffset in range(start, end):
|
||||||
|
char = inflectionRuleData[charOffset:charOffset+1]
|
||||||
|
abyte = ord(char)
|
||||||
|
if abyte >= 0x0a and abyte <= 0x13:
|
||||||
|
# Move cursor backwards
|
||||||
|
offset = abyte - 0x0a
|
||||||
|
if mode not in [0x02, 0x03]:
|
||||||
|
mode = 0x02
|
||||||
|
position = len(byteArray)
|
||||||
|
position -= offset
|
||||||
|
elif abyte > 0x13:
|
||||||
|
if mode == -1:
|
||||||
|
print("Error: Unexpected first byte %i of inflection rule" % abyte)
|
||||||
|
return None
|
||||||
|
elif position == -1:
|
||||||
|
print("Error: Unexpected first byte %i of inflection rule" % abyte)
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
if mode == 0x01:
|
||||||
|
# Insert at word start
|
||||||
|
byteArray.insert(position, abyte)
|
||||||
|
position += 1
|
||||||
|
elif mode == 0x02:
|
||||||
|
# Insert at word end
|
||||||
|
byteArray.insert(position, abyte)
|
||||||
|
elif mode == 0x03:
|
||||||
|
# Delete at word end
|
||||||
|
position -= 1
|
||||||
|
deleted = byteArray.pop(position)
|
||||||
|
if bchr(deleted) != char:
|
||||||
|
if DEBUG_DICT:
|
||||||
|
print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
|
||||||
|
print("Error: Delete operation of inflection rule failed")
|
||||||
|
return None
|
||||||
|
elif mode == 0x04:
|
||||||
|
# Delete at word start
|
||||||
|
deleted = byteArray.pop(position)
|
||||||
|
if bchr(deleted) != char:
|
||||||
|
if DEBUG_DICT:
|
||||||
|
print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
|
||||||
|
print("Error: Delete operation of inflection rule failed")
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
print("Error: Inflection rule mode %x is not implemented" % mode)
|
||||||
|
return None
|
||||||
|
elif abyte == 0x01:
|
||||||
|
# Insert at word start
|
||||||
|
if mode not in [0x01, 0x04]:
|
||||||
|
position = 0
|
||||||
|
mode = abyte
|
||||||
|
elif abyte == 0x02:
|
||||||
|
# Insert at word end
|
||||||
|
if mode not in [0x02, 0x03]:
|
||||||
|
position = len(byteArray)
|
||||||
|
mode = abyte
|
||||||
|
elif abyte == 0x03:
|
||||||
|
# Delete at word end
|
||||||
|
if mode not in [0x02, 0x03]:
|
||||||
|
position = len(byteArray)
|
||||||
|
mode = abyte
|
||||||
|
elif abyte == 0x04:
|
||||||
|
# Delete at word start
|
||||||
|
if mode not in [0x01, 0x04]:
|
||||||
|
position = 0
|
||||||
|
# Delete at word start
|
||||||
|
mode = abyte
|
||||||
|
else:
|
||||||
|
print("Error: Inflection rule mode %x is not implemented" % abyte)
|
||||||
|
return None
|
||||||
|
return utf8_str(byteArray.tostring())
|
934
KindleUnpack/mobi_header.py
Normal file
934
KindleUnpack/mobi_header.py
Normal file
@@ -0,0 +1,934 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
DEBUG_USE_ORDERED_DICTIONARY = False # OrderedDict is supoorted >= python 2.7.
|
||||||
|
""" set to True to use OrderedDict for MobiHeader.metadata."""
|
||||||
|
|
||||||
|
if DEBUG_USE_ORDERED_DICTIONARY:
|
||||||
|
from collections import OrderedDict as dict_
|
||||||
|
else:
|
||||||
|
dict_ = dict
|
||||||
|
|
||||||
|
from .compatibility_utils import PY2, unicode_str, hexlify, bord
|
||||||
|
|
||||||
|
if PY2:
|
||||||
|
range = xrange
|
||||||
|
|
||||||
|
import struct
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
# import the mobiunpack support libraries
|
||||||
|
from .mobi_utils import getLanguage
|
||||||
|
from .mobi_uncompress import HuffcdicReader, PalmdocReader, UncompressedReader
|
||||||
|
|
||||||
|
class unpackException(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def sortedHeaderKeys(mheader):
|
||||||
|
hdrkeys = sorted(list(mheader.keys()), key=lambda akey: mheader[akey][0])
|
||||||
|
return hdrkeys
|
||||||
|
|
||||||
|
|
||||||
|
# HD Containers have their own headers and their own EXTH
|
||||||
|
# this is just guesswork so far, making big assumption that
|
||||||
|
# metavalue key numbers remain the same in the CONT EXTH
|
||||||
|
|
||||||
|
# Note: The layout of the CONT Header is still unknown
|
||||||
|
# so just deal with their EXTH sections for now
|
||||||
|
|
||||||
|
def dump_contexth(cpage, extheader):
|
||||||
|
# determine text encoding
|
||||||
|
codec = 'windows-1252'
|
||||||
|
codec_map = {
|
||||||
|
1252 : 'windows-1252',
|
||||||
|
65001: 'utf-8',
|
||||||
|
}
|
||||||
|
if cpage in codec_map:
|
||||||
|
codec = codec_map[cpage]
|
||||||
|
if extheader == b'':
|
||||||
|
return
|
||||||
|
id_map_strings = {
|
||||||
|
1 : 'Drm Server Id',
|
||||||
|
2 : 'Drm Commerce Id',
|
||||||
|
3 : 'Drm Ebookbase Book Id',
|
||||||
|
4 : 'Drm Ebookbase Dep Id',
|
||||||
|
100 : 'Creator',
|
||||||
|
101 : 'Publisher',
|
||||||
|
102 : 'Imprint',
|
||||||
|
103 : 'Description',
|
||||||
|
104 : 'ISBN',
|
||||||
|
105 : 'Subject',
|
||||||
|
106 : 'Published',
|
||||||
|
107 : 'Review',
|
||||||
|
108 : 'Contributor',
|
||||||
|
109 : 'Rights',
|
||||||
|
110 : 'SubjectCode',
|
||||||
|
111 : 'Type',
|
||||||
|
112 : 'Source',
|
||||||
|
113 : 'ASIN',
|
||||||
|
114 : 'versionNumber',
|
||||||
|
117 : 'Adult',
|
||||||
|
118 : 'Retail-Price',
|
||||||
|
119 : 'Retail-Currency',
|
||||||
|
120 : 'TSC',
|
||||||
|
122 : 'fixed-layout',
|
||||||
|
123 : 'book-type',
|
||||||
|
124 : 'orientation-lock',
|
||||||
|
126 : 'original-resolution',
|
||||||
|
127 : 'zero-gutter',
|
||||||
|
128 : 'zero-margin',
|
||||||
|
129 : 'MetadataResourceURI',
|
||||||
|
132 : 'RegionMagnification',
|
||||||
|
150 : 'LendingEnabled',
|
||||||
|
200 : 'DictShortName',
|
||||||
|
501 : 'cdeType',
|
||||||
|
502 : 'last_update_time',
|
||||||
|
503 : 'Updated_Title',
|
||||||
|
504 : 'CDEContentKey',
|
||||||
|
505 : 'AmazonContentReference',
|
||||||
|
506 : 'Title-Language',
|
||||||
|
507 : 'Title-Display-Direction',
|
||||||
|
508 : 'Title-Pronunciation',
|
||||||
|
509 : 'Title-Collation',
|
||||||
|
510 : 'Secondary-Title',
|
||||||
|
511 : 'Secondary-Title-Language',
|
||||||
|
512 : 'Secondary-Title-Direction',
|
||||||
|
513 : 'Secondary-Title-Pronunciation',
|
||||||
|
514 : 'Secondary-Title-Collation',
|
||||||
|
515 : 'Author-Language',
|
||||||
|
516 : 'Author-Display-Direction',
|
||||||
|
517 : 'Author-Pronunciation',
|
||||||
|
518 : 'Author-Collation',
|
||||||
|
519 : 'Author-Type',
|
||||||
|
520 : 'Publisher-Language',
|
||||||
|
521 : 'Publisher-Display-Direction',
|
||||||
|
522 : 'Publisher-Pronunciation',
|
||||||
|
523 : 'Publisher-Collation',
|
||||||
|
524 : 'Content-Language-Tag',
|
||||||
|
525 : 'primary-writing-mode',
|
||||||
|
526 : 'NCX-Ingested-By-Software',
|
||||||
|
527 : 'page-progression-direction',
|
||||||
|
528 : 'override-kindle-fonts',
|
||||||
|
529 : 'Compression-Upgraded',
|
||||||
|
530 : 'Soft-Hyphens-In-Content',
|
||||||
|
531 : 'Dictionary_In_Langague',
|
||||||
|
532 : 'Dictionary_Out_Language',
|
||||||
|
533 : 'Font_Converted',
|
||||||
|
534 : 'Amazon_Creator_Info',
|
||||||
|
535 : 'Creator-Build-Tag',
|
||||||
|
536 : 'HD-Media-Containers-Info', # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?)
|
||||||
|
538 : 'Resource-Container-Fidelity',
|
||||||
|
539 : 'HD-Container-Mimetype',
|
||||||
|
540 : 'Sample-For_Special-Purpose',
|
||||||
|
541 : 'Kindletool-Operation-Information',
|
||||||
|
542 : 'Container_Id',
|
||||||
|
543 : 'Asset-Type', # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER
|
||||||
|
544 : 'Unknown_544',
|
||||||
|
}
|
||||||
|
id_map_values = {
|
||||||
|
115 : 'sample',
|
||||||
|
116 : 'StartOffset',
|
||||||
|
121 : 'Mobi8-Boundary-Section',
|
||||||
|
125 : 'Embedded-Record-Count',
|
||||||
|
130 : 'Offline-Sample',
|
||||||
|
131 : 'Metadata-Record-Offset',
|
||||||
|
201 : 'CoverOffset',
|
||||||
|
202 : 'ThumbOffset',
|
||||||
|
203 : 'HasFakeCover',
|
||||||
|
204 : 'Creator-Software',
|
||||||
|
205 : 'Creator-Major-Version',
|
||||||
|
206 : 'Creator-Minor-Version',
|
||||||
|
207 : 'Creator-Build-Number',
|
||||||
|
401 : 'Clipping-Limit',
|
||||||
|
402 : 'Publisher-Limit',
|
||||||
|
404 : 'Text-to-Speech-Disabled',
|
||||||
|
406 : 'Rental-Expiration-Time',
|
||||||
|
}
|
||||||
|
id_map_hexstrings = {
|
||||||
|
208 : 'Watermark_(hex)',
|
||||||
|
209 : 'Tamper-Proof-Keys_(hex)',
|
||||||
|
300 : 'Font-Signature_(hex)',
|
||||||
|
403 : 'Unknown_(403)_(hex)',
|
||||||
|
405 : 'Ownership-Type_(hex)',
|
||||||
|
407 : 'Unknown_(407)_(hex)',
|
||||||
|
420 : 'Multimedia-Content-Reference_(hex)',
|
||||||
|
450 : 'Locations_Match_(hex)',
|
||||||
|
451 : 'Full-Story-Length_(hex)',
|
||||||
|
452 : 'Sample-Start_Location_(hex)',
|
||||||
|
453 : 'Sample-End-Location_(hex)',
|
||||||
|
}
|
||||||
|
_length, num_items = struct.unpack(b'>LL', extheader[4:12])
|
||||||
|
extheader = extheader[12:]
|
||||||
|
pos = 0
|
||||||
|
for _ in range(num_items):
|
||||||
|
id, size = struct.unpack(b'>LL', extheader[pos:pos+8])
|
||||||
|
content = extheader[pos + 8: pos + size]
|
||||||
|
if id in id_map_strings:
|
||||||
|
name = id_map_strings[id]
|
||||||
|
print('\n Key: "%s"\n Value: "%s"' % (name, content.decode(codec, errors='replace')))
|
||||||
|
elif id in id_map_values:
|
||||||
|
name = id_map_values[id]
|
||||||
|
if size == 9:
|
||||||
|
value, = struct.unpack(b'B',content)
|
||||||
|
print('\n Key: "%s"\n Value: 0x%01x' % (name, value))
|
||||||
|
elif size == 10:
|
||||||
|
value, = struct.unpack(b'>H',content)
|
||||||
|
print('\n Key: "%s"\n Value: 0x%02x' % (name, value))
|
||||||
|
elif size == 12:
|
||||||
|
value, = struct.unpack(b'>L',content)
|
||||||
|
print('\n Key: "%s"\n Value: 0x%04x' % (name, value))
|
||||||
|
else:
|
||||||
|
print("\nError: Value for %s has unexpected size of %s" % (name, size))
|
||||||
|
elif id in id_map_hexstrings:
|
||||||
|
name = id_map_hexstrings[id]
|
||||||
|
print('\n Key: "%s"\n Value: 0x%s' % (name, hexlify(content)))
|
||||||
|
else:
|
||||||
|
print("\nWarning: Unknown metadata with id %s found" % id)
|
||||||
|
name = str(id) + ' (hex)'
|
||||||
|
print(' Key: "%s"\n Value: 0x%s' % (name, hexlify(content)))
|
||||||
|
pos += size
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
class MobiHeader:
|
||||||
|
# all values are packed in big endian format
|
||||||
|
palmdoc_header = {
|
||||||
|
'compression_type' : (0x00, b'>H', 2),
|
||||||
|
'fill0' : (0x02, b'>H', 2),
|
||||||
|
'text_length' : (0x04, b'>L', 4),
|
||||||
|
'text_records' : (0x08, b'>H', 2),
|
||||||
|
'max_section_size' : (0x0a, b'>H', 2),
|
||||||
|
'read_pos ' : (0x0c, b'>L', 4),
|
||||||
|
}
|
||||||
|
|
||||||
|
mobi6_header = {
|
||||||
|
'compression_type' : (0x00, b'>H', 2),
|
||||||
|
'fill0' : (0x02, b'>H', 2),
|
||||||
|
'text_length' : (0x04, b'>L', 4),
|
||||||
|
'text_records' : (0x08, b'>H', 2),
|
||||||
|
'max_section_size' : (0x0a, b'>H', 2),
|
||||||
|
'crypto_type' : (0x0c, b'>H', 2),
|
||||||
|
'fill1' : (0x0e, b'>H', 2),
|
||||||
|
'magic' : (0x10, b'4s', 4),
|
||||||
|
'header_length (from MOBI)' : (0x14, b'>L', 4),
|
||||||
|
'type' : (0x18, b'>L', 4),
|
||||||
|
'codepage' : (0x1c, b'>L', 4),
|
||||||
|
'unique_id' : (0x20, b'>L', 4),
|
||||||
|
'version' : (0x24, b'>L', 4),
|
||||||
|
'metaorthindex' : (0x28, b'>L', 4),
|
||||||
|
'metainflindex' : (0x2c, b'>L', 4),
|
||||||
|
'index_names' : (0x30, b'>L', 4),
|
||||||
|
'index_keys' : (0x34, b'>L', 4),
|
||||||
|
'extra_index0' : (0x38, b'>L', 4),
|
||||||
|
'extra_index1' : (0x3c, b'>L', 4),
|
||||||
|
'extra_index2' : (0x40, b'>L', 4),
|
||||||
|
'extra_index3' : (0x44, b'>L', 4),
|
||||||
|
'extra_index4' : (0x48, b'>L', 4),
|
||||||
|
'extra_index5' : (0x4c, b'>L', 4),
|
||||||
|
'first_nontext' : (0x50, b'>L', 4),
|
||||||
|
'title_offset' : (0x54, b'>L', 4),
|
||||||
|
'title_length' : (0x58, b'>L', 4),
|
||||||
|
'language_code' : (0x5c, b'>L', 4),
|
||||||
|
'dict_in_lang' : (0x60, b'>L', 4),
|
||||||
|
'dict_out_lang' : (0x64, b'>L', 4),
|
||||||
|
'min_version' : (0x68, b'>L', 4),
|
||||||
|
'first_resc_offset' : (0x6c, b'>L', 4),
|
||||||
|
'huff_offset' : (0x70, b'>L', 4),
|
||||||
|
'huff_num' : (0x74, b'>L', 4),
|
||||||
|
'huff_tbl_offset' : (0x78, b'>L', 4),
|
||||||
|
'huff_tbl_len' : (0x7c, b'>L', 4),
|
||||||
|
'exth_flags' : (0x80, b'>L', 4),
|
||||||
|
'fill3_a' : (0x84, b'>L', 4),
|
||||||
|
'fill3_b' : (0x88, b'>L', 4),
|
||||||
|
'fill3_c' : (0x8c, b'>L', 4),
|
||||||
|
'fill3_d' : (0x90, b'>L', 4),
|
||||||
|
'fill3_e' : (0x94, b'>L', 4),
|
||||||
|
'fill3_f' : (0x98, b'>L', 4),
|
||||||
|
'fill3_g' : (0x9c, b'>L', 4),
|
||||||
|
'fill3_h' : (0xa0, b'>L', 4),
|
||||||
|
'unknown0' : (0xa4, b'>L', 4),
|
||||||
|
'drm_offset' : (0xa8, b'>L', 4),
|
||||||
|
'drm_count' : (0xac, b'>L', 4),
|
||||||
|
'drm_size' : (0xb0, b'>L', 4),
|
||||||
|
'drm_flags' : (0xb4, b'>L', 4),
|
||||||
|
'fill4_a' : (0xb8, b'>L', 4),
|
||||||
|
'fill4_b' : (0xbc, b'>L', 4),
|
||||||
|
'first_content' : (0xc0, b'>H', 2),
|
||||||
|
'last_content' : (0xc2, b'>H', 2),
|
||||||
|
'unknown0' : (0xc4, b'>L', 4),
|
||||||
|
'fcis_offset' : (0xc8, b'>L', 4),
|
||||||
|
'fcis_count' : (0xcc, b'>L', 4),
|
||||||
|
'flis_offset' : (0xd0, b'>L', 4),
|
||||||
|
'flis_count' : (0xd4, b'>L', 4),
|
||||||
|
'unknown1' : (0xd8, b'>L', 4),
|
||||||
|
'unknown2' : (0xdc, b'>L', 4),
|
||||||
|
'srcs_offset' : (0xe0, b'>L', 4),
|
||||||
|
'srcs_count' : (0xe4, b'>L', 4),
|
||||||
|
'unknown3' : (0xe8, b'>L', 4),
|
||||||
|
'unknown4' : (0xec, b'>L', 4),
|
||||||
|
'fill5' : (0xf0, b'>H', 2),
|
||||||
|
'traildata_flags' : (0xf2, b'>H', 2),
|
||||||
|
'ncx_index' : (0xf4, b'>L', 4),
|
||||||
|
'unknown5' : (0xf8, b'>L', 4),
|
||||||
|
'unknown6' : (0xfc, b'>L', 4),
|
||||||
|
'datp_offset' : (0x100, b'>L', 4),
|
||||||
|
'unknown7' : (0x104, b'>L', 4),
|
||||||
|
'Unknown ' : (0x108, b'>L', 4),
|
||||||
|
'Unknown ' : (0x10C, b'>L', 4),
|
||||||
|
'Unknown ' : (0x110, b'>L', 4),
|
||||||
|
'Unknown ' : (0x114, b'>L', 4),
|
||||||
|
'Unknown ' : (0x118, b'>L', 4),
|
||||||
|
'Unknown ' : (0x11C, b'>L', 4),
|
||||||
|
'Unknown ' : (0x120, b'>L', 4),
|
||||||
|
'Unknown ' : (0x124, b'>L', 4),
|
||||||
|
'Unknown ' : (0x128, b'>L', 4),
|
||||||
|
'Unknown ' : (0x12C, b'>L', 4),
|
||||||
|
'Unknown ' : (0x130, b'>L', 4),
|
||||||
|
'Unknown ' : (0x134, b'>L', 4),
|
||||||
|
'Unknown ' : (0x138, b'>L', 4),
|
||||||
|
'Unknown ' : (0x11C, b'>L', 4),
|
||||||
|
}
|
||||||
|
|
||||||
|
mobi8_header = {
|
||||||
|
'compression_type' : (0x00, b'>H', 2),
|
||||||
|
'fill0' : (0x02, b'>H', 2),
|
||||||
|
'text_length' : (0x04, b'>L', 4),
|
||||||
|
'text_records' : (0x08, b'>H', 2),
|
||||||
|
'max_section_size' : (0x0a, b'>H', 2),
|
||||||
|
'crypto_type' : (0x0c, b'>H', 2),
|
||||||
|
'fill1' : (0x0e, b'>H', 2),
|
||||||
|
'magic' : (0x10, b'4s', 4),
|
||||||
|
'header_length (from MOBI)' : (0x14, b'>L', 4),
|
||||||
|
'type' : (0x18, b'>L', 4),
|
||||||
|
'codepage' : (0x1c, b'>L', 4),
|
||||||
|
'unique_id' : (0x20, b'>L', 4),
|
||||||
|
'version' : (0x24, b'>L', 4),
|
||||||
|
'metaorthindex' : (0x28, b'>L', 4),
|
||||||
|
'metainflindex' : (0x2c, b'>L', 4),
|
||||||
|
'index_names' : (0x30, b'>L', 4),
|
||||||
|
'index_keys' : (0x34, b'>L', 4),
|
||||||
|
'extra_index0' : (0x38, b'>L', 4),
|
||||||
|
'extra_index1' : (0x3c, b'>L', 4),
|
||||||
|
'extra_index2' : (0x40, b'>L', 4),
|
||||||
|
'extra_index3' : (0x44, b'>L', 4),
|
||||||
|
'extra_index4' : (0x48, b'>L', 4),
|
||||||
|
'extra_index5' : (0x4c, b'>L', 4),
|
||||||
|
'first_nontext' : (0x50, b'>L', 4),
|
||||||
|
'title_offset' : (0x54, b'>L', 4),
|
||||||
|
'title_length' : (0x58, b'>L', 4),
|
||||||
|
'language_code' : (0x5c, b'>L', 4),
|
||||||
|
'dict_in_lang' : (0x60, b'>L', 4),
|
||||||
|
'dict_out_lang' : (0x64, b'>L', 4),
|
||||||
|
'min_version' : (0x68, b'>L', 4),
|
||||||
|
'first_resc_offset' : (0x6c, b'>L', 4),
|
||||||
|
'huff_offset' : (0x70, b'>L', 4),
|
||||||
|
'huff_num' : (0x74, b'>L', 4),
|
||||||
|
'huff_tbl_offset' : (0x78, b'>L', 4),
|
||||||
|
'huff_tbl_len' : (0x7c, b'>L', 4),
|
||||||
|
'exth_flags' : (0x80, b'>L', 4),
|
||||||
|
'fill3_a' : (0x84, b'>L', 4),
|
||||||
|
'fill3_b' : (0x88, b'>L', 4),
|
||||||
|
'fill3_c' : (0x8c, b'>L', 4),
|
||||||
|
'fill3_d' : (0x90, b'>L', 4),
|
||||||
|
'fill3_e' : (0x94, b'>L', 4),
|
||||||
|
'fill3_f' : (0x98, b'>L', 4),
|
||||||
|
'fill3_g' : (0x9c, b'>L', 4),
|
||||||
|
'fill3_h' : (0xa0, b'>L', 4),
|
||||||
|
'unknown0' : (0xa4, b'>L', 4),
|
||||||
|
'drm_offset' : (0xa8, b'>L', 4),
|
||||||
|
'drm_count' : (0xac, b'>L', 4),
|
||||||
|
'drm_size' : (0xb0, b'>L', 4),
|
||||||
|
'drm_flags' : (0xb4, b'>L', 4),
|
||||||
|
'fill4_a' : (0xb8, b'>L', 4),
|
||||||
|
'fill4_b' : (0xbc, b'>L', 4),
|
||||||
|
'fdst_offset' : (0xc0, b'>L', 4),
|
||||||
|
'fdst_flow_count' : (0xc4, b'>L', 4),
|
||||||
|
'fcis_offset' : (0xc8, b'>L', 4),
|
||||||
|
'fcis_count' : (0xcc, b'>L', 4),
|
||||||
|
'flis_offset' : (0xd0, b'>L', 4),
|
||||||
|
'flis_count' : (0xd4, b'>L', 4),
|
||||||
|
'unknown1' : (0xd8, b'>L', 4),
|
||||||
|
'unknown2' : (0xdc, b'>L', 4),
|
||||||
|
'srcs_offset' : (0xe0, b'>L', 4),
|
||||||
|
'srcs_count' : (0xe4, b'>L', 4),
|
||||||
|
'unknown3' : (0xe8, b'>L', 4),
|
||||||
|
'unknown4' : (0xec, b'>L', 4),
|
||||||
|
'fill5' : (0xf0, b'>H', 2),
|
||||||
|
'traildata_flags' : (0xf2, b'>H', 2),
|
||||||
|
'ncx_index' : (0xf4, b'>L', 4),
|
||||||
|
'fragment_index' : (0xf8, b'>L', 4),
|
||||||
|
'skeleton_index' : (0xfc, b'>L', 4),
|
||||||
|
'datp_offset' : (0x100, b'>L', 4),
|
||||||
|
'guide_index' : (0x104, b'>L', 4),
|
||||||
|
'Unknown ' : (0x108, b'>L', 4),
|
||||||
|
'Unknown ' : (0x10C, b'>L', 4),
|
||||||
|
'Unknown ' : (0x110, b'>L', 4),
|
||||||
|
'Unknown ' : (0x114, b'>L', 4),
|
||||||
|
'Unknown ' : (0x118, b'>L', 4),
|
||||||
|
'Unknown ' : (0x11C, b'>L', 4),
|
||||||
|
'Unknown ' : (0x120, b'>L', 4),
|
||||||
|
'Unknown ' : (0x124, b'>L', 4),
|
||||||
|
'Unknown ' : (0x128, b'>L', 4),
|
||||||
|
'Unknown ' : (0x12C, b'>L', 4),
|
||||||
|
'Unknown ' : (0x130, b'>L', 4),
|
||||||
|
'Unknown ' : (0x134, b'>L', 4),
|
||||||
|
'Unknown ' : (0x138, b'>L', 4),
|
||||||
|
'Unknown ' : (0x11C, b'>L', 4),
|
||||||
|
}
|
||||||
|
|
||||||
|
palmdoc_header_sorted_keys = sortedHeaderKeys(palmdoc_header)
|
||||||
|
mobi6_header_sorted_keys = sortedHeaderKeys(mobi6_header)
|
||||||
|
mobi8_header_sorted_keys = sortedHeaderKeys(mobi8_header)

id_map_strings = {
1 : 'Drm Server Id',
2 : 'Drm Commerce Id',
3 : 'Drm Ebookbase Book Id',
4 : 'Drm Ebookbase Dep Id',
100 : 'Creator',
101 : 'Publisher',
102 : 'Imprint',
103 : 'Description',
104 : 'ISBN',
105 : 'Subject',
106 : 'Published',
107 : 'Review',
108 : 'Contributor',
109 : 'Rights',
110 : 'SubjectCode',
111 : 'Type',
112 : 'Source',
113 : 'ASIN',
114 : 'versionNumber',
117 : 'Adult',
118 : 'Retail-Price',
119 : 'Retail-Currency',
120 : 'TSC',
122 : 'fixed-layout',
123 : 'book-type',
124 : 'orientation-lock',
126 : 'original-resolution',
127 : 'zero-gutter',
128 : 'zero-margin',
129 : 'MetadataResourceURI',
132 : 'RegionMagnification',
150 : 'LendingEnabled',
200 : 'DictShortName',
501 : 'cdeType',
502 : 'last_update_time',
503 : 'Updated_Title',
504 : 'CDEContentKey',
505 : 'AmazonContentReference',
506 : 'Title-Language',
507 : 'Title-Display-Direction',
508 : 'Title-Pronunciation',
509 : 'Title-Collation',
510 : 'Secondary-Title',
511 : 'Secondary-Title-Language',
512 : 'Secondary-Title-Direction',
513 : 'Secondary-Title-Pronunciation',
514 : 'Secondary-Title-Collation',
515 : 'Author-Language',
516 : 'Author-Display-Direction',
517 : 'Author-Pronunciation',
518 : 'Author-Collation',
519 : 'Author-Type',
520 : 'Publisher-Language',
521 : 'Publisher-Display-Direction',
522 : 'Publisher-Pronunciation',
523 : 'Publisher-Collation',
524 : 'Content-Language-Tag',
525 : 'primary-writing-mode',
526 : 'NCX-Ingested-By-Software',
527 : 'page-progression-direction',
528 : 'override-kindle-fonts',
529 : 'Compression-Upgraded',
530 : 'Soft-Hyphens-In-Content',
531 : 'Dictionary_In_Langague',
532 : 'Dictionary_Out_Language',
533 : 'Font_Converted',
534 : 'Amazon_Creator_Info',
535 : 'Creator-Build-Tag',
536 : 'HD-Media-Containers-Info', # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?)
538 : 'Resource-Container-Fidelity',
539 : 'HD-Container-Mimetype',
540 : 'Sample-For_Special-Purpose',
541 : 'Kindletool-Operation-Information',
542 : 'Container_Id',
543 : 'Asset-Type', # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER
544 : 'Unknown_544',
}

id_map_values = {
115 : 'sample',
116 : 'StartOffset',
121 : 'Mobi8-Boundary-Section',
125 : 'Embedded-Record-Count',
130 : 'Offline-Sample',
131 : 'Metadata-Record-Offset',
201 : 'CoverOffset',
202 : 'ThumbOffset',
203 : 'HasFakeCover',
204 : 'Creator-Software',
205 : 'Creator-Major-Version',
206 : 'Creator-Minor-Version',
207 : 'Creator-Build-Number',
401 : 'Clipping-Limit',
402 : 'Publisher-Limit',
404 : 'Text-to-Speech-Disabled',
406 : 'Rental-Expiration-Time',
}

id_map_hexstrings = {
208 : 'Watermark_(hex)',
209 : 'Tamper-Proof-Keys_(hex)',
300 : 'Font-Signature_(hex)',
403 : 'Unknown_(403)_(hex)',
405 : 'Ownership-Type_(hex)',
407 : 'Unknown_(407)_(hex)',
420 : 'Multimedia-Content-Reference_(hex)',
450 : 'Locations_Match_(hex)',
451 : 'Full-Story-Length_(hex)',
452 : 'Sample-Start_Location_(hex)',
453 : 'Sample-End-Location_(hex)',
}
|
||||||
|
|
||||||
|
def __init__(self, sect, sectNumber):
|
||||||
|
self.sect = sect
|
||||||
|
self.start = sectNumber
|
||||||
|
self.header = self.sect.loadSection(self.start)
|
||||||
|
if len(self.header)>20 and self.header[16:20] == b'MOBI':
|
||||||
|
self.sect.setsectiondescription(0,"Mobipocket Header")
|
||||||
|
self.palm = False
|
||||||
|
elif self.sect.ident == b'TEXtREAd':
|
||||||
|
self.sect.setsectiondescription(0, "PalmDOC Header")
|
||||||
|
self.palm = True
|
||||||
|
else:
|
||||||
|
raise unpackException('Unknown File Format')
|
||||||
|
|
||||||
|
self.records, = struct.unpack_from(b'>H', self.header, 0x8)
|
||||||
|
|
||||||
|
# set defaults in case this is a PalmDOC
|
||||||
|
self.title = self.sect.palmname.decode('latin-1', errors='replace')
|
||||||
|
self.length = len(self.header)-16
|
||||||
|
self.type = 3
|
||||||
|
self.codepage = 1252
|
||||||
|
self.codec = 'windows-1252'
|
||||||
|
self.unique_id = 0
|
||||||
|
self.version = 0
|
||||||
|
self.hasExth = False
|
||||||
|
self.exth = b''
|
||||||
|
self.exth_offset = self.length + 16
|
||||||
|
self.exth_length = 0
|
||||||
|
self.crypto_type = 0
|
||||||
|
self.firstnontext = self.start+self.records + 1
|
||||||
|
self.firstresource = self.start+self.records + 1
|
||||||
|
self.ncxidx = 0xffffffff
|
||||||
|
self.metaOrthIndex = 0xffffffff
|
||||||
|
self.metaInflIndex = 0xffffffff
|
||||||
|
self.skelidx = 0xffffffff
|
||||||
|
self.fragidx = 0xffffffff
|
||||||
|
self.guideidx = 0xffffffff
|
||||||
|
self.fdst = 0xffffffff
|
||||||
|
self.mlstart = self.sect.loadSection(self.start+1)[:4]
|
||||||
|
self.rawSize = 0
|
||||||
|
self.metadata = dict_()
|
||||||
|
|
||||||
|
# set up for decompression/unpacking
|
||||||
|
self.compression, = struct.unpack_from(b'>H', self.header, 0x0)
|
||||||
|
if self.compression == 0x4448:
|
||||||
|
reader = HuffcdicReader()
|
||||||
|
huffoff, huffnum = struct.unpack_from(b'>LL', self.header, 0x70)
|
||||||
|
huffoff = huffoff + self.start
|
||||||
|
self.sect.setsectiondescription(huffoff,"Huffman Compression Seed")
|
||||||
|
reader.loadHuff(self.sect.loadSection(huffoff))
|
||||||
|
for i in range(1, huffnum):
|
||||||
|
self.sect.setsectiondescription(huffoff+i,"Huffman CDIC Compression Seed %d" % i)
|
||||||
|
reader.loadCdic(self.sect.loadSection(huffoff+i))
|
||||||
|
self.unpack = reader.unpack
|
||||||
|
elif self.compression == 2:
|
||||||
|
self.unpack = PalmdocReader().unpack
|
||||||
|
elif self.compression == 1:
|
||||||
|
self.unpack = UncompressedReader().unpack
|
||||||
|
else:
|
||||||
|
raise unpackException('invalid compression type: 0x%4x' % self.compression)
|
||||||
|
|
||||||
|
if self.palm:
|
||||||
|
return
|
||||||
|
|
||||||
|
self.length, self.type, self.codepage, self.unique_id, self.version = struct.unpack(b'>LLLLL', self.header[20:40])
|
||||||
|
codec_map = {
|
||||||
|
1252 : 'windows-1252',
|
||||||
|
65001: 'utf-8',
|
||||||
|
}
|
||||||
|
if self.codepage in codec_map:
|
||||||
|
self.codec = codec_map[self.codepage]
|
||||||
|
|
||||||
|
# title
|
||||||
|
toff, tlen = struct.unpack(b'>II', self.header[0x54:0x5c])
|
||||||
|
tend = toff + tlen
|
||||||
|
self.title=self.header[toff:tend].decode(self.codec, errors='replace')
|
||||||
|
|
||||||
|
exth_flag, = struct.unpack(b'>L', self.header[0x80:0x84])
|
||||||
|
self.hasExth = exth_flag & 0x40
|
||||||
|
self.exth_offset = self.length + 16
|
||||||
|
self.exth_length = 0
|
||||||
|
if self.hasExth:
|
||||||
|
self.exth_length, = struct.unpack_from(b'>L', self.header, self.exth_offset+4)
|
||||||
|
self.exth_length = ((self.exth_length + 3)>>2)<<2 # round to next 4 byte boundary
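# Worked example (value chosen for illustration): a reported EXTH length of
# 0x1FE is rounded up to 0x200 here, ((0x1FE + 3) >> 2) << 2 == 0x200, so the
# slice below always ends on a 4-byte boundary.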
|
||||||
|
self.exth = self.header[self.exth_offset:self.exth_offset+self.exth_length]
|
||||||
|
|
||||||
|
# parse the exth / metadata
|
||||||
|
self.parseMetaData()
|
||||||
|
|
||||||
|
# self.mlstart = self.sect.loadSection(self.start+1)
|
||||||
|
# self.mlstart = self.mlstart[0:4]
|
||||||
|
self.crypto_type, = struct.unpack_from(b'>H', self.header, 0xC)
|
||||||
|
|
||||||
|
# Start sector for additional files such as images, fonts, resources, etc
|
||||||
|
# Can be missing so fall back to default set previously
|
||||||
|
ofst, = struct.unpack_from(b'>L', self.header, 0x6C)
|
||||||
|
if ofst != 0xffffffff:
|
||||||
|
self.firstresource = ofst + self.start
|
||||||
|
ofst, = struct.unpack_from(b'>L', self.header, 0x50)
|
||||||
|
if ofst != 0xffffffff:
|
||||||
|
self.firstnontext = ofst + self.start
|
||||||
|
|
||||||
|
if self.isPrintReplica():
|
||||||
|
return
|
||||||
|
|
||||||
|
if self.version < 8:
|
||||||
|
# Dictionary metaOrthIndex
|
||||||
|
self.metaOrthIndex, = struct.unpack_from(b'>L', self.header, 0x28)
|
||||||
|
if self.metaOrthIndex != 0xffffffff:
|
||||||
|
self.metaOrthIndex += self.start
|
||||||
|
|
||||||
|
# Dictionary metaInflIndex
|
||||||
|
self.metaInflIndex, = struct.unpack_from(b'>L', self.header, 0x2C)
|
||||||
|
if self.metaInflIndex != 0xffffffff:
|
||||||
|
self.metaInflIndex += self.start
|
||||||
|
|
||||||
|
# handle older headers without any ncxindex info and later
|
||||||
|
# specifically 0xe4 headers
|
||||||
|
if self.length + 16 < 0xf8:
|
||||||
|
return
|
||||||
|
|
||||||
|
# NCX Index
|
||||||
|
self.ncxidx, = struct.unpack(b'>L', self.header[0xf4:0xf8])
|
||||||
|
if self.ncxidx != 0xffffffff:
|
||||||
|
self.ncxidx += self.start
|
||||||
|
|
||||||
|
# K8 specific Indexes
|
||||||
|
if self.start != 0 or self.version == 8:
|
||||||
|
# Index into <xml> file skeletons in RawML
|
||||||
|
self.skelidx, = struct.unpack_from(b'>L', self.header, 0xfc)
|
||||||
|
if self.skelidx != 0xffffffff:
|
||||||
|
self.skelidx += self.start
|
||||||
|
|
||||||
|
# Index into <div> sections in RawML
|
||||||
|
self.fragidx, = struct.unpack_from(b'>L', self.header, 0xf8)
|
||||||
|
if self.fragidx != 0xffffffff:
|
||||||
|
self.fragidx += self.start
|
||||||
|
|
||||||
|
# Index into Other files
|
||||||
|
self.guideidx, = struct.unpack_from(b'>L', self.header, 0x104)
|
||||||
|
if self.guideidx != 0xffffffff:
|
||||||
|
self.guideidx += self.start
|
||||||
|
|
||||||
|
# dictionaries do not seem to use the same approach in K8's
|
||||||
|
# so disable them
|
||||||
|
self.metaOrthIndex = 0xffffffff
|
||||||
|
self.metaInflIndex = 0xffffffff
|
||||||
|
|
||||||
|
# need to use the FDST record to find out how to properly unpack
|
||||||
|
# the rawML into pieces
|
||||||
|
# it is simply a table of start and end locations for each flow piece
|
||||||
|
self.fdst, = struct.unpack_from(b'>L', self.header, 0xc0)
|
||||||
|
self.fdstcnt, = struct.unpack_from(b'>L', self.header, 0xc4)
|
||||||
|
# if cnt is 1 or less, fdst section number can be garbage
|
||||||
|
if self.fdstcnt <= 1:
|
||||||
|
self.fdst = 0xffffffff
|
||||||
|
if self.fdst != 0xffffffff:
|
||||||
|
self.fdst += self.start
|
||||||
|
# setting of fdst section description properly handled in mobi_kf8proc
|
||||||
|
|
||||||
|
def dump_exth(self):
|
||||||
|
# determine text encoding
|
||||||
|
codec=self.codec
|
||||||
|
if (not self.hasExth) or (self.exth_length) == 0 or (self.exth == b''):
|
||||||
|
return
|
||||||
|
num_items, = struct.unpack(b'>L', self.exth[8:12])
|
||||||
|
pos = 12
|
||||||
|
print("Key Size Description Value")
|
||||||
|
for _ in range(num_items):
|
||||||
|
id, size = struct.unpack(b'>LL', self.exth[pos:pos+8])
|
||||||
|
contentsize = size-8
|
||||||
|
content = self.exth[pos + 8: pos + size]
|
||||||
|
if id in MobiHeader.id_map_strings:
|
||||||
|
exth_name = MobiHeader.id_map_strings[id]
|
||||||
|
print('{0: >3d} {1: >4d} {2: <30s} {3:s}'.format(id, contentsize, exth_name, content.decode(codec, errors='replace')))
|
||||||
|
elif id in MobiHeader.id_map_values:
|
||||||
|
exth_name = MobiHeader.id_map_values[id]
|
||||||
|
if size == 9:
|
||||||
|
value, = struct.unpack(b'B',content)
|
||||||
|
print('{0:3d} byte {1:<30s} {2:d}'.format(id, exth_name, value))
|
||||||
|
elif size == 10:
|
||||||
|
value, = struct.unpack(b'>H',content)
|
||||||
|
print('{0:3d} word {1:<30s} 0x{2:0>4X} ({2:d})'.format(id, exth_name, value))
|
||||||
|
elif size == 12:
|
||||||
|
value, = struct.unpack(b'>L',content)
|
||||||
|
print('{0:3d} long {1:<30s} 0x{2:0>8X} ({2:d})'.format(id, exth_name, value))
|
||||||
|
else:
|
||||||
|
print('{0: >3d} {1: >4d} {2: <30s} (0x{3:s})'.format(id, contentsize, "Bad size for "+exth_name, hexlify(content)))
|
||||||
|
elif id in MobiHeader.id_map_hexstrings:
|
||||||
|
exth_name = MobiHeader.id_map_hexstrings[id]
|
||||||
|
print('{0:3d} {1:4d} {2:<30s} 0x{3:s}'.format(id, contentsize, exth_name, hexlify(content)))
|
||||||
|
else:
|
||||||
|
exth_name = "Unknown EXTH ID {0:d}".format(id)
|
||||||
|
print("{0: >3d} {1: >4d} {2: <30s} 0x{3:s}".format(id, contentsize, exth_name, hexlify(content)))
|
||||||
|
pos += size
|
||||||
|
return
|
||||||
|
|
||||||
|
def dumpheader(self):
|
||||||
|
# first 16 bytes are not part of the official mobiheader
|
||||||
|
# but we will treat it as such
|
||||||
|
# so section 0 is 16 (decimal) + self.length in total == at least 0x108 bytes for Mobi 8 headers
|
||||||
|
print("Dumping section %d, Mobipocket Header version: %d, total length %d" % (self.start,self.version, self.length+16))
|
||||||
|
self.hdr = {}
|
||||||
|
# set it up for the proper header version
|
||||||
|
if self.version == 0:
|
||||||
|
self.mobi_header = MobiHeader.palmdoc_header
|
||||||
|
self.mobi_header_sorted_keys = MobiHeader.palmdoc_header_sorted_keys
|
||||||
|
elif self.version < 8:
|
||||||
|
self.mobi_header = MobiHeader.mobi6_header
|
||||||
|
self.mobi_header_sorted_keys = MobiHeader.mobi6_header_sorted_keys
|
||||||
|
else:
|
||||||
|
self.mobi_header = MobiHeader.mobi8_header
|
||||||
|
self.mobi_header_sorted_keys = MobiHeader.mobi8_header_sorted_keys
|
||||||
|
|
||||||
|
# parse the header information
|
||||||
|
for key in self.mobi_header_sorted_keys:
|
||||||
|
(pos, format, tot_len) = self.mobi_header[key]
|
||||||
|
if pos < (self.length + 16):
|
||||||
|
val, = struct.unpack_from(format, self.header, pos)
|
||||||
|
self.hdr[key] = val
|
||||||
|
|
||||||
|
if 'title_offset' in self.hdr:
|
||||||
|
title_offset = self.hdr['title_offset']
|
||||||
|
title_length = self.hdr['title_length']
|
||||||
|
else:
|
||||||
|
title_offset = 0
|
||||||
|
title_length = 0
|
||||||
|
if title_offset == 0:
|
||||||
|
title_offset = len(self.header)
|
||||||
|
title_length = 0
|
||||||
|
self.title = self.sect.palmname.decode('latin-1', errors='replace')
|
||||||
|
else:
|
||||||
|
self.title = self.header[title_offset:title_offset+title_length].decode(self.codec, errors='replace')
|
||||||
|
# title record always padded with two nul bytes and then padded with nuls to next 4 byte boundary
|
||||||
|
title_length = ((title_length+2+3)>>2)<<2
|
||||||
|
|
||||||
|
self.extra1 = self.header[self.exth_offset+self.exth_length:title_offset]
|
||||||
|
self.extra2 = self.header[title_offset+title_length:]
|
||||||
|
|
||||||
|
print("Mobipocket header from section %d" % self.start)
|
||||||
|
print(" Offset Value Hex Dec Description")
|
||||||
|
for key in self.mobi_header_sorted_keys:
|
||||||
|
(pos, format, tot_len) = self.mobi_header[key]
|
||||||
|
if pos < (self.length + 16):
|
||||||
|
if key != 'magic':
|
||||||
|
fmt_string = "0x{0:0>3X} ({0:3d}){1: >" + str(9-2*tot_len) +"s}0x{2:0>" + str(2*tot_len) + "X} {2:10d} {3:s}"
|
||||||
|
else:
|
||||||
|
self.hdr[key] = unicode_str(self.hdr[key])
|
||||||
|
fmt_string = "0x{0:0>3X} ({0:3d}){2:>11s} {3:s}"
|
||||||
|
print(fmt_string.format(pos, " ",self.hdr[key], key))
|
||||||
|
print("")
|
||||||
|
|
||||||
|
if self.exth_length > 0:
|
||||||
|
print("EXTH metadata, offset %d, padded length %d" % (self.exth_offset,self.exth_length))
|
||||||
|
self.dump_exth()
|
||||||
|
print("")
|
||||||
|
|
||||||
|
if len(self.extra1) > 0:
|
||||||
|
print("Extra data between EXTH and Title, length %d" % len(self.extra1))
|
||||||
|
print(hexlify(self.extra1))
|
||||||
|
print("")
|
||||||
|
|
||||||
|
if title_length > 0:
|
||||||
|
print("Title in header at offset %d, padded length %d: '%s'" %(title_offset,title_length,self.title))
|
||||||
|
print("")
|
||||||
|
|
||||||
|
if len(self.extra2) > 0:
|
||||||
|
print("Extra data between Title and end of header, length %d" % len(self.extra2))
|
||||||
|
print(hexlify(self.extra2))
|
||||||
|
print("")
|
||||||
|
|
||||||
|
def isPrintReplica(self):
|
||||||
|
return self.mlstart[0:4] == b"%MOP"
|
||||||
|
|
||||||
|
def isK8(self):
|
||||||
|
return self.start != 0 or self.version == 8
|
||||||
|
|
||||||
|
def isEncrypted(self):
|
||||||
|
return self.crypto_type != 0
|
||||||
|
|
||||||
|
def hasNCX(self):
|
||||||
|
return self.ncxidx != 0xffffffff
|
||||||
|
|
||||||
|
def isDictionary(self):
|
||||||
|
return self.metaOrthIndex != 0xffffffff
|
||||||
|
|
||||||
|
def getncxIndex(self):
|
||||||
|
return self.ncxidx
|
||||||
|
|
||||||
|
def decompress(self, data):
|
||||||
|
return self.unpack(data)
|
||||||
|
|
||||||
|
def Language(self):
|
||||||
|
langcode = struct.unpack(b'!L', self.header[0x5c:0x60])[0]
|
||||||
|
langid = langcode & 0xFF
|
||||||
|
sublangid = (langcode >> 8) & 0xFF
|
||||||
|
return getLanguage(langid, sublangid)
|
||||||
|
|
||||||
|
def DictInLanguage(self):
|
||||||
|
if self.isDictionary():
|
||||||
|
langcode = struct.unpack(b'!L', self.header[0x60:0x64])[0]
|
||||||
|
langid = langcode & 0xFF
|
||||||
|
sublangid = (langcode >> 10) & 0xFF
|
||||||
|
if langid != 0:
|
||||||
|
return getLanguage(langid, sublangid)
|
||||||
|
return False
|
||||||
|
|
||||||
|
def DictOutLanguage(self):
|
||||||
|
if self.isDictionary():
|
||||||
|
langcode = struct.unpack(b'!L', self.header[0x64:0x68])[0]
|
||||||
|
langid = langcode & 0xFF
|
||||||
|
sublangid = (langcode >> 10) & 0xFF
|
||||||
|
if langid != 0:
|
||||||
|
return getLanguage(langid, sublangid)
|
||||||
|
return False
|
||||||
|
|
||||||
|
def getRawML(self):
|
||||||
|
def getSizeOfTrailingDataEntry(data):
|
||||||
|
num = 0
|
||||||
|
for v in data[-4:]:
|
||||||
|
if bord(v) & 0x80:
|
||||||
|
num = 0
|
||||||
|
num = (num << 7) | (bord(v) & 0x7f)
|
||||||
|
return num
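# Worked example (bytes chosen for illustration): for trailing bytes
# b'\x00\x00\x81\x05' the accumulator restarts at 0x81 (high bit set) and the
# function returns (0x01 << 7) | 0x05 == 133.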
|
||||||
|
def trimTrailingDataEntries(data):
|
||||||
|
for _ in range(trailers):
|
||||||
|
num = getSizeOfTrailingDataEntry(data)
|
||||||
|
data = data[:-num]
|
||||||
|
if multibyte:
|
||||||
|
num = (ord(data[-1:]) & 3) + 1
|
||||||
|
data = data[:-num]
|
||||||
|
return data
|
||||||
|
multibyte = 0
|
||||||
|
trailers = 0
|
||||||
|
if self.sect.ident == b'BOOKMOBI':
|
||||||
|
mobi_length, = struct.unpack_from(b'>L', self.header, 0x14)
|
||||||
|
mobi_version, = struct.unpack_from(b'>L', self.header, 0x68)
|
||||||
|
if (mobi_length >= 0xE4) and (mobi_version >= 5):
|
||||||
|
flags, = struct.unpack_from(b'>H', self.header, 0xF2)
|
||||||
|
multibyte = flags & 1
|
||||||
|
while flags > 1:
|
||||||
|
if flags & 2:
|
||||||
|
trailers += 1
|
||||||
|
flags = flags >> 1
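# Worked example (flag value chosen for illustration): flags == 0b0101 gives
# multibyte == 1 from bit 0, and the loop counts one trailer because only one
# of the higher bits (bit 2) is set.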
|
||||||
|
# get raw mobi markup language
|
||||||
|
print("Unpacking raw markup language")
|
||||||
|
dataList = []
|
||||||
|
# offset = 0
|
||||||
|
for i in range(1, self.records+1):
|
||||||
|
data = trimTrailingDataEntries(self.sect.loadSection(self.start + i))
|
||||||
|
dataList.append(self.unpack(data))
|
||||||
|
if self.isK8():
|
||||||
|
self.sect.setsectiondescription(self.start + i,"KF8 Text Section {0:d}".format(i))
|
||||||
|
elif self.version == 0:
|
||||||
|
self.sect.setsectiondescription(self.start + i,"PalmDOC Text Section {0:d}".format(i))
|
||||||
|
else:
|
||||||
|
self.sect.setsectiondescription(self.start + i,"Mobipocket Text Section {0:d}".format(i))
|
||||||
|
rawML = b''.join(dataList)
|
||||||
|
self.rawSize = len(rawML)
|
||||||
|
return rawML
|
||||||
|
|
||||||
|
# all metadata is stored in a dictionary with key and returns a *list* of values
|
||||||
|
# a list is used to allow for multiple creators, multiple contributors, etc
|
||||||
|
def parseMetaData(self):
|
||||||
|
def addValue(name, value):
|
||||||
|
if name not in self.metadata:
|
||||||
|
self.metadata[name] = [value]
|
||||||
|
else:
|
||||||
|
self.metadata[name].append(value)
|
||||||
|
|
||||||
|
codec=self.codec
|
||||||
|
if self.hasExth:
|
||||||
|
extheader=self.exth
|
||||||
|
_length, num_items = struct.unpack(b'>LL', extheader[4:12])
|
||||||
|
extheader = extheader[12:]
|
||||||
|
pos = 0
|
||||||
|
for _ in range(num_items):
|
||||||
|
id, size = struct.unpack(b'>LL', extheader[pos:pos+8])
|
||||||
|
content = extheader[pos + 8: pos + size]
|
||||||
|
if id in MobiHeader.id_map_strings:
|
||||||
|
name = MobiHeader.id_map_strings[id]
|
||||||
|
addValue(name, content.decode(codec, errors='replace'))
|
||||||
|
elif id in MobiHeader.id_map_values:
|
||||||
|
name = MobiHeader.id_map_values[id]
|
||||||
|
if size == 9:
|
||||||
|
value, = struct.unpack(b'B',content)
|
||||||
|
addValue(name, unicode_str(str(value)))
|
||||||
|
elif size == 10:
|
||||||
|
value, = struct.unpack(b'>H',content)
|
||||||
|
addValue(name, unicode_str(str(value)))
|
||||||
|
elif size == 12:
|
||||||
|
value, = struct.unpack(b'>L',content)
|
||||||
|
# handle special case of missing CoverOffset or missing ThumbOffset
|
||||||
|
if id == 201 or id == 202:
|
||||||
|
if value != 0xffffffff:
|
||||||
|
addValue(name, unicode_str(str(value)))
|
||||||
|
else:
|
||||||
|
addValue(name, unicode_str(str(value)))
|
||||||
|
else:
|
||||||
|
print("Warning: Bad key, size, value combination detected in EXTH ", id, size, hexlify(content))
|
||||||
|
addValue(name, hexlify(content))
|
||||||
|
elif id in MobiHeader.id_map_hexstrings:
|
||||||
|
name = MobiHeader.id_map_hexstrings[id]
|
||||||
|
addValue(name, hexlify(content))
|
||||||
|
else:
|
||||||
|
name = unicode_str(str(id)) + ' (hex)'
|
||||||
|
addValue(name, hexlify(content))
|
||||||
|
pos += size
|
||||||
|
|
||||||
|
# add the basics to the metadata each as a list element
|
||||||
|
self.metadata['Language'] = [self.Language()]
|
||||||
|
self.metadata['Title'] = [unicode_str(self.title,self.codec)]
|
||||||
|
self.metadata['Codec'] = [self.codec]
|
||||||
|
self.metadata['UniqueID'] = [unicode_str(str(self.unique_id))]
|
||||||
|
# if no asin create one using a uuid
|
||||||
|
if 'ASIN' not in self.metadata:
|
||||||
|
self.metadata['ASIN'] = [unicode_str(str(uuid.uuid4()))]
|
||||||
|
# if no cdeType set it to "EBOK"
|
||||||
|
if 'cdeType' not in self.metadata:
|
||||||
|
self.metadata['cdeType'] = ['EBOK']
|
||||||
|
|
||||||
|
def getMetaData(self):
|
||||||
|
return self.metadata
|
||||||
|
|
||||||
|
def describeHeader(self, DUMP):
|
||||||
|
print("Mobi Version:", self.version)
|
||||||
|
print("Codec:", self.codec)
|
||||||
|
print("Title:", self.title)
|
||||||
|
if 'Updated_Title' in self.metadata:
|
||||||
|
print("EXTH Title:", self.metadata['Updated_Title'][0])
|
||||||
|
if self.compression == 0x4448:
|
||||||
|
print("Huffdic compression")
|
||||||
|
elif self.compression == 2:
|
||||||
|
print("Palmdoc compression")
|
||||||
|
elif self.compression == 1:
|
||||||
|
print("No compression")
|
||||||
|
if DUMP:
|
||||||
|
self.dumpheader()
|
439
KindleUnpack/mobi_html.py
Normal file
@@ -0,0 +1,439 @@
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import PY2, utf8_str
|
||||||
|
|
||||||
|
if PY2:
|
||||||
|
range = xrange
|
||||||
|
|
||||||
|
import re
|
||||||
|
# note: re requires the pattern to be the exact same type as the data to be searched in python3
|
||||||
|
# but u"" is not allowed for the pattern itself only b""
|
||||||
|
|
||||||
|
from .mobi_utils import fromBase32
|
||||||
|
|
||||||
|
class HTMLProcessor:
|
||||||
|
|
||||||
|
def __init__(self, files, metadata, rscnames):
|
||||||
|
self.files = files
|
||||||
|
self.metadata = metadata
|
||||||
|
self.rscnames = rscnames
|
||||||
|
# for original style mobis, default to including all image files in the opf manifest
|
||||||
|
self.used = {}
|
||||||
|
for name in rscnames:
|
||||||
|
self.used[name] = 'used'
|
||||||
|
|
||||||
|
def findAnchors(self, rawtext, indx_data, positionMap):
|
||||||
|
# process the raw text
|
||||||
|
# find anchors...
|
||||||
|
print("Find link anchors")
|
||||||
|
link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''', re.IGNORECASE)
|
||||||
|
# TEST NCX: merge in filepos from indx
|
||||||
|
pos_links = [int(m.group(1)) for m in link_pattern.finditer(rawtext)]
|
||||||
|
if indx_data:
|
||||||
|
pos_indx = [e['pos'] for e in indx_data if e['pos']>0]
|
||||||
|
pos_links = list(set(pos_links + pos_indx))
|
||||||
|
|
||||||
|
for position in pos_links:
|
||||||
|
if position in positionMap:
|
||||||
|
positionMap[position] = positionMap[position] + utf8_str('<a id="filepos%d" />' % position)
|
||||||
|
else:
|
||||||
|
positionMap[position] = utf8_str('<a id="filepos%d" />' % position)
|
||||||
|
|
||||||
|
# apply dictionary metadata and anchors
|
||||||
|
print("Insert data into html")
|
||||||
|
pos = 0
|
||||||
|
lastPos = len(rawtext)
|
||||||
|
dataList = []
|
||||||
|
for end in sorted(positionMap.keys()):
|
||||||
|
if end == 0 or end > lastPos:
|
||||||
|
continue # something's up - can't put a tag in outside <html>...</html>
|
||||||
|
dataList.append(rawtext[pos:end])
|
||||||
|
dataList.append(positionMap[end])
|
||||||
|
pos = end
|
||||||
|
dataList.append(rawtext[pos:])
|
||||||
|
srctext = b"".join(dataList)
|
||||||
|
rawtext = None
|
||||||
|
dataList = None
|
||||||
|
self.srctext = srctext
|
||||||
|
self.indx_data = indx_data
|
||||||
|
return srctext
|
||||||
|
|
||||||
|
def insertHREFS(self):
|
||||||
|
srctext = self.srctext
|
||||||
|
rscnames = self.rscnames
|
||||||
|
metadata = self.metadata
|
||||||
|
|
||||||
|
# put in the hrefs
|
||||||
|
print("Insert hrefs into html")
|
||||||
|
# There doesn't seem to be a standard, so search as best as we can
|
||||||
|
|
||||||
|
link_pattern = re.compile(br'''<a([^>]*?)filepos=['"]{0,1}0*(\d+)['"]{0,1}([^>]*?)>''', re.IGNORECASE)
|
||||||
|
srctext = link_pattern.sub(br'''<a\1href="#filepos\2"\3>''', srctext)
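# Worked example (tag chosen for illustration): <a filepos=0000012345> becomes
# <a href="#filepos12345">, matching the <a id="filepos12345" /> anchors
# inserted by findAnchors above.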
|
||||||
|
|
||||||
|
# remove empty anchors
|
||||||
|
print("Remove empty anchors from html")
|
||||||
|
srctext = re.sub(br"<a\s*/>",br"", srctext)
|
||||||
|
srctext = re.sub(br"<a\s*>\s*</a>",br"", srctext)
|
||||||
|
|
||||||
|
# convert image references
|
||||||
|
print("Insert image references into html")
|
||||||
|
# split string into image tag pieces and other pieces
|
||||||
|
image_pattern = re.compile(br'''(<img.*?>)''', re.IGNORECASE)
|
||||||
|
image_index_pattern = re.compile(br'''recindex=['"]{0,1}([0-9]+)['"]{0,1}''', re.IGNORECASE)
|
||||||
|
srcpieces = image_pattern.split(srctext)
|
||||||
|
srctext = self.srctext = None
|
||||||
|
|
||||||
|
# all odd pieces are image tags (null strings on even pieces if no space between them in srctext)
|
||||||
|
for i in range(1, len(srcpieces), 2):
|
||||||
|
tag = srcpieces[i]
|
||||||
|
for m in image_index_pattern.finditer(tag):
|
||||||
|
imageNumber = int(m.group(1))
|
||||||
|
imageName = rscnames[imageNumber-1]
|
||||||
|
if imageName is None:
|
||||||
|
print("Error: Referenced image %s was not recognized as a valid image" % imageNumber)
|
||||||
|
else:
|
||||||
|
replacement = b'src="Images/' + utf8_str(imageName) + b'"'
|
||||||
|
tag = image_index_pattern.sub(replacement, tag, 1)
|
||||||
|
srcpieces[i] = tag
|
||||||
|
srctext = b"".join(srcpieces)
|
||||||
|
|
||||||
|
# add in character set meta into the html header if needed
|
||||||
|
if 'Codec' in metadata:
|
||||||
|
srctext = srctext[0:12]+b'<meta http-equiv="content-type" content="text/html; charset='+utf8_str(metadata.get('Codec')[0])+b'" />'+srctext[12:]
|
||||||
|
return srctext, self.used
|
||||||
|
|
||||||
|
|
||||||
|
class XHTMLK8Processor:
|
||||||
|
|
||||||
|
def __init__(self, rscnames, k8proc):
|
||||||
|
self.rscnames = rscnames
|
||||||
|
self.k8proc = k8proc
|
||||||
|
self.used = {}
|
||||||
|
|
||||||
|
def buildXHTML(self):
|
||||||
|
|
||||||
|
# first need to update all links that are internal which
|
||||||
|
# are based on positions within the xhtml files **BEFORE**
|
||||||
|
# cutting and pasting any pieces into the xhtml text files
|
||||||
|
|
||||||
|
# kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal link within xhtml)
|
||||||
|
# XXXX is the offset in records into divtbl
|
||||||
|
# YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position
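# Worked example (link chosen for illustration): kindle:pos:fid:0001:off:000000000A
# refers to fragment-table entry 1 with a base32 offset of 10 added to that
# entry's insert position.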
|
||||||
|
|
||||||
|
# pos:fid pattern
|
||||||
|
posfid_pattern = re.compile(br'''(<a.*?href=.*?>)''', re.IGNORECASE)
|
||||||
|
posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''')
|
||||||
|
|
||||||
|
parts = []
|
||||||
|
print("Building proper xhtml for each file")
|
||||||
|
for i in range(self.k8proc.getNumberOfParts()):
|
||||||
|
part = self.k8proc.getPart(i)
|
||||||
|
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.getPartInfo(i)
|
||||||
|
|
||||||
|
# internal links
|
||||||
|
srcpieces = posfid_pattern.split(part)
|
||||||
|
for j in range(1, len(srcpieces),2):
|
||||||
|
tag = srcpieces[j]
|
||||||
|
if tag.startswith(b'<'):
|
||||||
|
for m in posfid_index_pattern.finditer(tag):
|
||||||
|
posfid = m.group(1)
|
||||||
|
offset = m.group(2)
|
||||||
|
filename, idtag = self.k8proc.getIDTagByPosFid(posfid, offset)
|
||||||
|
if idtag == b'':
|
||||||
|
replacement= b'"' + utf8_str(filename) + b'"'
|
||||||
|
else:
|
||||||
|
replacement = b'"' + utf8_str(filename) + b'#' + idtag + b'"'
|
||||||
|
tag = posfid_index_pattern.sub(replacement, tag, 1)
|
||||||
|
srcpieces[j] = tag
|
||||||
|
part = b"".join(srcpieces)
|
||||||
|
parts.append(part)
|
||||||
|
|
||||||
|
# we are free to cut and paste as we see fit
|
||||||
|
# we can safely remove all of the Kindlegen generated aid tags
|
||||||
|
# change aid ids that are in k8proc.linked_aids to xhtml ids
|
||||||
|
find_tag_with_aid_pattern = re.compile(br'''(<[^>]*\said\s*=[^>]*>)''', re.IGNORECASE)
|
||||||
|
within_tag_aid_position_pattern = re.compile(br'''\said\s*=['"]([^'"]*)['"]''')
|
||||||
|
for i in range(len(parts)):
|
||||||
|
part = parts[i]
|
||||||
|
srcpieces = find_tag_with_aid_pattern.split(part)
|
||||||
|
for j in range(len(srcpieces)):
|
||||||
|
tag = srcpieces[j]
|
||||||
|
if tag.startswith(b'<'):
|
||||||
|
for m in within_tag_aid_position_pattern.finditer(tag):
|
||||||
|
try:
|
||||||
|
aid = m.group(1)
|
||||||
|
except IndexError:
|
||||||
|
aid = None
|
||||||
|
replacement = b''
|
||||||
|
if aid in self.k8proc.linked_aids:
|
||||||
|
replacement = b' id="aid-' + aid + b'"'
|
||||||
|
tag = within_tag_aid_position_pattern.sub(replacement, tag, 1)
|
||||||
|
srcpieces[j] = tag
|
||||||
|
part = b"".join(srcpieces)
|
||||||
|
parts[i] = part
|
||||||
|
|
||||||
|
# we can safely replace all of the Kindlegen generated data-AmznPageBreak tags
|
||||||
|
# with page-break-after style patterns
|
||||||
|
find_tag_with_AmznPageBreak_pattern = re.compile(br'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
|
||||||
|
within_tag_AmznPageBreak_position_pattern = re.compile(br'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')
|
||||||
|
for i in range(len(parts)):
|
||||||
|
part = parts[i]
|
||||||
|
srcpieces = find_tag_with_AmznPageBreak_pattern.split(part)
|
||||||
|
for j in range(len(srcpieces)):
|
||||||
|
tag = srcpieces[j]
|
||||||
|
if tag.startswith(b'<'):
|
||||||
|
srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub(
|
||||||
|
lambda m:b' style="page-break-after:' + m.group(1) + b'"', tag)
|
||||||
|
part = b"".join(srcpieces)
|
||||||
|
parts[i] = part
|
||||||
|
|
||||||
|
# we have to handle substitutions for the flows pieces first as they may
|
||||||
|
# be inlined into the xhtml text
|
||||||
|
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
|
||||||
|
# kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
|
||||||
|
# kindle:embed:XXXX (used for fonts)
|
||||||
|
|
||||||
|
flows = []
|
||||||
|
flows.append(None)
|
||||||
|
flowinfo = []
|
||||||
|
flowinfo.append([None, None, None, None])
|
||||||
|
|
||||||
|
# regular expression search patterns
|
||||||
|
img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
|
||||||
|
img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)
|
||||||
|
|
||||||
|
tag_pattern = re.compile(br'''(<[^>]*>)''')
|
||||||
|
flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
|
||||||
|
|
||||||
|
url_pattern = re.compile(br'''(url\(.*?\))''', re.IGNORECASE)
|
||||||
|
url_img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*["')]''', re.IGNORECASE)
|
||||||
|
font_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)["')]''', re.IGNORECASE)
|
||||||
|
url_css_index_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)
|
||||||
|
url_svg_image_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=image/svg\+xml[^\)]*''', re.IGNORECASE)
|
||||||
|
|
||||||
|
for i in range(1, self.k8proc.getNumberOfFlows()):
|
||||||
|
[ftype, format, dir, filename] = self.k8proc.getFlowInfo(i)
|
||||||
|
flowpart = self.k8proc.getFlow(i)
|
||||||
|
|
||||||
|
# links to raster image files from image tags
|
||||||
|
# image_pattern
|
||||||
|
srcpieces = img_pattern.split(flowpart)
|
||||||
|
for j in range(1, len(srcpieces),2):
|
||||||
|
tag = srcpieces[j]
|
||||||
|
if tag.startswith(b'<im'):
|
||||||
|
for m in img_index_pattern.finditer(tag):
|
||||||
|
imageNumber = fromBase32(m.group(1))
|
||||||
|
imageName = self.rscnames[imageNumber-1]
|
||||||
|
if imageName is not None:
|
||||||
|
replacement = b'"../Images/' + utf8_str(imageName) + b'"'
|
||||||
|
self.used[imageName] = 'used'
|
||||||
|
tag = img_index_pattern.sub(replacement, tag, 1)
|
||||||
|
else:
|
||||||
|
print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag))
|
||||||
|
srcpieces[j] = tag
|
||||||
|
flowpart = b"".join(srcpieces)
|
||||||
|
|
||||||
|
# replacements inside css url():
|
||||||
|
srcpieces = url_pattern.split(flowpart)
|
||||||
|
for j in range(1, len(srcpieces),2):
|
||||||
|
tag = srcpieces[j]
|
||||||
|
|
||||||
|
# process links to raster image files
|
||||||
|
for m in url_img_index_pattern.finditer(tag):
|
||||||
|
imageNumber = fromBase32(m.group(1))
|
||||||
|
imageName = self.rscnames[imageNumber-1]
|
||||||
|
osep = m.group()[0:1]
|
||||||
|
csep = m.group()[-1:]
|
||||||
|
if imageName is not None:
|
||||||
|
replacement = osep + b'../Images/' + utf8_str(imageName) + csep
|
||||||
|
self.used[imageName] = 'used'
|
||||||
|
tag = url_img_index_pattern.sub(replacement, tag, 1)
|
||||||
|
else:
|
||||||
|
print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag))
|
||||||
|
|
||||||
|
# process links to fonts
|
||||||
|
for m in font_index_pattern.finditer(tag):
|
||||||
|
fontNumber = fromBase32(m.group(1))
|
||||||
|
fontName = self.rscnames[fontNumber-1]
|
||||||
|
osep = m.group()[0:1]
|
||||||
|
csep = m.group()[-1:]
|
||||||
|
if fontName is None:
|
||||||
|
print("Error: Referenced font %s was not recognized as a valid font in %s" % (fontNumber, tag))
|
||||||
|
else:
|
||||||
|
replacement = osep + b'../Fonts/' + utf8_str(fontName) + csep
|
||||||
|
tag = font_index_pattern.sub(replacement, tag, 1)
|
||||||
|
self.used[fontName] = 'used'
|
||||||
|
|
||||||
|
# process links to other css pieces
|
||||||
|
for m in url_css_index_pattern.finditer(tag):
|
||||||
|
num = fromBase32(m.group(1))
|
||||||
|
[typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
|
||||||
|
replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
|
||||||
|
tag = url_css_index_pattern.sub(replacement, tag, 1)
|
||||||
|
self.used[fnm] = 'used'
|
||||||
|
|
||||||
|
# process links to svg images
|
||||||
|
for m in url_svg_image_pattern.finditer(tag):
|
||||||
|
num = fromBase32(m.group(1))
|
||||||
|
[typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
|
||||||
|
replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
|
||||||
|
tag = url_svg_image_pattern.sub(replacement, tag, 1)
|
||||||
|
self.used[fnm] = 'used'
|
||||||
|
|
||||||
|
srcpieces[j] = tag
|
||||||
|
flowpart = b"".join(srcpieces)
|
||||||
|
|
||||||
|
# store away in our own copy
|
||||||
|
flows.append(flowpart)
|
||||||
|
|
||||||
|
# I do not think this case exists and even if it does exist, it needs to be done in a separate
|
||||||
|
# pass to prevent inlining a flow piece into another flow piece before the inserted one or the
|
||||||
|
# target one has been fully processed
|
||||||
|
|
||||||
|
# but keep it around in case it turns out we do need it
|
||||||
|
|
||||||
|
# flow pattern not inside url()
|
||||||
|
# srcpieces = tag_pattern.split(flowpart)
|
||||||
|
# for j in range(1, len(srcpieces),2):
|
||||||
|
# tag = srcpieces[j]
|
||||||
|
# if tag.startswith(b'<'):
|
||||||
|
# for m in flow_pattern.finditer(tag):
|
||||||
|
# num = fromBase32(m.group(1))
|
||||||
|
# [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
|
||||||
|
# flowtext = self.k8proc.getFlow(num)
|
||||||
|
# if fmt == b'inline':
|
||||||
|
# tag = flowtext
|
||||||
|
# else:
|
||||||
|
# replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
|
||||||
|
# tag = flow_pattern.sub(replacement, tag, 1)
|
||||||
|
# self.used[fnm] = 'used'
|
||||||
|
# srcpieces[j] = tag
|
||||||
|
# flowpart = b"".join(srcpieces)
|
||||||
|
|
||||||
|
# now handle the main text xhtml parts
|
||||||
|
|
||||||
|
# Handle the flow items in the XHTML text pieces
|
||||||
|
# kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
|
||||||
|
tag_pattern = re.compile(br'''(<[^>]*>)''')
|
||||||
|
flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
|
||||||
|
for i in range(len(parts)):
|
||||||
|
part = parts[i]
|
||||||
|
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
|
||||||
|
# flow pattern
|
||||||
|
srcpieces = tag_pattern.split(part)
|
||||||
|
for j in range(1, len(srcpieces),2):
|
||||||
|
tag = srcpieces[j]
|
||||||
|
if tag.startswith(b'<'):
|
||||||
|
for m in flow_pattern.finditer(tag):
|
||||||
|
num = fromBase32(m.group(1))
|
||||||
|
if num > 0 and num < len(self.k8proc.flowinfo):
|
||||||
|
[typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
|
||||||
|
flowpart = flows[num]
|
||||||
|
if fmt == b'inline':
|
||||||
|
tag = flowpart
|
||||||
|
else:
|
||||||
|
replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
|
||||||
|
tag = flow_pattern.sub(replacement, tag, 1)
|
||||||
|
self.used[fnm] = 'used'
|
||||||
|
else:
|
||||||
|
print("warning: ignoring non-existent flow link", tag, " value 0x%x" % num)
|
||||||
|
srcpieces[j] = tag
|
||||||
|
part = b''.join(srcpieces)
|
||||||
|
|
||||||
|
# store away modified version
|
||||||
|
parts[i] = part
|
||||||
|
|
||||||
|
# Handle any embedded raster images links in style= attributes urls
|
||||||
|
style_pattern = re.compile(br'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''', re.IGNORECASE)
|
||||||
|
img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)
|
||||||
|
|
||||||
|
for i in range(len(parts)):
|
||||||
|
part = parts[i]
|
||||||
|
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
|
||||||
|
|
||||||
|
# replace urls in style attributes
|
||||||
|
srcpieces = style_pattern.split(part)
|
||||||
|
for j in range(1, len(srcpieces),2):
|
||||||
|
tag = srcpieces[j]
|
||||||
|
if b'kindle:embed' in tag:
|
||||||
|
for m in img_index_pattern.finditer(tag):
|
||||||
|
imageNumber = fromBase32(m.group(1))
|
||||||
|
imageName = self.rscnames[imageNumber-1]
|
||||||
|
osep = m.group()[0:1]
|
||||||
|
csep = m.group()[-1:]
|
||||||
|
if imageName is not None:
|
||||||
|
replacement = osep + b'../Images/'+ utf8_str(imageName) + csep
|
||||||
|
self.used[imageName] = 'used'
|
||||||
|
tag = img_index_pattern.sub(replacement, tag, 1)
|
||||||
|
else:
|
||||||
|
print("Error: Referenced image %s in style url was not recognized in %s" % (imageNumber, tag))
|
||||||
|
srcpieces[j] = tag
|
||||||
|
part = b"".join(srcpieces)
|
||||||
|
|
||||||
|
# store away modified version
|
||||||
|
parts[i] = part
|
||||||
|
|
||||||
|
# Handle any embedded raster images links in the xhtml text
|
||||||
|
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
|
||||||
|
img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
|
||||||
|
img_index_pattern = re.compile(br'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''')
|
||||||
|
|
||||||
|
for i in range(len(parts)):
|
||||||
|
part = parts[i]
|
||||||
|
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
|
||||||
|
|
||||||
|
# links to raster image files
|
||||||
|
# image_pattern
|
||||||
|
srcpieces = img_pattern.split(part)
|
||||||
|
for j in range(1, len(srcpieces),2):
|
||||||
|
tag = srcpieces[j]
|
||||||
|
if tag.startswith(b'<im'):
|
||||||
|
for m in img_index_pattern.finditer(tag):
|
||||||
|
imageNumber = fromBase32(m.group(1))
|
||||||
|
imageName = self.rscnames[imageNumber-1]
|
||||||
|
if imageName is not None:
|
||||||
|
replacement = b'"../Images/' + utf8_str(imageName) + b'"'
|
||||||
|
self.used[imageName] = 'used'
|
||||||
|
tag = img_index_pattern.sub(replacement, tag, 1)
|
||||||
|
else:
|
||||||
|
print("Error: Referenced image %s was not recognized as a valid image in %s" % (imageNumber, tag))
|
||||||
|
srcpieces[j] = tag
|
||||||
|
part = b"".join(srcpieces)
|
||||||
|
# store away modified version
|
||||||
|
parts[i] = part
|
||||||
|
|
||||||
|
# finally perform any general cleanups needed to make valid XHTML
|
||||||
|
# these include:
|
||||||
|
# in svg tags replace "preserveaspectratio" attributes with "preserveAspectRatio"
|
||||||
|
# in svg tags replace "viewbox" attributes with "viewBox"
|
||||||
|
# in <li> remove value="XX" attributes since these are illegal
|
||||||
|
tag_pattern = re.compile(br'''(<[^>]*>)''')
|
||||||
|
li_value_pattern = re.compile(br'''\svalue\s*=\s*['"][^'"]*['"]''', re.IGNORECASE)
|
||||||
|
|
||||||
|
for i in range(len(parts)):
|
||||||
|
part = parts[i]
|
||||||
|
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
|
||||||
|
|
||||||
|
# tag pattern
|
||||||
|
srcpieces = tag_pattern.split(part)
|
||||||
|
for j in range(1, len(srcpieces),2):
|
||||||
|
tag = srcpieces[j]
|
||||||
|
if tag.startswith(b'<svg') or tag.startswith(b'<SVG'):
|
||||||
|
tag = tag.replace(b'preserveaspectratio',b'preserveAspectRatio')
|
||||||
|
tag = tag.replace(b'viewbox',b'viewBox')
|
||||||
|
elif tag.startswith(b'<li ') or tag.startswith(b'<LI '):
|
||||||
|
tagpieces = li_value_pattern.split(tag)
|
||||||
|
tag = b"".join(tagpieces)
|
||||||
|
srcpieces[j] = tag
|
||||||
|
part = b"".join(srcpieces)
|
||||||
|
# store away modified version
|
||||||
|
parts[i] = part
|
||||||
|
|
||||||
|
self.k8proc.setFlows(flows)
|
||||||
|
self.k8proc.setParts(parts)
|
||||||
|
|
||||||
|
return self.used
|
276
KindleUnpack/mobi_index.py
Normal file
@@ -0,0 +1,276 @@
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import PY2, bchr, bstr, bord
|
||||||
|
if PY2:
|
||||||
|
range = xrange
|
||||||
|
|
||||||
|
import struct
|
||||||
|
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||||
|
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||||
|
|
||||||
|
from .mobi_utils import toHex
|
||||||
|
|
||||||
|
class MobiIndex:
|
||||||
|
|
||||||
|
def __init__(self, sect, DEBUG=False):
|
||||||
|
self.sect = sect
|
||||||
|
self.DEBUG = DEBUG
|
||||||
|
|
||||||
|
def getIndexData(self, idx, label="Unknown"):
|
||||||
|
sect = self.sect
|
||||||
|
outtbl = []
|
||||||
|
ctoc_text = {}
|
||||||
|
if idx != 0xffffffff:
|
||||||
|
sect.setsectiondescription(idx,"{0} Main INDX section".format(label))
|
||||||
|
data = sect.loadSection(idx)
|
||||||
|
idxhdr, hordt1, hordt2 = self.parseINDXHeader(data)
|
||||||
|
IndexCount = idxhdr['count']
|
||||||
|
# handle the case of multiple sections used for CTOC
|
||||||
|
rec_off = 0
|
||||||
|
off = idx + IndexCount + 1
|
||||||
|
for j in range(idxhdr['nctoc']):
|
||||||
|
cdata = sect.loadSection(off + j)
|
||||||
|
sect.setsectiondescription(off+j, label + ' CTOC Data ' + str(j))
|
||||||
|
ctocdict = self.readCTOC(cdata)
|
||||||
|
for k in ctocdict:
|
||||||
|
ctoc_text[k + rec_off] = ctocdict[k]
|
||||||
|
rec_off += 0x10000
|
||||||
|
tagSectionStart = idxhdr['len']
|
||||||
|
controlByteCount, tagTable = readTagSection(tagSectionStart, data)
|
||||||
|
if self.DEBUG:
|
||||||
|
print("ControlByteCount is", controlByteCount)
|
||||||
|
print("IndexCount is", IndexCount)
|
||||||
|
print("TagTable: %s" % tagTable)
|
||||||
|
for i in range(idx + 1, idx + 1 + IndexCount):
|
||||||
|
sect.setsectiondescription(i,"{0} Extra {1:d} INDX section".format(label,i-idx))
|
||||||
|
data = sect.loadSection(i)
|
||||||
|
hdrinfo, ordt1, ordt2 = self.parseINDXHeader(data)
|
||||||
|
idxtPos = hdrinfo['start']
|
||||||
|
entryCount = hdrinfo['count']
|
||||||
|
if self.DEBUG:
|
||||||
|
print(idxtPos, entryCount)
|
||||||
|
# loop through to build up the IDXT position starts
|
||||||
|
idxPositions = []
|
||||||
|
for j in range(entryCount):
|
||||||
|
pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j))
|
||||||
|
idxPositions.append(pos)
|
||||||
|
# The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
|
||||||
|
idxPositions.append(idxtPos)
|
||||||
|
# for each entry in the IDXT build up the tagMap and any associated text
|
||||||
|
for j in range(entryCount):
|
||||||
|
startPos = idxPositions[j]
|
||||||
|
endPos = idxPositions[j+1]
|
||||||
|
textLength = ord(data[startPos:startPos+1])
|
||||||
|
text = data[startPos+1:startPos+1+textLength]
|
||||||
|
if hordt2 is not None:
|
||||||
|
text = b''.join(bchr(hordt2[bord(x)]) for x in text)
|
||||||
|
tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos)
|
||||||
|
outtbl.append([text, tagMap])
|
||||||
|
if self.DEBUG:
|
||||||
|
print(tagMap)
|
||||||
|
print(text)
|
||||||
|
return outtbl, ctoc_text
|
||||||
|
|
||||||
|
def parseINDXHeader(self, data):
|
||||||
|
"read INDX header"
|
||||||
|
if not data[:4] == b'INDX':
|
||||||
|
print("Warning: index section is not INDX")
|
||||||
|
return False
|
||||||
|
words = (
|
||||||
|
'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
|
||||||
|
'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc'
|
||||||
|
)
|
||||||
|
num = len(words)
|
||||||
|
values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)])
|
||||||
|
header = {}
|
||||||
|
for n in range(num):
|
||||||
|
header[words[n]] = values[n]
|
||||||
|
|
||||||
|
ordt1 = None
|
||||||
|
ordt2 = None
|
||||||
|
|
||||||
|
ocnt, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4)
|
||||||
|
if header['code'] == 0xfdea or ocnt != 0 or oentries > 0:
|
||||||
|
# horribly hacked up ESP (sample) mobi books use two ORDT sections but never specify
|
||||||
|
# them in the proper place in the header. They seem to be codepage 65002 which seems
|
||||||
|
# to be some sort of strange EBCDIC utf-8 or 16 encoded strings
|
||||||
|
|
||||||
|
# so we need to look for them and store them away to process leading text
|
||||||
|
# ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
|
||||||
|
# we only ever seem to use the second one but ...
|
||||||
|
assert(ocnt == 1)
|
||||||
|
assert(data[op1:op1+4] == b'ORDT')
|
||||||
|
assert(data[op2:op2+4] == b'ORDT')
|
||||||
|
ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4)
|
||||||
|
ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4)
|
||||||
|
|
||||||
|
if self.DEBUG:
|
||||||
|
print("parsed INDX header:")
|
||||||
|
for n in words:
|
||||||
|
print(n, "%X" % header[n],)
|
||||||
|
print("")
|
||||||
|
return header, ordt1, ordt2
|
||||||
|
|
||||||
|
def readCTOC(self, txtdata):
|
||||||
|
# read all blocks from CTOC
|
||||||
|
ctoc_data = {}
|
||||||
|
offset = 0
|
||||||
|
while offset<len(txtdata):
|
||||||
|
if PY2:
|
||||||
|
if txtdata[offset] == b'\0':
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
if txtdata[offset] == 0:
|
||||||
|
break
|
||||||
|
idx_offs = offset
|
||||||
|
# first n bytes: name len as vwi
|
||||||
|
pos, ilen = getVariableWidthValue(txtdata, offset)
|
||||||
|
offset += pos
|
||||||
|
# <len> next bytes: name
|
||||||
|
name = txtdata[offset:offset+ilen]
|
||||||
|
offset += ilen
|
||||||
|
if self.DEBUG:
|
||||||
|
print("name length is ", ilen)
|
||||||
|
print(idx_offs, name)
|
||||||
|
ctoc_data[idx_offs] = name
|
||||||
|
return ctoc_data
|
||||||
|
|
||||||
|
|
||||||
|
def getVariableWidthValue(data, offset):
|
||||||
|
'''
|
||||||
|
Decode variable width value from given bytes.
|
||||||
|
|
||||||
|
@param data: The bytes to decode.
|
||||||
|
@param offset: The start offset into data.
|
||||||
|
@return: Tuple of consumed bytes count and decoded value.
|
||||||
|
'''
|
||||||
|
value = 0
|
||||||
|
consumed = 0
|
||||||
|
finished = False
|
||||||
|
while not finished:
|
||||||
|
v = data[offset + consumed: offset + consumed + 1]
|
||||||
|
consumed += 1
|
||||||
|
if ord(v) & 0x80:
|
||||||
|
finished = True
|
||||||
|
value = (value << 7) | (ord(v) & 0x7f)
|
||||||
|
return consumed, value
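# Worked example (bytes chosen for illustration): getVariableWidthValue(b'\x0b\x85', 0)
# consumes two bytes and returns (2, 1413), since (0x0b << 7) | 0x05 == 1413 and
# the high bit of 0x85 terminates the value.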
|
||||||
|
|
||||||
|
|
||||||
|
def readTagSection(start, data):
|
||||||
|
'''
|
||||||
|
Read tag section from given data.
|
||||||
|
|
||||||
|
@param start: The start position in the data.
|
||||||
|
@param data: The data to process.
|
||||||
|
@return: Tuple of control byte count and list of tag tuples.
|
||||||
|
'''
|
||||||
|
controlByteCount = 0
|
||||||
|
tags = []
|
||||||
|
if data[start:start+4] == b"TAGX":
|
||||||
|
firstEntryOffset, = struct.unpack_from(b'>L', data, start + 0x04)
|
||||||
|
controlByteCount, = struct.unpack_from(b'>L', data, start + 0x08)
|
||||||
|
|
||||||
|
# Skip the first 12 bytes already read above.
|
||||||
|
for i in range(12, firstEntryOffset, 4):
|
||||||
|
pos = start + i
|
||||||
|
tags.append((ord(data[pos:pos+1]), ord(data[pos+1:pos+2]), ord(data[pos+2:pos+3]), ord(data[pos+3:pos+4])))
|
||||||
|
return controlByteCount, tags
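# Worked example (entry chosen for illustration): a four-byte TAGX entry
# b'\x01\x01\x01\x00' is stored as (tag=1, valuesPerEntry=1, mask=0x01, endFlag=0),
# the order getTagMap below unpacks.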
|
||||||
|
|
||||||
|
|
||||||
|
def countSetBits(value, bits=8):
|
||||||
|
'''
|
||||||
|
Count the set bits in the given value.
|
||||||
|
|
||||||
|
@param value: Integer value.
|
||||||
|
@param bits: The number of bits of the input value (defaults to 8).
|
||||||
|
@return: Number of set bits.
|
||||||
|
'''
|
||||||
|
count = 0
|
||||||
|
for _ in range(bits):
|
||||||
|
if value & 0x01 == 0x01:
|
||||||
|
count += 1
|
||||||
|
value = value >> 1
|
||||||
|
return count
|
||||||
|
|
||||||
|
|
||||||
|
def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos):
|
||||||
|
'''
|
||||||
|
Create a map of tags and values from the given byte section.
|
||||||
|
|
||||||
|
@param controlByteCount: The number of control bytes.
|
||||||
|
@param tagTable: The tag table.
|
||||||
|
@param entryData: The data to process.
|
||||||
|
@param startPos: The starting position in entryData.
|
||||||
|
@param endPos: The end position in entryData or None if it is unknown.
|
||||||
|
@return: Hashmap of tag and list of values.
|
||||||
|
'''
|
||||||
|
tags = []
|
||||||
|
tagHashMap = {}
|
||||||
|
controlByteIndex = 0
|
||||||
|
dataStart = startPos + controlByteCount
|
||||||
|
|
||||||
|
for tag, valuesPerEntry, mask, endFlag in tagTable:
|
||||||
|
if endFlag == 0x01:
|
||||||
|
controlByteIndex += 1
|
||||||
|
continue
|
||||||
|
cbyte = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1])
|
||||||
|
if 0:
|
||||||
|
print("Control Byte Index %0x , Control Byte Value %0x" % (controlByteIndex, cbyte))
|
||||||
|
|
||||||
|
value = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1]) & mask
|
||||||
|
if value != 0:
|
||||||
|
if value == mask:
|
||||||
|
if countSetBits(mask) > 1:
|
||||||
|
# If all bits of masked value are set and the mask has more than one bit, a variable width value
|
||||||
|
# will follow after the control bytes which defines the length of bytes (NOT the value count!)
|
||||||
|
# which will contain the corresponding variable width values.
|
||||||
|
consumed, value = getVariableWidthValue(entryData, dataStart)
|
||||||
|
dataStart += consumed
|
||||||
|
tags.append((tag, None, value, valuesPerEntry))
|
||||||
|
else:
|
||||||
|
tags.append((tag, 1, None, valuesPerEntry))
|
||||||
|
else:
|
||||||
|
# Shift bits to get the masked value.
|
||||||
|
while mask & 0x01 == 0:
|
||||||
|
mask = mask >> 1
|
||||||
|
value = value >> 1
|
||||||
|
tags.append((tag, value, None, valuesPerEntry))
|
||||||
|
for tag, valueCount, valueBytes, valuesPerEntry in tags:
|
||||||
|
values = []
|
||||||
|
if valueCount is not None:
|
||||||
|
# Read valueCount * valuesPerEntry variable width values.
|
||||||
|
for _ in range(valueCount):
|
||||||
|
for _ in range(valuesPerEntry):
|
||||||
|
consumed, data = getVariableWidthValue(entryData, dataStart)
|
||||||
|
dataStart += consumed
|
||||||
|
values.append(data)
|
||||||
|
else:
|
||||||
|
# Convert valueBytes to variable width values.
|
||||||
|
totalConsumed = 0
|
||||||
|
while totalConsumed < valueBytes:
|
||||||
|
# Does this work for valuesPerEntry != 1?
|
||||||
|
consumed, data = getVariableWidthValue(entryData, dataStart)
|
||||||
|
dataStart += consumed
|
||||||
|
totalConsumed += consumed
|
||||||
|
values.append(data)
|
||||||
|
if totalConsumed != valueBytes:
|
||||||
|
print("Error: Should consume %s bytes, but consumed %s" % (valueBytes, totalConsumed))
|
||||||
|
tagHashMap[tag] = values
|
||||||
|
# Test that all bytes have been processed if endPos is given.
|
||||||
|
if endPos is not None and dataStart != endPos:
|
||||||
|
# The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
|
||||||
|
for char in entryData[dataStart:endPos]:
|
||||||
|
if bord(char) != 0:
|
||||||
|
print("Warning: There are unprocessed index bytes left: %s" % toHex(entryData[dataStart:endPos]))
|
||||||
|
if 0:
|
||||||
|
print("controlByteCount: %s" % controlByteCount)
|
||||||
|
print("tagTable: %s" % tagTable)
|
||||||
|
print("data: %s" % toHex(entryData[startPos:endPos]))
|
||||||
|
print("tagHashMap: %s" % tagHashMap)
|
||||||
|
break
|
||||||
|
|
||||||
|
return tagHashMap
|
494
KindleUnpack/mobi_k8proc.py
Normal file
@@ -0,0 +1,494 @@
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import PY2, bstr, utf8_str
|
||||||
|
|
||||||
|
if PY2:
|
||||||
|
range = xrange
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
import struct
|
||||||
|
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||||
|
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||||
|
|
||||||
|
import re
|
||||||
|
# note: re requires the pattern to be the exact same type as the data to be searched in python3
|
||||||
|
# but u"" is not allowed for the pattern itself only b""
|
||||||
|
|
||||||
|
from .mobi_index import MobiIndex
|
||||||
|
from .mobi_utils import fromBase32
|
||||||
|
from .unipath import pathof
|
||||||
|
|
||||||
|
_guide_types = [b'cover',b'title-page',b'toc',b'index',b'glossary',b'acknowledgements',
|
||||||
|
b'bibliography',b'colophon',b'copyright-page',b'dedication',
|
||||||
|
b'epigraph',b'foreward',b'loi',b'lot',b'notes',b'preface',b'text']
|
||||||
|
|
||||||
|
# locate beginning and ending positions of tag with specific aid attribute
|
||||||
|
def locate_beg_end_of_tag(ml, aid):
|
||||||
|
pattern = utf8_str(r'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % aid)
|
||||||
|
aid_pattern = re.compile(pattern,re.IGNORECASE)
|
||||||
|
for m in re.finditer(aid_pattern, ml):
|
||||||
|
plt = m.start()
|
||||||
|
pgt = ml.find(b'>',plt+1)
|
||||||
|
return plt, pgt
|
||||||
|
return 0, 0
|
||||||
|
|
||||||
|
|
||||||
|
# iterate over all tags in block in reverse order, i.e. last tag to first tag
|
||||||
|
def reverse_tag_iter(block):
|
||||||
|
end = len(block)
|
||||||
|
while True:
|
||||||
|
pgt = block.rfind(b'>', 0, end)
|
||||||
|
if pgt == -1:
|
||||||
|
break
|
||||||
|
plt = block.rfind(b'<', 0, pgt)
|
||||||
|
if plt == -1:
|
||||||
|
break
|
||||||
|
yield block[plt:pgt+1]
|
||||||
|
end = plt
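# Worked example (markup chosen for illustration): reverse_tag_iter(b'<p><b>x</b></p>')
# yields b'</p>', b'</b>', b'<b>' and b'<p>' in that order; text between tags is
# skipped.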
|
||||||
|
|
||||||
|
|
||||||
|
class K8Processor:
|
||||||
|
|
||||||
|
def __init__(self, mh, sect, files, debug=False):
|
||||||
|
self.sect = sect
|
||||||
|
self.files = files
|
||||||
|
self.mi = MobiIndex(sect)
|
||||||
|
self.mh = mh
|
||||||
|
self.skelidx = mh.skelidx
|
||||||
|
self.fragidx = mh.fragidx
|
||||||
|
self.guideidx = mh.guideidx
|
||||||
|
self.fdst = mh.fdst
|
||||||
|
self.flowmap = {}
|
||||||
|
self.flows = None
|
||||||
|
self.flowinfo = []
|
||||||
|
self.parts = None
|
||||||
|
self.partinfo = []
|
||||||
|
self.linked_aids = set()
|
||||||
|
self.fdsttbl= [0,0xffffffff]
|
||||||
|
self.DEBUG = debug
|
||||||
|
|
||||||
|
# read in and parse the FDST info which is very similar in format to the Palm DB section
|
||||||
|
# parsing except it provides offsets into rawML file and not the Palm DB file
|
||||||
|
# this is needed to split up the final css, svg, etc flow section
|
||||||
|
# that can exist at the end of the rawML file
|
||||||
|
if self.fdst != 0xffffffff:
|
||||||
|
header = self.sect.loadSection(self.fdst)
|
||||||
|
if header[0:4] == b"FDST":
|
||||||
|
num_sections, = struct.unpack_from(b'>L', header, 0x08)
|
||||||
|
self.fdsttbl = struct.unpack_from(bstr('>%dL' % (num_sections*2)), header, 12)[::2] + (mh.rawSize, )
|
||||||
|
sect.setsectiondescription(self.fdst,"KF8 FDST INDX")
|
||||||
|
if self.DEBUG:
|
||||||
|
print("\nFDST Section Map: %d sections" % num_sections)
|
||||||
|
for j in range(num_sections):
|
||||||
|
print("Section %d: 0x%08X - 0x%08X" % (j, self.fdsttbl[j],self.fdsttbl[j+1]))
|
||||||
|
else:
|
||||||
|
print("\nError: K8 Mobi with Missing FDST info")
|
||||||
|
|
||||||
|
# read/process skeleton index info to create the skeleton table
|
||||||
|
skeltbl = []
|
||||||
|
if self.skelidx != 0xffffffff:
|
||||||
|
# for i in range(2):
|
||||||
|
# fname = 'skel%04d.dat' % i
|
||||||
|
# data = self.sect.loadSection(self.skelidx + i)
|
||||||
|
# with open(pathof(fname), 'wb') as f:
|
||||||
|
# f.write(data)
|
||||||
|
outtbl, ctoc_text = self.mi.getIndexData(self.skelidx, "KF8 Skeleton")
|
||||||
|
fileptr = 0
|
||||||
|
for [text, tagMap] in outtbl:
|
||||||
|
# file number, skeleton name, fragtbl record count, start position, length
|
||||||
|
skeltbl.append([fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]])
|
||||||
|
fileptr += 1
|
||||||
|
self.skeltbl = skeltbl
|
||||||
|
if self.DEBUG:
|
||||||
|
print("\nSkel Table: %d entries" % len(self.skeltbl))
|
||||||
|
print("table: filenum, skeleton name, frag tbl record count, start position, length")
|
||||||
|
for j in range(len(self.skeltbl)):
|
||||||
|
print(self.skeltbl[j])
|
||||||
|
|
||||||
|
# read/process the fragment index to create the fragment table
|
||||||
|
fragtbl = []
|
||||||
|
if self.fragidx != 0xffffffff:
|
||||||
|
# for i in range(3):
|
||||||
|
# fname = 'frag%04d.dat' % i
|
||||||
|
# data = self.sect.loadSection(self.fragidx + i)
|
||||||
|
# with open(pathof(fname), 'wb') as f:
|
||||||
|
# f.write(data)
|
||||||
|
outtbl, ctoc_text = self.mi.getIndexData(self.fragidx, "KF8 Fragment")
|
||||||
|
for [text, tagMap] in outtbl:
|
||||||
|
# insert position, ctoc offset (aidtext), file number, sequence number, start position, length
|
||||||
|
ctocoffset = tagMap[2][0]
|
||||||
|
ctocdata = ctoc_text[ctocoffset]
|
||||||
|
fragtbl.append([int(text), ctocdata, tagMap[3][0], tagMap[4][0], tagMap[6][0], tagMap[6][1]])
|
||||||
|
self.fragtbl = fragtbl
|
||||||
|
if self.DEBUG:
|
||||||
|
print("\nFragment Table: %d entries" % len(self.fragtbl))
|
||||||
|
print("table: file position, link id text, file num, sequence number, start position, length")
|
||||||
|
for j in range(len(self.fragtbl)):
|
||||||
|
print(self.fragtbl[j])
|
||||||
|
|
||||||
|
# read / process guide index for guide elements of opf
|
||||||
|
guidetbl = []
|
||||||
|
if self.guideidx != 0xffffffff:
|
||||||
|
# for i in range(3):
|
||||||
|
# fname = 'guide%04d.dat' % i
|
||||||
|
# data = self.sect.loadSection(self.guideidx + i)
|
||||||
|
# with open(pathof(fname), 'wb') as f:
|
||||||
|
# f.write(data)
|
||||||
|
outtbl, ctoc_text = self.mi.getIndexData(self.guideidx, "KF8 Guide elements)")
|
||||||
|
for [text, tagMap] in outtbl:
|
||||||
|
# ref_type, ref_title, frag number
|
||||||
|
ctocoffset = tagMap[1][0]
|
||||||
|
ref_title = ctoc_text[ctocoffset]
|
||||||
|
ref_type = text
|
||||||
|
fileno = None
|
||||||
|
if 3 in tagMap:
|
||||||
|
fileno = tagMap[3][0]
|
||||||
|
if 6 in tagMap:
|
||||||
|
fileno = tagMap[6][0]
|
||||||
|
guidetbl.append([ref_type, ref_title, fileno])
|
||||||
|
self.guidetbl = guidetbl
|
||||||
|
if self.DEBUG:
|
||||||
|
print("\nGuide Table: %d entries" % len(self.guidetbl))
|
||||||
|
print("table: ref_type, ref_title, fragtbl entry number")
|
||||||
|
for j in range(len(self.guidetbl)):
|
||||||
|
print(self.guidetbl[j])
|
||||||
|
|
||||||
|
def buildParts(self, rawML):
|
||||||
|
# now split the rawML into its flow pieces
|
||||||
|
self.flows = []
|
||||||
|
for j in range(0, len(self.fdsttbl)-1):
|
||||||
|
start = self.fdsttbl[j]
|
||||||
|
end = self.fdsttbl[j+1]
|
||||||
|
self.flows.append(rawML[start:end])
|
||||||
|
|
||||||
|
# the first piece represents the xhtml text
|
||||||
|
text = self.flows[0]
|
||||||
|
self.flows[0] = b''
|
||||||
|
|
||||||
|
# walk the <skeleton> and fragment tables to build original source xhtml files
|
||||||
|
# *without* destroying any file position information needed for later href processing
|
||||||
|
# and create final list of file separation start: stop points and etc in partinfo
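# (sketch of the loop below: each skeleton is the shell of one output xhtml file;
#  its fragcnt fragments follow it contiguously in the raw text flow, and each one
#  is spliced back into the skeleton at insertpos (relative to the skeleton start),
#  while baseptr tracks how far into the raw text flow has been consumed so far)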
|
||||||
|
if self.DEBUG:
|
||||||
|
print("\nRebuilding flow piece 0: the main body of the ebook")
|
||||||
|
self.parts = []
|
||||||
|
self.partinfo = []
|
||||||
|
fragptr = 0
|
||||||
|
baseptr = 0
|
||||||
|
cnt = 0
|
||||||
|
for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl:
|
||||||
|
baseptr = skelpos + skellen
|
||||||
|
skeleton = text[skelpos: baseptr]
|
||||||
|
for i in range(fragcnt):
|
||||||
|
[insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[fragptr]
|
||||||
|
aidtext = idtext[12:-2]
|
||||||
|
if i == 0:
|
||||||
|
filename = 'part%04d.xhtml' % filenum
|
||||||
|
slice = text[baseptr: baseptr + length]
|
||||||
|
insertpos = insertpos - skelpos
|
||||||
|
head = skeleton[:insertpos]
|
||||||
|
tail = skeleton[insertpos:]
|
||||||
|
actual_inspos = insertpos
|
||||||
|
if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') < head.rfind(b'<')):
|
||||||
|
# There is an incomplete tag in either the head or tail.
|
||||||
|
# This can happen for some badly formed KF8 files
|
||||||
|
print('The fragment table for %s has incorrect insert position. Calculating manually.' % skelname)
|
||||||
|
bp, ep = locate_beg_end_of_tag(skeleton, aidtext)
|
||||||
|
if bp != ep:
|
||||||
|
actual_inspos = ep + 1 + startpos
|
||||||
|
if insertpos != actual_inspos:
|
||||||
|
print("fixed corrupt fragment table insert position", insertpos+skelpos, actual_inspos+skelpos)
|
||||||
|
insertpos = actual_inspos
|
||||||
|
self.fragtbl[fragptr][0] = actual_inspos + skelpos
|
||||||
|
skeleton = skeleton[0:insertpos] + slice + skeleton[insertpos:]
|
||||||
|
baseptr = baseptr + length
|
||||||
|
fragptr += 1
|
||||||
|
cnt += 1
|
||||||
|
self.parts.append(skeleton)
|
||||||
|
self.partinfo.append([skelnum, 'Text', filename, skelpos, baseptr, aidtext])
|
||||||
|
|
||||||
|
assembled_text = b''.join(self.parts)
|
||||||
|
if self.DEBUG:
|
||||||
|
outassembled = os.path.join(self.files.k8dir, 'assembled_text.dat')
|
||||||
|
with open(pathof(outassembled),'wb') as f:
|
||||||
|
f.write(assembled_text)
|
||||||
|
|
||||||
|
# The primary css style sheet is typically stored next, followed by any
# snippets of code that were previously inlined in the
# original xhtml but have been stripped out and placed here.
# This can include local CDATA snippets and svg sections.

# The problem is that for most browsers and ereaders, you cannot
# use <img src="imageXXXX.svg" /> to import an svg image that itself
# properly uses an <image/> tag to import some raster image - it
# should work according to the spec but does not for almost all browsers
# and ereaders, and it causes epub validation issues because those raster
# images are in the manifest but not in the xhtml text - since they are only
# referenced from an svg image

# So we need to check the remaining flow pieces to see if they are css
# or svg images. If svg images, we must check if they have an <image />
# and if so inline them into the xhtml text pieces.

# there may be other sorts of pieces stored here but until we see one
# in the wild to reverse engineer we won't be able to tell
|
||||||
|
self.flowinfo.append([None, None, None, None])
|
||||||
|
svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE)
|
||||||
|
image_tag_pattern = re.compile(br'''(<image[^>]*>)''', re.IGNORECASE)
|
||||||
|
for j in range(1,len(self.flows)):
|
||||||
|
flowpart = self.flows[j]
|
||||||
|
nstr = '%04d' % j
|
||||||
|
m = re.search(svg_tag_pattern, flowpart)
|
||||||
|
if m is not None:
|
||||||
|
# svg
|
||||||
|
ptype = b'svg'
|
||||||
|
start = m.start()
|
||||||
|
m2 = re.search(image_tag_pattern, flowpart)
|
||||||
|
if m2 is not None:
|
||||||
|
pformat = b'inline'
|
||||||
|
pdir = None
|
||||||
|
fname = None
|
||||||
|
# strip off anything before <svg if inlining
|
||||||
|
flowpart = flowpart[start:]
|
||||||
|
else:
|
||||||
|
pformat = b'file'
|
||||||
|
pdir = "Images"
|
||||||
|
fname = 'svgimg' + nstr + '.svg'
|
||||||
|
else:
|
||||||
|
# search for CDATA and if exists inline it
|
||||||
|
if flowpart.find(b'[CDATA[') >= 0:
|
||||||
|
ptype = b'css'
|
||||||
|
flowpart = b'<style type="text/css">\n' + flowpart + b'\n</style>\n'
|
||||||
|
pformat = b'inline'
|
||||||
|
pdir = None
|
||||||
|
fname = None
|
||||||
|
else:
|
||||||
|
# css - assume as standalone css file
|
||||||
|
ptype = b'css'
|
||||||
|
pformat = b'file'
|
||||||
|
pdir = "Styles"
|
||||||
|
fname = 'style' + nstr + '.css'
|
||||||
|
|
||||||
|
self.flows[j] = flowpart
|
||||||
|
self.flowinfo.append([ptype, pformat, pdir, fname])
|
||||||
|
|
||||||
|
if self.DEBUG:
|
||||||
|
print("\nFlow Map: %d entries" % len(self.flowinfo))
|
||||||
|
for fi in self.flowinfo:
|
||||||
|
print(fi)
|
||||||
|
print("\n")
|
||||||
|
|
||||||
|
print("\nXHTML File Part Position Information: %d entries" % len(self.partinfo))
|
||||||
|
for pi in self.partinfo:
|
||||||
|
print(pi)
|
||||||
|
|
||||||
|
if False: # self.Debug:
|
||||||
|
# dump all of the locations of the aid tags used in TEXT
|
||||||
|
# find id links only inside of tags
|
||||||
|
# inside any < > pair find all "aid=" attributes and return whatever is inside the quotes
|
||||||
|
# [^>]* means match any amount of chars except for '>' char
|
||||||
|
# [^'"] match any amount of chars except for the quote character
|
||||||
|
# \s* means match any amount of whitespace
|
||||||
|
print("\npositions of all aid= pieces")
|
||||||
|
id_pattern = re.compile(br'''<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>''',re.IGNORECASE)
|
||||||
|
for m in re.finditer(id_pattern, rawML):
|
||||||
|
[filename, partnum, start, end] = self.getFileInfo(m.start())
|
||||||
|
[seqnum, idtext] = self.getFragTblInfo(m.start())
|
||||||
|
value = fromBase32(m.group(1))
|
||||||
|
print(" aid: %s value: %d at: %d -> part: %d, start: %d, end: %d" % (m.group(1), value, m.start(), partnum, start, end))
|
||||||
|
print(" %s fragtbl entry %d" % (idtext, seqnum))
|
||||||
|
|
||||||
|
return
|
||||||
|
|
||||||
|
# get information fragment table entry by pos
|
||||||
|
def getFragTblInfo(self, pos):
|
||||||
|
for j in range(len(self.fragtbl)):
|
||||||
|
[insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[j]
|
||||||
|
if pos >= insertpos and pos < (insertpos + length):
|
||||||
|
# the "in: " and "before: " prefixes are only used in debug output
|
||||||
|
return seqnum, b'in: ' + idtext
|
||||||
|
if pos < insertpos:
|
||||||
|
return seqnum, b'before: ' + idtext
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
# get information about the part (file) that exists at pos in original rawML
|
||||||
|
def getFileInfo(self, pos):
|
||||||
|
for [partnum, pdir, filename, start, end, aidtext] in self.partinfo:
|
||||||
|
if pos >= start and pos < end:
|
||||||
|
return filename, partnum, start, end
|
||||||
|
return None, None, None, None
|
||||||
|
|
||||||
|
# accessor functions to properly protect the internal structure
|
||||||
|
def getNumberOfParts(self):
|
||||||
|
return len(self.parts)
|
||||||
|
|
||||||
|
def getPart(self,i):
|
||||||
|
if i >= 0 and i < len(self.parts):
|
||||||
|
return self.parts[i]
|
||||||
|
return None
|
||||||
|
|
||||||
|
def getPartInfo(self, i):
|
||||||
|
if i >= 0 and i < len(self.partinfo):
|
||||||
|
return self.partinfo[i]
|
||||||
|
return None
|
||||||
|
|
||||||
|
def getNumberOfFlows(self):
|
||||||
|
return len(self.flows)
|
||||||
|
|
||||||
|
def getFlow(self,i):
|
||||||
|
# note flows[0] is empty - it was all of the original text
|
||||||
|
if i > 0 and i < len(self.flows):
|
||||||
|
return self.flows[i]
|
||||||
|
return None
|
||||||
|
|
||||||
|
def getFlowInfo(self,i):
|
||||||
|
# note flowinfo[0] is empty - it was all of the original text
|
||||||
|
if i > 0 and i < len(self.flowinfo):
|
||||||
|
return self.flowinfo[i]
|
||||||
|
return None
|
||||||
|
|
||||||
|
def getIDTagByPosFid(self, posfid, offset):
|
||||||
|
# first convert kindle:pos:fid and offset info to position in file
|
||||||
|
# (fromBase32 can handle both string types on input)
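# e.g. for a link like kindle:pos:fid:XXXX:off:YYYYYYYYYY, XXXX base32-decodes to a
# row in fragtbl and YYYYYYYYYY to a byte offset within that fragment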
|
||||||
|
row = fromBase32(posfid)
|
||||||
|
off = fromBase32(offset)
|
||||||
|
[insertpos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[row]
|
||||||
|
pos = insertpos + off
|
||||||
|
fname, pn, skelpos, skelend = self.getFileInfo(pos)
|
||||||
|
if fname is None:
|
||||||
|
# pos does not exist
|
||||||
|
# default to skeleton pos instead
|
||||||
|
print("Link To Position", pos, "does not exist, retargeting to top of target")
|
||||||
|
pos = self.skeltbl[filenum][3]
|
||||||
|
fname, pn, skelpos, skelend = self.getFileInfo(pos)
|
||||||
|
# an existing "id=" or "name=" attribute must exist in the original xhtml, otherwise it would not have worked for linking.
# Amazon seems to have added its own additional "aid=" attributes inside tags, whose values seem to represent
# some position information encoded into a Base32 name.
# so find the closest "id=" before that position by actually searching in that file
|
||||||
|
idtext = self.getIDTag(pos)
|
||||||
|
return fname, idtext
|
||||||
|
|
||||||
|
def getIDTag(self, pos):
|
||||||
|
# find the first tag with a named anchor (name or id attribute) before pos
|
||||||
|
fname, pn, skelpos, skelend = self.getFileInfo(pos)
|
||||||
|
if pn is None and skelpos is None:
|
||||||
|
print("Error: getIDTag - no file contains ", pos)
|
||||||
|
textblock = self.parts[pn]
|
||||||
|
npos = pos - skelpos
|
||||||
|
# if npos is inside a tag then search all text before its end-of-tag marker
|
||||||
|
pgt = textblock.find(b'>',npos)
|
||||||
|
plt = textblock.find(b'<',npos)
|
||||||
|
if plt == npos or pgt < plt:
|
||||||
|
npos = pgt + 1
|
||||||
|
# find id and name attributes only inside of tags
|
||||||
|
# use a reverse tag search since that is faster
|
||||||
|
# inside any < > pair find "id=" and "name=" attributes return it
|
||||||
|
# [^>]* means match any amount of chars except for '>' char
|
||||||
|
# [^'"] match any amount of chars except for the quote character
|
||||||
|
# \s* means match any amount of whitespace
|
||||||
|
textblock = textblock[0:npos]
|
||||||
|
id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
|
||||||
|
name_pattern = re.compile(br'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
|
||||||
|
aid_pattern = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')
|
||||||
|
for tag in reverse_tag_iter(textblock):
|
||||||
|
# any ids in the body should default to top of file
|
||||||
|
if tag[0:6] == b'<body ':
|
||||||
|
return b''
|
||||||
|
if tag[0:6] != b'<meta ':
|
||||||
|
m = id_pattern.match(tag) or name_pattern.match(tag)
|
||||||
|
if m is not None:
|
||||||
|
return m.group(1)
|
||||||
|
m = aid_pattern.match(tag)
|
||||||
|
if m is not None:
|
||||||
|
self.linked_aids.add(m.group(1))
|
||||||
|
return b'aid-' + m.group(1)
|
||||||
|
return b''
|
||||||
|
|
||||||
|
# do we need to do deep copying
|
||||||
|
def setParts(self, parts):
|
||||||
|
assert(len(parts) == len(self.parts))
|
||||||
|
for i in range(len(parts)):
|
||||||
|
self.parts[i] = parts[i]
|
||||||
|
|
||||||
|
# do we need to do deep copying
|
||||||
|
def setFlows(self, flows):
|
||||||
|
assert(len(flows) == len(self.flows))
|
||||||
|
for i in range(len(flows)):
|
||||||
|
self.flows[i] = flows[i]
|
||||||
|
|
||||||
|
# get information about the part (file) that exists at pos in original rawML
|
||||||
|
def getSkelInfo(self, pos):
|
||||||
|
for [partnum, pdir, filename, start, end, aidtext] in self.partinfo:
|
||||||
|
if pos >= start and pos < end:
|
||||||
|
return [partnum, pdir, filename, start, end, aidtext]
|
||||||
|
return [None, None, None, None, None, None]
|
||||||
|
|
||||||
|
# fileno is actually a reference into fragtbl (a fragment)
|
||||||
|
def getGuideText(self):
|
||||||
|
guidetext = b''
|
||||||
|
for [ref_type, ref_title, fileno] in self.guidetbl:
|
||||||
|
if ref_type == b'thumbimagestandard':
|
||||||
|
continue
|
||||||
|
if ref_type not in _guide_types and not ref_type.startswith(b'other.'):
|
||||||
|
if ref_type == b'start':
|
||||||
|
ref_type = b'text'
|
||||||
|
else:
|
||||||
|
ref_type = b'other.' + ref_type
|
||||||
|
[pos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[fileno]
|
||||||
|
[pn, pdir, filename, skelpos, skelend, aidtext] = self.getSkelInfo(pos)
|
||||||
|
idtext = self.getIDTag(pos)
|
||||||
|
linktgt = filename.encode('utf-8')
|
||||||
|
if idtext != b'':
|
||||||
|
linktgt += b'#' + idtext
|
||||||
|
guidetext += b'<reference type="'+ref_type+b'" title="'+ref_title+b'" href="'+utf8_str(pdir)+b'/'+linktgt+b'" />\n'
|
||||||
|
# opf is encoded utf-8 so must convert any titles properly
|
||||||
|
guidetext = (guidetext.decode(self.mh.codec)).encode("utf-8")
|
||||||
|
return guidetext
|
||||||
|
|
||||||
|
def getPageIDTag(self, pos):
|
||||||
|
# find the first tag with a named anchor (name or id attribute) before pos
|
||||||
|
# but page map offsets need a little more leeway, so if the offset points
# into a tag look for the next ending tag "/>" or "</" and start the search from there.
|
||||||
|
fname, pn, skelpos, skelend = self.getFileInfo(pos)
|
||||||
|
if pn is None and skelpos is None:
|
||||||
|
print("Error: getIDTag - no file contains ", pos)
|
||||||
|
textblock = self.parts[pn]
|
||||||
|
npos = pos - skelpos
|
||||||
|
# if npos inside a tag then search all text before next ending tag
|
||||||
|
pgt = textblock.find(b'>',npos)
|
||||||
|
plt = textblock.find(b'<',npos)
|
||||||
|
if plt == npos or pgt < plt:
|
||||||
|
# we are in a tag
|
||||||
|
# so find first ending tag
|
||||||
|
pend1 = textblock.find(b'/>', npos)
|
||||||
|
pend2 = textblock.find(b'</', npos)
|
||||||
|
if pend1 != -1 and pend2 != -1:
|
||||||
|
pend = min(pend1, pend2)
|
||||||
|
else:
|
||||||
|
pend = max(pend1, pend2)
|
||||||
|
if pend != -1:
|
||||||
|
npos = pend
|
||||||
|
else:
|
||||||
|
npos = pgt + 1
|
||||||
|
# find id and name attributes only inside of tags
|
||||||
|
# use a reverse tag search since that is faster
|
||||||
|
# inside any < > pair find "id=" and "name=" attributes return it
|
||||||
|
# [^>]* means match any amount of chars except for '>' char
|
||||||
|
# [^'"] match any amount of chars except for the quote character
|
||||||
|
# \s* means match any amount of whitespace
|
||||||
|
textblock = textblock[0:npos]
|
||||||
|
id_pattern = re.compile(br'''<[^>]*\sid\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
|
||||||
|
name_pattern = re.compile(br'''<[^>]*\sname\s*=\s*['"]([^'"]*)['"]''',re.IGNORECASE)
|
||||||
|
for tag in reverse_tag_iter(textblock):
|
||||||
|
# any ids in the body should default to top of file
|
||||||
|
if tag[0:6] == b'<body ':
|
||||||
|
return b''
|
||||||
|
if tag[0:6] != b'<meta ':
|
||||||
|
m = id_pattern.match(tag) or name_pattern.match(tag)
|
||||||
|
if m is not None:
|
||||||
|
return m.group(1)
|
||||||
|
return b''
|
268
KindleUnpack/mobi_k8resc.py
Normal file
@@ -0,0 +1,268 @@
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
DEBUG_USE_ORDERED_DICTIONARY = False # OrderedDict is supported on python >= 2.7.
|
||||||
|
""" set to True to use OrderedDict for K8RESCProcessor.parsetag.tattr."""
|
||||||
|
|
||||||
|
if DEBUG_USE_ORDERED_DICTIONARY:
|
||||||
|
from collections import OrderedDict as dict_
|
||||||
|
else:
|
||||||
|
dict_ = dict
|
||||||
|
|
||||||
|
from .compatibility_utils import unicode_str
|
||||||
|
|
||||||
|
from .mobi_utils import fromBase32
|
||||||
|
|
||||||
|
_OPF_PARENT_TAGS = ['xml', 'package', 'metadata', 'dc-metadata',
|
||||||
|
'x-metadata', 'manifest', 'spine', 'tours', 'guide']
|
||||||
|
|
||||||
|
class K8RESCProcessor(object):
|
||||||
|
|
||||||
|
def __init__(self, data, debug=False):
|
||||||
|
self._debug = debug
|
||||||
|
self.resc = None
|
||||||
|
self.opos = 0
|
||||||
|
self.extrameta = []
|
||||||
|
self.cover_name = None
|
||||||
|
self.spine_idrefs = {}
|
||||||
|
self.spine_order = []
|
||||||
|
self.spine_pageattributes = {}
|
||||||
|
self.spine_ppd = None
|
||||||
|
# need3 indicates the book has fields which require epub3,
# but estimating the source epub version from those fields is difficult.
|
||||||
|
self.need3 = False
|
||||||
|
self.package_ver = None
|
||||||
|
self.extra_metadata = []
|
||||||
|
self.refines_metadata = []
|
||||||
|
self.extra_attributes = []
|
||||||
|
# get header
|
||||||
|
start_pos = data.find(b'<')
|
||||||
|
self.resc_header = data[:start_pos]
|
||||||
|
# get resc data length
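# (the bytes before the first '<' appear to look like b'...=SIZE&...' where SIZE is a
#  base32-encoded length of the XML payload - an assumption based on the parsing below)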
|
||||||
|
start = self.resc_header.find(b'=') + 1
|
||||||
|
end = self.resc_header.find(b'&', start)
|
||||||
|
resc_size = 0
|
||||||
|
if end > 0:
|
||||||
|
resc_size = fromBase32(self.resc_header[start:end])
|
||||||
|
resc_rawbytes = len(data) - start_pos
|
||||||
|
if resc_rawbytes == resc_size:
|
||||||
|
self.resc_length = resc_size
|
||||||
|
else:
|
||||||
|
# Most RESC sections have a nul terminator at the end, but some do not.
|
||||||
|
end_pos = data.find(b'\x00', start_pos)
|
||||||
|
if end_pos < 0:
|
||||||
|
self.resc_length = resc_rawbytes
|
||||||
|
else:
|
||||||
|
self.resc_length = end_pos - start_pos
|
||||||
|
if self.resc_length != resc_size:
|
||||||
|
print("Warning: RESC section length({:d}bytes) does not match its size({:d}bytes).".format(self.resc_length, resc_size))
|
||||||
|
# now parse RESC after converting it to unicode from utf-8
|
||||||
|
self.resc = unicode_str(data[start_pos:start_pos+self.resc_length])
|
||||||
|
self.parseData()
|
||||||
|
|
||||||
|
def prepend_to_spine(self, key, idref, linear, properties):
|
||||||
|
self.spine_order = [key] + self.spine_order
|
||||||
|
self.spine_idrefs[key] = idref
|
||||||
|
attributes = {}
|
||||||
|
if linear is not None:
|
||||||
|
attributes['linear'] = linear
|
||||||
|
if properties is not None:
|
||||||
|
attributes['properties'] = properties
|
||||||
|
self.spine_pageattributes[key] = attributes
|
||||||
|
|
||||||
|
# RESC tag iterator
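# yields (prefix, tname, tattr, tcontent) tuples, where prefix is the dot-joined chain
# of currently open parent tags (e.g. 'xml.package.spine.'), so parseData can tell
# where each itemref or metadata tag sits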
|
||||||
|
def resc_tag_iter(self):
|
||||||
|
tcontent = last_tattr = None
|
||||||
|
prefix = ['']
|
||||||
|
while True:
|
||||||
|
text, tag = self.parseresc()
|
||||||
|
if text is None and tag is None:
|
||||||
|
break
|
||||||
|
if text is not None:
|
||||||
|
tcontent = text.rstrip(' \r\n')
|
||||||
|
else: # we have a tag
|
||||||
|
ttype, tname, tattr = self.parsetag(tag)
|
||||||
|
if ttype == 'begin':
|
||||||
|
tcontent = None
|
||||||
|
prefix.append(tname + '.')
|
||||||
|
if tname in _OPF_PARENT_TAGS:
|
||||||
|
yield ''.join(prefix), tname, tattr, tcontent
|
||||||
|
else:
|
||||||
|
last_tattr = tattr
|
||||||
|
else: # single or end
|
||||||
|
if ttype == 'end':
|
||||||
|
prefix.pop()
|
||||||
|
tattr = last_tattr
|
||||||
|
last_tattr = None
|
||||||
|
if tname in _OPF_PARENT_TAGS:
|
||||||
|
tname += '-end'
|
||||||
|
yield ''.join(prefix), tname, tattr, tcontent
|
||||||
|
tcontent = None
|
||||||
|
|
||||||
|
# now parse the RESC to extract spine and extra metadata info
|
||||||
|
def parseData(self):
|
||||||
|
for prefix, tname, tattr, tcontent in self.resc_tag_iter():
|
||||||
|
if self._debug:
|
||||||
|
print(" Parsing RESC: ", prefix, tname, tattr, tcontent)
|
||||||
|
if tname == 'package':
|
||||||
|
self.package_ver = tattr.get('version', '2.0')
|
||||||
|
package_prefix = tattr.get('prefix','')
|
||||||
|
if self.package_ver.startswith('3') or package_prefix.startswith('rendition'):
|
||||||
|
self.need3 = True
|
||||||
|
if tname == 'spine':
|
||||||
|
self.spine_ppd = tattr.get('page-progression-direction', None)
|
||||||
|
if self.spine_ppd is not None and self.spine_ppd == 'rtl':
|
||||||
|
self.need3 = True
|
||||||
|
if tname == 'itemref':
|
||||||
|
skelid = tattr.pop('skelid', None)
|
||||||
|
if skelid is None and len(self.spine_order) == 0:
|
||||||
|
# assume it was the removed initial coverpage
|
||||||
|
skelid = 'coverpage'
|
||||||
|
tattr['linear'] = 'no'
|
||||||
|
self.spine_order.append(skelid)
|
||||||
|
idref = tattr.pop('idref', None)
|
||||||
|
if idref is not None:
|
||||||
|
idref = 'x_' + idref
|
||||||
|
self.spine_idrefs[skelid] = idref
|
||||||
|
if 'id' in tattr:
|
||||||
|
del tattr['id']
|
||||||
|
# tattr["id"] = 'x_' + tattr["id"]
|
||||||
|
if 'properties' in tattr:
|
||||||
|
self.need3 = True
|
||||||
|
self.spine_pageattributes[skelid] = tattr
|
||||||
|
if tname == 'meta' or tname.startswith('dc:'):
|
||||||
|
if 'refines' in tattr or 'property' in tattr:
|
||||||
|
self.need3 = True
|
||||||
|
if tattr.get('name','') == 'cover':
|
||||||
|
cover_name = tattr.get('content',None)
|
||||||
|
if cover_name is not None:
|
||||||
|
cover_name = 'x_' + cover_name
|
||||||
|
self.cover_name = cover_name
|
||||||
|
else:
|
||||||
|
self.extrameta.append([tname, tattr, tcontent])
|
||||||
|
|
||||||
|
# parse and return either leading text or the next tag
|
||||||
|
def parseresc(self):
|
||||||
|
p = self.opos
|
||||||
|
if p >= len(self.resc):
|
||||||
|
return None, None
|
||||||
|
if self.resc[p] != '<':
|
||||||
|
res = self.resc.find('<',p)
|
||||||
|
if res == -1 :
|
||||||
|
res = len(self.resc)
|
||||||
|
self.opos = res
|
||||||
|
return self.resc[p:res], None
|
||||||
|
# handle comment as a special case
|
||||||
|
if self.resc[p:p+4] == '<!--':
|
||||||
|
te = self.resc.find('-->',p+1)
|
||||||
|
if te != -1:
|
||||||
|
te = te+2
|
||||||
|
else:
|
||||||
|
te = self.resc.find('>',p+1)
|
||||||
|
ntb = self.resc.find('<',p+1)
|
||||||
|
if ntb != -1 and ntb < te:
|
||||||
|
self.opos = ntb
|
||||||
|
return self.resc[p:ntb], None
|
||||||
|
self.opos = te + 1
|
||||||
|
return None, self.resc[p:te+1]
|
||||||
|
|
||||||
|
# parse a tag and return (ttype, tname, tattr)
# ttype: tag type ('begin', 'end' or 'single')
# tname: tag name
# tattr: dictionary of tag attributes
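# e.g. parsetag('<itemref idref="item1" linear="no"/>') returns
#      ('single', 'itemref', {'idref': 'item1', 'linear': 'no'})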
|
||||||
|
def parsetag(self, s):
|
||||||
|
p = 1
|
||||||
|
tname = None
|
||||||
|
ttype = None
|
||||||
|
tattr = dict_()
|
||||||
|
while s[p:p+1] == ' ' :
|
||||||
|
p += 1
|
||||||
|
if s[p:p+1] == '/':
|
||||||
|
ttype = 'end'
|
||||||
|
p += 1
|
||||||
|
while s[p:p+1] == ' ' :
|
||||||
|
p += 1
|
||||||
|
b = p
|
||||||
|
while s[p:p+1] not in ('>', '/', ' ', '"', "'",'\r','\n') :
|
||||||
|
p += 1
|
||||||
|
tname=s[b:p].lower()
|
||||||
|
# some special cases
|
||||||
|
if tname == '?xml':
|
||||||
|
tname = 'xml'
|
||||||
|
if tname == '!--':
|
||||||
|
ttype = 'single'
|
||||||
|
comment = s[p:-3].strip()
|
||||||
|
tattr['comment'] = comment
|
||||||
|
if ttype is None:
|
||||||
|
# parse any attributes of begin or single tags
|
||||||
|
while s.find('=',p) != -1 :
|
||||||
|
while s[p:p+1] == ' ' :
|
||||||
|
p += 1
|
||||||
|
b = p
|
||||||
|
while s[p:p+1] != '=' :
|
||||||
|
p += 1
|
||||||
|
aname = s[b:p].lower()
|
||||||
|
aname = aname.rstrip(' ')
|
||||||
|
p += 1
|
||||||
|
while s[p:p+1] == ' ' :
|
||||||
|
p += 1
|
||||||
|
if s[p:p+1] in ('"', "'") :
|
||||||
|
p = p + 1
|
||||||
|
b = p
|
||||||
|
while s[p:p+1] not in ('"', "'"):
|
||||||
|
p += 1
|
||||||
|
val = s[b:p]
|
||||||
|
p += 1
|
||||||
|
else :
|
||||||
|
b = p
|
||||||
|
while s[p:p+1] not in ('>', '/', ' ') :
|
||||||
|
p += 1
|
||||||
|
val = s[b:p]
|
||||||
|
tattr[aname] = val
|
||||||
|
if ttype is None:
|
||||||
|
ttype = 'begin'
|
||||||
|
if s.find('/',p) >= 0:
|
||||||
|
ttype = 'single'
|
||||||
|
return ttype, tname, tattr
|
||||||
|
|
||||||
|
def taginfo_toxml(self, taginfo):
|
||||||
|
res = []
|
||||||
|
tname, tattr, tcontent = taginfo
|
||||||
|
res.append('<' + tname)
|
||||||
|
if tattr is not None:
|
||||||
|
for key in tattr:
|
||||||
|
res.append(' ' + key + '="'+tattr[key]+'"')
|
||||||
|
if tcontent is not None:
|
||||||
|
res.append('>' + tcontent + '</' + tname + '>\n')
|
||||||
|
else:
|
||||||
|
res.append('/>\n')
|
||||||
|
return "".join(res)
|
||||||
|
|
||||||
|
def hasSpine(self):
|
||||||
|
return len(self.spine_order) > 0
|
||||||
|
|
||||||
|
def needEPUB3(self):
|
||||||
|
return self.need3
|
||||||
|
|
||||||
|
def hasRefines(self):
|
||||||
|
for [tname, tattr, tcontent] in self.extrameta:
|
||||||
|
if 'refines' in tattr:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def createMetadata(self, epubver):
|
||||||
|
for taginfo in self.extrameta:
|
||||||
|
tname, tattr, tcontent = taginfo
|
||||||
|
if 'refines' in tattr:
|
||||||
|
if epubver == 'F' and 'property' in tattr:
|
||||||
|
attr = ' id="%s" opf:%s="%s"\n' % (tattr['refines'], tattr['property'], tcontent)
|
||||||
|
self.extra_attributes.append(attr)
|
||||||
|
else:
|
||||||
|
tag = self.taginfo_toxml(taginfo)
|
||||||
|
self.refines_metadata.append(tag)
|
||||||
|
else:
|
||||||
|
tag = self.taginfo_toxml(taginfo)
|
||||||
|
self.extra_metadata.append(tag)
|
186
KindleUnpack/mobi_nav.py
Normal file
@@ -0,0 +1,186 @@
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import unicode_str
|
||||||
|
import os
|
||||||
|
from .unipath import pathof
|
||||||
|
|
||||||
|
import re
|
||||||
|
# note: re requires the pattern to be the exact same type as the data to be searched in python3
# but u"" is not allowed for the pattern itself, only b""
|
||||||
|
|
||||||
|
DEBUG_NAV = False
|
||||||
|
|
||||||
|
FORCE_DEFAULT_TITLE = False
|
||||||
|
""" Set to True to force to use the default title. """
|
||||||
|
|
||||||
|
NAVIGATION_FINENAME = 'nav.xhtml'
|
||||||
|
""" The name for the navigation document. """
|
||||||
|
|
||||||
|
DEFAULT_TITLE = 'Navigation'
|
||||||
|
""" The default title for the navigation document. """
|
||||||
|
|
||||||
|
class NAVProcessor(object):
|
||||||
|
|
||||||
|
def __init__(self, files):
|
||||||
|
self.files = files
|
||||||
|
self.navname = NAVIGATION_FINENAME
|
||||||
|
|
||||||
|
def buildLandmarks(self, guidetext):
|
||||||
|
header = ''
|
||||||
|
header += ' <nav epub:type="landmarks" id="landmarks" hidden="">\n'
|
||||||
|
header += ' <h2>Guide</h2>\n'
|
||||||
|
header += ' <ol>\n'
|
||||||
|
element = ' <li><a epub:type="{:s}" href="{:s}">{:s}</a></li>\n'
|
||||||
|
footer = ''
|
||||||
|
footer += ' </ol>\n'
|
||||||
|
footer += ' </nav>\n'
|
||||||
|
|
||||||
|
type_map = {
|
||||||
|
'cover' : 'cover',
|
||||||
|
'title-page' : 'title-page',
|
||||||
|
# ?: 'frontmatter',
|
||||||
|
'text' : 'bodymatter',
|
||||||
|
# ?: 'backmatter',
|
||||||
|
'toc' : 'toc',
|
||||||
|
'loi' : 'loi',
|
||||||
|
'lot' : 'lot',
|
||||||
|
'preface' : 'preface',
|
||||||
|
'bibliography' : 'bibliography',
|
||||||
|
'index' : 'index',
|
||||||
|
'glossary' : 'glossary',
|
||||||
|
'acknowledgements' : 'acknowledgements',
|
||||||
|
'colophon' : None,
|
||||||
|
'copyright-page' : None,
|
||||||
|
'dedication' : None,
|
||||||
|
'epigraph' : None,
|
||||||
|
'foreword' : None,
|
||||||
|
'notes' : None
|
||||||
|
}
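# e.g. a guide entry <reference type="toc" title="Table of Contents"
# href="Text/part0000.xhtml#toc"/> becomes
# <li><a epub:type="toc" href="part0000.xhtml#toc">Table of Contents</a></li>
# (illustrative filenames; the href is re-rooted relative to the Text directory below)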
|
||||||
|
|
||||||
|
re_type = re.compile(r'\s+type\s*=\s*"(.*?)"', re.I)
|
||||||
|
re_title = re.compile(r'\s+title\s*=\s*"(.*?)"', re.I)
|
||||||
|
re_link = re.compile(r'\s+href\s*=\s*"(.*?)"', re.I)
|
||||||
|
dir_ = os.path.relpath(self.files.k8text, self.files.k8oebps).replace('\\', '/')
|
||||||
|
|
||||||
|
data = ''
|
||||||
|
references = re.findall(r'<reference\s+.*?>', unicode_str(guidetext), re.I)
|
||||||
|
for reference in references:
|
||||||
|
mo_type = re_type.search(reference)
|
||||||
|
mo_title = re_title.search(reference)
|
||||||
|
mo_link = re_link.search(reference)
|
||||||
|
if mo_type is not None:
|
||||||
|
type_ = type_map.get(mo_type.group(1), None)
|
||||||
|
else:
|
||||||
|
type_ = None
|
||||||
|
if mo_title is not None:
|
||||||
|
title = mo_title.group(1)
|
||||||
|
else:
|
||||||
|
title = None
|
||||||
|
if mo_link is not None:
|
||||||
|
link = mo_link.group(1)
|
||||||
|
else:
|
||||||
|
link = None
|
||||||
|
|
||||||
|
if type_ is not None and title is not None and link is not None:
|
||||||
|
link = os.path.relpath(link, dir_).replace('\\', '/')
|
||||||
|
data += element.format(type_, link, title)
|
||||||
|
if len(data) > 0:
|
||||||
|
return header + data + footer
|
||||||
|
else:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def buildTOC(self, indx_data):
|
||||||
|
header = ''
|
||||||
|
header += ' <nav epub:type="toc" id="toc">\n'
|
||||||
|
header += ' <h1>Table of contents</h1>\n'
|
||||||
|
footer = ' </nav>\n'
|
||||||
|
|
||||||
|
# recursive part
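# recursINDX emits one <ol> per heading level: entries whose 'hlvl' matches the
# current level become <li> items, and an entry with child indices (child1/childn)
# recurses one level deeper to build the nested list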
|
||||||
|
def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
|
||||||
|
if start>len(indx_data) or end>len(indx_data):
|
||||||
|
print("Warning (in buildTOC): missing INDX child entries", start, end, len(indx_data))
|
||||||
|
return ''
|
||||||
|
if DEBUG_NAV:
|
||||||
|
print("recursINDX (in buildTOC) lvl %d from %d to %d" % (lvl, start, end))
|
||||||
|
xhtml = ''
|
||||||
|
if start <= 0:
|
||||||
|
start = 0
|
||||||
|
if end <= 0:
|
||||||
|
end = len(indx_data)
|
||||||
|
if lvl > max_lvl:
|
||||||
|
max_lvl = lvl
|
||||||
|
|
||||||
|
indent1 = ' ' * (2 + lvl * 2)
|
||||||
|
indent2 = ' ' * (3 + lvl * 2)
|
||||||
|
xhtml += indent1 + '<ol>\n'
|
||||||
|
for i in range(start, end):
|
||||||
|
e = indx_data[i]
|
||||||
|
htmlfile = e['filename']
|
||||||
|
desttag = e['idtag']
|
||||||
|
text = e['text']
|
||||||
|
if not e['hlvl'] == lvl:
|
||||||
|
continue
|
||||||
|
num += 1
|
||||||
|
if desttag == '':
|
||||||
|
link = htmlfile
|
||||||
|
else:
|
||||||
|
link = '{:s}#{:s}'.format(htmlfile, desttag)
|
||||||
|
xhtml += indent2 + '<li>'
|
||||||
|
entry = '<a href="{:}">{:s}</a>'.format(link, text)
|
||||||
|
xhtml += entry
|
||||||
|
# recurse into children
|
||||||
|
if e['child1'] >= 0:
|
||||||
|
xhtml += '\n'
|
||||||
|
xhtmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
|
||||||
|
e['child1'], e['childn'] + 1)
|
||||||
|
xhtml += xhtmlrec
|
||||||
|
xhtml += indent2
|
||||||
|
# close entry
|
||||||
|
xhtml += '</li>\n'
|
||||||
|
xhtml += indent1 + '</ol>\n'
|
||||||
|
return xhtml, max_lvl, num
|
||||||
|
|
||||||
|
data, max_lvl, num = recursINDX()
|
||||||
|
if not len(indx_data) == num:
|
||||||
|
print("Warning (in buildTOC): different number of entries in NCX", len(indx_data), num)
|
||||||
|
return header + data + footer
|
||||||
|
|
||||||
|
def buildNAV(self, ncx_data, guidetext, title, lang):
|
||||||
|
print("Building Navigation Document.")
|
||||||
|
if FORCE_DEFAULT_TITLE:
|
||||||
|
title = DEFAULT_TITLE
|
||||||
|
nav_header = ''
|
||||||
|
nav_header += '<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE html>'
|
||||||
|
nav_header += '<html xmlns="http://www.w3.org/1999/xhtml"'
|
||||||
|
nav_header += ' xmlns:epub="http://www.idpf.org/2007/ops"'
|
||||||
|
nav_header += ' lang="{0:s}" xml:lang="{0:s}">\n'.format(lang)
|
||||||
|
nav_header += '<head>\n<title>{:s}</title>\n'.format(title)
|
||||||
|
nav_header += '<meta charset="UTF-8" />\n'
|
||||||
|
nav_header += '<style type="text/css">\n'
|
||||||
|
nav_header += 'nav#landmarks { display:none; }\n'
|
||||||
|
nav_header += '</style>\n</head>\n<body>\n'
|
||||||
|
nav_footer = '</body>\n</html>\n'
|
||||||
|
|
||||||
|
landmarks = self.buildLandmarks(guidetext)
|
||||||
|
toc = self.buildTOC(ncx_data)
|
||||||
|
|
||||||
|
data = nav_header
|
||||||
|
data += landmarks
|
||||||
|
data += toc
|
||||||
|
data += nav_footer
|
||||||
|
return data
|
||||||
|
|
||||||
|
def getNAVName(self):
|
||||||
|
return self.navname
|
||||||
|
|
||||||
|
def writeNAV(self, ncx_data, guidetext, metadata):
|
||||||
|
# build the xhtml
|
||||||
|
# print("Write Navigation Document.")
|
||||||
|
xhtml = self.buildNAV(ncx_data, guidetext, metadata.get('Title')[0], metadata.get('Language')[0])
|
||||||
|
fname = os.path.join(self.files.k8text, self.navname)
|
||||||
|
with open(pathof(fname), 'wb') as f:
|
||||||
|
f.write(xhtml.encode('utf-8'))
|
272
KindleUnpack/mobi_ncx.py
Normal file
@@ -0,0 +1,272 @@
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
import os
|
||||||
|
from .unipath import pathof
|
||||||
|
|
||||||
|
|
||||||
|
import re
|
||||||
|
# note: re requires the pattern to be the exact same type as the data to be searched in python3
# but u"" is not allowed for the pattern itself, only b""
|
||||||
|
|
||||||
|
from .mobi_utils import toBase32
|
||||||
|
from .mobi_index import MobiIndex
|
||||||
|
|
||||||
|
DEBUG_NCX = False
|
||||||
|
|
||||||
|
class ncxExtract:
|
||||||
|
|
||||||
|
def __init__(self, mh, files):
|
||||||
|
self.mh = mh
|
||||||
|
self.sect = self.mh.sect
|
||||||
|
self.files = files
|
||||||
|
self.isNCX = False
|
||||||
|
self.mi = MobiIndex(self.sect)
|
||||||
|
self.ncxidx = self.mh.ncxidx
|
||||||
|
self.indx_data = None
|
||||||
|
|
||||||
|
def parseNCX(self):
|
||||||
|
indx_data = []
|
||||||
|
tag_fieldname_map = {
|
||||||
|
1: ['pos',0],
|
||||||
|
2: ['len',0],
|
||||||
|
3: ['noffs',0],
|
||||||
|
4: ['hlvl',0],
|
||||||
|
5: ['koffs',0],
|
||||||
|
6: ['pos_fid',0],
|
||||||
|
21: ['parent',0],
|
||||||
|
22: ['child1',0],
|
||||||
|
23: ['childn',0]
|
||||||
|
}
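# (the numeric keys above are MOBI INDX tag ids: e.g. tag 3 is the ctoc offset of the
#  entry's display text, tag 5 the ctoc offset of its kind, tag 6 the kindle:pos:fid
#  target, and tags 21-23 the parent/first-child/last-child links used for nesting)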
|
||||||
|
if self.ncxidx != 0xffffffff:
|
||||||
|
outtbl, ctoc_text = self.mi.getIndexData(self.ncxidx, "NCX")
|
||||||
|
if DEBUG_NCX:
|
||||||
|
print(ctoc_text)
|
||||||
|
print(outtbl)
|
||||||
|
num = 0
|
||||||
|
for [text, tagMap] in outtbl:
|
||||||
|
tmp = {
|
||||||
|
'name': text.decode('utf-8'),
|
||||||
|
'pos': -1,
|
||||||
|
'len': 0,
|
||||||
|
'noffs': -1,
|
||||||
|
'text' : "Unknown Text",
|
||||||
|
'hlvl' : -1,
|
||||||
|
'kind' : "Unknown Kind",
|
||||||
|
'pos_fid' : None,
|
||||||
|
'parent' : -1,
|
||||||
|
'child1' : -1,
|
||||||
|
'childn' : -1,
|
||||||
|
'num' : num
|
||||||
|
}
|
||||||
|
for tag in tag_fieldname_map:
|
||||||
|
[fieldname, i] = tag_fieldname_map[tag]
|
||||||
|
if tag in tagMap:
|
||||||
|
fieldvalue = tagMap[tag][i]
|
||||||
|
if tag == 6:
|
||||||
|
pos_fid = toBase32(fieldvalue,4).decode('utf-8')
|
||||||
|
fieldvalue2 = tagMap[tag][i+1]
|
||||||
|
pos_off = toBase32(fieldvalue2,10).decode('utf-8')
|
||||||
|
fieldvalue = 'kindle:pos:fid:%s:off:%s' % (pos_fid, pos_off)
|
||||||
|
tmp[fieldname] = fieldvalue
|
||||||
|
if tag == 3:
|
||||||
|
toctext = ctoc_text.get(fieldvalue, 'Unknown Text')
|
||||||
|
toctext = toctext.decode(self.mh.codec)
|
||||||
|
tmp['text'] = toctext
|
||||||
|
if tag == 5:
|
||||||
|
kindtext = ctoc_text.get(fieldvalue, 'Unknown Kind')
|
||||||
|
kindtext = kindtext.decode(self.mh.codec)
|
||||||
|
tmp['kind'] = kindtext
|
||||||
|
indx_data.append(tmp)
|
||||||
|
if DEBUG_NCX:
|
||||||
|
print("record number: ", num)
|
||||||
|
print("name: ", tmp['name'],)
|
||||||
|
print("position", tmp['pos']," length: ", tmp['len'])
|
||||||
|
print("text: ", tmp['text'])
|
||||||
|
print("kind: ", tmp['kind'])
|
||||||
|
print("heading level: ", tmp['hlvl'])
|
||||||
|
print("parent:", tmp['parent'])
|
||||||
|
print("first child: ",tmp['child1']," last child: ", tmp['childn'])
|
||||||
|
print("pos_fid is ", tmp['pos_fid'])
|
||||||
|
print("\n\n")
|
||||||
|
num += 1
|
||||||
|
self.indx_data = indx_data
|
||||||
|
return indx_data
|
||||||
|
|
||||||
|
def buildNCX(self, htmlfile, title, ident, lang):
|
||||||
|
indx_data = self.indx_data
|
||||||
|
|
||||||
|
ncx_header = \
|
||||||
|
'''<?xml version='1.0' encoding='utf-8'?>
|
||||||
|
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="%s">
|
||||||
|
<head>
|
||||||
|
<meta content="%s" name="dtb:uid"/>
|
||||||
|
<meta content="%d" name="dtb:depth"/>
|
||||||
|
<meta content="mobiunpack.py" name="dtb:generator"/>
|
||||||
|
<meta content="0" name="dtb:totalPageCount"/>
|
||||||
|
<meta content="0" name="dtb:maxPageNumber"/>
|
||||||
|
</head>
|
||||||
|
<docTitle>
|
||||||
|
<text>%s</text>
|
||||||
|
</docTitle>
|
||||||
|
<navMap>
|
||||||
|
'''
|
||||||
|
|
||||||
|
ncx_footer = \
|
||||||
|
''' </navMap>
|
||||||
|
</ncx>
|
||||||
|
'''
|
||||||
|
|
||||||
|
ncx_entry = \
|
||||||
|
'''<navPoint id="%s" playOrder="%d">
|
||||||
|
<navLabel>
|
||||||
|
<text>%s</text>
|
||||||
|
</navLabel>
|
||||||
|
<content src="%s"/>'''
|
||||||
|
|
||||||
|
# recursive part
|
||||||
|
def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
|
||||||
|
if start>len(indx_data) or end>len(indx_data):
|
||||||
|
print("Warning: missing INDX child entries", start, end, len(indx_data))
|
||||||
|
return ''
|
||||||
|
if DEBUG_NCX:
|
||||||
|
print("recursINDX lvl %d from %d to %d" % (lvl, start, end))
|
||||||
|
xml = ''
|
||||||
|
if start <= 0:
|
||||||
|
start = 0
|
||||||
|
if end <= 0:
|
||||||
|
end = len(indx_data)
|
||||||
|
if lvl > max_lvl:
|
||||||
|
max_lvl = lvl
|
||||||
|
indent = ' ' * (2 + lvl)
|
||||||
|
|
||||||
|
for i in range(start, end):
|
||||||
|
e = indx_data[i]
|
||||||
|
if not e['hlvl'] == lvl:
|
||||||
|
continue
|
||||||
|
# open entry
|
||||||
|
num += 1
|
||||||
|
link = '%s#filepos%d' % (htmlfile, e['pos'])
|
||||||
|
tagid = 'np_%d' % num
|
||||||
|
entry = ncx_entry % (tagid, num, e['text'], link)
|
||||||
|
entry = re.sub(re.compile('^', re.M), indent, entry, 0)
|
||||||
|
xml += entry + '\n'
|
||||||
|
# recurse into children
|
||||||
|
if e['child1']>=0:
|
||||||
|
xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
|
||||||
|
e['child1'], e['childn'] + 1)
|
||||||
|
xml += xmlrec
|
||||||
|
# close entry
|
||||||
|
xml += indent + '</navPoint>\n'
|
||||||
|
return xml, max_lvl, num
|
||||||
|
|
||||||
|
body, max_lvl, num = recursINDX()
|
||||||
|
header = ncx_header % (lang, ident, max_lvl + 1, title)
|
||||||
|
ncx = header + body + ncx_footer
|
||||||
|
if not len(indx_data) == num:
|
||||||
|
print("Warning: different number of entries in NCX", len(indx_data), num)
|
||||||
|
return ncx
|
||||||
|
|
||||||
|
def writeNCX(self, metadata):
|
||||||
|
# build the xml
|
||||||
|
self.isNCX = True
|
||||||
|
print("Write ncx")
|
||||||
|
# htmlname = os.path.basename(self.files.outbase)
|
||||||
|
# htmlname += '.html'
|
||||||
|
htmlname = 'book.html'
|
||||||
|
xml = self.buildNCX(htmlname, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0])
|
||||||
|
# write the ncx file
|
||||||
|
# ncxname = os.path.join(self.files.mobi7dir, self.files.getInputFileBasename() + '.ncx')
|
||||||
|
ncxname = os.path.join(self.files.mobi7dir, 'toc.ncx')
|
||||||
|
with open(pathof(ncxname), 'wb') as f:
|
||||||
|
f.write(xml.encode('utf-8'))
|
||||||
|
|
||||||
|
def buildK8NCX(self, indx_data, title, ident, lang):
|
||||||
|
ncx_header = \
|
||||||
|
'''<?xml version='1.0' encoding='utf-8'?>
|
||||||
|
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="%s">
|
||||||
|
<head>
|
||||||
|
<meta content="%s" name="dtb:uid"/>
|
||||||
|
<meta content="%d" name="dtb:depth"/>
|
||||||
|
<meta content="mobiunpack.py" name="dtb:generator"/>
|
||||||
|
<meta content="0" name="dtb:totalPageCount"/>
|
||||||
|
<meta content="0" name="dtb:maxPageNumber"/>
|
||||||
|
</head>
|
||||||
|
<docTitle>
|
||||||
|
<text>%s</text>
|
||||||
|
</docTitle>
|
||||||
|
<navMap>
|
||||||
|
'''
|
||||||
|
|
||||||
|
ncx_footer = \
|
||||||
|
''' </navMap>
|
||||||
|
</ncx>
|
||||||
|
'''
|
||||||
|
|
||||||
|
ncx_entry = \
|
||||||
|
'''<navPoint id="%s" playOrder="%d">
|
||||||
|
<navLabel>
|
||||||
|
<text>%s</text>
|
||||||
|
</navLabel>
|
||||||
|
<content src="%s"/>'''
|
||||||
|
|
||||||
|
# recursive part
|
||||||
|
def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
|
||||||
|
if start>len(indx_data) or end>len(indx_data):
|
||||||
|
print("Warning: missing INDX child entries", start, end, len(indx_data))
|
||||||
|
return ''
|
||||||
|
if DEBUG_NCX:
|
||||||
|
print("recursINDX lvl %d from %d to %d" % (lvl, start, end))
|
||||||
|
xml = ''
|
||||||
|
if start <= 0:
|
||||||
|
start = 0
|
||||||
|
if end <= 0:
|
||||||
|
end = len(indx_data)
|
||||||
|
if lvl > max_lvl:
|
||||||
|
max_lvl = lvl
|
||||||
|
indent = ' ' * (2 + lvl)
|
||||||
|
|
||||||
|
for i in range(start, end):
|
||||||
|
e = indx_data[i]
|
||||||
|
htmlfile = e['filename']
|
||||||
|
desttag = e['idtag']
|
||||||
|
if not e['hlvl'] == lvl:
|
||||||
|
continue
|
||||||
|
# open entry
|
||||||
|
num += 1
|
||||||
|
if desttag == '':
|
||||||
|
link = 'Text/%s' % htmlfile
|
||||||
|
else:
|
||||||
|
link = 'Text/%s#%s' % (htmlfile, desttag)
|
||||||
|
tagid = 'np_%d' % num
|
||||||
|
entry = ncx_entry % (tagid, num, e['text'], link)
|
||||||
|
entry = re.sub(re.compile('^', re.M), indent, entry, 0)
|
||||||
|
xml += entry + '\n'
|
||||||
|
# recurse into children
|
||||||
|
if e['child1']>=0:
|
||||||
|
xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
|
||||||
|
e['child1'], e['childn'] + 1)
|
||||||
|
xml += xmlrec
|
||||||
|
# close entry
|
||||||
|
xml += indent + '</navPoint>\n'
|
||||||
|
return xml, max_lvl, num
|
||||||
|
|
||||||
|
body, max_lvl, num = recursINDX()
|
||||||
|
header = ncx_header % (lang, ident, max_lvl + 1, title)
|
||||||
|
ncx = header + body + ncx_footer
|
||||||
|
if not len(indx_data) == num:
|
||||||
|
print("Warning: different number of entries in NCX", len(indx_data), num)
|
||||||
|
return ncx
|
||||||
|
|
||||||
|
def writeK8NCX(self, ncx_data, metadata):
|
||||||
|
# build the xml
|
||||||
|
self.isNCX = True
|
||||||
|
print("Write K8 ncx")
|
||||||
|
xml = self.buildK8NCX(ncx_data, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0])
|
||||||
|
bname = 'toc.ncx'
|
||||||
|
ncxname = os.path.join(self.files.k8oebps,bname)
|
||||||
|
with open(pathof(ncxname), 'wb') as f:
|
||||||
|
f.write(xml.encode('utf-8'))
|
681
KindleUnpack/mobi_opf.py
Normal file
@@ -0,0 +1,681 @@
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import unicode_str, unescapeit
|
||||||
|
from .compatibility_utils import lzip
|
||||||
|
|
||||||
|
from .unipath import pathof
|
||||||
|
|
||||||
|
from xml.sax.saxutils import escape as xmlescape
|
||||||
|
|
||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# In EPUB3, NCX and <guide> MAY exist in OPF, although the NCX is superseded
|
||||||
|
# by the Navigation Document and the <guide> is deprecated. Currently, EPUB3_WITH_NCX
|
||||||
|
# and EPUB3_WITH_GUIDE are set to True due to compatibility with epub2 reading systems.
|
||||||
|
# They might be changed to default to False in the future.
|
||||||
|
|
||||||
|
EPUB3_WITH_NCX = True # Do not set to False except for debug.
|
||||||
|
""" Set to True to create a toc.ncx when converting to epub3. """
|
||||||
|
|
||||||
|
EPUB3_WITH_GUIDE = True # Do not set to False except for debug.
|
||||||
|
""" Set to True to create a guide element in an opf when converting to epub3. """
|
||||||
|
|
||||||
|
EPUB_OPF = 'content.opf'
|
||||||
|
""" The name for the OPF of EPUB. """
|
||||||
|
|
||||||
|
TOC_NCX = 'toc.ncx'
|
||||||
|
""" The name for the TOC of EPUB2. """
|
||||||
|
|
||||||
|
NAVIGATION_DOCUMENT = 'nav.xhtml'
|
||||||
|
""" The name for the navigation document of EPUB3. """
|
||||||
|
|
||||||
|
BEGIN_INFO_ONLY = '<!-- BEGIN INFORMATION ONLY '
|
||||||
|
""" The comment to indicate the beginning of metadata which will be ignored by kindlegen. """
|
||||||
|
|
||||||
|
END_INFO_ONLY = 'END INFORMATION ONLY -->'
|
||||||
|
""" The comment to indicate the end of metadata which will be ignored by kindlegen. """
|
||||||
|
|
||||||
|
EXTH_TITLE_FURIGANA = 'Title-Pronunciation'
|
||||||
|
""" The name for Title Furigana(similar to file-as) set by KDP. """
|
||||||
|
|
||||||
|
EXTH_CREATOR_FURIGANA = 'Author-Pronunciation'
|
||||||
|
""" The name for Creator Furigana(similar to file-as) set by KDP. """
|
||||||
|
|
||||||
|
EXTH_PUBLISHER_FURIGANA = 'Publisher-Pronunciation'
|
||||||
|
""" The name for Publisher Furigana(similar to file-as) set by KDP. """
|
||||||
|
|
||||||
|
EXTRA_ENTITIES = {'"': '"', "'": "'"}
|
||||||
|
|
||||||
|
class OPFProcessor(object):
|
||||||
|
|
||||||
|
def __init__(self, files, metadata, fileinfo, rscnames, hasNCX, mh, usedmap, pagemapxml='', guidetext='', k8resc=None, epubver='2'):
|
||||||
|
self.files = files
|
||||||
|
self.metadata = metadata
|
||||||
|
self.fileinfo = fileinfo
|
||||||
|
self.rscnames = rscnames
|
||||||
|
self.has_ncx = hasNCX
|
||||||
|
self.codec = mh.codec
|
||||||
|
self.isK8 = mh.isK8()
|
||||||
|
self.printReplica = mh.isPrintReplica()
|
||||||
|
self.guidetext = unicode_str(guidetext)
|
||||||
|
self.used = usedmap
|
||||||
|
self.k8resc = k8resc
|
||||||
|
self.covername = None
|
||||||
|
self.cover_id = 'cover_img'
|
||||||
|
if self.k8resc is not None and self.k8resc.cover_name is not None:
|
||||||
|
# update cover id info from RESC if available
|
||||||
|
self.cover_id = self.k8resc.cover_name
|
||||||
|
# Create a unique urn uuid
|
||||||
|
self.BookId = unicode_str(str(uuid.uuid4()))
|
||||||
|
self.pagemap = pagemapxml
|
||||||
|
|
||||||
|
self.ncxname = None
|
||||||
|
self.navname = None
|
||||||
|
|
||||||
|
# page-progression-direction is only set in spine
|
||||||
|
self.page_progression_direction = metadata.pop('page-progression-direction', [None])[0]
|
||||||
|
if 'rl' in metadata.get('primary-writing-mode', [''])[0]:
|
||||||
|
self.page_progression_direction = 'rtl'
|
||||||
|
self.epubver = epubver # the epub version set by user
|
||||||
|
self.target_epubver = epubver # the epub version set by the user or detected automatically
|
||||||
|
if self.epubver == 'A':
|
||||||
|
self.target_epubver = self.autodetectEPUBVersion()
|
||||||
|
elif self.epubver == 'F':
|
||||||
|
self.target_epubver = '2'
|
||||||
|
elif self.epubver != '2' and self.epubver != '3':
|
||||||
|
self.target_epubver = '2'
|
||||||
|
|
||||||
|
# ids for refines attributes
|
||||||
|
self.title_id = {}
|
||||||
|
self.creator_id = {}
|
||||||
|
self.publisher_id = {}
|
||||||
|
# extra attributes
|
||||||
|
self.title_attrib = {}
|
||||||
|
self.creator_attrib = {}
|
||||||
|
self.publisher_attrib = {}
|
||||||
|
self.extra_attributes = [] # for force epub2 option
|
||||||
|
# Create epub3 metadata from EXTH.
|
||||||
|
self.exth_solved_refines_metadata = []
|
||||||
|
self.exth_refines_metadata = []
|
||||||
|
self.exth_fixedlayout_metadata = []
|
||||||
|
|
||||||
|
self.defineRefinesID()
|
||||||
|
self.processRefinesMetadata()
|
||||||
|
if self.k8resc is not None:
|
||||||
|
# Create metadata in RESC section.
|
||||||
|
self.k8resc.createMetadata(epubver)
|
||||||
|
if self.target_epubver == "3":
|
||||||
|
self.createMetadataForFixedlayout()
|
||||||
|
|
||||||
|
def escapeit(self, sval, EXTRAS=None):
|
||||||
|
# note, xmlescape and unescape do not work with utf-8 bytestrings
|
||||||
|
sval = unicode_str(sval)
|
||||||
|
if EXTRAS:
|
||||||
|
res = xmlescape(unescapeit(sval), EXTRAS)
|
||||||
|
else:
|
||||||
|
res = xmlescape(unescapeit(sval))
|
||||||
|
return res
|
||||||
|
|
||||||
|
def createMetaTag(self, data, property, content, refid=''):
|
||||||
|
refines = ''
|
||||||
|
if refid:
|
||||||
|
refines = ' refines="#%s"' % refid
|
||||||
|
data.append('<meta property="%s"%s>%s</meta>\n' % (property, refines, content))
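# e.g. createMetaTag(data, 'dcterms:modified', '2014-01-01T00:00:00Z') appends
# '<meta property="dcterms:modified">2014-01-01T00:00:00Z</meta>\n'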
|
||||||
|
|
||||||
|
def buildOPFMetadata(self, start_tag, has_obfuscated_fonts=False):
|
||||||
|
# convert from EXTH metadata format to target epub version metadata
|
||||||
|
# epub 3 will ignore <meta name="xxxx" content="yyyy" /> style metatags
|
||||||
|
# but allows them to be present for backwards compatibility
|
||||||
|
# instead the new format is
|
||||||
|
# <meta property="xxxx" id="iiii" ... > property_value</meta>
|
||||||
|
# and DCMES elements such as:
|
||||||
|
# <dc:blah id="iiii">value</dc:blah>
|
||||||
|
|
||||||
|
metadata = self.metadata
|
||||||
|
k8resc = self.k8resc
|
||||||
|
|
||||||
|
META_TAGS = ['Drm Server Id', 'Drm Commerce Id', 'Drm Ebookbase Book Id', 'ASIN', 'ThumbOffset', 'Fake Cover',
|
||||||
|
'Creator Software', 'Creator Major Version', 'Creator Minor Version', 'Creator Build Number',
|
||||||
|
'Watermark', 'Clipping Limit', 'Publisher Limit', 'Text to Speech Disabled', 'CDE Type',
|
||||||
|
'Updated Title', 'Font Signature (hex)', 'Tamper Proof Keys (hex)',]
|
||||||
|
|
||||||
|
# def handleTag(data, metadata, key, tag, ids={}):
|
||||||
|
def handleTag(data, metadata, key, tag, attrib={}):
|
||||||
|
'''Format metadata values.
|
||||||
|
|
||||||
|
@param data: List of formatted metadata entries.
|
||||||
|
@param metadata: The metadata dictionary.
|
||||||
|
@param key: The key of the metadata value to handle.
|
||||||
|
@param tag: The opf tag corresponds to the metadata value.
|
||||||
|
###@param ids: The ids in tags for refines property of epub3.
|
||||||
|
@param attrib: The extra attributes for refines or opf prefixes.
|
||||||
|
'''
|
||||||
|
if key in metadata:
|
||||||
|
for i, value in enumerate(metadata[key]):
|
||||||
|
closingTag = tag.split(" ")[0]
|
||||||
|
res = '<%s%s>%s</%s>\n' % (tag, attrib.get(i, ''), self.escapeit(value), closingTag)
|
||||||
|
data.append(res)
|
||||||
|
del metadata[key]
|
||||||
|
|
||||||
|
# these are allowed but ignored by epub3
|
||||||
|
def handleMetaPairs(data, metadata, key, name):
|
||||||
|
if key in metadata:
|
||||||
|
for value in metadata[key]:
|
||||||
|
res = '<meta name="%s" content="%s" />\n' % (name, self.escapeit(value, EXTRA_ENTITIES))
|
||||||
|
data.append(res)
|
||||||
|
del metadata[key]
|
||||||
|
|
||||||
|
data = []
|
||||||
|
data.append(start_tag + '\n')
|
||||||
|
# Handle standard metadata
|
||||||
|
if 'Title' in metadata:
|
||||||
|
handleTag(data, metadata, 'Title', 'dc:title', self.title_attrib)
|
||||||
|
else:
|
||||||
|
data.append('<dc:title>Untitled</dc:title>\n')
|
||||||
|
handleTag(data, metadata, 'Language', 'dc:language')
|
||||||
|
if 'UniqueID' in metadata:
|
||||||
|
handleTag(data, metadata, 'UniqueID', 'dc:identifier id="uid"')
|
||||||
|
else:
|
||||||
|
# No unique ID in original, give it a generic one.
|
||||||
|
data.append('<dc:identifier id="uid">0</dc:identifier>\n')
|
||||||
|
|
||||||
|
if self.target_epubver == '3':
|
||||||
|
# epub version 3 minimal metadata requires a dcterms:modifed date tag
|
||||||
|
self.createMetaTag(data, 'dcterms:modified', datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"))
|
||||||
|
|
||||||
|
if self.isK8 and has_obfuscated_fonts:
|
||||||
|
# Use the randomly generated urn:uuid so obfuscated fonts work.
# It doesn't need to be _THE_ unique identifier to work as a key
# for obfuscated fonts in Sigil, ADE and calibre. It just has
# to use the opf:scheme="UUID" and have the urn:uuid: prefix.
|
||||||
|
if self.target_epubver == '3':
|
||||||
|
data.append('<dc:identifier>urn:uuid:'+self.BookId+'</dc:identifier>\n')
|
||||||
|
else:
|
||||||
|
data.append('<dc:identifier opf:scheme="UUID">urn:uuid:'+self.BookId+'</dc:identifier>\n')
|
||||||
|
|
||||||
|
handleTag(data, metadata, 'Creator', 'dc:creator', self.creator_attrib)
|
||||||
|
handleTag(data, metadata, 'Contributor', 'dc:contributor')
|
||||||
|
handleTag(data, metadata, 'Publisher', 'dc:publisher', self.publisher_attrib)
|
||||||
|
handleTag(data, metadata, 'Source', 'dc:source')
|
||||||
|
handleTag(data, metadata, 'Type', 'dc:type')
|
||||||
|
if self.target_epubver == '3':
|
||||||
|
if 'ISBN' in metadata:
|
||||||
|
for i, value in enumerate(metadata['ISBN']):
|
||||||
|
res = '<dc:identifier>urn:isbn:%s</dc:identifier>\n' % self.escapeit(value)
|
||||||
|
data.append(res)
|
||||||
|
else:
|
||||||
|
handleTag(data, metadata, 'ISBN', 'dc:identifier opf:scheme="ISBN"')
|
||||||
|
if 'Subject' in metadata:
|
||||||
|
if 'SubjectCode' in metadata:
|
||||||
|
codeList = metadata['SubjectCode']
|
||||||
|
del metadata['SubjectCode']
|
||||||
|
else:
|
||||||
|
codeList = None
|
||||||
|
for i in range(len(metadata['Subject'])):
|
||||||
|
if codeList and i < len(codeList):
|
||||||
|
data.append('<dc:subject BASICCode="'+codeList[i]+'">')
|
||||||
|
else:
|
||||||
|
data.append('<dc:subject>')
|
||||||
|
data.append(self.escapeit(metadata['Subject'][i])+'</dc:subject>\n')
|
||||||
|
del metadata['Subject']
|
||||||
|
handleTag(data, metadata, 'Description', 'dc:description')
|
||||||
|
if self.target_epubver == '3':
|
||||||
|
if 'Published' in metadata:
|
||||||
|
for i, value in enumerate(metadata['Published']):
|
||||||
|
res = '<dc:date>%s</dc:date>\n' % self.escapeit(value)
|
||||||
|
data.append(res)
|
||||||
|
else:
|
||||||
|
handleTag(data, metadata, 'Published', 'dc:date opf:event="publication"')
|
||||||
|
handleTag(data, metadata, 'Rights', 'dc:rights')
|
||||||
|
|
||||||
|
if self.epubver == 'F':
|
||||||
|
if self.extra_attributes or k8resc is not None and k8resc.extra_attributes:
|
||||||
|
data.append('<!-- THE FOLLOWINGS ARE REQUIRED TO INSERT INTO <dc:xxx> MANUALLY\n')
|
||||||
|
if self.extra_attributes:
|
||||||
|
data += self.extra_attributes
|
||||||
|
if k8resc is not None and k8resc.extra_attributes:
|
||||||
|
data += k8resc.extra_attributes
|
||||||
|
data.append('-->\n')
|
||||||
|
else:
|
||||||
|
# Append refines metadata.
|
||||||
|
if self.exth_solved_refines_metadata:
|
||||||
|
data.append('<!-- Refines MetaData from EXTH -->\n')
|
||||||
|
data += self.exth_solved_refines_metadata
|
||||||
|
if self.exth_refines_metadata or k8resc is not None and k8resc.refines_metadata:
|
||||||
|
data.append('<!-- THE FOLLOWING ARE REQUIRED TO EDIT IDS MANUALLY\n')
|
||||||
|
if self.exth_refines_metadata:
|
||||||
|
data += self.exth_refines_metadata
|
||||||
|
if k8resc is not None and k8resc.refines_metadata:
|
||||||
|
data += k8resc.refines_metadata
|
||||||
|
data.append('-->\n')
|
||||||
|
|
||||||
|
# Append metadata in RESC section.
|
||||||
|
if k8resc is not None and k8resc.extra_metadata:
|
||||||
|
data.append('<!-- Extra MetaData from RESC\n')
|
||||||
|
data += k8resc.extra_metadata
|
||||||
|
data.append('-->\n')
|
||||||
|
|
||||||
|
if 'CoverOffset' in metadata:
|
||||||
|
imageNumber = int(metadata['CoverOffset'][0])
|
||||||
|
self.covername = self.rscnames[imageNumber]
|
||||||
|
if self.covername is None:
|
||||||
|
print("Error: Cover image %s was not recognized as a valid image" % imageNumber)
|
||||||
|
else:
|
||||||
|
# <meta name="cover"> is obsoleted in EPUB3, but kindlegen v2.9 requires it.
|
||||||
|
data.append('<meta name="cover" content="' + self.cover_id + '" />\n')
|
||||||
|
self.used[self.covername] = 'used'
|
||||||
|
del metadata['CoverOffset']
|
||||||
|
|
||||||
|
handleMetaPairs(data, metadata, 'Codec', 'output encoding')
|
||||||
|
# handle kindlegen specific tags
|
||||||
|
handleTag(data, metadata, 'DictInLanguage', 'DictionaryInLanguage')
|
||||||
|
handleTag(data, metadata, 'DictOutLanguage', 'DictionaryOutLanguage')
|
||||||
|
handleMetaPairs(data, metadata, 'RegionMagnification', 'RegionMagnification')
|
||||||
|
handleMetaPairs(data, metadata, 'book-type', 'book-type')
|
||||||
|
handleMetaPairs(data, metadata, 'zero-gutter', 'zero-gutter')
|
||||||
|
handleMetaPairs(data, metadata, 'zero-margin', 'zero-margin')
|
||||||
|
handleMetaPairs(data, metadata, 'primary-writing-mode', 'primary-writing-mode')
|
||||||
|
handleMetaPairs(data, metadata, 'fixed-layout', 'fixed-layout')
|
||||||
|
handleMetaPairs(data, metadata, 'orientation-lock', 'orientation-lock')
|
||||||
|
handleMetaPairs(data, metadata, 'original-resolution', 'original-resolution')
|
||||||
|
|
||||||
|
# these are not allowed in epub2 or 3 so convert them to meta name content pairs
|
||||||
|
# perhaps these could better be mapped into the dcterms namespace instead
|
||||||
|
handleMetaPairs(data, metadata, 'Review', 'review')
|
||||||
|
handleMetaPairs(data, metadata, 'Imprint', 'imprint')
|
||||||
|
handleMetaPairs(data, metadata, 'Adult', 'adult')
|
||||||
|
handleMetaPairs(data, metadata, 'DictShortName', 'DictionaryVeryShortName')
|
||||||
|
|
||||||
|
# these are needed by kobo books upon submission but not sure if legal metadata in epub2 or epub3
|
||||||
|
if 'Price' in metadata and 'Currency' in metadata:
|
||||||
|
priceList = metadata['Price']
|
||||||
|
currencyList = metadata['Currency']
|
||||||
|
if len(priceList) != len(currencyList):
|
||||||
|
print("Error: found %s price entries, but %s currency entries.")
|
||||||
|
else:
|
||||||
|
for i in range(len(priceList)):
|
||||||
|
data.append('<SRP Currency="'+currencyList[i]+'">'+priceList[i]+'</SRP>\n')
|
||||||
|
del metadata['Price']
|
||||||
|
del metadata['Currency']
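# For illustration: a Price of "9.99" with Currency "USD" is emitted as <SRP Currency="USD">9.99</SRP>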
|
||||||
|
|
||||||
|
if self.target_epubver == '3':
|
||||||
|
# Append metadata for EPUB3.
|
||||||
|
if self.exth_fixedlayout_metadata:
|
||||||
|
data.append('<!-- EPUB3 MetaData converted from EXTH -->\n')
|
||||||
|
data += self.exth_fixedlayout_metadata
|
||||||
|
|
||||||
|
# all that remains is extra EXTH info we will store inside a comment inside meta name/content pairs
|
||||||
|
# so it can not impact anything and will be automatically stripped out if found again in a RESC section
|
||||||
|
data.append(BEGIN_INFO_ONLY + '\n')
|
||||||
|
if 'ThumbOffset' in metadata:
|
||||||
|
imageNumber = int(metadata['ThumbOffset'][0])
|
||||||
|
imageName = self.rscnames[imageNumber]
|
||||||
|
if imageName is None:
|
||||||
|
print("Error: Cover Thumbnail image %s was not recognized as a valid image" % imageNumber)
|
||||||
|
else:
|
||||||
|
data.append('<meta name="Cover ThumbNail Image" content="'+ 'Images/'+imageName+'" />\n')
|
||||||
|
# self.used[imageName] = 'used' # thumbnail image is always generated by Kindlegen, so don't include in manifest
|
||||||
|
self.used[imageName] = 'not used'
|
||||||
|
del metadata['ThumbOffset']
|
||||||
|
for metaName in META_TAGS:
|
||||||
|
if metaName in metadata:
|
||||||
|
for value in metadata[metaName]:
|
||||||
|
data.append('<meta name="'+metaName+'" content="'+self.escapeit(value, EXTRA_ENTITIES)+'" />\n')
|
||||||
|
del metadata[metaName]
|
||||||
|
for key in list(metadata.keys()):
|
||||||
|
for value in metadata[key]:
|
||||||
|
data.append('<meta name="'+key+'" content="'+self.escapeit(value, EXTRA_ENTITIES)+'" />\n')
|
||||||
|
del metadata[key]
|
||||||
|
data.append(END_INFO_ONLY + '\n')
|
||||||
|
data.append('</metadata>\n')
|
||||||
|
return data
|
||||||
|
|
||||||
|
def buildOPFManifest(self, ncxname, navname=None):
|
||||||
|
# buildManifest for mobi7, azw4, epub2 and epub3.
|
||||||
|
k8resc = self.k8resc
|
||||||
|
cover_id = self.cover_id
|
||||||
|
hasK8RescSpine = k8resc is not None and k8resc.hasSpine()
|
||||||
|
self.ncxname = ncxname
|
||||||
|
self.navname = navname
|
||||||
|
|
||||||
|
data = []
|
||||||
|
data.append('<manifest>\n')
|
||||||
|
media_map = {
|
||||||
|
'.jpg' : 'image/jpeg',
|
||||||
|
'.jpeg' : 'image/jpeg',
|
||||||
|
'.png' : 'image/png',
|
||||||
|
'.gif' : 'image/gif',
|
||||||
|
'.svg' : 'image/svg+xml',
|
||||||
|
'.xhtml': 'application/xhtml+xml',
|
||||||
|
'.html' : 'text/html', # for mobi7
|
||||||
|
'.pdf' : 'application/pdf', # for azw4(print replica textbook)
|
||||||
|
'.ttf' : 'application/x-font-ttf',
|
||||||
|
'.otf' : 'application/x-font-opentype', # replaced?
|
||||||
|
'.css' : 'text/css',
|
||||||
|
# '.html' : 'text/x-oeb1-document', # for mobi7
|
||||||
|
# '.otf' : 'application/vnd.ms-opentype', # [OpenType] OpenType fonts
|
||||||
|
# '.woff' : 'application/font-woff', # [WOFF] WOFF fonts
|
||||||
|
# '.smil' : 'application/smil+xml', # [MediaOverlays301] EPUB Media Overlay documents
|
||||||
|
# '.pls' : 'application/pls+xml', # [PLS] Text-to-Speech (TTS) Pronunciation lexicons
|
||||||
|
# '.mp3' : 'audio/mpeg',
|
||||||
|
# '.mp4' : 'video/mp4',
|
||||||
|
# '.js' : 'text/javascript', # not supported in K8
|
||||||
|
}
|
||||||
|
spinerefs = []
|
||||||
|
|
||||||
|
idcnt = 0
|
||||||
|
for [key,dir,fname] in self.fileinfo:
|
||||||
|
name, ext = os.path.splitext(fname)
|
||||||
|
ext = ext.lower()
|
||||||
|
media = media_map.get(ext)
|
||||||
|
ref = "item%d" % idcnt
|
||||||
|
if hasK8RescSpine:
|
||||||
|
if key is not None and key in k8resc.spine_idrefs:
|
||||||
|
ref = k8resc.spine_idrefs[key]
|
||||||
|
properties = ''
|
||||||
|
if dir != '':
|
||||||
|
fpath = dir + '/' + fname
|
||||||
|
else:
|
||||||
|
fpath = fname
|
||||||
|
data.append('<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(ref, media, fpath, properties))
|
||||||
|
|
||||||
|
if ext in ['.xhtml', '.html']:
|
||||||
|
spinerefs.append(ref)
|
||||||
|
idcnt += 1
|
||||||
|
|
||||||
|
for fname in self.rscnames:
|
||||||
|
if fname is not None:
|
||||||
|
if self.used.get(fname,'not used') == 'not used':
|
||||||
|
continue
|
||||||
|
name, ext = os.path.splitext(fname)
|
||||||
|
ext = ext.lower()
|
||||||
|
media = media_map.get(ext,ext[1:])
|
||||||
|
properties = ''
|
||||||
|
if fname == self.covername:
|
||||||
|
ref = cover_id
|
||||||
|
if self.target_epubver == '3':
|
||||||
|
properties = 'properties="cover-image"'
|
||||||
|
else:
|
||||||
|
ref = "item%d" % idcnt
|
||||||
|
if ext == '.ttf' or ext == '.otf':
|
||||||
|
if self.isK8: # fonts are only used in Mobi 8
|
||||||
|
fpath = 'Fonts/' + fname
|
||||||
|
data.append('<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(ref, media, fpath, properties))
|
||||||
|
else:
|
||||||
|
fpath = 'Images/' + fname
|
||||||
|
data.append('<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(ref, media, fpath, properties))
|
||||||
|
idcnt += 1
|
||||||
|
|
||||||
|
if self.target_epubver == '3' and navname is not None:
|
||||||
|
data.append('<item id="nav" media-type="application/xhtml+xml" href="Text/' + navname + '" properties="nav"/>\n')
|
||||||
|
if self.has_ncx and ncxname is not None:
|
||||||
|
data.append('<item id="ncx" media-type="application/x-dtbncx+xml" href="' + ncxname +'" />\n')
|
||||||
|
if self.pagemap != '':
|
||||||
|
data.append('<item id="map" media-type="application/oebs-page-map+xml" href="page-map.xml" />\n')
|
||||||
|
data.append('</manifest>\n')
|
||||||
|
return [data, spinerefs]
|
||||||
|
|
||||||
|
def buildOPFSpine(self, spinerefs, isNCX):
|
||||||
|
# build spine
|
||||||
|
k8resc = self.k8resc
|
||||||
|
hasK8RescSpine = k8resc is not None and k8resc.hasSpine()
|
||||||
|
data = []
|
||||||
|
ppd = ''
|
||||||
|
if self.isK8 and self.page_progression_direction is not None:
|
||||||
|
ppd = ' page-progression-direction="{:s}"'.format(self.page_progression_direction)
|
||||||
|
ncx = ''
|
||||||
|
if isNCX:
|
||||||
|
ncx = ' toc="ncx"'
|
||||||
|
map=''
|
||||||
|
if self.pagemap != '':
|
||||||
|
map = ' page-map="map"'
|
||||||
|
if self.epubver == 'F':
|
||||||
|
if ppd:
|
||||||
|
ppd = '<!--' + ppd + ' -->'
|
||||||
|
spine_start_tag = '<spine{1:s}{2:s}>{0:s}\n'.format(ppd, map, ncx)
|
||||||
|
else:
|
||||||
|
spine_start_tag = '<spine{0:s}{1:s}{2:s}>\n'.format(ppd, map, ncx)
|
||||||
|
data.append(spine_start_tag)
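# For illustration: an epub3 right-to-left book with an NCX produces
# <spine page-progression-direction="rtl" toc="ncx">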
|
||||||
|
|
||||||
|
if hasK8RescSpine:
|
||||||
|
for key in k8resc.spine_order:
|
||||||
|
idref = k8resc.spine_idrefs[key]
|
||||||
|
attribs = k8resc.spine_pageattributes[key]
|
||||||
|
tag = '<itemref idref="%s"' % idref
|
||||||
|
for aname, val in list(attribs.items()):
|
||||||
|
if self.epubver == 'F' and aname == 'properties':
|
||||||
|
continue
|
||||||
|
if val is not None:
|
||||||
|
tag += ' %s="%s"' % (aname, val)
|
||||||
|
tag += '/>'
|
||||||
|
if self.epubver == 'F' and 'properties' in attribs:
|
||||||
|
val = attribs['properties']
|
||||||
|
if val is not None:
|
||||||
|
tag += '<!-- properties="%s" -->' % val
|
||||||
|
tag += '\n'
|
||||||
|
data.append(tag)
|
||||||
|
else:
|
||||||
|
start = 0
|
||||||
|
# special case the created coverpage if need be
|
||||||
|
[key, dir, fname] = self.fileinfo[0]
|
||||||
|
if key is not None and key == "coverpage":
|
||||||
|
entry = spinerefs[start]
|
||||||
|
data.append('<itemref idref="%s" linear="no"/>\n' % entry)
|
||||||
|
start += 1
|
||||||
|
for entry in spinerefs[start:]:
|
||||||
|
data.append('<itemref idref="' + entry + '"/>\n')
|
||||||
|
data.append('</spine>\n')
|
||||||
|
return data
|
||||||
|
|
||||||
|
def buildMobi7OPF(self):
|
||||||
|
# Build an OPF for mobi7 and azw4.
|
||||||
|
print("Building an opf for mobi7/azw4.")
|
||||||
|
data = []
|
||||||
|
data.append('<?xml version="1.0" encoding="utf-8"?>\n')
|
||||||
|
data.append('<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="uid">\n')
|
||||||
|
metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">'
|
||||||
|
opf_metadata = self.buildOPFMetadata(metadata_tag)
|
||||||
|
data += opf_metadata
|
||||||
|
if self.has_ncx:
|
||||||
|
# ncxname = self.files.getInputFileBasename() + '.ncx'
|
||||||
|
ncxname = 'toc.ncx'
|
||||||
|
else:
|
||||||
|
ncxname = None
|
||||||
|
[opf_manifest, spinerefs] = self.buildOPFManifest(ncxname)
|
||||||
|
data += opf_manifest
|
||||||
|
opf_spine = self.buildOPFSpine(spinerefs, self.has_ncx)
|
||||||
|
data += opf_spine
|
||||||
|
data.append('<tours>\n</tours>\n')
|
||||||
|
if not self.printReplica:
|
||||||
|
guide ='<guide>\n' + self.guidetext + '</guide>\n'
|
||||||
|
data.append(guide)
|
||||||
|
data.append('</package>\n')
|
||||||
|
return ''.join(data)
|
||||||
|
|
||||||
|
def buildEPUBOPF(self, has_obfuscated_fonts=False):
|
||||||
|
print("Building an opf for mobi8 using epub version: ", self.target_epubver)
|
||||||
|
if self.target_epubver == '2':
|
||||||
|
has_ncx = self.has_ncx
|
||||||
|
has_guide = True
|
||||||
|
ncxname = None
|
||||||
|
ncxname = TOC_NCX
|
||||||
|
navname = None
|
||||||
|
package = '<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="uid">\n'
|
||||||
|
tours = '<tours>\n</tours>\n'
|
||||||
|
metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">'
|
||||||
|
else:
|
||||||
|
has_ncx = EPUB3_WITH_NCX
|
||||||
|
has_guide = EPUB3_WITH_GUIDE
|
||||||
|
ncxname = None
|
||||||
|
if has_ncx:
|
||||||
|
ncxname = TOC_NCX
|
||||||
|
navname = NAVIGATION_DOCUMENT
|
||||||
|
package = '<package version="3.0" xmlns="http://www.idpf.org/2007/opf" prefix="rendition: http://www.idpf.org/vocab/rendition/#" unique-identifier="uid">\n'
|
||||||
|
tours = ''
|
||||||
|
metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">'
|
||||||
|
|
||||||
|
data = []
|
||||||
|
data.append('<?xml version="1.0" encoding="utf-8"?>\n')
|
||||||
|
data.append(package)
|
||||||
|
opf_metadata = self.buildOPFMetadata(metadata_tag, has_obfuscated_fonts)
|
||||||
|
data += opf_metadata
|
||||||
|
[opf_manifest, spinerefs] = self.buildOPFManifest(ncxname, navname)
|
||||||
|
data += opf_manifest
|
||||||
|
opf_spine = self.buildOPFSpine(spinerefs, has_ncx)
|
||||||
|
data += opf_spine
|
||||||
|
data.append(tours)
|
||||||
|
if has_guide:
|
||||||
|
guide ='<guide>\n' + self.guidetext + '</guide>\n'
|
||||||
|
data.append(guide)
|
||||||
|
data.append('</package>\n')
|
||||||
|
return ''.join(data)
|
||||||
|
|
||||||
|
def writeOPF(self, has_obfuscated_fonts=False):
|
||||||
|
if self.isK8:
|
||||||
|
data = self.buildEPUBOPF(has_obfuscated_fonts)
|
||||||
|
outopf = os.path.join(self.files.k8oebps, EPUB_OPF)
|
||||||
|
with open(pathof(outopf), 'wb') as f:
|
||||||
|
f.write(data.encode('utf-8'))
|
||||||
|
return self.BookId
|
||||||
|
else:
|
||||||
|
data = self.buildMobi7OPF()
|
||||||
|
outopf = os.path.join(self.files.mobi7dir, 'content.opf')
|
||||||
|
with open(pathof(outopf), 'wb') as f:
|
||||||
|
f.write(data.encode('utf-8'))
|
||||||
|
return 0
|
||||||
|
|
||||||
|
def getBookId(self):
|
||||||
|
return self.BookId
|
||||||
|
|
||||||
|
def getNCXName(self):
|
||||||
|
return self.ncxname
|
||||||
|
|
||||||
|
def getNAVName(self):
|
||||||
|
return self.navname
|
||||||
|
|
||||||
|
def getEPUBVersion(self):
|
||||||
|
return self.target_epubver
|
||||||
|
|
||||||
|
def hasNCX(self):
|
||||||
|
return self.ncxname is not None and self.has_ncx
|
||||||
|
|
||||||
|
def hasNAV(self):
|
||||||
|
return self.navname is not None
|
||||||
|
|
||||||
|
def autodetectEPUBVersion(self):
|
||||||
|
# Determine EPUB version from metadata and RESC.
|
||||||
|
metadata = self.metadata
|
||||||
|
k8resc = self.k8resc
|
||||||
|
epubver = '2'
|
||||||
|
if 'true' == metadata.get('fixed-layout', [''])[0].lower():
|
||||||
|
epubver = '3'
|
||||||
|
elif metadata.get('orientation-lock', [''])[0].lower() in ['portrait', 'landscape']:
|
||||||
|
epubver = '3'
|
||||||
|
elif self.page_progression_direction == 'rtl':
|
||||||
|
epubver = '3'
|
||||||
|
elif EXTH_TITLE_FURIGANA in metadata:
|
||||||
|
epubver = '3'
|
||||||
|
elif EXTH_CREATOR_FURIGANA in metadata:
|
||||||
|
epubver = '3'
|
||||||
|
elif EXTH_PUBLISHER_FURIGANA in metadata:
|
||||||
|
epubver = '3'
|
||||||
|
elif k8resc is not None and k8resc.needEPUB3():
|
||||||
|
epubver = '3'
|
||||||
|
return epubver
|
||||||
|
|
||||||
|
def defineRefinesID(self):
|
||||||
|
# the following EXTH are set by KDP.
|
||||||
|
# 'Title_Furigana_(508)'
|
||||||
|
# 'Creator_Furigana_(517)',
|
||||||
|
# 'Publisher_Furigana_(522)'
|
||||||
|
# It is difficult to find correspondence between Title, Creator, Publisher
|
||||||
|
# and EXTH 508, 517 and 522 if they have more than two values, since KDP seems not to preserve the order of EXTH 508, 517 and 522.
|
||||||
|
# It is also difficult to find correspondence between them and tags which have refine attributes in RESC.
|
||||||
|
# So editing manually is required.
|
||||||
|
metadata = self.metadata
|
||||||
|
|
||||||
|
needRefinesId = False
|
||||||
|
if self.k8resc is not None:
|
||||||
|
needRefinesId = self.k8resc.hasRefines()
|
||||||
|
# Create ids for refine attributes
|
||||||
|
if (needRefinesId or EXTH_TITLE_FURIGANA in metadata) and 'Title' in metadata:
|
||||||
|
for i in range(len(metadata.get('Title'))):
|
||||||
|
self.title_id[i] = 'title%02d' % (i+1)
|
||||||
|
|
||||||
|
if (needRefinesId or EXTH_CREATOR_FURIGANA in metadata) and 'Creator' in metadata:
|
||||||
|
for i in range(len(metadata.get('Creator'))):
|
||||||
|
self.creator_id[i] = 'creator%02d' % (i+1)
|
||||||
|
|
||||||
|
if (needRefinesId or EXTH_PUBLISHER_FURIGANA in metadata) and 'Publisher' in metadata:
|
||||||
|
for i in range(len(metadata.get('Publisher'))):
|
||||||
|
self.publisher_id[i] = 'publisher%02d' % (i+1)
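# For illustration: two Creator entries get the ids creator01 and creator02 (and similarly for
# Title and Publisher); the refines metadata built in processRefinesMetadata() below refers back to them.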
|
||||||
|
|
||||||
|
def processRefinesMetadata(self):
|
||||||
|
# create refines metadata defined in epub3 or convert refines property to opf: attributes for epub2.
|
||||||
|
metadata = self.metadata
|
||||||
|
|
||||||
|
refines_list = [
|
||||||
|
[EXTH_TITLE_FURIGANA, self.title_id, self.title_attrib, 'title00'],
|
||||||
|
[EXTH_CREATOR_FURIGANA, self.creator_id, self.creator_attrib, 'creator00'],
|
||||||
|
[EXTH_PUBLISHER_FURIGANA, self.publisher_id, self.publisher_attrib, 'publisher00']
|
||||||
|
]
|
||||||
|
|
||||||
|
create_refines_metadata = False
|
||||||
|
for EXTH in lzip(*refines_list)[0]:
|
||||||
|
if EXTH in metadata:
|
||||||
|
create_refines_metadata = True
|
||||||
|
break
|
||||||
|
if create_refines_metadata:
|
||||||
|
for [EXTH, id, attrib, defaultid] in refines_list:
|
||||||
|
if self.target_epubver == '3':
|
||||||
|
for i, value in list(id.items()):
|
||||||
|
attrib[i] = ' id="%s"' % value
|
||||||
|
|
||||||
|
if EXTH in metadata:
|
||||||
|
if len(metadata[EXTH]) == 1 and len(id) == 1:
|
||||||
|
self.createMetaTag(self.exth_solved_refines_metadata, 'file-as', metadata[EXTH][0], id[0])
|
||||||
|
else:
|
||||||
|
for i, value in enumerate(metadata[EXTH]):
|
||||||
|
self.createMetaTag(self.exth_refines_metadata, 'file-as', value, id.get(i, defaultid))
|
||||||
|
else:
|
||||||
|
if EXTH in metadata:
|
||||||
|
if len(metadata[EXTH]) == 1 and len(id) == 1:
|
||||||
|
attr = ' opf:file-as="%s"' % metadata[EXTH][0]
|
||||||
|
attrib[0] = attr
|
||||||
|
else:
|
||||||
|
for i, value in enumerate(metadata[EXTH]):
|
||||||
|
attr = ' id="#%s" opf:file-as="%s"\n' % (id.get(i, defaultid), value)
|
||||||
|
self.extra_attributes.append(attr)
|
||||||
|
|
||||||
|
def createMetadataForFixedlayout(self):
|
||||||
|
# convert fixed layout to epub3 format if needed.
|
||||||
|
metadata = self.metadata
|
||||||
|
|
||||||
|
if 'fixed-layout' in metadata:
|
||||||
|
fixedlayout = metadata['fixed-layout'][0]
|
||||||
|
content = {'true' : 'pre-paginated'}.get(fixedlayout.lower(), 'reflowable')
|
||||||
|
self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:layout', content)
|
||||||
|
|
||||||
|
if 'orientation-lock' in metadata:
|
||||||
|
content = metadata['orientation-lock'][0].lower()
|
||||||
|
if content == 'portrait' or content == 'landscape':
|
||||||
|
self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:orientation', content)
|
||||||
|
|
||||||
|
# according to epub3 spec about correspondence with Amazon
|
||||||
|
# if 'original-resolution' is provided it needs to be converted to
|
||||||
|
# meta viewport property tag stored in the <head></head> of **each**
|
||||||
|
# xhtml page - so this tag would need to be handled by editing each part
|
||||||
|
# before reaching this routine
|
||||||
|
# we need to add support for this to the k8html routine
|
||||||
|
# if 'original-resolution' in metadata.keys():
|
||||||
|
# resolution = metadata['original-resolution'][0].lower()
|
||||||
|
# width, height = resolution.split('x')
|
||||||
|
# if width.isdigit() and int(width) > 0 and height.isdigit() and int(height) > 0:
|
||||||
|
# viewport = 'width=%s, height=%s' % (width, height)
|
||||||
|
# self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:viewport', viewport)
|
158
KindleUnpack/mobi_pagemap.py
Normal file
@@ -0,0 +1,158 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import PY2, unicode_str
|
||||||
|
|
||||||
|
if PY2:
|
||||||
|
range = xrange
|
||||||
|
|
||||||
|
import struct
|
||||||
|
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||||
|
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||||
|
|
||||||
|
import re
|
||||||
|
# note: re requires the pattern to be the exact same type as the data to be searched in python3
|
||||||
|
# but u"" is not allowed for the pattern itself only b""
|
||||||
|
|
||||||
|
|
||||||
|
_TABLE = [('m', 1000), ('cm', 900), ('d', 500), ('cd', 400), ('c', 100), ('xc', 90), ('l', 50), ('xl', 40), ('x', 10), ('ix', 9), ('v', 5), ('iv', 4), ('i', 1)]
|
||||||
|
|
||||||
|
def int_to_roman(i):
|
||||||
|
parts = []
|
||||||
|
num = i
|
||||||
|
for letter, value in _TABLE:
|
||||||
|
while value <= num:
|
||||||
|
num -= value
|
||||||
|
parts.append(letter)
|
||||||
|
return ''.join(parts)
|
||||||
|
|
||||||
|
def roman_to_int(s):
|
||||||
|
result = 0
|
||||||
|
rnstr = s
|
||||||
|
for letter, value in _TABLE:
|
||||||
|
while rnstr.startswith(letter):
|
||||||
|
result += value
|
||||||
|
rnstr = rnstr[len(letter):]
|
||||||
|
return result
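# Illustrative round trip of the two helpers above:
#   int_to_roman(1904)    -> 'mcmiv'
#   roman_to_int('mcmiv') -> 1904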
|
||||||
|
|
||||||
|
_pattern = r'''\(([^\)]*)\)'''
|
||||||
|
_tup_pattern = re.compile(_pattern,re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def _parseNames(numpages, data):
|
||||||
|
data = unicode_str(data)
|
||||||
|
pagenames = []
|
||||||
|
pageMap = ''
|
||||||
|
for i in range(numpages):
|
||||||
|
pagenames.append(None)
|
||||||
|
for m in re.finditer(_tup_pattern, data):
|
||||||
|
tup = m.group(1)
|
||||||
|
if pageMap != '':
|
||||||
|
pageMap += ','
|
||||||
|
pageMap += '(' + tup + ')'
|
||||||
|
spos, nametype, svalue = tup.split(",")
|
||||||
|
# print(spos, nametype, svalue)
|
||||||
|
if nametype == 'a' or nametype == 'r':
|
||||||
|
svalue = int(svalue)
|
||||||
|
spos = int(spos)
|
||||||
|
for i in range(spos - 1, numpages):
|
||||||
|
if nametype == 'r':
|
||||||
|
pname = int_to_roman(svalue)
|
||||||
|
svalue += 1
|
||||||
|
elif nametype == 'a':
|
||||||
|
pname = "%s" % svalue
|
||||||
|
svalue += 1
|
||||||
|
elif nametype == 'c':
|
||||||
|
sp = svalue.find('|')
|
||||||
|
if sp == -1:
|
||||||
|
pname = svalue
|
||||||
|
else:
|
||||||
|
pname = svalue[0:sp]
|
||||||
|
svalue = svalue[sp+1:]
|
||||||
|
else:
|
||||||
|
print("Error: unknown page numbering type", nametype)
|
||||||
|
pagenames[i] = pname
|
||||||
|
return pagenames, pageMap
|
||||||
|
|
||||||
|
|
||||||
|
class PageMapProcessor:
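# Parses the page map data of a Kindle book: a short header, a name-scheme string of
# "(start, type, value)" tuples, and a table of page-start offsets. The result is used
# below to build page-map.xml and APNX page location data.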
|
||||||
|
|
||||||
|
def __init__(self, mh, data):
|
||||||
|
self.mh = mh
|
||||||
|
self.data = data
|
||||||
|
self.pagenames = []
|
||||||
|
self.pageoffsets = []
|
||||||
|
self.pageMap = ''
|
||||||
|
self.pm_len = 0
|
||||||
|
self.pm_nn = 0
|
||||||
|
self.pm_bits = 0
|
||||||
|
self.pmoff = None
|
||||||
|
self.pmstr = ''
|
||||||
|
print("Extracting Page Map Information")
|
||||||
|
rev_len, = struct.unpack_from(b'>L', self.data, 0x10)
|
||||||
|
# skip over header, revision string length data, and revision string
|
||||||
|
ptr = 0x14 + rev_len
|
||||||
|
pm_1, self.pm_len, self.pm_nn, self.pm_bits = struct.unpack_from(b'>4H', self.data, ptr)
|
||||||
|
# print(pm_1, self.pm_len, self.pm_nn, self.pm_bits)
|
||||||
|
self.pmstr = self.data[ptr+8:ptr+8+self.pm_len]
|
||||||
|
self.pmoff = self.data[ptr+8+self.pm_len:]
|
||||||
|
offsize = b">L"
|
||||||
|
offwidth = 4
|
||||||
|
if self.pm_bits == 16:
|
||||||
|
offsize = b">H"
|
||||||
|
offwidth = 2
|
||||||
|
ptr = 0
|
||||||
|
for i in range(self.pm_nn):
|
||||||
|
od, = struct.unpack_from(offsize, self.pmoff, ptr)
|
||||||
|
ptr += offwidth
|
||||||
|
self.pageoffsets.append(od)
|
||||||
|
self.pagenames, self.pageMap = _parseNames(self.pm_nn, self.pmstr)
|
||||||
|
|
||||||
|
def getPageMap(self):
|
||||||
|
return self.pageMap
|
||||||
|
|
||||||
|
def getNames(self):
|
||||||
|
return self.pagenames
|
||||||
|
|
||||||
|
def getOffsets(self):
|
||||||
|
return self.pageoffsets
|
||||||
|
|
||||||
|
# page-map.xml will be unicode but encoded to utf-8 immediately before being written to a file
|
||||||
|
def generateKF8PageMapXML(self, k8proc):
|
||||||
|
pagemapxml = '<page-map xmlns="http://www.idpf.org/2007/opf">\n'
|
||||||
|
for i in range(len(self.pagenames)):
|
||||||
|
pos = self.pageoffsets[i]
|
||||||
|
name = self.pagenames[i]
|
||||||
|
if name is not None and name != "":
|
||||||
|
[pn, dir, filename, skelpos, skelend, aidtext] = k8proc.getSkelInfo(pos)
|
||||||
|
idtext = unicode_str(k8proc.getPageIDTag(pos))
|
||||||
|
linktgt = unicode_str(filename)
|
||||||
|
if idtext != '':
|
||||||
|
linktgt += '#' + idtext
|
||||||
|
pagemapxml += '<page name="%s" href="%s/%s" />\n' % (name, dir, linktgt)
|
||||||
|
pagemapxml += "</page-map>\n"
|
||||||
|
return pagemapxml
|
||||||
|
|
||||||
|
def generateAPNX(self, apnx_meta):
|
||||||
|
if apnx_meta['format'] == 'MOBI_8':
|
||||||
|
content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","format":"%(format)s","fileRevisionId":"1","acr":"%(acr)s"}' %apnx_meta
|
||||||
|
else:
|
||||||
|
content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","fileRevisionId":"1"}' % apnx_meta
|
||||||
|
content_header = content_header.encode('utf-8')
|
||||||
|
page_header = '{"asin":"%(asin)s","pageMap":"%(pageMap)s"}' % apnx_meta
|
||||||
|
page_header = page_header.encode('utf-8')
|
||||||
|
apnx = struct.pack(b'>H',1) + struct.pack(b'>H',1)
|
||||||
|
apnx += struct.pack(b'>I', 12 + len(content_header))
|
||||||
|
apnx += struct.pack(b'>I', len(content_header))
|
||||||
|
apnx += content_header
|
||||||
|
apnx += struct.pack(b'>H', 1)
|
||||||
|
apnx += struct.pack(b'>H', len(page_header))
|
||||||
|
apnx += struct.pack(b'>H', self.pm_nn)
|
||||||
|
apnx += struct.pack(b'>H', 32)
|
||||||
|
apnx += page_header
|
||||||
|
for page in self.pageoffsets:
|
||||||
|
apnx += struct.pack(b'>L', page)
|
||||||
|
return apnx
|
120
KindleUnpack/mobi_sectioner.py
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import PY2, hexlify, bstr, bord, bchar
|
||||||
|
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
if PY2:
|
||||||
|
range = xrange
|
||||||
|
|
||||||
|
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||||
|
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||||
|
import struct
|
||||||
|
|
||||||
|
from .unipath import pathof
|
||||||
|
|
||||||
|
DUMP = False
|
||||||
|
""" Set to True to dump all possible information. """
|
||||||
|
|
||||||
|
class unpackException(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def describe(data):
|
||||||
|
txtans = ''
|
||||||
|
hexans = hexlify(data)
|
||||||
|
for i in data:
|
||||||
|
if bord(i) < 32 or bord(i) > 127:
|
||||||
|
txtans += '?'
|
||||||
|
else:
|
||||||
|
txtans += bchar(i).decode('latin-1')
|
||||||
|
return '"' + txtans + '"' + ' 0x'+ hexans
|
||||||
|
|
||||||
|
def datetimefrompalmtime(palmtime):
|
||||||
|
if palmtime > 0x7FFFFFFF:
|
||||||
|
pythondatetime = datetime.datetime(year=1904,month=1,day=1)+datetime.timedelta(seconds=palmtime)
|
||||||
|
else:
|
||||||
|
pythondatetime = datetime.datetime(year=1970,month=1,day=1)+datetime.timedelta(seconds=palmtime)
|
||||||
|
return pythondatetime
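# Palm timestamps larger than 0x7FFFFFFF are treated as unsigned seconds since 1904-01-01,
# smaller values as seconds since the Unix epoch, so datetimefrompalmtime(0) is 1970-01-01 00:00:00.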
|
||||||
|
|
||||||
|
|
||||||
|
class Sectionizer:
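# Reads a PalmDB container: the 78-byte palm header, the section offset table that follows it,
# and the per-section data exposed through loadSection(); the dump* methods are debugging helpers.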
|
||||||
|
|
||||||
|
def __init__(self, filename):
|
||||||
|
self.data = b''
|
||||||
|
with open(pathof(filename), 'rb') as f:
|
||||||
|
self.data = f.read()
|
||||||
|
self.palmheader = self.data[:78]
|
||||||
|
self.palmname = self.data[:32]
|
||||||
|
self.ident = self.palmheader[0x3C:0x3C+8]
|
||||||
|
self.num_sections, = struct.unpack_from(b'>H', self.palmheader, 76)
|
||||||
|
self.filelength = len(self.data)
|
||||||
|
sectionsdata = struct.unpack_from(bstr('>%dL' % (self.num_sections*2)), self.data, 78) + (self.filelength, 0)
|
||||||
|
self.sectionoffsets = sectionsdata[::2]
|
||||||
|
self.sectionattributes = sectionsdata[1::2]
|
||||||
|
self.sectiondescriptions = ["" for x in range(self.num_sections+1)]
|
||||||
|
self.sectiondescriptions[-1] = "File Length Only"
|
||||||
|
return
|
||||||
|
|
||||||
|
def dumpsectionsinfo(self):
|
||||||
|
print("Section Offset Length UID Attribs Description")
|
||||||
|
for i in range(self.num_sections):
|
||||||
|
print("%3d %3X 0x%07X 0x%05X % 8d % 7d %s" % (i,i, self.sectionoffsets[i], self.sectionoffsets[
|
||||||
|
i+1] - self.sectionoffsets[i], self.sectionattributes[i]&0xFFFFFF, (self.sectionattributes[i]>>24)&0xFF, self.sectiondescriptions[i]))
|
||||||
|
print("%3d %3X 0x%07X %s" %
|
||||||
|
(self.num_sections,self.num_sections, self.sectionoffsets[self.num_sections], self.sectiondescriptions[self.num_sections]))
|
||||||
|
|
||||||
|
def setsectiondescription(self, section, description):
|
||||||
|
if section < len(self.sectiondescriptions):
|
||||||
|
self.sectiondescriptions[section] = description
|
||||||
|
else:
|
||||||
|
print("Section out of range: %d, description %s" % (section,description))
|
||||||
|
|
||||||
|
def dumppalmheader(self):
|
||||||
|
print("Palm Database Header")
|
||||||
|
print("Database name: " + repr(self.palmheader[:32]))
|
||||||
|
dbattributes, = struct.unpack_from(b'>H', self.palmheader, 32)
|
||||||
|
print("Bitfield attributes: 0x%0X" % dbattributes,)
|
||||||
|
if dbattributes != 0:
|
||||||
|
print(" (",)
|
||||||
|
if (dbattributes & 2):
|
||||||
|
print("Read-only; ",)
|
||||||
|
if (dbattributes & 4):
|
||||||
|
print("Dirty AppInfoArea; ",)
|
||||||
|
if (dbattributes & 8):
|
||||||
|
print("Needs to be backed up; ",)
|
||||||
|
if (dbattributes & 16):
|
||||||
|
print("OK to install over newer; ",)
|
||||||
|
if (dbattributes & 32):
|
||||||
|
print("Reset after installation; ",)
|
||||||
|
if (dbattributes & 64):
|
||||||
|
print("No copying by PalmPilot beaming; ",)
|
||||||
|
print(")")
|
||||||
|
else:
|
||||||
|
print("")
|
||||||
|
print("File version: %d" % struct.unpack_from(b'>H', self.palmheader, 34)[0])
|
||||||
|
dbcreation, = struct.unpack_from(b'>L', self.palmheader, 36)
|
||||||
|
print("Creation Date: " + str(datetimefrompalmtime(dbcreation))+ (" (0x%0X)" % dbcreation))
|
||||||
|
dbmodification, = struct.unpack_from(b'>L', self.palmheader, 40)
|
||||||
|
print("Modification Date: " + str(datetimefrompalmtime(dbmodification))+ (" (0x%0X)" % dbmodification))
|
||||||
|
dbbackup, = struct.unpack_from(b'>L', self.palmheader, 44)
|
||||||
|
if dbbackup != 0:
|
||||||
|
print("Backup Date: " + str(datetimefrompalmtime(dbbackup))+ (" (0x%0X)" % dbbackup))
|
||||||
|
print("Modification No.: %d" % struct.unpack_from(b'>L', self.palmheader, 48)[0])
|
||||||
|
print("App Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 52)[0])
|
||||||
|
print("Sort Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 56)[0])
|
||||||
|
print("Type/Creator: %s/%s" % (repr(self.palmheader[60:64]), repr(self.palmheader[64:68])))
|
||||||
|
print("Unique seed: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 68)[0])
|
||||||
|
expectedzero, = struct.unpack_from(b'>L', self.palmheader, 72)
|
||||||
|
if expectedzero != 0:
|
||||||
|
print("Should be zero but isn't: %d" % struct.unpack_from(b'>L', self.palmheader, 72)[0])
|
||||||
|
print("Number of sections: %d" % struct.unpack_from(b'>H', self.palmheader, 76)[0])
|
||||||
|
return
|
||||||
|
|
||||||
|
def loadSection(self, section):
|
||||||
|
before, after = self.sectionoffsets[section:section+2]
|
||||||
|
return self.data[before:after]
|
438
KindleUnpack/mobi_split.py
Executable file
@@ -0,0 +1,438 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
import struct
|
||||||
|
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||||
|
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||||
|
|
||||||
|
from .unipath import pathof
|
||||||
|
|
||||||
|
|
||||||
|
# important pdb header offsets
|
||||||
|
unique_id_seed = 68
|
||||||
|
number_of_pdb_records = 76
|
||||||
|
|
||||||
|
# important palmdoc header offsets
|
||||||
|
book_length = 4
|
||||||
|
book_record_count = 8
|
||||||
|
first_pdb_record = 78
|
||||||
|
|
||||||
|
# important rec0 offsets
|
||||||
|
length_of_book = 4
|
||||||
|
mobi_header_base = 16
|
||||||
|
mobi_header_length = 20
|
||||||
|
mobi_type = 24
|
||||||
|
mobi_version = 36
|
||||||
|
first_non_text = 80
|
||||||
|
title_offset = 84
|
||||||
|
first_resc_record = 108
|
||||||
|
first_content_index = 192
|
||||||
|
last_content_index = 194
|
||||||
|
kf8_fdst_index = 192 # for KF8 mobi headers
|
||||||
|
fcis_index = 200
|
||||||
|
flis_index = 208
|
||||||
|
srcs_index = 224
|
||||||
|
srcs_count = 228
|
||||||
|
primary_index = 244
|
||||||
|
datp_index = 256
|
||||||
|
huffoff = 112
|
||||||
|
hufftbloff = 120
|
||||||
|
|
||||||
|
def getint(datain,ofs,sz=b'L'):
|
||||||
|
i, = struct.unpack_from(b'>'+sz,datain,ofs)
|
||||||
|
return i
|
||||||
|
|
||||||
|
def writeint(datain,ofs,n,len=b'L'):
|
||||||
|
if len==b'L':
|
||||||
|
return datain[:ofs]+struct.pack(b'>L',n)+datain[ofs+4:]
|
||||||
|
else:
|
||||||
|
return datain[:ofs]+struct.pack(b'>H',n)+datain[ofs+2:]
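# getint reads a big-endian 32-bit (b'L') or 16-bit (b'H') field; writeint returns a copy of the
# record with that field overwritten, e.g. writeint(rec0, title_offset, getint(rec0, title_offset) + 8)
# shifts the stored title offset by eight bytes.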
|
||||||
|
|
||||||
|
def getsecaddr(datain,secno):
|
||||||
|
nsec = getint(datain,number_of_pdb_records,b'H')
|
||||||
|
assert 0 <= secno < nsec, 'secno %d out of range (nsec=%d)' % (secno, nsec)
|
||||||
|
secstart = getint(datain,first_pdb_record+secno*8)
|
||||||
|
if secno == nsec-1:
|
||||||
|
secend = len(datain)
|
||||||
|
else:
|
||||||
|
secend = getint(datain,first_pdb_record+(secno+1)*8)
|
||||||
|
return secstart,secend
|
||||||
|
|
||||||
|
def readsection(datain,secno):
|
||||||
|
secstart, secend = getsecaddr(datain,secno)
|
||||||
|
return datain[secstart:secend]
|
||||||
|
|
||||||
|
def writesection(datain,secno,secdata): # overwrite, accounting for different length
|
||||||
|
# dataout = deletesectionrange(datain,secno, secno)
|
||||||
|
# return insertsection(dataout, secno, secdata)
|
||||||
|
datalst = []
|
||||||
|
nsec = getint(datain,number_of_pdb_records,b'H')
|
||||||
|
zerosecstart,zerosecend = getsecaddr(datain,0)
|
||||||
|
secstart,secend = getsecaddr(datain,secno)
|
||||||
|
dif = len(secdata) - (secend - secstart)
|
||||||
|
datalst.append(datain[:unique_id_seed])
|
||||||
|
datalst.append(struct.pack(b'>L',2*nsec+1))
|
||||||
|
datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
|
||||||
|
datalst.append(struct.pack(b'>H',nsec))
|
||||||
|
newstart = zerosecstart
|
||||||
|
for i in range(0,secno):
|
||||||
|
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
|
||||||
|
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
|
||||||
|
datalst.append(struct.pack(b'>L', secstart) + struct.pack(b'>L', (2*secno)))
|
||||||
|
for i in range(secno+1,nsec):
|
||||||
|
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
|
||||||
|
ofs = ofs + dif
|
||||||
|
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
|
||||||
|
lpad = newstart - (first_pdb_record + 8*nsec)
|
||||||
|
if lpad > 0:
|
||||||
|
datalst.append(b'\0' * lpad)
|
||||||
|
datalst.append(datain[zerosecstart:secstart])
|
||||||
|
datalst.append(secdata)
|
||||||
|
datalst.append(datain[secend:])
|
||||||
|
dataout = b''.join(datalst)
|
||||||
|
return dataout
|
||||||
|
|
||||||
|
def nullsection(datain,secno): # make it zero-length without deleting it
|
||||||
|
datalst = []
|
||||||
|
nsec = getint(datain,number_of_pdb_records,b'H')
|
||||||
|
secstart, secend = getsecaddr(datain,secno)
|
||||||
|
zerosecstart, zerosecend = getsecaddr(datain, 0)
|
||||||
|
dif = secend-secstart
|
||||||
|
datalst.append(datain[:first_pdb_record])
|
||||||
|
for i in range(0,secno+1):
|
||||||
|
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
|
||||||
|
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
|
||||||
|
for i in range(secno+1, nsec):
|
||||||
|
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
|
||||||
|
ofs = ofs - dif
|
||||||
|
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
|
||||||
|
lpad = zerosecstart - (first_pdb_record + 8*nsec)
|
||||||
|
if lpad > 0:
|
||||||
|
datalst.append(b'\0' * lpad)
|
||||||
|
datalst.append(datain[zerosecstart: secstart])
|
||||||
|
datalst.append(datain[secend:])
|
||||||
|
dataout = b''.join(datalst)
|
||||||
|
return dataout
|
||||||
|
|
||||||
|
def deletesectionrange(datain,firstsec,lastsec): # delete a range of sections
|
||||||
|
datalst = []
|
||||||
|
firstsecstart,firstsecend = getsecaddr(datain,firstsec)
|
||||||
|
lastsecstart,lastsecend = getsecaddr(datain,lastsec)
|
||||||
|
zerosecstart, zerosecend = getsecaddr(datain, 0)
|
||||||
|
dif = lastsecend - firstsecstart + 8*(lastsec-firstsec+1)
|
||||||
|
nsec = getint(datain,number_of_pdb_records,b'H')
|
||||||
|
datalst.append(datain[:unique_id_seed])
|
||||||
|
datalst.append(struct.pack(b'>L',2*(nsec-(lastsec-firstsec+1))+1))
|
||||||
|
datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
|
||||||
|
datalst.append(struct.pack(b'>H',nsec-(lastsec-firstsec+1)))
|
||||||
|
newstart = zerosecstart - 8*(lastsec-firstsec+1)
|
||||||
|
for i in range(0,firstsec):
|
||||||
|
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
|
||||||
|
ofs = ofs-8*(lastsec-firstsec+1)
|
||||||
|
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
|
||||||
|
for i in range(lastsec+1,nsec):
|
||||||
|
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
|
||||||
|
ofs = ofs - dif
|
||||||
|
flgval = 2*(i-(lastsec-firstsec+1))
|
||||||
|
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
|
||||||
|
lpad = newstart - (first_pdb_record + 8*(nsec - (lastsec - firstsec + 1)))
|
||||||
|
if lpad > 0:
|
||||||
|
datalst.append(b'\0' * lpad)
|
||||||
|
datalst.append(datain[zerosecstart:firstsecstart])
|
||||||
|
datalst.append(datain[lastsecend:])
|
||||||
|
dataout = b''.join(datalst)
|
||||||
|
return dataout
|
||||||
|
|
||||||
|
def insertsection(datain,secno,secdata): # insert a new section
|
||||||
|
datalst = []
|
||||||
|
nsec = getint(datain,number_of_pdb_records,b'H')
|
||||||
|
# print("inserting secno" , secno, "into" ,nsec, "sections")
|
||||||
|
secstart,secend = getsecaddr(datain,secno)
|
||||||
|
zerosecstart,zerosecend = getsecaddr(datain,0)
|
||||||
|
dif = len(secdata)
|
||||||
|
datalst.append(datain[:unique_id_seed])
|
||||||
|
datalst.append(struct.pack(b'>L',2*(nsec+1)+1))
|
||||||
|
datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
|
||||||
|
datalst.append(struct.pack(b'>H',nsec+1))
|
||||||
|
newstart = zerosecstart + 8
|
||||||
|
for i in range(0,secno):
|
||||||
|
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
|
||||||
|
ofs += 8
|
||||||
|
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
|
||||||
|
datalst.append(struct.pack(b'>L', secstart + 8) + struct.pack(b'>L', (2*secno)))
|
||||||
|
for i in range(secno,nsec):
|
||||||
|
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
|
||||||
|
ofs = ofs + dif + 8
|
||||||
|
flgval = 2*(i+1)
|
||||||
|
datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
|
||||||
|
lpad = newstart - (first_pdb_record + 8*(nsec + 1))
|
||||||
|
if lpad > 0:
|
||||||
|
datalst.append(b'\0' * lpad)
|
||||||
|
datalst.append(datain[zerosecstart:secstart])
|
||||||
|
datalst.append(secdata)
|
||||||
|
datalst.append(datain[secstart:])
|
||||||
|
dataout = b''.join(datalst)
|
||||||
|
return dataout
|
||||||
|
|
||||||
|
|
||||||
|
def insertsectionrange(sectionsource,firstsec,lastsec,sectiontarget,targetsec): # insert a range of sections
|
||||||
|
# print("inserting secno" , firstsec, "to", lastsec, "into" ,targetsec, "sections")
|
||||||
|
# dataout = sectiontarget
|
||||||
|
# for idx in range(lastsec,firstsec-1,-1):
|
||||||
|
# dataout = insertsection(dataout,targetsec,readsection(sectionsource,idx))
|
||||||
|
# return dataout
|
||||||
|
datalst = []
|
||||||
|
nsec = getint(sectiontarget,number_of_pdb_records,b'H')
|
||||||
|
zerosecstart, zerosecend = getsecaddr(sectiontarget,0)
|
||||||
|
insstart, nul = getsecaddr(sectiontarget,targetsec)
|
||||||
|
nins = lastsec - firstsec + 1
|
||||||
|
srcstart, nul = getsecaddr(sectionsource,firstsec)
|
||||||
|
nul, srcend = getsecaddr(sectionsource,lastsec)
|
||||||
|
newstart = zerosecstart + 8*nins
|
||||||
|
|
||||||
|
datalst.append(sectiontarget[:unique_id_seed])
|
||||||
|
datalst.append(struct.pack(b'>L',2*(nsec+nins)+1))
|
||||||
|
datalst.append(sectiontarget[unique_id_seed+4:number_of_pdb_records])
|
||||||
|
datalst.append(struct.pack(b'>H',nsec+nins))
|
||||||
|
for i in range(0,targetsec):
|
||||||
|
ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8)
|
||||||
|
ofsnew = ofs + 8*nins
|
||||||
|
flgvalnew = flgval
|
||||||
|
datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew))
|
||||||
|
# print(ofsnew, flgvalnew, ofs, flgval)
|
||||||
|
srcstart0, nul = getsecaddr(sectionsource,firstsec)
|
||||||
|
for i in range(nins):
|
||||||
|
isrcstart, nul = getsecaddr(sectionsource,firstsec+i)
|
||||||
|
ofsnew = insstart + (isrcstart-srcstart0) + 8*nins
|
||||||
|
flgvalnew = 2*(targetsec+i)
|
||||||
|
datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew))
|
||||||
|
# print(ofsnew, flgvalnew)
|
||||||
|
dif = srcend - srcstart
|
||||||
|
for i in range(targetsec,nsec):
|
||||||
|
ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8)
|
||||||
|
ofsnew = ofs + dif + 8*nins
|
||||||
|
flgvalnew = 2*(i+nins)
|
||||||
|
datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L',flgvalnew))
|
||||||
|
# print(ofsnew, flgvalnew, ofs, flgval)
|
||||||
|
lpad = newstart - (first_pdb_record + 8*(nsec + nins))
|
||||||
|
if lpad > 0:
|
||||||
|
datalst.append(b'\0' * lpad)
|
||||||
|
datalst.append(sectiontarget[zerosecstart:insstart])
|
||||||
|
datalst.append(sectionsource[srcstart:srcend])
|
||||||
|
datalst.append(sectiontarget[insstart:])
|
||||||
|
dataout = b''.join(datalst)
|
||||||
|
return dataout
|
||||||
|
|
||||||
|
def get_exth_params(rec0):
|
||||||
|
ebase = mobi_header_base + getint(rec0,mobi_header_length)
|
||||||
|
elen = getint(rec0,ebase+4)
|
||||||
|
enum = getint(rec0,ebase+8)
|
||||||
|
return ebase,elen,enum
|
||||||
|
|
||||||
|
def add_exth(rec0,exth_num,exth_bytes):
|
||||||
|
ebase,elen,enum = get_exth_params(rec0)
|
||||||
|
newrecsize = 8+len(exth_bytes)
|
||||||
|
newrec0 = rec0[0:ebase+4]+struct.pack(b'>L',elen+newrecsize)+struct.pack(b'>L',enum+1)+\
|
||||||
|
struct.pack(b'>L',exth_num)+struct.pack(b'>L',newrecsize)+exth_bytes+rec0[ebase+12:]
|
||||||
|
newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+newrecsize)
|
||||||
|
return newrec0
|
||||||
|
|
||||||
|
def read_exth(rec0,exth_num):
|
||||||
|
exth_values = []
|
||||||
|
ebase,elen,enum = get_exth_params(rec0)
|
||||||
|
ebase = ebase+12
|
||||||
|
while enum>0:
|
||||||
|
exth_id = getint(rec0,ebase)
|
||||||
|
if exth_id == exth_num:
|
||||||
|
# We might have multiple exths, so build a list.
|
||||||
|
exth_values.append(rec0[ebase+8:ebase+getint(rec0,ebase+4)])
|
||||||
|
enum = enum-1
|
||||||
|
ebase = ebase+getint(rec0,ebase+4)
|
||||||
|
return exth_values
|
||||||
|
|
||||||
|
def write_exth(rec0,exth_num,exth_bytes):
|
||||||
|
ebase,elen,enum = get_exth_params(rec0)
|
||||||
|
ebase_idx = ebase+12
|
||||||
|
enum_idx = enum
|
||||||
|
while enum_idx>0:
|
||||||
|
exth_id = getint(rec0,ebase_idx)
|
||||||
|
if exth_id == exth_num:
|
||||||
|
dif = len(exth_bytes)+8-getint(rec0,ebase_idx+4)
|
||||||
|
newrec0 = rec0
|
||||||
|
if dif != 0:
|
||||||
|
newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+dif)
|
||||||
|
return newrec0[:ebase+4]+struct.pack(b'>L',elen+len(exth_bytes)+8-getint(rec0,ebase_idx+4))+\
|
||||||
|
struct.pack(b'>L',enum)+rec0[ebase+12:ebase_idx+4]+\
|
||||||
|
struct.pack(b'>L',len(exth_bytes)+8)+exth_bytes+\
|
||||||
|
rec0[ebase_idx+getint(rec0,ebase_idx+4):]
|
||||||
|
enum_idx = enum_idx-1
|
||||||
|
ebase_idx = ebase_idx+getint(rec0,ebase_idx+4)
|
||||||
|
return rec0
|
||||||
|
|
||||||
|
def del_exth(rec0,exth_num):
|
||||||
|
ebase,elen,enum = get_exth_params(rec0)
|
||||||
|
ebase_idx = ebase+12
|
||||||
|
enum_idx = 0
|
||||||
|
while enum_idx < enum:
|
||||||
|
exth_id = getint(rec0,ebase_idx)
|
||||||
|
exth_size = getint(rec0,ebase_idx+4)
|
||||||
|
if exth_id == exth_num:
|
||||||
|
newrec0 = rec0
|
||||||
|
newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)-exth_size)
|
||||||
|
newrec0 = newrec0[:ebase_idx]+newrec0[ebase_idx+exth_size:]
|
||||||
|
newrec0 = newrec0[0:ebase+4]+struct.pack(b'>L',elen-exth_size)+struct.pack(b'>L',enum-1)+newrec0[ebase+12:]
|
||||||
|
return newrec0
|
||||||
|
enum_idx += 1
|
||||||
|
ebase_idx = ebase_idx+exth_size
|
||||||
|
return rec0
|
||||||
|
|
||||||
|
|
||||||
|
class mobi_split:
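# Splits a combined mobi7/KF8 ("dual") file into a standalone mobi7 part and a standalone
# mobi8 (KF8) part; the EXTH 121 record marks the KF8 boundary section, and the shared
# image/font resources are copied into the mobi8 part.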
|
||||||
|
|
||||||
|
def __init__(self, infile):
|
||||||
|
datain = b''
|
||||||
|
with open(pathof(infile), 'rb') as f:
|
||||||
|
datain = f.read()
|
||||||
|
datain_rec0 = readsection(datain,0)
|
||||||
|
ver = getint(datain_rec0,mobi_version)
|
||||||
|
self.combo = (ver!=8)
|
||||||
|
if not self.combo:
|
||||||
|
return
|
||||||
|
exth121 = read_exth(datain_rec0,121)
|
||||||
|
if len(exth121) == 0:
|
||||||
|
self.combo = False
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
# only pay attention to first exth121
|
||||||
|
# (there should only be one)
|
||||||
|
datain_kf8, = struct.unpack_from(b'>L',exth121[0],0)
|
||||||
|
if datain_kf8 == 0xffffffff:
|
||||||
|
self.combo = False
|
||||||
|
return
|
||||||
|
datain_kfrec0 = readsection(datain,datain_kf8)
|
||||||
|
|
||||||
|
# create the standalone mobi7
|
||||||
|
num_sec = getint(datain,number_of_pdb_records,b'H')
|
||||||
|
# remove BOUNDARY up to but not including ELF record
|
||||||
|
self.result_file7 = deletesectionrange(datain,datain_kf8-1,num_sec-2)
|
||||||
|
# check if there are SRCS records and delete them
|
||||||
|
srcs = getint(datain_rec0,srcs_index)
|
||||||
|
num_srcs = getint(datain_rec0,srcs_count)
|
||||||
|
if srcs != 0xffffffff and num_srcs > 0:
|
||||||
|
self.result_file7 = deletesectionrange(self.result_file7,srcs,srcs+num_srcs-1)
|
||||||
|
datain_rec0 = writeint(datain_rec0,srcs_index,0xffffffff)
|
||||||
|
datain_rec0 = writeint(datain_rec0,srcs_count,0)
|
||||||
|
# reset the EXTH 121 KF8 Boundary meta data to 0xffffffff
|
||||||
|
datain_rec0 = write_exth(datain_rec0,121, struct.pack(b'>L', 0xffffffff))
|
||||||
|
# datain_rec0 = del_exth(datain_rec0,121)
|
||||||
|
# datain_rec0 = del_exth(datain_rec0,534)
|
||||||
|
# don't remove the EXTH 125 KF8 Count of Resources, seems to be present in mobi6 files as well
|
||||||
|
# set the EXTH 129 KF8 Masthead / Cover Image string to the null string
|
||||||
|
datain_rec0 = write_exth(datain_rec0,129, b'')
|
||||||
|
# don't remove the EXTH 131 KF8 Unidentified Count, seems to be present in mobi6 files as well
|
||||||
|
|
||||||
|
# need to reset flags stored in 0x80-0x83
|
||||||
|
# old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
|
||||||
|
# Bit Flags
|
||||||
|
# 0x1000 = Bit 12 indicates if embedded fonts are used or not
|
||||||
|
# 0x0800 = means this Header points to *shared* images/resource/fonts ??
|
||||||
|
# 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
|
||||||
|
# 0x0040 = exth exists
|
||||||
|
# 0x0010 = Not sure but this is always set so far
|
||||||
|
fval, = struct.unpack_from(b'>L',datain_rec0, 0x80)
|
||||||
|
# need to remove flag 0x0800 for KindlePreviewer 2.8 and unset Bit 12 for embedded fonts
|
||||||
|
fval = fval & 0x07FF
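# e.g. a mobi7 part with exth flags 0x1850 becomes 0x1850 & 0x07FF = 0x0050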
|
||||||
|
datain_rec0 = datain_rec0[:0x80] + struct.pack(b'>L',fval) + datain_rec0[0x84:]
|
||||||
|
|
||||||
|
self.result_file7 = writesection(self.result_file7,0,datain_rec0)
|
||||||
|
|
||||||
|
# no need to replace kf8 style fcis with mobi 7 one
|
||||||
|
# fcis_secnum, = struct.unpack_from(b'>L',datain_rec0, 0xc8)
|
||||||
|
# if fcis_secnum != 0xffffffff:
|
||||||
|
# fcis_info = readsection(datain, fcis_secnum)
|
||||||
|
# text_len, = struct.unpack_from(b'>L', fcis_info, 0x14)
|
||||||
|
# new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
|
||||||
|
# new_fcis += struct.pack(b'>L',text_len)
|
||||||
|
# new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
|
||||||
|
# self.result_file7 = writesection(self.result_file7, fcis_secnum, new_fcis)
|
||||||
|
|
||||||
|
firstimage = getint(datain_rec0,first_resc_record)
|
||||||
|
lastimage = getint(datain_rec0,last_content_index,b'H')
|
||||||
|
# print("Old First Image, last Image", firstimage,lastimage)
|
||||||
|
if lastimage == 0xffff:
|
||||||
|
# find the lowest of the next sections and copy up to that.
|
||||||
|
ofs_list = [(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')]
|
||||||
|
for ofs,sz in ofs_list:
|
||||||
|
n = getint(datain_rec0,ofs,sz)
|
||||||
|
# print("n",n)
|
||||||
|
if n > 0 and n < lastimage:
|
||||||
|
lastimage = n-1
|
||||||
|
print("First Image, last Image", firstimage,lastimage)
|
||||||
|
|
||||||
|
# Try to null out FONT and RESC records, but leave the (empty) PDB record so image refs remain valid
|
||||||
|
for i in range(firstimage,lastimage):
|
||||||
|
imgsec = readsection(self.result_file7,i)
|
||||||
|
if imgsec[0:4] in [b'RESC',b'FONT']:
|
||||||
|
self.result_file7 = nullsection(self.result_file7,i)
|
||||||
|
|
||||||
|
# mobi7 finished
|
||||||
|
|
||||||
|
# create standalone mobi8
|
||||||
|
self.result_file8 = deletesectionrange(datain,0,datain_kf8-1)
|
||||||
|
target = getint(datain_kfrec0,first_resc_record)
|
||||||
|
self.result_file8 = insertsectionrange(datain,firstimage,lastimage,self.result_file8,target)
|
||||||
|
datain_kfrec0 = readsection(self.result_file8,0)
|
||||||
|
|
||||||
|
# Only keep the correct EXTH 116 StartOffset, KG 2.5 carries over the one from the mobi7 part, which then points at garbage in the mobi8 part, and confuses FW 3.4
|
||||||
|
kf8starts = read_exth(datain_kfrec0,116)
|
||||||
|
# If we have multiple StartOffset, keep only the last one
|
||||||
|
kf8start_count = len(kf8starts)
|
||||||
|
while kf8start_count > 1:
|
||||||
|
kf8start_count -= 1
|
||||||
|
datain_kfrec0 = del_exth(datain_kfrec0,116)
|
||||||
|
|
||||||
|
# update the EXTH 125 KF8 Count of Images/Fonts/Resources
|
||||||
|
datain_kfrec0 = write_exth(datain_kfrec0,125,struct.pack(b'>L',lastimage-firstimage+1))
|
||||||
|
|
||||||
|
# need to reset flags stored in 0x80-0x83
|
||||||
|
# old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
|
||||||
|
# standalone mobi8 with exth: 0x0050
|
||||||
|
# Bit Flags
|
||||||
|
# 0x1000 = Bit 12 indicates if embedded fonts are used or not
|
||||||
|
# 0x0800 = means this Header points to *shared* images/resource/fonts ??
|
||||||
|
# 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
|
||||||
|
# 0x0040 = exth exists
|
||||||
|
# 0x0010 = Not sure but this is always set so far
|
||||||
|
fval, = struct.unpack_from(b'>L',datain_kfrec0, 0x80)
|
||||||
|
fval = fval & 0x1FFF
|
||||||
|
fval |= 0x0800
|
||||||
|
datain_kfrec0 = datain_kfrec0[:0x80] + struct.pack(b'>L',fval) + datain_kfrec0[0x84:]
|
||||||
|
|
||||||
|
# properly update other index pointers that have been shifted by the insertion of images
|
||||||
|
ofs_list = [(kf8_fdst_index,b'L'),(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')]
|
||||||
|
for ofs,sz in ofs_list:
|
||||||
|
n = getint(datain_kfrec0,ofs,sz)
|
||||||
|
if n != 0xffffffff:
|
||||||
|
datain_kfrec0 = writeint(datain_kfrec0,ofs,n+lastimage-firstimage+1,sz)
|
||||||
|
self.result_file8 = writesection(self.result_file8,0,datain_kfrec0)
|
||||||
|
|
||||||
|
# no need to replace kf8 style fcis with mobi 7 one
|
||||||
|
# fcis_secnum, = struct.unpack_from(b'>L',datain_kfrec0, 0xc8)
|
||||||
|
# if fcis_secnum != 0xffffffff:
|
||||||
|
# fcis_info = readsection(self.result_file8, fcis_secnum)
|
||||||
|
# text_len, = struct.unpack_from(b'>L', fcis_info, 0x14)
|
||||||
|
# new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
|
||||||
|
# new_fcis += struct.pack(b'>L',text_len)
|
||||||
|
# new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
|
||||||
|
# self.result_file8 = writesection(self.result_file8, fcis_secnum, new_fcis)
|
||||||
|
|
||||||
|
# mobi8 finished
|
||||||
|
|
||||||
|
def getResult8(self):
|
||||||
|
return self.result_file8
|
||||||
|
|
||||||
|
def getResult7(self):
|
||||||
|
return self.result_file7
|
131
KindleUnpack/mobi_uncompress.py
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import PY2, bchr, lmap, bstr
|
||||||
|
|
||||||
|
if PY2:
|
||||||
|
range = xrange
|
||||||
|
|
||||||
|
import struct
|
||||||
|
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||||
|
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||||
|
|
||||||
|
|
||||||
|
class unpackException(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class UncompressedReader:
|
||||||
|
|
||||||
|
def unpack(self, data):
|
||||||
|
return data
|
||||||
|
|
||||||
|
class PalmdocReader:
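# PalmDoc decompression (an LZ77 variant): control bytes select short literal runs,
# back-references into the already-decoded output, or a space-plus-character pair.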
|
||||||
|
|
||||||
|
def unpack(self, i):
|
||||||
|
o, p = b'', 0
|
||||||
|
while p < len(i):
|
||||||
|
# for python 3 must use slice since i[p] returns int while slice returns character
|
||||||
|
c = ord(i[p:p+1])
|
||||||
|
p += 1
|
||||||
|
if (c >= 1 and c <= 8):
|
||||||
|
o += i[p:p+c]
|
||||||
|
p += c
|
||||||
|
elif (c < 128):
|
||||||
|
o += bchr(c)
|
||||||
|
elif (c >= 192):
|
||||||
|
o += b' ' + bchr(c ^ 128)
|
||||||
|
else:
|
||||||
|
if p < len(i):
|
||||||
|
c = (c << 8) | ord(i[p:p+1])
|
||||||
|
p += 1
|
||||||
|
m = (c >> 3) & 0x07ff
|
||||||
|
n = (c & 7) + 3
|
||||||
|
if (m > n):
|
||||||
|
o += o[-m:n-m]
|
||||||
|
else:
|
||||||
|
for _ in range(n):
|
||||||
|
# because of completely ass-backwards decision by python maintainers for python 3
|
||||||
|
# we must use slice for bytes as i[p] returns int while slice returns character
|
||||||
|
if m == 1:
|
||||||
|
o += o[-m:]
|
||||||
|
else:
|
||||||
|
o += o[-m:-m+1]
|
||||||
|
return o
|
||||||
|
|
||||||
|
class HuffcdicReader:
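# HUFF/CDIC decompression: loadHuff() reads the Huffman code tables, loadCdic() accumulates
# the phrase dictionary, and unpack() decodes a record, recursively expanding dictionary
# entries that have not been expanded yet.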
|
||||||
|
q = struct.Struct(b'>Q').unpack_from
|
||||||
|
|
||||||
|
def loadHuff(self, huff):
|
||||||
|
if huff[0:8] != b'HUFF\x00\x00\x00\x18':
|
||||||
|
raise unpackException('invalid huff header')
|
||||||
|
off1, off2 = struct.unpack_from(b'>LL', huff, 8)
|
||||||
|
|
||||||
|
def dict1_unpack(v):
|
||||||
|
codelen, term, maxcode = v&0x1f, v&0x80, v>>8
|
||||||
|
assert codelen != 0
|
||||||
|
if codelen <= 8:
|
||||||
|
assert term
|
||||||
|
maxcode = ((maxcode + 1) << (32 - codelen)) - 1
|
||||||
|
return (codelen, term, maxcode)
|
||||||
|
self.dict1 = lmap(dict1_unpack, struct.unpack_from(b'>256L', huff, off1))
|
||||||
|
|
||||||
|
dict2 = struct.unpack_from(b'>64L', huff, off2)
|
||||||
|
self.mincode, self.maxcode = (), ()
|
||||||
|
for codelen, mincode in enumerate((0,) + dict2[0::2]):
|
||||||
|
self.mincode += (mincode << (32 - codelen), )
|
||||||
|
for codelen, maxcode in enumerate((0,) + dict2[1::2]):
|
||||||
|
self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1, )
|
||||||
|
|
||||||
|
self.dictionary = []
|
||||||
|
|
||||||
|
def loadCdic(self, cdic):
|
||||||
|
if cdic[0:8] != b'CDIC\x00\x00\x00\x10':
|
||||||
|
raise unpackException('invalid cdic header')
|
||||||
|
phrases, bits = struct.unpack_from(b'>LL', cdic, 8)
|
||||||
|
n = min(1<<bits, phrases-len(self.dictionary))
|
||||||
|
h = struct.Struct(b'>H').unpack_from
|
||||||
|
def getslice(off):
|
||||||
|
blen, = h(cdic, 16+off)
|
||||||
|
slice = cdic[18+off:18+off+(blen&0x7fff)]
|
||||||
|
return (slice, blen&0x8000)
|
||||||
|
self.dictionary += lmap(getslice, struct.unpack_from(bstr('>%dH' % n), cdic, 16))
|
||||||
|
|
||||||
|
def unpack(self, data):
|
||||||
|
q = HuffcdicReader.q
|
||||||
|
|
||||||
|
bitsleft = len(data) * 8
|
||||||
|
data += b"\x00\x00\x00\x00\x00\x00\x00\x00"
|
||||||
|
pos = 0
|
||||||
|
x, = q(data, pos)
|
||||||
|
n = 32
|
||||||
|
|
||||||
|
s = b''
|
||||||
|
while True:
|
||||||
|
if n <= 0:
|
||||||
|
pos += 4
|
||||||
|
x, = q(data, pos)
|
||||||
|
n += 32
|
||||||
|
code = (x >> n) & ((1 << 32) - 1)
|
||||||
|
|
||||||
|
codelen, term, maxcode = self.dict1[code >> 24]
|
||||||
|
if not term:
|
||||||
|
while code < self.mincode[codelen]:
|
||||||
|
codelen += 1
|
||||||
|
maxcode = self.maxcode[codelen]
|
||||||
|
|
||||||
|
n -= codelen
|
||||||
|
bitsleft -= codelen
|
||||||
|
if bitsleft < 0:
|
||||||
|
break
|
||||||
|
|
||||||
|
r = (maxcode - code) >> (32 - codelen)
|
||||||
|
slice, flag = self.dictionary[r]
|
||||||
|
if not flag:
|
||||||
|
self.dictionary[r] = None
|
||||||
|
slice = self.unpack(slice)
|
||||||
|
self.dictionary[r] = (slice, 1)
|
||||||
|
s += slice
|
||||||
|
return s
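For context, a caller normally picks one of these readers from the compression code in the book's PalmDOC header (1 = uncompressed, 2 = PalmDoc, 17480 = HUFF/CDIC). A minimal sketch follows; make_reader and the huff_records packaging are illustrative helpers, not part of this module:

def make_reader(compression, huff_records=None):
    # huff_records: the raw HUFF record followed by its CDIC records (HUFF/CDIC books only)
    if compression == 1:            # no compression
        return UncompressedReader()
    if compression == 2:            # PalmDoc LZ77-style compression
        return PalmdocReader()
    if compression == 17480:        # 0x4448 ('DH') -> HUFF/CDIC
        reader = HuffcdicReader()
        reader.loadHuff(huff_records[0])
        for cdic in huff_records[1:]:
            reader.loadCdic(cdic)
        return reader
    raise unpackException('unknown compression type %d' % compression)

Each raw text record would then be decoded with reader.unpack(record_data).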
|
191
KindleUnpack/mobi_utils.py
Normal file
@@ -0,0 +1,191 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
# flake8: noqa
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import PY2, text_type, bchr, bord
|
||||||
|
|
||||||
|
import binascii
|
||||||
|
|
||||||
|
if PY2:
|
||||||
|
range = xrange
|
||||||
|
|
||||||
|
from itertools import cycle
|
||||||
|
|
||||||
|
def getLanguage(langID, sublangID):
|
||||||
|
mobilangdict = {
|
||||||
|
54 : {0 : 'af'}, # Afrikaans
|
||||||
|
28 : {0 : 'sq'}, # Albanian
|
||||||
|
1 : {0 : 'ar' , 5 : 'ar-dz' , 15 : 'ar-bh' , 3 : 'ar-eg' , 2 : 'ar-iq', 11 : 'ar-jo' , 13 : 'ar-kw' , 12 : 'ar-lb' , 4: 'ar-ly',
|
||||||
|
6 : 'ar-ma' , 8 : 'ar-om' , 16 : 'ar-qa' , 1 : 'ar-sa' , 10 : 'ar-sy' , 7 : 'ar-tn' , 14 : 'ar-ae' , 9 : 'ar-ye'},
|
||||||
|
# Arabic, Arabic (Algeria), Arabic (Bahrain), Arabic (Egypt), Arabic
|
||||||
|
# (Iraq), Arabic (Jordan), Arabic (Kuwait), Arabic (Lebanon), Arabic
|
||||||
|
# (Libya), Arabic (Morocco), Arabic (Oman), Arabic (Qatar), Arabic
|
||||||
|
# (Saudi Arabia), Arabic (Syria), Arabic (Tunisia), Arabic (United Arab
|
||||||
|
# Emirates), Arabic (Yemen)
|
||||||
|
43 : {0 : 'hy'}, # Armenian
|
||||||
|
77 : {0 : 'as'}, # Assamese
|
||||||
|
44 : {0 : 'az'}, # "Azeri" (IANA: Azerbaijani)
|
||||||
|
45 : {0 : 'eu'}, # Basque
|
||||||
|
35 : {0 : 'be'}, # Belarusian
|
||||||
|
69 : {0 : 'bn'}, # Bengali
|
||||||
|
2 : {0 : 'bg'}, # Bulgarian
|
||||||
|
3 : {0 : 'ca'}, # Catalan
|
||||||
|
4 : {0 : 'zh' , 3 : 'zh-hk' , 2 : 'zh-cn' , 4 : 'zh-sg' , 1 : 'zh-tw'},
|
||||||
|
# Chinese, Chinese (Hong Kong), Chinese (PRC), Chinese (Singapore), Chinese (Taiwan)
|
||||||
|
26 : {0 : 'hr', 3 : 'sr'}, # Croatian, Serbian
|
||||||
|
5 : {0 : 'cs'}, # Czech
|
||||||
|
6 : {0 : 'da'}, # Danish
|
||||||
|
19 : {0: 'nl', 1 : 'nl' , 2 : 'nl-be'}, # Dutch / Flemish, Dutch (Belgium)
|
||||||
|
9 : {0: 'en', 1 : 'en' , 3 : 'en-au' , 40 : 'en-bz' , 4 : 'en-ca' , 6 : 'en-ie' , 8 : 'en-jm' , 5 : 'en-nz' , 13 : 'en-ph' ,
|
||||||
|
7 : 'en-za' , 11 : 'en-tt' , 2 : 'en-gb', 1 : 'en-us' , 12 : 'en-zw'},
|
||||||
|
# English, English (Australia), English (Belize), English (Canada),
|
||||||
|
# English (Ireland), English (Jamaica), English (New Zealand), English
|
||||||
|
# (Philippines), English (South Africa), English (Trinidad), English
|
||||||
|
# (United Kingdom), English (United States), English (Zimbabwe)
|
||||||
|
37 : {0 : 'et'}, # Estonian
|
||||||
|
56 : {0 : 'fo'}, # Faroese
|
||||||
|
41 : {0 : 'fa'}, # Farsi / Persian
|
||||||
|
11 : {0 : 'fi'}, # Finnish
|
||||||
|
12 : {0 : 'fr', 1 : 'fr' , 2 : 'fr-be' , 3 : 'fr-ca' , 5 : 'fr-lu' , 6 : 'fr-mc' , 4 : 'fr-ch'},
|
||||||
|
# French, French (Belgium), French (Canada), French (Luxembourg), French (Monaco), French (Switzerland)
|
||||||
|
55 : {0 : 'ka'}, # Georgian
|
||||||
|
7 : {0 : 'de', 1 : 'de' , 3 : 'de-at' , 5 : 'de-li' , 4 : 'de-lu' , 2 : 'de-ch'},
|
||||||
|
# German, German (Austria), German (Liechtenstein), German (Luxembourg), German (Switzerland)
|
||||||
|
8 : {0 : 'el'}, # Greek, Modern (1453-)
|
||||||
|
71 : {0 : 'gu'}, # Gujarati
|
||||||
|
13 : {0 : 'he'}, # Hebrew (also code 'iw'?)
|
||||||
|
57 : {0 : 'hi'}, # Hindi
|
||||||
|
14 : {0 : 'hu'}, # Hungarian
|
||||||
|
15 : {0 : 'is'}, # Icelandic
|
||||||
|
33 : {0 : 'id'}, # Indonesian
|
||||||
|
16 : {0 : 'it', 1 : 'it' , 2 : 'it-ch'}, # Italian, Italian (Switzerland)
|
||||||
|
17 : {0 : 'ja'}, # Japanese
|
||||||
|
75 : {0 : 'kn'}, # Kannada
|
||||||
|
63 : {0 : 'kk'}, # Kazakh
|
||||||
|
87 : {0 : 'x-kok'}, # Konkani (real language code is 'kok'?)
|
||||||
|
18 : {0 : 'ko'}, # Korean
|
||||||
|
38 : {0 : 'lv'}, # Latvian
|
||||||
|
39 : {0 : 'lt'}, # Lithuanian
|
||||||
|
47 : {0 : 'mk'}, # Macedonian
|
||||||
|
62 : {0 : 'ms'}, # Malay
|
||||||
|
76 : {0 : 'ml'}, # Malayalam
|
||||||
|
58 : {0 : 'mt'}, # Maltese
|
||||||
|
78 : {0 : 'mr'}, # Marathi
|
||||||
|
97 : {0 : 'ne'}, # Nepali
|
||||||
|
20 : {0 : 'no'}, # Norwegian
|
||||||
|
72 : {0 : 'or'}, # Oriya
|
||||||
|
21 : {0 : 'pl'}, # Polish
|
||||||
|
22 : {0 : 'pt', 2 : 'pt' , 1 : 'pt-br'}, # Portuguese, Portuguese (Brazil)
|
||||||
|
70 : {0 : 'pa'}, # Punjabi
|
||||||
|
23 : {0 : 'rm'}, # "Rhaeto-Romanic" (IANA: Romansh)
|
||||||
|
24 : {0 : 'ro'}, # Romanian
|
||||||
|
25 : {0 : 'ru'}, # Russian
|
||||||
|
59 : {0 : 'sz'}, # "Sami (Lappish)" (not an IANA language code)
|
||||||
|
# IANA code for "Northern Sami" is 'se'
|
||||||
|
# 'SZ' is the IANA region code for Swaziland
|
||||||
|
79 : {0 : 'sa'}, # Sanskrit
|
||||||
|
27 : {0 : 'sk'}, # Slovak
|
||||||
|
36 : {0 : 'sl'}, # Slovenian
|
||||||
|
46 : {0 : 'sb'}, # "Sorbian" (not an IANA language code)
|
||||||
|
# 'SB' is IANA region code for 'Solomon Islands'
|
||||||
|
# Lower Sorbian = 'dsb'
|
||||||
|
# Upper Sorbian = 'hsb'
|
||||||
|
# Sorbian Languages = 'wen'
|
||||||
|
10 : {0 : 'es' , 4 : 'es' , 44 : 'es-ar' , 64 : 'es-bo' , 52 : 'es-cl' , 36 : 'es-co' , 20 : 'es-cr' , 28 : 'es-do' ,
|
||||||
|
48 : 'es-ec' , 68 : 'es-sv' , 16 : 'es-gt' , 72 : 'es-hn' , 8 : 'es-mx' , 76 : 'es-ni' , 24 : 'es-pa' ,
|
||||||
|
60 : 'es-py' , 40 : 'es-pe' , 80 : 'es-pr' , 56 : 'es-uy' , 32 : 'es-ve'},
|
||||||
|
# Spanish, Spanish (Mobipocket bug?), Spanish (Argentina), Spanish
|
||||||
|
# (Bolivia), Spanish (Chile), Spanish (Colombia), Spanish (Costa Rica),
|
||||||
|
# Spanish (Dominican Republic), Spanish (Ecuador), Spanish (El
|
||||||
|
# Salvador), Spanish (Guatemala), Spanish (Honduras), Spanish (Mexico),
|
||||||
|
# Spanish (Nicaragua), Spanish (Panama), Spanish (Paraguay), Spanish
|
||||||
|
# (Peru), Spanish (Puerto Rico), Spanish (Uruguay), Spanish (Venezuela)
|
||||||
|
48 : {0 : 'sx'}, # "Sutu" (not an IANA language code)
|
||||||
|
# "Sutu" is another name for "Southern Sotho"?
|
||||||
|
# IANA code for "Southern Sotho" is 'st'
|
||||||
|
65 : {0 : 'sw'}, # Swahili
|
||||||
|
29 : {0 : 'sv' , 1 : 'sv' , 8 : 'sv-fi'}, # Swedish, Swedish (Finland)
|
||||||
|
73 : {0 : 'ta'}, # Tamil
|
||||||
|
68 : {0 : 'tt'}, # Tatar
|
||||||
|
74 : {0 : 'te'}, # Telugu
|
||||||
|
30 : {0 : 'th'}, # Thai
|
||||||
|
49 : {0 : 'ts'}, # Tsonga
|
||||||
|
50 : {0 : 'tn'}, # Tswana
|
||||||
|
31 : {0 : 'tr'}, # Turkish
|
||||||
|
34 : {0 : 'uk'}, # Ukrainian
|
||||||
|
32 : {0 : 'ur'}, # Urdu
|
||||||
|
67 : {0 : 'uz', 2 : 'uz'}, # Uzbek
|
||||||
|
42 : {0 : 'vi'}, # Vietnamese
|
||||||
|
52 : {0 : 'xh'}, # Xhosa
|
||||||
|
53 : {0 : 'zu'}, # Zulu
|
||||||
|
}
|
||||||
|
lang = "en"
|
||||||
|
if langID in mobilangdict:
|
||||||
|
subdict = mobilangdict[langID]
|
||||||
|
lang = subdict[0]
|
||||||
|
if sublangID in subdict:
|
||||||
|
lang = subdict[sublangID]
|
||||||
|
return lang
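A couple of sample lookups against the table above (the fallback for unknown IDs is plain 'en'):

print(getLanguage(9, 2))    # 'en-gb'  -> English (United Kingdom)
print(getLanguage(12, 3))   # 'fr-ca'  -> French (Canada)
print(getLanguage(999, 0))  # 'en'     -> unknown langID falls back to English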
|
||||||
|
|
||||||
|
|
||||||
|
def toHex(byteList):
|
||||||
|
return binascii.hexlify(byteList)
|
||||||
|
|
||||||
|
# returns base32 bytestring
|
||||||
|
def toBase32(value, npad=4):
|
||||||
|
digits = b'0123456789ABCDEFGHIJKLMNOPQRSTUV'
|
||||||
|
num_string=b''
|
||||||
|
current = value
|
||||||
|
while current != 0:
|
||||||
|
next, remainder = divmod(current, 32)
|
||||||
|
rem_string = digits[remainder:remainder+1]
|
||||||
|
num_string = rem_string + num_string
|
||||||
|
current=next
|
||||||
|
if num_string == b'':
|
||||||
|
num_string = b'0'
|
||||||
|
pad = npad - len(num_string)
|
||||||
|
if pad > 0:
|
||||||
|
num_string = b'0' * pad + num_string
|
||||||
|
return num_string
|
||||||
|
|
||||||
|
|
||||||
|
# converts base32 string to value
|
||||||
|
def fromBase32(str_num):
|
||||||
|
if isinstance(str_num, text_type):
|
||||||
|
str_num = str_num.encode('latin-1')
|
||||||
|
scalelst = [1,32,1024,32768,1048576,33554432,1073741824,34359738368]
|
||||||
|
value = 0
|
||||||
|
j = 0
|
||||||
|
n = len(str_num)
|
||||||
|
scale = 0
|
||||||
|
for i in range(n):
|
||||||
|
c = str_num[n-i-1:n-i]
|
||||||
|
if c in b'0123456789':
|
||||||
|
v = ord(c) - ord(b'0')
|
||||||
|
else:
|
||||||
|
v = ord(c) - ord(b'A') + 10
|
||||||
|
if j < len(scalelst):
|
||||||
|
scale = scalelst[j]
|
||||||
|
else:
|
||||||
|
scale = scale * 32
|
||||||
|
j += 1
|
||||||
|
if v != 0:
|
||||||
|
value = value + (v * scale)
|
||||||
|
return value
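toBase32 and fromBase32 are inverses of each other; a short round trip with arbitrarily chosen values:

print(toBase32(1))          # b'0001'  (left-padded to npad=4 digits)
print(toBase32(1000))       # b'00V8'
print(fromBase32(b'00V8'))  # 1000
print(fromBase32('V8'))     # 1000    (text input is encoded to bytes first)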
|
||||||
|
|
||||||
|
|
||||||
|
# note: if decode a bytestring using 'latin-1' (or any other 0-255 encoding)
|
||||||
|
# in place of ascii you will get a byte to half-word or integer
|
||||||
|
# one to one mapping of values from 0 - 255
|
||||||
|
|
||||||
|
def mangle_fonts(encryption_key, data):
|
||||||
|
if isinstance(encryption_key, text_type):
|
||||||
|
encryption_key = encryption_key.encode('latin-1')
|
||||||
|
crypt = data[:1024]
|
||||||
|
key = cycle(iter(map(bord, encryption_key)))
|
||||||
|
# encrypt = ''.join([chr(ord(x)^key.next()) for x in crypt])
|
||||||
|
encrypt = b''.join([bchr(bord(x)^next(key)) for x in crypt])
|
||||||
|
return encrypt + data[1024:]
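Since the obfuscation is a plain XOR over the first 1024 bytes, applying mangle_fonts twice with the same key restores the original data; the key and font bytes below are placeholders:

key = '0123456789abcdef0123456789abcdef'     # e.g. the hex digits of the epub uid
font = b'\x00\x01\x00\x00' + b'\x00' * 2048  # stand-in for real font data
obfuscated = mangle_fonts(key, font)
assert mangle_fonts(key, obfuscated) == font  # XOR is its own inverse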
|
525
KindleUnpack/mobiml2xhtml.py
Executable file
@@ -0,0 +1,525 @@
|
|||||||
|
#! /usr/bin/python
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
|
||||||
|
# this program works in concert with the output from KindleUnpack
|
||||||
|
|
||||||
|
'''
|
||||||
|
Convert from Mobi ML to XHTML
|
||||||
|
'''
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
|
||||||
|
SPECIAL_HANDLING_TAGS = {
|
||||||
|
'?xml' : ('xmlheader', -1),
|
||||||
|
'!--' : ('comment', -3),
|
||||||
|
'!DOCTYPE' : ('doctype', -1),
|
||||||
|
}
|
||||||
|
|
||||||
|
SPECIAL_HANDLING_TYPES = ['xmlheader', 'doctype', 'comment']
|
||||||
|
|
||||||
|
SELF_CLOSING_TAGS = ['br' , 'hr', 'input', 'img', 'image', 'meta', 'spacer', 'link', 'frame', 'base', 'col', 'reference']
|
||||||
|
|
||||||
|
class MobiMLConverter(object):
|
||||||
|
|
||||||
|
PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
|
||||||
|
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
|
||||||
|
|
||||||
|
def __init__(self, filename):
|
||||||
|
self.base_css_rules = 'blockquote { margin: 0em 0em 0em 1.25em }\n'
|
||||||
|
self.base_css_rules += 'p { margin: 0em }\n'
|
||||||
|
self.base_css_rules += '.bold { font-weight: bold }\n'
|
||||||
|
self.base_css_rules += '.italic { font-style: italic }\n'
|
||||||
|
self.base_css_rules += '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n'
|
||||||
|
self.tag_css_rules = {}
|
||||||
|
self.tag_css_rule_cnt = 0
|
||||||
|
self.path = []
|
||||||
|
self.filename = filename
|
||||||
|
self.wipml = open(self.filename, 'rb').read()
|
||||||
|
self.pos = 0
|
||||||
|
self.opfname = self.filename.rsplit('.',1)[0] + '.opf'
|
||||||
|
self.opos = 0
|
||||||
|
self.meta = ''
|
||||||
|
self.cssname = os.path.join(os.path.dirname(self.filename),'styles.css')
|
||||||
|
self.current_font_size = 3
|
||||||
|
self.font_history = []
|
||||||
|
|
||||||
|
def cleanup_html(self):
|
||||||
|
self.wipml = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.wipml)
|
||||||
|
self.wipml = self.wipml.replace('\r\n', '\n')
|
||||||
|
self.wipml = self.wipml.replace('> <', '>\n<')
|
||||||
|
self.wipml = self.wipml.replace('<mbp: ', '<mbp:')
|
||||||
|
# self.wipml = re.sub(r'<?xml[^>]*>', '', self.wipml)
|
||||||
|
self.wipml = self.wipml.replace('<br></br>','<br/>')
|
||||||
|
|
||||||
|
def replace_page_breaks(self):
|
||||||
|
self.wipml = self.PAGE_BREAK_PAT.sub(
|
||||||
|
'<div class="mbp_pagebreak" />',
|
||||||
|
self.wipml)
|
||||||
|
|
||||||
|
# parse leading text of ml and tag
|
||||||
|
def parseml(self):
|
||||||
|
p = self.pos
|
||||||
|
if p >= len(self.wipml):
|
||||||
|
return None
|
||||||
|
if self.wipml[p] != '<':
|
||||||
|
res = self.wipml.find('<',p)
|
||||||
|
if res == -1 :
|
||||||
|
res = len(self.wipml)
|
||||||
|
self.pos = res
|
||||||
|
return self.wipml[p:res], None
|
||||||
|
# handle comment as a special case to deal with multi-line comments
|
||||||
|
if self.wipml[p:p+4] == '<!--':
|
||||||
|
te = self.wipml.find('-->',p+1)
|
||||||
|
if te != -1:
|
||||||
|
te = te+2
|
||||||
|
else :
|
||||||
|
te = self.wipml.find('>',p+1)
|
||||||
|
ntb = self.wipml.find('<',p+1)
|
||||||
|
if ntb != -1 and ntb < te:
|
||||||
|
self.pos = ntb
|
||||||
|
return self.wipml[p:ntb], None
|
||||||
|
self.pos = te + 1
|
||||||
|
return None, self.wipml[p:te+1]
|
||||||
|
|
||||||
|
# parses string version of tag to identify its name,
|
||||||
|
# its type 'begin', 'end' or 'single',
|
||||||
|
# plus build a hashtable of its attributes
|
||||||
|
# code is written to handle the possibility of very poor formatting
|
||||||
|
def parsetag(self, s):
|
||||||
|
p = 1
|
||||||
|
# get the tag name
|
||||||
|
tname = None
|
||||||
|
ttype = None
|
||||||
|
tattr = {}
|
||||||
|
while s[p:p+1] == ' ' :
|
||||||
|
p += 1
|
||||||
|
if s[p:p+1] == '/':
|
||||||
|
ttype = 'end'
|
||||||
|
p += 1
|
||||||
|
while s[p:p+1] == ' ' :
|
||||||
|
p += 1
|
||||||
|
b = p
|
||||||
|
while s[p:p+1] not in ('>', '/', ' ', '"', "'", "\r", "\n") :
|
||||||
|
p += 1
|
||||||
|
tname=s[b:p].lower()
|
||||||
|
if tname == '!doctype':
|
||||||
|
tname = '!DOCTYPE'
|
||||||
|
# special cases
|
||||||
|
if tname in SPECIAL_HANDLING_TAGS.keys():
|
||||||
|
ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
|
||||||
|
tattr['special'] = s[p:backstep]
|
||||||
|
if ttype is None:
|
||||||
|
# parse any attributes
|
||||||
|
while s.find('=',p) != -1 :
|
||||||
|
while s[p:p+1] == ' ' :
|
||||||
|
p += 1
|
||||||
|
b = p
|
||||||
|
while s[p:p+1] != '=' :
|
||||||
|
p += 1
|
||||||
|
aname = s[b:p].lower()
|
||||||
|
aname = aname.rstrip(' ')
|
||||||
|
p += 1
|
||||||
|
while s[p:p+1] == ' ' :
|
||||||
|
p += 1
|
||||||
|
if s[p:p+1] in ('"', "'") :
|
||||||
|
p = p + 1
|
||||||
|
b = p
|
||||||
|
while s[p:p+1] not in ('"', "'") :
|
||||||
|
p += 1
|
||||||
|
val = s[b:p]
|
||||||
|
p += 1
|
||||||
|
else :
|
||||||
|
b = p
|
||||||
|
while s[p:p+1] not in ('>', '/', ' ') :
|
||||||
|
p += 1
|
||||||
|
val = s[b:p]
|
||||||
|
tattr[aname] = val
|
||||||
|
# label beginning and single tags
|
||||||
|
if ttype is None:
|
||||||
|
ttype = 'begin'
|
||||||
|
if s.find(' /',p) >= 0:
|
||||||
|
ttype = 'single_ext'
|
||||||
|
elif s.find('/',p) >= 0:
|
||||||
|
ttype = 'single'
|
||||||
|
return ttype, tname, tattr
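parsetag does not touch any converter state, so it can be smoke-tested through any instance; the scratch file below exists only so the constructor has something to read:

with open('scratch.html', 'wb') as f:
    f.write(b'<html></html>')
mlc = MobiMLConverter('scratch.html')
print(mlc.parsetag('<a href="#filepos0000012345" />'))
# expected: ('single_ext', 'a', {'href': '#filepos0000012345'})
print(mlc.parsetag('</p>'))
# expected: ('end', 'p', {})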
|
||||||
|
|
||||||
|
# main routine to convert from mobi markup language to html
|
||||||
|
def processml(self):
|
||||||
|
|
||||||
|
# are these really needed
|
||||||
|
html_done = False
|
||||||
|
head_done = False
|
||||||
|
body_done = False
|
||||||
|
|
||||||
|
skip = False
|
||||||
|
|
||||||
|
htmlstr = ''
|
||||||
|
self.replace_page_breaks()
|
||||||
|
self.cleanup_html()
|
||||||
|
|
||||||
|
# now parse the cleaned up ml into standard xhtml
|
||||||
|
while True:
|
||||||
|
|
||||||
|
r = self.parseml()
|
||||||
|
if not r:
|
||||||
|
break
|
||||||
|
|
||||||
|
text, tag = r
|
||||||
|
|
||||||
|
if text:
|
||||||
|
if not skip:
|
||||||
|
htmlstr += text
|
||||||
|
|
||||||
|
if tag:
|
||||||
|
ttype, tname, tattr = self.parsetag(tag)
|
||||||
|
|
||||||
|
# If we run into a DTD or xml declarations inside the body ... bail.
|
||||||
|
if tname in SPECIAL_HANDLING_TAGS.keys() and tname != 'comment' and body_done:
|
||||||
|
htmlstr += '\n</body></html>'
|
||||||
|
break
|
||||||
|
|
||||||
|
# make sure self-closing tags actually self-close
|
||||||
|
if ttype == 'begin' and tname in SELF_CLOSING_TAGS:
|
||||||
|
ttype = 'single'
|
||||||
|
|
||||||
|
# make sure any end tags of self-closing tags are discarded
|
||||||
|
if ttype == 'end' and tname in SELF_CLOSING_TAGS:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# remove embedded guide and references from old mobis
|
||||||
|
if tname in ('guide', 'ncx', 'reference') and ttype in ('begin', 'single', 'single_ext'):
|
||||||
|
tname = 'removeme:{0}'.format(tname)
|
||||||
|
tattr = None
|
||||||
|
if tname in ('guide', 'ncx', 'reference', 'font', 'span') and ttype == 'end':
|
||||||
|
if self.path[-1] == 'removeme:{0}'.format(tname):
|
||||||
|
tname = 'removeme:{0}'.format(tname)
|
||||||
|
tattr = None
|
||||||
|
|
||||||
|
# Get rid of font tags that only have a color attribute.
|
||||||
|
if tname == 'font' and ttype in ('begin', 'single', 'single_ext'):
|
||||||
|
if 'color' in tattr.keys() and len(tattr.keys()) == 1:
|
||||||
|
tname = 'removeme:{0}'.format(tname)
|
||||||
|
tattr = None
|
||||||
|
|
||||||
|
# Get rid of empty spans in the markup.
|
||||||
|
if tname == 'span' and ttype in ('begin', 'single', 'single_ext') and not len(tattr):
|
||||||
|
tname = 'removeme:{0}'.format(tname)
|
||||||
|
|
||||||
|
# need to handle fonts outside of the normal methods
|
||||||
|
# so fonts tags won't be added to the self.path since we keep track
|
||||||
|
# of font tags separately with self.font_history
|
||||||
|
if tname == 'font' and ttype == 'begin':
|
||||||
|
# check for nested font start tags
|
||||||
|
if len(self.font_history) > 0 :
|
||||||
|
# inject a font end tag
|
||||||
|
taginfo = ('end', 'font', None)
|
||||||
|
htmlstr += self.processtag(taginfo)
|
||||||
|
self.font_history.append((ttype, tname, tattr))
|
||||||
|
# handle the current font start tag
|
||||||
|
taginfo = (ttype, tname, tattr)
|
||||||
|
htmlstr += self.processtag(taginfo)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# check for nested font tags and unnest them
|
||||||
|
if tname == 'font' and ttype == 'end':
|
||||||
|
self.font_history.pop()
|
||||||
|
# handle this font end tag
|
||||||
|
taginfo = ('end', 'font', None)
|
||||||
|
htmlstr += self.processtag(taginfo)
|
||||||
|
# check if we were nested
|
||||||
|
if len(self.font_history) > 0:
|
||||||
|
# inject a copy of the most recent font start tag from history
|
||||||
|
taginfo = self.font_history[-1]
|
||||||
|
htmlstr += self.processtag(taginfo)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# keep track of nesting path
|
||||||
|
if ttype == 'begin':
|
||||||
|
self.path.append(tname)
|
||||||
|
elif ttype == 'end':
|
||||||
|
if tname != self.path[-1]:
|
||||||
|
print ('improper nesting: ', self.path, tname, ttype)
|
||||||
|
if tname not in self.path:
|
||||||
|
# handle case of end tag with no beginning by injecting empty begin tag
|
||||||
|
taginfo = ('begin', tname, None)
|
||||||
|
htmlstr += self.processtag(taginfo)
|
||||||
|
print " - fixed by injecting empty start tag ", tname
|
||||||
|
self.path.append(tname)
|
||||||
|
elif len(self.path) > 1 and tname == self.path[-2]:
|
||||||
|
# handle case of dangling missing end
|
||||||
|
taginfo = ('end', self.path[-1], None)
|
||||||
|
htmlstr += self.processtag(taginfo)
|
||||||
|
print " - fixed by injecting end tag ", self.path[-1]
|
||||||
|
self.path.pop()
|
||||||
|
self.path.pop()
|
||||||
|
|
||||||
|
if tname == 'removeme:{0}'.format(tname):
|
||||||
|
if ttype in ('begin', 'single', 'single_ext'):
|
||||||
|
skip = True
|
||||||
|
else:
|
||||||
|
skip = False
|
||||||
|
else:
|
||||||
|
taginfo = (ttype, tname, tattr)
|
||||||
|
htmlstr += self.processtag(taginfo)
|
||||||
|
|
||||||
|
# handle potential issue of multiple html, head, and body sections
|
||||||
|
if tname == 'html' and ttype == 'begin' and not html_done:
|
||||||
|
htmlstr += '\n'
|
||||||
|
html_done = True
|
||||||
|
|
||||||
|
if tname == 'head' and ttype == 'begin' and not head_done:
|
||||||
|
htmlstr += '\n'
|
||||||
|
# also add in metadata and style link tags
|
||||||
|
htmlstr += self.meta
|
||||||
|
htmlstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
|
||||||
|
head_done = True
|
||||||
|
|
||||||
|
if tname == 'body' and ttype == 'begin' and not body_done:
|
||||||
|
htmlstr += '\n'
|
||||||
|
body_done = True
|
||||||
|
|
||||||
|
# handle issue of possibly missing html, head, and body tags
|
||||||
|
# I have not seen this but the original did something like this so ...
|
||||||
|
if not body_done:
|
||||||
|
htmlstr = '<body>\n' + htmlstr + '</body>\n'
|
||||||
|
if not head_done:
|
||||||
|
headstr = '<head>\n'
|
||||||
|
headstr += self.meta
|
||||||
|
headstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
|
||||||
|
headstr += '</head>\n'
|
||||||
|
htmlstr = headstr + htmlstr
|
||||||
|
if not html_done:
|
||||||
|
htmlstr = '<html>\n' + htmlstr + '</html>\n'
|
||||||
|
|
||||||
|
# finally add DOCTYPE info
|
||||||
|
htmlstr = '<?xml version="1.0"?>\n<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n' + htmlstr
|
||||||
|
|
||||||
|
css = self.base_css_rules
|
||||||
|
for cls, rule in self.tag_css_rules.items():
|
||||||
|
css += '.%s { %s }\n' % (cls, rule)
|
||||||
|
|
||||||
|
return (htmlstr, css, self.cssname)
|
||||||
|
|
||||||
|
def ensure_unit(self, raw, unit='px'):
|
||||||
|
if re.search(r'\d+$', raw) is not None:
|
||||||
|
raw += unit
|
||||||
|
return raw
|
||||||
|
|
||||||
|
# flatten possibly modified tag back to string
|
||||||
|
def taginfo_tostring(self, taginfo):
|
||||||
|
(ttype, tname, tattr) = taginfo
|
||||||
|
if ttype is None or tname is None:
|
||||||
|
return ''
|
||||||
|
if ttype == 'end':
|
||||||
|
return '</%s>' % tname
|
||||||
|
if ttype in SPECIAL_HANDLING_TYPES and tattr is not None and 'special' in tattr.keys():
|
||||||
|
info = tattr['special']
|
||||||
|
if ttype == 'comment':
    return '<%s %s-->' % (tname, info)
else:
    return '<%s %s>' % (tname, info)
|
||||||
|
res = []
|
||||||
|
res.append('<%s' % tname)
|
||||||
|
if tattr is not None:
|
||||||
|
for key in tattr.keys():
|
||||||
|
res.append(' %s="%s"' % (key, tattr[key]))
|
||||||
|
if ttype == 'single':
|
||||||
|
res.append('/>')
|
||||||
|
elif ttype == 'single_ext':
|
||||||
|
res.append(' />')
|
||||||
|
else :
|
||||||
|
res.append('>')
|
||||||
|
return "".join(res)
|
||||||
|
|
||||||
|
# routines to convert mobi ml tag attributes to xhtml attributes and styles
|
||||||
|
def processtag(self, taginfo):
|
||||||
|
# Converting mobi font sizes to numerics
|
||||||
|
size_map = {
|
||||||
|
'xx-small': '1',
|
||||||
|
'x-small': '2',
|
||||||
|
'small': '3',
|
||||||
|
'medium': '4',
|
||||||
|
'large': '5',
|
||||||
|
'x-large': '6',
|
||||||
|
'xx-large': '7',
|
||||||
|
}
|
||||||
|
|
||||||
|
size_to_em_map = {
|
||||||
|
'1': '.65em',
|
||||||
|
'2': '.75em',
|
||||||
|
'3': '1em',
|
||||||
|
'4': '1.125em',
|
||||||
|
'5': '1.25em',
|
||||||
|
'6': '1.5em',
|
||||||
|
'7': '2em',
|
||||||
|
}
|
||||||
|
|
||||||
|
# current tag to work on
|
||||||
|
(ttype, tname, tattr) = taginfo
|
||||||
|
if not tattr:
|
||||||
|
tattr = {}
|
||||||
|
|
||||||
|
styles = []
|
||||||
|
|
||||||
|
if tname is None or tname.startswith('removeme'):
|
||||||
|
return ''
|
||||||
|
|
||||||
|
# have not seen an example of this yet so keep it here to be safe
|
||||||
|
# until this is better understood
|
||||||
|
if tname in ('country-region', 'place', 'placetype', 'placename',
|
||||||
|
'state', 'city', 'street', 'address', 'content'):
|
||||||
|
tname = 'div' if tname == 'content' else 'span'
|
||||||
|
for key in list(tattr.keys()):
    tattr.pop(key)
|
||||||
|
|
||||||
|
# handle general case of style, height, width, bgcolor in any tag
|
||||||
|
if 'style' in tattr.keys():
|
||||||
|
style = tattr.pop('style').strip()
|
||||||
|
if style:
|
||||||
|
styles.append(style)
|
||||||
|
|
||||||
|
if 'align' in tattr.keys():
|
||||||
|
align = tattr.pop('align').strip()
|
||||||
|
if align:
|
||||||
|
if tname in ('table', 'td', 'tr'):
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
styles.append('text-align: %s' % align)
|
||||||
|
|
||||||
|
if 'height' in tattr.keys():
|
||||||
|
height = tattr.pop('height').strip()
|
||||||
|
if height and '<' not in height and '>' not in height and re.search(r'\d+', height):
|
||||||
|
if tname in ('table', 'td', 'tr'):
|
||||||
|
pass
|
||||||
|
elif tname == 'img':
|
||||||
|
tattr['height'] = height
|
||||||
|
else:
|
||||||
|
styles.append('margin-top: %s' % self.ensure_unit(height))
|
||||||
|
|
||||||
|
if 'width' in tattr.keys():
|
||||||
|
width = tattr.pop('width').strip()
|
||||||
|
if width and re.search(r'\d+', width):
|
||||||
|
if tname in ('table', 'td', 'tr'):
|
||||||
|
pass
|
||||||
|
elif tname == 'img':
|
||||||
|
tattr['width'] = width
|
||||||
|
else:
|
||||||
|
styles.append('text-indent: %s' % self.ensure_unit(width))
|
||||||
|
if width.startswith('-'):
|
||||||
|
styles.append('margin-left: %s' % self.ensure_unit(width[1:]))
|
||||||
|
|
||||||
|
if 'bgcolor' in tattr.keys():
|
||||||
|
# no proprietary html allowed
|
||||||
|
if tname == 'div':
|
||||||
|
del tattr['bgcolor']
|
||||||
|
|
||||||
|
elif tname == 'font':
|
||||||
|
# Change font tags to span tags
|
||||||
|
tname = 'span'
|
||||||
|
if ttype in ('begin', 'single', 'single_ext'):
|
||||||
|
# move the face attribute to css font-family
|
||||||
|
if 'face' in tattr.keys():
|
||||||
|
face = tattr.pop('face').strip()
|
||||||
|
styles.append('font-family: "%s"' % face)
|
||||||
|
|
||||||
|
# Monitor the constantly changing font sizes, change them to ems and move
|
||||||
|
# them to css. The following will work for 'flat' font tags, but nested font tags
|
||||||
|
# will cause things to go wonky. Need to revert to the parent font tag's size
|
||||||
|
# when a closing tag is encountered.
|
||||||
|
if 'size' in tattr.keys():
|
||||||
|
sz = tattr.pop('size').strip().lower()
|
||||||
|
try:
|
||||||
|
float(sz)
|
||||||
|
except ValueError:
|
||||||
|
if sz in size_map.keys():
|
||||||
|
sz = size_map[sz]
|
||||||
|
else:
|
||||||
|
if sz.startswith('-') or sz.startswith('+'):
|
||||||
|
sz = self.current_font_size + float(sz)
|
||||||
|
if sz > 7:
|
||||||
|
sz = 7
|
||||||
|
elif sz < 1:
|
||||||
|
sz = 1
|
||||||
|
sz = str(int(sz))
|
||||||
|
styles.append('font-size: %s' % size_to_em_map[sz])
|
||||||
|
self.current_font_size = int(sz)
|
||||||
|
|
||||||
|
elif tname == 'img':
|
||||||
|
for attr in ('width', 'height'):
|
||||||
|
if attr in tattr:
|
||||||
|
val = tattr[attr]
|
||||||
|
if val.lower().endswith('em'):
|
||||||
|
try:
|
||||||
|
nval = float(val[:-2])
|
||||||
|
nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile
|
||||||
|
tattr[attr] = "%dpx"%int(nval)
|
||||||
|
except:
|
||||||
|
del tattr[attr]
|
||||||
|
elif val.lower().endswith('%'):
|
||||||
|
del tattr[attr]
|
||||||
|
|
||||||
|
# convert the anchor tags
|
||||||
|
if 'filepos-id' in tattr:
|
||||||
|
tattr['id'] = tattr.pop('filepos-id')
|
||||||
|
if 'name' in tattr and tattr['name'] != tattr['id']:
|
||||||
|
tattr['name'] = tattr['id']
|
||||||
|
|
||||||
|
if 'filepos' in tattr:
|
||||||
|
filepos = tattr.pop('filepos')
|
||||||
|
try:
|
||||||
|
tattr['href'] = "#filepos%d" % int(filepos)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if styles:
|
||||||
|
ncls = None
|
||||||
|
rule = '; '.join(styles)
|
||||||
|
for sel, srule in self.tag_css_rules.items():
|
||||||
|
if srule == rule:
|
||||||
|
ncls = sel
|
||||||
|
break
|
||||||
|
if ncls is None:
|
||||||
|
self.tag_css_rule_cnt += 1
|
||||||
|
ncls = 'rule_%d' % self.tag_css_rule_cnt
|
||||||
|
self.tag_css_rules[ncls] = rule
|
||||||
|
cls = tattr.get('class', '')
|
||||||
|
cls = cls + (' ' if cls else '') + ncls
|
||||||
|
tattr['class'] = cls
|
||||||
|
|
||||||
|
# convert updated tag back to string representation
|
||||||
|
if len(tattr) == 0:
|
||||||
|
tattr = None
|
||||||
|
taginfo = (ttype, tname, tattr)
|
||||||
|
return self.taginfo_tostring(taginfo)
|
||||||
|
|
||||||
|
''' main only left in for testing outside of plugin '''
|
||||||
|
|
||||||
|
def main(argv=sys.argv):
|
||||||
|
if len(argv) != 2:
|
||||||
|
return 1
|
||||||
|
else:
|
||||||
|
infile = argv[1]
|
||||||
|
|
||||||
|
try:
    print('Converting Mobi Markup Language to XHTML')
    mlc = MobiMLConverter(infile)
    print('Processing ...')
    htmlstr, css, cssname = mlc.processml()
    outname = infile.rsplit('.',1)[0] + '_converted.html'
    open(outname, 'wb').write(htmlstr)
    open(cssname, 'wb').write(css)
    print('Completed')
    print('XHTML version of book can be found at: ' + outname)

except ValueError as e:
    print("Error: %s" % e)
    return 1
|
||||||
|
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
93
KindleUnpack/unipath.py
Executable file
@@ -0,0 +1,93 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
# are permitted provided that the following conditions are met:
|
||||||
|
#
|
||||||
|
# 1. Redistributions of source code must retain the above copyright notice, this list of
|
||||||
|
# conditions and the following disclaimer.
|
||||||
|
#
|
||||||
|
# 2. Redistributions in binary form must reproduce the above copyright notice, this list
|
||||||
|
# of conditions and the following disclaimer in the documentation and/or other materials
|
||||||
|
# provided with the distribution.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
|
||||||
|
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
|
||||||
|
# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||||
|
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||||
|
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||||
|
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
|
||||||
|
# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
from .compatibility_utils import PY2, text_type, binary_type
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
# utility routines to convert all paths to be full unicode
|
||||||
|
|
||||||
|
# Under Python 2, if a bytestring, try to convert it to unicode using sys.getfilesystemencoding
|
||||||
|
# Under Python 3, if bytes, try to convert it to unicode by decoding it with the filesystem encoding
|
||||||
|
|
||||||
|
# Mac OS X and Windows will happily support full unicode paths
|
||||||
|
# Linux can support full unicode paths but allows arbitrary byte paths which may be inconsistent with unicode
|
||||||
|
|
||||||
|
fsencoding = sys.getfilesystemencoding()
|
||||||
|
|
||||||
|
def pathof(s, enc=fsencoding):
|
||||||
|
if s is None:
|
||||||
|
return None
|
||||||
|
if isinstance(s, text_type):
|
||||||
|
return s
|
||||||
|
if isinstance(s, binary_type):
|
||||||
|
try:
|
||||||
|
return s.decode(enc)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return s
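For example (assuming a filesystem encoding that can represent the bytes):

print(pathof(b'KindleUnpack/book.mobi'))  # 'KindleUnpack/book.mobi' as unicode text
print(pathof(u'already_unicode.epub'))    # text input is returned unchanged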
|
||||||
|
|
||||||
|
def exists(s):
|
||||||
|
return os.path.exists(pathof(s))
|
||||||
|
|
||||||
|
def isfile(s):
|
||||||
|
return os.path.isfile(pathof(s))
|
||||||
|
|
||||||
|
def isdir(s):
|
||||||
|
return os.path.isdir(pathof(s))
|
||||||
|
|
||||||
|
def mkdir(s):
|
||||||
|
return os.mkdir(pathof(s))
|
||||||
|
|
||||||
|
def listdir(s):
|
||||||
|
rv = []
|
||||||
|
for file in os.listdir(pathof(s)):
|
||||||
|
rv.append(pathof(file))
|
||||||
|
return rv
|
||||||
|
|
||||||
|
def getcwd():
|
||||||
|
if PY2:
|
||||||
|
return os.getcwdu()
|
||||||
|
return os.getcwd()
|
||||||
|
|
||||||
|
def walk(top):
|
||||||
|
top = pathof(top)
|
||||||
|
rv = []
|
||||||
|
for base, dnames, names in os.walk(top):
|
||||||
|
base = pathof(base)
|
||||||
|
for name in names:
|
||||||
|
name = pathof(name)
|
||||||
|
rv.append(relpath(os.path.join(base, name), top))
|
||||||
|
return rv
|
||||||
|
|
||||||
|
def relpath(path, start=None):
|
||||||
|
return os.path.relpath(pathof(path) , pathof(start))
|
||||||
|
|
||||||
|
def abspath(path):
|
||||||
|
return os.path.abspath(pathof(path))
|
167
KindleUnpack/unpack_structure.py
Normal file
@@ -0,0 +1,167 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||||
|
|
||||||
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
|
|
||||||
|
from .compatibility_utils import text_type
|
||||||
|
|
||||||
|
from . import unipath
|
||||||
|
from .unipath import pathof
|
||||||
|
|
||||||
|
DUMP = False
|
||||||
|
""" Set to True to dump all possible information. """
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
import re
|
||||||
|
# note: re requires the pattern to be the exact same type as the data to be searched in python3
|
||||||
|
# but u"" is not allowed for the pattern itself only b""
|
||||||
|
|
||||||
|
import zipfile
|
||||||
|
import binascii
|
||||||
|
from .mobi_utils import mangle_fonts
|
||||||
|
|
||||||
|
class unpackException(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class ZipInfo(zipfile.ZipInfo):
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
if 'compress_type' in kwargs:
|
||||||
|
compress_type = kwargs.pop('compress_type')
|
||||||
|
super(ZipInfo, self).__init__(*args, **kwargs)
|
||||||
|
self.compress_type = compress_type
|
||||||
|
|
||||||
|
class fileNames:
|
||||||
|
|
||||||
|
def __init__(self, infile, outdir):
|
||||||
|
self.infile = infile
|
||||||
|
self.outdir = outdir
|
||||||
|
if not unipath.exists(self.outdir):
|
||||||
|
unipath.mkdir(self.outdir)
|
||||||
|
self.mobi7dir = os.path.join(self.outdir,'mobi7')
|
||||||
|
if not unipath.exists(self.mobi7dir):
|
||||||
|
unipath.mkdir(self.mobi7dir)
|
||||||
|
self.imgdir = os.path.join(self.mobi7dir, 'Images')
|
||||||
|
if not unipath.exists(self.imgdir):
|
||||||
|
unipath.mkdir(self.imgdir)
|
||||||
|
self.hdimgdir = os.path.join(self.outdir,'HDImages')
|
||||||
|
if not unipath.exists(self.hdimgdir):
|
||||||
|
unipath.mkdir(self.hdimgdir)
|
||||||
|
self.outbase = os.path.join(self.outdir, os.path.splitext(os.path.split(infile)[1])[0])
|
||||||
|
|
||||||
|
def getInputFileBasename(self):
|
||||||
|
return os.path.splitext(os.path.basename(self.infile))[0]
|
||||||
|
|
||||||
|
def makeK8Struct(self):
|
||||||
|
self.k8dir = os.path.join(self.outdir,'mobi8')
|
||||||
|
if not unipath.exists(self.k8dir):
|
||||||
|
unipath.mkdir(self.k8dir)
|
||||||
|
self.k8metainf = os.path.join(self.k8dir,'META-INF')
|
||||||
|
if not unipath.exists(self.k8metainf):
|
||||||
|
unipath.mkdir(self.k8metainf)
|
||||||
|
self.k8oebps = os.path.join(self.k8dir,'OEBPS')
|
||||||
|
if not unipath.exists(self.k8oebps):
|
||||||
|
unipath.mkdir(self.k8oebps)
|
||||||
|
self.k8images = os.path.join(self.k8oebps,'Images')
|
||||||
|
if not unipath.exists(self.k8images):
|
||||||
|
unipath.mkdir(self.k8images)
|
||||||
|
self.k8fonts = os.path.join(self.k8oebps,'Fonts')
|
||||||
|
if not unipath.exists(self.k8fonts):
|
||||||
|
unipath.mkdir(self.k8fonts)
|
||||||
|
self.k8styles = os.path.join(self.k8oebps,'Styles')
|
||||||
|
if not unipath.exists(self.k8styles):
|
||||||
|
unipath.mkdir(self.k8styles)
|
||||||
|
self.k8text = os.path.join(self.k8oebps,'Text')
|
||||||
|
if not unipath.exists(self.k8text):
|
||||||
|
unipath.mkdir(self.k8text)
|
||||||
|
|
||||||
|
# recursive zip creation support routine
|
||||||
|
def zipUpDir(self, myzip, tdir, localname):
|
||||||
|
currentdir = tdir
|
||||||
|
if localname != "":
|
||||||
|
currentdir = os.path.join(currentdir,localname)
|
||||||
|
list = unipath.listdir(currentdir)
|
||||||
|
for file in list:
|
||||||
|
afilename = file
|
||||||
|
localfilePath = os.path.join(localname, afilename)
|
||||||
|
realfilePath = os.path.join(currentdir,file)
|
||||||
|
if unipath.isfile(realfilePath):
|
||||||
|
myzip.write(pathof(realfilePath), pathof(localfilePath), zipfile.ZIP_DEFLATED)
|
||||||
|
elif unipath.isdir(realfilePath):
|
||||||
|
self.zipUpDir(myzip, tdir, localfilePath)
|
||||||
|
|
||||||
|
def makeEPUB(self, usedmap, obfuscate_data, uid):
|
||||||
|
bname = os.path.join(self.k8dir, self.getInputFileBasename() + '.epub')
|
||||||
|
# Create an encryption key for Adobe font obfuscation
|
||||||
|
# based on the epub's uid
|
||||||
|
if isinstance(uid,text_type):
|
||||||
|
uid = uid.encode('ascii')
|
||||||
|
if obfuscate_data:
|
||||||
|
key = re.sub(br'[^a-fA-F0-9]', b'', uid)
|
||||||
|
key = binascii.unhexlify((key + key)[:32])
|
||||||
|
|
||||||
|
# copy over all images and fonts that are actually used in the ebook
|
||||||
|
# and remove all font files from mobi7 since not supported
|
||||||
|
imgnames = unipath.listdir(self.imgdir)
|
||||||
|
for name in imgnames:
|
||||||
|
if usedmap.get(name,'not used') == 'used':
|
||||||
|
filein = os.path.join(self.imgdir,name)
|
||||||
|
if name.endswith(".ttf"):
|
||||||
|
fileout = os.path.join(self.k8fonts,name)
|
||||||
|
elif name.endswith(".otf"):
|
||||||
|
fileout = os.path.join(self.k8fonts,name)
|
||||||
|
elif name.endswith(".failed"):
|
||||||
|
fileout = os.path.join(self.k8fonts,name)
|
||||||
|
else:
|
||||||
|
fileout = os.path.join(self.k8images,name)
|
||||||
|
data = b''
|
||||||
|
with open(pathof(filein),'rb') as f:
|
||||||
|
data = f.read()
|
||||||
|
if obfuscate_data:
|
||||||
|
if name in obfuscate_data:
|
||||||
|
data = mangle_fonts(key, data)
|
||||||
|
open(pathof(fileout),'wb').write(data)
|
||||||
|
if name.endswith(".ttf") or name.endswith(".otf"):
|
||||||
|
os.remove(pathof(filein))
|
||||||
|
|
||||||
|
# opf file name hard coded to "content.opf"
|
||||||
|
container = '<?xml version="1.0" encoding="UTF-8"?>\n'
|
||||||
|
container += '<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n'
|
||||||
|
container += ' <rootfiles>\n'
|
||||||
|
container += '<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>'
|
||||||
|
container += ' </rootfiles>\n</container>\n'
|
||||||
|
fileout = os.path.join(self.k8metainf,'container.xml')
|
||||||
|
with open(pathof(fileout),'wb') as f:
|
||||||
|
f.write(container.encode('utf-8'))
|
||||||
|
|
||||||
|
if obfuscate_data:
|
||||||
|
encryption = '<encryption xmlns="urn:oasis:names:tc:opendocument:xmlns:container" \
|
||||||
|
xmlns:enc="http://www.w3.org/2001/04/xmlenc#" xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">\n'
|
||||||
|
for font in obfuscate_data:
|
||||||
|
encryption += ' <enc:EncryptedData>\n'
|
||||||
|
encryption += ' <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>\n'
|
||||||
|
encryption += ' <enc:CipherData>\n'
|
||||||
|
encryption += ' <enc:CipherReference URI="OEBPS/Fonts/' + font + '"/>\n'
|
||||||
|
encryption += ' </enc:CipherData>\n'
|
||||||
|
encryption += ' </enc:EncryptedData>\n'
|
||||||
|
encryption += '</encryption>\n'
|
||||||
|
fileout = os.path.join(self.k8metainf,'encryption.xml')
|
||||||
|
with open(pathof(fileout),'wb') as f:
|
||||||
|
f.write(encryption.encode('utf-8'))
|
||||||
|
|
||||||
|
# ready to build epub
|
||||||
|
self.outzip = zipfile.ZipFile(pathof(bname), 'w')
|
||||||
|
|
||||||
|
# add the mimetype file uncompressed
|
||||||
|
mimetype = b'application/epub+zip'
|
||||||
|
fileout = os.path.join(self.k8dir,'mimetype')
|
||||||
|
with open(pathof(fileout),'wb') as f:
|
||||||
|
f.write(mimetype)
|
||||||
|
nzinfo = ZipInfo('mimetype', compress_type=zipfile.ZIP_STORED)
|
||||||
|
nzinfo.external_attr = 0o600 << 16 # make this a normal file
|
||||||
|
self.outzip.writestr(nzinfo, mimetype)
|
||||||
|
self.zipUpDir(self.outzip,self.k8dir,'META-INF')
|
||||||
|
self.zipUpDir(self.outzip,self.k8dir,'OEBPS')
|
||||||
|
self.outzip.close()
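A minimal driver sketch for this class: it creates the mobi8 directory skeleton and then zips a bare-bones epub under out/mobi8/. The paths, the used-image map, the empty obfuscation list, and the uid string are placeholders; in real use they come from the rest of the unpacker:

files = fileNames('book.mobi', 'out')
files.makeK8Struct()
# ... the unpacker would now write images, fonts, text and OEBPS/content.opf
#     under files.k8oebps before zipping ...
usedmap = {'image00001.jpeg': 'used'}
files.makeEPUB(usedmap, [], '0123456789abcdef0123456789abcdef')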
|