Lector/lector/KindleUnpack/mobi_k8resc.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function

DEBUG_USE_ORDERED_DICTIONARY = False  # OrderedDict is supoorted >= python 2.7.
""" set to True to use OrderedDict for K8RESCProcessor.parsetag.tattr."""

if DEBUG_USE_ORDERED_DICTIONARY:
    from collections import OrderedDict as dict_
else:
    dict_ = dict

from .compatibility_utils import unicode_str

from .mobi_utils import fromBase32

_OPF_PARENT_TAGS = ['xml', 'package', 'metadata', 'dc-metadata',
                    'x-metadata', 'manifest', 'spine', 'tours', 'guide']

class K8RESCProcessor(object):

    def __init__(self, data, debug=False):
        self._debug = debug
        self.resc = None
        self.opos = 0
        self.extrameta = []
        self.cover_name = None
        self.spine_idrefs = {}
        self.spine_order = []
        self.spine_pageattributes = {}
        self.spine_ppd = None
        # need3 indicate the book has fields which require epub3.
        # but the estimation of the source epub version from the fields is difficult.
        self.need3 = False
        self.package_ver = None
        self.extra_metadata = []
        self.refines_metadata = []
        self.extra_attributes = []
        # get header
        start_pos = data.find(b'<')
        self.resc_header = data[:start_pos]
        # get resc data length
        start = self.resc_header.find(b'=') + 1
        end = self.resc_header.find(b'&', start)
        resc_size = 0
        if end > 0:
            resc_size = fromBase32(self.resc_header[start:end])
        resc_rawbytes = len(data) - start_pos
        if resc_rawbytes == resc_size:
            self.resc_length = resc_size
        else:
            # Most RESC has a nul string at its tail but some do not.
            end_pos = data.find(b'\x00', start_pos)
            if end_pos < 0:
                self.resc_length = resc_rawbytes
            else:
                self.resc_length = end_pos - start_pos
        if self.resc_length != resc_size:
            print("Warning: RESC section length({:d}bytes) does not match its size({:d}bytes).".format(self.resc_length, resc_size))
        # now parse RESC after converting it to unicode from utf-8
        self.resc = unicode_str(data[start_pos:start_pos+self.resc_length])
        self.parseData()

    def prepend_to_spine(self, key, idref, linear, properties):
        self.spine_order = [key] + self.spine_order
        self.spine_idrefs[key] = idref
        attributes = {}
        if linear is not None:
            attributes['linear'] = linear
        if properties is not None:
            attributes['properties'] = properties
        self.spine_pageattributes[key] = attributes

    # RESC tag iterator
    def resc_tag_iter(self):
        tcontent = last_tattr = None
        prefix = ['']
        while True:
            text, tag = self.parseresc()
            if text is None and tag is None:
                break
            if text is not None:
                tcontent = text.rstrip(' \r\n')
            else:  # we have a tag
                ttype, tname, tattr = self.parsetag(tag)
                if ttype == 'begin':
                    tcontent = None
                    prefix.append(tname + '.')
                    if tname in _OPF_PARENT_TAGS:
                        yield ''.join(prefix), tname, tattr, tcontent
                    else:
                        last_tattr = tattr
                else:  # single or end
                    if ttype == 'end':
                        prefix.pop()
                        tattr = last_tattr
                        last_tattr = None
                        if tname in _OPF_PARENT_TAGS:
                            tname += '-end'
                    yield ''.join(prefix), tname, tattr, tcontent
                    tcontent = None

    # now parse the RESC to extract spine and extra metadata info
    def parseData(self):
        for prefix, tname, tattr, tcontent in self.resc_tag_iter():
            if self._debug:
                print("   Parsing RESC: ", prefix, tname, tattr, tcontent)
            if tname == 'package':
                self.package_ver = tattr.get('version', '2.0')
                package_prefix = tattr.get('prefix','')
                if self.package_ver.startswith('3') or package_prefix.startswith('rendition'):
                    self.need3 = True
            if tname == 'spine':
                self.spine_ppd = tattr.get('page-progession-direction', None)
                if self.spine_ppd is not None and self.spine_ppd == 'rtl':
                    self.need3 = True
            if tname == 'itemref':
                skelid = tattr.pop('skelid', None)
                if skelid is None and len(self.spine_order) == 0:
                    # assume it was removed initial coverpage
                    skelid = 'coverpage'
                    tattr['linear'] = 'no'
                self.spine_order.append(skelid)
                idref = tattr.pop('idref', None)
                if idref is not None:
                    idref = 'x_' + idref
                self.spine_idrefs[skelid] = idref
                if 'id' in tattr:
                    del tattr['id']
                # tattr["id"] = 'x_' + tattr["id"]
                if 'properties' in tattr:
                    self.need3 = True
                self.spine_pageattributes[skelid] = tattr
            if tname == 'meta' or tname.startswith('dc:'):
                if 'refines' in tattr or 'property' in tattr:
                    self.need3 = True
                if tattr.get('name','') == 'cover':
                    cover_name = tattr.get('content',None)
                    if cover_name is not None:
                        cover_name = 'x_' + cover_name
                    self.cover_name = cover_name
                else:
                    self.extrameta.append([tname, tattr, tcontent])

    # parse and return either leading text or the next tag
    def parseresc(self):
        p = self.opos
        if p >= len(self.resc):
            return None, None
        if self.resc[p] != '<':
            res = self.resc.find('<',p)
            if res == -1 :
                res = len(self.resc)
            self.opos = res
            return self.resc[p:res], None
        # handle comment as a special case
        if self.resc[p:p+4] == '<!--':
            te = self.resc.find('-->',p+1)
            if te != -1:
                te = te+2
        else:
            te = self.resc.find('>',p+1)
            ntb = self.resc.find('<',p+1)
            if ntb != -1 and ntb < te:
                self.opos = ntb
                return self.resc[p:ntb], None
        self.opos = te + 1
        return None, self.resc[p:te+1]

    # parses tag to identify:  [tname, ttype, tattr]
    #    tname: tag name
    #    ttype: tag type ('begin', 'end' or 'single');
    #    tattr: dictionary of tag atributes
    def parsetag(self, s):
        p = 1
        tname = None
        ttype = None
        tattr = dict_()
        while s[p:p+1] == ' ' :
            p += 1
        if s[p:p+1] == '/':
            ttype = 'end'
            p += 1
            while s[p:p+1] == ' ' :
                p += 1
        b = p
        while s[p:p+1] not in ('>', '/', ' ', '"', "'",'\r','\n') :
            p += 1
        tname=s[b:p].lower()
        # some special cases
        if tname == '?xml':
            tname = 'xml'
        if tname == '!--':
            ttype = 'single'
            comment = s[p:-3].strip()
            tattr['comment'] = comment
        if ttype is None:
            # parse any attributes of begin or single tags
            while s.find('=',p) != -1 :
                while s[p:p+1] == ' ' :
                    p += 1
                b = p
                while s[p:p+1] != '=' :
                    p += 1
                aname = s[b:p].lower()
                aname = aname.rstrip(' ')
                p += 1
                while s[p:p+1] == ' ' :
                    p += 1
                if s[p:p+1] in ('"', "'") :
                    p = p + 1
                    b = p
                    while s[p:p+1] not in ('"', "'"):
                        p += 1
                    val = s[b:p]
                    p += 1
                else :
                    b = p
                    while s[p:p+1] not in ('>', '/', ' ') :
                        p += 1
                    val = s[b:p]
                tattr[aname] = val
        if ttype is None:
            ttype = 'begin'
            if s.find('/',p) >= 0:
                ttype = 'single'
        return ttype, tname, tattr

    def taginfo_toxml(self, taginfo):
        res = []
        tname, tattr, tcontent = taginfo
        res.append('<' + tname)
        if tattr is not None:
            for key in tattr:
                res.append(' ' + key + '="'+tattr[key]+'"')
        if tcontent is not None:
            res.append('>' + tcontent + '</' + tname + '>\n')
        else:
            res.append('/>\n')
        return "".join(res)

    def hasSpine(self):
        return len(self.spine_order) > 0

    def needEPUB3(self):
        return self.need3

    def hasRefines(self):
        for [tname, tattr, tcontent] in self.extrameta:
            if 'refines' in tattr:
                return True
        return False

    def createMetadata(self, epubver):
        for taginfo in self.extrameta:
            tname, tattr, tcontent = taginfo
            if 'refines' in tattr:
                if epubver == 'F' and 'property' in tattr:
                    attr = ' id="%s" opf:%s="%s"\n' % (tattr['refines'], tattr['property'], tcontent)
                    self.extra_attributes.append(attr)
                else:
                    tag = self.taginfo_toxml(taginfo)
                    self.refines_metadata.append(tag)
            else:
                tag = self.taginfo_toxml(taginfo)
                self.extra_metadata.append(tag)