269 lines
9.8 KiB
Python
269 lines
9.8 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
|
|
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
|
|
|
DEBUG_USE_ORDERED_DICTIONARY = False # OrderedDict is supoorted >= python 2.7.
|
|
""" set to True to use OrderedDict for K8RESCProcessor.parsetag.tattr."""
|
|
|
|
if DEBUG_USE_ORDERED_DICTIONARY:
|
|
from collections import OrderedDict as dict_
|
|
else:
|
|
dict_ = dict
|
|
|
|
from .compatibility_utils import unicode_str
|
|
|
|
from .mobi_utils import fromBase32
|
|
|
|
_OPF_PARENT_TAGS = ['xml', 'package', 'metadata', 'dc-metadata',
|
|
'x-metadata', 'manifest', 'spine', 'tours', 'guide']
|
|
|
|
class K8RESCProcessor(object):
|
|
|
|
def __init__(self, data, debug=False):
|
|
self._debug = debug
|
|
self.resc = None
|
|
self.opos = 0
|
|
self.extrameta = []
|
|
self.cover_name = None
|
|
self.spine_idrefs = {}
|
|
self.spine_order = []
|
|
self.spine_pageattributes = {}
|
|
self.spine_ppd = None
|
|
# need3 indicate the book has fields which require epub3.
|
|
# but the estimation of the source epub version from the fields is difficult.
|
|
self.need3 = False
|
|
self.package_ver = None
|
|
self.extra_metadata = []
|
|
self.refines_metadata = []
|
|
self.extra_attributes = []
|
|
# get header
|
|
start_pos = data.find(b'<')
|
|
self.resc_header = data[:start_pos]
|
|
# get resc data length
|
|
start = self.resc_header.find(b'=') + 1
|
|
end = self.resc_header.find(b'&', start)
|
|
resc_size = 0
|
|
if end > 0:
|
|
resc_size = fromBase32(self.resc_header[start:end])
|
|
resc_rawbytes = len(data) - start_pos
|
|
if resc_rawbytes == resc_size:
|
|
self.resc_length = resc_size
|
|
else:
|
|
# Most RESC has a nul string at its tail but some do not.
|
|
end_pos = data.find(b'\x00', start_pos)
|
|
if end_pos < 0:
|
|
self.resc_length = resc_rawbytes
|
|
else:
|
|
self.resc_length = end_pos - start_pos
|
|
if self.resc_length != resc_size:
|
|
print("Warning: RESC section length({:d}bytes) does not match its size({:d}bytes).".format(self.resc_length, resc_size))
|
|
# now parse RESC after converting it to unicode from utf-8
|
|
self.resc = unicode_str(data[start_pos:start_pos+self.resc_length])
|
|
self.parseData()
|
|
|
|
def prepend_to_spine(self, key, idref, linear, properties):
|
|
self.spine_order = [key] + self.spine_order
|
|
self.spine_idrefs[key] = idref
|
|
attributes = {}
|
|
if linear is not None:
|
|
attributes['linear'] = linear
|
|
if properties is not None:
|
|
attributes['properties'] = properties
|
|
self.spine_pageattributes[key] = attributes
|
|
|
|
# RESC tag iterator
|
|
def resc_tag_iter(self):
|
|
tcontent = last_tattr = None
|
|
prefix = ['']
|
|
while True:
|
|
text, tag = self.parseresc()
|
|
if text is None and tag is None:
|
|
break
|
|
if text is not None:
|
|
tcontent = text.rstrip(' \r\n')
|
|
else: # we have a tag
|
|
ttype, tname, tattr = self.parsetag(tag)
|
|
if ttype == 'begin':
|
|
tcontent = None
|
|
prefix.append(tname + '.')
|
|
if tname in _OPF_PARENT_TAGS:
|
|
yield ''.join(prefix), tname, tattr, tcontent
|
|
else:
|
|
last_tattr = tattr
|
|
else: # single or end
|
|
if ttype == 'end':
|
|
prefix.pop()
|
|
tattr = last_tattr
|
|
last_tattr = None
|
|
if tname in _OPF_PARENT_TAGS:
|
|
tname += '-end'
|
|
yield ''.join(prefix), tname, tattr, tcontent
|
|
tcontent = None
|
|
|
|
# now parse the RESC to extract spine and extra metadata info
|
|
def parseData(self):
|
|
for prefix, tname, tattr, tcontent in self.resc_tag_iter():
|
|
if self._debug:
|
|
print(" Parsing RESC: ", prefix, tname, tattr, tcontent)
|
|
if tname == 'package':
|
|
self.package_ver = tattr.get('version', '2.0')
|
|
package_prefix = tattr.get('prefix','')
|
|
if self.package_ver.startswith('3') or package_prefix.startswith('rendition'):
|
|
self.need3 = True
|
|
if tname == 'spine':
|
|
self.spine_ppd = tattr.get('page-progession-direction', None)
|
|
if self.spine_ppd is not None and self.spine_ppd == 'rtl':
|
|
self.need3 = True
|
|
if tname == 'itemref':
|
|
skelid = tattr.pop('skelid', None)
|
|
if skelid is None and len(self.spine_order) == 0:
|
|
# assume it was removed initial coverpage
|
|
skelid = 'coverpage'
|
|
tattr['linear'] = 'no'
|
|
self.spine_order.append(skelid)
|
|
idref = tattr.pop('idref', None)
|
|
if idref is not None:
|
|
idref = 'x_' + idref
|
|
self.spine_idrefs[skelid] = idref
|
|
if 'id' in tattr:
|
|
del tattr['id']
|
|
# tattr["id"] = 'x_' + tattr["id"]
|
|
if 'properties' in tattr:
|
|
self.need3 = True
|
|
self.spine_pageattributes[skelid] = tattr
|
|
if tname == 'meta' or tname.startswith('dc:'):
|
|
if 'refines' in tattr or 'property' in tattr:
|
|
self.need3 = True
|
|
if tattr.get('name','') == 'cover':
|
|
cover_name = tattr.get('content',None)
|
|
if cover_name is not None:
|
|
cover_name = 'x_' + cover_name
|
|
self.cover_name = cover_name
|
|
else:
|
|
self.extrameta.append([tname, tattr, tcontent])
|
|
|
|
# parse and return either leading text or the next tag
|
|
def parseresc(self):
|
|
p = self.opos
|
|
if p >= len(self.resc):
|
|
return None, None
|
|
if self.resc[p] != '<':
|
|
res = self.resc.find('<',p)
|
|
if res == -1 :
|
|
res = len(self.resc)
|
|
self.opos = res
|
|
return self.resc[p:res], None
|
|
# handle comment as a special case
|
|
if self.resc[p:p+4] == '<!--':
|
|
te = self.resc.find('-->',p+1)
|
|
if te != -1:
|
|
te = te+2
|
|
else:
|
|
te = self.resc.find('>',p+1)
|
|
ntb = self.resc.find('<',p+1)
|
|
if ntb != -1 and ntb < te:
|
|
self.opos = ntb
|
|
return self.resc[p:ntb], None
|
|
self.opos = te + 1
|
|
return None, self.resc[p:te+1]
|
|
|
|
# parses tag to identify: [tname, ttype, tattr]
|
|
# tname: tag name
|
|
# ttype: tag type ('begin', 'end' or 'single');
|
|
# tattr: dictionary of tag atributes
|
|
def parsetag(self, s):
|
|
p = 1
|
|
tname = None
|
|
ttype = None
|
|
tattr = dict_()
|
|
while s[p:p+1] == ' ' :
|
|
p += 1
|
|
if s[p:p+1] == '/':
|
|
ttype = 'end'
|
|
p += 1
|
|
while s[p:p+1] == ' ' :
|
|
p += 1
|
|
b = p
|
|
while s[p:p+1] not in ('>', '/', ' ', '"', "'",'\r','\n') :
|
|
p += 1
|
|
tname=s[b:p].lower()
|
|
# some special cases
|
|
if tname == '?xml':
|
|
tname = 'xml'
|
|
if tname == '!--':
|
|
ttype = 'single'
|
|
comment = s[p:-3].strip()
|
|
tattr['comment'] = comment
|
|
if ttype is None:
|
|
# parse any attributes of begin or single tags
|
|
while s.find('=',p) != -1 :
|
|
while s[p:p+1] == ' ' :
|
|
p += 1
|
|
b = p
|
|
while s[p:p+1] != '=' :
|
|
p += 1
|
|
aname = s[b:p].lower()
|
|
aname = aname.rstrip(' ')
|
|
p += 1
|
|
while s[p:p+1] == ' ' :
|
|
p += 1
|
|
if s[p:p+1] in ('"', "'") :
|
|
p = p + 1
|
|
b = p
|
|
while s[p:p+1] not in ('"', "'"):
|
|
p += 1
|
|
val = s[b:p]
|
|
p += 1
|
|
else :
|
|
b = p
|
|
while s[p:p+1] not in ('>', '/', ' ') :
|
|
p += 1
|
|
val = s[b:p]
|
|
tattr[aname] = val
|
|
if ttype is None:
|
|
ttype = 'begin'
|
|
if s.find('/',p) >= 0:
|
|
ttype = 'single'
|
|
return ttype, tname, tattr
|
|
|
|
def taginfo_toxml(self, taginfo):
|
|
res = []
|
|
tname, tattr, tcontent = taginfo
|
|
res.append('<' + tname)
|
|
if tattr is not None:
|
|
for key in tattr:
|
|
res.append(' ' + key + '="'+tattr[key]+'"')
|
|
if tcontent is not None:
|
|
res.append('>' + tcontent + '</' + tname + '>\n')
|
|
else:
|
|
res.append('/>\n')
|
|
return "".join(res)
|
|
|
|
def hasSpine(self):
|
|
return len(self.spine_order) > 0
|
|
|
|
def needEPUB3(self):
|
|
return self.need3
|
|
|
|
def hasRefines(self):
|
|
for [tname, tattr, tcontent] in self.extrameta:
|
|
if 'refines' in tattr:
|
|
return True
|
|
return False
|
|
|
|
def createMetadata(self, epubver):
|
|
for taginfo in self.extrameta:
|
|
tname, tattr, tcontent = taginfo
|
|
if 'refines' in tattr:
|
|
if epubver == 'F' and 'property' in tattr:
|
|
attr = ' id="%s" opf:%s="%s"\n' % (tattr['refines'], tattr['property'], tcontent)
|
|
self.extra_attributes.append(attr)
|
|
else:
|
|
tag = self.taginfo_toxml(taginfo)
|
|
self.refines_metadata.append(tag)
|
|
else:
|
|
tag = self.taginfo_toxml(taginfo)
|
|
self.extra_metadata.append(tag)
|