526 lines
19 KiB
Python
Executable File
526 lines
19 KiB
Python
Executable File
#! /usr/bin/python
|
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
|
|
|
|
|
# this program works in concert with the output from KindleUnpack
|
|
|
|
'''
|
|
Convert from Mobi ML to XHTML
|
|
'''
|
|
|
|
import os
|
|
import sys
|
|
import re
|
|
|
|
# Tags that are not ordinary elements.  Each tag name maps to a pair of
# (tag type, backstep): parsetag() stores s[p:backstep] as the tag's
# 'special' payload, so backstep is the negative slice end that drops the
# closing delimiter ('>' for declarations, '-->' for comments).
SPECIAL_HANDLING_TAGS = {
    '?xml': ('xmlheader', -1),
    '!--': ('comment', -3),
    '!DOCTYPE': ('doctype', -1),
}

# Tag types produced from SPECIAL_HANDLING_TAGS; taginfo_tostring() writes
# tags of these types back out from their stored 'special' payload.
SPECIAL_HANDLING_TYPES = ['xmlheader', 'doctype', 'comment']

# Elements that are always emitted as self-closing; processml() turns their
# begin tags into 'single' tags and discards their end tags.
SELF_CLOSING_TAGS = [
    'br',
    'hr',
    'input',
    'img',
    'image',
    'meta',
    'spacer',
    'link',
    'frame',
    'base',
    'col',
    'reference',
]
|
|
|
|
class MobiMLConverter(object):
    """Convert Mobi markup language (rawml from KindleUnpack) to XHTML and CSS.

    Create the converter with the path of the markup file and call
    processml(), which returns the XHTML text, the text of the generated
    stylesheet and the path the stylesheet should be written to.
    """

    PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
    IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')

    # named mobi font sizes -> numeric font size levels
    _SIZE_MAP = {
        'xx-small': '1',
        'x-small': '2',
        'small': '3',
        'medium': '4',
        'large': '5',
        'x-large': '6',
        'xx-large': '7',
    }

    # numeric font size levels -> css font-size values
    _SIZE_TO_EM_MAP = {
        '1': '.65em',
        '2': '.75em',
        '3': '1em',
        '4': '1.125em',
        '5': '1.25em',
        '6': '1.5em',
        '7': '2em',
    }

    # characters treated as separators inside a tag
    _WHITESPACE = ' \t\r\n'

    def __init__(self, filename):
        """Read the markup file and reset all conversion state.

        filename -- path of the Mobi markup file to convert.
        Raises IOError/OSError if the file cannot be read.
        """
        # css rules that are always written to the stylesheet
        self.base_css_rules = 'blockquote { margin: 0em 0em 0em 1.25em }\n'
        self.base_css_rules += 'p { margin: 0em }\n'
        self.base_css_rules += '.bold { font-weight: bold }\n'
        self.base_css_rules += '.italic { font-style: italic }\n'
        self.base_css_rules += '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n'
        # generated class name -> css rule text, filled in by processtag
        self.tag_css_rules = {}
        self.tag_css_rule_cnt = 0
        # stack of the currently open (non-font) tag names
        self.path = []
        self.filename = filename
        # close the file deterministically instead of leaking the handle
        with open(self.filename, 'rb') as infile:
            raw = infile.read()
        if not isinstance(raw, str):
            # Python 3 returns bytes here, but every parser below works on
            # str.  NOTE(review): utf-8 is assumed for the markup - confirm
            # against the encoding KindleUnpack writes.
            raw = raw.decode('utf-8', 'replace')
        self.wipml = raw
        # current parse position inside self.wipml
        self.pos = 0
        self.opfname = self.filename.rsplit('.', 1)[0] + '.opf'
        self.opos = 0
        # extra markup inserted into <head> by processml
        self.meta = ''
        self.cssname = os.path.join(os.path.dirname(self.filename), 'styles.css')
        self.current_font_size = 3
        # stack of the font start tags that are currently open
        self.font_history = []

    def cleanup_html(self):
        """Normalise the markup in self.wipml before it is parsed."""
        markup = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.wipml)
        replacements = (
            ('\r\n', '\n'),
            ('> <', '>\n<'),
            ('<mbp: ', '<mbp:'),
            ('<br></br>', '<br/>'),
        )
        for old, new in replacements:
            markup = markup.replace(old, new)
        self.wipml = markup

    def replace_page_breaks(self):
        """Replace each run of mbp:pagebreak tags with one page break div."""
        self.wipml = self.PAGE_BREAK_PAT.sub(
            '<div class="mbp_pagebreak" />',
            self.wipml)

    # parse leading text of ml and tag
    def parseml(self):
        """Return the next piece of markup and advance self.pos past it.

        Returns None at the end of the markup, (text, None) for a run of
        text and (None, tag) for a complete tag or comment.
        """
        p = self.pos
        if p >= len(self.wipml):
            return None
        if self.wipml[p] != '<':
            res = self.wipml.find('<', p)
            if res == -1:
                res = len(self.wipml)
            self.pos = res
            return self.wipml[p:res], None
        # handle comment as a special case to deal with multi-line comments
        if self.wipml[p:p+4] == '<!--':
            te = self.wipml.find('-->', p+1)
            if te != -1:
                te = te + 2
        else:
            te = self.wipml.find('>', p+1)
            ntb = self.wipml.find('<', p+1)
            # a second '<' before any '>' means the first one was plain text
            if ntb != -1 and (te == -1 or ntb < te):
                self.pos = ntb
                return self.wipml[p:ntb], None
        if te == -1:
            # unterminated tag or comment: hand the remainder back as text
            # rather than resetting self.pos to 0, which never terminated
            self.pos = len(self.wipml)
            return self.wipml[p:], None
        self.pos = te + 1
        return None, self.wipml[p:te+1]

    # parses string version of tag to identify its name,
    # its type 'begin', 'end' or 'single',
    # plus build a hashtable of its attributes
    # code is written to handle the possibility of very poor formatting
    def parsetag(self, s):
        """Split the tag string s into (ttype, tname, tattr).

        ttype is 'begin', 'end', 'single', 'single_ext' or one of the
        special types from SPECIAL_HANDLING_TAGS; tname is the lower-cased
        tag name ('!DOCTYPE' keeps its case); tattr maps lower-cased
        attribute names to their values.  Special tags carry their raw
        content under the 'special' key instead.
        """
        ws = self._WHITESPACE
        end = len(s)
        p = 1
        ttype = None
        tattr = {}
        while p < end and s[p] in ws:
            p += 1
        if s[p:p+1] == '/':
            ttype = 'end'
            p += 1
            while p < end and s[p] in ws:
                p += 1
        # get the tag name
        b = p
        if s.startswith('!--', p):
            # a comment does not need whitespace after its opening marker
            p += 3
        else:
            stops = ('>', '/', '"', "'")
            while p < end and s[p] not in stops and s[p] not in ws:
                p += 1
        tname = s[b:p].lower()
        if tname == '!doctype':
            tname = '!DOCTYPE'
        # special cases
        if tname in SPECIAL_HANDLING_TAGS:
            ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
            tattr['special'] = s[p:backstep]
        if ttype is None:
            # parse any attributes
            while s.find('=', p) != -1:
                while p < end and s[p] in ws:
                    p += 1
                b = p
                p = s.find('=', p)
                aname = s[b:p].strip().lower()
                p += 1
                while p < end and s[p] in ws:
                    p += 1
                quote = s[p:p+1]
                if quote in ('"', "'"):
                    p += 1
                    b = p
                    # the value ends at the same kind of quote that opened it
                    close = s.find(quote, p)
                    if close == -1:
                        # unterminated quote: take everything up to the tag end
                        close = s.rfind('>', p)
                        if close == -1:
                            close = end
                    val = s[b:close]
                    p = close + 1
                else:
                    b = p
                    while p < end and s[p] not in ('>', '/') and s[p] not in ws:
                        p += 1
                    val = s[b:p]
                if aname:
                    tattr[aname] = val
        # label beginning and single tags
        if ttype is None:
            ttype = 'begin'
            if s.find(' /', p) >= 0:
                ttype = 'single_ext'
            elif s.find('/', p) >= 0:
                ttype = 'single'
        return ttype, tname, tattr

    # main routine to convert from mobi markup language to html
    def processml(self):
        """Convert the loaded markup and return (htmlstr, css, cssname).

        htmlstr is the complete XHTML document, css the text of the
        generated stylesheet and cssname the stylesheet path.
        """
        opening = ('begin', 'single', 'single_ext')
        stylesheet_link = '<link href="styles.css" rel="stylesheet" type="text/css" />\n'

        # are these really needed
        html_done = False
        head_done = False
        body_done = False

        skip = False

        # pieces of output, joined once at the end
        out = []
        self.replace_page_breaks()
        self.cleanup_html()

        # now parse the cleaned up ml into standard xhtml
        while True:

            r = self.parseml()
            if not r:
                break

            text, tag = r

            if text and not skip:
                out.append(text)

            if not tag:
                continue

            ttype, tname, tattr = self.parsetag(tag)

            # If we run into a DTD or xml declarations inside the body ... bail.
            # Comments are identified by their type; their tag name is '!--'.
            if tname in SPECIAL_HANDLING_TAGS and ttype != 'comment' and body_done:
                out.append('\n</body></html>')
                break

            # make sure self-closing tags actually self-close
            if ttype == 'begin' and tname in SELF_CLOSING_TAGS:
                ttype = 'single'

            # make sure any end tags of self-closing tags are discarded
            if ttype == 'end' and tname in SELF_CLOSING_TAGS:
                continue

            # remove embedded guide and references from old mobis
            if tname in ('guide', 'ncx', 'reference') and ttype in opening:
                tname = 'removeme:{0}'.format(tname)
                tattr = None
            if tname in ('guide', 'ncx', 'reference', 'font', 'span') and ttype == 'end':
                # self.path may be empty when the end tag has no start tag
                if self.path and self.path[-1] == 'removeme:{0}'.format(tname):
                    tname = 'removeme:{0}'.format(tname)
                    tattr = None

            # Get rid of font tags that only have a color attribute.
            if tname == 'font' and ttype in opening:
                if 'color' in tattr and len(tattr) == 1:
                    tname = 'removeme:{0}'.format(tname)
                    tattr = None

            # Get rid of empty spans in the markup.
            if tname == 'span' and ttype in opening and not len(tattr):
                tname = 'removeme:{0}'.format(tname)

            # need to handle fonts outside of the normal methods
            # so fonts tags won't be added to the self.path since we keep track
            # of font tags separately with self.font_history
            if tname == 'font' and ttype == 'begin':
                # check for nested font start tags
                if len(self.font_history) > 0:
                    # inject a font end tag
                    out.append(self.processtag(('end', 'font', None)))
                self.font_history.append((ttype, tname, tattr))
                # handle the current font start tag
                out.append(self.processtag((ttype, tname, tattr)))
                continue

            # check for nested font tags and unnest them
            if tname == 'font' and ttype == 'end':
                if not self.font_history:
                    # end tag without a start tag: nothing is open, drop it
                    continue
                self.font_history.pop()
                # handle this font end tag
                out.append(self.processtag(('end', 'font', None)))
                # check if we were nested
                if len(self.font_history) > 0:
                    # inject a copy of the most recent font start tag from
                    # history (processtag does not modify the stored attributes)
                    out.append(self.processtag(self.font_history[-1]))
                continue

            # keep track of nesting path
            if ttype == 'begin':
                self.path.append(tname)
            elif ttype == 'end':
                if not self.path or tname != self.path[-1]:
                    print ('improper nesting: ', self.path, tname, ttype)
                    if tname not in self.path:
                        # handle case of end tag with no beginning by injecting empty begin tag
                        out.append(self.processtag(('begin', tname, None)))
                        print(" - fixed by injecting empty start tag ", tname)
                        self.path.append(tname)
                    elif len(self.path) > 1 and tname == self.path[-2]:
                        # handle case of dangling missing end
                        out.append(self.processtag(('end', self.path[-1], None)))
                        print(" - fixed by injecting end tag ", self.path[-1])
                        self.path.pop()
                self.path.pop()

            # Text is dropped while a removed guide/ncx element is open.  The
            # flag is derived from the nesting path so it can never get stuck,
            # and removed font/span tags keep their text.
            skip = 'removeme:guide' in self.path or 'removeme:ncx' in self.path

            if not tname.startswith('removeme:'):
                out.append(self.processtag((ttype, tname, tattr)))

            # handle potential issue of multiple html, head, and body sections
            if tname == 'html' and ttype == 'begin' and not html_done:
                out.append('\n')
                html_done = True

            if tname == 'head' and ttype == 'begin' and not head_done:
                out.append('\n')
                # also add in metadata and style link tags
                out.append(self.meta)
                out.append(stylesheet_link)
                head_done = True

            if tname == 'body' and ttype == 'begin' and not body_done:
                out.append('\n')
                body_done = True

        htmlstr = ''.join(out)

        # handle issue of possibly missing html, head, and body tags
        # I have not seen this but the original did something like this so ...
        if not body_done:
            htmlstr = '<body>\n' + htmlstr + '</body>\n'
        if not head_done:
            headstr = '<head>\n'
            headstr += self.meta
            headstr += stylesheet_link
            headstr += '</head>\n'
            htmlstr = headstr + htmlstr
        if not html_done:
            htmlstr = '<html>\n' + htmlstr + '</html>\n'

        # finally add DOCTYPE info
        htmlstr = '<?xml version="1.0"?>\n<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n' + htmlstr

        css = self.base_css_rules
        css += ''.join('.%s { %s }\n' % (cls, rule)
                       for cls, rule in self.tag_css_rules.items())

        return (htmlstr, css, self.cssname)

    def ensure_unit(self, raw, unit='px'):
        """Return raw with unit appended when raw ends in a digit."""
        if re.search(r'\d+$', raw) is not None:
            raw += unit
        return raw

    # flatten possibly modified tag back to string
    def taginfo_tostring(self, taginfo):
        """Return the markup text for the (ttype, tname, tattr) tuple taginfo.

        Returns '' when the tag type or the tag name is None.
        """
        (ttype, tname, tattr) = taginfo
        if ttype is None or tname is None:
            return ''
        if ttype == 'end':
            return '</%s>' % tname
        if ttype in SPECIAL_HANDLING_TYPES and tattr is not None and 'special' in tattr:
            info = tattr['special']
            # both values have to be passed to the format operator as a tuple
            if ttype == 'comment':
                return '<%s %s-->' % (tname, info)
            return '<%s %s>' % (tname, info)
        res = ['<%s' % tname]
        if tattr is not None:
            for key in tattr:
                # a value taken from a single-quoted attribute may contain '"'
                val = tattr[key].replace('"', '&quot;')
                res.append(' %s="%s"' % (key, val))
        if ttype == 'single':
            res.append('/>')
        elif ttype == 'single_ext':
            res.append(' />')
        else:
            res.append('>')
        return "".join(res)

    # routines to convert from mobi ml tags attributes to xhtml attributes and styles
    def processtag(self, taginfo):
        """Convert one mobi ml tag into its xhtml text.

        taginfo is a (ttype, tname, tattr) tuple; tattr may be None.
        Presentation attributes are moved into css rules recorded in
        self.tag_css_rules and referenced through the class attribute.
        Returns '' for tags whose name is None or starts with 'removeme'.
        The attribute dictionary passed in is left untouched.
        """
        # current tag to work on
        (ttype, tname, tattr) = taginfo
        # Work on a copy: processml keeps font start tags in
        # self.font_history and converts them again later on.
        tattr = dict(tattr) if tattr else {}

        styles = []

        if tname is None or tname.startswith('removeme'):
            return ''

        # have not seen an example of this yet so keep it here to be safe
        # until this is better understood
        if tname in ('country-region', 'place', 'placetype', 'placename',
                     'state', 'city', 'street', 'address', 'content'):
            tname = 'div' if tname == 'content' else 'span'
            # popping keys while iterating over them fails on python 3
            tattr.clear()

        # handle general case of style, height, width, bgcolor in any tag
        if 'style' in tattr:
            style = tattr.pop('style').strip()
            if style:
                styles.append(style)

        if 'align' in tattr:
            align = tattr.pop('align').strip()
            # alignment is dropped for table, td and tr
            if align and tname not in ('table', 'td', 'tr'):
                styles.append('text-align: %s' % align)

        if 'height' in tattr:
            height = tattr.pop('height').strip()
            if height and '<' not in height and '>' not in height and re.search(r'\d+', height):
                if tname in ('table', 'td', 'tr'):
                    pass
                elif tname == 'img':
                    tattr['height'] = height
                else:
                    styles.append('margin-top: %s' % self.ensure_unit(height))

        if 'width' in tattr:
            width = tattr.pop('width').strip()
            if width and re.search(r'\d+', width):
                if tname in ('table', 'td', 'tr'):
                    pass
                elif tname == 'img':
                    tattr['width'] = width
                else:
                    styles.append('text-indent: %s' % self.ensure_unit(width))
                    if width.startswith('-'):
                        styles.append('margin-left: %s' % self.ensure_unit(width[1:]))

        # no proprietary html allowed
        if 'bgcolor' in tattr and tname == 'div':
            del tattr['bgcolor']

        # This test must not depend on the bgcolor test above: a font start
        # tag with a bgcolor attribute has to become a span as well, because
        # its end tag always does.
        if tname == 'font':
            # Change font tags to span tags
            tname = 'span'
            if ttype in ('begin', 'single', 'single_ext'):
                # move the face attribute to css font-family
                if 'face' in tattr:
                    face = tattr.pop('face').strip()
                    styles.append('font-family: "%s"' % face)

                # Monitor the constantly changing font sizes, change them to ems and move
                # them to css. The following will work for 'flat' font tags, but nested font tags
                # will cause things to go wonky. Need to revert to the parent font tag's size
                # when a closing tag is encountered.
                if 'size' in tattr:
                    sz = tattr.pop('size').strip().lower()
                    level = None
                    if sz in self._SIZE_MAP:
                        level = int(self._SIZE_MAP[sz])
                    else:
                        try:
                            num = float(sz)
                        except ValueError:
                            # not a size we understand: keep the current size
                            num = None
                        if num is not None:
                            if sz.startswith(('-', '+')):
                                # signed sizes are relative to the current size
                                num = self.current_font_size + num
                            # keep the level inside the range of the em table
                            level = int(min(7, max(1, num)))
                    if level is not None:
                        styles.append('font-size: %s' % self._SIZE_TO_EM_MAP[str(level)])
                        self.current_font_size = level

        elif tname == 'img':
            for attr in ('width', 'height'):
                if attr in tattr:
                    val = tattr[attr]
                    if val.lower().endswith('em'):
                        try:
                            nval = float(val[:-2])
                            nval *= 16 * (168.451/72)  # Assume this was set using the Kindle profile
                            tattr[attr] = "%dpx" % int(nval)
                        except (ValueError, OverflowError):
                            del tattr[attr]
                    elif val.lower().endswith('%'):
                        del tattr[attr]

        # convert the anchor tags
        if 'filepos-id' in tattr:
            tattr['id'] = tattr.pop('filepos-id')
            if 'name' in tattr and tattr['name'] != tattr['id']:
                tattr['name'] = tattr['id']

        if 'filepos' in tattr:
            filepos = tattr.pop('filepos')
            try:
                tattr['href'] = "#filepos%d" % int(filepos)
            except ValueError:
                pass

        if styles:
            ncls = None
            rule = '; '.join(styles)
            # reuse the class of an identical rule when there is one
            for sel, srule in self.tag_css_rules.items():
                if srule == rule:
                    ncls = sel
                    break
            if ncls is None:
                self.tag_css_rule_cnt += 1
                ncls = 'rule_%d' % self.tag_css_rule_cnt
                self.tag_css_rules[ncls] = rule
            cls = tattr.get('class', '')
            cls = cls + (' ' if cls else '') + ncls
            tattr['class'] = cls

        # convert updated tag back to string representation
        if len(tattr) == 0:
            tattr = None
        taginfo = (ttype, tname, tattr)
        return self.taginfo_tostring(taginfo)
|
|
|
|
''' main only left in for testing outside of plugin '''
|
|
|
|
def main(argv=sys.argv):
    """Convert the markup file named in argv[1] and write the results.

    The XHTML goes to '<input name without extension>_converted.html' and the
    stylesheet to the path reported by the converter.

    argv -- argument list; exactly one argument (the input file) is expected
            after the program name.
    Returns 0 on success and 1 on a wrong argument count or a ValueError
    raised during the conversion.
    """
    if len(argv) != 2:
        return 1
    infile = argv[1]

    try:
        print('Converting Mobi Markup Language to XHTML')
        mlc = MobiMLConverter(infile)
        print('Processing ...')
        htmlstr, css, cssname = mlc.processml()
        outname = infile.rsplit('.', 1)[0] + '_converted.html'
        # file() only exists on python 2; open() works on both, and the with
        # block closes each output file even when the write fails
        for path, payload in ((outname, htmlstr), (cssname, css)):
            if not isinstance(payload, bytes):
                # the files are opened in binary mode, so text is encoded first
                payload = payload.encode('utf-8')
            with open(path, 'wb') as outfile:
                outfile.write(payload)
        print('Completed')
        print('XHTML version of book can be found at: ', outname)

    except ValueError as e:
        print("Error: %s" % e)
        return 1

    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # run the converter and hand its return value to the shell as exit status
    exit_status = main()
    sys.exit(exit_status)
|