#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# this program works in concert with the output from KindleUnpack
'''
Convert from Mobi ML to XHTML
'''
import os
import sys
import re
SPECIAL_HANDLING_TAGS = {
    '?xml'     : ('xmlheader', -1),
    '!--'      : ('comment', -3),
    '!DOCTYPE' : ('doctype', -1),
}
SPECIAL_HANDLING_TYPES = ['xmlheader', 'doctype', 'comment']
SELF_CLOSING_TAGS = ['br' , 'hr', 'input', 'img', 'image', 'meta', 'spacer', 'link', 'frame', 'base', 'col', 'reference']
class MobiMLConverter(object):
    PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
    IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
    def __init__(self, filename):
        self.base_css_rules =  'blockquote { margin: 0em 0em 0em 1.25em }\n'
        self.base_css_rules += 'p { margin: 0em }\n'
        self.base_css_rules += '.bold { font-weight: bold }\n'
        self.base_css_rules += '.italic { font-style: italic }\n'
        self.base_css_rules += '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n'
        self.tag_css_rules = {}
        self.tag_css_rule_cnt = 0
        self.path = []
        self.filename = filename
        self.wipml = open(self.filename, 'rb').read()
        self.pos = 0
        self.opfname = self.filename.rsplit('.',1)[0] + '.opf'
        self.opos = 0
        self.meta = ''
        self.cssname = os.path.join(os.path.dirname(self.filename),'styles.css')
        self.current_font_size = 3
        self.font_history = []
    def cleanup_html(self):
        self.wipml = re.sub(r'
', '', self.wipml)
        self.wipml = self.wipml.replace('\r\n', '\n')
        self.wipml = self.wipml.replace('> <', '>\n<')
        self.wipml = self.wipml.replace(']*>', '', self.wipml)
        self.wipml = self.wipml.replace('
','
')
    def replace_page_breaks(self):
        self.wipml = self.PAGE_BREAK_PAT.sub(
            '',
            self.wipml)
    # parse leading text of ml and tag
    def parseml(self):
        p = self.pos
        if p >= len(self.wipml):
            return None
        if self.wipml[p] != '<':
            res = self.wipml.find('<',p)
            if res == -1 :
                res = len(self.wipml)
            self.pos = res
            return self.wipml[p:res], None
        # handle comment as a special case to deal with multi-line comments
        if self.wipml[p:p+4] == '',p+1)
            if te != -1:
                te = te+2
        else :
            te = self.wipml.find('>',p+1)
            ntb = self.wipml.find('<',p+1)
            if ntb != -1 and ntb < te:
                self.pos = ntb
                return self.wipml[p:ntb], None
        self.pos = te + 1
        return None, self.wipml[p:te+1]
    # parses string version of tag to identify its name,
    # its type 'begin', 'end' or 'single',
    # plus build a hashtable of its attributes
    # code is written to handle the possiblity of very poor formating
    def parsetag(self, s):
        p = 1
        # get the tag name
        tname = None
        ttype = None
        tattr = {}
        while s[p:p+1] == ' ' :
            p += 1
        if s[p:p+1] == '/':
            ttype = 'end'
            p += 1
            while s[p:p+1] == ' ' :
                p += 1
        b = p
        while s[p:p+1] not in ('>', '/', ' ', '"', "'", "\r", "\n") :
            p += 1
        tname=s[b:p].lower()
        if tname == '!doctype':
            tname = '!DOCTYPE'
        # special cases
        if tname in SPECIAL_HANDLING_TAGS.keys():
            ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
            tattr['special'] = s[p:backstep]
        if ttype is None:
            # parse any attributes
            while s.find('=',p) != -1 :
                while s[p:p+1] == ' ' :
                    p += 1
                b = p
                while s[p:p+1] != '=' :
                    p += 1
                aname = s[b:p].lower()
                aname = aname.rstrip(' ')
                p += 1
                while s[p:p+1] == ' ' :
                    p += 1
                if s[p:p+1] in ('"', "'") :
                    p = p + 1
                    b = p
                    while s[p:p+1] not in ('"', "'") :
                        p += 1
                    val = s[b:p]
                    p += 1
                else :
                    b = p
                    while s[p:p+1] not in ('>', '/', ' ') :
                        p += 1
                    val = s[b:p]
                tattr[aname] = val
        # label beginning and single tags
        if ttype is None:
            ttype = 'begin'
            if s.find(' /',p) >= 0:
                ttype = 'single_ext'
            elif s.find('/',p) >= 0:
                ttype = 'single'
        return ttype, tname, tattr
    # main routine to convert from mobi markup language to html
    def processml(self):
        # are these really needed
        html_done = False
        head_done = False
        body_done = False
        skip = False
        htmlstr = ''
        self.replace_page_breaks()
        self.cleanup_html()
        # now parse the cleaned up ml into standard xhtml
        while True:
            r = self.parseml()
            if not r:
                break
            text, tag = r
            if text:
                if not skip:
                    htmlstr += text
            if tag:
                ttype, tname, tattr = self.parsetag(tag)
                # If we run into a DTD or xml declarations inside the body ... bail.
                if tname in SPECIAL_HANDLING_TAGS.keys() and tname != 'comment' and body_done:
                    htmlstr += '\n