diff --git a/lector/KindleUnpack/mobiml2xhtml.py b/lector/KindleUnpack/mobiml2xhtml.py deleted file mode 100755 index 85be8ba..0000000 --- a/lector/KindleUnpack/mobiml2xhtml.py +++ /dev/null @@ -1,525 +0,0 @@ -#! /usr/bin/python -# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab - - -# this program works in concert with the output from KindleUnpack - -''' -Convert from Mobi ML to XHTML -''' - -import os -import sys -import re - -SPECIAL_HANDLING_TAGS = { - '?xml' : ('xmlheader', -1), - '!--' : ('comment', -3), - '!DOCTYPE' : ('doctype', -1), -} - -SPECIAL_HANDLING_TYPES = ['xmlheader', 'doctype', 'comment'] - -SELF_CLOSING_TAGS = ['br' , 'hr', 'input', 'img', 'image', 'meta', 'spacer', 'link', 'frame', 'base', 'col', 'reference'] - -class MobiMLConverter(object): - - PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE) - IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex') - - def __init__(self, filename): - self.base_css_rules = 'blockquote { margin: 0em 0em 0em 1.25em }\n' - self.base_css_rules += 'p { margin: 0em }\n' - self.base_css_rules += '.bold { font-weight: bold }\n' - self.base_css_rules += '.italic { font-style: italic }\n' - self.base_css_rules += '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n' - self.tag_css_rules = {} - self.tag_css_rule_cnt = 0 - self.path = [] - self.filename = filename - self.wipml = open(self.filename, 'rb').read() - self.pos = 0 - self.opfname = self.filename.rsplit('.',1)[0] + '.opf' - self.opos = 0 - self.meta = '' - self.cssname = os.path.join(os.path.dirname(self.filename),'styles.css') - self.current_font_size = 3 - self.font_history = [] - - def cleanup_html(self): - self.wipml = re.sub(r'
', '', self.wipml) - self.wipml = self.wipml.replace('\r\n', '\n') - self.wipml = self.wipml.replace('> <', '>\n<') - self.wipml = self.wipml.replace(']*>', '', self.wipml) - self.wipml = self.wipml.replace('

','
') - - def replace_page_breaks(self): - self.wipml = self.PAGE_BREAK_PAT.sub( - '
', - self.wipml) - - # parse leading text of ml and tag - def parseml(self): - p = self.pos - if p >= len(self.wipml): - return None - if self.wipml[p] != '<': - res = self.wipml.find('<',p) - if res == -1 : - res = len(self.wipml) - self.pos = res - return self.wipml[p:res], None - # handle comment as a special case to deal with multi-line comments - if self.wipml[p:p+4] == '',p+1) - if te != -1: - te = te+2 - else : - te = self.wipml.find('>',p+1) - ntb = self.wipml.find('<',p+1) - if ntb != -1 and ntb < te: - self.pos = ntb - return self.wipml[p:ntb], None - self.pos = te + 1 - return None, self.wipml[p:te+1] - - # parses string version of tag to identify its name, - # its type 'begin', 'end' or 'single', - # plus build a hashtable of its attributes - # code is written to handle the possiblity of very poor formating - def parsetag(self, s): - p = 1 - # get the tag name - tname = None - ttype = None - tattr = {} - while s[p:p+1] == ' ' : - p += 1 - if s[p:p+1] == '/': - ttype = 'end' - p += 1 - while s[p:p+1] == ' ' : - p += 1 - b = p - while s[p:p+1] not in ('>', '/', ' ', '"', "'", "\r", "\n") : - p += 1 - tname=s[b:p].lower() - if tname == '!doctype': - tname = '!DOCTYPE' - # special cases - if tname in SPECIAL_HANDLING_TAGS.keys(): - ttype, backstep = SPECIAL_HANDLING_TAGS[tname] - tattr['special'] = s[p:backstep] - if ttype is None: - # parse any attributes - while s.find('=',p) != -1 : - while s[p:p+1] == ' ' : - p += 1 - b = p - while s[p:p+1] != '=' : - p += 1 - aname = s[b:p].lower() - aname = aname.rstrip(' ') - p += 1 - while s[p:p+1] == ' ' : - p += 1 - if s[p:p+1] in ('"', "'") : - p = p + 1 - b = p - while s[p:p+1] not in ('"', "'") : - p += 1 - val = s[b:p] - p += 1 - else : - b = p - while s[p:p+1] not in ('>', '/', ' ') : - p += 1 - val = s[b:p] - tattr[aname] = val - # label beginning and single tags - if ttype is None: - ttype = 'begin' - if s.find(' /',p) >= 0: - ttype = 'single_ext' - elif s.find('/',p) >= 0: - ttype = 'single' - return ttype, tname, tattr - - # main routine to convert from mobi markup language to html - def processml(self): - - # are these really needed - html_done = False - head_done = False - body_done = False - - skip = False - - htmlstr = '' - self.replace_page_breaks() - self.cleanup_html() - - # now parse the cleaned up ml into standard xhtml - while True: - - r = self.parseml() - if not r: - break - - text, tag = r - - if text: - if not skip: - htmlstr += text - - if tag: - ttype, tname, tattr = self.parsetag(tag) - - # If we run into a DTD or xml declarations inside the body ... bail. - if tname in SPECIAL_HANDLING_TAGS.keys() and tname != 'comment' and body_done: - htmlstr += '\n' - break - - # make sure self-closing tags actually self-close - if ttype == 'begin' and tname in SELF_CLOSING_TAGS: - ttype = 'single' - - # make sure any end tags of self-closing tags are discarded - if ttype == 'end' and tname in SELF_CLOSING_TAGS: - continue - - # remove embedded guide and refernces from old mobis - if tname in ('guide', 'ncx', 'reference') and ttype in ('begin', 'single', 'single_ext'): - tname = 'removeme:{0}'.format(tname) - tattr = None - if tname in ('guide', 'ncx', 'reference', 'font', 'span') and ttype == 'end': - if self.path[-1] == 'removeme:{0}'.format(tname): - tname = 'removeme:{0}'.format(tname) - tattr = None - - # Get rid of font tags that only have a color attribute. - if tname == 'font' and ttype in ('begin', 'single', 'single_ext'): - if 'color' in tattr.keys() and len(tattr.keys()) == 1: - tname = 'removeme:{0}'.format(tname) - tattr = None - - # Get rid of empty spans in the markup. - if tname == 'span' and ttype in ('begin', 'single', 'single_ext') and not len(tattr): - tname = 'removeme:{0}'.format(tname) - - # need to handle fonts outside of the normal methods - # so fonts tags won't be added to the self.path since we keep track - # of font tags separately with self.font_history - if tname == 'font' and ttype == 'begin': - # check for nested font start tags - if len(self.font_history) > 0 : - # inject a font end tag - taginfo = ('end', 'font', None) - htmlstr += self.processtag(taginfo) - self.font_history.append((ttype, tname, tattr)) - # handle the current font start tag - taginfo = (ttype, tname, tattr) - htmlstr += self.processtag(taginfo) - continue - - # check for nested font tags and unnest them - if tname == 'font' and ttype == 'end': - self.font_history.pop() - # handle this font end tag - taginfo = ('end', 'font', None) - htmlstr += self.processtag(taginfo) - # check if we were nested - if len(self.font_history) > 0: - # inject a copy of the most recent font start tag from history - taginfo = self.font_history[-1] - htmlstr += self.processtag(taginfo) - continue - - # keep track of nesting path - if ttype == 'begin': - self.path.append(tname) - elif ttype == 'end': - if tname != self.path[-1]: - print ('improper nesting: ', self.path, tname, ttype) - if tname not in self.path: - # handle case of end tag with no beginning by injecting empty begin tag - taginfo = ('begin', tname, None) - htmlstr += self.processtag(taginfo) - print " - fixed by injecting empty start tag ", tname - self.path.append(tname) - elif len(self.path) > 1 and tname == self.path[-2]: - # handle case of dangling missing end - taginfo = ('end', self.path[-1], None) - htmlstr += self.processtag(taginfo) - print " - fixed by injecting end tag ", self.path[-1] - self.path.pop() - self.path.pop() - - if tname == 'removeme:{0}'.format(tname): - if ttype in ('begin', 'single', 'single_ext'): - skip = True - else: - skip = False - else: - taginfo = (ttype, tname, tattr) - htmlstr += self.processtag(taginfo) - - # handle potential issue of multiple html, head, and body sections - if tname == 'html' and ttype == 'begin' and not html_done: - htmlstr += '\n' - html_done = True - - if tname == 'head' and ttype == 'begin' and not head_done: - htmlstr += '\n' - # also add in metadata and style link tags - htmlstr += self.meta - htmlstr += '\n' - head_done = True - - if tname == 'body' and ttype == 'begin' and not body_done: - htmlstr += '\n' - body_done = True - - # handle issue of possibly missing html, head, and body tags - # I have not seen this but the original did something like this so ... - if not body_done: - htmlstr = '\n' + htmlstr + '\n' - if not head_done: - headstr = '\n' - headstr += self.meta - headstr += '\n' - headstr += '\n' - htmlstr = headstr + htmlstr - if not html_done: - htmlstr = '\n' + htmlstr + '\n' - - # finally add DOCTYPE info - htmlstr = '\n\n' + htmlstr - - css = self.base_css_rules - for cls, rule in self.tag_css_rules.items(): - css += '.%s { %s }\n' % (cls, rule) - - return (htmlstr, css, self.cssname) - - def ensure_unit(self, raw, unit='px'): - if re.search(r'\d+$', raw) is not None: - raw += unit - return raw - - # flatten possibly modified tag back to string - def taginfo_tostring(self, taginfo): - (ttype, tname, tattr) = taginfo - if ttype is None or tname is None: - return '' - if ttype == 'end': - return '' % tname - if ttype in SPECIAL_HANDLING_TYPES and tattr is not None and 'special' in tattr.keys(): - info = tattr['special'] - if ttype == 'comment': - return '<%s %s-->' % tname, info - else: - return '<%s %s>' % tname, info - res = [] - res.append('<%s' % tname) - if tattr is not None: - for key in tattr.keys(): - res.append(' %s="%s"' % (key, tattr[key])) - if ttype == 'single': - res.append('/>') - elif ttype == 'single_ext': - res.append(' />') - else : - res.append('>') - return "".join(res) - - # routines to convert from mobi ml tags atributes to xhtml attributes and styles - def processtag(self, taginfo): - # Converting mobi font sizes to numerics - size_map = { - 'xx-small': '1', - 'x-small': '2', - 'small': '3', - 'medium': '4', - 'large': '5', - 'x-large': '6', - 'xx-large': '7', - } - - size_to_em_map = { - '1': '.65em', - '2': '.75em', - '3': '1em', - '4': '1.125em', - '5': '1.25em', - '6': '1.5em', - '7': '2em', - } - - # current tag to work on - (ttype, tname, tattr) = taginfo - if not tattr: - tattr = {} - - styles = [] - - if tname is None or tname.startswith('removeme'): - return '' - - # have not seen an example of this yet so keep it here to be safe - # until this is better understood - if tname in ('country-region', 'place', 'placetype', 'placename', - 'state', 'city', 'street', 'address', 'content'): - tname = 'div' if tname == 'content' else 'span' - for key in tattr.keys(): - tattr.pop(key) - - # handle general case of style, height, width, bgcolor in any tag - if 'style' in tattr.keys(): - style = tattr.pop('style').strip() - if style: - styles.append(style) - - if 'align' in tattr.keys(): - align = tattr.pop('align').strip() - if align: - if tname in ('table', 'td', 'tr'): - pass - else: - styles.append('text-align: %s' % align) - - if 'height' in tattr.keys(): - height = tattr.pop('height').strip() - if height and '<' not in height and '>' not in height and re.search(r'\d+', height): - if tname in ('table', 'td', 'tr'): - pass - elif tname == 'img': - tattr['height'] = height - else: - styles.append('margin-top: %s' % self.ensure_unit(height)) - - if 'width' in tattr.keys(): - width = tattr.pop('width').strip() - if width and re.search(r'\d+', width): - if tname in ('table', 'td', 'tr'): - pass - elif tname == 'img': - tattr['width'] = width - else: - styles.append('text-indent: %s' % self.ensure_unit(width)) - if width.startswith('-'): - styles.append('margin-left: %s' % self.ensure_unit(width[1:])) - - if 'bgcolor' in tattr.keys(): - # no proprietary html allowed - if tname == 'div': - del tattr['bgcolor'] - - elif tname == 'font': - # Change font tags to span tags - tname = 'span' - if ttype in ('begin', 'single', 'single_ext'): - # move the face attribute to css font-family - if 'face' in tattr.keys(): - face = tattr.pop('face').strip() - styles.append('font-family: "%s"' % face) - - # Monitor the constantly changing font sizes, change them to ems and move - # them to css. The following will work for 'flat' font tags, but nested font tags - # will cause things to go wonky. Need to revert to the parent font tag's size - # when a closing tag is encountered. - if 'size' in tattr.keys(): - sz = tattr.pop('size').strip().lower() - try: - float(sz) - except ValueError: - if sz in size_map.keys(): - sz = size_map[sz] - else: - if sz.startswith('-') or sz.startswith('+'): - sz = self.current_font_size + float(sz) - if sz > 7: - sz = 7 - elif sz < 1: - sz = 1 - sz = str(int(sz)) - styles.append('font-size: %s' % size_to_em_map[sz]) - self.current_font_size = int(sz) - - elif tname == 'img': - for attr in ('width', 'height'): - if attr in tattr: - val = tattr[attr] - if val.lower().endswith('em'): - try: - nval = float(val[:-2]) - nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile - tattr[attr] = "%dpx"%int(nval) - except: - del tattr[attr] - elif val.lower().endswith('%'): - del tattr[attr] - - # convert the anchor tags - if 'filepos-id' in tattr: - tattr['id'] = tattr.pop('filepos-id') - if 'name' in tattr and tattr['name'] != tattr['id']: - tattr['name'] = tattr['id'] - - if 'filepos' in tattr: - filepos = tattr.pop('filepos') - try: - tattr['href'] = "#filepos%d" % int(filepos) - except ValueError: - pass - - if styles: - ncls = None - rule = '; '.join(styles) - for sel, srule in self.tag_css_rules.items(): - if srule == rule: - ncls = sel - break - if ncls is None: - self.tag_css_rule_cnt += 1 - ncls = 'rule_%d' % self.tag_css_rule_cnt - self.tag_css_rules[ncls] = rule - cls = tattr.get('class', '') - cls = cls + (' ' if cls else '') + ncls - tattr['class'] = cls - - # convert updated tag back to string representation - if len(tattr) == 0: - tattr = None - taginfo = (ttype, tname, tattr) - return self.taginfo_tostring(taginfo) - -''' main only left in for testing outside of plugin ''' - -def main(argv=sys.argv): - if len(argv) != 2: - return 1 - else: - infile = argv[1] - - try: - print 'Converting Mobi Markup Language to XHTML' - mlc = MobiMLConverter(infile) - print 'Processing ...' - htmlstr, css, cssname = mlc.processml() - outname = infile.rsplit('.',1)[0] + '_converted.html' - file(outname, 'wb').write(htmlstr) - file(cssname, 'wb').write(css) - print 'Completed' - print 'XHTML version of book can be found at: ' + outname - - except ValueError, e: - print "Error: %s" % e - return 1 - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/lector/__main__.py b/lector/__main__.py index 8386682..1289b92 100755 --- a/lector/__main__.py +++ b/lector/__main__.py @@ -429,15 +429,15 @@ class MainUI(QtWidgets.QMainWindow, mainwindow.Ui_MainWindow): logger.info( 'Attempting to open: ' + ', '.join(file_paths)) - contents = sorter.BookSorter( + contents, errors = sorter.BookSorter( file_paths, ('reading', None), self.database_path, self.settings, self.temp_dir.path()).initiate_threads() - # TODO - # Notification feedback in case all books return nothing + if errors: + self.display_error_notification(errors) if not contents: logger.error('No parseable files found')