Incorporate ebooklib from https://github.com/aerkalov/ebooklib

2017-11-06 07:42:48 +05:30
parent b5cbc3d529
commit f3d748f5dd
9 changed files with 2245 additions and 0 deletions
--- a/ebooklib/init.py
+++ b/ebooklib/init.py
@@ -0,0 +1,42 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib.  If not, see <http://www.gnu.org/licenses/>.
+
+# Version of ebook library
+
+VERSION = (0, 16, 0)
+
+# Numeric identifiers for the kinds of items a book can contain.
+(ITEM_UNKNOWN,
+ ITEM_IMAGE,
+ ITEM_STYLE,
+ ITEM_SCRIPT,
+ ITEM_NAVIGATION,
+ ITEM_VECTOR,
+ ITEM_FONT,
+ ITEM_VIDEO,
+ ITEM_AUDIO,
+ ITEM_DOCUMENT) = range(10)
+
+# File extensions (lower case, with the leading dot) associated with each
+# item kind.  ITEM_UNKNOWN and ITEM_DOCUMENT have no entry here.
+EXTENSIONS = {
+    ITEM_IMAGE: ['.jpg', '.jpeg', '.gif', '.tiff', '.tif', '.png'],
+    ITEM_STYLE: ['.css'],
+    ITEM_VECTOR: ['.svg'],
+    ITEM_FONT: ['.otf', '.woff', '.ttf'],
+    ITEM_SCRIPT: ['.js'],
+    ITEM_NAVIGATION: ['.ncx'],
+    ITEM_VIDEO: ['.mov', '.mp4', '.avi'],
+    ITEM_AUDIO: ['.mp3', '.ogg'],
+}
--- a/ebooklib/epub.py
+++ b/ebooklib/epub.py
--- a/ebooklib/plugins/init.py
+++ b/ebooklib/plugins/init.py
--- a/ebooklib/plugins/base.py
+++ b/ebooklib/plugins/base.py
@@ -0,0 +1,49 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib.  If not, see <http://www.gnu.org/licenses/>.
+
+
+class BasePlugin(object):
+    """Base class for plugins.
+
+    Every hook defined here is a no-op that returns True; subclasses
+    override the hooks they are interested in.
+    """
+
+    def before_write(self, book):
+        """Processing before save."""
+        return True
+
+    def after_write(self, book):
+        """Processing after save."""
+        return True
+
+    def before_read(self, book):
+        """Processing before read."""
+        return True
+
+    def after_read(self, book):
+        """Processing after read."""
+        return True
+
+    def item_after_read(self, book, item):
+        """Process general item after read."""
+        return True
+
+    def item_before_write(self, book, item):
+        """Process general item before write."""
+        return True
+
+    def html_after_read(self, book, chapter):
+        """Processing HTML after read."""
+        return True
+
+    def html_before_write(self, book, chapter):
+        """Processing HTML before save."""
+        return True
--- a/ebooklib/plugins/booktype.py
+++ b/ebooklib/plugins/booktype.py
@@ -0,0 +1,119 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib.  If not, see <http://www.gnu.org/licenses/>.
+
+from ebooklib.plugins.base import BasePlugin
+from ebooklib.utils import parse_html_string
+
+class BooktypeLinks(BasePlugin):
+    """Rewrite the internal links of a chapter before it is written."""
+
+    NAME = 'Booktype Links'
+
+    def __init__(self, booktype_book):
+        self.booktype_book = booktype_book
+
+    def html_before_write(self, book, chapter):
+        """Rewrite the ``<a>`` elements found in ``chapter.content``.
+
+        For links without a URL scheme, ``.xhtml`` is appended to the path,
+        the fragment is re-attached and a ``name`` attribute is turned into
+        ``id``.  Footnote links (href containing ``InsertNoteID``) are
+        reduced to their fragment.  ``chapter.content`` is replaced with the
+        re-serialised tree.
+
+        Returns None.  Content that cannot be parsed is left untouched.
+        """
+        from lxml import etree
+
+        try:
+            from urlparse import urlparse, urljoin
+        except ImportError:
+            from urllib.parse import urlparse, urljoin
+
+        try:
+            tree = parse_html_string(chapter.content)
+        except Exception:
+            # Best effort: leave content we cannot parse as it is.
+            return
+
+        body = tree.find('body')
+
+        # find() returns None for a document without <body>; calling len()
+        # on that would raise TypeError.
+        if body is not None and len(body) != 0:
+            # should also be aware to handle
+            # ../chapter/
+            # ../chapter/#reference
+            # ../chapter#reference
+
+            for _link in body.xpath('//a'):
+                _href = _link.get('href', '')
+
+                # This is just temporary for the footnotes
+                if 'InsertNoteID' in _href:
+                    i = _href.find('#')
+                    # Without a '#' there is no fragment to keep; slicing
+                    # with -1 would leave only the last character.
+                    if i != -1:
+                        _link.set('href', _href[i:])
+
+                    continue
+
+                _u = urlparse(_href)
+
+                # Let us care only for internal links at the moment
+                if _u.scheme == '':
+                    if _u.path != '':
+                        _link.set('href', '%s.xhtml' % _u.path)
+
+                    if _u.fragment != '':
+                        _link.set('href', urljoin(_link.get('href'), '#%s' % _u.fragment))
+
+                    if _link.get('name') is not None:
+                        _link.set('id', _link.get('name'))
+                        etree.strip_attributes(_link, 'name')
+
+        chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8')
+            
+
+
+
+class BooktypeFootnotes(BasePlugin):
+    """Convert Booktype footnote markup into EPUB3 noteref/footnote markup."""
+
+    NAME = 'Booktype Footnotes'
+
+    def __init__(self, booktype_book):
+        self.booktype_book = booktype_book
+
+    def html_before_write(self, book, chapter):
+        """Rewrite the footnotes found in ``chapter.content``.
+
+        Each ``<span class="InsertNoteMarker">`` gets its inner link marked
+        as ``epub:type="noteref"`` and a matching
+        ``<aside epub:type="footnote">`` is appended to the body.  The
+        original ``<ol id="InsertNote_NoteList">`` is removed afterwards.
+        Markers that do not have the expected structure are skipped.
+
+        Returns None.  Content that cannot be parsed is left untouched.
+        """
+        from lxml import etree
+
+        from ebooklib import epub
+
+        try:
+            tree = parse_html_string(chapter.content)
+        except Exception:
+            # Best effort: leave content we cannot parse as it is.
+            return
+
+        body = tree.find('body')
+
+        # find() returns None for a document without <body>; calling len()
+        # on that would raise TypeError.
+        if body is not None and len(body) != 0:
+            # <span id="InsertNoteID_1_marker1" class="InsertNoteMarker"><sup><a href="#InsertNoteID_1">1</a></sup><span>
+            # <ol id="InsertNote_NoteList"><li id="InsertNoteID_1">prvi footnote <span id="InsertNoteID_1_LinkBacks"><sup><a href="#InsertNoteID_1_marker1">^</a></sup></span></li>
+
+            # <a epub:type="noteref" href="#n1">1</a></p>
+            # <aside epub:type="footnote" id="n1"><p>These have been corrected in this EPUB3 edition.</p></aside>
+            for footnote in body.xpath('//span[@class="InsertNoteMarker"]'):
+                marker_id = footnote.get('id')
+
+                # Skip markers without an id or without the <sup><a> nesting
+                # shown above instead of crashing on them.
+                if marker_id is None or len(footnote) == 0 or len(footnote[0]) == 0:
+                    continue
+
+                # Drop the trailing 8 characters ("_marker1" in the sample
+                # above) to get the id of the footnote text.
+                footnote_id = marker_id[:-8]
+                a = footnote[0][0]
+
+                matches = body.xpath('//li[@id="%s"]' % footnote_id)
+                if not matches:
+                    continue
+
+                footnote_text = matches[0]
+                epub_type = '{%s}type' % epub.NAMESPACES['EPUB']
+
+                a.attrib[epub_type] = 'noteref'
+                ftn = etree.SubElement(body, 'aside', {'id': footnote_id})
+                ftn.attrib[epub_type] = 'footnote'
+                ftn_p = etree.SubElement(ftn, 'p')
+                ftn_p.text = footnote_text.text
+
+            old_footnote = body.xpath('//ol[@id="InsertNote_NoteList"]')
+            if len(old_footnote) > 0:
+                # Remove through the real parent: the list is not
+                # necessarily a direct child of <body>.
+                old_footnote[0].getparent().remove(old_footnote[0])
+
+        chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8')
--- a/ebooklib/plugins/sourcecode.py
+++ b/ebooklib/plugins/sourcecode.py
@@ -0,0 +1,68 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib.  If not, see <http://www.gnu.org/licenses/>.
+
+from ebooklib.plugins.base import BasePlugin
+from ebooklib.utils import parse_html_string
+
+class SourceHighlighter(BasePlugin):
+    """Highlight ``<pre class="source-...">`` blocks with Pygments."""
+
+    def __init__(self):
+        pass
+
+    def html_before_write(self, book, chapter):
+        """Replace embedded source listings with highlighted markup.
+
+        ``<pre>`` elements whose class contains ``source-python`` or
+        ``source-css`` are replaced with the Pygments HTML output.  Other
+        ``source-*`` classes are left as they are.  When at least one
+        listing was replaced, a link to ``style/code.css`` is added to the
+        chapter and ``chapter.content`` is re-serialised.
+
+        Returns None.  Content that cannot be parsed is left untouched.
+        """
+        from lxml import etree, html
+
+        from pygments import highlight
+        from pygments.formatters import HtmlFormatter
+
+        try:
+            tree = parse_html_string(chapter.content)
+        except Exception:
+            # Best effort: leave content we cannot parse as it is.
+            return
+
+        had_source = False
+
+        body = tree.find('body')
+
+        # find() returns None for a document without <body>; calling len()
+        # on that would raise TypeError.
+        if body is not None and len(body) != 0:
+            # check for embedded source
+            for source in body.xpath('//pre[contains(@class,"source-")]'):
+                css_class = source.get('class')
+
+                # When both classes are present the CSS lexer wins, which is
+                # what the original sequence of checks produced.
+                lexer = None
+
+                if 'source-python' in css_class:
+                    from pygments.lexers import PythonLexer
+
+                    lexer = PythonLexer()
+
+                if 'source-css' in css_class:
+                    from pygments.lexers import CssLexer
+
+                    lexer = CssLexer()
+
+                if lexer is None:
+                    # No lexer for this language: keep the listing as is.
+                    continue
+
+                # Serialise children as text; the default bytes output
+                # cannot be joined with a str on Python 3.
+                source_text = (source.text or '') + ''.join(
+                    [html.tostring(child, encoding='unicode') for child in source.iterchildren()])
+
+                _text = highlight(source_text, lexer, HtmlFormatter())
+
+                _parent = source.getparent()
+                _parent.replace(source, etree.XML(_text))
+
+                had_source = True
+
+        if had_source:
+            chapter.add_link(href="style/code.css", rel="stylesheet", type="text/css")
+            chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8')
+
--- a/ebooklib/plugins/standard.py
+++ b/ebooklib/plugins/standard.py
@@ -0,0 +1,230 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib.  If not, see <http://www.gnu.org/licenses/>.
+
+import six
+
+from ebooklib.plugins.base import BasePlugin
+from ebooklib.utils import parse_html_string
+
+# TODO:
+#   - should also look for the _required_ elements
+# http://www.w3.org/html/wg/drafts/html/master/tabular-data.html#the-table-element
+
+# Attribute names that are kept on every element.
+ATTRIBUTES_GLOBAL = [
+    'accesskey', 'class', 'contenteditable', 'contextmenu',
+    'dir', 'draggable', 'dropzone', 'hidden',
+    'id', 'inert', 'itemid', 'itemprop',
+    'itemref', 'itemscope', 'itemtype', 'lang',
+    'spellcheck', 'style', 'tabindex', 'title',
+    'translate', 'epub:type',
+]
+
+# Tags that get stripped from the document.
+# <u> is deliberately left out of this list for now.
+DEPRECATED_TAGS = [
+    'acronym', 'applet', 'basefont', 'big', 'center', 'dir', 'font',
+    'frame', 'frameset', 'isindex', 'noframes', 's', 'strike', 'tt',
+]
+
+
+def leave_only(item, tag_list):
+    """Delete every attribute of ``item`` that is not listed in ``tag_list``.
+
+    ``item`` is any object with a mapping-like ``attrib``.  Despite its
+    name, ``tag_list`` holds the attribute names to keep.  The attributes
+    are modified in place and nothing is returned.
+    """
+    # Take a snapshot of the names first: deleting entries while iterating
+    # over a live key iterator is not safe.
+    for _attr in list(item.attrib.keys()):
+        if _attr not in tag_list:
+            del item.attrib[_attr]
+
+
+class SyntaxPlugin(BasePlugin):
+    """Strip deprecated tags and non-whitelisted attributes from chapters."""
+
+    NAME = 'Check HTML syntax'
+
+    # Attributes kept in addition to ATTRIBUTES_GLOBAL, per child of <head>.
+    _HEAD_ATTRIBUTES = {
+        'base': ['href', 'target'],
+        'link': ['href', 'crossorigin', 'rel', 'media', 'hreflang', 'type', 'sizes'],
+        'script': ['src', 'type', 'charset', 'async', 'defer', 'crossorigin'],
+        'source': ['src', 'type', 'media'],
+        'style': ['media', 'type', 'scoped'],
+    }
+
+    # Attributes kept in addition to ATTRIBUTES_GLOBAL, per element below
+    # <body>.  Elements that are not listed keep only the global attributes.
+    _BODY_ATTRIBUTES = {
+        'a': ['href', 'target', 'download', 'rel', 'hreflang', 'type'],
+        'area': ['alt', 'coords', 'shape', 'href', 'target', 'download', 'rel', 'hreflang', 'type'],
+        'audio': ['src', 'crossorigin', 'preload', 'autoplay', 'mediagroup', 'loop', 'muted', 'controls'],
+        'blockquote': ['cite'],
+        'button': ['autofocus', 'disabled', 'form', 'formaction', 'formenctype', 'formmethod',
+                   'formnovalidate', 'formtarget', 'name', 'type', 'value', 'menu'],
+        'canvas': ['width', 'height'],
+        'col': ['span'],
+        'colgroup': ['span'],
+        'del': ['cite', 'datetime'],
+        'details': ['open'],
+        'embed': ['src', 'type', 'width', 'height'],
+        'fieldset': ['disabled', 'form', 'name'],
+        'form': ['accept-charset', 'action', 'autocomplete', 'enctype', 'method', 'name',
+                 'novalidate', 'target'],
+        'iframe': ['src', 'srcdoc', 'name', 'sandbox', 'seamless', 'allowfullscreen', 'width', 'height'],
+        'img': ['alt', 'src', 'crossorigin', 'usemap', 'ismap', 'width', 'height'],
+        'input': ['accept', 'alt', 'autocomplete', 'autofocus', 'checked', 'dirname',
+                  'disabled', 'form', 'formaction', 'formenctype', 'formmethod', 'formnovalidate',
+                  'formtarget', 'height', 'inputmode', 'list', 'max', 'maxlength', 'min', 'multiple',
+                  'name', 'pattern', 'placeholder', 'readonly', 'required', 'size', 'src', 'step',
+                  'type', 'value', 'width'],
+        'ins': ['cite', 'datetime'],
+        'keygen': ['autofocus', 'challenge', 'disabled', 'form', 'keytype', 'name'],
+        'label': ['form', 'for'],
+        'map': ['name'],
+        'menu': ['type', 'label'],
+        'object': ['data', 'type', 'typemustmatch', 'name', 'usemap', 'form', 'width', 'height'],
+        'ol': ['reversed', 'start', 'type'],
+        'optgroup': ['disabled', 'label'],
+        'option': ['disabled', 'label', 'selected', 'value'],
+        'output': ['for', 'form', 'name'],
+        'param': ['name', 'value'],
+        'progress': ['value', 'max'],
+        'q': ['cite'],
+        'select': ['autofocus', 'disabled', 'form', 'multiple', 'name', 'required', 'size'],
+        'table': ['border', 'sortable'],
+        'td': ['colspan', 'rowspan', 'headers'],
+        'textarea': ['autocomplete', 'autofocus', 'cols', 'dirname', 'disabled', 'form',
+                     'inputmode', 'maxlength', 'name', 'placeholder', 'readonly', 'required',
+                     'rows', 'wrap'],
+        'th': ['colspan', 'rowspan', 'headers', 'scope', 'abbr', 'sorted'],
+        'time': ['datetime'],
+        'track': ['kind', 'src', 'srclang', 'label', 'default'],
+        'video': ['src', 'crossorigin', 'poster', 'preload', 'autoplay', 'mediagroup',
+                  'loop', 'muted', 'controls', 'width', 'height'],
+    }
+
+    def html_before_write(self, book, chapter):
+        """Clean up ``chapter.content`` and store the result back on it.
+
+        Tags listed in DEPRECATED_TAGS are stripped (their content is kept),
+        ``<meta>`` and empty ``<title>`` elements are removed from the head,
+        and on every other element only the attributes allowed for its tag
+        are kept.  A table ``summary`` attribute is turned into a
+        ``<caption>``; chapters with remote images or inline SVG get the
+        ``remote-resources`` / ``svg`` property.
+
+        Returns the serialised content, or None when the content cannot be
+        parsed (in which case it is left untouched).
+        """
+        from lxml import etree
+
+        try:
+            tree = parse_html_string(chapter.content)
+        except Exception:
+            # Best effort: leave content we cannot parse as it is.
+            return
+
+        root = tree.getroottree()
+
+        # delete deprecated tags
+        # i should really have a list of allowed tags
+        for tag in DEPRECATED_TAGS:
+            etree.strip_tags(root, tag)
+
+        head = tree.find('head')
+
+        if head is not None and len(head) != 0:
+            # Iterate over a snapshot: children are removed inside the loop
+            # and removing from a live iteration skips the next sibling.
+            for _item in list(head):
+                if _item.tag == 'title':
+                    # lxml reports the text of an empty element as None.
+                    if not _item.text:
+                        head.remove(_item)
+                elif _item.tag == 'meta':
+                    # just remove for now, but really should not be like this
+                    head.remove(_item)
+                else:
+                    leave_only(_item, ATTRIBUTES_GLOBAL + self._HEAD_ATTRIBUTES.get(_item.tag, []))
+
+        body = tree.find('body')
+
+        # find() returns None for a document without <body>; calling len()
+        # on that would raise TypeError.
+        if body is not None and len(body) != 0:
+            for _item in body.iter():
+                _tag = _item.tag
+
+                if _tag == 'dl':
+                    # Left untouched for now.
+                    # http://html5doctor.com/the-dl-element/
+                    # should be like this really
+                    # some of the elements can be missing
+                    # dl
+                    #   dt
+                    #   dd
+                    #   dt
+                    #   dd
+                    continue
+
+                if _tag == 'svg':
+                    # We need to add property "svg" in case we have embedded svg file
+                    if 'svg' not in chapter.properties:
+                        chapter.properties.append('svg')
+
+                    if _item.get('viewbox', None):
+                        del _item.attrib['viewbox']
+
+                    if _item.get('preserveaspectratio', None):
+                        del _item.attrib['preserveaspectratio']
+
+                    # No attribute filtering for svg elements.
+                    continue
+
+                if _tag == 'img':
+                    _src = _item.get('src', '').lower()
+                    if _src.startswith(('http://', 'https://')):
+                        if 'remote-resources' not in chapter.properties:
+                            chapter.properties.append('remote-resources')
+                            # THIS DOES NOT WORK, ONLY VIDEO AND AUDIO FILES CAN BE REMOTE RESOURCES
+                            # THAT MEANS I SHOULD ALSO CATCH <SOURCE TAG
+                            from ebooklib import epub
+                            _img = epub.EpubImage(file_name=_item.get('src'))
+                            book.add_item(_img)
+                elif _tag == 'table':
+                    if _item.get('border', None):
+                        if _item.get('border') == '0':
+                            _item.set('border', '')
+
+                    if _item.get('summary', None):
+                        # add it as caption
+                        _caption = etree.Element('caption', {})
+                        _caption.text = _item.get('summary')
+                        _item.insert(0, _caption)
+
+                        del _item.attrib['summary']
+
+                leave_only(_item, ATTRIBUTES_GLOBAL + self._BODY_ATTRIBUTES.get(_tag, []))
+
+        chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)
+
+        return chapter.content
--- a/ebooklib/plugins/tidyhtml.py
+++ b/ebooklib/plugins/tidyhtml.py
@@ -0,0 +1,82 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib.  If not, see <http://www.gnu.org/licenses/>.
+
+import six
+import subprocess
+
+from ebooklib.plugins.base import BasePlugin
+from ebooklib.utils import parse_html_string
+
+# Recommend usage of
+# - https://github.com/w3c/tidy-html5
+
+def tidy_cleanup(content, **extra):
+    """Run ``content`` through the external ``tidy`` program.
+
+    Every keyword argument becomes a command line option: a truthy value is
+    passed as ``--name value``, a falsy one as the bare flag ``-name``.
+    Text input is encoded as UTF-8 before it is sent to the process.
+
+    Returns a ``(code, output)`` tuple where ``output`` is what tidy wrote
+    to standard output and ``code`` is:
+
+    0 - all ok
+    1 - there were warnings
+    2 - there were errors
+    3 - tidy could not be started (``output`` is None)
+    """
+    cmd = []
+
+    for k, v in extra.items():
+
+        if v:
+            cmd.append('--%s' % k)
+            cmd.append(v)
+        else:
+            cmd.append('-%s' % k)
+
+    # The pipe carries bytes, so text has to be encoded first.
+    if not isinstance(content, bytes):
+        content = content.encode('utf-8')
+
+    # must parse all other extra arguments
+    try:
+        p = subprocess.Popen(['tidy']+cmd, shell=False,
+                             stdin=subprocess.PIPE, stdout=subprocess.PIPE,
+                             stderr=subprocess.PIPE, close_fds=True)
+    except OSError:
+        return (3, None)
+
+    # communicate() feeds stdin and drains stdout/stderr at the same time;
+    # writing everything to stdin first can deadlock once a pipe buffer
+    # fills up.
+    (cont, p_err) = p.communicate(content)
+
+    return (p.returncode, cont)
+
+
+class TidyPlugin(BasePlugin):
+    """Clean up chapter HTML with the external ``tidy`` program."""
+
+    NAME = 'Tidy HTML'
+    OPTIONS = {'char-encoding': 'utf8',
+               'tidy-mark': 'no'
+              }
+
+    def __init__(self, extra=None):
+        """Combine OPTIONS with ``extra``; entries in ``extra`` win."""
+        self.options = dict(self.OPTIONS)
+        if extra:
+            self.options.update(extra)
+
+    def _tidy(self, chapter):
+        """Tidy ``chapter.content`` in place and return the content.
+
+        Returns None when the chapter has no content.  When tidy produces
+        no output (it is not installed, or it gave up on the input) the
+        original content is kept instead of being overwritten.
+        """
+        if not chapter.content:
+            return None
+
+        (_, cleaned) = tidy_cleanup(chapter.content, **self.options)
+
+        if cleaned:
+            chapter.content = cleaned
+
+        return chapter.content
+
+    def html_before_write(self, book, chapter):
+        """Tidy the chapter before it is written."""
+        return self._tidy(chapter)
+
+    def html_after_read(self, book, chapter):
+        """Tidy the chapter after it has been read."""
+        return self._tidy(chapter)
+
--- a/ebooklib/utils.py
+++ b/ebooklib/utils.py
@@ -0,0 +1,60 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib.  If not, see <http://www.gnu.org/licenses/>.
+
+import io
+import mimetypes
+
+from lxml import etree
+
+
+# Set to True by guess_type() once it has initialised the mimetypes module
+# and registered the '.xhtml' type, so that this is done only once.
+mimetype_initialised = False
+
+
+def debug(obj):
+    """Pretty-print ``obj`` to standard output using an indent of 4."""
+    from pprint import PrettyPrinter
+
+    PrettyPrinter(indent=4).pprint(obj)
+
+
+def parse_string(s):
+    """Parse the XML document held in ``s`` and return the parsed tree.
+
+    ``s`` is encoded as UTF-8 when possible; input that cannot be encoded
+    (for example a byte string) is handed to the parser unchanged.  Errors
+    raised by the parser itself are propagated to the caller.
+    """
+    # Only the encoding step is guarded, so that a genuine XML syntax error
+    # is not hidden behind a failing second parse attempt.
+    try:
+        data = s.encode('utf-8')
+    except (AttributeError, UnicodeError):
+        data = s
+
+    return etree.parse(io.BytesIO(data))
+
+
+def parse_html_string(s):
+    """Parse ``s`` as an HTML document with a UTF-8 parser.
+
+    Returns whatever ``lxml.html.document_fromstring`` returns for it.
+    """
+    from lxml.html import HTMLParser, document_fromstring
+
+    return document_fromstring(s, parser=HTMLParser(encoding='utf-8'))
+
+
+def guess_type(extenstion):
+    """Return ``mimetypes.guess_type(extenstion)``.
+
+    On the first call the ``mimetypes`` module is initialised and
+    ``.xhtml`` is registered as ``application/xhtml+xml``.
+    """
+    global mimetype_initialised
+
+    if mimetype_initialised:
+        return mimetypes.guess_type(extenstion)
+
+    # First call: set up the type database before answering.
+    mimetypes.init()
+    mimetypes.add_type('application/xhtml+xml', '.xhtml')
+    mimetype_initialised = True
+
+    return mimetypes.guess_type(extenstion)