From f3d748f5ddba90ec0d07756b99cf7dd0b3858bd1 Mon Sep 17 00:00:00 2001
From: BasioMeusPuga <disgruntled.mob@gmail.com>
Date: Mon, 6 Nov 2017 07:42:48 +0530
Subject: [PATCH] Incorporate ebooklib from
 https://github.com/aerkalov/ebooklib

---
 ebooklib/__init__.py           |   42 +
 ebooklib/epub.py               | 1595 ++++++++++++++++++++++++++++++++
 ebooklib/plugins/__init__.py   |    0
 ebooklib/plugins/base.py       |   49 +
 ebooklib/plugins/booktype.py   |  119 +++
 ebooklib/plugins/sourcecode.py |   68 ++
 ebooklib/plugins/standard.py   |  230 +++++
 ebooklib/plugins/tidyhtml.py   |   82 ++
 ebooklib/utils.py              |   60 ++
 9 files changed, 2245 insertions(+)
 create mode 100644 ebooklib/__init__.py
 create mode 100644 ebooklib/epub.py
 create mode 100644 ebooklib/plugins/__init__.py
 create mode 100644 ebooklib/plugins/base.py
 create mode 100644 ebooklib/plugins/booktype.py
 create mode 100644 ebooklib/plugins/sourcecode.py
 create mode 100644 ebooklib/plugins/standard.py
 create mode 100644 ebooklib/plugins/tidyhtml.py
 create mode 100644 ebooklib/utils.py

diff --git a/ebooklib/__init__.py b/ebooklib/__init__.py
new file mode 100644
index 0000000..020d2f5
--- /dev/null
+++ b/ebooklib/__init__.py
@@ -0,0 +1,42 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib.  If not, see <http://www.gnu.org/licenses/>.
+
+# Version of the EbookLib package, kept as a tuple of integers
+
+VERSION = (0, 16, 0)
+
+# Numeric codes for the possible item types
+ITEM_UNKNOWN = 0
+ITEM_IMAGE = 1
+ITEM_STYLE = 2
+ITEM_SCRIPT = 3
+ITEM_NAVIGATION = 4
+ITEM_VECTOR = 5
+ITEM_FONT = 6
+ITEM_VIDEO = 7
+ITEM_AUDIO = 8
+ITEM_DOCUMENT = 9
+
+# Maps an item type code to the list of file extensions recognised for it
+EXTENSIONS = {ITEM_IMAGE: ['.jpg', '.jpeg', '.gif', '.tiff', '.tif', '.png'],
+              ITEM_STYLE: ['.css'],
+              ITEM_VECTOR: ['.svg'],
+              ITEM_FONT: ['.otf', '.woff', '.ttf'],
+              ITEM_SCRIPT: ['.js'],
+              ITEM_NAVIGATION: ['.ncx'],
+              ITEM_VIDEO: ['.mov', '.mp4', '.avi'],
+              ITEM_AUDIO: ['.mp3', '.ogg']
+              }
diff --git a/ebooklib/epub.py b/ebooklib/epub.py
new file mode 100644
index 0000000..fd3a0ff
--- /dev/null
+++ b/ebooklib/epub.py
@@ -0,0 +1,1595 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib.  If not, see <http://www.gnu.org/licenses/>.
+
+import zipfile
+import six
+import logging
+import uuid
+import posixpath as zip_path
+import os.path
+from collections import OrderedDict
+
+try:
+    from urllib.parse import unquote
+except ImportError:
+    from urllib import unquote
+
+from lxml import etree
+
+import ebooklib
+
+from ebooklib.utils import parse_string, parse_html_string, guess_type
+
+
+# Version of the EPUB module. NOTE(review): ebooklib/__init__.py says (0, 16, 0) - confirm which one is current.
+VERSION = (0, 15, 0)
+
+NAMESPACES = {'XML': 'http://www.w3.org/XML/1998/namespace',
+              'EPUB': 'http://www.idpf.org/2007/ops',
+              'DAISY': 'http://www.daisy.org/z3986/2005/ncx/',
+              'OPF': 'http://www.idpf.org/2007/opf',
+              'CONTAINERNS': 'urn:oasis:names:tc:opendocument:xmlns:container',
+              'DC': 'http://purl.org/dc/elements/1.1/',
+              'XHTML': 'http://www.w3.org/1999/xhtml'}
+
+# XML templates and the path under which the container document is written
+
+CONTAINER_PATH = 'META-INF/container.xml'
+# Container document; %(folder_name)s is filled in with the book's FOLDER_NAME
+CONTAINER_XML = '''<?xml version='1.0' encoding='utf-8'?>
+<container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
+  <rootfiles>
+    <rootfile media-type="application/oebps-package+xml" full-path="%(folder_name)s/content.opf"/>
+  </rootfiles>
+</container>
+'''
+# Default 'ncx', 'nav', 'chapter' and 'cover' templates, wrapped in six.b()
+NCX_XML = six.b('''<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
+<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" />''')
+
+NAV_XML = six.b('''<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"/>''')
+
+CHAPTER_XML = six.b('''<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"  epub:prefix="z3998: http://www.daisy.org/z3998/2012/vocab/structure/#"></html>''')
+
+COVER_XML = six.b('''<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="en" xml:lang="en">
+ <head>
+  <style>
+    body { margin: 0em; padding: 0em; }
+    img { max-width: 100%; max-height: 100%; }
+  </style>
+ </head>
+ <body>
+   <img src="" alt="" />
+ </body>
+</html>''')
+
+# List of image media types
+IMAGE_MEDIA_TYPES = ['image/jpeg', 'image/jpg', 'image/png', 'image/svg+xml']
+
+
+# TOC elements
+
+class Section(object):
+    """TOC element that stores a section title and an optional href."""
+    def __init__(self, title, href=''):
+        self.title = title
+        self.href = href
+
+
+class Link(object):
+    """TOC element that stores an href, a title and an optional uid."""
+    def __init__(self, href, title, uid=None):
+        self.href = href
+        self.title = title
+        self.uid = uid
+
+# Exceptions
+
+
+class EpubException(Exception):
+    """Exception carrying an error code and a message; str() gives repr(msg)."""
+    def __init__(self, code, msg):
+        self.code = code
+        self.msg = msg
+
+    def __str__(self):
+        return repr(self.msg)
+
+# Items
+
+
+class EpubItem(object):
+
+    """
+    Common base class for every item stored in a book.
+    """
+
+    def __init__(self, uid=None, file_name='', media_type='', content=six.b(''), manifest=True):
+        """
+        :Args:
+          - uid: Unique identifier of the item (optional)
+          - file_name: File name of the item (optional)
+          - media_type: Media type of the item (optional)
+          - content: Content of the item (optional)
+          - manifest: Kept on the item as its ``manifest`` attribute (optional)
+        """
+        self.book = None
+        self.is_linear = True
+
+        self.id = uid
+        self.file_name = file_name
+        self.media_type = media_type
+        self.content = content
+        self.manifest = manifest
+
+    def get_id(self):
+        """
+        Gives back the identifier currently stored on this item.
+
+        :Returns:
+          Value of the ``id`` attribute.
+        """
+        return self.id
+
+    def get_name(self):
+        """
+        Gives back the name of this item, which for this class is its file name.
+
+        :Returns:
+          Value of the ``file_name`` attribute.
+        """
+        return self.file_name
+
+    def get_type(self):
+        """
+        Derives the item type from the extension of the item name.
+
+        Possible item types are:
+          - ITEM_UNKNOWN = 0
+          - ITEM_IMAGE = 1
+          - ITEM_STYLE = 2
+          - ITEM_SCRIPT = 3
+          - ITEM_NAVIGATION = 4
+          - ITEM_VECTOR = 5
+          - ITEM_FONT = 6
+          - ITEM_VIDEO = 7
+          - ITEM_AUDIO = 8
+          - ITEM_DOCUMENT = 9
+
+        The lower-cased extension is looked up in ebooklib.EXTENSIONS.
+
+        :Returns:
+          Type of the item as number, ITEM_UNKNOWN if the extension is not listed.
+        """
+        suffix = zip_path.splitext(self.get_name())[1].lower()
+
+        # lazily walk the extension table; the first type listing the suffix wins
+        matches = (type_id
+                   for type_id, known in six.iteritems(ebooklib.EXTENSIONS)
+                   if suffix in known)
+
+        return next(matches, ebooklib.ITEM_UNKNOWN)
+
+    def get_content(self, default=six.b('')):
+        """
+        Gives back the item content, expected as 'str' (Python 2) or 'bytes' (Python 3).
+
+        :Args:
+          - default: Value handed back when the stored content is empty or None.
+
+        :Returns:
+          Stored content, or ``default`` when the stored content is falsy.
+        """
+        return self.content if self.content else default
+
+    def set_content(self, content):
+        """
+        Replaces the content stored on this item.
+
+        :Args:
+          - content: New content value
+        """
+        self.content = content
+
+    def __str__(self):
+        return '<EpubItem:%s>' % self.id
+
+
+class EpubNcx(EpubItem):
+
+    "Item holding the Navigation Control File (NCX) of the EPUB."
+
+    def __init__(self, uid='ncx', file_name='toc.ncx'):
+        super(EpubNcx, self).__init__(uid, file_name, 'application/x-dtbncx+xml')
+
+    def __str__(self):
+        return '<EpubNcx:%s>' % self.id
+
+
+class EpubCover(EpubItem):
+
+    """
+    Item holding the cover image of the EPUB file.
+    """
+
+    def __init__(self, uid='cover-img', file_name=''):
+        super(EpubCover, self).__init__(uid, file_name)
+
+    def __str__(self):
+        return '<EpubCover:%s:%s>' % (self.id, self.file_name)
+
+
+class EpubHtml(EpubItem):
+
+    """
+    Represents HTML document in the EPUB file.
+    """
+    _template_name = 'chapter'
+
+    def __init__(self, uid=None, file_name='', media_type='', content=None, title='', lang=None, direction=None):
+        super(EpubHtml, self).__init__(uid, file_name, media_type, content)
+
+        self.title = title
+        self.lang = lang
+        self.direction = direction
+
+        self.links = []
+        self.properties = []
+
+    def is_chapter(self):
+        """
+        Returns if this document is chapter or not.
+
+        :Returns:
+          Always True for this class.
+        """
+        return True
+
+    def get_type(self):
+        """
+        Always returns ebooklib.ITEM_DOCUMENT as type of this document.
+
+        :Returns:
+          Always returns ebooklib.ITEM_DOCUMENT
+        """
+
+        return ebooklib.ITEM_DOCUMENT
+
+    def set_language(self, lang):
+        """
+        Sets language for this book item. When none is set, get_content() falls back to
+        the language of the book.
+        """
+        self.lang = lang
+
+    def get_language(self):
+        """
+        Get language code for this book item. Language of the book item can be different from
+        the language settings defined globally for book.
+
+        :Returns:
+          Language code set for this item, or None if none was set.
+        """
+        return self.lang
+
+    def add_link(self, **kwgs):
+        """
+        Add additional link to the document. Links will be embedded only inside of this document.
+
+        >>> add_link(href='styles.css', rel='stylesheet', type='text/css')
+        """
+        self.links.append(kwgs)
+
+    def get_links(self):
+        """
+        Returns additional links defined for this document.
+
+        :Returns:
+          Generator yielding the links in the order they were added.
+        """
+        return (link for link in self.links)
+
+    def get_links_of_type(self, link_type):
+        """
+        Returns additional links of specific type.
+
+        :Returns:
+          Generator yielding the links whose 'type' equals link_type.
+        """
+        return (link for link in self.links if link.get('type', '') == link_type)
+
+    def add_item(self, item):
+        """
+        Add other item to this document. A link is created for style and script items only.
+
+        :Args:
+          - item: item we want to add defined as instance of EpubItem
+        """
+        if item.get_type() == ebooklib.ITEM_STYLE:
+            self.add_link(href=item.get_name(), rel='stylesheet', type='text/css')
+
+        if item.get_type() == ebooklib.ITEM_SCRIPT:
+            self.add_link(src=item.get_name(), type='text/javascript')
+
+    def get_body_content(self):
+        """
+        Returns content of the BODY element for this HTML document. On success the content
+        is of type 'str' (Python 2) or 'bytes' (Python 3).
+
+        :Returns:
+          Serialised BODY content, or '' if parsing fails or BODY is missing or has no children.
+        """
+
+        try:
+            html_tree = parse_html_string(self.content)
+        except Exception:
+            return ''
+
+        body = html_tree.find('body')
+
+        # find() yields None when there is no BODY, and calling len() on None would
+        # raise TypeError; len() of an element counts its child elements
+        if body is None or len(body) == 0:
+            return ''
+
+        tree_str = etree.tostring(body, pretty_print=True, encoding='utf-8', xml_declaration=False)
+
+        # drop the bare <body> wrapper; a BODY tag carrying attributes is left in place
+        if tree_str.startswith(six.b('<body>')):
+            n = tree_str.rindex(six.b('</body>'))
+
+            return tree_str[6:n]
+
+        return tree_str
+
+    def get_content(self, default=None):
+        """
+        Returns content for this document as HTML string. On success the content is of type
+        'str' (Python 2) or 'bytes' (Python 3).
+
+        :Args:
+          - default: Accepted for compatibility with EpubItem.get_content; not used here.
+
+        :Returns:
+          Serialised document, or '' if the stored content cannot be parsed.
+        """
+
+        tree = parse_string(self.book.get_template(self._template_name))
+        tree_root = tree.getroot()
+
+        tree_root.set('lang', self.lang or self.book.language)
+        tree_root.attrib['{%s}lang' % NAMESPACES['XML']] = self.lang or self.book.language
+
+        # add to the head also
+        #  <meta charset="utf-8" />
+
+        try:
+            html_tree = parse_html_string(self.content)
+        except Exception:
+            return ''
+
+        # NOTE: only BODY children are reused below; HEAD elements of the content are dropped
+
+        # create and populate head
+
+        _head = etree.SubElement(tree_root, 'head')
+
+        if self.title != '':
+            _title = etree.SubElement(_head, 'title')
+            _title.text = self.title
+
+        for lnk in self.links:
+            if lnk.get('type') == 'text/javascript':
+                _lnk = etree.SubElement(_head, 'script', lnk)
+                # force <script></script>
+                _lnk.text = ''
+            else:
+                etree.SubElement(_head, 'link', lnk)
+
+        # disabled: copying the children of the content's HEAD
+        # head = html_tree.find('head')
+        # if head is not None:
+        #     for i in list(head):
+        #         if i.tag == 'title' and self.title != '':
+        #             continue
+        #         _head.append(i)
+
+        # create and populate body
+
+        _body = etree.SubElement(tree_root, 'body')
+        if self.direction:
+            _body.set('dir', self.direction)
+
+        body = html_tree.find('body')
+        if body is not None:
+            for child in list(body):  # snapshot first: append() moves the child out of body
+                _body.append(child)
+
+        tree_str = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)
+
+        return tree_str
+
+    def __str__(self):
+        return '<EpubHtml:%s:%s>' % (self.id, self.file_name)
+
+
+class EpubCoverHtml(EpubHtml):
+
+    """
+    Represents Cover page in the EPUB file.
+    """
+
+    def __init__(self, uid='cover', file_name='cover.xhtml', image_name='', title='Cover'):
+        super(EpubCoverHtml, self).__init__(uid=uid, file_name=file_name, title=title)
+
+        self.image_name = image_name
+        self.is_linear = False
+
+    def is_chapter(self):
+        """
+        Returns if this document is chapter or not.
+
+        :Returns:
+          Always False for the cover page.
+        """
+
+        return False
+
+    def get_content(self, default=None):
+        """
+        Returns content for cover page as HTML string. Content will be of type 'str' (Python 2) or 'bytes' (Python 3).
+
+        :Args:
+          - default: Not used; accepted so the call signature matches EpubHtml.get_content.
+        :Returns:
+          Cover document built from the book's 'cover' template (stored content is overwritten).
+        """
+
+        self.content = self.book.get_template('cover')
+
+        tree = parse_string(super(EpubCoverHtml, self).get_content())
+        tree_root = tree.getroot()
+
+        images = tree_root.xpath('//xhtml:img', namespaces={'xhtml': NAMESPACES['XHTML']})
+        if images:  # a custom 'cover' template may contain no <img> element
+            images[0].set('src', self.image_name)
+            images[0].set('alt', self.title)
+
+        return etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)
+
+    def __str__(self):
+        return '<EpubCoverHtml:%s:%s>' % (self.id, self.file_name)
+
+
+class EpubNav(EpubHtml):
+
+    """
+    Item holding the Navigation Document of the EPUB file.
+    """
+
+    def __init__(self, uid='nav', file_name='nav.xhtml', media_type='application/xhtml+xml'):
+        super(EpubNav, self).__init__(uid, file_name, media_type)
+
+    def is_chapter(self):
+        """
+        Tells whether this document counts as a chapter.
+
+        :Returns:
+          Always False for the navigation document.
+        """
+
+        return False
+
+    def __str__(self):
+        return '<EpubNav:%s:%s>' % (self.id, self.file_name)
+
+
+class EpubImage(EpubItem):
+
+    """
+    Represents Image in the EPUB file. Accepts the same arguments as EpubItem.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super(EpubImage, self).__init__(*args, **kwargs)
+
+    def get_type(self):
+        return ebooklib.ITEM_IMAGE
+
+    def __str__(self):
+        return '<EpubImage:%s:%s>' % (self.id, self.file_name)
+
+
+# EpubBook
+
+class EpubBook(object):
+    """In-memory book: metadata, items, spine, guide, TOC, bindings and templates."""
+    def __init__(self):
+        self.EPUB_VERSION = None
+
+        self.reset()
+
+        # we should have options here
+
+    def reset(self):
+        "Initialises all needed variables to default values"
+
+        self.metadata = {}
+        self.items = []
+        self.spine = []
+        self.guide = []
+        self.toc = []
+        self.bindings = []
+
+        self.IDENTIFIER_ID = 'id'
+        self.FOLDER_NAME = 'EPUB'
+        # counters used by add_item() to generate ids for items added without one
+        self._id_html = 0
+        self._id_image = 0
+        self._id_static = 0
+
+        self.title = ''
+        self.language = 'en'
+        self.direction = None
+
+        self.templates = {
+            'ncx': NCX_XML,
+            'nav': NAV_XML,
+            'chapter': CHAPTER_XML,
+            'cover': COVER_XML
+        }
+
+        self.add_metadata('OPF', 'generator', '', {
+            'name': 'generator', 'content': 'Ebook-lib %s' % '.'.join([str(s) for s in VERSION])
+        })
+
+        # default to using a randomly-unique identifier if one is not specified manually
+        self.set_identifier(str(uuid.uuid4()))
+
+        # custom prefixes and namespaces to be set to the content.opf doc
+        self.prefixes = []
+        self.namespaces = {}
+
+    def set_identifier(self, uid):
+        """
+        Sets unique id for this epub; replaces any DC identifier set before.
+
+        :Args:
+          - uid: Value of unique identifier for this book
+        """
+
+        self.uid = uid
+
+        self.set_unique_metadata('DC', 'identifier', self.uid, {'id': self.IDENTIFIER_ID})
+
+    def set_title(self, title):
+        """
+        Set title. Every call adds one more DC title entry; ``title`` keeps the last one.
+
+        :Args:
+          - title: Title value
+        """
+
+        self.title = title
+
+        self.add_metadata('DC', 'title', self.title)
+
+    def set_language(self, lang):
+        """
+        Set language for this epub. Every call adds one more DC language entry. Specific items
+        in the book can have different language settings.
+
+        :Args:
+          - lang: Language code
+        """
+
+        self.language = lang
+
+        self.add_metadata('DC', 'language', lang)
+
+    def set_direction(self, direction):
+        """
+        :Args:
+          - direction: Options are "ltr", "rtl" and "default"
+        """
+
+        self.direction = direction
+
+    def set_cover(self, file_name, content, create_page=True):
+        """
+        Set cover and create cover document if needed.
+
+        :Args:
+          - file_name: file name of the cover image
+          - content: Content for the cover image
+          - create_page: Should cover page be defined. Defined as bool value (optional). Default value is True.
+        """
+
+        # as it is now, it can only be called once
+        c0 = EpubCover(file_name=file_name)
+        c0.content = content
+        self.add_item(c0)
+
+        if create_page:
+            c1 = EpubCoverHtml(image_name=file_name)
+            self.add_item(c1)
+
+        self.add_metadata(None, 'meta', '', OrderedDict([('name', 'cover'), ('content', 'cover-img')]))
+
+    def add_author(self, author, file_as=None, role=None, uid='creator'):
+        "Add author for this document, plus optional 'file-as' and 'role' refinements"
+
+        self.add_metadata('DC', 'creator', author, {'id': uid})
+
+        if file_as:
+            self.add_metadata(None, 'meta', file_as, {'refines': '#' + uid,
+                                                      'property': 'file-as',
+                                                      'scheme': 'marc:relators'})
+        if role:
+            self.add_metadata(None, 'meta', role, {'refines': '#' + uid,
+                                                   'property': 'role',
+                                                   'scheme': 'marc:relators'})
+
+    def add_metadata(self, namespace, name, value, others=None):
+        "Add metadata as a (value, others) entry; a key of NAMESPACES is replaced by its URI"
+
+        if namespace in NAMESPACES:
+            namespace = NAMESPACES[namespace]
+
+        if namespace not in self.metadata:
+            self.metadata[namespace] = {}
+
+        if name not in self.metadata[namespace]:
+            self.metadata[namespace][name] = []
+
+        self.metadata[namespace][name].append((value, others))
+
+    def get_metadata(self, namespace, name):
+        "Retrieve the list of (value, others) entries; raises KeyError when nothing is stored"
+
+        if namespace in NAMESPACES:
+            namespace = NAMESPACES[namespace]
+
+        return self.metadata[namespace][name]
+
+    def set_unique_metadata(self, namespace, name, value, others=None):
+        "Add metadata if metadata with this identifier does not already exist, otherwise update existing metadata."
+
+        if namespace in NAMESPACES:
+            namespace = NAMESPACES[namespace]
+
+        if namespace in self.metadata and name in self.metadata[namespace]:
+            self.metadata[namespace][name] = [(value, others)]
+        else:
+            self.add_metadata(namespace, name, value, others)
+
+    def add_item(self, item):
+        """
+        Add additional item to the book and return it. A missing media type is guessed
+        from the item name and a missing id is generated (chapter_N, image_N or static_N).
+
+        :Args:
+          - item: Item instance
+        """
+        if item.media_type == '':
+            (has_guessed, media_type) = guess_type(item.get_name().lower())
+            # NOTE(review): if guess_type() mirrors mimetypes.guess_type, 2nd value is an encoding - confirm
+            if has_guessed:
+                if media_type is not None:
+                    item.media_type = media_type
+                else:
+                    item.media_type = has_guessed
+            else:
+                item.media_type = 'application/octet-stream'
+
+        if not item.get_id():
+            # make chapter_, image_ and static_ configurable
+            if isinstance(item, EpubHtml):
+                item.id = 'chapter_%d' % self._id_html
+                self._id_html += 1
+            elif isinstance(item, EpubImage):
+                item.id = 'image_%d' % self._id_image
+                self._id_image += 1
+            else:
+                item.id = 'static_%d' % self._id_static
+                self._id_static += 1
+
+        item.book = self
+        self.items.append(item)
+
+        return item
+
+    def get_item_with_id(self, uid):
+        """
+        Returns item for defined UID.
+
+        >>> book.get_item_with_id('image_001')
+
+        :Args:
+          - uid: UID for the item
+
+        :Returns:
+          Returns item object. Returns None if nothing was found.
+        """
+        for item in self.get_items():
+            if item.id == uid:
+                return item
+
+        return None
+
+    def get_item_with_href(self, href):
+        """
+        Returns item for defined HREF.
+
+        >>> book.get_item_with_href('EPUB/document.xhtml')
+
+        :Args:
+          - href: HREF for the item we are searching for
+
+        :Returns:
+          Returns item object. Returns None if nothing was found.
+        """
+        for item in self.get_items():
+            if item.get_name() == href:
+                return item
+
+        return None
+
+    def get_items(self):
+        """
+        Returns all items attached to this book.
+
+        :Returns:
+          Generator yielding the items in the order they were added.
+        """
+        return (item for item in self.items)
+
+    def get_items_of_type(self, item_type):
+        """
+        Returns all items of specified type.
+
+        >>> book.get_items_of_type(ebooklib.ITEM_IMAGE)
+
+        :Args:
+          - item_type: Type for items we are searching for
+
+        :Returns:
+          Generator yielding the items whose get_type() equals item_type.
+        """
+        return (item for item in self.items if item.get_type() == item_type)
+
+    def get_items_of_media_type(self, media_type):
+        """
+        Returns all items of specified media type.
+
+        :Args:
+          - media_type: Media type for items we are searching for
+
+        :Returns:
+          Generator yielding the items whose media_type equals media_type.
+        """
+        return (item for item in self.items if item.media_type == media_type)
+
+    def set_template(self, name, value):
+        """
+        Defines templates which are used to generate certain types of pages. When defining new value for the template
+        we have to use content of type 'str' (Python 2) or 'bytes' (Python 3).
+
+        At the moment we use these templates:
+          - ncx
+          - nav
+          - chapter
+          - cover
+
+        :Args:
+          - name: Name for the template
+          - value: Content for the template
+        """
+
+        self.templates[name] = value
+
+    def get_template(self, name):
+        """
+        Returns value for the template.
+
+        :Args:
+          - name: template name
+
+        :Returns:
+          Value of the template, or None if no template has that name.
+        """
+        return self.templates.get(name)
+
+    def add_prefix(self, name, uri):
+        """
+        Appends custom prefix to be added to the content.opf document
+
+        >>> epub_book.add_prefix('bkterms', 'http://booktype.org/')
+
+        :Args:
+          - name: namespace name
+          - uri: URI for the namespace
+        """
+
+        self.prefixes.append('%s: %s' % (name, uri))
+
+
+class EpubWriter(object):
+    DEFAULT_OPTIONS = {
+        'epub2_guide': True,
+        'epub3_landmark': True,
+        'landmark_title': 'Guide',
+        'spine_direction': True,
+        'package_direction': False
+    }
+
+    def __init__(self, name, book, options=None):
+        # start from a copy of the class defaults, then layer the caller's overrides on top
+        merged = dict(self.DEFAULT_OPTIONS)
+        merged.update(options or {})
+
+        self.options = merged
+        self.book = book
+        self.file_name = name
+
+    def process(self):
+        # should cache this html parsing so we don't do it for every plugin
+        for plugin in self.options.get('plugins', []):
+            if hasattr(plugin, 'before_write'):
+                plugin.before_write(self.book)
+
+        html_items = (entry for entry in self.book.get_items() if isinstance(entry, EpubHtml))
+        for item in html_items:
+            for plugin in self.options.get('plugins', []):
+                if hasattr(plugin, 'html_before_write'):
+                    plugin.html_before_write(self.book, item)
+
+    def _write_container(self):
+        # fill the book's folder name into the container template, then pass it to self.out
+        self.out.writestr(CONTAINER_PATH, CONTAINER_XML % {'folder_name': self.book.FOLDER_NAME})
+
+    def _write_opf_metadata(self, root):
+        """
+        Append a <metadata> element built from book.metadata to root.
+
+        A 'dcterms:modified' meta is written first; its value is the 'mtime' option
+        or, when that option is absent, the current UTC time.
+        """
+        nsmap = {'dc': NAMESPACES['DC'], 'opf': NAMESPACES['OPF']}
+        nsmap.update(self.book.namespaces)
+
+        metadata = etree.SubElement(root, 'metadata', nsmap=nsmap)
+
+        el = etree.SubElement(metadata, 'meta', {'property': 'dcterms:modified'})
+        if 'mtime' in self.options:
+            mtime = self.options['mtime']
+        else:
+            import datetime
+            # the format below ends in a literal 'Z' (UTC), so local time would be wrong
+            mtime = datetime.datetime.utcnow()
+        el.text = mtime.strftime('%Y-%m-%dT%H:%M:%SZ')
+
+        for ns_name, names in six.iteritems(self.book.metadata):
+            if ns_name == NAMESPACES['OPF']:
+                for entries in names.values():
+                    for v in entries:
+                        # v is (value, others); others is None when no attributes were given
+                        if (v[1] or {}).get('property') == 'dcterms:modified':
+                            continue
+                        try:
+                            el = etree.SubElement(metadata, 'meta', v[1])
+                            if v[0]:
+                                el.text = v[0]
+                        except ValueError:
+                            logging.error('Could not create metadata.')
+            else:
+                for name, entries in six.iteritems(names):
+                    for v in entries:
+                        try:
+                            if ns_name:
+                                el = etree.SubElement(metadata, '{%s}%s' % (ns_name, name), v[1])
+                            else:
+                                el = etree.SubElement(metadata, '%s' % name, v[1])
+
+                            el.text = v[0]
+                        except ValueError:
+                            logging.error('Could not create metadata "{}".'.format(name))
+
+    def _write_opf_manifest(self, root):
+        """
+        Append a <manifest> element to root with one <item> per book item whose
+        ``manifest`` flag is set.
+
+        :Returns:
+          Id of the last EpubNcx item written, or None if there was none.
+        """
+        manifest = etree.SubElement(root, 'manifest')
+        _ncx_id = None
+
+        # mathml, scripted, svg, remote-resources, and switch
+        # nav
+        # cover-image
+
+        for item in self.book.get_items():
+            if not item.manifest:
+                continue
+
+            is_nav = isinstance(item, EpubNav)
+
+            # attributes shared by every <item>; only the nav entry asks get_name()
+            opts = {'href': item.get_name() if is_nav else item.file_name,
+                    'id': item.id,
+                    'media-type': item.media_type}
+
+            # 'properties', when present, is inserted after the three shared keys
+            if is_nav:
+                opts['properties'] = 'nav'
+            elif isinstance(item, EpubNcx):
+                _ncx_id = item.id
+            elif isinstance(item, EpubCover):
+                opts['properties'] = 'cover-image'
+            elif hasattr(item, 'properties') and len(item.properties) > 0:
+                opts['properties'] = ' '.join(item.properties)
+
+            etree.SubElement(manifest, 'item', opts)
+
+        return _ncx_id
+
+    def _write_opf_spine(self, root, ncx_id):
+        """
+        Append a <spine> element to root with one <itemref> per entry of book.spine.
+
+        An entry is an item, an item id, or a tuple whose first element is one of
+        those and whose optional second element 'no' marks it as non-linear.
+
+        :Args:
+          - root: Parent element the spine is appended to
+          - ncx_id: Id used for the 'toc' attribute; 'ncx' is used when it is falsy
+        """
+
+        spine_attributes = {'toc': ncx_id or 'ncx'}
+        if self.book.direction and self.options['spine_direction']:
+            spine_attributes['page-progression-direction'] = self.book.direction
+
+        spine = etree.SubElement(root, 'spine', spine_attributes)
+
+        for _item in self.book.spine:
+            is_linear = True
+
+            if isinstance(_item, tuple):
+                item = _item[0]
+
+                if len(_item) > 1 and _item[1] == 'no':
+                    is_linear = False
+            else:
+                item = _item
+
+            if isinstance(item, EpubItem):
+                # EpubHtml derives from EpubItem, so this branch serves both
+                target = item
+                opts = {'idref': item.get_id()}
+            else:
+                # anything else is used as the id itself; the lookup gives None
+                # when no item of the book carries that id
+                target = self.book.get_item_with_id(item)
+                opts = {'idref': item}
+
+            # an explicit 'no' always applies; otherwise defer to the item's own flag
+            if not is_linear or not getattr(target, 'is_linear', True):
+                opts['linear'] = 'no'
+
+            etree.SubElement(spine, 'itemref', opts)
+
+    def _write_opf_guide(self, root):
+        """Append the EPUB2 <guide> element when the 'epub2_guide' option is set.
+
+        See http://www.idpf.org/epub/20/spec/OPF_2.0.1_draft.htm#Section2.6
+        """
+        if len(self.book.guide) > 0 and self.options.get('epub2_guide'):
+            guide = etree.SubElement(root, 'guide', {})
+
+            for item in self.book.guide:
+                chap = item.get('item')
+                if chap:
+                    _href, _title = chap.file_name, chap.title
+                else:
+                    # no usable 'item': use the entry's own href and title
+                    # rather than values left over from the previous entry
+                    _href, _title = item.get('href', ''), item.get('title', '')
+
+                # lxml rejects None attribute values, so fall back to ''
+                etree.SubElement(guide, 'reference', {'type': item.get('type') or '',
+                                                      'title': _title or '',
+                                                      'href': _href or ''})
+
+    def _write_opf_bindings(self, root):
+        if len(self.book.bindings) > 0:  # no <bindings> element for an empty list
+            bindings = etree.SubElement(root, 'bindings', {})
+            for item in self.book.bindings:
+                etree.SubElement(bindings, 'mediaType', item)  # item is passed as the attribute mapping
+
+    def _write_opf_file(self, root):
+        """Serialize ``root`` as UTF-8 XML and store it as content.opf in the archive."""
+        opf_bytes = etree.tostring(root, pretty_print=True, encoding='utf-8', xml_declaration=True)
+        self.out.writestr('%s/content.opf' % self.book.FOLDER_NAME, opf_bytes)
+
+    def _write_opf(self):
+        """Build the <package> tree and write it to the archive as content.opf.
+
+        The children are added in a fixed order: metadata, manifest, spine,
+        guide, bindings.  The id of the NCX item found while writing the
+        manifest is handed on to the spine writer.
+        """
+        attrs = {'xmlns': NAMESPACES['OPF'],
+                 'unique-identifier': self.book.IDENTIFIER_ID,
+                 'version': '3.0'}
+
+        # the text direction is only emitted when the writer option allows it
+        if self.book.direction and self.options['package_direction']:
+            attrs['dir'] = self.book.direction
+
+        package = etree.Element('package', attrs)
+
+        # the rendition vocabulary is always declared, ahead of the book's own
+        package.attrib['prefix'] = ' '.join(
+            ['rendition: http://www.idpf.org/vocab/rendition/#'] + self.book.prefixes)
+
+        self._write_opf_metadata(package)
+        toc_id = self._write_opf_manifest(package)
+        self._write_opf_spine(package, toc_id)
+        self._write_opf_guide(package)
+        self._write_opf_bindings(package)
+
+        # every section is in place, serialize the tree
+        self._write_opf_file(package)
+
+    def _get_nav(self, item):
+        """Return the serialized EPUB3 navigation document for ``item``.
+
+        Contains the table of contents built from ``book.toc`` plus, when the
+        'epub3_landmark' option is set, landmarks built from ``book.guide``.
+        """
+        nav_xml = parse_string(self.book.get_template('nav'))
+        root = nav_xml.getroot()
+
+        root.set('lang', self.book.language)
+        root.attrib['{%s}lang' % NAMESPACES['XML']] = self.book.language
+
+        nav_dir_name = os.path.dirname(item.file_name)
+
+        head = etree.SubElement(root, 'head')
+        title = etree.SubElement(head, 'title')
+        title.text = self.book.title
+
+        # for now this just handles css files and ignores others
+        for _link in item.links:
+            etree.SubElement(head, 'link', {
+                'href': _link.get('href', ''), 'rel': 'stylesheet', 'type': 'text/css'
+            })
+
+        body = etree.SubElement(root, 'body')
+        nav = etree.SubElement(body, 'nav', {'{%s}type' % NAMESPACES['EPUB']: 'toc', 'id': 'id'})
+
+        content_title = etree.SubElement(nav, 'h2')
+        content_title.text = self.book.title
+
+        def _create_section(itm, items):
+            ol = etree.SubElement(itm, 'ol')
+            for item in items:
+                if isinstance(item, tuple) or isinstance(item, list):
+                    li = etree.SubElement(ol, 'li')
+                    if isinstance(item[0], EpubHtml):
+                        a = etree.SubElement(li, 'a', {'href': os.path.relpath(item[0].file_name, nav_dir_name)})
+                    elif isinstance(item[0], Section) and item[0].href != '':
+                        a = etree.SubElement(li, 'a', {'href': os.path.relpath(item[0].href, nav_dir_name)})
+                    elif isinstance(item[0], Link):
+                        a = etree.SubElement(li, 'a', {'href': os.path.relpath(item[0].href, nav_dir_name)})
+                    else:
+                        a = etree.SubElement(li, 'span')
+                    a.text = item[0].title
+
+                    _create_section(li, item[1])
+
+                elif isinstance(item, Link):
+                    li = etree.SubElement(ol, 'li')
+                    a = etree.SubElement(li, 'a', {'href': os.path.relpath(item.href, nav_dir_name)})
+                    a.text = item.title
+                elif isinstance(item, EpubHtml):
+                    li = etree.SubElement(ol, 'li')
+                    a = etree.SubElement(li, 'a', {'href': os.path.relpath(item.file_name, nav_dir_name)})
+                    a.text = item.title
+
+        _create_section(nav, self.book.toc)
+
+        # LANDMARKS / GUIDE
+        # - http://www.idpf.org/epub/30/spec/epub30-contentdocs.html#sec-xhtml-nav-def-types-landmarks
+        if len(self.book.guide) > 0 and self.options.get('epub3_landmark'):
+            # Epub2 guide types do not map completely to epub3 landmark types.
+            guide_to_landmark_map = {
+                'notes': 'rearnotes',
+                'text': 'bodymatter'
+            }
+
+            guide_nav = etree.SubElement(body, 'nav', {'{%s}type' % NAMESPACES['EPUB']: 'landmarks'})
+            guide_content_title = etree.SubElement(guide_nav, 'h2')
+            guide_content_title.text = self.options.get('landmark_title', 'Guide')
+            guide_ol = etree.SubElement(guide_nav, 'ol')
+
+            for elem in self.book.guide:
+                chap = elem.get('item')
+                if chap:
+                    _href, _title = chap.file_name, chap.title
+                else:
+                    _href, _title = elem.get('href', ''), elem.get('title', '')
+
+                # a landmark needs a target; os.path.relpath() rejects an empty path
+                if not _href:
+                    continue
+
+                guide_type = elem.get('type') or ''
+                li_item = etree.SubElement(guide_ol, 'li')
+                a_item = etree.SubElement(li_item, 'a', {
+                    '{%s}type' % NAMESPACES['EPUB']: guide_to_landmark_map.get(guide_type, guide_type),
+                    'href': os.path.relpath(_href, nav_dir_name)
+                })
+                a_item.text = _title
+
+        tree_str = etree.tostring(nav_xml, pretty_print=True, encoding='utf-8', xml_declaration=True)
+
+        return tree_str
+
+    def _get_ncx(self):
+        """Return the serialized NCX document: head metadata, docTitle and a navMap built from book.toc."""
+        # we should be able to setup language for NCX as also
+        ncx = parse_string(self.book.get_template('ncx'))
+        root = ncx.getroot()
+
+        head = etree.SubElement(root, 'head')
+
+        # head metadata: the book uid, then depth and page counters fixed at '0'
+        uid = etree.SubElement(head, 'meta', {'content': self.book.uid, 'name': 'dtb:uid'})
+        uid = etree.SubElement(head, 'meta', {'content': '0', 'name': 'dtb:depth'})
+        uid = etree.SubElement(head, 'meta', {'content': '0', 'name': 'dtb:totalPageCount'})
+        uid = etree.SubElement(head, 'meta', {'content': '0', 'name': 'dtb:maxPageNumber'})
+
+        doc_title = etree.SubElement(root, 'docTitle')
+        title = etree.SubElement(doc_title, 'text')
+        title.text = self.book.title
+
+#        doc_author = etree.SubElement(root, 'docAuthor')
+#        author = etree.SubElement(doc_author, 'text')
+#        author.text = 'Name of the person'
+
+        # For now just make a very simple navMap
+        nav_map = etree.SubElement(root, 'navMap')
+        # Adds one navPoint per toc entry under itm; uid numbers the 'sep_%d' ids and is returned
+        def _create_section(itm, items, uid):
+            for item in items:
+                if isinstance(item, tuple) or isinstance(item, list):
+                    section, subsection = item[0], item[1]
+                    # only an EpubHtml section has an id of its own; others get 'sep_<uid>'
+                    np = etree.SubElement(itm, 'navPoint', {
+                        'id': section.get_id() if isinstance(section, EpubHtml) else 'sep_%d' % uid
+                    })
+                    nl = etree.SubElement(np, 'navLabel')
+                    nt = etree.SubElement(nl, 'text')
+                    nt.text = section.title
+
+                    # CAN NOT HAVE EMPTY SRC HERE
+                    href = ''
+                    if isinstance(section, EpubHtml):
+                        href = section.file_name
+                    elif isinstance(section, Section) and section.href != '':
+                        href = section.href
+                    elif isinstance(section, Link):
+                        href = section.href
+
+                    nc = etree.SubElement(np, 'content', {'src': href})
+                    # children go below this navPoint; the counter continues from the value returned
+                    uid = _create_section(np, subsection, uid + 1)
+                elif isinstance(item, Link):
+                    _parent = itm
+                    _content = _parent.find('content')
+                    # a parent navPoint whose src is still empty borrows this entry's href
+                    if _content is not None:
+                        if _content.get('src') == '':
+                            _content.set('src', item.href)
+
+                    np = etree.SubElement(itm, 'navPoint', {'id': item.uid})
+                    nl = etree.SubElement(np, 'navLabel')
+                    nt = etree.SubElement(nl, 'text')
+                    nt.text = item.title
+
+                    nc = etree.SubElement(np, 'content', {'src': item.href})
+                elif isinstance(item, EpubHtml):
+                    _parent = itm
+                    _content = _parent.find('content')
+                    # a parent navPoint whose src is still empty borrows this entry's file name
+                    if _content is not None:
+                        if _content.get('src') == '':
+                            _content.set('src', item.file_name)
+
+                    np = etree.SubElement(itm, 'navPoint', {'id': item.get_id()})
+                    nl = etree.SubElement(np, 'navLabel')
+                    nt = etree.SubElement(nl, 'text')
+                    nt.text = item.title
+
+                    nc = etree.SubElement(np, 'content', {'src': item.file_name})
+
+            return uid
+
+        _create_section(nav_map, self.book.toc, 0)
+
+        tree_str = etree.tostring(root, pretty_print=True, encoding='utf-8', xml_declaration=True)
+
+        return tree_str
+
+    def _write_items(self):
+        """Store every book item in the archive; NCX and nav content is generated here."""
+        for entry in self.book.get_items():
+            packaged = '%s/%s' % (self.book.FOLDER_NAME, entry.file_name)
+            if isinstance(entry, EpubNcx):
+                self.out.writestr(packaged, self._get_ncx())
+            elif isinstance(entry, EpubNav):
+                self.out.writestr(packaged, self._get_nav(entry))
+            else:
+                self.out.writestr(packaged if entry.manifest else '%s' % entry.file_name, entry.get_content())
+
+    def write(self):
+        """Write the book to ``self.file_name``; the archive is closed even when a step fails."""
+        self.out = zipfile.ZipFile(self.file_name, 'w', zipfile.ZIP_DEFLATED)
+        try:
+            self.out.writestr('mimetype', 'application/epub+zip', compress_type=zipfile.ZIP_STORED)
+            self._write_container()
+            self._write_opf()
+            self._write_items()
+        finally:
+            self.out.close()
+
+
+class EpubReader(object):
+    DEFAULT_OPTIONS = {}
+
+    def __init__(self, epub_file_name, options=None):
+        """Prepare to read ``epub_file_name``; ``options`` override DEFAULT_OPTIONS."""
+        self.file_name = epub_file_name
+        self.book = EpubBook()
+        self.zf = None
+
+        self.opf_file = self.opf_dir = ''
+
+        # private copy, so the class-level defaults are never modified
+        self.options = dict(self.DEFAULT_OPTIONS)
+        self.options.update(options or {})
+
+    def process(self):
+        """Run the plugins' ``after_read`` hook, then ``html_after_read`` per EpubHtml item."""
+        for plugin in self.options.get('plugins', []):
+            if hasattr(plugin, 'after_read'):
+                plugin.after_read(self.book)
+
+        html_items = (entry for entry in self.book.get_items() if isinstance(entry, EpubHtml))
+        for chapter in html_items:
+            for plugin in self.options.get('plugins', []):
+                if hasattr(plugin, 'html_after_read'):
+                    plugin.html_after_read(self.book, chapter)
+
+    def load(self):
+        """Read the epub file given to the constructor and return the resulting EpubBook."""
+        self._load()
+        return self.book
+
+    def read_file(self, name):
+        # Raises KeyError. Zip member names use '/', so normalize with zip_path, not the platform's os.path.
+        name = zip_path.normpath(name)
+        return self.zf.read(name)
+
+    def _load_container(self):
+        """Set opf_file / opf_dir from the OPF rootfile entry of META-INF/container.xml."""
+        tree = parse_string(self.read_file('META-INF/container.xml'))
+        for root_file in tree.findall('//xmlns:rootfile[@media-type]', namespaces={'xmlns': NAMESPACES['CONTAINERNS']}):
+            if root_file.get('media-type') != 'application/oebps-package+xml':
+                continue
+            self.opf_file = root_file.get('full-path')
+            self.opf_dir = zip_path.dirname(self.opf_file)
+
+    def _load_metadata(self):
+        """Read version, identifier id and the <metadata> entries of the OPF document into the book."""
+        container_root = self.container.getroot()
+        # get epub version
+        self.book.version = container_root.get('version', None)
+
+        # get unique-identifier
+        if container_root.get('unique-identifier', None):
+            self.book.IDENTIFIER_ID = container_root.get('unique-identifier')
+
+        # get xml:lang
+        # get metadata
+        metadata = self.container.find('{%s}%s' % (NAMESPACES['OPF'], 'metadata'))
+
+        nsmap = metadata.nsmap
+        nstags = dict((k, '{%s}' % v) for k, v in six.iteritems(nsmap))
+        default_ns = nstags.get(None, '')
+        # collected values: {namespace: {tag: [(text, attributes), ...]}}
+        nsdict = dict((v, {}) for v in nsmap.values())
+
+        def add_item(ns, tag, value, extra):
+            if ns not in nsdict:
+                nsdict[ns] = {}
+
+            values = nsdict[ns].setdefault(tag, [])
+            values.append((value, extra))
+
+        for t in metadata:
+            if not etree.iselement(t) or t.tag is etree.Comment:
+                continue
+            if t.tag == default_ns + 'meta':
+                name = t.get('name')
+                others = dict((k, v) for k, v in t.items())
+                # a 'prefix:name' value is split; the prefix is looked up in the element's nsmap
+                if name and ':' in name:
+                    prefix, name = name.split(':', 1)
+                else:
+                    prefix = None
+
+                add_item(t.nsmap.get(prefix, prefix), name, t.text, others)
+            else:
+                tag = t.tag[t.tag.rfind('}') + 1:]
+                # a dc:identifier carrying an id overrides IDENTIFIER_ID (the last one wins)
+                if (t.prefix and t.prefix.lower() == 'dc') and tag == 'identifier':
+                    _id = t.get('id', None)
+
+                    if _id:
+                        self.book.IDENTIFIER_ID = _id
+
+                others = dict((k, v) for k, v in t.items())
+                add_item(t.nsmap[t.prefix], tag, t.text, others)
+
+        self.book.metadata = nsdict
+
+        titles = self.book.get_metadata('DC', 'title')
+        if len(titles) > 0:
+            self.book.title = titles[0][0]
+        # the book uid is the identifier whose id equals IDENTIFIER_ID
+        for value, others in self.book.get_metadata('DC', 'identifier'):
+            if others.get('id') == self.book.IDENTIFIER_ID:
+                self.book.uid = value
+
+    def _load_manifest(self):
+        """Create a book item for every <item> of the OPF manifest.
+
+        The item class follows the media type and the ``properties``
+        attribute; content is read from the archive, relative to the OPF
+        folder.  Hrefs are passed through unquote() before they name a file.
+        """
+        for r in self.container.find('{%s}%s' % (NAMESPACES['OPF'], 'manifest')):
+            if r is not None and r.tag != '{%s}item' % NAMESPACES['OPF']:
+                continue
+
+            media_type = r.get('media-type')
+            _properties = r.get('properties', '')
+
+            if _properties:
+                properties = _properties.split(' ')
+            else:
+                properties = []
+
+            # people use wrong content types
+            if media_type == 'image/jpg':
+                media_type = 'image/jpeg'
+
+            if media_type == 'application/x-dtbncx+xml':
+                ei = EpubNcx(uid=r.get('id'), file_name=unquote(r.get('href')))
+                ei.content = self.read_file(zip_path.join(self.opf_dir, ei.file_name))
+            elif media_type == 'application/xhtml+xml':
+                if 'nav' in properties:
+                    ei = EpubNav(uid=r.get('id'), file_name=unquote(r.get('href')))
+                    # decode the href like every other branch does, otherwise a
+                    # percent-encoded nav file name is not found in the archive
+                    ei.content = self.read_file(zip_path.join(self.opf_dir, unquote(r.get('href'))))
+                elif 'cover' in properties:
+                    ei = EpubCoverHtml()
+                    ei.content = self.read_file(zip_path.join(self.opf_dir, unquote(r.get('href'))))
+                else:
+                    ei = EpubHtml()
+                    ei.id = r.get('id')
+                    ei.file_name = unquote(r.get('href'))
+                    ei.media_type = media_type
+                    ei.content = self.read_file(zip_path.join(self.opf_dir, ei.get_name()))
+                    ei.properties = properties
+            elif media_type in IMAGE_MEDIA_TYPES:
+                if 'cover-image' in properties:
+                    ei = EpubCover(uid=r.get('id'), file_name=unquote(r.get('href')))
+                    ei.media_type = media_type
+                    ei.content = self.read_file(zip_path.join(self.opf_dir, ei.get_name()))
+                else:
+                    ei = EpubImage()
+                    ei.id = r.get('id')
+                    ei.file_name = unquote(r.get('href'))
+                    ei.media_type = media_type
+                    ei.content = self.read_file(zip_path.join(self.opf_dir, ei.get_name()))
+            else:
+                # different types
+                ei = EpubItem()
+                ei.id = r.get('id')
+                ei.file_name = unquote(r.get('href'))
+                ei.media_type = media_type
+                ei.content = self.read_file(zip_path.join(self.opf_dir, ei.get_name()))
+
+            self.book.add_item(ei)
+
+    def _parse_ncx(self, data):
+        """Set book.toc from an NCX navMap: (Section, children) for nested navPoints, Link for leaves."""
+        tree = parse_string(data)
+        tree_root = tree.getroot()
+        nav_map = tree_root.find('{%s}navMap' % NAMESPACES['DAISY'])
+
+        def _get_children(elems, n, nid):
+            label, content = '', ''
+            children = []
+
+            # iterate the element itself: getchildren() is deprecated in lxml
+            for a in elems:
+                if a.tag == '{%s}navLabel' % NAMESPACES['DAISY']:
+                    # an empty <navLabel/> keeps the '' default instead of failing
+                    label = a[0].text if len(a) > 0 else label
+                if a.tag == '{%s}content' % NAMESPACES['DAISY']:
+                    content = a.get('src', '')
+                if a.tag == '{%s}navPoint' % NAMESPACES['DAISY']:
+                    children.append(_get_children(a, n + 1, a.get('id', '')))
+
+            if n == 0:
+                # top level is always a list, even for a navMap without navPoints
+                return children
+            if len(children) > 0:
+                return (Section(label, href=content), children)
+            return Link(content, label, nid)
+
+        self.book.toc = _get_children(nav_map, 0, '')
+
+    def _parse_nav(self, data, base_path):
+        """Build ``book.toc`` from the toc <nav> of an EPUB3 navigation document.
+
+        An <li> holding a nested <ol> becomes a ``(Section, children)`` tuple,
+        an <li> with only a link becomes a ``Link``; other entries are dropped.
+        Hrefs are joined onto ``base_path`` and normalized.
+        """
+        document = parse_html_string(data)
+        toc_nav = document.xpath("//nav[@*='toc']")[0]
+
+        def resolve(anchor):
+            return zip_path.normpath(zip_path.join(base_path, anchor.get('href')))
+
+        def parse_list(list_node):
+            entries = []
+            for li in list_node.findall('li'):
+                nested, anchor = li.find('ol'), li.find('a')
+                if nested is not None:
+                    heading = li[0].text
+                    children = parse_list(nested)
+                    if anchor is None:
+                        entries.append((Section(heading), children))
+                    else:
+                        entries.append((Section(heading, href=resolve(anchor)), children))
+                elif anchor is not None:
+                    entries.append(Link(resolve(anchor), anchor.text))
+
+            return entries
+
+        self.book.toc = parse_list(toc_nav.find('ol'))
+
+    def _load_spine(self):
+        """Load book.spine and parse the NCX named by the spine's toc attribute, if any."""
+        spine = self.container.find('{%s}%s' % (NAMESPACES['OPF'], 'spine'))
+        self.book.spine = [(t.get('idref'), t.get('linear', 'yes')) for t in spine]
+        toc = spine.get('toc', '')
+        self.book.set_direction(spine.get('page-progression-direction', None))
+
+        if toc:
+            try:
+                ncx_item = self.book.get_item_with_id(toc)
+                if ncx_item is None:
+                    raise KeyError(toc)  # presumably: the book has no item with that id
+                ncxFile = self.read_file(zip_path.join(self.opf_dir, ncx_item.get_name()))
+            except KeyError:
+                raise EpubException(-1, 'Can not find ncx file.')
+            self._parse_ncx(ncxFile)
+
+    def _load_guide(self):  # copies href/title/type of each <guide> child into book.guide
+        guide_node = self.container.find('{%s}%s' % (NAMESPACES['OPF'], 'guide'))
+        if guide_node is not None:
+            self.book.guide = [{key: ref.get(key) for key in ('href', 'title', 'type')} for ref in guide_node]
+
+    def _load_opf_file(self):
+        """Parse the OPF package document and load everything it describes.
+
+        Raises EpubException when the package file is not in the archive.  If
+        the spine gave no table of contents, the first EpubNav item is used.
+        """
+        try:
+            opf_bytes = self.read_file(self.opf_file)
+        except KeyError:
+            raise EpubException(-1, 'Can not find container file')
+        self.container = parse_string(opf_bytes)
+
+        for loader in (self._load_metadata, self._load_manifest, self._load_spine, self._load_guide):
+            loader()
+
+        if not self.book.toc:
+            nav_doc = next((entry for entry in self.book.items if isinstance(entry, EpubNav)), None)
+            if nav_doc:
+                self._parse_nav(nav_doc.content, zip_path.dirname(nav_doc.file_name))
+
+    def _load(self):
+        """Open the archive, load container and OPF data, and always close the archive again."""
+        try:
+            self.zf = zipfile.ZipFile(self.file_name, 'r', compression=zipfile.ZIP_DEFLATED, allowZip64=True)
+        except zipfile.BadZipfile:
+            raise EpubException(0, 'Bad Zip file')
+        except zipfile.LargeZipFile:
+            raise EpubException(1, 'Large Zip file')
+        try:
+            self._load_container()
+            self._load_opf_file()
+        finally:
+            self.zf.close()
+
+
+# WRITE
+
+def write_epub(name, book, options=None):
+    """
+    Creates epub file with the content defined in EpubBook.
+    An IOError raised while writing is logged, not propagated.
+
+    >>> ebooklib.write_epub('book.epub', book)
+
+    :Args:
+      - name: file name for the output file
+      - book: instance of EpubBook
+      - options: extra options as dictionary (optional)
+    """
+    epub = EpubWriter(name, book, options)
+    epub.process()
+
+    try:
+        epub.write()
+    except IOError as err:
+        logging.error('Could not write epub file "{}": {}'.format(name, err))
+
+# READ
+
+
+def read_epub(name, options=None):
+    """
+    Reads the epub file and returns its content as a new EpubBook.
+
+    >>> book = ebooklib.read_epub('book.epub')
+
+    Plugin read hooks from options run after the book is loaded.
+
+    :Args:
+      - name: full path to the input file
+      - options: extra options as dictionary (optional)
+
+    :Returns:
+      Instance of EpubBook.
+    """
+    reader = EpubReader(name, options)
+    loaded = reader.load()
+    reader.process()
+    return loaded
diff --git a/ebooklib/plugins/__init__.py b/ebooklib/plugins/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/ebooklib/plugins/base.py b/ebooklib/plugins/base.py
new file mode 100644
index 0000000..6351f81
--- /dev/null
+++ b/ebooklib/plugins/base.py
@@ -0,0 +1,49 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib.  If not, see <http://www.gnu.org/licenses/>.
+
+
+class BasePlugin(object):  # no-op hooks; every method here returns True
+    def before_write(self, book):
+        "Processing before save"
+        return True
+
+    def after_write(self, book):
+        "Processing after save"
+        return True
+
+    def before_read(self, book):
+        "Processing before read"
+        return True
+
+    def after_read(self, book):
+        "Processing after read"
+        return True
+
+    def item_after_read(self, book, item):
+        "Process general item after read."
+        return True
+
+    def item_before_write(self, book, item):
+        "Process general item before write."
+        return True
+
+    def html_after_read(self, book, chapter):
+        "Processing HTML after read."
+        return True
+
+    def html_before_write(self, book, chapter):
+        "Processing HTML before save."
+        return True
diff --git a/ebooklib/plugins/booktype.py b/ebooklib/plugins/booktype.py
new file mode 100644
index 0000000..5518258
--- /dev/null
+++ b/ebooklib/plugins/booktype.py
@@ -0,0 +1,119 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib.  If not, see <http://www.gnu.org/licenses/>.
+
+from ebooklib.plugins.base import BasePlugin
+from ebooklib.utils import parse_html_string
+
+class BooktypeLinks(BasePlugin):
+    """Rewrites scheme-less chapter links so that they point at .xhtml files."""
+    NAME = 'Booktype Links'
+
+    def __init__(self, booktype_book):
+        self.booktype_book = booktype_book
+
+    def html_before_write(self, book, chapter):
+        """Fix the <a> elements of ``chapter``; unparsable content is left untouched."""
+        from lxml import etree
+        try:
+            from urlparse import urlparse, urljoin
+        except ImportError:
+            from urllib.parse import urlparse, urljoin
+
+        try:
+            tree = parse_html_string(chapter.content)
+        except Exception:
+            return
+
+        root = tree.getroottree()
+        root_body = root.find('body')
+        # a document without <body> used to fail on len(None)
+        if root_body is not None and len(root_body) != 0:
+            body = tree.find('body')
+            # should also be aware to handle
+            # ../chapter/
+            # ../chapter/#reference
+            # ../chapter#reference
+            for _link in body.xpath('//a'):
+                _ln = _link.get('href', '')
+                # This is just temporary for the footnotes
+                if _ln.find('InsertNoteID') != -1:
+                    i = _ln.find('#')
+                    # without a '#' there is no fragment to keep; leave the href alone
+                    if i != -1:
+                        _link.set('href', _ln[i:])
+                    continue
+                _u = urlparse(_ln)
+
+                # Let us care only for internal links at the moment
+                if _u.scheme == '':
+                    if _u.path != '':
+                        _link.set('href', '%s.xhtml' % _u.path)
+
+                    if _u.fragment != '':
+                        _link.set('href', urljoin(_link.get('href'), '#%s' % _u.fragment))
+
+                    if _link.get('name') is not None:
+                        _link.set('id', _link.get('name'))
+                        etree.strip_attributes(_link, 'name')
+
+        chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8')
+            
+
+
+
+class BooktypeFootnotes(BasePlugin):
+    """Converts Booktype footnote markup into EPUB3 noteref / footnote elements."""
+    NAME = 'Booktype Footnotes'
+
+    def __init__(self, booktype_book):
+        self.booktype_book = booktype_book
+
+    def html_before_write(self, book, chapter):
+        """Rewrite the footnotes of ``chapter``; unparsable content is left untouched."""
+        from lxml import etree
+        from ebooklib import epub
+
+        try:
+            tree = parse_html_string(chapter.content)
+        except Exception:
+            return
+
+        root_body = tree.getroottree().find('body')
+        # a document without <body> used to fail on len(None)
+        if root_body is not None and len(root_body) != 0:
+            body = tree.find('body')
+            # <span id="InsertNoteID_1_marker1" class="InsertNoteMarker"><sup><a href="#InsertNoteID_1">1</a></sup><span>
+            # <ol id="InsertNote_NoteList"><li id="InsertNoteID_1">prvi footnote <span id="InsertNoteID_1_LinkBacks"><sup><a href="#InsertNoteID_1_marker1">^</a></sup></span></li>
+            # <a epub:type="noteref" href="#n1">1</a></p>
+            # <aside epub:type="footnote" id="n1"><p>These have been corrected in this EPUB3 edition.</p></aside>
+            for footnote in body.xpath('//span[@class="InsertNoteMarker"]'):
+                footnote_id = (footnote.get('id') or '')[:-8]
+                notes = body.xpath('//li[@id="%s"]' % footnote_id)
+                # skip markers without the expected nested anchor or note text
+                if len(footnote) == 0 or len(footnote[0]) == 0 or not notes:
+                    continue
+                a = footnote[0][0]
+                a.attrib['{%s}type' % epub.NAMESPACES['EPUB']] = 'noteref'
+                ftn = etree.SubElement(body, 'aside', {'id': footnote_id})
+                ftn.attrib['{%s}type' % epub.NAMESPACES['EPUB']] = 'footnote'
+                ftn_p = etree.SubElement(ftn, 'p')
+                ftn_p.text = notes[0].text
+            old_footnote = body.xpath('//ol[@id="InsertNote_NoteList"]')
+            if len(old_footnote) > 0:
+                # the list is not necessarily a direct child of <body>
+                old_footnote[0].getparent().remove(old_footnote[0])
+
+        chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8')
diff --git a/ebooklib/plugins/sourcecode.py b/ebooklib/plugins/sourcecode.py
new file mode 100644
index 0000000..4f973a2
--- /dev/null
+++ b/ebooklib/plugins/sourcecode.py
@@ -0,0 +1,68 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib.  If not, see <http://www.gnu.org/licenses/>.
+
+from ebooklib.plugins.base import BasePlugin
+from ebooklib.utils import parse_html_string
+
+class SourceHighlighter(BasePlugin):
+    """Plugin that replaces ``<pre class="source-...">`` blocks with Pygments-highlighted HTML.
+
+    Only ``source-python`` and ``source-css`` are recognised; other ``source-*``
+    blocks are left untouched.
+    """
+
+    def __init__(self):
+        pass
+
+    def html_before_write(self, book, chapter):
+        """Highlight embedded source in *chapter* and link ``style/code.css`` if any was found.
+
+        ``chapter.content`` is rewritten only when at least one block was
+        highlighted.  Returns None.
+        """
+        from lxml import etree, html
+        from pygments import highlight
+        from pygments.formatters import HtmlFormatter
+        from pygments.lexers import CssLexer, PythonLexer
+
+        try:
+            tree = parse_html_string(chapter.content)
+        except Exception:  # unparsable chapter: leave it untouched
+            return
+
+        had_source = False
+        body = tree.find('body')
+        if body is not None and len(body) != 0:
+            for source in body.xpath('//pre[contains(@class,"source-")]'):
+                css_class = source.get('class')
+                lexer = None
+                if 'source-python' in css_class:
+                    lexer = PythonLexer()
+                if 'source-css' in css_class:  # checked last so it wins, as before
+                    lexer = CssLexer()
+                if lexer is None:
+                    continue  # unknown language: previously a NameError or stale markup
+                # tostring() yields bytes by default, which cannot be joined with str
+                source_text = (source.text or '') + ''.join(
+                    html.tostring(child, encoding='unicode') for child in source.iterchildren())
+                markup = highlight(source_text, lexer, HtmlFormatter())
+                source.getparent().replace(source, etree.XML(markup))
+                had_source = True
+
+        if had_source:
+            chapter.add_link(href="style/code.css", rel="stylesheet", type="text/css")
+            chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8')
+
diff --git a/ebooklib/plugins/standard.py b/ebooklib/plugins/standard.py
new file mode 100644
index 0000000..61576f9
--- /dev/null
+++ b/ebooklib/plugins/standard.py
@@ -0,0 +1,230 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib.  If not, see <http://www.gnu.org/licenses/>.
+
+import six
+
+from ebooklib.plugins.base import BasePlugin
+from ebooklib.utils import parse_html_string
+
+# TODO:
+#   - should also look for the _required_ elements
+# http://www.w3.org/html/wg/drafts/html/master/tabular-data.html#the-table-element
+
+ATTRIBUTES_GLOBAL = ['accesskey', 'class', 'contenteditable', 'contextmenu', 'dir', 'draggable',
+                     'dropzone', 'hidden',  'id', 'inert', 'itemid', 'itemprop', 'itemref',
+                     'itemscope', 'itemtype', 'lang', 'spellcheck', 'style', 'tabindex',
+                     'title', 'translate', 'epub:type']  # added to every per-tag whitelist
+
+# Tags that SyntaxPlugin unwraps via etree.strip_tags; <u> has been taken off this list for now
+DEPRECATED_TAGS = ['acronym', 'applet', 'basefont', 'big', 'center', 'dir', 'font', 'frame',
+                   'frameset', 'isindex', 'noframes', 's', 'strike', 'tt']
+
+
+def leave_only(item, tag_list):
+    """Delete every attribute of *item* whose name is not listed in *tag_list*."""
+    for _attr in [name for name in item.attrib.keys() if name not in tag_list]:
+        del item.attrib[_attr]  # safe: we iterate a snapshot, not the live mapping
+
+
+class SyntaxPlugin(BasePlugin):
+    """Plugin that removes deprecated tags and non-whitelisted attributes from chapter HTML."""
+    NAME = 'Check HTML syntax'
+
+    def html_before_write(self, book, chapter):
+        """Strip deprecated tags and attributes that are not allowed on their element.
+
+        Every tag in DEPRECATED_TAGS is unwrapped and elements keep only the
+        global attributes plus a per-tag whitelist (``dl`` and ``svg`` keep
+        theirs).  ``<meta>`` and empty ``<title>`` elements are dropped from
+        the head, a table ``summary`` becomes a ``<caption>``, and the ``svg`` /
+        ``remote-resources`` properties are recorded on the chapter.
+
+        Returns the serialised chapter content, or None if it cannot be parsed.
+        """
+        from lxml import etree
+
+        try:
+            tree = parse_html_string(chapter.content)
+        except Exception:  # unparsable chapter: leave it untouched
+            return
+
+        root = tree.getroottree()
+
+        # delete deprecated tags
+        # i should really have a list of allowed tags
+        for tag in DEPRECATED_TAGS:
+            etree.strip_tags(root, tag)
+
+        head = tree.find('head')
+        if head is not None and len(head) != 0:
+            for _item in list(head):  # copy: children are removed inside the loop
+                if _item.tag == 'base':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['href', 'target'])
+                elif _item.tag == 'link':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['href', 'crossorigin', 'rel', 'media', 'hreflang', 'type', 'sizes'])
+                elif _item.tag == 'title':
+                    if not _item.text:  # lxml reports an empty title as None, not ''
+                        head.remove(_item)
+                elif _item.tag == 'meta':
+                    # just remove for now, but really should not be like this
+                    head.remove(_item)
+                elif _item.tag == 'script':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'type', 'charset', 'async', 'defer', 'crossorigin'])
+                elif _item.tag == 'source':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'type', 'media'])
+                elif _item.tag == 'style':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['media', 'type', 'scoped'])
+                else:
+                    leave_only(_item, ATTRIBUTES_GLOBAL)
+
+        body = tree.find('body')
+        if body is not None and len(body) != 0:
+
+            for _item in body.iter():
+                # it is not
+                # <a class="indexterm" href="ch05.html#ix_epub:trigger_element">
+                if _item.tag == 'a':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['href', 'target', 'download', 'rel', 'hreflang', 'type'])
+                elif _item.tag == 'area':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['alt', 'coords', 'shape', 'href', 'target', 'download', 'rel', 'hreflang', 'type'])
+                elif _item.tag == 'audio':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'crossorigin', 'preload', 'autoplay', 'mediagroup', 'loop', 'muted', 'controls'])
+                elif _item.tag == 'blockquote':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['cite'])
+                elif _item.tag == 'button':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['autofocus', 'disabled', 'form', 'formaction', 'formenctype', 'formmethod', 'formnovalidate',
+                                                           'formtarget', 'name', 'type', 'value', 'menu'])
+                elif _item.tag == 'canvas':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['width', 'height'])
+                elif _item.tag == 'del':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['cite', 'datetime'])
+                elif _item.tag == 'details':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['open'])
+                elif _item.tag == 'embed':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'type', 'width', 'height'])
+                elif _item.tag == 'fieldset':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['disabled', 'form', 'name'])
+                elif _item.tag == 'form':  # was a second, unreachable 'details' branch
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['accept-charset', 'action', 'autocomplete', 'enctype', 'method', 'name', 'novalidate', 'target'])
+                elif _item.tag == 'iframe':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'srcdoc', 'name', 'sandbox', 'seamless', 'allowfullscreen', 'width', 'height'])
+                elif _item.tag == 'img':
+                    _src = _item.get('src', '').lower()
+                    if _src.startswith('http://') or _src.startswith('https://'):
+                        if 'remote-resources' not in chapter.properties:
+                            chapter.properties.append('remote-resources')
+                            # THIS DOES NOT WORK, ONLY VIDEO AND AUDIO FILES CAN BE REMOTE RESOURCES
+                            # THAT MEANS I SHOULD ALSO CATCH <SOURCE TAG
+                            from ebooklib import epub
+                            _img = epub.EpubImage(file_name = _item.get('src'))
+                            book.add_item(_img)
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['alt', 'src', 'crossorigin', 'usemap', 'ismap', 'width', 'height'])
+                elif _item.tag == 'input':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['accept', 'alt', 'autocomplete', 'autofocus', 'checked', 'dirname',
+                                                           'disabled', 'form', 'formaction', 'formenctype', 'formmethod', 'formnovalidate',
+                                                           'formtarget', 'height', 'inputmode', 'list', 'max', 'maxlength', 'min', 'multiple',
+                                                           'name', 'pattern', 'placeholder', 'readonly', 'required', 'size', 'src', 'step',
+                                                           'type', 'value', 'width'])
+                elif _item.tag == 'ins':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['cite', 'datetime'])
+                elif _item.tag == 'keygen':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['autofocus', 'challenge', 'disabled', 'form', 'keytype', 'name'])
+                elif _item.tag == 'label':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['form', 'for'])
+                elif _item.tag == 'map':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['name'])
+                elif _item.tag == 'menu':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['type', 'label'])
+                elif _item.tag == 'object':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['data', 'type', 'typemustmatch', 'name', 'usemap', 'form', 'width', 'height'])
+                elif _item.tag == 'ol':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['reversed', 'start', 'type'])
+                elif _item.tag == 'optgroup':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['disabled', 'label'])
+                elif _item.tag == 'option':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['disabled', 'label', 'selected', 'value'])
+                elif _item.tag == 'output':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['for', 'form', 'name'])
+                elif _item.tag == 'param':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['name', 'value'])
+                elif _item.tag == 'progress':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['value', 'max'])
+                elif _item.tag == 'q':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['cite'])
+                elif _item.tag == 'select':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['autofocus', 'disabled', 'form', 'multiple', 'name', 'required', 'size'])
+
+                elif _item.tag == 'table':
+                    if _item.get('border', None):
+                        if _item.get('border') == '0':
+                            _item.set('border', '')
+
+                    if _item.get('summary', None):
+                        _caption = etree.Element('caption', {})
+                        _caption.text = _item.get('summary')
+                        _item.insert(0, _caption)
+
+                        # add it as caption
+                        del _item.attrib['summary']
+
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['border', 'sortable'])
+                elif _item.tag == 'dl':
+                    _d = _item.find('dd')
+                    if _d is not None and len(_d) == 0:
+                        pass
+
+                        # http://html5doctor.com/the-dl-element/
+                        # should be like this really
+                        # some of the elements can be missing
+                        # dl
+                        #   dt
+                        #   dd
+                        #   dt
+                        #   dd
+                elif _item.tag == 'td':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['colspan', 'rowspan', 'headers'])
+                elif _item.tag == 'textarea':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['autocomplete', 'autofocus', 'cols', 'dirname', 'disabled', 'form',
+                                                           'inputmode', 'maxlength', 'name', 'placeholder', 'readonly', 'required',
+                                                           'rows', 'wrap'])
+
+                elif _item.tag in ['col', 'colgroup']:
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['span'])
+                elif _item.tag == 'th':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['colspan', 'rowspan', 'headers', 'scope', 'abbr', 'sorted'])
+                elif _item.tag in ['time']:
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['datetime'])
+                elif _item.tag in ['track']:
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['kind', 'src', 'srclang', 'label', 'default'])
+                elif _item.tag == 'video':
+                    leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'crossorigin', 'poster', 'preload', 'autoplay', 'mediagroup',
+                                                           'loop', 'muted', 'controls', 'width', 'height'])
+                elif _item.tag == 'svg':
+                    # We need to add property "svg" in case we have embedded svg file
+                    if 'svg' not in chapter.properties:
+                        chapter.properties.append('svg')
+
+                    if _item.get('viewbox', None):
+                        del _item.attrib['viewbox']
+
+                    if _item.get('preserveaspectratio', None):
+                        del _item.attrib['preserveaspectratio']
+                else:
+                    leave_only(_item, ATTRIBUTES_GLOBAL)
+
+        chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)
+
+        return chapter.content
diff --git a/ebooklib/plugins/tidyhtml.py b/ebooklib/plugins/tidyhtml.py
new file mode 100644
index 0000000..8640ccd
--- /dev/null
+++ b/ebooklib/plugins/tidyhtml.py
@@ -0,0 +1,82 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib.  If not, see <http://www.gnu.org/licenses/>.
+
+import six
+import subprocess
+
+from ebooklib.plugins.base import BasePlugin
+from ebooklib.utils import parse_html_string
+
+# The recommended tidy implementation is
+# - https://github.com/w3c/tidy-html5
+
+def tidy_cleanup(content, **extra):
+    """Run the external ``tidy`` binary over *content* and return ``(status, output)``.
+
+    Each truthy keyword becomes ``--key value``, each falsy one a bare ``-key``
+    flag.  *status* is tidy's exit code: 0 ok, 1 warnings, 2 errors; ``(3, None)``
+    is returned when the binary cannot be started.
+    """
+    cmd = ['tidy']
+    for k, v in six.iteritems(extra):
+        if v:
+            cmd.extend(['--%s' % k, v])
+        else:
+            cmd.append('-%s' % k)
+
+    try:
+        p = subprocess.Popen(cmd, shell=False,
+                             stdin=subprocess.PIPE, stdout=subprocess.PIPE,
+                             stderr=subprocess.PIPE, close_fds=True)
+    except OSError:
+        return (3, None)
+
+    if isinstance(content, six.text_type):
+        content = content.encode('utf-8')  # the pipe only accepts bytes
+
+    # communicate() feeds stdin while draining stdout/stderr; writing to
+    # p.stdin first could deadlock once tidy's output pipe filled up
+    (cont, _p_err) = p.communicate(content)
+
+    return (p.returncode, cont)
+
+
+class TidyPlugin(BasePlugin):
+    """Plugin that runs chapter HTML through ``tidy``; *extra* overrides the default OPTIONS."""
+    NAME = 'Tidy HTML'
+    OPTIONS = {'char-encoding': 'utf8',
+               'tidy-mark': 'no'
+              }
+
+    def __init__(self, extra=None):
+        self.options = dict(self.OPTIONS)
+        self.options.update(extra or {})
+
+    def _tidy(self, chapter):
+        """Tidy *chapter* in place and return its content (None for an empty chapter)."""
+        if not chapter.content:
+            return None
+        (_, cleaned) = tidy_cleanup(chapter.content, **self.options)
+        if cleaned:  # keep the original when tidy is missing or produced nothing
+            chapter.content = cleaned
+        return chapter.content
+
+    def html_before_write(self, book, chapter):
+        return self._tidy(chapter)
+
+    def html_after_read(self, book, chapter):
+        return self._tidy(chapter)
+
diff --git a/ebooklib/utils.py b/ebooklib/utils.py
new file mode 100644
index 0000000..162f4c9
--- /dev/null
+++ b/ebooklib/utils.py
@@ -0,0 +1,60 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib.  If not, see <http://www.gnu.org/licenses/>.
+
+import io
+import mimetypes
+
+from lxml import etree
+
+
+mimetype_initialised = False  # flipped to True by guess_type() after its one-time mimetypes setup
+
+
+def debug(obj):
+    """Pretty-print *obj* to stdout with a 4-space indent (debugging aid)."""
+    from pprint import PrettyPrinter
+
+    PrettyPrinter(indent=4).pprint(obj)
+
+
+def parse_string(s):
+    """Parse XML given as text or bytes and return the lxml ElementTree."""
+    # encode only text; the old bare except also hid real syntax errors
+    # behind a TypeError raised by the bytes fallback
+    data = s if isinstance(s, bytes) else s.encode('utf-8')
+
+    return etree.parse(io.BytesIO(data))
+
+
+def parse_html_string(s):
+    """Parse the HTML document *s* with lxml and return the resulting tree.
+
+    A dedicated parser with ``encoding='utf-8'`` is used for every call.
+    """
+    from lxml import html
+
+    return html.document_fromstring(s, parser=html.HTMLParser(encoding='utf-8'))
+
+
+def guess_type(extenstion):
+    """Guess ``(type, encoding)`` for a file name, registering ``.xhtml`` on first use."""
+    global mimetype_initialised
+    needs_setup = not mimetype_initialised
+    if needs_setup:
+        mimetypes.init()
+        mimetypes.add_type('application/xhtml+xml', '.xhtml')
+    mimetype_initialised = True
+    return mimetypes.guess_type(extenstion)