From f3d748f5ddba90ec0d07756b99cf7dd0b3858bd1 Mon Sep 17 00:00:00 2001
From: BasioMeusPuga
Date: Mon, 6 Nov 2017 07:42:48 +0530
Subject: [PATCH] Incorporate ebooklib from
https://github.com/aerkalov/ebooklib
---
ebooklib/__init__.py | 42 +
ebooklib/epub.py | 1595 ++++++++++++++++++++++++++++++++
ebooklib/plugins/__init__.py | 0
ebooklib/plugins/base.py | 49 +
ebooklib/plugins/booktype.py | 119 +++
ebooklib/plugins/sourcecode.py | 68 ++
ebooklib/plugins/standard.py | 230 +++++
ebooklib/plugins/tidyhtml.py | 82 ++
ebooklib/utils.py | 60 ++
9 files changed, 2245 insertions(+)
create mode 100644 ebooklib/__init__.py
create mode 100644 ebooklib/epub.py
create mode 100644 ebooklib/plugins/__init__.py
create mode 100644 ebooklib/plugins/base.py
create mode 100644 ebooklib/plugins/booktype.py
create mode 100644 ebooklib/plugins/sourcecode.py
create mode 100644 ebooklib/plugins/standard.py
create mode 100644 ebooklib/plugins/tidyhtml.py
create mode 100644 ebooklib/utils.py
diff --git a/ebooklib/__init__.py b/ebooklib/__init__.py
new file mode 100644
index 0000000..020d2f5
--- /dev/null
+++ b/ebooklib/__init__.py
@@ -0,0 +1,42 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib.  If not, see <http://www.gnu.org/licenses/>.
+
+# Version of the ebook library, as a tuple of integers.
+
+VERSION = (0, 16, 0)
+
+# Identifiers for the possible item types. They are consecutive integers
+# starting at 0 for the unknown type.
+(ITEM_UNKNOWN,
+ ITEM_IMAGE,
+ ITEM_STYLE,
+ ITEM_SCRIPT,
+ ITEM_NAVIGATION,
+ ITEM_VECTOR,
+ ITEM_FONT,
+ ITEM_VIDEO,
+ ITEM_AUDIO,
+ ITEM_DOCUMENT) = range(10)
+
+# Maps an item type to the list of lower-case file extensions (including the
+# leading dot) that belong to it. ITEM_UNKNOWN and ITEM_DOCUMENT have no entry.
+EXTENSIONS = {
+    ITEM_IMAGE: ['.jpg', '.jpeg', '.gif', '.tiff', '.tif', '.png'],
+    ITEM_STYLE: ['.css'],
+    ITEM_VECTOR: ['.svg'],
+    ITEM_FONT: ['.otf', '.woff', '.ttf'],
+    ITEM_SCRIPT: ['.js'],
+    ITEM_NAVIGATION: ['.ncx'],
+    ITEM_VIDEO: ['.mov', '.mp4', '.avi'],
+    ITEM_AUDIO: ['.mp3', '.ogg'],
+}
diff --git a/ebooklib/epub.py b/ebooklib/epub.py
new file mode 100644
index 0000000..fd3a0ff
--- /dev/null
+++ b/ebooklib/epub.py
@@ -0,0 +1,1595 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib.  If not, see <http://www.gnu.org/licenses/>.
+
+import zipfile
+import six
+import logging
+import uuid
+import posixpath as zip_path
+import os.path
+from collections import OrderedDict
+
+try:
+ from urllib.parse import unquote
+except ImportError:
+ from urllib import unquote
+
+from lxml import etree
+
+import ebooklib
+
+from ebooklib.utils import parse_string, parse_html_string, guess_type
+
+
+# Version of EPUB library (used for the "generator" metadata entry)
+VERSION = (0, 15, 0)
+
+# XML namespaces used when reading and writing the package documents.
+NAMESPACES = {'XML': 'http://www.w3.org/XML/1998/namespace',
+              'EPUB': 'http://www.idpf.org/2007/ops',
+              'DAISY': 'http://www.daisy.org/z3986/2005/ncx/',
+              'OPF': 'http://www.idpf.org/2007/opf',
+              'CONTAINERNS': 'urn:oasis:names:tc:opendocument:xmlns:container',
+              'DC': 'http://purl.org/dc/elements/1.1/',
+              'XHTML': 'http://www.w3.org/1999/xhtml'}
+
+# XML Templates
+
+# Path of the container descriptor inside the archive.
+CONTAINER_PATH = 'META-INF/container.xml'
+
+# Container descriptor. The ``%(folder_name)s`` placeholder is filled in with
+# the book's FOLDER_NAME so that the rootfile points at <folder>/content.opf.
+CONTAINER_XML = '''<?xml version='1.0' encoding='utf-8'?>
+<container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
+  <rootfiles>
+    <rootfile media-type="application/oebps-package+xml" full-path="%(folder_name)s/content.opf"/>
+  </rootfiles>
+</container>
+'''
+
+# Skeleton of the NCX document; the writer appends head, docTitle and navMap
+# to its root element.
+NCX_XML = six.b('''<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
+<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" />''')
+
+# Skeleton of the navigation document; the writer appends head and body.
+NAV_XML = six.b('''<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"/>''')
+
+# Skeleton of a chapter document; EpubHtml.get_content appends head and body.
+CHAPTER_XML = six.b('''<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" epub:prefix="z3998: http://www.daisy.org/z3998/2012/vocab/structure/#"></html>''')
+
+# Cover page content. It must contain an <img> element: EpubCoverHtml fills in
+# the ``src`` and ``alt`` attributes of the first image it finds.
+COVER_XML = six.b('''<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="en" xml:lang="en">
+ <head>
+ </head>
+ <body>
+   <img src="" alt="" />
+ </body>
+</html>
+''')
+
+
+IMAGE_MEDIA_TYPES = ['image/jpeg', 'image/jpg', 'image/png', 'image/svg+xml']
+
+
+# TOC elements
+
+class Section(object):
+
+    """
+    Table of contents entry made of a title and an optional href.
+    """
+
+    def __init__(self, title, href=''):
+        """
+        :Args:
+          - title: Title of the section
+          - href: Target of the section (optional, empty string by default)
+        """
+        self.title, self.href = title, href
+
+
+class Link(object):
+
+    """
+    Table of contents entry pointing at an href, with a title and an optional uid.
+    """
+
+    def __init__(self, href, title, uid=None):
+        """
+        :Args:
+          - href: Target of the link
+          - title: Title of the link
+          - uid: Identifier for the link (optional)
+        """
+        self.uid = uid
+        self.title = title
+        self.href = href
+
+# Exceptions
+
+
+class EpubException(Exception):
+
+    """
+    Exception carrying an error code and a message.
+    """
+
+    def __init__(self, code, msg):
+        """
+        :Args:
+          - code: Error code
+          - msg: Error message
+        """
+        # Forward the values to Exception so that ``args`` is populated
+        # (needed for a meaningful repr() and for pickling/copying).
+        super(EpubException, self).__init__(code, msg)
+        self.code = code
+        self.msg = msg
+
+    def __str__(self):
+        return repr(self.msg)
+
+# Items
+
+
+class EpubItem(object):
+
+    """
+    Base class for the items in a book.
+    """
+
+    def __init__(self, uid=None, file_name='', media_type='', content=b'', manifest=True):
+        """
+        :Args:
+          - uid: Unique identifier for this item (optional)
+          - file_name: File name for this item (optional)
+          - media_type: Media type for this item (optional)
+          - content: Content for this item (optional)
+          - manifest: Should this item be listed in the manifest (optional)
+        """
+        self.id = uid
+        self.file_name = file_name
+        self.media_type = media_type
+        self.content = content
+        self.is_linear = True
+        self.manifest = manifest
+
+        # set by EpubBook.add_item
+        self.book = None
+
+    def get_id(self):
+        """
+        Returns unique identifier for this item.
+
+        :Returns:
+          Returns the uid given to the item (None if it was never set).
+        """
+        return self.id
+
+    def get_name(self):
+        """
+        Returns name for this item. By default it is always file name but it does not have to be.
+
+        :Returns:
+          Returns file name for this item.
+        """
+        return self.file_name
+
+    def get_type(self):
+        """
+        Guess type according to the file extension. Might not be the best way how to do it, but it works for now.
+
+        Items can be of type:
+          - ITEM_UNKNOWN = 0
+          - ITEM_IMAGE = 1
+          - ITEM_STYLE = 2
+          - ITEM_SCRIPT = 3
+          - ITEM_NAVIGATION = 4
+          - ITEM_VECTOR = 5
+          - ITEM_FONT = 6
+          - ITEM_VIDEO = 7
+          - ITEM_AUDIO = 8
+          - ITEM_DOCUMENT = 9
+
+        We map type according to the extensions which are defined in ebooklib.EXTENSIONS.
+
+        :Returns:
+          Returns type of the item as number.
+        """
+        _, ext = zip_path.splitext(self.get_name())
+        # EXTENSIONS only lists lower-case extensions
+        ext = ext.lower()
+
+        for uid, ext_list in ebooklib.EXTENSIONS.items():
+            if ext in ext_list:
+                return uid
+
+        return ebooklib.ITEM_UNKNOWN
+
+    def get_content(self, default=b''):
+        """
+        Returns content of the item. Content should be of type 'str' (Python 2) or 'bytes' (Python 3)
+
+        :Args:
+          - default: Value returned when the content is empty or not defined.
+
+        :Returns:
+          Returns content of the item.
+        """
+        return self.content or default
+
+    def set_content(self, content):
+        """
+        Sets content value for this item.
+
+        :Args:
+          - content: Content value
+        """
+        self.content = content
+
+    def __str__(self):
+        return '<EpubItem:%s>' % self.id
+
+
+class EpubNcx(EpubItem):
+
+    "Represents Navigation Control File (NCX) in the EPUB."
+
+    def __init__(self, uid='ncx', file_name='toc.ncx'):
+        """
+        :Args:
+          - uid: Unique identifier for this item (optional, 'ncx' by default)
+          - file_name: File name for this item (optional, 'toc.ncx' by default)
+        """
+        super(EpubNcx, self).__init__(uid=uid, file_name=file_name, media_type='application/x-dtbncx+xml')
+
+    def __str__(self):
+        return '<EpubNcx:%s>' % self.id
+
+
+class EpubCover(EpubItem):
+
+    """
+    Represents Cover image in the EPUB file.
+    """
+
+    def __init__(self, uid='cover-img', file_name=''):
+        """
+        :Args:
+          - uid: Unique identifier for this item (optional, 'cover-img' by default)
+          - file_name: File name of the cover image (optional)
+        """
+        super(EpubCover, self).__init__(uid=uid, file_name=file_name)
+
+    def __str__(self):
+        return '<EpubCover:%s:%s>' % (self.id, self.file_name)
+
+
+class EpubHtml(EpubItem):
+
+    """
+    Represents HTML document in the EPUB file.
+    """
+    # name of the book template used as the skeleton in get_content()
+    _template_name = 'chapter'
+
+    def __init__(self, uid=None, file_name='', media_type='', content=None, title='', lang=None, direction=None):
+        """
+        :Args:
+          - uid: Unique identifier for this item (optional)
+          - file_name: File name for this item (optional)
+          - media_type: Media type for this item (optional)
+          - content: HTML content for this item (optional)
+          - title: Title of the document (optional)
+          - lang: Language of the document; the book language is used when not set (optional)
+          - direction: Value for the ``dir`` attribute of BODY (optional)
+        """
+        super(EpubHtml, self).__init__(uid, file_name, media_type, content)
+
+        self.title = title
+        self.lang = lang
+        self.direction = direction
+
+        self.links = []
+        self.properties = []
+
+    def is_chapter(self):
+        """
+        Returns if this document is chapter or not.
+
+        :Returns:
+          Returns bool value, always True for this class.
+        """
+        return True
+
+    def get_type(self):
+        """
+        Always returns ebooklib.ITEM_DOCUMENT as type of this document.
+
+        :Returns:
+          Always returns ebooklib.ITEM_DOCUMENT
+        """
+
+        return ebooklib.ITEM_DOCUMENT
+
+    def set_language(self, lang):
+        """
+        Sets language for this book item. By default it will use language of the book but it
+        can be overwritten with this call.
+
+        :Args:
+          - lang: Language code
+        """
+        self.lang = lang
+
+    def get_language(self):
+        """
+        Get language code for this book item. Language of the book item can be different from
+        the language settings defined globally for book.
+
+        :Returns:
+          Returns the language code set on this item (None if it was never set).
+        """
+        return self.lang
+
+    def add_link(self, **kwgs):
+        """
+        Add additional link to the document. Links will be embedded only inside of this document.
+        The keyword arguments become the attributes of the generated element.
+
+        >>> add_link(href='styles.css', rel='stylesheet', type='text/css')
+        """
+        self.links.append(kwgs)
+
+    def get_links(self):
+        """
+        Returns additional links defined for this document.
+
+        :Returns:
+          Returns a generator over the links (each link is a dict of attributes).
+        """
+        return (link for link in self.links)
+
+    def get_links_of_type(self, link_type):
+        """
+        Returns additional links of specific type.
+
+        :Args:
+          - link_type: Value of the ``type`` attribute we are searching for
+
+        :Returns:
+          Returns a generator over the matching links.
+        """
+        return (link for link in self.links if link.get('type', '') == link_type)
+
+    def add_item(self, item):
+        """
+        Add other item to this document. It will create additional links according to the item type.
+        Only style and script items create a link, other types are ignored.
+
+        :Args:
+          - item: item we want to add defined as instance of EpubItem
+        """
+        if item.get_type() == ebooklib.ITEM_STYLE:
+            self.add_link(href=item.get_name(), rel='stylesheet', type='text/css')
+
+        if item.get_type() == ebooklib.ITEM_SCRIPT:
+            self.add_link(src=item.get_name(), type='text/javascript')
+
+    def get_body_content(self):
+        """
+        Returns content of BODY element for this HTML document. Content will be of type 'str' (Python 2)
+        or 'bytes' (Python 3).
+
+        :Returns:
+          Returns content of the BODY element. Empty string is returned when the content
+          can not be parsed, has no BODY element or the BODY element has no child elements.
+        """
+
+        try:
+            html_tree = parse_html_string(self.content)
+        except Exception:
+            # content which can not be parsed is treated as empty
+            return ''
+
+        body = html_tree.find('body')
+
+        # find() returns None when there is no BODY at all; len() of an
+        # element is the number of its child elements
+        if body is None or len(body) == 0:
+            return ''
+
+        tree_str = etree.tostring(body, pretty_print=True, encoding='utf-8', xml_declaration=False)
+
+        # this is so stupid
+        # strip the wrapping <body> and </body> tags from the serialised element
+        if tree_str.startswith(six.b('<body>')):
+            n = tree_str.rindex(six.b('</body>'))
+
+            return tree_str[6:n]
+
+        return tree_str
+
+    def get_content(self, default=None):
+        """
+        Returns content for this document as HTML string. Content will be of type 'str' (Python 2)
+        or 'bytes' (Python 3).
+
+        The document is built from the template named by ``_template_name``: HEAD is created
+        from the title and the links of this item, BODY from the children of the BODY element
+        of the item content. The item must already be attached to a book.
+
+        :Args:
+          - default: Not used, accepted for compatibility with EpubItem.get_content.
+
+        :Returns:
+          Returns content of this document. Empty string is returned when the item
+          content can not be parsed.
+        """
+
+        tree = parse_string(self.book.get_template(self._template_name))
+        tree_root = tree.getroot()
+
+        tree_root.set('lang', self.lang or self.book.language)
+        tree_root.attrib['{%s}lang' % NAMESPACES['XML']] = self.lang or self.book.language
+
+        # add to the head also
+        #  <meta charset="utf-8" />
+
+        try:
+            html_tree = parse_html_string(self.content)
+        except Exception:
+            # content which can not be parsed is treated as empty
+            return ''
+
+        # create and populate head
+
+        _head = etree.SubElement(tree_root, 'head')
+
+        if self.title != '':
+            _title = etree.SubElement(_head, 'title')
+            _title.text = self.title
+
+        for lnk in self.links:
+            if lnk.get('type') == 'text/javascript':
+                _lnk = etree.SubElement(_head, 'script', lnk)
+                # force <script></script> instead of a self-closing element
+                _lnk.text = ''
+            else:
+                _lnk = etree.SubElement(_head, 'link', lnk)
+
+        # create and populate body
+
+        _body = etree.SubElement(tree_root, 'body')
+        if self.direction:
+            _body.set('dir', self.direction)
+
+        body = html_tree.find('body')
+        if body is not None:
+            # append() moves the element out of the parsed tree, so iterate
+            # over a copy of the children
+            for i in list(body):
+                _body.append(i)
+
+        tree_str = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)
+
+        return tree_str
+
+    def __str__(self):
+        return '<EpubHtml:%s:%s>' % (self.id, self.file_name)
+
+
+class EpubCoverHtml(EpubHtml):
+
+    """
+    Represents Cover page in the EPUB file.
+    """
+
+    def __init__(self, uid='cover', file_name='cover.xhtml', image_name='', title='Cover'):
+        """
+        :Args:
+          - uid: Unique identifier for this item (optional, 'cover' by default)
+          - file_name: File name of the cover page (optional, 'cover.xhtml' by default)
+          - image_name: File name of the cover image shown on the page (optional)
+          - title: Title of the cover page (optional, 'Cover' by default)
+        """
+        super(EpubCoverHtml, self).__init__(uid=uid, file_name=file_name, title=title)
+
+        self.image_name = image_name
+        self.is_linear = False
+
+    def is_chapter(self):
+        """
+        Returns if this document is chapter or not.
+
+        :Returns:
+          Returns bool value, always False for this class.
+        """
+
+        return False
+
+    def get_content(self, default=None):
+        """
+        Returns content for cover page as HTML string. Content will be of type 'str' (Python 2) or 'bytes' (Python 3).
+
+        The content of the item is replaced with the 'cover' template of the book and the
+        first image found in the generated page is pointed at the cover image.
+
+        :Args:
+          - default: Not used, accepted for compatibility with EpubHtml.get_content.
+
+        :Returns:
+          Returns content of this document.
+
+        :Raises:
+          EpubException when the generated cover page contains no image element.
+        """
+
+        self.content = self.book.get_template('cover')
+
+        tree = parse_string(super(EpubCoverHtml, self).get_content())
+        tree_root = tree.getroot()
+
+        images = tree_root.xpath('//xhtml:img', namespaces={'xhtml': NAMESPACES['XHTML']})
+
+        if not images:
+            # a custom 'cover' template without <img> would otherwise end in
+            # an obscure IndexError
+            raise EpubException(-1, 'Cover template does not contain an image element.')
+
+        images[0].set('src', self.image_name)
+        images[0].set('alt', self.title)
+
+        tree_str = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)
+
+        return tree_str
+
+    def __str__(self):
+        return '<EpubCoverHtml:%s:%s>' % (self.id, self.file_name)
+
+
+class EpubNav(EpubHtml):
+
+    """
+    Represents Navigation Document in the EPUB file.
+    """
+
+    def __init__(self, uid='nav', file_name='nav.xhtml', media_type='application/xhtml+xml'):
+        """
+        :Args:
+          - uid: Unique identifier for this item (optional, 'nav' by default)
+          - file_name: File name for this item (optional, 'nav.xhtml' by default)
+          - media_type: Media type for this item (optional, 'application/xhtml+xml' by default)
+        """
+        super(EpubNav, self).__init__(uid=uid, file_name=file_name, media_type=media_type)
+
+    def is_chapter(self):
+        """
+        Returns if this document is chapter or not.
+
+        :Returns:
+          Returns bool value, always False for this class.
+        """
+
+        return False
+
+    def __str__(self):
+        return '<EpubNav:%s:%s>' % (self.id, self.file_name)
+
+
+class EpubImage(EpubItem):
+
+    """
+    Represents Image in the EPUB file.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """
+        Accepts the same (optional) arguments as EpubItem. Calling it without any
+        argument creates an empty image item, as before.
+        """
+        super(EpubImage, self).__init__(*args, **kwargs)
+
+    def get_type(self):
+        """
+        Always returns ebooklib.ITEM_IMAGE, whatever the file extension is.
+        """
+        return ebooklib.ITEM_IMAGE
+
+    def __str__(self):
+        return '<EpubImage:%s:%s>' % (self.id, self.file_name)
+
+
+# EpubBook
+
+class EpubBook(object):
+
+    """
+    In-memory representation of a book: metadata, items, spine, guide,
+    table of contents and the templates used to generate pages.
+    """
+
+    def __init__(self):
+        self.EPUB_VERSION = None
+
+        self.reset()
+
+        # we should have options here
+
+    def reset(self):
+        "Initialises all needed variables to default values"
+
+        self.metadata = {}
+        self.items = []
+        self.spine = []
+        self.guide = []
+        self.toc = []
+        self.bindings = []
+
+        self.IDENTIFIER_ID = 'id'
+        self.FOLDER_NAME = 'EPUB'
+
+        # counters used by add_item to generate ids for items without one
+        self._id_html = 0
+        self._id_image = 0
+        self._id_static = 0
+
+        self.title = ''
+        self.language = 'en'
+        self.direction = None
+
+        self.templates = {
+            'ncx': NCX_XML,
+            'nav': NAV_XML,
+            'chapter': CHAPTER_XML,
+            'cover': COVER_XML
+        }
+
+        self.add_metadata('OPF', 'generator', '', {
+            'name': 'generator', 'content': 'Ebook-lib %s' % '.'.join([str(s) for s in VERSION])
+        })
+
+        # default to using a randomly-unique identifier if one is not specified manually
+        self.set_identifier(str(uuid.uuid4()))
+
+        # custom prefixes and namespaces to be set to the content.opf doc
+        self.prefixes = []
+        self.namespaces = {}
+
+    def set_identifier(self, uid):
+        """
+        Sets unique id for this epub. Replaces the previously defined identifier.
+
+        :Args:
+          - uid: Value of unique identifier for this book
+        """
+
+        self.uid = uid
+
+        self.set_unique_metadata('DC', 'identifier', self.uid, {'id': self.IDENTIFIER_ID})
+
+    def set_title(self, title):
+        """
+        Set title. You can set multiple titles; every call adds one more title
+        to the metadata and the last one becomes the title of the book.
+
+        :Args:
+          - title: Title value
+        """
+
+        self.title = title
+
+        self.add_metadata('DC', 'title', self.title)
+
+    def set_language(self, lang):
+        """
+        Set language for this epub. You can set multiple languages. Specific items in the book can have
+        different language settings.
+
+        :Args:
+          - lang: Language code
+        """
+
+        self.language = lang
+
+        self.add_metadata('DC', 'language', lang)
+
+    def set_direction(self, direction):
+        """
+        :Args:
+          - direction: Options are "ltr", "rtl" and "default"
+        """
+
+        self.direction = direction
+
+    def set_cover(self, file_name, content, create_page=True):
+        """
+        Set cover and create cover document if needed.
+
+        :Args:
+          - file_name: file name of the cover image
+          - content: Content for the cover image
+          - create_page: Should cover page be defined. Defined as bool value (optional). Default value is True.
+        """
+
+        # as it is now, it can only be called once
+        c0 = EpubCover(file_name=file_name)
+        c0.content = content
+        self.add_item(c0)
+
+        if create_page:
+            c1 = EpubCoverHtml(image_name=file_name)
+            self.add_item(c1)
+
+        # 'cover-img' is the default id of EpubCover
+        self.add_metadata(None, 'meta', '', OrderedDict([('name', 'cover'), ('content', 'cover-img')]))
+
+    def add_author(self, author, file_as=None, role=None, uid='creator'):
+        """
+        Add author for this document
+
+        :Args:
+          - author: Name of the author
+          - file_as: Value for the 'file-as' refinement of the author (optional)
+          - role: Value for the 'role' refinement of the author (optional)
+          - uid: Id of the creator element, the refinements point to it (optional)
+        """
+
+        self.add_metadata('DC', 'creator', author, {'id': uid})
+
+        if file_as:
+            self.add_metadata(None, 'meta', file_as, {'refines': '#' + uid,
+                                                      'property': 'file-as',
+                                                      'scheme': 'marc:relators'})
+        if role:
+            self.add_metadata(None, 'meta', role, {'refines': '#' + uid,
+                                                   'property': 'role',
+                                                   'scheme': 'marc:relators'})
+
+    def add_metadata(self, namespace, name, value, others=None):
+        """
+        Add metadata
+
+        :Args:
+          - namespace: Key of NAMESPACES (for instance 'DC') or the namespace URI itself, can be None
+          - name: Name of the metadata element
+          - value: Value of the metadata element
+          - others: Dictionary with attributes of the element (optional)
+        """
+
+        if namespace in NAMESPACES:
+            namespace = NAMESPACES[namespace]
+
+        self.metadata.setdefault(namespace, {}).setdefault(name, []).append((value, others))
+
+    def get_metadata(self, namespace, name):
+        """
+        Retrieve metadata
+
+        :Args:
+          - namespace: Key of NAMESPACES (for instance 'DC') or the namespace URI itself
+          - name: Name of the metadata element
+
+        :Returns:
+          Returns list of (value, others) tuples.
+
+        :Raises:
+          KeyError when nothing is defined for this namespace and name.
+        """
+
+        if namespace in NAMESPACES:
+            namespace = NAMESPACES[namespace]
+
+        return self.metadata[namespace][name]
+
+    def set_unique_metadata(self, namespace, name, value, others=None):
+        "Add metadata if metadata with this identifier does not already exist, otherwise update existing metadata."
+
+        if namespace in NAMESPACES:
+            namespace = NAMESPACES[namespace]
+
+        if namespace in self.metadata and name in self.metadata[namespace]:
+            self.metadata[namespace][name] = [(value, others)]
+        else:
+            self.add_metadata(namespace, name, value, others)
+
+    def add_item(self, item):
+        """
+        Add additional item to the book. If not defined, media type and chapter id will be defined
+        for the item.
+
+        :Args:
+          - item: Item instance
+
+        :Returns:
+          Returns the item which was added.
+        """
+        if item.media_type == '':
+            # NOTE(review): guess_type comes from ebooklib.utils, which is not
+            # visible here. If it mirrors mimetypes.guess_type, the second value
+            # is the content encoding and not a media type - confirm.
+            (has_guessed, media_type) = guess_type(item.get_name().lower())
+
+            if has_guessed:
+                if media_type is not None:
+                    item.media_type = media_type
+                else:
+                    item.media_type = has_guessed
+            else:
+                item.media_type = 'application/octet-stream'
+
+        if not item.get_id():
+            # make chapter_, image_ and static_ configurable
+            if isinstance(item, EpubHtml):
+                item.id = 'chapter_%d' % self._id_html
+                self._id_html += 1
+            elif isinstance(item, EpubImage):
+                item.id = 'image_%d' % self._id_image
+                self._id_image += 1
+            else:
+                # static items have their own counter, they must not consume
+                # the numbers of the images
+                item.id = 'static_%d' % self._id_static
+                self._id_static += 1
+
+        item.book = self
+        self.items.append(item)
+
+        return item
+
+    def get_item_with_id(self, uid):
+        """
+        Returns item for defined UID.
+
+        >>> book.get_item_with_id('image_001')
+
+        :Args:
+          - uid: UID for the item
+
+        :Returns:
+          Returns item object. Returns None if nothing was found.
+        """
+        for item in self.get_items():
+            if item.id == uid:
+                return item
+
+        return None
+
+    def get_item_with_href(self, href):
+        """
+        Returns item for defined HREF.
+
+        >>> book.get_item_with_href('EPUB/document.xhtml')
+
+        :Args:
+          - href: HREF for the item we are searching for
+
+        :Returns:
+          Returns item object. Returns None if nothing was found.
+        """
+        for item in self.get_items():
+            if item.get_name() == href:
+                return item
+
+        return None
+
+    def get_items(self):
+        """
+        Returns all items attached to this book.
+
+        :Returns:
+          Returns a generator over all items.
+        """
+        return (item for item in self.items)
+
+    def get_items_of_type(self, item_type):
+        """
+        Returns all items of specified type.
+
+        >>> book.get_items_of_type(epub.ITEM_IMAGE)
+
+        :Args:
+          - item_type: Type for items we are searching for
+
+        :Returns:
+          Returns a generator over the found items.
+        """
+        return (item for item in self.items if item.get_type() == item_type)
+
+    def get_items_of_media_type(self, media_type):
+        """
+        Returns all items of specified media type.
+
+        :Args:
+          - media_type: Media type for items we are searching for
+
+        :Returns:
+          Returns a generator over the found items.
+        """
+        return (item for item in self.items if item.media_type == media_type)
+
+    def set_template(self, name, value):
+        """
+        Defines templates which are used to generate certain types of pages. When defining new value for the template
+        we have to use content of type 'str' (Python 2) or 'bytes' (Python 3).
+
+        At the moment we use these templates:
+          - ncx
+          - nav
+          - chapter
+          - cover
+
+        :Args:
+          - name: Name for the template
+          - value: Content for the template
+        """
+
+        self.templates[name] = value
+
+    def get_template(self, name):
+        """
+        Returns value for the template.
+
+        :Args:
+          - name: template name
+
+        :Returns:
+          Value of the template. Returns None if the template is not defined.
+        """
+        return self.templates.get(name)
+
+    def add_prefix(self, name, uri):
+        """
+        Appends custom prefix to be added to the content.opf document
+
+        >>> epub_book.add_prefix('bkterms', 'http://booktype.org/')
+
+        :Args:
+          - name: namespace name
+          - uri: URI for the namespace
+        """
+
+        self.prefixes.append('%s: %s' % (name, uri))
+
+
+class EpubWriter(object):
+ DEFAULT_OPTIONS = {
+ 'epub2_guide': True,
+ 'epub3_landmark': True,
+ 'landmark_title': 'Guide',
+ 'spine_direction': True,
+ 'package_direction': False
+ }
+
+ def __init__(self, name, book, options=None):
+ self.file_name = name
+ self.book = book
+
+ self.options = dict(self.DEFAULT_OPTIONS)
+ if options:
+ self.options.update(options)
+
+ def process(self):
+ # should cache this html parsing so we don't do it for every plugin
+ for plg in self.options.get('plugins', []):
+ if hasattr(plg, 'before_write'):
+ plg.before_write(self.book)
+
+ for item in self.book.get_items():
+ if isinstance(item, EpubHtml):
+ for plg in self.options.get('plugins', []):
+ if hasattr(plg, 'html_before_write'):
+ plg.html_before_write(self.book, item)
+
+ def _write_container(self):
+ container_xml = CONTAINER_XML % {'folder_name': self.book.FOLDER_NAME}
+ self.out.writestr(CONTAINER_PATH, container_xml)
+
+ def _write_opf_metadata(self, root):
+ # This is really not needed
+ # problem is uppercase/lowercase
+ # for ns_name, values in six.iteritems(self.book.metadata):
+ # if ns_name:
+ # for n_id, ns_url in six.iteritems(NAMESPACES):
+ # if ns_name == ns_url:
+ # nsmap[n_id.lower()] = NAMESPACES[n_id]
+
+ nsmap = {'dc': NAMESPACES['DC'], 'opf': NAMESPACES['OPF']}
+ nsmap.update(self.book.namespaces)
+
+ metadata = etree.SubElement(root, 'metadata', nsmap=nsmap)
+
+ el = etree.SubElement(metadata, 'meta', {'property': 'dcterms:modified'})
+ if 'mtime' in self.options:
+ mtime = self.options['mtime']
+ else:
+ import datetime
+ mtime = datetime.datetime.now()
+ el.text = mtime.strftime('%Y-%m-%dT%H:%M:%SZ')
+
+ for ns_name, values in six.iteritems(self.book.metadata):
+ if ns_name == NAMESPACES['OPF']:
+ for values in values.values():
+ for v in values:
+ if 'property' in v[1] and v[1]['property'] == 'dcterms:modified':
+ continue
+ try:
+ el = etree.SubElement(metadata, 'meta', v[1])
+ if v[0]:
+ el.text = v[0]
+ except ValueError:
+ logging.error('Could not create metadata.')
+ else:
+ for name, values in six.iteritems(values):
+ for v in values:
+ try:
+ if ns_name:
+ el = etree.SubElement(metadata, '{%s}%s' % (ns_name, name), v[1])
+ else:
+ el = etree.SubElement(metadata, '%s' % name, v[1])
+
+ el.text = v[0]
+ except ValueError:
+ logging.error('Could not create metadata "{}".'.format(name))
+
+ def _write_opf_manifest(self, root):
+ manifest = etree.SubElement(root, 'manifest')
+ _ncx_id = None
+
+ # mathml, scripted, svg, remote-resources, and switch
+ # nav
+ # cover-image
+
+ for item in self.book.get_items():
+ if not item.manifest:
+ continue
+
+ if isinstance(item, EpubNav):
+ etree.SubElement(manifest, 'item', {'href': item.get_name(),
+ 'id': item.id,
+ 'media-type': item.media_type,
+ 'properties': 'nav'})
+ elif isinstance(item, EpubNcx):
+ _ncx_id = item.id
+ etree.SubElement(manifest, 'item', {'href': item.file_name,
+ 'id': item.id,
+ 'media-type': item.media_type})
+
+ elif isinstance(item, EpubCover):
+ etree.SubElement(manifest, 'item', {'href': item.file_name,
+ 'id': item.id,
+ 'media-type': item.media_type,
+ 'properties': 'cover-image'})
+ else:
+ opts = {'href': item.file_name,
+ 'id': item.id,
+ 'media-type': item.media_type}
+
+ if hasattr(item, 'properties') and len(item.properties) > 0:
+ opts['properties'] = ' '.join(item.properties)
+
+ etree.SubElement(manifest, 'item', opts)
+
+ return _ncx_id
+
+ def _write_opf_spine(self, root, ncx_id):
+ spine_attributes = {'toc': ncx_id or 'ncx'}
+ if self.book.direction and self.options['spine_direction']:
+ spine_attributes['page-progression-direction'] = self.book.direction
+
+ spine = etree.SubElement(root, 'spine', spine_attributes)
+
+ for _item in self.book.spine:
+ # this is for now
+ # later we should be able to fetch things from tuple
+
+ is_linear = True
+
+ if isinstance(_item, tuple):
+ item = _item[0]
+
+ if len(_item) > 1:
+ if _item[1] == 'no':
+ is_linear = False
+ else:
+ item = _item
+
+ if isinstance(item, EpubHtml):
+ opts = {'idref': item.get_id()}
+
+ if not item.is_linear or not is_linear:
+ opts['linear'] = 'no'
+ elif isinstance(item, EpubItem):
+ opts = {'idref': item.get_id()}
+
+ if not item.is_linear or not is_linear:
+ opts['linear'] = 'no'
+ else:
+ opts = {'idref': item}
+
+ try:
+ itm = self.book.get_item_with_id(item)
+
+ if not itm.is_linear or not is_linear:
+ opts['linear'] = 'no'
+ except:
+ pass
+
+ etree.SubElement(spine, 'itemref', opts)
+
+ def _write_opf_guide(self, root):
+ # - http://www.idpf.org/epub/20/spec/OPF_2.0.1_draft.htm#Section2.6
+
+ if len(self.book.guide) > 0 and self.options.get('epub2_guide'):
+ guide = etree.SubElement(root, 'guide', {})
+
+ for item in self.book.guide:
+ if 'item' in item:
+ chap = item.get('item')
+ if chap:
+ _href = chap.file_name
+ _title = chap.title
+ else:
+ _href = item.get('href', '')
+ _title = item.get('title', '')
+
+ if _title is None:
+ _title = ''
+ ref = etree.SubElement(guide, 'reference', {'type': item.get('type', ''),
+ 'title': _title,
+ 'href': _href})
+
+ def _write_opf_bindings(self, root):
+ if len(self.book.bindings) > 0:
+ bindings = etree.SubElement(root, 'bindings', {})
+ for item in self.book.bindings:
+ etree.SubElement(bindings, 'mediaType', item)
+
+ def _write_opf_file(self, root):
+ tree_str = etree.tostring(root, pretty_print=True, encoding='utf-8', xml_declaration=True)
+
+ self.out.writestr('%s/content.opf' % self.book.FOLDER_NAME, tree_str)
+
+ def _write_opf(self):
+ package_attributes = {'xmlns': NAMESPACES['OPF'],
+ 'unique-identifier': self.book.IDENTIFIER_ID,
+ 'version': '3.0'}
+ if self.book.direction and self.options['package_direction']:
+ package_attributes['dir'] = self.book.direction
+
+ root = etree.Element('package', package_attributes)
+
+ prefixes = ['rendition: http://www.idpf.org/vocab/rendition/#'] + self.book.prefixes
+ root.attrib['prefix'] = ' '.join(prefixes)
+
+ # METADATA
+ self._write_opf_metadata(root)
+
+ # MANIFEST
+ _ncx_id = self._write_opf_manifest(root)
+
+ # SPINE
+ self._write_opf_spine(root, _ncx_id)
+
+ # GUIDE
+ self._write_opf_guide(root)
+
+ # BINDINGS
+ self._write_opf_bindings(root)
+
+ # WRITE FILE
+ self._write_opf_file(root)
+
+ def _get_nav(self, item):
+ # just a basic navigation for now
+ nav_xml = parse_string(self.book.get_template('nav'))
+ root = nav_xml.getroot()
+
+ root.set('lang', self.book.language)
+ root.attrib['{%s}lang' % NAMESPACES['XML']] = self.book.language
+
+ nav_dir_name = os.path.dirname(item.file_name)
+
+ head = etree.SubElement(root, 'head')
+ title = etree.SubElement(head, 'title')
+ title.text = self.book.title
+
+ # for now this just handles css files and ignores others
+ for _link in item.links:
+ _lnk = etree.SubElement(head, 'link', {
+ 'href': _link.get('href', ''), 'rel': 'stylesheet', 'type': 'text/css'
+ })
+
+ body = etree.SubElement(root, 'body')
+ nav = etree.SubElement(body, 'nav', {'{%s}type' % NAMESPACES['EPUB']: 'toc', 'id': 'id'})
+
+ content_title = etree.SubElement(nav, 'h2')
+ content_title.text = self.book.title
+
+ def _create_section(itm, items):
+ ol = etree.SubElement(itm, 'ol')
+ for item in items:
+ if isinstance(item, tuple) or isinstance(item, list):
+ li = etree.SubElement(ol, 'li')
+ if isinstance(item[0], EpubHtml):
+ a = etree.SubElement(li, 'a', {'href': os.path.relpath(item[0].file_name, nav_dir_name)})
+ elif isinstance(item[0], Section) and item[0].href != '':
+ a = etree.SubElement(li, 'a', {'href': os.path.relpath(item[0].href, nav_dir_name)})
+ elif isinstance(item[0], Link):
+ a = etree.SubElement(li, 'a', {'href': os.path.relpath(item[0].href, nav_dir_name)})
+ else:
+ a = etree.SubElement(li, 'span')
+ a.text = item[0].title
+
+ _create_section(li, item[1])
+
+ elif isinstance(item, Link):
+ li = etree.SubElement(ol, 'li')
+ a = etree.SubElement(li, 'a', {'href': os.path.relpath(item.href, nav_dir_name)})
+ a.text = item.title
+ elif isinstance(item, EpubHtml):
+ li = etree.SubElement(ol, 'li')
+ a = etree.SubElement(li, 'a', {'href': os.path.relpath(item.file_name, nav_dir_name)})
+ a.text = item.title
+
+ _create_section(nav, self.book.toc)
+
+ # LANDMARKS / GUIDE
+ # - http://www.idpf.org/epub/30/spec/epub30-contentdocs.html#sec-xhtml-nav-def-types-landmarks
+
+ if len(self.book.guide) > 0 and self.options.get('epub3_landmark'):
+
+ # Epub2 guide types do not map completely to epub3 landmark types.
+ guide_to_landscape_map = {
+ 'notes': 'rearnotes',
+ 'text': 'bodymatter'
+ }
+
+ guide_nav = etree.SubElement(body, 'nav', {'{%s}type' % NAMESPACES['EPUB']: 'landmarks'})
+
+ guide_content_title = etree.SubElement(guide_nav, 'h2')
+ guide_content_title.text = self.options.get('landmark_title', 'Guide')
+
+ guild_ol = etree.SubElement(guide_nav, 'ol')
+
+ for elem in self.book.guide:
+ li_item = etree.SubElement(guild_ol, 'li')
+
+ if 'item' in elem:
+ chap = elem.get('item', None)
+ if chap:
+ _href = chap.file_name
+ _title = chap.title
+ else:
+ _href = elem.get('href', '')
+ _title = elem.get('title', '')
+
+ guide_type = elem.get('type', '')
+ a_item = etree.SubElement(li_item, 'a', {
+ '{%s}type' % NAMESPACES['EPUB']: guide_to_landscape_map.get(guide_type, guide_type),
+ 'href': os.path.relpath(_href, nav_dir_name)
+ })
+ a_item.text = _title
+
+ tree_str = etree.tostring(nav_xml, pretty_print=True, encoding='utf-8', xml_declaration=True)
+
+ return tree_str
+
def _get_ncx(self):
    """
    Build the NCX navigation document for the book and return it serialized.

    The tree is parsed from the book's 'ncx' template and then extended with
    a head element (dtb:* meta entries), a docTitle and a navMap generated
    from self.book.toc.

    :Returns:
      Output of etree.tostring() for the NCX root (UTF-8, pretty printed,
      with XML declaration).
    """
    # we should be able to setup language for NCX as also
    ncx_root = parse_string(self.book.get_template('ncx')).getroot()

    head = etree.SubElement(ncx_root, 'head')

    # get this id
    head_entries = (
        ('dtb:uid', self.book.uid),
        ('dtb:depth', '0'),
        ('dtb:totalPageCount', '0'),
        ('dtb:maxPageNumber', '0'),
    )
    for meta_name, meta_content in head_entries:
        etree.SubElement(head, 'meta', {'content': meta_content, 'name': meta_name})

    doc_title = etree.SubElement(ncx_root, 'docTitle')
    etree.SubElement(doc_title, 'text').text = self.book.title

    # NOTE: no docAuthor element is written.

    # For now just make a very simple navMap
    nav_map = etree.SubElement(ncx_root, 'navMap')

    def _add_nav_point(parent, point_id, label, src):
        # navPoint > (navLabel > text), content
        point = etree.SubElement(parent, 'navPoint', {'id': point_id})
        nav_label = etree.SubElement(point, 'navLabel')
        etree.SubElement(nav_label, 'text').text = label
        etree.SubElement(point, 'content', {'src': src})
        return point

    def _fill_parent_src(parent, src):
        # A section written with an empty src borrows the target of its
        # first child entry.
        parent_content = parent.find('content')
        if parent_content is not None and parent_content.get('src') == '':
            parent_content.set('src', src)

    def _create_section(parent, entries, uid):
        # uid is a running counter used for the ids of sections that are
        # not EpubHtml instances; the updated value is handed back.
        for entry in entries:
            if isinstance(entry, (tuple, list)):
                section, subsection = entry[0], entry[1]

                if isinstance(section, EpubHtml):
                    point_id = section.get_id()
                else:
                    point_id = 'sep_%d' % uid

                # CAN NOT HAVE EMPTY SRC HERE
                href = ''
                if isinstance(section, EpubHtml):
                    href = section.file_name
                elif isinstance(section, Section) and section.href != '':
                    href = section.href
                elif isinstance(section, Link):
                    href = section.href

                point = _add_nav_point(parent, point_id, section.title, href)
                uid = _create_section(point, subsection, uid + 1)
            elif isinstance(entry, Link):
                _fill_parent_src(parent, entry.href)
                _add_nav_point(parent, entry.uid, entry.title, entry.href)
            elif isinstance(entry, EpubHtml):
                _fill_parent_src(parent, entry.file_name)
                _add_nav_point(parent, entry.get_id(), entry.title, entry.file_name)

        return uid

    _create_section(nav_map, self.book.toc, 0)

    return etree.tostring(ncx_root, pretty_print=True, encoding='utf-8', xml_declaration=True)
+
def _write_items(self):
    """
    Write every item of the book into the open archive (self.out).

    NCX and navigation items are generated by the writer itself; all other
    items contribute their own content. Items that are neither NCX/nav nor
    flagged as manifest items are stored under their bare file name instead
    of inside the book folder.
    """
    for item in self.book.get_items():
        is_ncx = isinstance(item, EpubNcx)
        is_nav = (not is_ncx) and isinstance(item, EpubNav)

        if is_ncx or is_nav or item.manifest:
            arcname = '%s/%s' % (self.book.FOLDER_NAME, item.file_name)
        else:
            arcname = '%s' % item.file_name

        if is_ncx:
            payload = self._get_ncx()
        elif is_nav:
            payload = self._get_nav(item)
        else:
            payload = item.get_content()

        self.out.writestr(arcname, payload)
+
def write(self):
    """
    Create the output archive and write the whole book into it.

    The 'mimetype' entry is written first and stored uncompressed; the
    container file, the OPF file and the items follow. The archive is
    closed even when one of the writing steps raises, so the file handle
    is never leaked.
    """
    # check for the option allowZip64
    self.out = zipfile.ZipFile(self.file_name, 'w', zipfile.ZIP_DEFLATED)

    try:
        self.out.writestr('mimetype', 'application/epub+zip', compress_type=zipfile.ZIP_STORED)

        self._write_container()
        self._write_opf()
        self._write_items()
    finally:
        # always release the archive, also on failure
        self.out.close()
+
+
class EpubReader(object):
    """
    Reads an EPUB archive and fills an EpubBook with its content.

    load() opens the archive, locates the OPF file through
    META-INF/container.xml and then reads metadata, manifest, spine, guide
    and the table of contents. process() afterwards runs the read hooks of
    the plugins given in the options.
    """
    DEFAULT_OPTIONS = {}

    def __init__(self, epub_file_name, options=None):
        """
        :Args:
          - epub_file_name: value handed to zipfile.ZipFile when loading
          - options: extra options as dictionary (optional), merged over DEFAULT_OPTIONS
        """
        self.file_name = epub_file_name
        self.book = EpubBook()
        self.zf = None

        self.opf_file = ''
        self.opf_dir = ''

        self.options = dict(self.DEFAULT_OPTIONS)
        if options:
            self.options.update(options)

    def process(self):
        "Run the after_read / html_after_read hooks of all configured plugins."
        # should cache this html parsing so we don't do it for every plugin
        for plg in self.options.get('plugins', []):
            if hasattr(plg, 'after_read'):
                plg.after_read(self.book)

        for item in self.book.get_items():
            if isinstance(item, EpubHtml):
                for plg in self.options.get('plugins', []):
                    if hasattr(plg, 'html_after_read'):
                        plg.html_after_read(self.book, item)

    def load(self):
        "Read the archive and return the populated EpubBook."
        self._load()

        return self.book

    def read_file(self, name):
        """
        Return the raw content of one archive member.

        Raises KeyError when the member does not exist.
        """
        # Archive member names always use forward slashes, so normalise with
        # the POSIX rules; os.path.normpath would produce backslashes on Windows.
        name = zip_path.normpath(name)
        return self.zf.read(name)

    def _load_container(self):
        "Find the OPF package file announced in META-INF/container.xml."
        meta_inf = self.read_file('META-INF/container.xml')
        tree = parse_string(meta_inf)

        for root_file in tree.findall('//xmlns:rootfile[@media-type]', namespaces={'xmlns': NAMESPACES['CONTAINERNS']}):
            if root_file.get('media-type') == 'application/oebps-package+xml':
                self.opf_file = root_file.get('full-path')
                self.opf_dir = zip_path.dirname(self.opf_file)

    def _load_metadata(self):
        "Read version, identifier and all metadata elements from the OPF file."
        container_root = self.container.getroot()

        # get epub version
        self.book.version = container_root.get('version', None)

        # get unique-identifier
        if container_root.get('unique-identifier', None):
            self.book.IDENTIFIER_ID = container_root.get('unique-identifier')

        # get xml:lang
        # get metadata
        metadata = self.container.find('{%s}%s' % (NAMESPACES['OPF'], 'metadata'))

        nsmap = metadata.nsmap
        nstags = dict((k, '{%s}' % v) for k, v in nsmap.items())
        default_ns = nstags.get(None, '')

        # namespace -> {tag: [(value, attributes), ...]}
        nsdict = dict((v, {}) for v in nsmap.values())

        def add_item(ns, tag, value, extra):
            if ns not in nsdict:
                nsdict[ns] = {}

            values = nsdict[ns].setdefault(tag, [])
            values.append((value, extra))

        for t in metadata:
            if not etree.iselement(t) or t.tag is etree.Comment:
                continue

            others = dict(t.items())

            if t.tag == default_ns + 'meta':
                name = t.get('name')

                # a name such as "prefix:key" is filed under that prefix's namespace
                if name and ':' in name:
                    prefix, name = name.split(':', 1)
                else:
                    prefix = None

                add_item(t.nsmap.get(prefix, prefix), name, t.text, others)
            else:
                tag = t.tag[t.tag.rfind('}') + 1:]

                if (t.prefix and t.prefix.lower() == 'dc') and tag == 'identifier':
                    _id = t.get('id', None)

                    if _id:
                        self.book.IDENTIFIER_ID = _id

                add_item(t.nsmap[t.prefix], tag, t.text, others)

        self.book.metadata = nsdict

        titles = self.book.get_metadata('DC', 'title')
        if len(titles) > 0:
            self.book.title = titles[0][0]

        for value, others in self.book.get_metadata('DC', 'identifier'):
            if others.get('id') == self.book.IDENTIFIER_ID:
                self.book.uid = value

    def _load_manifest(self):
        "Create one book item for every item element of the OPF manifest."
        for r in self.container.find('{%s}%s' % (NAMESPACES['OPF'], 'manifest')):
            if r is not None and r.tag != '{%s}item' % NAMESPACES['OPF']:
                continue

            media_type = r.get('media-type')
            _properties = r.get('properties', '')

            if _properties:
                properties = _properties.split(' ')
            else:
                properties = []

            # people use wrong content types
            if media_type == 'image/jpg':
                media_type = 'image/jpeg'

            if media_type == 'application/x-dtbncx+xml':
                ei = EpubNcx(uid=r.get('id'), file_name=unquote(r.get('href')))

                ei.content = self.read_file(zip_path.join(self.opf_dir, ei.file_name))
            elif media_type == 'application/xhtml+xml':
                if 'nav' in properties:
                    ei = EpubNav(uid=r.get('id'), file_name=unquote(r.get('href')))

                    # read through the unquoted name, like every other branch does;
                    # the raw href may contain percent escapes
                    ei.content = self.read_file(zip_path.join(self.opf_dir, ei.file_name))
                elif 'cover' in properties:
                    ei = EpubCoverHtml()

                    ei.content = self.read_file(zip_path.join(self.opf_dir, unquote(r.get('href'))))
                else:
                    ei = EpubHtml()

                    ei.id = r.get('id')
                    ei.file_name = unquote(r.get('href'))
                    ei.media_type = media_type
                    ei.content = self.read_file(zip_path.join(self.opf_dir, ei.get_name()))
                    ei.properties = properties
            elif media_type in IMAGE_MEDIA_TYPES:
                if 'cover-image' in properties:
                    ei = EpubCover(uid=r.get('id'), file_name=unquote(r.get('href')))

                    ei.media_type = media_type
                    ei.content = self.read_file(zip_path.join(self.opf_dir, ei.get_name()))
                else:
                    ei = EpubImage()

                    ei.id = r.get('id')
                    ei.file_name = unquote(r.get('href'))
                    ei.media_type = media_type
                    ei.content = self.read_file(zip_path.join(self.opf_dir, ei.get_name()))
            else:
                # different types
                ei = EpubItem()

                ei.id = r.get('id')
                ei.file_name = unquote(r.get('href'))
                ei.media_type = media_type

                ei.content = self.read_file(zip_path.join(self.opf_dir, ei.get_name()))

            self.book.add_item(ei)

    def _parse_ncx(self, data):
        """
        Build self.book.toc from the navMap of an NCX document.

        A navPoint with nested navPoints becomes a (Section, children) tuple,
        a navPoint without them becomes a Link. The toc is always a list; it
        is empty when the document has no navMap or the navMap has no entries.
        """
        tree = parse_string(data)
        tree_root = tree.getroot()

        nav_map = tree_root.find('{%s}navMap' % NAMESPACES['DAISY'])

        if nav_map is None:
            self.book.toc = []
            return

        label_tag = '{%s}navLabel' % NAMESPACES['DAISY']
        content_tag = '{%s}content' % NAMESPACES['DAISY']
        point_tag = '{%s}navPoint' % NAMESPACES['DAISY']

        def _get_children(elems, n, nid):
            label, content = '', ''
            children = []

            for a in elems:
                if a.tag == label_tag:
                    # label text lives in the first child of navLabel
                    if len(a) > 0:
                        label = a[0].text
                if a.tag == content_tag:
                    content = a.get('src', '')
                if a.tag == point_tag:
                    children.append(_get_children(a, n + 1, a.get('id', '')))

            # the navMap itself (level 0) is only a container for its entries
            if n == 0:
                return children

            if len(children) > 0:
                return (Section(label, href=content),
                        children)

            return Link(content, label, nid)

        self.book.toc = _get_children(nav_map, 0, '')

    def _parse_nav(self, data, base_path):
        """
        Build self.book.toc from the toc nav of a navigation document.

        Hrefs are resolved against base_path. The toc is left untouched when
        the document has no toc nav or that nav has no ordered list.
        """
        html_node = parse_html_string(data)
        nav_nodes = html_node.xpath("//nav[@*='toc']")

        if not nav_nodes:
            return

        top_list = nav_nodes[0].find('ol')

        if top_list is None:
            return

        def parse_list(list_node):
            items = []

            for item_node in list_node.findall('li'):

                sublist_node = item_node.find('ol')
                link_node = item_node.find('a')

                if sublist_node is not None:
                    title = item_node[0].text
                    children = parse_list(sublist_node)

                    if link_node is not None:
                        href = zip_path.normpath(zip_path.join(base_path, link_node.get('href')))
                        items.append((Section(title, href=href), children))
                    else:
                        items.append((Section(title), children))
                elif link_node is not None:
                    title = link_node.text
                    href = zip_path.normpath(zip_path.join(base_path, link_node.get('href')))

                    items.append(Link(href, title))

            return items

        self.book.toc = parse_list(top_list)

    def _load_spine(self):
        """
        Read the spine and, when it names a toc item, parse that NCX file.

        Raises EpubException when the NCX file can not be found.
        """
        spine = self.container.find('{%s}%s' % (NAMESPACES['OPF'], 'spine'))

        self.book.spine = [(t.get('idref'), t.get('linear', 'yes')) for t in spine]

        toc = spine.get('toc', '')
        self.book.set_direction(spine.get('page-progression-direction', None))

        # should read ncx or nav file
        if toc:
            try:
                ncx_item = self.book.get_item_with_id(toc)

                # the toc attribute may point to an id that is not in the manifest
                if ncx_item is None:
                    raise KeyError(toc)

                ncx_file = self.read_file(zip_path.join(self.opf_dir, ncx_item.get_name()))
            except KeyError:
                raise EpubException(-1, 'Can not find ncx file.')

            self._parse_ncx(ncx_file)

    def _load_guide(self):
        "Copy href, title and type of every guide entry into self.book.guide."
        guide = self.container.find('{%s}%s' % (NAMESPACES['OPF'], 'guide'))
        if guide is not None:
            self.book.guide = [{'href': t.get('href'), 'title': t.get('title'), 'type': t.get('type')} for t in guide]

    def _load_opf_file(self):
        """
        Parse the OPF file and load everything it describes.

        Raises EpubException when the OPF file is missing from the archive.
        """
        try:
            s = self.read_file(self.opf_file)
        except KeyError:
            raise EpubException(-1, 'Can not find container file')

        self.container = parse_string(s)

        self._load_metadata()
        self._load_manifest()
        self._load_spine()
        self._load_guide()

        # read nav file if found
        #
        if not self.book.toc:
            nav_item = next((item for item in self.book.items if isinstance(item, EpubNav)), None)
            if nav_item:
                self._parse_nav(nav_item.content, zip_path.dirname(nav_item.file_name))

    def _load(self):
        """
        Open the archive, read container and OPF file, close the archive.

        Raises EpubException for a bad or too large zip file. The archive is
        closed also when reading fails.
        """
        try:
            self.zf = zipfile.ZipFile(self.file_name, 'r', compression=zipfile.ZIP_DEFLATED, allowZip64=True)
        except zipfile.BadZipfile:
            raise EpubException(0, 'Bad Zip file')
        except zipfile.LargeZipFile:
            raise EpubException(1, 'Large Zip file')

        try:
            # 1st check metadata
            self._load_container()
            self._load_opf_file()
        finally:
            self.zf.close()
+
+
+# WRITE
+
def write_epub(name, book, options=None):
    """
    Creates epub file with the content defined in EpubBook.

    >>> ebooklib.write_epub('book.epub', book)

    An IOError raised while writing does not propagate to the caller; it is
    reported through the logging module instead of being dropped silently.

    :Args:
      - name: file name for the output file
      - book: instance of EpubBook
      - options: extra options as dictionary (optional)
    """
    import logging

    epub = EpubWriter(name, book, options)

    epub.process()

    try:
        epub.write()
    except IOError:
        # stay best-effort for existing callers, but leave a trace of the failure
        logging.getLogger(__name__).warning('Could not write epub file %r', name, exc_info=True)
+
+# READ
+
+
def read_epub(name, options=None):
    """
    Read the given epub file and return its content as an EpubBook.

    >>> book = ebooklib.read_epub('book.epub')

    The book is loaded first; the reader's plugin hooks run afterwards.

    :Args:
      - name: full path to the input file
      - options: extra options as dictionary (optional)

    :Returns:
      Instance of EpubBook.
    """
    epub_reader = EpubReader(name, options)

    loaded_book = epub_reader.load()
    epub_reader.process()

    return loaded_book
diff --git a/ebooklib/plugins/__init__.py b/ebooklib/plugins/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/ebooklib/plugins/base.py b/ebooklib/plugins/base.py
new file mode 100644
index 0000000..6351f81
--- /dev/null
+++ b/ebooklib/plugins/base.py
@@ -0,0 +1,49 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib. If not, see http://www.gnu.org/licenses/.
+
+
class BasePlugin(object):
    """
    Base class for plugins.

    Every hook defined here does nothing and returns True; a plugin
    overrides the hooks it is interested in.
    """

    def before_write(self, book):
        "Processing before save"
        return True

    def after_write(self, book):
        "Processing after save"
        return True

    def before_read(self, book):
        "Processing before read"
        return True

    def after_read(self, book):
        "Processing after read"
        return True

    def item_after_read(self, book, item):
        "Process general item after read."
        return True

    def item_before_write(self, book, item):
        "Process general item before write."
        return True

    def html_after_read(self, book, chapter):
        "Processing HTML after read."
        return True

    def html_before_write(self, book, chapter):
        "Processing HTML before save."
        return True
diff --git a/ebooklib/plugins/booktype.py b/ebooklib/plugins/booktype.py
new file mode 100644
index 0000000..5518258
--- /dev/null
+++ b/ebooklib/plugins/booktype.py
@@ -0,0 +1,119 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib. If not, see http://www.gnu.org/licenses/.
+
+from ebooklib.plugins.base import BasePlugin
+from ebooklib.utils import parse_html_string
+
class BooktypeLinks(BasePlugin):
    """
    Rewrites the links of a chapter before it is written.

    Footnote links (href containing 'InsertNoteID') are reduced to their
    fragment. Other links without a scheme get '.xhtml' appended to their
    path, and a 'name' attribute on such a link is turned into an 'id'.
    """
    NAME = 'Booktype Links'

    def __init__(self, booktype_book):
        self.booktype_book = booktype_book

    def html_before_write(self, book, chapter):
        """
        Rewrite the links in chapter.content.

        Returns without changing the chapter when its content can not be parsed.
        """
        from lxml import etree

        try:
            from urlparse import urlparse, urljoin
        except ImportError:
            from urllib.parse import urlparse, urljoin

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            # best effort: leave content that can not be parsed as it is
            return

        body = tree.find('body')

        if body is not None and len(body) != 0:
            # should also be aware to handle
            # ../chapter/
            # ../chapter/#reference
            # ../chapter#reference

            for _link in body.xpath('//a'):
                _href = _link.get('href', '')

                # This is just temporary for the footnotes
                if _href.find('InsertNoteID') != -1:
                    i = _href.find('#')

                    # keep only the fragment; without a '#' there is nothing to cut
                    if i != -1:
                        _link.set('href', _href[i:])

                    continue

                _u = urlparse(_href)

                # Let us care only for internal links at the moment
                if _u.scheme == '':
                    if _u.path != '':
                        _link.set('href', '%s.xhtml' % _u.path)

                    if _u.fragment != '':
                        _link.set('href', urljoin(_link.get('href'), '#%s' % _u.fragment))

                    if _link.get('name') is not None:
                        _link.set('id', _link.get('name'))
                        etree.strip_attributes(_link, 'name')

        chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8')
+
+
+
+
class BooktypeFootnotes(BasePlugin):
    """
    Converts Booktype footnote markup of a chapter before it is written.

    For every span with class 'InsertNoteMarker' the nested link is marked
    as epub:type 'noteref' and an aside with epub:type 'footnote' holding
    the text of the matching list item is appended to the body. The old
    note list (ol with id 'InsertNote_NoteList') is removed afterwards.
    """
    NAME = 'Booktype Footnotes'

    def __init__(self, booktype_book):
        self.booktype_book = booktype_book

    def html_before_write(self, book, chapter):
        """
        Convert the footnotes in chapter.content.

        Returns without changing the chapter when its content can not be parsed.
        """
        from lxml import etree

        from ebooklib import epub

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            # best effort: leave content that can not be parsed as it is
            return

        body = tree.find('body')

        if body is not None and len(body) != 0:
            epub_type = '{%s}type' % epub.NAMESPACES['EPUB']

            for footnote in body.xpath('//span[@class="InsertNoteMarker"]'):
                marker_id = footnote.get('id')

                # a marker without id or without the nested link can not be converted
                if not marker_id or len(footnote) == 0 or len(footnote[0]) == 0:
                    continue

                # the note id is the marker id without its last 8 characters
                # NOTE(review): presumably a '_markerN' suffix with a single digit - confirm
                footnote_id = marker_id[:-8]
                a = footnote[0][0]

                footnote_items = body.xpath('//li[@id="%s"]' % footnote_id)

                if not footnote_items:
                    continue

                footnote_text = footnote_items[0]

                a.attrib[epub_type] = 'noteref'
                ftn = etree.SubElement(body, 'aside', {'id': footnote_id})
                ftn.attrib[epub_type] = 'footnote'
                ftn_p = etree.SubElement(ftn, 'p')
                ftn_p.text = footnote_text.text

            old_footnote = body.xpath('//ol[@id="InsertNote_NoteList"]')
            if len(old_footnote) > 0:
                # the list is not necessarily a direct child of body,
                # so remove it from its actual parent
                old_footnote[0].getparent().remove(old_footnote[0])

        chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8')
diff --git a/ebooklib/plugins/sourcecode.py b/ebooklib/plugins/sourcecode.py
new file mode 100644
index 0000000..4f973a2
--- /dev/null
+++ b/ebooklib/plugins/sourcecode.py
@@ -0,0 +1,68 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib. If not, see http://www.gnu.org/licenses/.
+
+from ebooklib.plugins.base import BasePlugin
+from ebooklib.utils import parse_html_string
+
class SourceHighlighter(BasePlugin):
    """
    Highlights embedded source code of a chapter with pygments.

    Handles pre elements whose class contains 'source-python' or
    'source-css'; pre elements with any other 'source-' class are left
    untouched.
    """

    def __init__(self):
        pass

    def html_before_write(self, book, chapter):
        """
        Replace the supported source blocks in chapter.content with
        highlighted markup.

        The chapter gets a link to style/code.css and new content only when
        at least one block was highlighted. Returns without changing the
        chapter when its content can not be parsed.
        """
        from lxml import etree, html

        from pygments import highlight
        from pygments.formatters import HtmlFormatter

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            # best effort: leave content that can not be parsed as it is
            return

        body = tree.find('body')

        had_source = False

        if body is not None and len(body) != 0:
            # check for embeded source
            for source in body.xpath('//pre[contains(@class,"source-")]'):
                css_class = source.get('class')

                lexer = None

                if 'source-python' in css_class:
                    from pygments.lexers import PythonLexer

                    lexer = PythonLexer()

                if 'source-css' in css_class:
                    from pygments.lexers import CssLexer

                    lexer = CssLexer()

                # unknown language: do not reuse the result of an earlier block
                if lexer is None:
                    continue

                # serialise children as text so they can be joined with source.text
                source_text = (source.text or '') + ''.join(
                    html.tostring(child, encoding='unicode') for child in source.iterchildren())

                _text = highlight(source_text, lexer, HtmlFormatter())

                _parent = source.getparent()
                _parent.replace(source, etree.XML(_text))

                had_source = True

        if had_source:
            chapter.add_link(href="style/code.css", rel="stylesheet", type="text/css")
            chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8')
+
diff --git a/ebooklib/plugins/standard.py b/ebooklib/plugins/standard.py
new file mode 100644
index 0000000..61576f9
--- /dev/null
+++ b/ebooklib/plugins/standard.py
@@ -0,0 +1,230 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib. If not, see http://www.gnu.org/licenses/.
+
+import six
+
+from ebooklib.plugins.base import BasePlugin
+from ebooklib.utils import parse_html_string
+
+# TODO:
+# - should also look for the _required_ elements
+# http://www.w3.org/html/wg/drafts/html/master/tabular-data.html#the-table-element
+
# Attribute names kept on every element by the syntax checks below.
ATTRIBUTES_GLOBAL = [
    'accesskey', 'class', 'contenteditable', 'contextmenu',
    'dir', 'draggable', 'dropzone', 'hidden',
    'id', 'inert', 'itemid', 'itemprop',
    'itemref', 'itemscope', 'itemtype', 'lang',
    'spellcheck', 'style', 'tabindex', 'title',
    'translate', 'epub:type',
]

# Remove for now from here
DEPRECATED_TAGS = [
    'acronym', 'applet', 'basefont', 'big', 'center',
    'dir', 'font', 'frame', 'frameset', 'isindex',
    'noframes', 's', 'strike', 'tt',
]
+
+
def leave_only(item, tag_list):
    """
    Remove every attribute of item whose name is not listed in tag_list.

    :Args:
      - item: element whose attrib mapping is cleaned in place
      - tag_list: names of the attributes to keep
    """
    allowed = set(tag_list)

    # iterate over a snapshot of the names; deleting from a mapping while
    # iterating over it skips entries or fails
    for _attr in list(item.attrib.keys()):
        if _attr not in allowed:
            del item.attrib[_attr]
+
+
+class SyntaxPlugin(BasePlugin):
+ NAME = 'Check HTML syntax'
+
+ def html_before_write(self, book, chapter):
+ from lxml import etree
+
+ try:
+ tree = parse_html_string(chapter.content)
+ except:
+ return
+
+ root = tree.getroottree()
+
+ # delete deprecated tags
+ # i should really have a list of allowed tags
+ for tag in DEPRECATED_TAGS:
+ etree.strip_tags(root, tag)
+
+ head = tree.find('head')
+
+ if head is not None and len(head) != 0:
+
+ for _item in head:
+ if _item.tag == 'base':
+ leave_only(_item, ATTRIBUTES_GLOBAL + ['href', 'target'])
+ elif _item.tag == 'link':
+ leave_only(_item, ATTRIBUTES_GLOBAL + ['href', 'crossorigin', 'rel', 'media', 'hreflang', 'type', 'sizes'])
+ elif _item.tag == 'title':
+ if _item.text == '':
+ head.remove(_item)
+ elif _item.tag == 'meta':
+ leave_only(_item, ATTRIBUTES_GLOBAL + ['name', 'http-equiv', 'content', 'charset'])
+ # just remove for now, but really should not be like this
+ head.remove(_item)
+ elif _item.tag == 'script':
+ leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'type', 'charset', 'async', 'defer', 'crossorigin'])
+ elif _item.tag == 'source':
+ leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'type', 'media'])
+ elif _item.tag == 'style':
+ leave_only(_item, ATTRIBUTES_GLOBAL + ['media', 'type', 'scoped'])
+ else:
+ leave_only(_item, ATTRIBUTES_GLOBAL)
+
+
+ if len(root.find('body')) != 0:
+ body = tree.find('body')
+
+ for _item in body.iter():
+ # it is not
+ #
+
+ if _item.tag == 'a':
+ leave_only(_item, ATTRIBUTES_GLOBAL + ['href', 'target', 'download', 'rel', 'hreflang', 'type'])
+ elif _item.tag == 'area':
+ leave_only(_item, ATTRIBUTES_GLOBAL + ['alt', 'coords', 'shape', 'href', 'target', 'download', 'rel', 'hreflang', 'type'])
+ elif _item.tag == 'audio':
+ leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'crossorigin', 'preload', 'autoplay', 'mediagroup', 'loop', 'muted', 'controls'])
+ elif _item.tag == 'blockquote':
+ leave_only(_item, ATTRIBUTES_GLOBAL + ['cite'])
+ elif _item.tag == 'button':
+ leave_only(_item, ATTRIBUTES_GLOBAL + ['autofocus', 'disabled', 'form', 'formaction', 'formenctype', 'formmethod', 'formnovalidate',
+ 'formtarget', 'name', 'type', 'value', 'menu'])
+ elif _item.tag == 'canvas':
+ leave_only(_item, ATTRIBUTES_GLOBAL + ['width', 'height'])
+ elif _item.tag == 'canvas':
+ leave_only(_item, ATTRIBUTES_GLOBAL + ['width', 'height'])
+ elif _item.tag == 'del':
+ leave_only(_item, ATTRIBUTES_GLOBAL + ['cite', 'datetime'])
+ elif _item.tag == 'details':
+ leave_only(_item, ATTRIBUTES_GLOBAL + ['open'])
+ elif _item.tag == 'embed':
+ leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'type', 'width', 'height'])
+ elif _item.tag == 'fieldset':
+ leave_only(_item, ATTRIBUTES_GLOBAL + ['disable', 'form', 'name'])
+ elif _item.tag == 'details':
+ leave_only(_item, ATTRIBUTES_GLOBAL + ['accept-charset', 'action', 'autocomplete', 'enctype', 'method', 'name', 'novalidate', 'target'])
+ elif _item.tag == 'iframe':
+ leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'srcdoc', 'name', 'sandbox', 'seamless', 'allowfullscreen', 'width', 'height'])
+ elif _item.tag == 'img':
+ _src = _item.get('src', '').lower()
+ if _src.startswith('http://') or _src.startswith('https://'):
+ if 'remote-resources' not in chapter.properties:
+ chapter.properties.append('remote-resources')
+ # THIS DOES NOT WORK, ONLY VIDEO AND AUDIO FILES CAN BE REMOTE RESOURCES
+ # THAT MEANS I SHOULD ALSO CATCH
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib. If not, see http://www.gnu.org/licenses/.
+
+import six
+import subprocess
+
+from ebooklib.plugins.base import BasePlugin
+from ebooklib.utils import parse_html_string
+
+# Recommend usage of
+# - https://github.com/w3c/tidy-html5
+
def tidy_cleanup(content, **extra):
    """
    Run the external 'tidy' program over content.

    Every keyword argument becomes a command line option: '--name value'
    when the value is truthy, '-name' otherwise.

    :Args:
      - content: data written to the standard input of tidy
      - extra: options for tidy

    :Returns:
      Tuple (status, output). Status is the exit code of tidy:

        - 0 - all ok
        - 1 - there were warnings
        - 2 - there were errors

      or 3 with None as output when tidy could not be started.
    """
    cmd = []

    for k, v in extra.items():

        if v:
            cmd.append('--%s' % k)
            cmd.append(v)
        else:
            cmd.append('-%s' % k)

    # must parse all other extra arguments
    try:
        p = subprocess.Popen(['tidy'] + cmd, shell=False,
                             stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE, close_fds=True)
    except OSError:
        return (3, None)

    # communicate() feeds stdin and drains stdout/stderr together; writing to
    # stdin by hand first can block forever once a pipe buffer fills up
    (cont, p_err) = p.communicate(content)

    return (p.returncode, cont)
+
+
class TidyPlugin(BasePlugin):
    """
    Cleans chapter content with tidy_cleanup after reading and before writing.

    OPTIONS are the default options passed to tidy_cleanup; options given
    to the constructor are merged over them.
    """
    NAME = 'Tidy HTML'
    OPTIONS = {'char-encoding': 'utf8',
               'tidy-mark': 'no'
               }

    def __init__(self, extra=None):
        """
        :Args:
          - extra: options merged over OPTIONS (optional)
        """
        self.options = dict(self.OPTIONS)
        if extra:
            self.options.update(extra)

    def _cleanup(self, chapter):
        "Replace chapter.content with the tidy output and return the content."
        if not chapter.content:
            return None

        (_, cleaned) = tidy_cleanup(chapter.content, **self.options)

        # tidy_cleanup returns None as output when tidy could not be started;
        # never replace existing content with nothing
        if cleaned:
            chapter.content = cleaned

        return chapter.content

    def html_before_write(self, book, chapter):
        "Clean chapter content before it is written."
        return self._cleanup(chapter)

    def html_after_read(self, book, chapter):
        "Clean chapter content after it has been read."
        return self._cleanup(chapter)
+
diff --git a/ebooklib/utils.py b/ebooklib/utils.py
new file mode 100644
index 0000000..162f4c9
--- /dev/null
+++ b/ebooklib/utils.py
@@ -0,0 +1,60 @@
+# This file is part of EbookLib.
+# Copyright (c) 2013 Aleksandar Erkalovic
+#
+# EbookLib is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# EbookLib is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with EbookLib. If not, see http://www.gnu.org/licenses/.
+
+import io
+import mimetypes
+
+from lxml import etree
+
+
+mimetype_initialised = False
+
+
def debug(obj):
    "Pretty print obj to standard output with an indent of 4."
    from pprint import PrettyPrinter

    PrettyPrinter(indent=4).pprint(obj)
+
+
def parse_string(s):
    """
    Parse an XML document given as text or as bytes.

    Text is encoded as UTF-8 before parsing, bytes are parsed as they are.
    Errors raised by the parser reach the caller unchanged.

    :Returns:
      Tree returned by etree.parse().
    """
    if isinstance(s, bytes):
        data = s
    else:
        data = s.encode('utf-8')

    return etree.parse(io.BytesIO(data))
+
+
def parse_html_string(s):
    """
    Parse an HTML document with a parser set to UTF-8 encoding.

    :Returns:
      Result of lxml.html.document_fromstring() for s.
    """
    from lxml import html

    return html.document_fromstring(s, parser=html.HTMLParser(encoding='utf-8'))
+
+
def guess_type(extenstion):
    """
    Return the result of mimetypes.guess_type() for the given value.

    On the first call the mimetypes module is initialised and the
    '.xhtml' extension is registered as 'application/xhtml+xml'.
    """
    global mimetype_initialised

    if mimetype_initialised:
        return mimetypes.guess_type(extenstion)

    # one time setup, remembered in the module level flag
    mimetypes.init()
    mimetypes.add_type('application/xhtml+xml', '.xhtml')
    mimetype_initialised = True

    return mimetypes.guess_type(extenstion)