From f3d748f5ddba90ec0d07756b99cf7dd0b3858bd1 Mon Sep 17 00:00:00 2001 From: BasioMeusPuga Date: Mon, 6 Nov 2017 07:42:48 +0530 Subject: [PATCH] Incorporate ebooklib from https://github.com/aerkalov/ebooklib --- ebooklib/__init__.py | 42 + ebooklib/epub.py | 1595 ++++++++++++++++++++++++++++++++ ebooklib/plugins/__init__.py | 0 ebooklib/plugins/base.py | 49 + ebooklib/plugins/booktype.py | 119 +++ ebooklib/plugins/sourcecode.py | 68 ++ ebooklib/plugins/standard.py | 230 +++++ ebooklib/plugins/tidyhtml.py | 82 ++ ebooklib/utils.py | 60 ++ 9 files changed, 2245 insertions(+) create mode 100644 ebooklib/__init__.py create mode 100644 ebooklib/epub.py create mode 100644 ebooklib/plugins/__init__.py create mode 100644 ebooklib/plugins/base.py create mode 100644 ebooklib/plugins/booktype.py create mode 100644 ebooklib/plugins/sourcecode.py create mode 100644 ebooklib/plugins/standard.py create mode 100644 ebooklib/plugins/tidyhtml.py create mode 100644 ebooklib/utils.py diff --git a/ebooklib/__init__.py b/ebooklib/__init__.py new file mode 100644 index 0000000..020d2f5 --- /dev/null +++ b/ebooklib/__init__.py @@ -0,0 +1,42 @@ +# This file is part of EbookLib. +# Copyright (c) 2013 Aleksandar Erkalovic +# +# EbookLib is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# EbookLib is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with EbookLib. If not, see . + +# Version of ebook library + +VERSION = (0, 16, 0) + +# LIST OF POSSIBLE ITEMS +ITEM_UNKNOWN = 0 +ITEM_IMAGE = 1 +ITEM_STYLE = 2 +ITEM_SCRIPT = 3 +ITEM_NAVIGATION = 4 +ITEM_VECTOR = 5 +ITEM_FONT = 6 +ITEM_VIDEO = 7 +ITEM_AUDIO = 8 +ITEM_DOCUMENT = 9 + +# EXTENSION MAPPER +EXTENSIONS = {ITEM_IMAGE: ['.jpg', '.jpeg', '.gif', '.tiff', '.tif', '.png'], + ITEM_STYLE: ['.css'], + ITEM_VECTOR: ['.svg'], + ITEM_FONT: ['.otf', '.woff', '.ttf'], + ITEM_SCRIPT: ['.js'], + ITEM_NAVIGATION: ['.ncx'], + ITEM_VIDEO: ['.mov', '.mp4', '.avi'], + ITEM_AUDIO: ['.mp3', '.ogg'] + } diff --git a/ebooklib/epub.py b/ebooklib/epub.py new file mode 100644 index 0000000..fd3a0ff --- /dev/null +++ b/ebooklib/epub.py @@ -0,0 +1,1595 @@ +# This file is part of EbookLib. +# Copyright (c) 2013 Aleksandar Erkalovic +# +# EbookLib is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# EbookLib is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with EbookLib. If not, see . + +import zipfile +import six +import logging +import uuid +import posixpath as zip_path +import os.path +from collections import OrderedDict + +try: + from urllib.parse import unquote +except ImportError: + from urllib import unquote + +from lxml import etree + +import ebooklib + +from ebooklib.utils import parse_string, parse_html_string, guess_type + + +# Version of EPUB library +VERSION = (0, 15, 0) + +NAMESPACES = {'XML': 'http://www.w3.org/XML/1998/namespace', + 'EPUB': 'http://www.idpf.org/2007/ops', + 'DAISY': 'http://www.daisy.org/z3986/2005/ncx/', + 'OPF': 'http://www.idpf.org/2007/opf', + 'CONTAINERNS': 'urn:oasis:names:tc:opendocument:xmlns:container', + 'DC': 'http://purl.org/dc/elements/1.1/', + 'XHTML': 'http://www.w3.org/1999/xhtml'} + +# XML Templates + +CONTAINER_PATH = 'META-INF/container.xml' + +CONTAINER_XML = ''' + + + + + +''' + +NCX_XML = six.b(''' +''') + +NAV_XML = six.b('''''') + +CHAPTER_XML = six.b('''''') + +COVER_XML = six.b(''' + + + + + + + + +''') + + +IMAGE_MEDIA_TYPES = ['image/jpeg', 'image/jpg', 'image/png', 'image/svg+xml'] + + +# TOC elements + +class Section(object): + + def __init__(self, title, href=''): + self.title = title + self.href = href + + +class Link(object): + + def __init__(self, href, title, uid=None): + self.href = href + self.title = title + self.uid = uid + +# Exceptions + + +class EpubException(Exception): + + def __init__(self, code, msg): + self.code = code + self.msg = msg + + def __str__(self): + return repr(self.msg) + +# Items + + +class EpubItem(object): + + """ + Base class for the items in a book. + """ + + def __init__(self, uid=None, file_name='', media_type='', content=six.b(''), manifest=True): + """ + :Args: + - uid: Unique identifier for this item (optional) + - file_name: File name for this item (optional) + - media_type: Media type for this item (optional) + - content: Content for this item (optional) + - manifest: Manifest for this item (optional) + """ + self.id = uid + self.file_name = file_name + self.media_type = media_type + self.content = content + self.is_linear = True + self.manifest = manifest + + self.book = None + + def get_id(self): + """ + Returns unique identifier for this item. + + :Returns: + Returns uid number as string. + """ + return self.id + + def get_name(self): + """ + Returns name for this item. By default it is always file name but it does not have to be. + + :Returns: + Returns file name for this item. + """ + return self.file_name + + def get_type(self): + """ + Guess type according to the file extension. Might not be the best way how to do it, but it works for now. + + Items can be of type: + - ITEM_UNKNOWN = 0 + - ITEM_IMAGE = 1 + - ITEM_STYLE = 2 + - ITEM_SCRIPT = 3 + - ITEM_NAVIGATION = 4 + - ITEM_VECTOR = 5 + - ITEM_FONT = 6 + - ITEM_VIDEO = 7 + - ITEM_AUDIO = 8 + - ITEM_DOCUMENT = 9 + + We map type according to the extensions which are defined in ebooklib.EXTENSIONS. + + :Returns: + Returns type of the item as number. + """ + _, ext = zip_path.splitext(self.get_name()) + ext = ext.lower() + + for uid, ext_list in six.iteritems(ebooklib.EXTENSIONS): + if ext in ext_list: + return uid + + return ebooklib.ITEM_UNKNOWN + + def get_content(self, default=six.b('')): + """ + Returns content of the item. Content should be of type 'str' (Python 2) or 'bytes' (Python 3) + + :Args: + - default: Default value for the content if it is not already defined. + + :Returns: + Returns content of the item. + """ + return self.content or default + + def set_content(self, content): + """ + Sets content value for this item. + + :Args: + - content: Content value + """ + self.content = content + + def __str__(self): + return '' % self.id + + +class EpubNcx(EpubItem): + + "Represents Navigation Control File (NCX) in the EPUB." + + def __init__(self, uid='ncx', file_name='toc.ncx'): + super(EpubNcx, self).__init__(uid=uid, file_name=file_name, media_type='application/x-dtbncx+xml') + + def __str__(self): + return '' % self.id + + +class EpubCover(EpubItem): + + """ + Represents Cover image in the EPUB file. + """ + + def __init__(self, uid='cover-img', file_name=''): + super(EpubCover, self).__init__(uid=uid, file_name=file_name) + + def __str__(self): + return '' % (self.id, self.file_name) + + +class EpubHtml(EpubItem): + + """ + Represents HTML document in the EPUB file. + """ + _template_name = 'chapter' + + def __init__(self, uid=None, file_name='', media_type='', content=None, title='', lang=None, direction=None): + super(EpubHtml, self).__init__(uid, file_name, media_type, content) + + self.title = title + self.lang = lang + self.direction = direction + + self.links = [] + self.properties = [] + + def is_chapter(self): + """ + Returns if this document is chapter or not. + + :Returns: + Returns book value. + """ + return True + + def get_type(self): + """ + Always returns ebooklib.ITEM_DOCUMENT as type of this document. + + :Returns: + Always returns ebooklib.ITEM_DOCUMENT + """ + + return ebooklib.ITEM_DOCUMENT + + def set_language(self, lang): + """ + Sets language for this book item. By default it will use language of the book but it + can be overwritten with this call. + """ + self.lang = lang + + def get_language(self): + """ + Get language code for this book item. Language of the book item can be different from + the language settings defined globaly for book. + + :Returns: + As string returns language code. + """ + return self.lang + + def add_link(self, **kwgs): + """ + Add additional link to the document. Links will be embeded only inside of this document. + + >>> add_link(href='styles.css', rel='stylesheet', type='text/css') + """ + self.links.append(kwgs) + + def get_links(self): + """ + Returns list of additional links defined for this document. + + :Returns: + As tuple return list of links. + """ + return (link for link in self.links) + + def get_links_of_type(self, link_type): + """ + Returns list of additional links of specific type. + + :Returns: + As tuple returns list of links. + """ + return (link for link in self.links if link.get('type', '') == link_type) + + def add_item(self, item): + """ + Add other item to this document. It will create additional links according to the item type. + + :Args: + - item: item we want to add defined as instance of EpubItem + """ + if item.get_type() == ebooklib.ITEM_STYLE: + self.add_link(href=item.get_name(), rel='stylesheet', type='text/css') + + if item.get_type() == ebooklib.ITEM_SCRIPT: + self.add_link(src=item.get_name(), type='text/javascript') + + def get_body_content(self): + """ + Returns content of BODY element for this HTML document. Content will be of type 'str' (Python 2) + or 'bytes' (Python 3). + + :Returns: + Returns content of this document. + """ + + try: + html_tree = parse_html_string(self.content) + except: + return '' + + html_root = html_tree.getroottree() + + if len(html_root.find('body')) != 0: + body = html_tree.find('body') + + tree_str = etree.tostring(body, pretty_print=True, encoding='utf-8', xml_declaration=False) + + # this is so stupid + if tree_str.startswith(six.b('')): + n = tree_str.rindex(six.b('')) + + return tree_str[6:n] + + return tree_str + + return '' + + def get_content(self, default=None): + """ + Returns content for this document as HTML string. Content will be of type 'str' (Python 2) + or 'bytes' (Python 3). + + :Args: + - default: Default value for the content if it is not defined. + + :Returns: + Returns content of this document. + """ + + tree = parse_string(self.book.get_template(self._template_name)) + tree_root = tree.getroot() + + tree_root.set('lang', self.lang or self.book.language) + tree_root.attrib['{%s}lang' % NAMESPACES['XML']] = self.lang or self.book.language + + # add to the head also + # + + try: + html_tree = parse_html_string(self.content) + except: + return '' + + html_root = html_tree.getroottree() + + # create and populate head + + _head = etree.SubElement(tree_root, 'head') + + if self.title != '': + _title = etree.SubElement(_head, 'title') + _title.text = self.title + + for lnk in self.links: + if lnk.get('type') == 'text/javascript': + _lnk = etree.SubElement(_head, 'script', lnk) + # force + _lnk.text = '' + else: + _lnk = etree.SubElement(_head, 'link', lnk) + + # this should not be like this + # head = html_root.find('head') + # if head is not None: + # for i in head.getchildren(): + # if i.tag == 'title' and self.title != '': + # continue + # _head.append(i) + + # create and populate body + + _body = etree.SubElement(tree_root, 'body') + if self.direction: + _body.set('dir', self.direction) + + body = html_tree.find('body') + if body is not None: + for i in body.getchildren(): + _body.append(i) + + tree_str = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True) + + return tree_str + + def __str__(self): + return '' % (self.id, self.file_name) + + +class EpubCoverHtml(EpubHtml): + + """ + Represents Cover page in the EPUB file. + """ + + def __init__(self, uid='cover', file_name='cover.xhtml', image_name='', title='Cover'): + super(EpubCoverHtml, self).__init__(uid=uid, file_name=file_name, title=title) + + self.image_name = image_name + self.is_linear = False + + def is_chapter(self): + """ + Returns if this document is chapter or not. + + :Returns: + Returns book value. + """ + + return False + + def get_content(self): + """ + Returns content for cover page as HTML string. Content will be of type 'str' (Python 2) or 'bytes' (Python 3). + + :Returns: + Returns content of this document. + """ + + self.content = self.book.get_template('cover') + + tree = parse_string(super(EpubCoverHtml, self).get_content()) + tree_root = tree.getroot() + + images = tree_root.xpath('//xhtml:img', namespaces={'xhtml': NAMESPACES['XHTML']}) + + images[0].set('src', self.image_name) + images[0].set('alt', self.title) + + tree_str = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True) + + return tree_str + + def __str__(self): + return '' % (self.id, self.file_name) + + +class EpubNav(EpubHtml): + + """ + Represents Navigation Document in the EPUB file. + """ + + def __init__(self, uid='nav', file_name='nav.xhtml', media_type='application/xhtml+xml'): + super(EpubNav, self).__init__(uid=uid, file_name=file_name, media_type=media_type) + + def is_chapter(self): + """ + Returns if this document is chapter or not. + + :Returns: + Returns book value. + """ + + return False + + def __str__(self): + return '' % (self.id, self.file_name) + + +class EpubImage(EpubItem): + + """ + Represents Image in the EPUB file. + """ + + def __init__(self): + super(EpubImage, self).__init__() + + def get_type(self): + return ebooklib.ITEM_IMAGE + + def __str__(self): + return '' % (self.id, self.file_name) + + +# EpubBook + +class EpubBook(object): + + def __init__(self): + self.EPUB_VERSION = None + + self.reset() + + # we should have options here + + def reset(self): + "Initialises all needed variables to default values" + + self.metadata = {} + self.items = [] + self.spine = [] + self.guide = [] + self.toc = [] + self.bindings = [] + + self.IDENTIFIER_ID = 'id' + self.FOLDER_NAME = 'EPUB' + + self._id_html = 0 + self._id_image = 0 + self._id_static = 0 + + self.title = '' + self.language = 'en' + self.direction = None + + self.templates = { + 'ncx': NCX_XML, + 'nav': NAV_XML, + 'chapter': CHAPTER_XML, + 'cover': COVER_XML + } + + self.add_metadata('OPF', 'generator', '', { + 'name': 'generator', 'content': 'Ebook-lib %s' % '.'.join([str(s) for s in VERSION]) + }) + + # default to using a randomly-unique identifier if one is not specified manually + self.set_identifier(str(uuid.uuid4())) + + # custom prefixes and namespaces to be set to the content.opf doc + self.prefixes = [] + self.namespaces = {} + + def set_identifier(self, uid): + """ + Sets unique id for this epub + + :Args: + - uid: Value of unique identifier for this book + """ + + self.uid = uid + + self.set_unique_metadata('DC', 'identifier', self.uid, {'id': self.IDENTIFIER_ID}) + + def set_title(self, title): + """ + Set title. You can set multiple titles. + + :Args: + - title: Title value + """ + + self.title = title + + self.add_metadata('DC', 'title', self.title) + + def set_language(self, lang): + """ + Set language for this epub. You can set multiple languages. Specific items in the book can have + different language settings. + + :Args: + - lang: Language code + """ + + self.language = lang + + self.add_metadata('DC', 'language', lang) + + def set_direction(self, direction): + """ + :Args: + - direction: Options are "ltr", "rtl" and "default" + """ + + self.direction = direction + + def set_cover(self, file_name, content, create_page=True): + """ + Set cover and create cover document if needed. + + :Args: + - file_name: file name of the cover page + - content: Content for the cover image + - create_page: Should cover page be defined. Defined as bool value (optional). Default value is True. + """ + + # as it is now, it can only be called once + c0 = EpubCover(file_name=file_name) + c0.content = content + self.add_item(c0) + + if create_page: + c1 = EpubCoverHtml(image_name=file_name) + self.add_item(c1) + + self.add_metadata(None, 'meta', '', OrderedDict([('name', 'cover'), ('content', 'cover-img')])) + + def add_author(self, author, file_as=None, role=None, uid='creator'): + "Add author for this document" + + self.add_metadata('DC', 'creator', author, {'id': uid}) + + if file_as: + self.add_metadata(None, 'meta', file_as, {'refines': '#' + uid, + 'property': 'file-as', + 'scheme': 'marc:relators'}) + if role: + self.add_metadata(None, 'meta', role, {'refines': '#' + uid, + 'property': 'role', + 'scheme': 'marc:relators'}) + + def add_metadata(self, namespace, name, value, others=None): + "Add metadata" + + if namespace in NAMESPACES: + namespace = NAMESPACES[namespace] + + if namespace not in self.metadata: + self.metadata[namespace] = {} + + if name not in self.metadata[namespace]: + self.metadata[namespace][name] = [] + + self.metadata[namespace][name].append((value, others)) + + def get_metadata(self, namespace, name): + "Retrieve metadata" + + if namespace in NAMESPACES: + namespace = NAMESPACES[namespace] + + return self.metadata[namespace][name] + + def set_unique_metadata(self, namespace, name, value, others=None): + "Add metadata if metadata with this identifier does not already exist, otherwise update existing metadata." + + if namespace in NAMESPACES: + namespace = NAMESPACES[namespace] + + if namespace in self.metadata and name in self.metadata[namespace]: + self.metadata[namespace][name] = [(value, others)] + else: + self.add_metadata(namespace, name, value, others) + + def add_item(self, item): + """ + Add additional item to the book. If not defined, media type and chapter id will be defined + for the item. + + :Args: + - item: Item instance + """ + if item.media_type == '': + (has_guessed, media_type) = guess_type(item.get_name().lower()) + + if has_guessed: + if media_type is not None: + item.media_type = media_type + else: + item.media_type = has_guessed + else: + item.media_type = 'application/octet-stream' + + if not item.get_id(): + # make chapter_, image_ and static_ configurable + if isinstance(item, EpubHtml): + item.id = 'chapter_%d' % self._id_html + self._id_html += 1 + elif isinstance(item, EpubImage): + item.id = 'image_%d' % self._id_image + self._id_image += 1 + else: + item.id = 'static_%d' % self._id_image + self._id_image += 1 + + item.book = self + self.items.append(item) + + return item + + def get_item_with_id(self, uid): + """ + Returns item for defined UID. + + >>> book.get_item_with_id('image_001') + + :Args: + - uid: UID for the item + + :Returns: + Returns item object. Returns None if nothing was found. + """ + for item in self.get_items(): + if item.id == uid: + return item + + return None + + def get_item_with_href(self, href): + """ + Returns item for defined HREF. + + >>> book.get_item_with_href('EPUB/document.xhtml') + + :Args: + - href: HREF for the item we are searching for + + :Returns: + Returns item object. Returns None if nothing was found. + """ + for item in self.get_items(): + if item.get_name() == href: + return item + + return None + + def get_items(self): + """ + Returns all items attached to this book. + + :Returns: + Returns all items as tuple. + """ + return (item for item in self.items) + + def get_items_of_type(self, item_type): + """ + Returns all items of specified type. + + >>> book.get_items_of_type(epub.ITEM_IMAGE) + + :Args: + - item_type: Type for items we are searching for + + :Returns: + Returns found items as tuple. + """ + return (item for item in self.items if item.get_type() == item_type) + + def get_items_of_media_type(self, media_type): + """ + Returns all items of specified media type. + + :Args: + - media_type: Media type for items we are searching for + + :Returns: + Returns found items as tuple. + """ + return (item for item in self.items if item.media_type == media_type) + + def set_template(self, name, value): + """ + Defines templates which are used to generate certain types of pages. When defining new value for the template + we have to use content of type 'str' (Python 2) or 'bytes' (Python 3). + + At the moment we use these templates: + - ncx + - nav + - chapter + - cover + + :Args: + - name: Name for the template + - value: Content for the template + """ + + self.templates[name] = value + + def get_template(self, name): + """ + Returns value for the template. + + :Args: + - name: template name + + :Returns: + Value of the template. + """ + return self.templates.get(name) + + def add_prefix(self, name, uri): + """ + Appends custom prefix to be added to the content.opf document + + >>> epub_book.add_prefix('bkterms', 'http://booktype.org/') + + :Args: + - name: namespave name + - uri: URI for the namespace + """ + + self.prefixes.append('%s: %s' % (name, uri)) + + +class EpubWriter(object): + DEFAULT_OPTIONS = { + 'epub2_guide': True, + 'epub3_landmark': True, + 'landmark_title': 'Guide', + 'spine_direction': True, + 'package_direction': False + } + + def __init__(self, name, book, options=None): + self.file_name = name + self.book = book + + self.options = dict(self.DEFAULT_OPTIONS) + if options: + self.options.update(options) + + def process(self): + # should cache this html parsing so we don't do it for every plugin + for plg in self.options.get('plugins', []): + if hasattr(plg, 'before_write'): + plg.before_write(self.book) + + for item in self.book.get_items(): + if isinstance(item, EpubHtml): + for plg in self.options.get('plugins', []): + if hasattr(plg, 'html_before_write'): + plg.html_before_write(self.book, item) + + def _write_container(self): + container_xml = CONTAINER_XML % {'folder_name': self.book.FOLDER_NAME} + self.out.writestr(CONTAINER_PATH, container_xml) + + def _write_opf_metadata(self, root): + # This is really not needed + # problem is uppercase/lowercase + # for ns_name, values in six.iteritems(self.book.metadata): + # if ns_name: + # for n_id, ns_url in six.iteritems(NAMESPACES): + # if ns_name == ns_url: + # nsmap[n_id.lower()] = NAMESPACES[n_id] + + nsmap = {'dc': NAMESPACES['DC'], 'opf': NAMESPACES['OPF']} + nsmap.update(self.book.namespaces) + + metadata = etree.SubElement(root, 'metadata', nsmap=nsmap) + + el = etree.SubElement(metadata, 'meta', {'property': 'dcterms:modified'}) + if 'mtime' in self.options: + mtime = self.options['mtime'] + else: + import datetime + mtime = datetime.datetime.now() + el.text = mtime.strftime('%Y-%m-%dT%H:%M:%SZ') + + for ns_name, values in six.iteritems(self.book.metadata): + if ns_name == NAMESPACES['OPF']: + for values in values.values(): + for v in values: + if 'property' in v[1] and v[1]['property'] == 'dcterms:modified': + continue + try: + el = etree.SubElement(metadata, 'meta', v[1]) + if v[0]: + el.text = v[0] + except ValueError: + logging.error('Could not create metadata.') + else: + for name, values in six.iteritems(values): + for v in values: + try: + if ns_name: + el = etree.SubElement(metadata, '{%s}%s' % (ns_name, name), v[1]) + else: + el = etree.SubElement(metadata, '%s' % name, v[1]) + + el.text = v[0] + except ValueError: + logging.error('Could not create metadata "{}".'.format(name)) + + def _write_opf_manifest(self, root): + manifest = etree.SubElement(root, 'manifest') + _ncx_id = None + + # mathml, scripted, svg, remote-resources, and switch + # nav + # cover-image + + for item in self.book.get_items(): + if not item.manifest: + continue + + if isinstance(item, EpubNav): + etree.SubElement(manifest, 'item', {'href': item.get_name(), + 'id': item.id, + 'media-type': item.media_type, + 'properties': 'nav'}) + elif isinstance(item, EpubNcx): + _ncx_id = item.id + etree.SubElement(manifest, 'item', {'href': item.file_name, + 'id': item.id, + 'media-type': item.media_type}) + + elif isinstance(item, EpubCover): + etree.SubElement(manifest, 'item', {'href': item.file_name, + 'id': item.id, + 'media-type': item.media_type, + 'properties': 'cover-image'}) + else: + opts = {'href': item.file_name, + 'id': item.id, + 'media-type': item.media_type} + + if hasattr(item, 'properties') and len(item.properties) > 0: + opts['properties'] = ' '.join(item.properties) + + etree.SubElement(manifest, 'item', opts) + + return _ncx_id + + def _write_opf_spine(self, root, ncx_id): + spine_attributes = {'toc': ncx_id or 'ncx'} + if self.book.direction and self.options['spine_direction']: + spine_attributes['page-progression-direction'] = self.book.direction + + spine = etree.SubElement(root, 'spine', spine_attributes) + + for _item in self.book.spine: + # this is for now + # later we should be able to fetch things from tuple + + is_linear = True + + if isinstance(_item, tuple): + item = _item[0] + + if len(_item) > 1: + if _item[1] == 'no': + is_linear = False + else: + item = _item + + if isinstance(item, EpubHtml): + opts = {'idref': item.get_id()} + + if not item.is_linear or not is_linear: + opts['linear'] = 'no' + elif isinstance(item, EpubItem): + opts = {'idref': item.get_id()} + + if not item.is_linear or not is_linear: + opts['linear'] = 'no' + else: + opts = {'idref': item} + + try: + itm = self.book.get_item_with_id(item) + + if not itm.is_linear or not is_linear: + opts['linear'] = 'no' + except: + pass + + etree.SubElement(spine, 'itemref', opts) + + def _write_opf_guide(self, root): + # - http://www.idpf.org/epub/20/spec/OPF_2.0.1_draft.htm#Section2.6 + + if len(self.book.guide) > 0 and self.options.get('epub2_guide'): + guide = etree.SubElement(root, 'guide', {}) + + for item in self.book.guide: + if 'item' in item: + chap = item.get('item') + if chap: + _href = chap.file_name + _title = chap.title + else: + _href = item.get('href', '') + _title = item.get('title', '') + + if _title is None: + _title = '' + ref = etree.SubElement(guide, 'reference', {'type': item.get('type', ''), + 'title': _title, + 'href': _href}) + + def _write_opf_bindings(self, root): + if len(self.book.bindings) > 0: + bindings = etree.SubElement(root, 'bindings', {}) + for item in self.book.bindings: + etree.SubElement(bindings, 'mediaType', item) + + def _write_opf_file(self, root): + tree_str = etree.tostring(root, pretty_print=True, encoding='utf-8', xml_declaration=True) + + self.out.writestr('%s/content.opf' % self.book.FOLDER_NAME, tree_str) + + def _write_opf(self): + package_attributes = {'xmlns': NAMESPACES['OPF'], + 'unique-identifier': self.book.IDENTIFIER_ID, + 'version': '3.0'} + if self.book.direction and self.options['package_direction']: + package_attributes['dir'] = self.book.direction + + root = etree.Element('package', package_attributes) + + prefixes = ['rendition: http://www.idpf.org/vocab/rendition/#'] + self.book.prefixes + root.attrib['prefix'] = ' '.join(prefixes) + + # METADATA + self._write_opf_metadata(root) + + # MANIFEST + _ncx_id = self._write_opf_manifest(root) + + # SPINE + self._write_opf_spine(root, _ncx_id) + + # GUIDE + self._write_opf_guide(root) + + # BINDINGS + self._write_opf_bindings(root) + + # WRITE FILE + self._write_opf_file(root) + + def _get_nav(self, item): + # just a basic navigation for now + nav_xml = parse_string(self.book.get_template('nav')) + root = nav_xml.getroot() + + root.set('lang', self.book.language) + root.attrib['{%s}lang' % NAMESPACES['XML']] = self.book.language + + nav_dir_name = os.path.dirname(item.file_name) + + head = etree.SubElement(root, 'head') + title = etree.SubElement(head, 'title') + title.text = self.book.title + + # for now this just handles css files and ignores others + for _link in item.links: + _lnk = etree.SubElement(head, 'link', { + 'href': _link.get('href', ''), 'rel': 'stylesheet', 'type': 'text/css' + }) + + body = etree.SubElement(root, 'body') + nav = etree.SubElement(body, 'nav', {'{%s}type' % NAMESPACES['EPUB']: 'toc', 'id': 'id'}) + + content_title = etree.SubElement(nav, 'h2') + content_title.text = self.book.title + + def _create_section(itm, items): + ol = etree.SubElement(itm, 'ol') + for item in items: + if isinstance(item, tuple) or isinstance(item, list): + li = etree.SubElement(ol, 'li') + if isinstance(item[0], EpubHtml): + a = etree.SubElement(li, 'a', {'href': os.path.relpath(item[0].file_name, nav_dir_name)}) + elif isinstance(item[0], Section) and item[0].href != '': + a = etree.SubElement(li, 'a', {'href': os.path.relpath(item[0].href, nav_dir_name)}) + elif isinstance(item[0], Link): + a = etree.SubElement(li, 'a', {'href': os.path.relpath(item[0].href, nav_dir_name)}) + else: + a = etree.SubElement(li, 'span') + a.text = item[0].title + + _create_section(li, item[1]) + + elif isinstance(item, Link): + li = etree.SubElement(ol, 'li') + a = etree.SubElement(li, 'a', {'href': os.path.relpath(item.href, nav_dir_name)}) + a.text = item.title + elif isinstance(item, EpubHtml): + li = etree.SubElement(ol, 'li') + a = etree.SubElement(li, 'a', {'href': os.path.relpath(item.file_name, nav_dir_name)}) + a.text = item.title + + _create_section(nav, self.book.toc) + + # LANDMARKS / GUIDE + # - http://www.idpf.org/epub/30/spec/epub30-contentdocs.html#sec-xhtml-nav-def-types-landmarks + + if len(self.book.guide) > 0 and self.options.get('epub3_landmark'): + + # Epub2 guide types do not map completely to epub3 landmark types. + guide_to_landscape_map = { + 'notes': 'rearnotes', + 'text': 'bodymatter' + } + + guide_nav = etree.SubElement(body, 'nav', {'{%s}type' % NAMESPACES['EPUB']: 'landmarks'}) + + guide_content_title = etree.SubElement(guide_nav, 'h2') + guide_content_title.text = self.options.get('landmark_title', 'Guide') + + guild_ol = etree.SubElement(guide_nav, 'ol') + + for elem in self.book.guide: + li_item = etree.SubElement(guild_ol, 'li') + + if 'item' in elem: + chap = elem.get('item', None) + if chap: + _href = chap.file_name + _title = chap.title + else: + _href = elem.get('href', '') + _title = elem.get('title', '') + + guide_type = elem.get('type', '') + a_item = etree.SubElement(li_item, 'a', { + '{%s}type' % NAMESPACES['EPUB']: guide_to_landscape_map.get(guide_type, guide_type), + 'href': os.path.relpath(_href, nav_dir_name) + }) + a_item.text = _title + + tree_str = etree.tostring(nav_xml, pretty_print=True, encoding='utf-8', xml_declaration=True) + + return tree_str + + def _get_ncx(self): + + # we should be able to setup language for NCX as also + ncx = parse_string(self.book.get_template('ncx')) + root = ncx.getroot() + + head = etree.SubElement(root, 'head') + + # get this id + uid = etree.SubElement(head, 'meta', {'content': self.book.uid, 'name': 'dtb:uid'}) + uid = etree.SubElement(head, 'meta', {'content': '0', 'name': 'dtb:depth'}) + uid = etree.SubElement(head, 'meta', {'content': '0', 'name': 'dtb:totalPageCount'}) + uid = etree.SubElement(head, 'meta', {'content': '0', 'name': 'dtb:maxPageNumber'}) + + doc_title = etree.SubElement(root, 'docTitle') + title = etree.SubElement(doc_title, 'text') + title.text = self.book.title + +# doc_author = etree.SubElement(root, 'docAuthor') +# author = etree.SubElement(doc_author, 'text') +# author.text = 'Name of the person' + + # For now just make a very simple navMap + nav_map = etree.SubElement(root, 'navMap') + + def _create_section(itm, items, uid): + for item in items: + if isinstance(item, tuple) or isinstance(item, list): + section, subsection = item[0], item[1] + + np = etree.SubElement(itm, 'navPoint', { + 'id': section.get_id() if isinstance(section, EpubHtml) else 'sep_%d' % uid + }) + nl = etree.SubElement(np, 'navLabel') + nt = etree.SubElement(nl, 'text') + nt.text = section.title + + # CAN NOT HAVE EMPTY SRC HERE + href = '' + if isinstance(section, EpubHtml): + href = section.file_name + elif isinstance(section, Section) and section.href != '': + href = section.href + elif isinstance(section, Link): + href = section.href + + nc = etree.SubElement(np, 'content', {'src': href}) + + uid = _create_section(np, subsection, uid + 1) + elif isinstance(item, Link): + _parent = itm + _content = _parent.find('content') + + if _content is not None: + if _content.get('src') == '': + _content.set('src', item.href) + + np = etree.SubElement(itm, 'navPoint', {'id': item.uid}) + nl = etree.SubElement(np, 'navLabel') + nt = etree.SubElement(nl, 'text') + nt.text = item.title + + nc = etree.SubElement(np, 'content', {'src': item.href}) + elif isinstance(item, EpubHtml): + _parent = itm + _content = _parent.find('content') + + if _content is not None: + if _content.get('src') == '': + _content.set('src', item.file_name) + + np = etree.SubElement(itm, 'navPoint', {'id': item.get_id()}) + nl = etree.SubElement(np, 'navLabel') + nt = etree.SubElement(nl, 'text') + nt.text = item.title + + nc = etree.SubElement(np, 'content', {'src': item.file_name}) + + return uid + + _create_section(nav_map, self.book.toc, 0) + + tree_str = etree.tostring(root, pretty_print=True, encoding='utf-8', xml_declaration=True) + + return tree_str + + def _write_items(self): + for item in self.book.get_items(): + if isinstance(item, EpubNcx): + self.out.writestr('%s/%s' % (self.book.FOLDER_NAME, item.file_name), self._get_ncx()) + elif isinstance(item, EpubNav): + self.out.writestr('%s/%s' % (self.book.FOLDER_NAME, item.file_name), self._get_nav(item)) + elif item.manifest: + self.out.writestr('%s/%s' % (self.book.FOLDER_NAME, item.file_name), item.get_content()) + else: + self.out.writestr('%s' % item.file_name, item.get_content()) + + def write(self): + # check for the option allowZip64 + self.out = zipfile.ZipFile(self.file_name, 'w', zipfile.ZIP_DEFLATED) + self.out.writestr('mimetype', 'application/epub+zip', compress_type=zipfile.ZIP_STORED) + + self._write_container() + self._write_opf() + self._write_items() + + self.out.close() + + +class EpubReader(object): + DEFAULT_OPTIONS = {} + + def __init__(self, epub_file_name, options=None): + self.file_name = epub_file_name + self.book = EpubBook() + self.zf = None + + self.opf_file = '' + self.opf_dir = '' + + self.options = dict(self.DEFAULT_OPTIONS) + if options: + self.options.update(options) + + def process(self): + # should cache this html parsing so we don't do it for every plugin + for plg in self.options.get('plugins', []): + if hasattr(plg, 'after_read'): + plg.after_read(self.book) + + for item in self.book.get_items(): + if isinstance(item, EpubHtml): + for plg in self.options.get('plugins', []): + if hasattr(plg, 'html_after_read'): + plg.html_after_read(self.book, item) + + def load(self): + self._load() + + return self.book + + def read_file(self, name): + # Raises KeyError + name = os.path.normpath(name) + return self.zf.read(name) + + def _load_container(self): + meta_inf = self.read_file('META-INF/container.xml') + tree = parse_string(meta_inf) + + for root_file in tree.findall('//xmlns:rootfile[@media-type]', namespaces={'xmlns': NAMESPACES['CONTAINERNS']}): + if root_file.get('media-type') == 'application/oebps-package+xml': + self.opf_file = root_file.get('full-path') + self.opf_dir = zip_path.dirname(self.opf_file) + + def _load_metadata(self): + container_root = self.container.getroot() + + # get epub version + self.book.version = container_root.get('version', None) + + # get unique-identifier + if container_root.get('unique-identifier', None): + self.book.IDENTIFIER_ID = container_root.get('unique-identifier') + + # get xml:lang + # get metadata + metadata = self.container.find('{%s}%s' % (NAMESPACES['OPF'], 'metadata')) + + nsmap = metadata.nsmap + nstags = dict((k, '{%s}' % v) for k, v in six.iteritems(nsmap)) + default_ns = nstags.get(None, '') + + nsdict = dict((v, {}) for v in nsmap.values()) + + def add_item(ns, tag, value, extra): + if ns not in nsdict: + nsdict[ns] = {} + + values = nsdict[ns].setdefault(tag, []) + values.append((value, extra)) + + for t in metadata: + if not etree.iselement(t) or t.tag is etree.Comment: + continue + if t.tag == default_ns + 'meta': + name = t.get('name') + others = dict((k, v) for k, v in t.items()) + + if name and ':' in name: + prefix, name = name.split(':', 1) + else: + prefix = None + + add_item(t.nsmap.get(prefix, prefix), name, t.text, others) + else: + tag = t.tag[t.tag.rfind('}') + 1:] + + if (t.prefix and t.prefix.lower() == 'dc') and tag == 'identifier': + _id = t.get('id', None) + + if _id: + self.book.IDENTIFIER_ID = _id + + others = dict((k, v) for k, v in t.items()) + add_item(t.nsmap[t.prefix], tag, t.text, others) + + self.book.metadata = nsdict + + titles = self.book.get_metadata('DC', 'title') + if len(titles) > 0: + self.book.title = titles[0][0] + + for value, others in self.book.get_metadata('DC', 'identifier'): + if others.get('id') == self.book.IDENTIFIER_ID: + self.book.uid = value + + def _load_manifest(self): + for r in self.container.find('{%s}%s' % (NAMESPACES['OPF'], 'manifest')): + if r is not None and r.tag != '{%s}item' % NAMESPACES['OPF']: + continue + + media_type = r.get('media-type') + _properties = r.get('properties', '') + + if _properties: + properties = _properties.split(' ') + else: + properties = [] + + # people use wrong content types + if media_type == 'image/jpg': + media_type = 'image/jpeg' + + if media_type == 'application/x-dtbncx+xml': + ei = EpubNcx(uid=r.get('id'), file_name=unquote(r.get('href'))) + + ei.content = self.read_file(zip_path.join(self.opf_dir, ei.file_name)) + elif media_type == 'application/xhtml+xml': + if 'nav' in properties: + ei = EpubNav(uid=r.get('id'), file_name=unquote(r.get('href'))) + + ei.content = self.read_file(zip_path.join(self.opf_dir, r.get('href'))) + elif 'cover' in properties: + ei = EpubCoverHtml() + + ei.content = self.read_file(zip_path.join(self.opf_dir, unquote(r.get('href')))) + else: + ei = EpubHtml() + + ei.id = r.get('id') + ei.file_name = unquote(r.get('href')) + ei.media_type = media_type + ei.content = self.read_file(zip_path.join(self.opf_dir, ei.get_name())) + ei.properties = properties + elif media_type in IMAGE_MEDIA_TYPES: + if 'cover-image' in properties: + ei = EpubCover(uid=r.get('id'), file_name=unquote(r.get('href'))) + + ei.media_type = media_type + ei.content = self.read_file(zip_path.join(self.opf_dir, ei.get_name())) + else: + ei = EpubImage() + + ei.id = r.get('id') + ei.file_name = unquote(r.get('href')) + ei.media_type = media_type + ei.content = self.read_file(zip_path.join(self.opf_dir, ei.get_name())) + else: + # different types + ei = EpubItem() + + ei.id = r.get('id') + ei.file_name = unquote(r.get('href')) + ei.media_type = media_type + + ei.content = self.read_file(zip_path.join(self.opf_dir, ei.get_name())) + + self.book.add_item(ei) + + def _parse_ncx(self, data): + tree = parse_string(data) + tree_root = tree.getroot() + + nav_map = tree_root.find('{%s}navMap' % NAMESPACES['DAISY']) + + def _get_children(elems, n, nid): + label, content = '', '' + children = [] + + for a in elems.getchildren(): + if a.tag == '{%s}navLabel' % NAMESPACES['DAISY']: + label = a.getchildren()[0].text + if a.tag == '{%s}content' % NAMESPACES['DAISY']: + content = a.get('src', '') + if a.tag == '{%s}navPoint' % NAMESPACES['DAISY']: + children.append(_get_children(a, n + 1, a.get('id', ''))) + + if len(children) > 0: + if n == 0: + return children + + return (Section(label, href=content), + children) + else: + return Link(content, label, nid) + + self.book.toc = _get_children(nav_map, 0, '') + + def _parse_nav(self, data, base_path): + html_node = parse_html_string(data) + nav_node = html_node.xpath("//nav[@*='toc']")[0] + + def parse_list(list_node): + items = [] + + for item_node in list_node.findall('li'): + + sublist_node = item_node.find('ol') + link_node = item_node.find('a') + + if sublist_node is not None: + title = item_node[0].text + children = parse_list(sublist_node) + + if link_node is not None: + href = zip_path.normpath(zip_path.join(base_path, link_node.get('href'))) + items.append((Section(title, href=href), children)) + else: + items.append((Section(title), children)) + elif link_node is not None: + title = link_node.text + href = zip_path.normpath(zip_path.join(base_path, link_node.get('href'))) + + items.append(Link(href, title)) + + return items + + self.book.toc = parse_list(nav_node.find('ol')) + + def _load_spine(self): + spine = self.container.find('{%s}%s' % (NAMESPACES['OPF'], 'spine')) + + self.book.spine = [(t.get('idref'), t.get('linear', 'yes')) for t in spine] + + toc = spine.get('toc', '') + self.book.set_direction(spine.get('page-progression-direction', None)) + + # should read ncx or nav file + if toc: + try: + ncxFile = self.read_file(zip_path.join(self.opf_dir, self.book.get_item_with_id(toc).get_name())) + except KeyError: + raise EpubException(-1, 'Can not find ncx file.') + + self._parse_ncx(ncxFile) + + def _load_guide(self): + guide = self.container.find('{%s}%s' % (NAMESPACES['OPF'], 'guide')) + if guide is not None: + self.book.guide = [{'href': t.get('href'), 'title': t.get('title'), 'type': t.get('type')} for t in guide] + + def _load_opf_file(self): + try: + s = self.read_file(self.opf_file) + except KeyError: + raise EpubException(-1, 'Can not find container file') + + self.container = parse_string(s) + + self._load_metadata() + self._load_manifest() + self._load_spine() + self._load_guide() + + # read nav file if found + # + if not self.book.toc: + nav_item = next((item for item in self.book.items if isinstance(item, EpubNav)), None) + if nav_item: + self._parse_nav(nav_item.content, zip_path.dirname(nav_item.file_name)) + + def _load(self): + try: + self.zf = zipfile.ZipFile(self.file_name, 'r', compression=zipfile.ZIP_DEFLATED, allowZip64=True) + except zipfile.BadZipfile as bz: + raise EpubException(0, 'Bad Zip file') + except zipfile.LargeZipFile as bz: + raise EpubException(1, 'Large Zip file') + + # 1st check metadata + self._load_container() + self._load_opf_file() + + self.zf.close() + + +# WRITE + +def write_epub(name, book, options=None): + """ + Creates epub file with the content defined in EpubBook. + + >>> ebooklib.write_epub('book.epub', book) + + :Args: + - name: file name for the output file + - book: instance of EpubBook + - options: extra opions as dictionary (optional) + """ + epub = EpubWriter(name, book, options) + + epub.process() + + try: + epub.write() + except IOError: + pass + +# READ + + +def read_epub(name, options=None): + """ + Creates new instance of EpubBook with the content defined in the input file. + + >>> book = ebooklib.read_epub('book.epub') + + :Args: + - name: full path to the input file + - options: extra options as dictionary (optional) + + :Returns: + Instance of EpubBook. + """ + reader = EpubReader(name, options) + + book = reader.load() + reader.process() + + return book diff --git a/ebooklib/plugins/__init__.py b/ebooklib/plugins/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ebooklib/plugins/base.py b/ebooklib/plugins/base.py new file mode 100644 index 0000000..6351f81 --- /dev/null +++ b/ebooklib/plugins/base.py @@ -0,0 +1,49 @@ +# This file is part of EbookLib. +# Copyright (c) 2013 Aleksandar Erkalovic +# +# EbookLib is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# EbookLib is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with EbookLib. If not, see . + + +class BasePlugin(object): + def before_write(self, book): + "Processing before save" + return True + + def after_write(self, book): + "Processing after save" + return True + + def before_read(self, book): + "Processing before save" + return True + + def after_read(self, book): + "Processing after save" + return True + + def item_after_read(self, book, item): + "Process general item after read." + return True + + def item_before_write(self, book, item): + "Process general item before write." + return True + + def html_after_read(self, book, chapter): + "Processing HTML before read." + return True + + def html_before_write(self, book, chapter): + "Processing HTML before save." + return True diff --git a/ebooklib/plugins/booktype.py b/ebooklib/plugins/booktype.py new file mode 100644 index 0000000..5518258 --- /dev/null +++ b/ebooklib/plugins/booktype.py @@ -0,0 +1,119 @@ +# This file is part of EbookLib. +# Copyright (c) 2013 Aleksandar Erkalovic +# +# EbookLib is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# EbookLib is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with EbookLib. If not, see . + +from ebooklib.plugins.base import BasePlugin +from ebooklib.utils import parse_html_string + +class BooktypeLinks(BasePlugin): + NAME = 'Booktype Links' + + def __init__(self, booktype_book): + self.booktype_book = booktype_book + + def html_before_write(self, book, chapter): + from lxml import etree + + try: + from urlparse import urlparse, urljoin + except ImportError: + from urllib.parse import urlparse, urljoin + + try: + tree = parse_html_string(chapter.content) + except: + return + + root = tree.getroottree() + + if len(root.find('body')) != 0: + body = tree.find('body') + + # should also be aware to handle + # ../chapter/ + # ../chapter/#reference + # ../chapter#reference + + for _link in body.xpath('//a'): + # This is just temporary for the footnotes + if _link.get('href', '').find('InsertNoteID') != -1: + _ln = _link.get('href', '') + i = _ln.find('#') + _link.set('href', _ln[i:]); + + continue + + _u = urlparse(_link.get('href', '')) + + # Let us care only for internal links at the moment + if _u.scheme == '': + if _u.path != '': + _link.set('href', '%s.xhtml' % _u.path) + + if _u.fragment != '': + _link.set('href', urljoin(_link.get('href'), '#%s' % _u.fragment)) + + if _link.get('name') != None: + _link.set('id', _link.get('name')) + etree.strip_attributes(_link, 'name') + + chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8') + + + + +class BooktypeFootnotes(BasePlugin): + NAME = 'Booktype Footnotes' + + def __init__(self, booktype_book): + self.booktype_book = booktype_book + + def html_before_write(self, book, chapter): + from lxml import etree + + from ebooklib import epub + + try: + tree = parse_html_string(chapter.content) + except: + return + + root = tree.getroottree() + + if len(root.find('body')) != 0: + body = tree.find('body') + + # 1 + #
  1. prvi footnote ^
  2. + + # 1

    + # + for footnote in body.xpath('//span[@class="InsertNoteMarker"]'): + footnote_id = footnote.get('id')[:-8] + a = footnote.getchildren()[0].getchildren()[0] + + footnote_text = body.xpath('//li[@id="%s"]' % footnote_id)[0] + + a.attrib['{%s}type' % epub.NAMESPACES['EPUB']] = 'noteref' + ftn = etree.SubElement(body, 'aside', {'id': footnote_id}) + ftn.attrib['{%s}type' % epub.NAMESPACES['EPUB']] = 'footnote' + ftn_p = etree.SubElement(ftn, 'p') + ftn_p.text = footnote_text.text + + old_footnote = body.xpath('//ol[@id="InsertNote_NoteList"]') + if len(old_footnote) > 0: + body.remove(old_footnote[0]) + + chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8') diff --git a/ebooklib/plugins/sourcecode.py b/ebooklib/plugins/sourcecode.py new file mode 100644 index 0000000..4f973a2 --- /dev/null +++ b/ebooklib/plugins/sourcecode.py @@ -0,0 +1,68 @@ +# This file is part of EbookLib. +# Copyright (c) 2013 Aleksandar Erkalovic +# +# EbookLib is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# EbookLib is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with EbookLib. If not, see . + +from ebooklib.plugins.base import BasePlugin +from ebooklib.utils import parse_html_string + +class SourceHighlighter(BasePlugin): + def __init__(self): + pass + + def html_before_write(self, book, chapter): + from lxml import etree, html + + from pygments import highlight + from pygments.formatters import HtmlFormatter + + from ebooklib import epub + + try: + tree = parse_html_string(chapter.content) + except: + return + + root = tree.getroottree() + + had_source = False + + if len(root.find('body')) != 0: + body = tree.find('body') + # check for embeded source + for source in body.xpath('//pre[contains(@class,"source-")]'): + css_class = source.get('class') + + source_text = (source.text or '') + ''.join([html.tostring(child) for child in source.iterchildren()]) + + if 'source-python' in css_class: + from pygments.lexers import PythonLexer + +# _text = highlight(source_text, PythonLexer(), HtmlFormatter(linenos="inline")) + _text = highlight(source_text, PythonLexer(), HtmlFormatter()) + + if 'source-css' in css_class: + from pygments.lexers import CssLexer + + _text = highlight(source_text, CssLexer(), HtmlFormatter()) + + _parent = source.getparent() + _parent.replace(source, etree.XML(_text)) + + had_source = True + + if had_source: + chapter.add_link(href="style/code.css", rel="stylesheet", type="text/css") + chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8') + diff --git a/ebooklib/plugins/standard.py b/ebooklib/plugins/standard.py new file mode 100644 index 0000000..61576f9 --- /dev/null +++ b/ebooklib/plugins/standard.py @@ -0,0 +1,230 @@ +# This file is part of EbookLib. +# Copyright (c) 2013 Aleksandar Erkalovic +# +# EbookLib is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# EbookLib is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with EbookLib. If not, see . + +import six + +from ebooklib.plugins.base import BasePlugin +from ebooklib.utils import parse_html_string + +# TODO: +# - should also look for the _required_ elements +# http://www.w3.org/html/wg/drafts/html/master/tabular-data.html#the-table-element + +ATTRIBUTES_GLOBAL = ['accesskey', 'class', 'contenteditable', 'contextmenu', 'dir', 'draggable', + 'dropzone', 'hidden', 'id', 'inert', 'itemid', 'itemprop', 'itemref', + 'itemscope', 'itemtype', 'lang', 'spellcheck', 'style', 'tabindex', + 'title', 'translate', 'epub:type'] + +# Remove for now from here +DEPRECATED_TAGS = ['acronym', 'applet', 'basefont', 'big', 'center', 'dir', 'font', 'frame', + 'frameset', 'isindex', 'noframes', 's', 'strike', 'tt'] + + +def leave_only(item, tag_list): + for _attr in six.iterkeys(item.attrib): + if _attr not in tag_list: + del item.attrib[_attr] + + +class SyntaxPlugin(BasePlugin): + NAME = 'Check HTML syntax' + + def html_before_write(self, book, chapter): + from lxml import etree + + try: + tree = parse_html_string(chapter.content) + except: + return + + root = tree.getroottree() + + # delete deprecated tags + # i should really have a list of allowed tags + for tag in DEPRECATED_TAGS: + etree.strip_tags(root, tag) + + head = tree.find('head') + + if head is not None and len(head) != 0: + + for _item in head: + if _item.tag == 'base': + leave_only(_item, ATTRIBUTES_GLOBAL + ['href', 'target']) + elif _item.tag == 'link': + leave_only(_item, ATTRIBUTES_GLOBAL + ['href', 'crossorigin', 'rel', 'media', 'hreflang', 'type', 'sizes']) + elif _item.tag == 'title': + if _item.text == '': + head.remove(_item) + elif _item.tag == 'meta': + leave_only(_item, ATTRIBUTES_GLOBAL + ['name', 'http-equiv', 'content', 'charset']) + # just remove for now, but really should not be like this + head.remove(_item) + elif _item.tag == 'script': + leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'type', 'charset', 'async', 'defer', 'crossorigin']) + elif _item.tag == 'source': + leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'type', 'media']) + elif _item.tag == 'style': + leave_only(_item, ATTRIBUTES_GLOBAL + ['media', 'type', 'scoped']) + else: + leave_only(_item, ATTRIBUTES_GLOBAL) + + + if len(root.find('body')) != 0: + body = tree.find('body') + + for _item in body.iter(): + # it is not + # + + if _item.tag == 'a': + leave_only(_item, ATTRIBUTES_GLOBAL + ['href', 'target', 'download', 'rel', 'hreflang', 'type']) + elif _item.tag == 'area': + leave_only(_item, ATTRIBUTES_GLOBAL + ['alt', 'coords', 'shape', 'href', 'target', 'download', 'rel', 'hreflang', 'type']) + elif _item.tag == 'audio': + leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'crossorigin', 'preload', 'autoplay', 'mediagroup', 'loop', 'muted', 'controls']) + elif _item.tag == 'blockquote': + leave_only(_item, ATTRIBUTES_GLOBAL + ['cite']) + elif _item.tag == 'button': + leave_only(_item, ATTRIBUTES_GLOBAL + ['autofocus', 'disabled', 'form', 'formaction', 'formenctype', 'formmethod', 'formnovalidate', + 'formtarget', 'name', 'type', 'value', 'menu']) + elif _item.tag == 'canvas': + leave_only(_item, ATTRIBUTES_GLOBAL + ['width', 'height']) + elif _item.tag == 'canvas': + leave_only(_item, ATTRIBUTES_GLOBAL + ['width', 'height']) + elif _item.tag == 'del': + leave_only(_item, ATTRIBUTES_GLOBAL + ['cite', 'datetime']) + elif _item.tag == 'details': + leave_only(_item, ATTRIBUTES_GLOBAL + ['open']) + elif _item.tag == 'embed': + leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'type', 'width', 'height']) + elif _item.tag == 'fieldset': + leave_only(_item, ATTRIBUTES_GLOBAL + ['disable', 'form', 'name']) + elif _item.tag == 'details': + leave_only(_item, ATTRIBUTES_GLOBAL + ['accept-charset', 'action', 'autocomplete', 'enctype', 'method', 'name', 'novalidate', 'target']) + elif _item.tag == 'iframe': + leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'srcdoc', 'name', 'sandbox', 'seamless', 'allowfullscreen', 'width', 'height']) + elif _item.tag == 'img': + _src = _item.get('src', '').lower() + if _src.startswith('http://') or _src.startswith('https://'): + if 'remote-resources' not in chapter.properties: + chapter.properties.append('remote-resources') + # THIS DOES NOT WORK, ONLY VIDEO AND AUDIO FILES CAN BE REMOTE RESOURCES + # THAT MEANS I SHOULD ALSO CATCH +# +# EbookLib is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# EbookLib is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with EbookLib. If not, see . + +import six +import subprocess + +from ebooklib.plugins.base import BasePlugin +from ebooklib.utils import parse_html_string + +# Recommend usage of +# - https://github.com/w3c/tidy-html5 + +def tidy_cleanup(content, **extra): + cmd = [] + + for k, v in six.iteritems(extra): + + if v: + cmd.append('--%s' % k) + cmd.append(v) + else: + cmd.append('-%s' % k) + + # must parse all other extra arguments + try: + p = subprocess.Popen(['tidy']+cmd, shell=False, + stdin=subprocess.PIPE, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, close_fds=True) + except OSError: + return (3, None) + + p.stdin.write(content) + + (cont, p_err) = p.communicate() + + # 0 - all ok + # 1 - there were warnings + # 2 - there were errors + # 3 - exception + + return (p.returncode, cont) + + +class TidyPlugin(BasePlugin): + NAME = 'Tidy HTML' + OPTIONS = {'char-encoding': 'utf8', + 'tidy-mark': 'no' + } + + def __init__(self, extra = {}): + self.options = dict(self.OPTIONS) + self.options.update(extra) + + def html_before_write(self, book, chapter): + if not chapter.content: + return None + + (_, chapter.content) = tidy_cleanup(chapter.content, **self.options) + + return chapter.content + + def html_after_read(self, book, chapter): + if not chapter.content: + return None + + (_, chapter.content) = tidy_cleanup(chapter.content, **self.options) + + return chapter.content + diff --git a/ebooklib/utils.py b/ebooklib/utils.py new file mode 100644 index 0000000..162f4c9 --- /dev/null +++ b/ebooklib/utils.py @@ -0,0 +1,60 @@ +# This file is part of EbookLib. +# Copyright (c) 2013 Aleksandar Erkalovic +# +# EbookLib is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# EbookLib is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with EbookLib. If not, see . + +import io +import mimetypes + +from lxml import etree + + +mimetype_initialised = False + + +def debug(obj): + import pprint + + pp = pprint.PrettyPrinter(indent=4) + pp.pprint(obj) + + +def parse_string(s): + try: + tree = etree.parse(io.BytesIO(s.encode('utf-8'))) + except: + tree = etree.parse(io.BytesIO(s)) + + return tree + + +def parse_html_string(s): + from lxml import html + + utf8_parser = html.HTMLParser(encoding='utf-8') + + html_tree = html.document_fromstring(s, parser=utf8_parser) + + return html_tree + + +def guess_type(extenstion): + global mimetype_initialised + + if not mimetype_initialised: + mimetypes.init() + mimetypes.add_type('application/xhtml+xml', '.xhtml') + mimetype_initialised = True + + return mimetypes.guess_type(extenstion)