Incorporate ebooklib from https://github.com/aerkalov/ebooklib
This commit is contained in:
42
ebooklib/__init__.py
Normal file
42
ebooklib/__init__.py
Normal file
@@ -0,0 +1,42 @@
|
||||
# This file is part of EbookLib.
|
||||
# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
|
||||
#
|
||||
# EbookLib is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# EbookLib is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with EbookLib. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
# Library version as a (major, minor, patch) tuple
VERSION = (0, 16, 0)

# Identifiers for the kinds of items a book can contain (0 through 9)
(
    ITEM_UNKNOWN,
    ITEM_IMAGE,
    ITEM_STYLE,
    ITEM_SCRIPT,
    ITEM_NAVIGATION,
    ITEM_VECTOR,
    ITEM_FONT,
    ITEM_VIDEO,
    ITEM_AUDIO,
    ITEM_DOCUMENT,
) = range(10)

# Maps an item type to the file extensions associated with it.
# ITEM_UNKNOWN and ITEM_DOCUMENT have no entry here.
EXTENSIONS = {
    ITEM_IMAGE: ['.jpg', '.jpeg', '.gif', '.tiff', '.tif', '.png'],
    ITEM_STYLE: ['.css'],
    ITEM_VECTOR: ['.svg'],
    ITEM_FONT: ['.otf', '.woff', '.ttf'],
    ITEM_SCRIPT: ['.js'],
    ITEM_NAVIGATION: ['.ncx'],
    ITEM_VIDEO: ['.mov', '.mp4', '.avi'],
    ITEM_AUDIO: ['.mp3', '.ogg'],
}
|
1595
ebooklib/epub.py
Normal file
1595
ebooklib/epub.py
Normal file
File diff suppressed because it is too large
Load Diff
0
ebooklib/plugins/__init__.py
Normal file
0
ebooklib/plugins/__init__.py
Normal file
49
ebooklib/plugins/base.py
Normal file
49
ebooklib/plugins/base.py
Normal file
@@ -0,0 +1,49 @@
|
||||
# This file is part of EbookLib.
|
||||
# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
|
||||
#
|
||||
# EbookLib is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# EbookLib is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with EbookLib. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
class BasePlugin(object):
    """Base class for EbookLib plugins.

    Every hook defined here is a no-op that returns True. Subclasses
    override only the hooks they need.
    """

    def before_write(self, book):
        """Process the book before it is saved."""
        return True

    def after_write(self, book):
        """Process the book after it has been saved."""
        return True

    def before_read(self, book):
        """Process the book before it is read."""
        return True

    def after_read(self, book):
        """Process the book after it has been read."""
        return True

    def item_after_read(self, book, item):
        """Process a general item after it has been read."""
        return True

    def item_before_write(self, book, item):
        """Process a general item before it is written."""
        return True

    def html_after_read(self, book, chapter):
        """Process a chapter's HTML after it has been read."""
        return True

    def html_before_write(self, book, chapter):
        """Process a chapter's HTML before it is saved."""
        return True
|
119
ebooklib/plugins/booktype.py
Normal file
119
ebooklib/plugins/booktype.py
Normal file
@@ -0,0 +1,119 @@
|
||||
# This file is part of EbookLib.
|
||||
# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
|
||||
#
|
||||
# EbookLib is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# EbookLib is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with EbookLib. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from ebooklib.plugins.base import BasePlugin
|
||||
from ebooklib.utils import parse_html_string
|
||||
|
||||
class BooktypeLinks(BasePlugin):
    """Rewrite the links of a Booktype chapter before it is written.

    For every ``<a>`` element:

    * footnote links (href containing ``InsertNoteID``) are reduced to
      their ``#fragment`` part;
    * links without a URL scheme get ``.xhtml`` appended to their path,
      and the original fragment is re-attached;
    * on those scheme-less links a ``name`` attribute is moved to ``id``.
    """

    NAME = 'Booktype Links'

    def __init__(self, booktype_book):
        # Stored for subclasses/callers; not read by the hook below.
        self.booktype_book = booktype_book

    def html_before_write(self, book, chapter):
        """Rewrite links in ``chapter.content`` and store the serialized tree.

        Returns None. If the content cannot be parsed the chapter is left
        untouched.
        """
        from lxml import etree

        try:
            from urlparse import urlparse, urljoin
        except ImportError:
            from urllib.parse import urlparse, urljoin

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            # Best effort: unparsable content is left as it is.
            return

        body = tree.find('body')

        # A document without a <body> used to crash on len(None).
        if body is not None and len(body) != 0:
            # TODO: should also be aware to handle
            #   ../chapter/
            #   ../chapter/#reference
            #   ../chapter#reference

            for _link in body.xpath('//a'):
                href = _link.get('href', '')

                # This is just temporary for the footnotes
                if 'InsertNoteID' in href:
                    hash_pos = href.find('#')

                    # Only cut when a fragment is present; slicing from -1
                    # would reduce the href to its last character.
                    if hash_pos != -1:
                        _link.set('href', href[hash_pos:])

                    continue

                _u = urlparse(href)

                # Let us care only for internal links at the moment
                if _u.scheme == '':
                    if _u.path != '':
                        _link.set('href', '%s.xhtml' % _u.path)

                    if _u.fragment != '':
                        _link.set('href', urljoin(_link.get('href'), '#%s' % _u.fragment))

                    if _link.get('name') is not None:
                        _link.set('id', _link.get('name'))
                        etree.strip_attributes(_link, 'name')

        chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8')
|
||||
|
||||
|
||||
|
||||
|
||||
class BooktypeFootnotes(BasePlugin):
    """Convert Booktype footnote markup into EPUB3 ``noteref``/``footnote`` markup.

    Booktype input::

        <span id="InsertNoteID_1_marker1" class="InsertNoteMarker"><sup><a href="#InsertNoteID_1">1</a></sup><span>
        <ol id="InsertNote_NoteList"><li id="InsertNoteID_1">text ...</li></ol>

    EPUB3 output::

        <a epub:type="noteref" href="#n1">1</a>
        <aside epub:type="footnote" id="n1"><p>text</p></aside>
    """

    NAME = 'Booktype Footnotes'

    def __init__(self, booktype_book):
        # Stored for subclasses/callers; not read by the hook below.
        self.booktype_book = booktype_book

    def html_before_write(self, book, chapter):
        """Rewrite footnotes in ``chapter.content`` and store the serialized tree.

        Returns None. If the content cannot be parsed the chapter is left
        untouched. Markers that do not have the expected structure, or
        whose note text cannot be found, are skipped.
        """
        from lxml import etree

        from ebooklib import epub

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            # Best effort: unparsable content is left as it is.
            return

        body = tree.find('body')

        # A document without a <body> used to crash on len(None).
        if body is not None and len(body) != 0:
            epub_type = '{%s}type' % epub.NAMESPACES['EPUB']

            for footnote in body.xpath('//span[@class="InsertNoteMarker"]'):
                marker_id = footnote.get('id')

                if not marker_id:
                    continue

                # "InsertNoteID_1_marker1" -> "InsertNoteID_1". Splitting on
                # the separator also works for marker numbers >= 10; the
                # fixed 8-character cut is kept as the fallback.
                if '_marker' in marker_id:
                    footnote_id = marker_id.rpartition('_marker')[0]
                else:
                    footnote_id = marker_id[:-8]

                try:
                    # first child of the first child: <span><sup><a>
                    a = footnote[0][0]
                except IndexError:
                    continue

                # XPath variable instead of string formatting, so quotes in
                # the id cannot break the expression.
                notes = body.xpath('//li[@id=$note_id]', note_id=footnote_id)

                if not notes:
                    continue

                footnote_text = notes[0]

                a.attrib[epub_type] = 'noteref'

                ftn = etree.SubElement(body, 'aside', {'id': footnote_id})
                ftn.attrib[epub_type] = 'footnote'
                ftn_p = etree.SubElement(ftn, 'p')
                ftn_p.text = footnote_text.text

            old_footnote = body.xpath('//ol[@id="InsertNote_NoteList"]')

            if len(old_footnote) > 0:
                # Remove through the real parent: the list is not
                # necessarily a direct child of <body>.
                old_footnote[0].getparent().remove(old_footnote[0])

        chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8')
|
68
ebooklib/plugins/sourcecode.py
Normal file
68
ebooklib/plugins/sourcecode.py
Normal file
@@ -0,0 +1,68 @@
|
||||
# This file is part of EbookLib.
|
||||
# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
|
||||
#
|
||||
# EbookLib is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# EbookLib is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with EbookLib. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from ebooklib.plugins.base import BasePlugin
|
||||
from ebooklib.utils import parse_html_string
|
||||
|
||||
class SourceHighlighter(BasePlugin):
    """Syntax-highlight embedded source code with Pygments.

    ``<pre>`` elements whose class contains ``source-python`` or
    ``source-css`` are replaced by the highlighted markup Pygments
    generates. Other ``source-*`` classes are left untouched.
    """

    def __init__(self):
        pass

    def html_before_write(self, book, chapter):
        """Highlight source blocks in ``chapter.content``.

        When at least one block was highlighted, a link to
        ``style/code.css`` is added to the chapter and the serialized tree
        is stored back in ``chapter.content``. Returns None.
        """
        from lxml import etree, html

        from pygments import highlight
        from pygments.formatters import HtmlFormatter
        from pygments.lexers import CssLexer, PythonLexer

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            # Best effort: unparsable content is left as it is.
            return

        had_source = False

        body = tree.find('body')

        # A document without a <body> used to crash on len(None).
        if body is not None and len(body) != 0:
            # check for embedded source
            for source in body.xpath('//pre[contains(@class,"source-")]'):
                css_class = source.get('class')

                # Same precedence as before: a css class wins over python
                # when both are present.
                lexer = None

                if 'source-python' in css_class:
                    lexer = PythonLexer()

                if 'source-css' in css_class:
                    lexer = CssLexer()

                if lexer is None:
                    # Unsupported language: leave the block alone instead of
                    # failing (or reusing the previous block's output).
                    continue

                # encoding='unicode' gives text on Python 2 and 3, so it can
                # be joined with source.text (plain tostring() returns bytes).
                source_text = (source.text or '') + ''.join(
                    [html.tostring(child, encoding='unicode') for child in source.iterchildren()])

                highlighted = etree.XML(highlight(source_text, lexer, HtmlFormatter()))

                # lxml moves the tail text along with the replaced element;
                # carry it over so the text after the <pre> is not lost.
                highlighted.tail = source.tail

                source.getparent().replace(source, highlighted)

                had_source = True

        if had_source:
            chapter.add_link(href="style/code.css", rel="stylesheet", type="text/css")
            chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8')
|
||||
|
230
ebooklib/plugins/standard.py
Normal file
230
ebooklib/plugins/standard.py
Normal file
@@ -0,0 +1,230 @@
|
||||
# This file is part of EbookLib.
|
||||
# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
|
||||
#
|
||||
# EbookLib is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# EbookLib is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with EbookLib. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import six
|
||||
|
||||
from ebooklib.plugins.base import BasePlugin
|
||||
from ebooklib.utils import parse_html_string
|
||||
|
||||
# TODO:
|
||||
# - should also look for the _required_ elements
|
||||
# http://www.w3.org/html/wg/drafts/html/master/tabular-data.html#the-table-element
|
||||
|
||||
# Attribute names that are kept on every element
ATTRIBUTES_GLOBAL = [
    'accesskey',
    'class',
    'contenteditable',
    'contextmenu',
    'dir',
    'draggable',
    'dropzone',
    'hidden',
    'id',
    'inert',
    'itemid',
    'itemprop',
    'itemref',
    'itemscope',
    'itemtype',
    'lang',
    'spellcheck',
    'style',
    'tabindex',
    'title',
    'translate',
    'epub:type',
]

# Tags whose markup is stripped from the document.
# Remove <u> for now from here
DEPRECATED_TAGS = [
    'acronym',
    'applet',
    'basefont',
    'big',
    'center',
    'dir',
    'font',
    'frame',
    'frameset',
    'isindex',
    'noframes',
    's',
    'strike',
    'tt',
]
|
||||
|
||||
|
||||
def leave_only(item, tag_list):
    """Delete every attribute of ``item`` whose name is not in ``tag_list``.

    ``item`` is any object with a mapping-like ``attrib`` (for example an
    lxml element). ``tag_list`` is the collection of attribute names to
    keep. The mapping is modified in place; nothing is returned.
    """
    attrib = item.attrib

    # Snapshot the names first: deleting from a mapping while iterating its
    # live key iterator raises RuntimeError for dict-like attribs.
    for _attr in list(attrib.keys()):
        if _attr not in tag_list:
            del attrib[_attr]
|
||||
|
||||
|
||||
class SyntaxPlugin(BasePlugin):
    """Clean up chapter HTML before it is written.

    The plugin strips the tags listed in ``DEPRECATED_TAGS``, removes
    ``<meta>`` and empty ``<title>`` elements from the head, and deletes
    every attribute that is neither in ``ATTRIBUTES_GLOBAL`` nor in the
    per-element lists below.
    """

    NAME = 'Check HTML syntax'

    # Attributes allowed on children of <head>, on top of ATTRIBUTES_GLOBAL.
    _HEAD_ATTRIBUTES = {
        'base': ['href', 'target'],
        'link': ['href', 'crossorigin', 'rel', 'media', 'hreflang', 'type', 'sizes'],
        'script': ['src', 'type', 'charset', 'async', 'defer', 'crossorigin'],
        'source': ['src', 'type', 'media'],
        'style': ['media', 'type', 'scoped'],
    }

    # Attributes allowed on elements inside <body>, on top of
    # ATTRIBUTES_GLOBAL. Elements not listed keep only the global attributes.
    _BODY_ATTRIBUTES = {
        # Not handled: <a class="indexterm" href="ch05.html#ix_epub:trigger_element">
        'a': ['href', 'target', 'download', 'rel', 'hreflang', 'type'],
        'area': ['alt', 'coords', 'shape', 'href', 'target', 'download', 'rel', 'hreflang', 'type'],
        'audio': ['src', 'crossorigin', 'preload', 'autoplay', 'mediagroup', 'loop', 'muted', 'controls'],
        'blockquote': ['cite'],
        'button': ['autofocus', 'disabled', 'form', 'formaction', 'formenctype', 'formmethod',
                   'formnovalidate', 'formtarget', 'name', 'type', 'value', 'menu'],
        'canvas': ['width', 'height'],
        'del': ['cite', 'datetime'],
        'details': ['open'],
        'embed': ['src', 'type', 'width', 'height'],
        # was 'disable', which matched no real attribute
        'fieldset': ['disabled', 'form', 'name'],
        # was a second, unreachable 'details' branch
        'form': ['accept-charset', 'action', 'autocomplete', 'enctype', 'method', 'name',
                 'novalidate', 'target'],
        'iframe': ['src', 'srcdoc', 'name', 'sandbox', 'seamless', 'allowfullscreen', 'width', 'height'],
        'img': ['alt', 'src', 'crossorigin', 'usemap', 'ismap', 'width', 'height'],
        # 'step' and 'type' used to be fused into 'steptype' by a missing comma
        'input': ['accept', 'alt', 'autocomplete', 'autofocus', 'checked', 'dirname',
                  'disabled', 'form', 'formaction', 'formenctype', 'formmethod', 'formnovalidate',
                  'formtarget', 'height', 'inputmode', 'list', 'max', 'maxlength', 'min', 'multiple',
                  'name', 'pattern', 'placeholder', 'readonly', 'required', 'size', 'src', 'step',
                  'type', 'value', 'width'],
        'ins': ['cite', 'datetime'],
        'keygen': ['autofocus', 'challenge', 'disabled', 'form', 'keytype', 'name'],
        'label': ['form', 'for'],
        'map': ['name'],
        'menu': ['type', 'label'],
        'object': ['data', 'type', 'typemustmatch', 'name', 'usemap', 'form', 'width', 'height'],
        'ol': ['reversed', 'start', 'type'],
        'optgroup': ['disabled', 'label'],
        'option': ['disabled', 'label', 'selected', 'value'],
        'output': ['for', 'form', 'name'],
        'param': ['name', 'value'],
        'progress': ['value', 'max'],
        'q': ['cite'],
        'select': ['autofocus', 'disabled', 'form', 'multiple', 'name', 'required', 'size'],
        'table': ['border', 'sortable'],
        'td': ['colspan', 'rowspan', 'headers'],
        'textarea': ['autocomplete', 'autofocus', 'cols', 'dirname', 'disabled', 'form',
                     'inputmode', 'maxlength', 'name', 'placeholder', 'readonly', 'required',
                     'rows', 'wrap'],
        'col': ['span'],
        'colgroup': ['span'],
        'th': ['colspan', 'rowspan', 'headers', 'scope', 'abbr', 'sorted'],
        'time': ['datetime'],
        'track': ['kind', 'src', 'srclang', 'label', 'default'],
        'video': ['src', 'crossorigin', 'poster', 'preload', 'autoplay', 'mediagroup',
                  'loop', 'muted', 'controls', 'width', 'height'],
    }

    def html_before_write(self, book, chapter):
        """Clean ``chapter.content`` and store the serialized result.

        Returns the new ``chapter.content`` (UTF-8 bytes with an XML
        declaration), or None when the content cannot be parsed, in which
        case the chapter is left untouched.
        """
        from lxml import etree

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            # Best effort: unparsable content is left as it is.
            return

        root = tree.getroottree()

        # delete deprecated tags
        # i should really have a list of allowed tags
        etree.strip_tags(root, *DEPRECATED_TAGS)

        head = tree.find('head')

        if head is not None and len(head) != 0:
            self._clean_head(head)

        body = tree.find('body')

        # A document without a <body> used to crash on len(None).
        if body is not None and len(body) != 0:
            self._clean_body(etree, book, chapter, body)

        chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)

        return chapter.content

    def _clean_head(self, head):
        """Drop <meta> and empty <title>; strip unknown attributes elsewhere."""
        # Iterate over a snapshot: children are removed inside the loop, and
        # removing from the live element skips the following sibling.
        for _item in list(head):
            tag = _item.tag

            if tag == 'title':
                # lxml reports an empty element's text as None, not ''
                if not _item.text:
                    head.remove(_item)
            elif tag == 'meta':
                # just remove for now, but really should not be like this
                head.remove(_item)
            else:
                leave_only(_item, ATTRIBUTES_GLOBAL + self._HEAD_ATTRIBUTES.get(tag, []))

    def _clean_body(self, etree, book, chapter, body):
        """Strip disallowed attributes from ``body`` and all its descendants."""
        for _item in body.iter():
            tag = _item.tag

            if tag == 'dl':
                # Left untouched for now. It should really be normalised, see
                # http://html5doctor.com/the-dl-element/ (dl > dt, dd, dt, dd;
                # some of the elements can be missing).
                continue

            if tag == 'svg':
                self._clean_svg(chapter, _item)
                continue

            if tag == 'img':
                self._register_remote_image(book, chapter, _item)
            elif tag == 'table':
                self._normalise_table(etree, _item)

            leave_only(_item, ATTRIBUTES_GLOBAL + self._BODY_ATTRIBUTES.get(tag, []))

    def _register_remote_image(self, book, chapter, img):
        """Flag the chapter as using remote resources for an http(s) image."""
        _src = img.get('src', '').lower()

        if not _src.startswith(('http://', 'https://')):
            return

        # NOTE(review): the nesting was reconstructed from a listing without
        # indentation; the image is registered only the first time the
        # chapter is flagged. Confirm against the original file.
        if 'remote-resources' not in chapter.properties:
            chapter.properties.append('remote-resources')
            # THIS DOES NOT WORK, ONLY VIDEO AND AUDIO FILES CAN BE REMOTE RESOURCES
            # THAT MEANS I SHOULD ALSO CATCH <SOURCE TAG
            from ebooklib import epub

            _img = epub.EpubImage(file_name=img.get('src'))
            book.add_item(_img)

    def _normalise_table(self, etree, table):
        """Blank a border of '0' and turn ``summary`` into a <caption>."""
        if table.get('border', None):
            if table.get('border') == '0':
                table.set('border', '')

        if table.get('summary', None):
            # add it as caption
            _caption = etree.Element('caption', {})
            _caption.text = table.get('summary')
            table.insert(0, _caption)

            del table.attrib['summary']

    def _clean_svg(self, chapter, svg):
        """Mark the chapter as containing SVG and drop two lowercased attributes."""
        # We need to add property "svg" in case we have embedded svg file
        if 'svg' not in chapter.properties:
            chapter.properties.append('svg')

        for _name in ('viewbox', 'preserveaspectratio'):
            if svg.get(_name, None):
                del svg.attrib[_name]
|
82
ebooklib/plugins/tidyhtml.py
Normal file
82
ebooklib/plugins/tidyhtml.py
Normal file
@@ -0,0 +1,82 @@
|
||||
# This file is part of EbookLib.
|
||||
# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
|
||||
#
|
||||
# EbookLib is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# EbookLib is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with EbookLib. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import six
|
||||
import subprocess
|
||||
|
||||
from ebooklib.plugins.base import BasePlugin
|
||||
from ebooklib.utils import parse_html_string
|
||||
|
||||
# Recommend usage of
|
||||
# - https://github.com/w3c/tidy-html5
|
||||
|
||||
def tidy_cleanup(content, **extra):
    """Run ``content`` through the external ``tidy`` program.

    Each keyword argument becomes a command-line option: a truthy value
    gives ``--name value``, a falsy value gives the bare flag ``-name``.
    Text content is encoded as UTF-8 before being sent to tidy.

    Returns a ``(status, output)`` tuple where ``output`` is what tidy
    wrote to standard output and ``status`` is:

    * 0 - all ok
    * 1 - there were warnings
    * 2 - there were errors
    * 3 - tidy could not be started (``output`` is None)
    """
    cmd = ['tidy']

    for k, v in extra.items():
        if v:
            cmd.append('--%s' % k)
            cmd.append(v)
        else:
            cmd.append('-%s' % k)

    # The pipe carries bytes; encode text input instead of failing on write.
    if not isinstance(content, bytes):
        content = content.encode('utf-8')

    try:
        p = subprocess.Popen(cmd, shell=False,
                             stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE, close_fds=True)
    except OSError:
        return (3, None)

    # Feed the input through communicate(): writing all of stdin first can
    # deadlock once tidy blocks on a full stdout/stderr pipe. It also closes
    # stdin so tidy sees end of input.
    (cont, _p_err) = p.communicate(content)

    return (p.returncode, cont)
|
||||
|
||||
|
||||
class TidyPlugin(BasePlugin):
    """Clean chapter HTML with the external ``tidy`` program.

    The same cleanup runs after a chapter is read and before it is
    written.
    """

    NAME = 'Tidy HTML'

    # Default tidy options; the ``extra`` argument overrides them.
    OPTIONS = {'char-encoding': 'utf8',
               'tidy-mark': 'no'
               }

    def __init__(self, extra=None):
        """Store the default ``OPTIONS`` merged with the ``extra`` mapping."""
        self.options = dict(self.OPTIONS)

        if extra:
            self.options.update(extra)

    def _tidy(self, chapter):
        """Replace ``chapter.content`` with tidy's output, when there is any.

        Returns None for a chapter without content, otherwise the
        resulting ``chapter.content``.
        """
        if not chapter.content:
            return None

        (_, cleaned) = tidy_cleanup(chapter.content, **self.options)

        # tidy_cleanup returns None when tidy could not be started. Keep the
        # existing content rather than overwriting it with None or with
        # empty output.
        if cleaned:
            chapter.content = cleaned

        return chapter.content

    def html_before_write(self, book, chapter):
        """Tidy the chapter HTML before it is written."""
        return self._tidy(chapter)

    def html_after_read(self, book, chapter):
        """Tidy the chapter HTML after it has been read."""
        return self._tidy(chapter)
|
||||
|
60
ebooklib/utils.py
Normal file
60
ebooklib/utils.py
Normal file
@@ -0,0 +1,60 @@
|
||||
# This file is part of EbookLib.
|
||||
# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
|
||||
#
|
||||
# EbookLib is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# EbookLib is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with EbookLib. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import io
|
||||
import mimetypes
|
||||
|
||||
from lxml import etree
|
||||
|
||||
|
||||
# Set to True by guess_type() once it has initialised the mimetypes module
# and registered the '.xhtml' type, so that this is done only once.
mimetype_initialised = False
|
||||
|
||||
|
||||
def debug(obj):
    """Pretty-print ``obj`` to standard output with an indent of 4."""
    from pprint import PrettyPrinter

    PrettyPrinter(indent=4).pprint(obj)
|
||||
|
||||
|
||||
def parse_string(s):
    """Parse an XML document held in ``s`` and return the lxml tree.

    Text input is encoded as UTF-8 first; any other input (byte strings)
    is passed to the parser unchanged. Parse errors raised by
    ``etree.parse`` propagate to the caller.
    """
    # Decide by type instead of by catching exceptions: the old bare
    # ``except`` retried with the raw value after *any* failure, so a real
    # XML syntax error in text input resurfaced as a misleading TypeError.
    if isinstance(s, type(u'')):
        s = s.encode('utf-8')

    return etree.parse(io.BytesIO(s))
|
||||
|
||||
|
||||
def parse_html_string(s):
    """Parse the HTML document ``s`` with a UTF-8 lxml HTML parser.

    Returns whatever ``lxml.html.document_fromstring`` returns for it.
    """
    from lxml import html

    return html.document_fromstring(s, parser=html.HTMLParser(encoding='utf-8'))
|
||||
|
||||
|
||||
def guess_type(extenstion):
    """Return ``mimetypes.guess_type()`` for the given name.

    On the first call the ``mimetypes`` module is initialised and
    ``.xhtml`` is registered as ``application/xhtml+xml``.
    """
    global mimetype_initialised

    if mimetype_initialised:
        return mimetypes.guess_type(extenstion)

    # One-time setup, remembered in the module-level flag
    mimetypes.init()
    mimetypes.add_type('application/xhtml+xml', '.xhtml')
    mimetype_initialised = True

    return mimetypes.guess_type(extenstion)
|
Reference in New Issue
Block a user