Overhaul EPUB parsing and ToC generation

This commit is contained in:
BasioMeusPuga
2019-02-09 04:21:22 +05:30
parent 1e004774c9
commit e4be239bf0
4 changed files with 312 additions and 285 deletions

11
TODO
View File

@@ -36,6 +36,7 @@ TODO
Set focus to newly added file Set focus to newly added file
Reading: Reading:
✓ Drop down for TOC ✓ Drop down for TOC
✓ Treeview navigation for TOC
✓ Override the keypress event of the textedit ✓ Override the keypress event of the textedit
✓ Use format* icons for toolbar buttons ✓ Use format* icons for toolbar buttons
✓ Implement book view settings with a(nother) toolbar ✓ Implement book view settings with a(nother) toolbar
@@ -86,7 +87,6 @@ TODO
Have them save to memory Have them save to memory
✓ fb2 support ✓ fb2 support
✓ Images need to show up in their placeholders ✓ Images need to show up in their placeholders
djvu support
Other: Other:
✓ Define every widget in code ✓ Define every widget in code
Bugs: Bugs:
@@ -98,8 +98,11 @@ TODO
Better recursion needed for fb2 toc Better recursion needed for fb2 toc
Secondary: Secondary:
Additional Settings:
Disable progressbar - 20% book addition speed improvement
Disable cover loading when reading - Saves ~2M / book
Create covers for books without them - VERY SLOW
Special formatting for each chapter's title Special formatting for each chapter's title
Create covers for books without them
Signal end of chapter with some text Signal end of chapter with some text
Graphical themes Graphical themes
Change focus rectangle dimensions Change focus rectangle dimensions
@@ -108,7 +111,7 @@ TODO
Goodreads API: Ratings, Read, Recommendations Goodreads API: Ratings, Read, Recommendations
Get ISBN using python-isbnlib Get ISBN using python-isbnlib
Use embedded fonts + CSS Use embedded fonts + CSS
txt, doc, chm support txt, doc, chm, djvu support
Include icons for filetype emblems Include icons for filetype emblems
Comic view modes Comic view modes
Continuous paging Continuous paging
@@ -116,7 +119,7 @@ TODO
? Add only one file type if multiple are present ? Add only one file type if multiple are present
? Create emblem per filetype ? Create emblem per filetype
In application notifications In application notifications
Notification in case the filter is filtering out all files with no option in place Notification in case the filter is filtering out all files with no option in place
Option to fit images to viewport Option to fit images to viewport
Need help with: Need help with:

View File

@@ -29,14 +29,13 @@ class ParseEPUB:
# Maybe also include book description # Maybe also include book description
self.book_ref = None self.book_ref = None
self.book = None self.book = None
self.temp_dir = temp_dir
self.filename = filename self.filename = filename
self.extract_path = os.path.join(temp_dir, file_md5) self.extract_path = os.path.join(temp_dir, file_md5)
def read_book(self): def read_book(self):
self.book_ref = EPUB(self.filename) self.book_ref = EPUB(self.filename, self.temp_dir)
contents_found = self.book_ref.read_epub() self.book_ref.generate_metadata()
if not contents_found:
return False
self.book = self.book_ref.book self.book = self.book_ref.book
return True return True
@@ -61,14 +60,8 @@ class ParseEPUB:
def get_contents(self): def get_contents(self):
zipfile.ZipFile(self.filename).extractall(self.extract_path) zipfile.ZipFile(self.filename).extractall(self.extract_path)
self.book_ref.parse_toc() self.book_ref.generate_toc()
self.book_ref.parse_chapters(temp_dir=self.extract_path) self.book_ref.generate_content()
toc = []
content = []
for count, i in enumerate(self.book['book_list']):
toc.append((1, i[0], count + 1))
content.append(i[1])
# Return toc, content, images_only # Return toc, content, images_only
return toc, content, False return self.book['toc'], self.book['content'], False

View File

@@ -14,175 +14,333 @@
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
import os # TODO
import logging # See if inserting chapters not in the toc.ncx can be avoided
import zipfile # Missing file order is messed up
from urllib.parse import unquote # Account for stylesheets... eventually
# Everything needs logging
# Mobipocket files
import os
import zipfile
import logging
import collections
import xmltodict
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class EPUB: class EPUB:
def __init__(self, filename): def __init__(self, book_filename, temp_dir):
self.filename = filename self.book_filename = book_filename
self.temp_dir = temp_dir
self.zip_file = None self.zip_file = None
self.file_list = None
self.opf_dict = None
self.book = {} self.book = {}
self.generate_references()
def find_file(self, filename):
    """Resolve *filename* to the name of a member of the opened archive.

    The name is first looked up verbatim in ``self.file_list``; if that
    fails, any member with the same basename is accepted. Should both
    attempts fail, they are repeated with the percent-decoded form of
    the name, because references inside the opf / ncx may be
    percent-encoded while zip member names are not.

    Returns the matching member name, or ``False`` (after logging an
    error) when nothing matches.
    """
    # Imported here so that this method does not depend on the
    # module-level import block
    from urllib.parse import unquote

    # The verbatim name always goes first so that anything which
    # resolved before still resolves to the same member
    candidates = [filename]
    decoded_filename = unquote(filename)
    if decoded_filename != filename:
        candidates.append(decoded_filename)

    for candidate in candidates:
        # First, look for the file exactly as it was referenced
        if candidate in self.file_list:
            return candidate

        # Then, search for it elsewhere
        candidate_basename = os.path.basename(candidate)
        for member in self.file_list:
            if os.path.basename(member) == candidate_basename:
                return member

    # If the file isn't found
    logger.error(filename + ' not found')
    return False
def generate_references(self):
    """Open the book archive and parse its package (.opf) document.

    Sets ``self.zip_file``, ``self.file_list`` and ``self.opf_dict``.

    Raises:
        KeyError: if no package document can be located in the archive.
            This is the same exception type ``ZipFile.read`` raises
            for a missing member.
    """
    self.zip_file = zipfile.ZipFile(
        self.book_filename, mode='r', allowZip64=True)
    self.file_list = self.zip_file.namelist()

    # Book structure relies on parsing the .opf file
    # in the book. Now that might be the usual content.opf
    # or package.opf or it might be named after your favorite
    # eldritch abomination. The point is we have to check
    # the container.xml
    packagefile = False
    container = self.find_file('container.xml')
    if container:
        container_xml = self.zip_file.read(container)
        container_dict = xmltodict.parse(container_xml)
        rootfile = container_dict['container']['rootfiles']['rootfile']

        # xmltodict returns a list when an element is repeated.
        # Several <rootfile> entries mean several renditions - the
        # first one is used here
        if isinstance(rootfile, list):
            rootfile = rootfile[0]

        # Resolve through find_file so that a declared path which is
        # slightly off can still be matched by its basename
        packagefile = self.find_file(rootfile['@full-path'])
    else:
        presumptive_names = ('content.opf', 'package.opf')
        for i in presumptive_names:
            packagefile = self.find_file(i)
            if packagefile:
                break

    if not packagefile:
        # Fail with a readable message instead of handing False
        # to ZipFile.read
        raise KeyError(
            'No package (.opf) file found in ' + str(self.book_filename))

    packagefile_data = self.zip_file.read(packagefile)
    self.opf_dict = xmltodict.parse(packagefile_data)
def generate_toc(self):
    """Build ``self.book['toc']`` from the book's toc.ncx.

    Each entry is a list of ``[level, title, source]``. ``level`` is 1
    for top level navPoints and grows by one for every level of nesting.
    ``source`` is the ``src`` attribute exactly as written in the ncx
    (it may carry a ``#anchor``). Entries are in document order.

    When no toc.ncx exists in the archive, the toc is left empty.
    """
    self.book['toc'] = []

    # I'm currently going with the file always being named toc.ncx
    # But this is epub. The wild west of ebook formats.
    tocfile = self.find_file('toc.ncx')
    if not tocfile:
        # find_file has already logged the missing file. An empty toc
        # is a better outcome than passing False to ZipFile.read
        logger.error('No toc.ncx in ' + str(self.book_filename))
        return

    tocfile_data = self.zip_file.read(tocfile)
    toc_dict = xmltodict.parse(tocfile_data)

    def as_list(nav_node):
        # xmltodict returns a dict for a lone element, a list for
        # repeated ones, and nothing at all for an absent one
        if nav_node is None:
            return []
        if isinstance(nav_node, list):
            return nav_node
        return [nav_node]

    def recursor(level, nav_nodes):
        # Depth first, so that children directly follow their parent
        for nav_node in as_list(nav_nodes):
            self.book['toc'].append([
                level,
                nav_node['navLabel']['text'],
                nav_node['content']['@src']])
            recursor(level + 1, nav_node.get('navPoint'))

    # An empty <navMap/> is parsed as None
    nav_map = toc_dict['ncx']['navMap'] or {}
    recursor(1, nav_map.get('navPoint'))
def get_chapter_content(self, chapter_file):
    """Return the text of *chapter_file* as read from the archive.

    The file is located with ``self.find_file``. Its bytes are decoded
    as UTF-8; if that fails, UTF-16 is used when the data starts with a
    UTF-16 byte order mark, otherwise undecodable bytes are replaced so
    that one bad byte cannot abort the whole book.

    When the file cannot be located, *chapter_file* itself is returned.
    """
    this_file = self.find_file(chapter_file)
    if not this_file:
        logger.error('Not found: ' + chapter_file)
        return chapter_file

    chapter_data = self.zip_file.read(this_file)
    try:
        return chapter_data.decode('utf-8')
    except UnicodeDecodeError:
        pass

    # Only trust UTF-16 when a byte order mark says so - any other
    # even-length byte string would "decode" into garbage
    if chapter_data[:2] in (b'\xff\xfe', b'\xfe\xff'):
        try:
            return chapter_data.decode('utf-16')
        except UnicodeDecodeError:
            pass

    logger.error('Encoding errors in: ' + chapter_file)
    return chapter_data.decode('utf-8', errors='replace')
def parse_split_chapters(self, chapters_with_split_content):
self.book['split_chapters'] = {} self.book['split_chapters'] = {}
def read_epub(self): # For split chapters, get the whole chapter first, then split
# This is the function that should error out in # between ids using their anchors, then "heal" the resultant text
# case the module cannot process the file # by creating a BeautifulSoup object. Write its str to the content
try: for i in chapters_with_split_content.items():
self.load_zip() chapter_file = i[0]
contents_path = self.get_file_path( self.book['split_chapters'][chapter_file] = {}
None, True)
if not contents_path: chapter_content = self.get_chapter_content(chapter_file)
return False # No (valid) opf was found so processing cannot continue soup = BeautifulSoup(chapter_content, 'lxml')
self.generate_book_metadata(contents_path) split_anchors = i[1]
except: # Not specifying an exception type here may be justified for this_anchor in reversed(split_anchors):
return False this_tag = soup.find(
attrs={"id":lambda x: x == this_anchor})
return True markup_split = str(soup).split(str(this_tag))
soup = BeautifulSoup(markup_split[0], 'lxml')
this_markup = BeautifulSoup(
str(this_tag) + markup_split[1], 'lxml')
def load_zip(self): self.book['split_chapters'][chapter_file][this_anchor] = str(this_markup)
try:
self.zip_file = zipfile.ZipFile(
self.filename, mode='r', allowZip64=True)
except (KeyError, AttributeError, zipfile.BadZipFile):
logger.error('Malformed zip file ' + self.filename)
return
def parse_xml(self, filename, parser): # Remaining markup is assigned here
try: self.book['split_chapters'][chapter_file]['top_level'] = str(soup)
this_xml = self.zip_file.read(filename).decode()
except KeyError:
short_filename = os.path.basename(self.filename)
warning_string = f'{str(filename)} not found in {short_filename}'
logger.warning(warning_string)
return
root = BeautifulSoup(this_xml, parser) def generate_content(self):
return root # Find all the chapters mentioned in the opf spine
# These are simply ids that correspond to the actual item
# as mentioned in the manifest - which is a comprehensive
# list of files
chapters_in_spine = [
i['@idref']
for i in self.opf_dict['package']['spine']['itemref']]
def get_file_path(self, filename, is_content_file=False): # Next, find items and ids from the manifest
# Use this to get the location of the content.opf file chapters_from_manifest = {
# And maybe some other file that has a more well formatted i['@id']: i['@href']
# idea of the TOC for i in self.opf_dict['package']['manifest']['item']}
# We're going to all this trouble because there really is
# no going forward without a toc
if is_content_file:
container_location = self.get_file_path('container.xml')
xml = self.parse_xml(container_location, 'xml')
if xml: # Finally, check which items are supposed to be in the spine
root_item = xml.find('rootfile') # on the basis of the id and change the toc accordingly
spine_final = []
for i in chapters_in_spine:
try:
spine_final.append(chapters_from_manifest.pop(i))
except KeyError:
pass
# TODO
# Check what happens in case missing chapters are either
# at the beginning or the end of the book
chapter_title = 1
toc_chapters = [i[2] for i in self.book['toc']]
last_valid_index = 0
for i in spine_final:
if not i in toc_chapters:
previous_chapter = spine_final[spine_final.index(i) - 1]
try: try:
return root_item.get('full-path') previous_chapter_toc_index = toc_chapters.index(previous_chapter)
except AttributeError: # In case of 2+ consecutive missing chapters
error_string = f'ePub module: {self.filename} has a malformed container.xml' last_valid_index = previous_chapter_toc_index
except ValueError:
last_valid_index += 1
self.book['toc'].insert(
last_valid_index + 1,
[1, str(chapter_title), i])
chapter_title += 1
# Parse split chapters as below
# They can be picked up during the iteration through the toc
chapters_with_split_content = {}
for i in self.book['toc']:
if '#' in i[2]:
this_split = i[2].split('#')
chapter = this_split[0]
anchor = this_split[1]
try:
chapters_with_split_content[chapter].append(anchor)
except KeyError:
chapters_with_split_content[chapter] = []
chapters_with_split_content[chapter].append(anchor)
self.parse_split_chapters(chapters_with_split_content)
# Now we iterate over the ToC as presented in the toc.ncx
# and add chapters to the content list
# In case a split chapter is encountered, get its content
# from the split_chapters dictionary
# What could possibly go wrong?
# The content list is separated from the toc list because
# the mupdf library returns its own toc a certain way and
# this keeps things uniform
split_chapters = self.book['split_chapters']
toc_copy = self.book['toc'][:]
self.book['content'] = []
# Put the book into the book
for count, i in enumerate(toc_copy):
chapter_file = i[2]
# Get split content according to its corresponding id attribute
if '#' in chapter_file:
this_split = chapter_file.split('#')
chapter_file_proper = this_split[0]
this_anchor = this_split[1]
try:
chapter_content = (
split_chapters[chapter_file_proper][this_anchor])
except KeyError:
chapter_content = 'Parse Error'
error_string = (
f'Error parsing {self.book_filename}: {chapter_file_proper}')
logger.error(error_string) logger.error(error_string)
return None
possible_filenames = ('content.opf', 'package.opf') # Get content that remained at the end of the pillaging above
for i in possible_filenames: elif chapter_file in split_chapters.keys():
presumptive_location = self.get_file_path(i) try:
if presumptive_location: chapter_content = split_chapters[chapter_file]['top_level']
return presumptive_location except KeyError:
chapter_content = 'Parse Error'
error_string = (
f'Error parsing {self.book_filename}: {chapter_file}')
logger.error(error_string)
for i in self.zip_file.filelist: # Vanilla non split chapters
if os.path.basename(i.filename) == os.path.basename(filename):
return i.filename
return None
def read_from_zip(self, filename):
filename = unquote(filename)
try:
file_data = self.zip_file.read(filename)
return file_data
except KeyError:
file_path_actual = self.get_file_path(filename)
if file_path_actual:
return self.zip_file.read(file_path_actual)
else: else:
logger.error('ePub module can\'t find ' + filename) chapter_content = self.get_chapter_content(chapter_file)
#______________________________________________________ # The count + 2 is an adjustment due to the cover being inserted below
self.book['toc'][count][2] = count + 2
self.book['content'].append(chapter_content)
def generate_book_metadata(self, contents_path): self.generate_book_cover()
self.book['title'] = os.path.splitext( if self.book['cover']:
os.path.basename(self.filename))[0] cover_path = os.path.join(
self.book['author'] = 'Unknown' self.temp_dir, os.path.basename(self.book_filename)) + '- cover'
self.book['isbn'] = None with open(cover_path, 'wb') as cover_temp:
self.book['tags'] = None cover_temp.write(self.book['cover'])
self.book['cover'] = None
self.book['toc_file'] = 'toc.ncx' # Overwritten if another one exists
# Parse XML self.book['toc'].insert(0, (1, 'Cover', 1))
xml = self.parse_xml(contents_path, 'xml') self.book['content'].insert(
0, (f'<center><img src="{cover_path}" alt="Cover"></center>'))
# Parse metadata def generate_metadata(self):
item_dict = { metadata = self.opf_dict['package']['metadata']
'title': 'title',
'author': 'creator',
'year': 'date'}
for i in item_dict.items(): # There are no exception types specified below
item = xml.find(i[1]) # This is on purpose and makes me long for the days
if item: # of simpler, happier things.
self.book[i[0]] = item.text
# Book title
try: try:
self.book['year'] = int(self.book['year'][:4]) self.book['title'] = metadata['dc:title']
except (TypeError, KeyError, IndexError, ValueError): if isinstance(self.book['title'], collections.OrderedDict):
self.book['title'] = metadata['dc:title']['#text']
except:
print('Title parse error')
self.book['title'] = os.path.splitext(
os.path.basename(self.book_filename))[0]
# Book author
try:
self.book['author'] = metadata['dc:creator']['#text']
except:
self.book['author'] = 'Unknown'
# Book year
try:
self.book['year'] = int(metadata['dc:date'][:4])
except:
self.book['year'] = 9999 self.book['year'] = 9999
# Get identifier # Book isbn
identifier_items = xml.find_all('identifier') self.book['isbn'] = None
for i in identifier_items: try:
scheme = i.get('scheme') for i in metadata['dc:identifier']:
try: if i['@opf:scheme'].lower() == 'isbn':
if scheme.lower() == 'isbn': self.book['isbn'] = i['#text']
self.book['isbn'] = i.text except:
except AttributeError: pass
self.book['isbn'] = None
# Tags # Book tags
tag_items = xml.find_all('subject') try:
tag_list = [i.text for i in tag_items] self.book['tags'] = metadata['dc:subject']
self.book['tags'] = tag_list except:
self.book['tags'] = []
# Get items # Book cover
self.book['content_dict'] = {} self.generate_book_cover()
all_items = xml.find_all('item')
for i in all_items:
media_type = i.get('media-type')
this_id = i.get('id')
if media_type == 'application/xhtml+xml' or media_type == 'text/html': def generate_book_cover(self):
self.book['content_dict'][this_id] = i.get('href') # This is separate because the book cover needs to
# be found and extracted both during addition / reading
if media_type == 'application/x-dtbncx+xml': self.book['cover'] = None
self.book['toc_file'] = i.get('href') try:
cover_image = [
# Cover image i['@href'] for i in self.opf_dict['package']['manifest']['item']
if 'cover' in this_id and media_type.split('/')[0] == 'image': if i['@media-type'].split('/')[0] == 'image' and
cover_href = i.get('href') 'cover' in i['@id']][0]
try: self.book['cover'] = self.zip_file.read(
self.book['cover'] = self.zip_file.read(cover_href) self.find_file(cover_image))
except KeyError: except:
# The cover cannot be found according to the pass
# path specified in the content reference
self.book['cover'] = self.zip_file.read(
self.get_file_path(cover_href))
# Find book cover the hard way
if not self.book['cover']: if not self.book['cover']:
# If no cover is located the conventional way,
# we go looking for the largest image in the book
biggest_image_size = 0 biggest_image_size = 0
biggest_image = None biggest_image = None
for j in self.zip_file.filelist: for j in self.zip_file.filelist:
@@ -192,139 +350,5 @@ class EPUB:
biggest_image_size = j.file_size biggest_image_size = j.file_size
if biggest_image: if biggest_image:
self.book['cover'] = self.read_from_zip(biggest_image) self.book['cover'] = self.zip_file.read(
else: self.find_file(biggest_image))
logger.error('No cover found for: ' + self.filename)
# Parse spine and arrange chapter paths acquired from the opf
# according to the order IN THE SPINE
spine_items = xml.find_all('itemref')
spine_order = []
for i in spine_items:
spine_order.append(i.get('idref'))
self.book['chapters_in_order'] = []
for i in spine_order:
chapter_path = self.book['content_dict'][i]
self.book['chapters_in_order'].append(chapter_path)
def parse_toc(self):
    """Collect chapter titles (and split points) from the toc file.

    Fills ``self.book['navpoint_dict']`` with a mapping of unquoted
    source file -> chapter title, and appends ``(anchor, title)`` tuples
    to ``self.book['split_chapters'][source_file]`` for every navPoint
    whose source carries a ``#anchor``.

    Nothing here decides chapter order; only names are gathered.
    """
    self.book['navpoint_dict'] = {}

    toc_location = self.book['toc_file']
    if toc_location:
        toc_location = self.get_file_path(toc_location)

    toc_xml = self.parse_xml(toc_location, 'xml')
    if not toc_xml:
        return

    for nav_point in toc_xml.find_all('navPoint'):
        title = nav_point.find('text').text
        source = nav_point.find('content').get('src')

        source_parts = source.split('#')
        source_file = unquote(source_parts[0])

        # More than one part means the source contained an anchor
        if len(source_parts) > 1:
            anchors = self.book['split_chapters'].setdefault(
                source_file, [])
            anchors.append((source_parts[1], title))

        self.book['navpoint_dict'][source_file] = title
def parse_chapters(self, temp_dir=None, split_large_xml=False):
    """Build ``self.book['book_list']`` as a list of (title, markup).

    Chapters are read in the order of ``self.book['chapters_in_order']``.

    Args:
        temp_dir: Directory the cover image is written to. When it is
            None no cover page is generated.
        split_large_xml: When True, every chapter is cut at its
            ``pagebreak`` elements and the pieces get numeric titles.

    NOTE(review): the nesting below was reconstructed from a flattened
    view of this function - confirm against the real file.
    """
    no_title_chapter = 0
    self.book['book_list'] = []

    for i in self.book['chapters_in_order']:
        chapter_data = self.read_from_zip(i).decode()

        if i in self.book['split_chapters'] and not split_large_xml:
            split_chapters = get_split_content(
                chapter_data, self.book['split_chapters'][i])
            self.book['book_list'].extend(split_chapters)

        elif split_large_xml:
            # https://stackoverflow.com/questions/14444732/how-to-split-a-html-page-to-multiple-pages-using-python-and-beautiful-soup
            markup = BeautifulSoup(chapter_data, 'xml')
            chapters = []
            pagebreaks = markup.find_all('pagebreak')

            def next_element(elem):
                # Walk the siblings until one with a name attribute
                # turns up; falls off the end (returning None) when
                # there are no siblings left
                while elem is not None:
                    elem = elem.next_sibling
                    if hasattr(elem, 'name'):
                        return elem

            for pbreak in pagebreaks:
                # Everything up to the next pagebreak is one chapter
                chapter = [str(pbreak)]
                elem = next_element(pbreak)
                while elem and elem.name != 'pagebreak':
                    chapter.append(str(elem))
                    elem = next_element(elem)
                chapters.append('\n'.join(chapter))

            for this_chapter in chapters:
                fallback_title = str(no_title_chapter)
                self.book['book_list'].append(
                    (fallback_title, this_chapter + ('<br/>' * 8)))
                no_title_chapter += 1

        else:
            try:
                self.book['book_list'].append(
                    (self.book['navpoint_dict'][i], chapter_data + ('<br/>' * 8)))
            except KeyError:
                # No title for this file in the toc
                fallback_title = str(no_title_chapter)
                self.book['book_list'].append(
                    (fallback_title, chapter_data))
                no_title_chapter += 1

    # The path is only computed when it is going to be used. With the
    # default temp_dir of None, os.path.join would raise a TypeError
    if self.book['cover'] and temp_dir is not None:
        cover_path = os.path.join(temp_dir, os.path.basename(self.filename)) + '- cover'
        with open(cover_path, 'wb') as cover_temp:
            cover_temp.write(self.book['cover'])

        # The first entry of the book is replaced by the cover page
        try:
            self.book['book_list'][0] = (
                'Cover', f'<center><img src="{cover_path}" alt="Cover"></center>')
        except IndexError:
            pass
def get_split_content(chapter_data, split_by):
    """Cut one chapter file into several chapters at the given anchors.

    Args:
        chapter_data: Markup of the whole chapter file.
        split_by: Sequence whose items hold the anchor at index 0 and
            the chapter title at index 1.

    Returns:
        A list of (title, markup) tuples.
    """
    anchors = [entry[0] for entry in split_by]
    titles = [entry[1] for entry in split_by]
    padding = '<br/>' * 8

    def heal(fragment):
        # Splitting on the anchor text can leave a dangling '">' at the
        # start of the fragment - drop its first occurrence
        return str(fragment).replace('"&gt;', '', 1) + padding

    remaining = BeautifulSoup(chapter_data, 'lxml').body.prettify()
    chapters = []

    for index, anchor in enumerate(anchors):
        pieces = remaining.split(anchor)
        leading = BeautifulSoup(pieces[0], 'lxml')

        # Since tags correspond to data following them, the first
        # chunk will be ignored
        # As will all empty chapters
        # Skipping also leaves the remaining markup as it is
        if index == 0 or leading.text in ('\n', ''):
            continue

        chapters.append((titles[index - 1], heal(leading)))
        remaining = ''.join(pieces[1:])

    # Whatever is left over belongs to the last title
    trailing = BeautifulSoup(remaining, 'lxml')
    chapters.append((titles[-1], heal(trailing)))
    return chapters

View File

@@ -150,6 +150,9 @@ class BookSorter:
i[0]: i[1] for i in all_hashes_and_paths} i[0]: i[1] for i in all_hashes_and_paths}
def database_entry_for_book(self, file_hash): def database_entry_for_book(self, file_hash):
# TODO
# This will probably look a whole lot better with a namedtuple
database_return = database.DatabaseFunctions( database_return = database.DatabaseFunctions(
self.database_path).fetch_data( self.database_path).fetch_data(
('Title', 'Author', 'Year', 'ISBN', 'Tags', ('Title', 'Author', 'Year', 'ISBN', 'Tags',
@@ -246,7 +249,10 @@ class BookSorter:
if cover_image_raw: if cover_image_raw:
cover_image = resize_image(cover_image_raw) cover_image = resize_image(cover_image_raw)
else: else:
cover_image = fetch_cover(title, author) # TODO
# Needs an option
# cover_image = fetch_cover(title, author)
cover_image = None
this_book[file_md5]['cover_image'] = cover_image this_book[file_md5]['cover_image'] = cover_image
this_book[file_md5]['addition_mode'] = self.addition_mode this_book[file_md5]['addition_mode'] = self.addition_mode
@@ -408,3 +414,4 @@ def fetch_cover(title, author):
except: except:
logger.error(f'Couldn\'t find cover for ' + title) logger.error(f'Couldn\'t find cover for ' + title)
return None