Improve EPUB parser compatibility and speed

Completely break MOBI parser
This commit is contained in:
BasioMeusPuga
2019-02-10 06:47:51 +05:30
parent e4be239bf0
commit c6e30b67ad
4 changed files with 135 additions and 51 deletions

3
TODO
View File

@@ -96,8 +96,11 @@ TODO
Bookmark name for a page that's not on the TOC or has nothing before Bookmark name for a page that's not on the TOC or has nothing before
Screen position still keeps jumping when inside a paragraph Screen position still keeps jumping when inside a paragraph
Better recursion needed for fb2 toc Better recursion needed for fb2 toc
Initial sort by author in tableview
Last column not filling up tableview
Secondary: Secondary:
Tab tooltip
Additional Settings: Additional Settings:
Disable progressbar - 20% book addition speed improvement Disable progressbar - 20% book addition speed improvement
Disable cover loading when reading - Saves ~2M / book Disable cover loading when reading - Saves ~2M / book

View File

@@ -63,5 +63,11 @@ class ParseEPUB:
self.book_ref.generate_toc() self.book_ref.generate_toc()
self.book_ref.generate_content() self.book_ref.generate_content()
toc = []
content = []
for count, i in enumerate(self.book['content']):
toc.append((i[0], i[1], count + 1))
content.append(i[2])
# Return toc, content, images_only # Return toc, content, images_only
return self.book['toc'], self.book['content'], False return toc, content, False

View File

@@ -55,10 +55,10 @@ class ParseMOBI:
self.epub_filepath = shutil.make_archive(zip_file, 'zip', zip_dir) self.epub_filepath = shutil.make_archive(zip_file, 'zip', zip_dir)
self.split_large_xml = True self.split_large_xml = True
self.book_ref = EPUB(self.epub_filepath) self.book_ref = EPUB(self.epub_filepath, self.temp_dir)
contents_found = self.book_ref.read_epub() self.book_ref.generate_metadata()
if not contents_found: self.book_ref.generate_toc()
return False self.book_ref.generate_content()
self.book = self.book_ref.book self.book = self.book_ref.book
return True return True
@@ -81,6 +81,7 @@ class ParseMOBI:
return self.book['tags'] return self.book['tags']
def get_contents(self): def get_contents(self):
return
extract_path = os.path.join(self.extract_dir) extract_path = os.path.join(self.extract_dir)
zipfile.ZipFile(self.epub_filepath).extractall(extract_path) zipfile.ZipFile(self.epub_filepath).extractall(extract_path)

View File

@@ -16,7 +16,6 @@
# TODO # TODO
# See if inserting chapters not in the toc.ncx can be avoided # See if inserting chapters not in the toc.ncx can be avoided
# Missing file order is messed up
# Account for stylesheets... eventually # Account for stylesheets... eventually
# Everything needs logging # Everything needs logging
# Mobipocket files # Mobipocket files
@@ -25,8 +24,10 @@ import os
import zipfile import zipfile
import logging import logging
import collections import collections
from urllib.parse import unquote
import xmltodict import xmltodict
from PyQt5 import QtGui
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -44,11 +45,14 @@ class EPUB:
self.generate_references() self.generate_references()
def find_file(self, filename): def find_file(self, filename):
# Get rid of special characters
filename = unquote(filename)
# First, look for the file in the root of the book # First, look for the file in the root of the book
if filename in self.file_list: if filename in self.file_list:
return filename return filename
# Then, search for it elsewhere # Then search for it elsewhere
else: else:
file_basename = os.path.basename(filename) file_basename = os.path.basename(filename)
for i in self.file_list: for i in self.file_list:
@@ -56,7 +60,7 @@ class EPUB:
return i return i
# If the file isn't found # If the file isn't found
logger.error(filename + ' not found') logging.error(filename + ' not found in ' + self.book_filename)
return False return False
def generate_references(self): def generate_references(self):
@@ -75,7 +79,7 @@ class EPUB:
container_dict = xmltodict.parse(container_xml) container_dict = xmltodict.parse(container_xml)
packagefile = container_dict['container']['rootfiles']['rootfile']['@full-path'] packagefile = container_dict['container']['rootfiles']['rootfile']['@full-path']
else: else:
presumptive_names = ('content.opf', 'package.opf') presumptive_names = ('content.opf', 'package.opf', 'volume.opf')
for i in presumptive_names: for i in presumptive_names:
packagefile = self.find_file(i) packagefile = self.find_file(i)
if packagefile: if packagefile:
@@ -85,11 +89,42 @@ class EPUB:
self.opf_dict = xmltodict.parse(packagefile_data) self.opf_dict = xmltodict.parse(packagefile_data)
def generate_toc(self): def generate_toc(self):
self.book['toc'] = [] self.book['content'] = []
# I'm currently going with the file always being named toc.ncx def find_alternative_toc():
# But this is epub. The wild west of ebook formats. toc_filename = None
tocfile = self.find_file('toc.ncx') toc_filename_alternative = None
manifest = self.opf_dict['package']['manifest']['item']
for i in manifest:
# Behold the burning hoops we're jumping through
if i['@id'] == 'ncx':
toc_filename = i['@href']
if ('ncx' in i['@id']) or ('toc' in i['@id']):
toc_filename_alternative = i['@href']
if toc_filename and toc_filename_alternative:
break
if not toc_filename:
if not toc_filename_alternative:
logger.error('No ToC found for: ' + self.book_filename)
else:
toc_filename = toc_filename_alternative
logger.info('Using alternate ToC for: ' + self.book_filename)
return toc_filename
# Find the toc.ncx file from the manifest
# EPUBs will name this file literally anything, so try
# a less stringent approach if the first one doesn't work
# The idea is to prioritize 'toc.ncx' since this should work
# for the vast majority of books
toc_filename = 'toc.ncx'
does_toc_exist = self.find_file(toc_filename)
if not does_toc_exist:
toc_filename = find_alternative_toc()
tocfile = self.find_file(toc_filename)
tocfile_data = self.zip_file.read(tocfile) tocfile_data = self.zip_file.read(tocfile)
toc_dict = xmltodict.parse(tocfile_data) toc_dict = xmltodict.parse(tocfile_data)
@@ -99,21 +134,30 @@ class EPUB:
level + 1, level + 1,
i['navLabel']['text'], i['navLabel']['text'],
i['content']['@src']] for i in nav_node] i['content']['@src']] for i in nav_node]
self.book['toc'].extend(these_contents) self.book['content'].extend(these_contents)
return return
if 'navPoint' in nav_node.keys(): if 'navPoint' in nav_node.keys():
recursor(level, nav_node['navPoint']) recursor(level, nav_node['navPoint'])
else: else:
self.book['toc'].append([ self.book['content'].append([
level + 1, level + 1,
nav_node['navLabel']['text'], nav_node['navLabel']['text'],
nav_node['content']['@src']]) nav_node['content']['@src']])
navpoints = toc_dict['ncx']['navMap']['navPoint'] navpoints = toc_dict['ncx']['navMap']['navPoint']
for top_level_nav in navpoints: for top_level_nav in navpoints:
self.book['toc'].append([ # Just one chapter
if isinstance(top_level_nav, str):
self.book['content'].append([
1,
navpoints['navLabel']['text'],
navpoints['content']['@src']])
break
# Multiple chapters
self.book['content'].append([
1, 1,
top_level_nav['navLabel']['text'], top_level_nav['navLabel']['text'],
top_level_nav['content']['@src']]) top_level_nav['content']['@src']])
@@ -124,10 +168,19 @@ class EPUB:
def get_chapter_content(self, chapter_file): def get_chapter_content(self, chapter_file):
this_file = self.find_file(chapter_file) this_file = self.find_file(chapter_file)
if this_file: if this_file:
return self.zip_file.read(this_file).decode() chapter_content = self.zip_file.read(this_file).decode()
# Generate a None return for a blank chapter
# These will be removed from the contents later
contentDocument = QtGui.QTextDocument(None)
contentDocument.setHtml(chapter_content)
contentText = contentDocument.toPlainText().replace('\n', '')
if contentText == '':
chapter_content = None
return chapter_content
else: else:
print('Not found: ' + chapter_file) return 'Possible parse error: ' + chapter_file
return chapter_file
def parse_split_chapters(self, chapters_with_split_content): def parse_split_chapters(self, chapters_with_split_content):
self.book['split_chapters'] = {} self.book['split_chapters'] = {}
@@ -149,10 +202,13 @@ class EPUB:
markup_split = str(soup).split(str(this_tag)) markup_split = str(soup).split(str(this_tag))
soup = BeautifulSoup(markup_split[0], 'lxml') soup = BeautifulSoup(markup_split[0], 'lxml')
this_markup = BeautifulSoup(
str(this_tag) + markup_split[1], 'lxml')
self.book['split_chapters'][chapter_file][this_anchor] = str(this_markup) # If the tag is None, it probably means the content is overlapping
# Skipping the insert is the way forward
if this_tag:
this_markup = BeautifulSoup(
str(this_tag).strip() + markup_split[1], 'lxml')
self.book['split_chapters'][chapter_file][this_anchor] = str(this_markup)
# Remaining markup is assigned here # Remaining markup is assigned here
self.book['split_chapters'][chapter_file]['top_level'] = str(soup) self.book['split_chapters'][chapter_file]['top_level'] = str(soup)
@@ -180,12 +236,11 @@ class EPUB:
except KeyError: except KeyError:
pass pass
# TODO
# Check what happens in case missing chapters are either
# at the beginning or the end of the book
chapter_title = 1 chapter_title = 1
toc_chapters = [i[2] for i in self.book['toc']] toc_chapters = [
last_valid_index = 0 unquote(i[2].split('#')[0]) for i in self.book['content']]
last_valid_index = -2 # Yes, but why?
for i in spine_final: for i in spine_final:
if not i in toc_chapters: if not i in toc_chapters:
previous_chapter = spine_final[spine_final.index(i) - 1] previous_chapter = spine_final[spine_final.index(i) - 1]
@@ -195,7 +250,8 @@ class EPUB:
last_valid_index = previous_chapter_toc_index last_valid_index = previous_chapter_toc_index
except ValueError: except ValueError:
last_valid_index += 1 last_valid_index += 1
self.book['toc'].insert(
self.book['content'].insert(
last_valid_index + 1, last_valid_index + 1,
[1, str(chapter_title), i]) [1, str(chapter_title), i])
chapter_title += 1 chapter_title += 1
@@ -203,7 +259,7 @@ class EPUB:
# Parse split chapters as below # Parse split chapters as below
# They can be picked up during the iteration through the toc # They can be picked up during the iteration through the toc
chapters_with_split_content = {} chapters_with_split_content = {}
for i in self.book['toc']: for i in self.book['content']:
if '#' in i[2]: if '#' in i[2]:
this_split = i[2].split('#') this_split = i[2].split('#')
chapter = this_split[0] chapter = this_split[0]
@@ -222,13 +278,8 @@ class EPUB:
# In case a split chapter is encountered, get its content # In case a split chapter is encountered, get its content
# from the split_chapters dictionary # from the split_chapters dictionary
# What could possibly go wrong? # What could possibly go wrong?
# The content list is separated from the toc list because
# the mupdf library returns its own toc a certain way and
# this keeps things uniform
split_chapters = self.book['split_chapters'] split_chapters = self.book['split_chapters']
toc_copy = self.book['toc'][:] toc_copy = self.book['content'][:]
self.book['content'] = []
# Put the book into the book # Put the book into the book
for count, i in enumerate(toc_copy): for count, i in enumerate(toc_copy):
@@ -263,9 +314,11 @@ class EPUB:
else: else:
chapter_content = self.get_chapter_content(chapter_file) chapter_content = self.get_chapter_content(chapter_file)
# The count + 2 is an adjustment due to the cover being inserted below self.book['content'][count][2] = chapter_content
self.book['toc'][count][2] = count + 2
self.book['content'].append(chapter_content) # Cleanup content by removing null chapters
self.book['content'] = [
i for i in self.book['content'] if i[2]]
self.generate_book_cover() self.generate_book_cover()
if self.book['cover']: if self.book['cover']:
@@ -274,51 +327,72 @@ class EPUB:
with open(cover_path, 'wb') as cover_temp: with open(cover_path, 'wb') as cover_temp:
cover_temp.write(self.book['cover']) cover_temp.write(self.book['cover'])
self.book['toc'].insert(0, (1, 'Cover', 1)) # There's probably some rationale to doing an insert here
# But a replacement seems... neater
self.book['content'].insert( self.book['content'].insert(
0, (f'<center><img src="{cover_path}" alt="Cover"></center>')) 0, (1, 'Cover', f'<center><img src="{cover_path}" alt="Cover"></center>'))
def generate_metadata(self): def generate_metadata(self):
metadata = self.opf_dict['package']['metadata'] metadata = self.opf_dict['package']['metadata']
def flattener(this_object):
    """Reduce an xmltodict metadata node to a single plain value.

    Handles the three shapes this helper is written for:
    a mapping carrying the value under '#text', a list of such
    mappings or of plain values (only the first entry is used),
    or a bare string.

    Returns the extracted value, or None for any other type.
    Raises KeyError if a mapping has no '#text' and IndexError on an
    empty list.
    """
    # Check against dict rather than OrderedDict: OrderedDict is a dict
    # subclass, so older xmltodict output still matches, while plain
    # dicts (returned by newer xmltodict releases) no longer fall
    # through and silently produce None
    if isinstance(this_object, dict):
        return this_object['#text']

    if isinstance(this_object, list):
        # Only the first entry is considered for repeated elements
        first_entry = this_object[0]
        if isinstance(first_entry, dict):
            return first_entry['#text']
        return first_entry

    if isinstance(this_object, str):
        return this_object

    # Unknown node type
    return None
# There are no exception types specified below # There are no exception types specified below
# This is on purpose and makes me long for the days # This is on purpose and makes me long for the days
# of simpler, happier things. # of simpler, happier things.
# Book title # Book title
try: try:
self.book['title'] = metadata['dc:title'] self.book['title'] = flattener(metadata['dc:title'])
if isinstance(self.book['title'], collections.OrderedDict):
self.book['title'] = metadata['dc:title']['#text']
except: except:
print('Title parse error')
self.book['title'] = os.path.splitext( self.book['title'] = os.path.splitext(
os.path.basename(self.book_filename))[0] os.path.basename(self.book_filename))[0]
# Book author # Book author
try: try:
self.book['author'] = metadata['dc:creator']['#text'] self.book['author'] = flattener(metadata['dc:creator'])
except: except:
self.book['author'] = 'Unknown' self.book['author'] = 'Unknown'
# Book year # Book year
try: try:
self.book['year'] = int(metadata['dc:date'][:4]) self.book['year'] = int(flattener(metadata['dc:date'])[:4])
except: except:
self.book['year'] = 9999 self.book['year'] = 9999
# Book isbn # Book isbn
# Handle both a single dc:identifier and a list of them
self.book['isbn'] = None self.book['isbn'] = None
try: try:
for i in metadata['dc:identifier']: scheme = metadata['dc:identifier']['@opf:scheme'].lower()
if i['@opf:scheme'].lower() == 'isbn': if scheme.lower() == 'isbn':
self.book['isbn'] = i['#text'] self.book['isbn'] = metadata['dc:identifier']['#text']
except:
pass except (TypeError, KeyError):
try:
for i in metadata['dc:identifier']:
if i['@opf:scheme'].lower() == 'isbn':
self.book['isbn'] = i['#text']
break
except:
pass
# Book tags # Book tags
try: try:
self.book['tags'] = metadata['dc:subject'] self.book['tags'] = metadata['dc:subject']
if isinstance(self.book['tags'], str):
self.book['tags'] = [self.book['tags']]
except: except:
self.book['tags'] = [] self.book['tags'] = []