Improve EPUB parser compatibility and speed
Completely break MOBI parser
This commit is contained in:
3
TODO
3
TODO
@@ -96,8 +96,11 @@ TODO
|
|||||||
Bookmark name for a page that's not on the TOC or has nothing before
|
Bookmark name for a page that's not on the TOC or has nothing before
|
||||||
Screen position still keeps jumping when inside a paragraph
|
Screen position still keeps jumping when inside a paragraph
|
||||||
Better recursion needed for fb2 toc
|
Better recursion needed for fb2 toc
|
||||||
|
Initial sort by author in tableview
|
||||||
|
Last column not filling up tableview
|
||||||
|
|
||||||
Secondary:
|
Secondary:
|
||||||
|
Tab tooltip
|
||||||
Additional Settings:
|
Additional Settings:
|
||||||
Disable progressbar - 20% book addition speed improvement
|
Disable progressbar - 20% book addition speed improvement
|
||||||
Disable cover loading when reading - Saves ~2M / book
|
Disable cover loading when reading - Saves ~2M / book
|
||||||
|
@@ -63,5 +63,11 @@ class ParseEPUB:
|
|||||||
self.book_ref.generate_toc()
|
self.book_ref.generate_toc()
|
||||||
self.book_ref.generate_content()
|
self.book_ref.generate_content()
|
||||||
|
|
||||||
|
toc = []
|
||||||
|
content = []
|
||||||
|
for count, i in enumerate(self.book['content']):
|
||||||
|
toc.append((i[0], i[1], count + 1))
|
||||||
|
content.append(i[2])
|
||||||
|
|
||||||
# Return toc, content, images_only
|
# Return toc, content, images_only
|
||||||
return self.book['toc'], self.book['content'], False
|
return toc, content, False
|
||||||
|
@@ -55,10 +55,10 @@ class ParseMOBI:
|
|||||||
self.epub_filepath = shutil.make_archive(zip_file, 'zip', zip_dir)
|
self.epub_filepath = shutil.make_archive(zip_file, 'zip', zip_dir)
|
||||||
self.split_large_xml = True
|
self.split_large_xml = True
|
||||||
|
|
||||||
self.book_ref = EPUB(self.epub_filepath)
|
self.book_ref = EPUB(self.epub_filepath, self.temp_dir)
|
||||||
contents_found = self.book_ref.read_epub()
|
self.book_ref.generate_metadata()
|
||||||
if not contents_found:
|
self.book_ref.generate_toc()
|
||||||
return False
|
self.book_ref.generate_content()
|
||||||
self.book = self.book_ref.book
|
self.book = self.book_ref.book
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@@ -81,6 +81,7 @@ class ParseMOBI:
|
|||||||
return self.book['tags']
|
return self.book['tags']
|
||||||
|
|
||||||
def get_contents(self):
|
def get_contents(self):
|
||||||
|
return
|
||||||
extract_path = os.path.join(self.extract_dir)
|
extract_path = os.path.join(self.extract_dir)
|
||||||
zipfile.ZipFile(self.epub_filepath).extractall(extract_path)
|
zipfile.ZipFile(self.epub_filepath).extractall(extract_path)
|
||||||
|
|
||||||
|
@@ -16,7 +16,6 @@
|
|||||||
|
|
||||||
# TODO
|
# TODO
|
||||||
# See if inserting chapters not in the toc.ncx can be avoided
|
# See if inserting chapters not in the toc.ncx can be avoided
|
||||||
# Missing file order is messed up
|
|
||||||
# Account for stylesheets... eventually
|
# Account for stylesheets... eventually
|
||||||
# Everything needs logging
|
# Everything needs logging
|
||||||
# Mobipocket files
|
# Mobipocket files
|
||||||
@@ -25,8 +24,10 @@ import os
|
|||||||
import zipfile
|
import zipfile
|
||||||
import logging
|
import logging
|
||||||
import collections
|
import collections
|
||||||
|
from urllib.parse import unquote
|
||||||
|
|
||||||
import xmltodict
|
import xmltodict
|
||||||
|
from PyQt5 import QtGui
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -44,11 +45,14 @@ class EPUB:
|
|||||||
self.generate_references()
|
self.generate_references()
|
||||||
|
|
||||||
def find_file(self, filename):
|
def find_file(self, filename):
|
||||||
|
# Get rid of special characters
|
||||||
|
filename = unquote(filename)
|
||||||
|
|
||||||
# First, look for the file in the root of the book
|
# First, look for the file in the root of the book
|
||||||
if filename in self.file_list:
|
if filename in self.file_list:
|
||||||
return filename
|
return filename
|
||||||
|
|
||||||
# Then, search for it elsewhere
|
# Then search for it elsewhere
|
||||||
else:
|
else:
|
||||||
file_basename = os.path.basename(filename)
|
file_basename = os.path.basename(filename)
|
||||||
for i in self.file_list:
|
for i in self.file_list:
|
||||||
@@ -56,7 +60,7 @@ class EPUB:
|
|||||||
return i
|
return i
|
||||||
|
|
||||||
# If the file isn't found
|
# If the file isn't found
|
||||||
logger.error(filename + ' not found')
|
logging.error(filename + ' not found in ' + self.book_filename)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def generate_references(self):
|
def generate_references(self):
|
||||||
@@ -75,7 +79,7 @@ class EPUB:
|
|||||||
container_dict = xmltodict.parse(container_xml)
|
container_dict = xmltodict.parse(container_xml)
|
||||||
packagefile = container_dict['container']['rootfiles']['rootfile']['@full-path']
|
packagefile = container_dict['container']['rootfiles']['rootfile']['@full-path']
|
||||||
else:
|
else:
|
||||||
presumptive_names = ('content.opf', 'package.opf')
|
presumptive_names = ('content.opf', 'package.opf', 'volume.opf')
|
||||||
for i in presumptive_names:
|
for i in presumptive_names:
|
||||||
packagefile = self.find_file(i)
|
packagefile = self.find_file(i)
|
||||||
if packagefile:
|
if packagefile:
|
||||||
@@ -85,11 +89,42 @@ class EPUB:
|
|||||||
self.opf_dict = xmltodict.parse(packagefile_data)
|
self.opf_dict = xmltodict.parse(packagefile_data)
|
||||||
|
|
||||||
def generate_toc(self):
|
def generate_toc(self):
|
||||||
self.book['toc'] = []
|
self.book['content'] = []
|
||||||
|
|
||||||
# I'm currently going with the file always being named toc.ncx
|
def find_alternative_toc():
|
||||||
# But this is epub. The wild west of ebook formats.
|
toc_filename = None
|
||||||
tocfile = self.find_file('toc.ncx')
|
toc_filename_alternative = None
|
||||||
|
manifest = self.opf_dict['package']['manifest']['item']
|
||||||
|
|
||||||
|
for i in manifest:
|
||||||
|
# Behold the burning hoops we're jumping through
|
||||||
|
if i['@id'] == 'ncx':
|
||||||
|
toc_filename = i['@href']
|
||||||
|
if ('ncx' in i['@id']) or ('toc' in i['@id']):
|
||||||
|
toc_filename_alternative = i['@href']
|
||||||
|
if toc_filename and toc_filename_alternative:
|
||||||
|
break
|
||||||
|
|
||||||
|
if not toc_filename:
|
||||||
|
if not toc_filename_alternative:
|
||||||
|
logger.error('No ToC found for: ' + self.book_filename)
|
||||||
|
else:
|
||||||
|
toc_filename = toc_filename_alternative
|
||||||
|
|
||||||
|
logger.info('Using alternate ToC for: ' + self.book_filename)
|
||||||
|
return toc_filename
|
||||||
|
|
||||||
|
# Find the toc.ncx file from the manifest
|
||||||
|
# EPUBs will name this file literally anything, so try
|
||||||
|
# a less stringent approach if the first one doesn't work
|
||||||
|
# The idea is to prioritize 'toc.ncx' since this should work
|
||||||
|
# for the vast majority of books
|
||||||
|
toc_filename = 'toc.ncx'
|
||||||
|
does_toc_exist = self.find_file(toc_filename)
|
||||||
|
if not does_toc_exist:
|
||||||
|
toc_filename = find_alternative_toc()
|
||||||
|
|
||||||
|
tocfile = self.find_file(toc_filename)
|
||||||
tocfile_data = self.zip_file.read(tocfile)
|
tocfile_data = self.zip_file.read(tocfile)
|
||||||
toc_dict = xmltodict.parse(tocfile_data)
|
toc_dict = xmltodict.parse(tocfile_data)
|
||||||
|
|
||||||
@@ -99,21 +134,30 @@ class EPUB:
|
|||||||
level + 1,
|
level + 1,
|
||||||
i['navLabel']['text'],
|
i['navLabel']['text'],
|
||||||
i['content']['@src']] for i in nav_node]
|
i['content']['@src']] for i in nav_node]
|
||||||
self.book['toc'].extend(these_contents)
|
self.book['content'].extend(these_contents)
|
||||||
return
|
return
|
||||||
|
|
||||||
if 'navPoint' in nav_node.keys():
|
if 'navPoint' in nav_node.keys():
|
||||||
recursor(level, nav_node['navPoint'])
|
recursor(level, nav_node['navPoint'])
|
||||||
|
|
||||||
else:
|
else:
|
||||||
self.book['toc'].append([
|
self.book['content'].append([
|
||||||
level + 1,
|
level + 1,
|
||||||
nav_node['navLabel']['text'],
|
nav_node['navLabel']['text'],
|
||||||
nav_node['content']['@src']])
|
nav_node['content']['@src']])
|
||||||
|
|
||||||
navpoints = toc_dict['ncx']['navMap']['navPoint']
|
navpoints = toc_dict['ncx']['navMap']['navPoint']
|
||||||
for top_level_nav in navpoints:
|
for top_level_nav in navpoints:
|
||||||
self.book['toc'].append([
|
# Just one chapter
|
||||||
|
if isinstance(top_level_nav, str):
|
||||||
|
self.book['content'].append([
|
||||||
|
1,
|
||||||
|
navpoints['navLabel']['text'],
|
||||||
|
navpoints['content']['@src']])
|
||||||
|
break
|
||||||
|
|
||||||
|
# Multiple chapters
|
||||||
|
self.book['content'].append([
|
||||||
1,
|
1,
|
||||||
top_level_nav['navLabel']['text'],
|
top_level_nav['navLabel']['text'],
|
||||||
top_level_nav['content']['@src']])
|
top_level_nav['content']['@src']])
|
||||||
@@ -124,10 +168,19 @@ class EPUB:
|
|||||||
def get_chapter_content(self, chapter_file):
|
def get_chapter_content(self, chapter_file):
|
||||||
this_file = self.find_file(chapter_file)
|
this_file = self.find_file(chapter_file)
|
||||||
if this_file:
|
if this_file:
|
||||||
return self.zip_file.read(this_file).decode()
|
chapter_content = self.zip_file.read(this_file).decode()
|
||||||
|
|
||||||
|
# Generate a None return for a blank chapter
|
||||||
|
# These will be removed from the contents later
|
||||||
|
contentDocument = QtGui.QTextDocument(None)
|
||||||
|
contentDocument.setHtml(chapter_content)
|
||||||
|
contentText = contentDocument.toPlainText().replace('\n', '')
|
||||||
|
if contentText == '':
|
||||||
|
chapter_content = None
|
||||||
|
|
||||||
|
return chapter_content
|
||||||
else:
|
else:
|
||||||
print('Not found: ' + chapter_file)
|
return 'Possible parse error: ' + chapter_file
|
||||||
return chapter_file
|
|
||||||
|
|
||||||
def parse_split_chapters(self, chapters_with_split_content):
|
def parse_split_chapters(self, chapters_with_split_content):
|
||||||
self.book['split_chapters'] = {}
|
self.book['split_chapters'] = {}
|
||||||
@@ -149,10 +202,13 @@ class EPUB:
|
|||||||
|
|
||||||
markup_split = str(soup).split(str(this_tag))
|
markup_split = str(soup).split(str(this_tag))
|
||||||
soup = BeautifulSoup(markup_split[0], 'lxml')
|
soup = BeautifulSoup(markup_split[0], 'lxml')
|
||||||
this_markup = BeautifulSoup(
|
|
||||||
str(this_tag) + markup_split[1], 'lxml')
|
|
||||||
|
|
||||||
self.book['split_chapters'][chapter_file][this_anchor] = str(this_markup)
|
# If the tag is None, it probably means the content is overlapping
|
||||||
|
# Skipping the insert is the way forward
|
||||||
|
if this_tag:
|
||||||
|
this_markup = BeautifulSoup(
|
||||||
|
str(this_tag).strip() + markup_split[1], 'lxml')
|
||||||
|
self.book['split_chapters'][chapter_file][this_anchor] = str(this_markup)
|
||||||
|
|
||||||
# Remaining markup is assigned here
|
# Remaining markup is assigned here
|
||||||
self.book['split_chapters'][chapter_file]['top_level'] = str(soup)
|
self.book['split_chapters'][chapter_file]['top_level'] = str(soup)
|
||||||
@@ -180,12 +236,11 @@ class EPUB:
|
|||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# TODO
|
|
||||||
# Check what happens in case missing chapters are either
|
|
||||||
# at the beginning or the end of the book
|
|
||||||
chapter_title = 1
|
chapter_title = 1
|
||||||
toc_chapters = [i[2] for i in self.book['toc']]
|
toc_chapters = [
|
||||||
last_valid_index = 0
|
unquote(i[2].split('#')[0]) for i in self.book['content']]
|
||||||
|
|
||||||
|
last_valid_index = -2 # Yes, but why?
|
||||||
for i in spine_final:
|
for i in spine_final:
|
||||||
if not i in toc_chapters:
|
if not i in toc_chapters:
|
||||||
previous_chapter = spine_final[spine_final.index(i) - 1]
|
previous_chapter = spine_final[spine_final.index(i) - 1]
|
||||||
@@ -195,7 +250,8 @@ class EPUB:
|
|||||||
last_valid_index = previous_chapter_toc_index
|
last_valid_index = previous_chapter_toc_index
|
||||||
except ValueError:
|
except ValueError:
|
||||||
last_valid_index += 1
|
last_valid_index += 1
|
||||||
self.book['toc'].insert(
|
|
||||||
|
self.book['content'].insert(
|
||||||
last_valid_index + 1,
|
last_valid_index + 1,
|
||||||
[1, str(chapter_title), i])
|
[1, str(chapter_title), i])
|
||||||
chapter_title += 1
|
chapter_title += 1
|
||||||
@@ -203,7 +259,7 @@ class EPUB:
|
|||||||
# Parse split chapters as below
|
# Parse split chapters as below
|
||||||
# They can be picked up during the iteration through the toc
|
# They can be picked up during the iteration through the toc
|
||||||
chapters_with_split_content = {}
|
chapters_with_split_content = {}
|
||||||
for i in self.book['toc']:
|
for i in self.book['content']:
|
||||||
if '#' in i[2]:
|
if '#' in i[2]:
|
||||||
this_split = i[2].split('#')
|
this_split = i[2].split('#')
|
||||||
chapter = this_split[0]
|
chapter = this_split[0]
|
||||||
@@ -222,13 +278,8 @@ class EPUB:
|
|||||||
# In case a split chapter is encountered, get its content
|
# In case a split chapter is encountered, get its content
|
||||||
# from the split_chapters dictionary
|
# from the split_chapters dictionary
|
||||||
# What could possibly go wrong?
|
# What could possibly go wrong?
|
||||||
|
|
||||||
# The content list is separated from the toc list because
|
|
||||||
# the mupdf library returns its own toc a certain way and
|
|
||||||
# this keeps things uniform
|
|
||||||
split_chapters = self.book['split_chapters']
|
split_chapters = self.book['split_chapters']
|
||||||
toc_copy = self.book['toc'][:]
|
toc_copy = self.book['content'][:]
|
||||||
self.book['content'] = []
|
|
||||||
|
|
||||||
# Put the book into the book
|
# Put the book into the book
|
||||||
for count, i in enumerate(toc_copy):
|
for count, i in enumerate(toc_copy):
|
||||||
@@ -263,9 +314,11 @@ class EPUB:
|
|||||||
else:
|
else:
|
||||||
chapter_content = self.get_chapter_content(chapter_file)
|
chapter_content = self.get_chapter_content(chapter_file)
|
||||||
|
|
||||||
# The count + 2 is an adjustment due to the cover being inserted below
|
self.book['content'][count][2] = chapter_content
|
||||||
self.book['toc'][count][2] = count + 2
|
|
||||||
self.book['content'].append(chapter_content)
|
# Cleanup content by removing null chapters
|
||||||
|
self.book['content'] = [
|
||||||
|
i for i in self.book['content'] if i[2]]
|
||||||
|
|
||||||
self.generate_book_cover()
|
self.generate_book_cover()
|
||||||
if self.book['cover']:
|
if self.book['cover']:
|
||||||
@@ -274,51 +327,72 @@ class EPUB:
|
|||||||
with open(cover_path, 'wb') as cover_temp:
|
with open(cover_path, 'wb') as cover_temp:
|
||||||
cover_temp.write(self.book['cover'])
|
cover_temp.write(self.book['cover'])
|
||||||
|
|
||||||
self.book['toc'].insert(0, (1, 'Cover', 1))
|
# There's probably some rationale to doing an insert here
|
||||||
|
# But a replacement seems... neater
|
||||||
self.book['content'].insert(
|
self.book['content'].insert(
|
||||||
0, (f'<center><img src="{cover_path}" alt="Cover"></center>'))
|
0, (1, 'Cover', f'<center><img src="{cover_path}" alt="Cover"></center>'))
|
||||||
|
|
||||||
def generate_metadata(self):
|
def generate_metadata(self):
|
||||||
metadata = self.opf_dict['package']['metadata']
|
metadata = self.opf_dict['package']['metadata']
|
||||||
|
|
||||||
|
def flattener(this_object):
|
||||||
|
if isinstance(this_object, collections.OrderedDict):
|
||||||
|
return this_object['#text']
|
||||||
|
|
||||||
|
if isinstance(this_object, list):
|
||||||
|
if isinstance(this_object[0], collections.OrderedDict):
|
||||||
|
return this_object[0]['#text']
|
||||||
|
else:
|
||||||
|
return this_object[0]
|
||||||
|
|
||||||
|
if isinstance(this_object, str):
|
||||||
|
return this_object
|
||||||
|
|
||||||
# There are no exception types specified below
|
# There are no exception types specified below
|
||||||
# This is on purpose and makes me long for the days
|
# This is on purpose and makes me long for the days
|
||||||
# of simpler, happier things.
|
# of simpler, happier things.
|
||||||
|
|
||||||
# Book title
|
# Book title
|
||||||
try:
|
try:
|
||||||
self.book['title'] = metadata['dc:title']
|
self.book['title'] = flattener(metadata['dc:title'])
|
||||||
if isinstance(self.book['title'], collections.OrderedDict):
|
|
||||||
self.book['title'] = metadata['dc:title']['#text']
|
|
||||||
except:
|
except:
|
||||||
print('Title parse error')
|
|
||||||
self.book['title'] = os.path.splitext(
|
self.book['title'] = os.path.splitext(
|
||||||
os.path.basename(self.book_filename))[0]
|
os.path.basename(self.book_filename))[0]
|
||||||
|
|
||||||
# Book author
|
# Book author
|
||||||
try:
|
try:
|
||||||
self.book['author'] = metadata['dc:creator']['#text']
|
self.book['author'] = flattener(metadata['dc:creator'])
|
||||||
except:
|
except:
|
||||||
self.book['author'] = 'Unknown'
|
self.book['author'] = 'Unknown'
|
||||||
|
|
||||||
# Book year
|
# Book year
|
||||||
try:
|
try:
|
||||||
self.book['year'] = int(metadata['dc:date'][:4])
|
self.book['year'] = int(flattener(metadata['dc:date'])[:4])
|
||||||
except:
|
except:
|
||||||
self.book['year'] = 9999
|
self.book['year'] = 9999
|
||||||
|
|
||||||
# Book isbn
|
# Book isbn
|
||||||
|
# Handle both a single dc:identifier entry and a list of them
|
||||||
self.book['isbn'] = None
|
self.book['isbn'] = None
|
||||||
try:
|
try:
|
||||||
for i in metadata['dc:identifier']:
|
scheme = metadata['dc:identifier']['@opf:scheme'].lower()
|
||||||
if i['@opf:scheme'].lower() == 'isbn':
|
if scheme.lower() == 'isbn':
|
||||||
self.book['isbn'] = i['#text']
|
self.book['isbn'] = metadata['dc:identifier']['#text']
|
||||||
except:
|
|
||||||
pass
|
except (TypeError, KeyError):
|
||||||
|
try:
|
||||||
|
for i in metadata['dc:identifier']:
|
||||||
|
if i['@opf:scheme'].lower() == 'isbn':
|
||||||
|
self.book['isbn'] = i['#text']
|
||||||
|
break
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
# Book tags
|
# Book tags
|
||||||
try:
|
try:
|
||||||
self.book['tags'] = metadata['dc:subject']
|
self.book['tags'] = metadata['dc:subject']
|
||||||
|
if isinstance(self.book['tags'], str):
|
||||||
|
self.book['tags'] = [self.book['tags']]
|
||||||
except:
|
except:
|
||||||
self.book['tags'] = []
|
self.book['tags'] = []
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user