Overhaul EPUB parsing and ToC generation

This commit is contained in:
BasioMeusPuga
2019-02-09 04:21:22 +05:30
parent 1e004774c9
commit e4be239bf0
4 changed files with 312 additions and 285 deletions

9
TODO
View File

@@ -36,6 +36,7 @@ TODO
Set focus to newly added file Set focus to newly added file
Reading: Reading:
✓ Drop down for TOC ✓ Drop down for TOC
✓ Treeview navigation for TOC
✓ Override the keypress event of the textedit ✓ Override the keypress event of the textedit
✓ Use format* icons for toolbar buttons ✓ Use format* icons for toolbar buttons
✓ Implement book view settings with a(nother) toolbar ✓ Implement book view settings with a(nother) toolbar
@@ -86,7 +87,6 @@ TODO
Have them save to memory Have them save to memory
✓ fb2 support ✓ fb2 support
✓ Images need to show up in their placeholders ✓ Images need to show up in their placeholders
djvu support
Other: Other:
✓ Define every widget in code ✓ Define every widget in code
Bugs: Bugs:
@@ -98,8 +98,11 @@ TODO
Better recursion needed for fb2 toc Better recursion needed for fb2 toc
Secondary: Secondary:
Additional Settings:
Disable progressbar - 20% book addition speed improvement
Disable cover loading when reading - Saves ~2M / book
Create covers for books without them - VERY SLOW
Special formatting for each chapter's title Special formatting for each chapter's title
Create covers for books without them
Signal end of chapter with some text Signal end of chapter with some text
Graphical themes Graphical themes
Change focus rectangle dimensions Change focus rectangle dimensions
@@ -108,7 +111,7 @@ TODO
Goodreads API: Ratings, Read, Recommendations Goodreads API: Ratings, Read, Recommendations
Get ISBN using python-isbnlib Get ISBN using python-isbnlib
Use embedded fonts + CSS Use embedded fonts + CSS
txt, doc, chm support txt, doc, chm, djvu support
Include icons for filetype emblems Include icons for filetype emblems
Comic view modes Comic view modes
Continuous paging Continuous paging

View File

@@ -29,14 +29,13 @@ class ParseEPUB:
# Maybe also include book description # Maybe also include book description
self.book_ref = None self.book_ref = None
self.book = None self.book = None
self.temp_dir = temp_dir
self.filename = filename self.filename = filename
self.extract_path = os.path.join(temp_dir, file_md5) self.extract_path = os.path.join(temp_dir, file_md5)
def read_book(self): def read_book(self):
self.book_ref = EPUB(self.filename) self.book_ref = EPUB(self.filename, self.temp_dir)
contents_found = self.book_ref.read_epub() self.book_ref.generate_metadata()
if not contents_found:
return False
self.book = self.book_ref.book self.book = self.book_ref.book
return True return True
@@ -61,14 +60,8 @@ class ParseEPUB:
def get_contents(self): def get_contents(self):
zipfile.ZipFile(self.filename).extractall(self.extract_path) zipfile.ZipFile(self.filename).extractall(self.extract_path)
self.book_ref.parse_toc() self.book_ref.generate_toc()
self.book_ref.parse_chapters(temp_dir=self.extract_path) self.book_ref.generate_content()
toc = []
content = []
for count, i in enumerate(self.book['book_list']):
toc.append((1, i[0], count + 1))
content.append(i[1])
# Return toc, content, images_only # Return toc, content, images_only
return toc, content, False return self.book['toc'], self.book['content'], False

View File

@@ -14,175 +14,333 @@
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
import os # TODO
import logging # See if inserting chapters not in the toc.ncx can be avoided
import zipfile # Missing file order is messed up
from urllib.parse import unquote # Account for stylesheets... eventually
# Everything needs logging
# Mobipocket files
import os
import zipfile
import logging
import collections
import xmltodict
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class EPUB: class EPUB:
def __init__(self, filename): def __init__(self, book_filename, temp_dir):
self.filename = filename self.book_filename = book_filename
self.temp_dir = temp_dir
self.zip_file = None self.zip_file = None
self.file_list = None
self.opf_dict = None
self.book = {} self.book = {}
self.book['split_chapters'] = {}
def read_epub(self): self.generate_references()
# This is the function that should error out in
# case the module cannot process the file
try:
self.load_zip()
contents_path = self.get_file_path(
None, True)
if not contents_path: def find_file(self, filename):
return False # No (valid) opf was found so processing cannot continue # First, look for the file in the root of the book
if filename in self.file_list:
return filename
self.generate_book_metadata(contents_path) # Then, search for it elsewhere
except: # Not specifying an exception type here may be justified else:
file_basename = os.path.basename(filename)
for i in self.file_list:
if os.path.basename(i) == file_basename:
return i
# If the file isn't found
logger.error(filename + ' not found')
return False return False
return True def generate_references(self):
def load_zip(self):
try:
self.zip_file = zipfile.ZipFile( self.zip_file = zipfile.ZipFile(
self.filename, mode='r', allowZip64=True) self.book_filename, mode='r', allowZip64=True)
except (KeyError, AttributeError, zipfile.BadZipFile): self.file_list = self.zip_file.namelist()
logger.error('Malformed zip file ' + self.filename)
return
def parse_xml(self, filename, parser): # Book structure relies on parsing the .opf file
try: # in the book. Now that might be the usual content.opf
this_xml = self.zip_file.read(filename).decode() # or package.opf or it might be named after your favorite
except KeyError: # eldritch abomination. The point is we have to check
short_filename = os.path.basename(self.filename) # the container.xml
warning_string = f'{str(filename)} not found in {short_filename}' container = self.find_file('container.xml')
logger.warning(warning_string) if container:
return container_xml = self.zip_file.read(container)
container_dict = xmltodict.parse(container_xml)
root = BeautifulSoup(this_xml, parser) packagefile = container_dict['container']['rootfiles']['rootfile']['@full-path']
return root
def get_file_path(self, filename, is_content_file=False):
# Use this to get the location of the content.opf file
# And maybe some other file that has a more well formatted
# idea of the TOC
# We're going to all this trouble because there really is
# no going forward without a toc
if is_content_file:
container_location = self.get_file_path('container.xml')
xml = self.parse_xml(container_location, 'xml')
if xml:
root_item = xml.find('rootfile')
try:
return root_item.get('full-path')
except AttributeError:
error_string = f'ePub module: {self.filename} has a malformed container.xml'
logger.error(error_string)
return None
possible_filenames = ('content.opf', 'package.opf')
for i in possible_filenames:
presumptive_location = self.get_file_path(i)
if presumptive_location:
return presumptive_location
for i in self.zip_file.filelist:
if os.path.basename(i.filename) == os.path.basename(filename):
return i.filename
return None
def read_from_zip(self, filename):
filename = unquote(filename)
try:
file_data = self.zip_file.read(filename)
return file_data
except KeyError:
file_path_actual = self.get_file_path(filename)
if file_path_actual:
return self.zip_file.read(file_path_actual)
else: else:
logger.error('ePub module can\'t find ' + filename) presumptive_names = ('content.opf', 'package.opf')
for i in presumptive_names:
packagefile = self.find_file(i)
if packagefile:
break
#______________________________________________________ packagefile_data = self.zip_file.read(packagefile)
self.opf_dict = xmltodict.parse(packagefile_data)
def generate_book_metadata(self, contents_path): def generate_toc(self):
self.book['title'] = os.path.splitext( self.book['toc'] = []
os.path.basename(self.filename))[0]
self.book['author'] = 'Unknown'
self.book['isbn'] = None
self.book['tags'] = None
self.book['cover'] = None
self.book['toc_file'] = 'toc.ncx' # Overwritten if another one exists
# Parse XML # I'm currently going with the file always being named toc.ncx
xml = self.parse_xml(contents_path, 'xml') # But this is epub. The wild west of ebook formats.
tocfile = self.find_file('toc.ncx')
tocfile_data = self.zip_file.read(tocfile)
toc_dict = xmltodict.parse(tocfile_data)
# Parse metadata def recursor(level, nav_node):
item_dict = { if isinstance(nav_node, list):
'title': 'title', these_contents = [[
'author': 'creator', level + 1,
'year': 'date'} i['navLabel']['text'],
i['content']['@src']] for i in nav_node]
self.book['toc'].extend(these_contents)
return
for i in item_dict.items(): if 'navPoint' in nav_node.keys():
item = xml.find(i[1]) recursor(level, nav_node['navPoint'])
if item:
self.book[i[0]] = item.text else:
self.book['toc'].append([
level + 1,
nav_node['navLabel']['text'],
nav_node['content']['@src']])
navpoints = toc_dict['ncx']['navMap']['navPoint']
for top_level_nav in navpoints:
self.book['toc'].append([
1,
top_level_nav['navLabel']['text'],
top_level_nav['content']['@src']])
if 'navPoint' in top_level_nav.keys():
recursor(1, top_level_nav)
def get_chapter_content(self, chapter_file):
    """Return the decoded markup of one chapter file from the book archive.

    chapter_file is the href as it appears in the toc / manifest. Because
    hrefs are URL-style references, a name that cannot be located as
    written (e.g. 'ch%201.html') is looked up again in its percent-decoded
    form ('ch 1.html').

    Returns the text of the file. If the file cannot be located at all,
    chapter_file itself is returned unchanged.
    """
    # Function-scope import: unquote is only needed for this fallback
    from urllib.parse import unquote

    this_file = self.find_file(chapter_file)
    if not this_file:
        unquoted_name = unquote(chapter_file)
        if unquoted_name != chapter_file:
            this_file = self.find_file(unquoted_name)

    if this_file:
        # One undecodable byte should not abort loading the whole book,
        # so invalid sequences are replaced instead of raising
        return self.zip_file.read(this_file).decode('utf-8', errors='replace')

    # Report through the module logger like the rest of the class does
    logger.error('Not found: ' + chapter_file)
    return chapter_file
def parse_split_chapters(self, chapters_with_split_content):
    """Cut chapter files into the pieces the toc addresses via '#anchor'.

    chapters_with_split_content maps a chapter file name to the list of
    anchor ids the toc references inside that file.

    The result is written to self.book['split_chapters'][chapter_file]:
    one entry per anchor id holding the markup from that anchor onwards,
    plus a 'top_level' entry holding whatever markup is left once every
    anchor's piece has been cut away.

    An anchor that cannot be located in the markup is logged and skipped,
    so it simply has no entry in the result.
    """
    self.book['split_chapters'] = {}

    # For split chapters, get the whole chapter first, then split
    # between ids using their anchors, then "heal" the resultant text
    # by creating a BeautifulSoup object. Write its str to the content
    for chapter_file, split_anchors in chapters_with_split_content.items():
        these_splits = {}
        self.book['split_chapters'][chapter_file] = these_splits

        chapter_content = self.get_chapter_content(chapter_file)
        soup = BeautifulSoup(chapter_content, 'lxml')

        # Anchors are processed last to first; each pass cuts the tail
        # that starts at the anchor and keeps working on what precedes it
        # NOTE(review): this relies on the anchors being listed in
        # document order - confirm against the toc
        for this_anchor in reversed(split_anchors):
            this_tag = soup.find(id=this_anchor)
            if this_tag is None:
                # Missing id, or a duplicate anchor whose tag was already
                # cut away by an earlier pass
                logger.error(
                    f'Anchor {this_anchor} not found in {chapter_file}')
                continue

            tag_markup = str(this_tag)
            # partition splits at the first occurrence only, so nothing
            # after the anchor is dropped if the same markup repeats
            before, found, after = str(soup).partition(tag_markup)
            if not found:
                logger.error(
                    f'Anchor {this_anchor} not found in {chapter_file}')
                continue

            soup = BeautifulSoup(before, 'lxml')
            this_markup = BeautifulSoup(tag_markup + after, 'lxml')
            these_splits[this_anchor] = str(this_markup)

        # Remaining markup is assigned here
        these_splits['top_level'] = str(soup)
def generate_content(self):
# Find all the chapters mentioned in the opf spine
# These are simply ids that correspond to the actual item
# as mentioned in the manifest - which is a comprehensive
# list of files
chapters_in_spine = [
i['@idref']
for i in self.opf_dict['package']['spine']['itemref']]
# Next, find items and ids from the manifest
chapters_from_manifest = {
i['@id']: i['@href']
for i in self.opf_dict['package']['manifest']['item']}
# Finally, check which items are supposed to be in the spine
# on the basis of the id and change the toc accordingly
spine_final = []
for i in chapters_in_spine:
try:
spine_final.append(chapters_from_manifest.pop(i))
except KeyError:
pass
# TODO
# Check what happens in case missing chapters are either
# at the beginning or the end of the book
chapter_title = 1
toc_chapters = [i[2] for i in self.book['toc']]
last_valid_index = 0
for i in spine_final:
if not i in toc_chapters:
previous_chapter = spine_final[spine_final.index(i) - 1]
try:
previous_chapter_toc_index = toc_chapters.index(previous_chapter)
# In case of 2+ consecutive missing chapters
last_valid_index = previous_chapter_toc_index
except ValueError:
last_valid_index += 1
self.book['toc'].insert(
last_valid_index + 1,
[1, str(chapter_title), i])
chapter_title += 1
# Parse split chapters as below
# They can be picked up during the iteration through the toc
chapters_with_split_content = {}
for i in self.book['toc']:
if '#' in i[2]:
this_split = i[2].split('#')
chapter = this_split[0]
anchor = this_split[1]
try: try:
self.book['year'] = int(self.book['year'][:4]) chapters_with_split_content[chapter].append(anchor)
except (TypeError, KeyError, IndexError, ValueError): except KeyError:
chapters_with_split_content[chapter] = []
chapters_with_split_content[chapter].append(anchor)
self.parse_split_chapters(chapters_with_split_content)
# Now we iterate over the ToC as presented in the toc.ncx
# and add chapters to the content list
# In case a split chapter is encountered, get its content
# from the split_chapters dictionary
# What could possibly go wrong?
# The content list is separated from the toc list because
# the mupdf library returns its own toc a certain way and
# this keeps things uniform
split_chapters = self.book['split_chapters']
toc_copy = self.book['toc'][:]
self.book['content'] = []
# Put the book into the book
for count, i in enumerate(toc_copy):
chapter_file = i[2]
# Get split content according to its corresponding id attribute
if '#' in chapter_file:
this_split = chapter_file.split('#')
chapter_file_proper = this_split[0]
this_anchor = this_split[1]
try:
chapter_content = (
split_chapters[chapter_file_proper][this_anchor])
except KeyError:
chapter_content = 'Parse Error'
error_string = (
f'Error parsing {self.book_filename}: {chapter_file_proper}')
logger.error(error_string)
# Get content that remained at the end of the pillaging above
elif chapter_file in split_chapters.keys():
try:
chapter_content = split_chapters[chapter_file]['top_level']
except KeyError:
chapter_content = 'Parse Error'
error_string = (
f'Error parsing {self.book_filename}: {chapter_file}')
logger.error(error_string)
# Vanilla non split chapters
else:
chapter_content = self.get_chapter_content(chapter_file)
# The count + 2 is an adjustment due to the cover being inserted below
self.book['toc'][count][2] = count + 2
self.book['content'].append(chapter_content)
self.generate_book_cover()
if self.book['cover']:
cover_path = os.path.join(
self.temp_dir, os.path.basename(self.book_filename)) + '- cover'
with open(cover_path, 'wb') as cover_temp:
cover_temp.write(self.book['cover'])
self.book['toc'].insert(0, (1, 'Cover', 1))
self.book['content'].insert(
0, (f'<center><img src="{cover_path}" alt="Cover"></center>'))
def generate_metadata(self):
metadata = self.opf_dict['package']['metadata']
# There are no exception types specified below
# This is on purpose and makes me long for the days
# of simpler, happier things.
# Book title
try:
self.book['title'] = metadata['dc:title']
if isinstance(self.book['title'], collections.OrderedDict):
self.book['title'] = metadata['dc:title']['#text']
except:
print('Title parse error')
self.book['title'] = os.path.splitext(
os.path.basename(self.book_filename))[0]
# Book author
try:
self.book['author'] = metadata['dc:creator']['#text']
except:
self.book['author'] = 'Unknown'
# Book year
try:
self.book['year'] = int(metadata['dc:date'][:4])
except:
self.book['year'] = 9999 self.book['year'] = 9999
# Get identifier # Book isbn
identifier_items = xml.find_all('identifier')
for i in identifier_items:
scheme = i.get('scheme')
try:
if scheme.lower() == 'isbn':
self.book['isbn'] = i.text
except AttributeError:
self.book['isbn'] = None self.book['isbn'] = None
# Tags
tag_items = xml.find_all('subject')
tag_list = [i.text for i in tag_items]
self.book['tags'] = tag_list
# Get items
self.book['content_dict'] = {}
all_items = xml.find_all('item')
for i in all_items:
media_type = i.get('media-type')
this_id = i.get('id')
if media_type == 'application/xhtml+xml' or media_type == 'text/html':
self.book['content_dict'][this_id] = i.get('href')
if media_type == 'application/x-dtbncx+xml':
self.book['toc_file'] = i.get('href')
# Cover image
if 'cover' in this_id and media_type.split('/')[0] == 'image':
cover_href = i.get('href')
try: try:
self.book['cover'] = self.zip_file.read(cover_href) for i in metadata['dc:identifier']:
except KeyError: if i['@opf:scheme'].lower() == 'isbn':
# The cover cannot be found according to the self.book['isbn'] = i['#text']
# path specified in the content reference except:
self.book['cover'] = self.zip_file.read( pass
self.get_file_path(cover_href))
# Book tags
try:
self.book['tags'] = metadata['dc:subject']
except:
self.book['tags'] = []
# Book cover
self.generate_book_cover()
def generate_book_cover(self):
# This is separate because the book cover needs to
# be found and extracted both during addition / reading
self.book['cover'] = None
try:
cover_image = [
i['@href'] for i in self.opf_dict['package']['manifest']['item']
if i['@media-type'].split('/')[0] == 'image' and
'cover' in i['@id']][0]
self.book['cover'] = self.zip_file.read(
self.find_file(cover_image))
except:
pass
# Find book cover the hard way
if not self.book['cover']: if not self.book['cover']:
# If no cover is located the conventional way,
# we go looking for the largest image in the book
biggest_image_size = 0 biggest_image_size = 0
biggest_image = None biggest_image = None
for j in self.zip_file.filelist: for j in self.zip_file.filelist:
@@ -192,139 +350,5 @@ class EPUB:
biggest_image_size = j.file_size biggest_image_size = j.file_size
if biggest_image: if biggest_image:
self.book['cover'] = self.read_from_zip(biggest_image) self.book['cover'] = self.zip_file.read(
else: self.find_file(biggest_image))
logger.error('No cover found for: ' + self.filename)
# Parse spine and arrange chapter paths acquired from the opf
# according to the order IN THE SPINE
spine_items = xml.find_all('itemref')
spine_order = []
for i in spine_items:
spine_order.append(i.get('idref'))
self.book['chapters_in_order'] = []
for i in spine_order:
chapter_path = self.book['content_dict'][i]
self.book['chapters_in_order'].append(chapter_path)
def parse_toc(self):
# Read the toc file named in self.book['toc_file'] and record what it
# says about chapters:
#   self.book['navpoint_dict']  - chapter source file -> chapter title
#   self.book['split_chapters'] - chapter source file -> list of
#                                 (anchor, title) tuples, filled for
#                                 sources that contain a '#'
# Returns None; returns early when parse_xml gives back nothing.
# NOTE(review): indentation is not visible in this view, so the nesting
# of the statements below has to be confirmed against the real file.
# This has no bearing on the actual order
# We're just using this to get chapter names
self.book['navpoint_dict'] = {}
toc_file = self.book['toc_file']
if toc_file:
# Resolve the name to the path actually used inside the archive
toc_file = self.get_file_path(toc_file)
xml = self.parse_xml(toc_file, 'xml')
if not xml:
return
navpoints = xml.find_all('navPoint')
for i in navpoints:
chapter_title = i.find('text').text
chapter_source = i.find('content').get('src')
# Drop the '#anchor' part and percent-decode the remaining file name
chapter_source_file = unquote(chapter_source.split('#')[0])
if '#' in chapter_source:
# Collect (anchor, title) per file; the list is created on first use
try:
self.book['split_chapters'][chapter_source_file].append(
(chapter_source.split('#')[1], chapter_title))
except KeyError:
self.book['split_chapters'][chapter_source_file] = []
self.book['split_chapters'][chapter_source_file].append(
(chapter_source.split('#')[1], chapter_title))
# Keyed by file name, so a later navPoint for the same file
# overwrites the title stored by an earlier one
self.book['navpoint_dict'][chapter_source_file] = chapter_title
def parse_chapters(self, temp_dir=None, split_large_xml=False):
# Build self.book['book_list'], a list of (title, markup) tuples, from
# the files listed in self.book['chapters_in_order'], then write the
# cover image (if any) next to temp_dir and point the first entry at it.
#   temp_dir        - directory the cover file path is built from
#   split_large_xml - when True, every file is cut at 'pagebreak'
#                     elements instead of being used whole
# NOTE(review): indentation is not visible in this view, so the nesting
# of the statements below has to be confirmed against the real file.
# Counter used as the title for chapters that have no toc title
no_title_chapter = 0
self.book['book_list'] = []
for i in self.book['chapters_in_order']:
# NOTE(review): read_from_zip appears to return None when the file
# cannot be found, which would make .decode() fail here - confirm
chapter_data = self.read_from_zip(i).decode()
if i in self.book['split_chapters'] and not split_large_xml:
# The toc points into this file through anchors: let
# get_split_content cut it into one entry per anchor
split_chapters = get_split_content(
chapter_data, self.book['split_chapters'][i])
self.book['book_list'].extend(split_chapters)
elif split_large_xml:
# https://stackoverflow.com/questions/14444732/how-to-split-a-html-page-to-multiple-pages-using-python-and-beautiful-soup
markup = BeautifulSoup(chapter_data, 'xml')
chapters = []
pagebreaks = markup.find_all('pagebreak')
# Step to the following sibling, returning it once it has a 'name'
# attribute
def next_element(elem):
while elem is not None:
elem = elem.next_sibling
if hasattr(elem, 'name'):
return elem
# Each chunk is a pagebreak plus the siblings that follow it,
# up to (not including) the next pagebreak
for pbreak in pagebreaks:
chapter = [str(pbreak)]
elem = next_element(pbreak)
while elem and elem.name != 'pagebreak':
chapter.append(str(elem))
elem = next_element(elem)
chapters.append('\n'.join(chapter))
# Chunks produced this way have no toc title, so they are numbered
for this_chapter in chapters:
fallback_title = str(no_title_chapter)
self.book['book_list'].append(
(fallback_title, this_chapter + ('<br/>' * 8)))
no_title_chapter += 1
else:
# Whole file as one chapter: use the toc title when there is one,
# otherwise fall back to the running number
try:
self.book['book_list'].append(
(self.book['navpoint_dict'][i], chapter_data + ('<br/>' * 8)))
except KeyError:
fallback_title = str(no_title_chapter)
self.book['book_list'].append(
(fallback_title, chapter_data))
no_title_chapter += 1
# NOTE(review): with the default temp_dir=None, os.path.join raises
# TypeError - callers are expected to pass a directory
cover_path = os.path.join(temp_dir, os.path.basename(self.filename)) + '- cover'
if self.book['cover']:
with open(cover_path, 'wb') as cover_temp:
cover_temp.write(self.book['cover'])
# The cover REPLACES the first entry of the list rather than being
# inserted before it; an empty list is left as it is
try:
self.book['book_list'][0] = (
'Cover', f'<center><img src="{cover_path}" alt="Cover"></center>')
except IndexError:
pass
def get_split_content(chapter_data, split_by):
# Cut one chapter file into several chapters at the given anchors.
#   chapter_data - markup of the whole file, as a string
#   split_by     - sequence of (anchor, title) pairs
# Returns a list of (title, markup) tuples; each markup string ends with
# eight '<br/>' tags.
# NOTE(review): indentation is not visible in this view, so the nesting
# of the statements below (in particular which of them the 'continue'
# skips) has to be confirmed against the real file.
split_anchors = [i[0] for i in split_by]
chapter_titles = [i[1] for i in split_by]
return_list = []
xml = BeautifulSoup(chapter_data, 'lxml')
# Only the prettified <body> is split; the split is a plain string split
# on the anchor text, not a lookup of the element carrying that id
xml_string = xml.body.prettify()
for count, i in enumerate(split_anchors):
this_split = xml_string.split(i)
current_chapter = this_split[0]
bs_obj = BeautifulSoup(current_chapter, 'lxml')
# Since tags correspond to data following them, the first
# chunk will be ignored
# As will all empty chapters
if bs_obj.text == '\n' or bs_obj.text == '' or count == 0:
continue
# Presumably the removed '"&gt;' is the tail of the tag that was cut in
# half by splitting on the anchor text - confirm
bs_obj_string = str(bs_obj).replace('"&gt;', '', 1) + ('<br/>' * 8)
# The text preceding anchor N belongs to the chapter titled by anchor N-1
return_list.append(
(chapter_titles[count - 1], bs_obj_string))
xml_string = ''.join(this_split[1:])
# Whatever is left in xml_string becomes the chapter for the last title
bs_obj = BeautifulSoup(xml_string, 'lxml')
bs_obj_string = str(bs_obj).replace('"&gt;', '', 1) + ('<br/>' * 8)
return_list.append(
(chapter_titles[-1], bs_obj_string))
return return_list

View File

@@ -150,6 +150,9 @@ class BookSorter:
i[0]: i[1] for i in all_hashes_and_paths} i[0]: i[1] for i in all_hashes_and_paths}
def database_entry_for_book(self, file_hash): def database_entry_for_book(self, file_hash):
# TODO
# This will probably look a whole lot better with a namedtuple
database_return = database.DatabaseFunctions( database_return = database.DatabaseFunctions(
self.database_path).fetch_data( self.database_path).fetch_data(
('Title', 'Author', 'Year', 'ISBN', 'Tags', ('Title', 'Author', 'Year', 'ISBN', 'Tags',
@@ -246,7 +249,10 @@ class BookSorter:
if cover_image_raw: if cover_image_raw:
cover_image = resize_image(cover_image_raw) cover_image = resize_image(cover_image_raw)
else: else:
cover_image = fetch_cover(title, author) # TODO
# Needs an option
# cover_image = fetch_cover(title, author)
cover_image = None
this_book[file_md5]['cover_image'] = cover_image this_book[file_md5]['cover_image'] = cover_image
this_book[file_md5]['addition_mode'] = self.addition_mode this_book[file_md5]['addition_mode'] = self.addition_mode
@@ -408,3 +414,4 @@ def fetch_cover(title, author):
except: except:
logger.error(f'Couldn\'t find cover for ' + title) logger.error(f'Couldn\'t find cover for ' + title)
return None