Overhaul EPUB parsing and ToC generation
@@ -36,6 +36,7 @@ TODO
 Set focus to newly added file
 Reading:
 ✓ Drop down for TOC
+✓ Treeview navigation for TOC
 ✓ Override the keypress event of the textedit
 ✓ Use format* icons for toolbar buttons
 ✓ Implement book view settings with a(nother) toolbar
@@ -86,7 +87,6 @@ TODO
 Have them save to memory
 ✓ fb2 support
 ✓ Images need to show up in their placeholders
-djvu support
 Other:
 ✓ Define every widget in code
 Bugs:
@@ -98,8 +98,11 @@ TODO
 Better recursion needed for fb2 toc
 
 Secondary:
+Additional Settings:
+Disable progressbar - 20% book addition speed improvement
+Disable cover loading when reading - Saves ~2M / book
+Create covers for books without them - VERY SLOW
 Special formatting for each chapter's title
-Create covers for books without them
 Signal end of chapter with some text
 Graphical themes
 Change focus rectangle dimensions
@@ -108,7 +111,7 @@ TODO
 Goodreads API: Ratings, Read, Recommendations
 Get ISBN using python-isbnlib
 Use embedded fonts + CSS
-txt, doc, chm support
+txt, doc, chm, djvu support
 Include icons for filetype emblems
 Comic view modes
 Continuous paging
@@ -116,7 +119,7 @@ TODO
 ? Add only one file type if multiple are present
 ? Create emblem per filetype
 In application notifications
 Notification in case the filter is filtering out all files with no option in place
 Option to fit images to viewport
 
 Need help with:
@@ -29,14 +29,13 @@ class ParseEPUB:
         # Maybe also include book description
         self.book_ref = None
         self.book = None
+        self.temp_dir = temp_dir
         self.filename = filename
         self.extract_path = os.path.join(temp_dir, file_md5)
 
     def read_book(self):
-        self.book_ref = EPUB(self.filename)
-        contents_found = self.book_ref.read_epub()
-        if not contents_found:
-            return False
+        self.book_ref = EPUB(self.filename, self.temp_dir)
+        self.book_ref.generate_metadata()
         self.book = self.book_ref.book
         return True
 
@@ -61,14 +60,8 @@ class ParseEPUB:
     def get_contents(self):
         zipfile.ZipFile(self.filename).extractall(self.extract_path)
 
-        self.book_ref.parse_toc()
-        self.book_ref.parse_chapters(temp_dir=self.extract_path)
+        self.book_ref.generate_toc()
+        self.book_ref.generate_content()
 
-        toc = []
-        content = []
-        for count, i in enumerate(self.book['book_list']):
-            toc.append((1, i[0], count + 1))
-            content.append(i[1])
-
         # Return toc, content, images_only
-        return toc, content, False
+        return self.book['toc'], self.book['content'], False
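Taken together, the two ParseEPUB hunks reduce the wrapper to a thin shim over the rewritten EPUB class: read_book() builds the EPUB object and generates metadata, and get_contents() extracts the archive, then delegates ToC and content generation. A minimal usage sketch; the module path, constructor argument order, and upstream md5 step are assumptions not shown in this diff:

```python
import hashlib
import tempfile

from lector.parsers.epub import ParseEPUB  # module path assumed

filename = '/tmp/example.epub'  # hypothetical input file
temp_dir = tempfile.mkdtemp()
with open(filename, 'rb') as book_file:
    file_md5 = hashlib.md5(book_file.read()).hexdigest()

book = ParseEPUB(filename, temp_dir, file_md5)  # argument order assumed
if book.read_book():  # constructs EPUB(...) and runs generate_metadata()
    # get_contents() extracts the zip, runs generate_toc() and
    # generate_content(), and returns (toc, content, images_only)
    toc, content, images_only = book.get_contents()
```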
@@ -14,175 +14,333 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
-import os
-import logging
-import zipfile
-from urllib.parse import unquote
+# TODO
+# See if inserting chapters not in the toc.ncx can be avoided
+# Missing file order is messed up
+# Account for stylesheets... eventually
+# Everything needs logging
+# Mobipocket files
+
+import os
+import zipfile
+import logging
+import collections
 
+import xmltodict
 from bs4 import BeautifulSoup
 
 logger = logging.getLogger(__name__)
 
 
 class EPUB:
-    def __init__(self, filename):
-        self.filename = filename
+    def __init__(self, book_filename, temp_dir):
+        self.book_filename = book_filename
+        self.temp_dir = temp_dir
         self.zip_file = None
+        self.file_list = None
+        self.opf_dict = None
         self.book = {}
 
+        self.generate_references()
+
+    def find_file(self, filename):
+        # First, look for the file in the root of the book
+        if filename in self.file_list:
+            return filename
+
+        # Then, search for it elsewhere
+        else:
+            file_basename = os.path.basename(filename)
+            for i in self.file_list:
+                if os.path.basename(i) == file_basename:
+                    return i
+
+        # If the file isn't found
+        logger.error(filename + ' not found')
+        return False
+
+    def generate_references(self):
+        self.zip_file = zipfile.ZipFile(
+            self.book_filename, mode='r', allowZip64=True)
+        self.file_list = self.zip_file.namelist()
+
+        # Book structure relies on parsing the .opf file
+        # in the book. Now that might be the usual content.opf
+        # or package.opf or it might be named after your favorite
+        # eldritch abomination. The point is we have to check
+        # the container.xml
+        container = self.find_file('container.xml')
+        if container:
+            container_xml = self.zip_file.read(container)
+            container_dict = xmltodict.parse(container_xml)
+            packagefile = container_dict['container']['rootfiles']['rootfile']['@full-path']
+        else:
+            presumptive_names = ('content.opf', 'package.opf')
+            for i in presumptive_names:
+                packagefile = self.find_file(i)
+                if packagefile:
+                    break
+
+        packagefile_data = self.zip_file.read(packagefile)
+        self.opf_dict = xmltodict.parse(packagefile_data)
+
+    def generate_toc(self):
+        self.book['toc'] = []
+
+        # I'm currently going with the file always being named toc.ncx
+        # But this is epub. The wild west of ebook formats.
+        tocfile = self.find_file('toc.ncx')
+        tocfile_data = self.zip_file.read(tocfile)
+        toc_dict = xmltodict.parse(tocfile_data)
+
+        def recursor(level, nav_node):
+            if isinstance(nav_node, list):
+                these_contents = [[
+                    level + 1,
+                    i['navLabel']['text'],
+                    i['content']['@src']] for i in nav_node]
+                self.book['toc'].extend(these_contents)
+                return
+
+            if 'navPoint' in nav_node.keys():
+                recursor(level, nav_node['navPoint'])
+
+            else:
+                self.book['toc'].append([
+                    level + 1,
+                    nav_node['navLabel']['text'],
+                    nav_node['content']['@src']])
+
+        navpoints = toc_dict['ncx']['navMap']['navPoint']
+        for top_level_nav in navpoints:
+            self.book['toc'].append([
+                1,
+                top_level_nav['navLabel']['text'],
+                top_level_nav['content']['@src']])
+
+            if 'navPoint' in top_level_nav.keys():
+                recursor(1, top_level_nav)
+
+    def get_chapter_content(self, chapter_file):
+        this_file = self.find_file(chapter_file)
+        if this_file:
+            return self.zip_file.read(this_file).decode()
+        else:
+            print('Not found: ' + chapter_file)
+            return chapter_file
+
+    def parse_split_chapters(self, chapters_with_split_content):
         self.book['split_chapters'] = {}
 
-    def read_epub(self):
-        # This is the function that should error out in
-        # case the module cannot process the file
-        try:
-            self.load_zip()
-            contents_path = self.get_file_path(
-                None, True)
+        # For split chapters, get the whole chapter first, then split
+        # between ids using their anchors, then "heal" the resultant text
+        # by creating a BeautifulSoup object. Write its str to the content
+        for i in chapters_with_split_content.items():
+            chapter_file = i[0]
+            self.book['split_chapters'][chapter_file] = {}
 
-            if not contents_path:
-                return False  # No (valid) opf was found so processing cannot continue
+            chapter_content = self.get_chapter_content(chapter_file)
+            soup = BeautifulSoup(chapter_content, 'lxml')
 
-            self.generate_book_metadata(contents_path)
-        except:  # Not specifying an exception type here may be justified
-            return False
+            split_anchors = i[1]
+            for this_anchor in reversed(split_anchors):
+                this_tag = soup.find(
+                    attrs={"id":lambda x: x == this_anchor})
 
-        return True
+                markup_split = str(soup).split(str(this_tag))
+                soup = BeautifulSoup(markup_split[0], 'lxml')
+                this_markup = BeautifulSoup(
+                    str(this_tag) + markup_split[1], 'lxml')
 
-    def load_zip(self):
-        try:
-            self.zip_file = zipfile.ZipFile(
-                self.filename, mode='r', allowZip64=True)
-        except (KeyError, AttributeError, zipfile.BadZipFile):
-            logger.error('Malformed zip file ' + self.filename)
-            return
+                self.book['split_chapters'][chapter_file][this_anchor] = str(this_markup)
 
-    def parse_xml(self, filename, parser):
-        try:
-            this_xml = self.zip_file.read(filename).decode()
-        except KeyError:
-            short_filename = os.path.basename(self.filename)
-            warning_string = f'{str(filename)} not found in {short_filename}'
-            logger.warning(warning_string)
-            return
+            # Remaining markup is assigned here
+            self.book['split_chapters'][chapter_file]['top_level'] = str(soup)
 
-        root = BeautifulSoup(this_xml, parser)
-        return root
+    def generate_content(self):
+        # Find all the chapters mentioned in the opf spine
+        # These are simply ids that correspond to the actual item
+        # as mentioned in the manifest - which is a comprehensive
+        # list of files
+        chapters_in_spine = [
+            i['@idref']
+            for i in self.opf_dict['package']['spine']['itemref']]
 
-    def get_file_path(self, filename, is_content_file=False):
-        # Use this to get the location of the content.opf file
-        # And maybe some other file that has a more well formatted
-        # idea of the TOC
-        # We're going to all this trouble because there really is
-        # no going forward without a toc
-        if is_content_file:
-            container_location = self.get_file_path('container.xml')
-            xml = self.parse_xml(container_location, 'xml')
+        # Next, find items and ids from the manifest
+        chapters_from_manifest = {
+            i['@id']: i['@href']
+            for i in self.opf_dict['package']['manifest']['item']}
 
-        if xml:
-            root_item = xml.find('rootfile')
-            try:
-                return root_item.get('full-path')
-            except AttributeError:
-                error_string = f'ePub module: {self.filename} has a malformed container.xml'
-                logger.error(error_string)
-                return None
+        # Finally, check which items are supposed to be in the spine
+        # on the basis of the id and change the toc accordingly
+        spine_final = []
+        for i in chapters_in_spine:
+            try:
+                spine_final.append(chapters_from_manifest.pop(i))
+            except KeyError:
+                pass
+
+        # TODO
+        # Check what happens in case missing chapters are either
+        # at the beginning or the end of the book
+        chapter_title = 1
+        toc_chapters = [i[2] for i in self.book['toc']]
+        last_valid_index = 0
+        for i in spine_final:
+            if not i in toc_chapters:
+                previous_chapter = spine_final[spine_final.index(i) - 1]
+                try:
+                    previous_chapter_toc_index = toc_chapters.index(previous_chapter)
+                    # In case of 2+ consecutive missing chapters
+                    last_valid_index = previous_chapter_toc_index
+                except ValueError:
+                    last_valid_index += 1
+                self.book['toc'].insert(
+                    last_valid_index + 1,
+                    [1, str(chapter_title), i])
+                chapter_title += 1
+
+        # Parse split chapters as below
+        # They can be picked up during the iteration through the toc
+        chapters_with_split_content = {}
+        for i in self.book['toc']:
+            if '#' in i[2]:
+                this_split = i[2].split('#')
+                chapter = this_split[0]
+                anchor = this_split[1]
+
+                try:
+                    chapters_with_split_content[chapter].append(anchor)
+                except KeyError:
+                    chapters_with_split_content[chapter] = []
+                    chapters_with_split_content[chapter].append(anchor)
+
+        self.parse_split_chapters(chapters_with_split_content)
+
+        # Now we iterate over the ToC as presented in the toc.ncx
+        # and add chapters to the content list
+        # In case a split chapter is encountered, get its content
+        # from the split_chapters dictionary
+        # What could possibly go wrong?
+
+        # The content list is separated from the toc list because
+        # the mupdf library returns its own toc a certain way and
+        # this keeps things uniform
+        split_chapters = self.book['split_chapters']
+        toc_copy = self.book['toc'][:]
+        self.book['content'] = []
+
+        # Put the book into the book
+        for count, i in enumerate(toc_copy):
+            chapter_file = i[2]
+
+            # Get split content according to its corresponding id attribute
+            if '#' in chapter_file:
+                this_split = chapter_file.split('#')
+                chapter_file_proper = this_split[0]
+                this_anchor = this_split[1]
+
+                try:
+                    chapter_content = (
+                        split_chapters[chapter_file_proper][this_anchor])
+                except KeyError:
+                    chapter_content = 'Parse Error'
+                    error_string = (
+                        f'Error parsing {self.book_filename}: {chapter_file_proper}')
+                    logger.error(error_string)
 
-            possible_filenames = ('content.opf', 'package.opf')
-            for i in possible_filenames:
-                presumptive_location = self.get_file_path(i)
-                if presumptive_location:
-                    return presumptive_location
+            # Get content that remained at the end of the pillaging above
+            elif chapter_file in split_chapters.keys():
+                try:
+                    chapter_content = split_chapters[chapter_file]['top_level']
+                except KeyError:
+                    chapter_content = 'Parse Error'
+                    error_string = (
+                        f'Error parsing {self.book_filename}: {chapter_file}')
+                    logger.error(error_string)
 
-        for i in self.zip_file.filelist:
-            if os.path.basename(i.filename) == os.path.basename(filename):
-                return i.filename
-
-        return None
-
-    def read_from_zip(self, filename):
-        filename = unquote(filename)
-        try:
-            file_data = self.zip_file.read(filename)
-            return file_data
-        except KeyError:
-            file_path_actual = self.get_file_path(filename)
-            if file_path_actual:
-                return self.zip_file.read(file_path_actual)
+            # Vanilla non split chapters
             else:
-                logger.error('ePub module can\'t find ' + filename)
+                chapter_content = self.get_chapter_content(chapter_file)
 
-#______________________________________________________
+            # The count + 2 is an adjustment due to the cover being inserted below
+            self.book['toc'][count][2] = count + 2
+            self.book['content'].append(chapter_content)
 
-    def generate_book_metadata(self, contents_path):
-        self.book['title'] = os.path.splitext(
-            os.path.basename(self.filename))[0]
-        self.book['author'] = 'Unknown'
-        self.book['isbn'] = None
-        self.book['tags'] = None
-        self.book['cover'] = None
-        self.book['toc_file'] = 'toc.ncx'  # Overwritten if another one exists
+        self.generate_book_cover()
+        if self.book['cover']:
+            cover_path = os.path.join(
+                self.temp_dir, os.path.basename(self.book_filename)) + '- cover'
+            with open(cover_path, 'wb') as cover_temp:
+                cover_temp.write(self.book['cover'])
 
-        # Parse XML
-        xml = self.parse_xml(contents_path, 'xml')
+            self.book['toc'].insert(0, (1, 'Cover', 1))
+            self.book['content'].insert(
+                0, (f'<center><img src="{cover_path}" alt="Cover"></center>'))
 
-        # Parse metadata
-        item_dict = {
-            'title': 'title',
-            'author': 'creator',
-            'year': 'date'}
-
-        for i in item_dict.items():
-            item = xml.find(i[1])
-            if item:
-                self.book[i[0]] = item.text
+    def generate_metadata(self):
+        metadata = self.opf_dict['package']['metadata']
+
+        # There are no exception types specified below
+        # This is on purpose and makes me long for the days
+        # of simpler, happier things.
 
+        # Book title
         try:
-            self.book['year'] = int(self.book['year'][:4])
-        except (TypeError, KeyError, IndexError, ValueError):
+            self.book['title'] = metadata['dc:title']
+            if isinstance(self.book['title'], collections.OrderedDict):
+                self.book['title'] = metadata['dc:title']['#text']
+        except:
+            print('Title parse error')
+            self.book['title'] = os.path.splitext(
+                os.path.basename(self.book_filename))[0]
+
+        # Book author
+        try:
+            self.book['author'] = metadata['dc:creator']['#text']
+        except:
+            self.book['author'] = 'Unknown'
+
+        # Book year
+        try:
+            self.book['year'] = int(metadata['dc:date'][:4])
+        except:
             self.book['year'] = 9999
 
-        # Get identifier
-        identifier_items = xml.find_all('identifier')
-        for i in identifier_items:
-            scheme = i.get('scheme')
-            try:
-                if scheme.lower() == 'isbn':
-                    self.book['isbn'] = i.text
-            except AttributeError:
-                self.book['isbn'] = None
+        # Book isbn
+        self.book['isbn'] = None
+        try:
+            for i in metadata['dc:identifier']:
+                if i['@opf:scheme'].lower() == 'isbn':
+                    self.book['isbn'] = i['#text']
+        except:
+            pass
 
-        # Tags
-        tag_items = xml.find_all('subject')
-        tag_list = [i.text for i in tag_items]
-        self.book['tags'] = tag_list
+        # Book tags
+        try:
+            self.book['tags'] = metadata['dc:subject']
+        except:
+            self.book['tags'] = []
 
-        # Get items
-        self.book['content_dict'] = {}
-        all_items = xml.find_all('item')
-        for i in all_items:
-            media_type = i.get('media-type')
-            this_id = i.get('id')
+        # Book cover
+        self.generate_book_cover()
 
-            if media_type == 'application/xhtml+xml' or media_type == 'text/html':
-                self.book['content_dict'][this_id] = i.get('href')
-
-            if media_type == 'application/x-dtbncx+xml':
-                self.book['toc_file'] = i.get('href')
-
-            # Cover image
-            if 'cover' in this_id and media_type.split('/')[0] == 'image':
-                cover_href = i.get('href')
-                try:
-                    self.book['cover'] = self.zip_file.read(cover_href)
-                except KeyError:
-                    # The cover cannot be found according to the
-                    # path specified in the content reference
-                    self.book['cover'] = self.zip_file.read(
-                        self.get_file_path(cover_href))
+    def generate_book_cover(self):
+        # This is separate because the book cover needs to
+        # be found and extracted both during addition / reading
+        self.book['cover'] = None
+        try:
+            cover_image = [
+                i['@href'] for i in self.opf_dict['package']['manifest']['item']
+                if i['@media-type'].split('/')[0] == 'image' and
+                'cover' in i['@id']][0]
+            self.book['cover'] = self.zip_file.read(
+                self.find_file(cover_image))
+        except:
+            pass
 
+        # Find book cover the hard way
         if not self.book['cover']:
-            # If no cover is located the conventional way,
-            # we go looking for the largest image in the book
             biggest_image_size = 0
             biggest_image = None
             for j in self.zip_file.filelist:
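The heart of the new generate_references() is the container.xml indirection: instead of guessing where the OPF lives, the spec-mandated container file names it. A standalone sketch of that lookup with xmltodict, using a minimal, made-up container.xml:

```python
import xmltodict

container_xml = '''<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
  <rootfiles>
    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
  </rootfiles>
</container>'''

container_dict = xmltodict.parse(container_xml)
# xmltodict exposes XML attributes under an '@' prefix
packagefile = container_dict['container']['rootfiles']['rootfile']['@full-path']
print(packagefile)  # OEBPS/content.opf
```

The presumptive_names fallback above only matters for books whose container.xml is missing or unreadable.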
@@ -192,139 +350,5 @@ class EPUB:
                     biggest_image_size = j.file_size
 
         if biggest_image:
-            self.book['cover'] = self.read_from_zip(biggest_image)
-        else:
-            logger.error('No cover found for: ' + self.filename)
+            self.book['cover'] = self.zip_file.read(
+                self.find_file(biggest_image))
-
-        # Parse spine and arrange chapter paths acquired from the opf
-        # according to the order IN THE SPINE
-        spine_items = xml.find_all('itemref')
-        spine_order = []
-        for i in spine_items:
-            spine_order.append(i.get('idref'))
-
-        self.book['chapters_in_order'] = []
-        for i in spine_order:
-            chapter_path = self.book['content_dict'][i]
-            self.book['chapters_in_order'].append(chapter_path)
-
-    def parse_toc(self):
-        # This has no bearing on the actual order
-        # We're just using this to get chapter names
-        self.book['navpoint_dict'] = {}
-
-        toc_file = self.book['toc_file']
-        if toc_file:
-            toc_file = self.get_file_path(toc_file)
-
-        xml = self.parse_xml(toc_file, 'xml')
-        if not xml:
-            return
-
-        navpoints = xml.find_all('navPoint')
-
-        for i in navpoints:
-            chapter_title = i.find('text').text
-            chapter_source = i.find('content').get('src')
-            chapter_source_file = unquote(chapter_source.split('#')[0])
-
-            if '#' in chapter_source:
-                try:
-                    self.book['split_chapters'][chapter_source_file].append(
-                        (chapter_source.split('#')[1], chapter_title))
-                except KeyError:
-                    self.book['split_chapters'][chapter_source_file] = []
-                    self.book['split_chapters'][chapter_source_file].append(
-                        (chapter_source.split('#')[1], chapter_title))
-
-            self.book['navpoint_dict'][chapter_source_file] = chapter_title
-
-    def parse_chapters(self, temp_dir=None, split_large_xml=False):
-        no_title_chapter = 0
-        self.book['book_list'] = []
-
-        for i in self.book['chapters_in_order']:
-            chapter_data = self.read_from_zip(i).decode()
-
-            if i in self.book['split_chapters'] and not split_large_xml:
-                split_chapters = get_split_content(
-                    chapter_data, self.book['split_chapters'][i])
-                self.book['book_list'].extend(split_chapters)
-
-            elif split_large_xml:
-                # https://stackoverflow.com/questions/14444732/how-to-split-a-html-page-to-multiple-pages-using-python-and-beautiful-soup
-                markup = BeautifulSoup(chapter_data, 'xml')
-                chapters = []
-                pagebreaks = markup.find_all('pagebreak')
-
-                def next_element(elem):
-                    while elem is not None:
-                        elem = elem.next_sibling
-                        if hasattr(elem, 'name'):
-                            return elem
-
-                for pbreak in pagebreaks:
-                    chapter = [str(pbreak)]
-                    elem = next_element(pbreak)
-                    while elem and elem.name != 'pagebreak':
-                        chapter.append(str(elem))
-                        elem = next_element(elem)
-                    chapters.append('\n'.join(chapter))
-
-                for this_chapter in chapters:
-                    fallback_title = str(no_title_chapter)
-                    self.book['book_list'].append(
-                        (fallback_title, this_chapter + ('<br/>' * 8)))
-                    no_title_chapter += 1
-            else:
-                try:
-                    self.book['book_list'].append(
-                        (self.book['navpoint_dict'][i], chapter_data + ('<br/>' * 8)))
-                except KeyError:
-                    fallback_title = str(no_title_chapter)
-                    self.book['book_list'].append(
-                        (fallback_title, chapter_data))
-                    no_title_chapter += 1
-
-        cover_path = os.path.join(temp_dir, os.path.basename(self.filename)) + '- cover'
-        if self.book['cover']:
-            with open(cover_path, 'wb') as cover_temp:
-                cover_temp.write(self.book['cover'])
-
-        try:
-            self.book['book_list'][0] = (
-                'Cover', f'<center><img src="{cover_path}" alt="Cover"></center>')
-        except IndexError:
-            pass
-
-
-def get_split_content(chapter_data, split_by):
-    split_anchors = [i[0] for i in split_by]
-    chapter_titles = [i[1] for i in split_by]
-    return_list = []
-
-    xml = BeautifulSoup(chapter_data, 'lxml')
-    xml_string = xml.body.prettify()
-
-    for count, i in enumerate(split_anchors):
-        this_split = xml_string.split(i)
-        current_chapter = this_split[0]
-
-        bs_obj = BeautifulSoup(current_chapter, 'lxml')
-        # Since tags correspond to data following them, the first
-        # chunk will be ignored
-        # As will all empty chapters
-        if bs_obj.text == '\n' or bs_obj.text == '' or count == 0:
-            continue
-        bs_obj_string = str(bs_obj).replace('">', '', 1) + ('<br/>' * 8)
-
-        return_list.append(
-            (chapter_titles[count - 1], bs_obj_string))
-
-        xml_string = ''.join(this_split[1:])
-
-    bs_obj = BeautifulSoup(xml_string, 'lxml')
-    bs_obj_string = str(bs_obj).replace('">', '', 1) + ('<br/>' * 8)
-    return_list.append(
-        (chapter_titles[-1], bs_obj_string))
-
-    return return_list
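One subtlety behind generate_toc()'s recursor and its isinstance(nav_node, list) check: xmltodict maps a repeated element to a list but a single occurrence to a plain mapping, so a navMap with one navPoint looks structurally different from one with several. A small demonstration on made-up toc.ncx fragments:

```python
import xmltodict

one = xmltodict.parse(
    '<navMap><navPoint><navLabel><text>A</text></navLabel>'
    '<content src="a.html"/></navPoint></navMap>')
many = xmltodict.parse(
    '<navMap>'
    '<navPoint><navLabel><text>A</text></navLabel><content src="a.html"/></navPoint>'
    '<navPoint><navLabel><text>B</text></navLabel><content src="b.html"/></navPoint>'
    '</navMap>')

print(isinstance(one['navMap']['navPoint'], list))   # False: single child is a mapping
print(isinstance(many['navMap']['navPoint'], list))  # True: repeated children become a list
```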
@@ -150,6 +150,9 @@ class BookSorter:
             i[0]: i[1] for i in all_hashes_and_paths}
 
     def database_entry_for_book(self, file_hash):
+        # TODO
+        # This will probably look a whole lot better with a namedtuple
+
         database_return = database.DatabaseFunctions(
             self.database_path).fetch_data(
                 ('Title', 'Author', 'Year', 'ISBN', 'Tags',
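A sketch of the namedtuple idea floated in the TODO above; the type and field names are assumptions mirroring the columns fetched here:

```python
from collections import namedtuple

# Hypothetical row type for the fetch_data() result
DatabaseEntry = namedtuple(
    'DatabaseEntry', ['title', 'author', 'year', 'isbn', 'tags'])

row = DatabaseEntry('Dune', 'Frank Herbert', 1965, None, 'scifi')
print(row.title, row.year)  # named access instead of positional indexing
```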
@@ -246,7 +249,10 @@ class BookSorter:
         if cover_image_raw:
             cover_image = resize_image(cover_image_raw)
         else:
-            cover_image = fetch_cover(title, author)
+            # TODO
+            # Needs an option
+            # cover_image = fetch_cover(title, author)
+            cover_image = None
 
         this_book[file_md5]['cover_image'] = cover_image
         this_book[file_md5]['addition_mode'] = self.addition_mode
@@ -408,3 +414,4 @@ def fetch_cover(title, author):
 
     except:
         logger.error(f'Couldn\'t find cover for ' + title)
+        return None
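For reference, the anchor-splitting trick that the new parse_split_chapters relies on, reduced to a self-contained sketch on a made-up chapter: split the serialized markup at each anchor tag (walking the anchors in reverse) and let BeautifulSoup heal each fragment back into parseable HTML:

```python
from bs4 import BeautifulSoup

chapter = ('<html><body><p>Intro</p>'
           '<h2 id="ch1">One</h2><p>First</p>'
           '<h2 id="ch2">Two</h2><p>Second</p></body></html>')
split_anchors = ['ch1', 'ch2']

pieces = {}
soup = BeautifulSoup(chapter, 'lxml')
for this_anchor in reversed(split_anchors):
    this_tag = soup.find(attrs={'id': lambda x: x == this_anchor})
    markup_split = str(soup).split(str(this_tag))
    # everything before the anchor stays in the soup for the next pass
    soup = BeautifulSoup(markup_split[0], 'lxml')
    # the anchor tag and everything after it form this piece
    pieces[this_anchor] = str(this_tag) + markup_split[1]
# whatever precedes the first anchor is the 'top_level' remainder
pieces['top_level'] = str(soup)

print(pieces['ch2'])  # the healed fragment from <h2 id="ch2"> onward
```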