Major improvements to epub parser

This commit is contained in:
BasioMeusPuga
2018-03-10 15:56:04 +05:30
parent 51d00bb9b5
commit ed8f676a05
9 changed files with 117 additions and 60 deletions

View File

@@ -586,7 +586,7 @@ class MainUI(QtWidgets.QMainWindow, mainwindow.Ui_MainWindow):
current_title = current_metadata['title'] current_title = current_metadata['title']
current_author = current_metadata['author'] current_author = current_metadata['author']
current_position = current_metadata['position'] current_position = current_metadata['position']
current_toc = current_metadata['content'].keys() current_toc = [i[0] for i in current_metadata['content']]
self.bookToolBar.tocBox.blockSignals(True) self.bookToolBar.tocBox.blockSignals(True)
self.bookToolBar.tocBox.clear() self.bookToolBar.tocBox.clear()

View File

@@ -97,8 +97,11 @@ class DatabaseFunctions:
isbn = i[1]['isbn'] isbn = i[1]['isbn']
tags = i[1]['tags'] tags = i[1]['tags']
if tags: if tags:
# Is a tuple. Needs to be a string # Is a list. Needs to be a string
tags = ', '.join([j for j in tags if j]) tags = ', '.join([str(j) for j in tags])
else:
# Is still a list. Needs to be None.
tags = None
sql_command_add = ( sql_command_add = (
"INSERT OR REPLACE INTO \ "INSERT OR REPLACE INTO \
@@ -173,7 +176,6 @@ class DatabaseFunctions:
return data return data
def modify_metadata(self, metadata_dict, book_hash): def modify_metadata(self, metadata_dict, book_hash):
def generate_binary(column, data): def generate_binary(column, data):
if column in ('Position', 'LastAccessed', 'Bookmarks'): if column in ('Position', 'LastAccessed', 'Bookmarks'):
return sqlite3.Binary(pickle.dumps(data)) return sqlite3.Binary(pickle.dumps(data))

View File

@@ -19,11 +19,8 @@
import os import os
import sys import sys
import zipfile import zipfile
from urllib.parse import unquote
import pprint
import inspect
import bs4
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@@ -39,9 +36,15 @@ class EPUB:
self.load_zip() self.load_zip()
contents_path = self.get_file_path( contents_path = self.get_file_path(
None, True) None, True)
if not contents_path:
return False # No opf was found so processing cannot continue
self.generate_book_metadata(contents_path) self.generate_book_metadata(contents_path)
self.parse_toc() self.parse_toc()
return True
def load_zip(self): def load_zip(self):
try: try:
self.zip_file = zipfile.ZipFile( self.zip_file = zipfile.ZipFile(
@@ -84,65 +87,106 @@ class EPUB:
if os.path.basename(i.filename) == os.path.basename(filename): if os.path.basename(i.filename) == os.path.basename(filename):
return i.filename return i.filename
return None
def read_from_zip(self, filename): def read_from_zip(self, filename):
filename = unquote(filename)
try: try:
file_data = self.zip_file.read(filename) file_data = self.zip_file.read(filename)
return file_data return file_data
except KeyError: except KeyError:
file_path_actual = self.get_file_path(filename) file_path_actual = self.get_file_path(filename)
if file_path_actual:
return self.zip_file.read(file_path_actual) return self.zip_file.read(file_path_actual)
else:
print('ePub module can\'t find ' + filename)
#______________________________________________________ #______________________________________________________
def generate_book_metadata(self, contents_path): def generate_book_metadata(self, contents_path):
self.book['title'] = 'Unknown'
self.book['author'] = 'Unknown'
self.book['isbn'] = None
self.book['tags'] = None
self.book['cover'] = None
self.book['toc_file'] = 'toc.ncx' # Overwritten if another one exists
# Parse XML
xml = self.parse_xml(contents_path, 'xml')
# Parse metadata # Parse metadata
item_dict = { item_dict = {
'title': 'dc:title', 'title': 'title',
'author': 'dc:creator', 'author': 'creator',
'date': 'dc:date'} 'year': 'date'}
xml = self.parse_xml(contents_path, 'lxml')
for i in item_dict.items(): for i in item_dict.items():
item = xml.find(i[1]) item = xml.find(i[1])
if item: if item:
self.book[i[0]] = item.text self.book[i[0]] = item.text
# Get identifier
xml = self.parse_xml(contents_path, 'xml')
metadata_items = xml.find('metadata')
for i in metadata_items.children:
if isinstance(i, bs4.element.Tag):
try: try:
if i.get('opf:scheme').lower() == 'isbn': self.book['year'] = int(self.book['year'][:4])
except (TypeError, KeyError, IndexError):
self.book['year'] = 9999
# Get identifier
identifier_items = xml.find_all('identifier')
for i in identifier_items:
scheme = i.get('scheme')
try:
if scheme.lower() == 'isbn':
self.book['isbn'] = i.text self.book['isbn'] = i.text
break
except AttributeError: except AttributeError:
self.book['isbn'] = None self.book['isbn'] = None
# Tags
tag_items = xml.find_all('subject')
tag_list = [i.text for i in tag_items]
self.book['tags'] = tag_list
# Get items # Get items
self.book['content_dict'] = {} self.book['content_dict'] = {}
all_items = xml.find_all('item') all_items = xml.find_all('item')
for i in all_items: for i in all_items:
media_type = i.get('media-type') media_type = i.get('media-type')
this_id = i.get('id')
if media_type == 'application/xhtml+xml': if media_type == 'application/xhtml+xml':
self.book['content_dict'][i.get('id')] = i.get('href') self.book['content_dict'][this_id] = i.get('href')
if media_type == 'application/x-dtbncx+xml': if media_type == 'application/x-dtbncx+xml':
self.book['toc_file'] = i.get('href') self.book['toc_file'] = i.get('href')
# Cover image # Cover image
# if i.get('id') == 'cover': if this_id.startswith('cover') and media_type.split('/')[0] == 'image':
# cover_href = i.get('href') cover_href = i.get('href')
# try: try:
# self.book['cover'] = self.zip_file.read(cover_href) self.book['cover'] = self.zip_file.read(cover_href)
# except KeyError: except KeyError:
# # The cover cannot be found according to the # The cover cannot be found according to the
# # path specified in the content reference # path specified in the content reference
# self.book['cover'] = self.zip_file.read( self.book['cover'] = self.zip_file.read(
# self.get_file_path(cover_href)) self.get_file_path(cover_href))
if not self.book['cover']:
# If no cover is located the conventional way,
# we go looking for the largest image in the book
biggest_image_size = 0
biggest_image = None
for j in self.zip_file.filelist:
if os.path.splitext(j.filename)[1] in ['.jpg', '.png', '.gif']:
if j.file_size > biggest_image_size:
biggest_image = j.filename
biggest_image_size = j.file_size
if biggest_image:
self.book['cover'] = self.read_from_zip(biggest_image)
else:
print('No cover found for: ' + self.filename)
with open('cover', 'wb') as this_cover:
this_cover.write(self.book['cover'])
# Parse spine and arrange chapter paths acquired from the opf # Parse spine and arrange chapter paths acquired from the opf
# according to the order IN THE SPINE # according to the order IN THE SPINE
@@ -157,24 +201,28 @@ class EPUB:
self.book['chapters_in_order'].append(chapter_path) self.book['chapters_in_order'].append(chapter_path)
def parse_toc(self): def parse_toc(self):
# Try to get chapter names from the toc
# This has no bearing on the actual order # This has no bearing on the actual order
# We're just using this to get chapter names # We're just using this to get chapter names
self.book['navpoint_dict'] = {}
toc_file = self.book['toc_file'] toc_file = self.book['toc_file']
if toc_file:
toc_file = self.get_file_path(toc_file) toc_file = self.get_file_path(toc_file)
xml = self.parse_xml(toc_file, 'xml') xml = self.parse_xml(toc_file, 'xml')
if not xml:
return
navpoints = xml.find_all('navPoint') navpoints = xml.find_all('navPoint')
self.book['navpoint_dict'] = {}
for i in navpoints: for i in navpoints:
chapter_title = i.find('text').text chapter_title = i.find('text').text
chapter_source = i.find('content').get('src') chapter_source = i.find('content').get('src')
chapter_source = chapter_source.split('#')[0] chapter_source = unquote(chapter_source.split('#')[0])
self.book['navpoint_dict'][chapter_source] = chapter_title self.book['navpoint_dict'][chapter_source] = chapter_title
def parse_chapters(self): def parse_chapters(self):
no_title_chapter = 1
self.book['book_list'] = [] self.book['book_list'] = []
for i in self.book['chapters_in_order']: for i in self.book['chapters_in_order']:
chapter_data = self.read_from_zip(i).decode() chapter_data = self.read_from_zip(i).decode()
@@ -182,8 +230,10 @@ class EPUB:
self.book['book_list'].append( self.book['book_list'].append(
(self.book['navpoint_dict'][i], chapter_data)) (self.book['navpoint_dict'][i], chapter_data))
except KeyError: except KeyError:
fallback_title = str(no_title_chapter) + ': No Title'
self.book['book_list'].append( self.book['book_list'].append(
(os.path.splitext(i)[0], chapter_data)) (fallback_title, chapter_data))
no_title_chapter += 1
def main(): def main():
book = EPUB(sys.argv[1]) book = EPUB(sys.argv[1])

View File

@@ -75,9 +75,15 @@ class Library:
author = i[1] author = i[1]
year = i[2] year = i[2]
path = i[4] path = i[4]
tags = i[7]
last_accessed = i[9] last_accessed = i[9]
tags = i[7]
if isinstance(tags, list): # When files are added for the first time
if tags:
tags = ', '.join(str(this_tag) for this_tag in tags)
else:
tags = None
try: try:
date_added = pickle.loads(i[3]) date_added = pickle.loads(i[3])
except TypeError: # Because of datetime.datetime.now() above except TypeError: # Because of datetime.datetime.now() above

View File

@@ -78,7 +78,7 @@ class ParseCBR:
'images_only': True} 'images_only': True}
extract_path = os.path.join(self.temp_dir, self.file_md5) extract_path = os.path.join(self.temp_dir, self.file_md5)
contents = collections.OrderedDict() contents = []
# I'm currently choosing not to keep multiple files in memory # I'm currently choosing not to keep multiple files in memory
self.book.extractall(extract_path) self.book.extractall(extract_path)
@@ -101,6 +101,6 @@ class ParseCBR:
page_name = 'Page ' + str(count + 1) page_name = 'Page ' + str(count + 1)
image_path = os.path.join(extract_path, i) image_path = os.path.join(extract_path, i)
contents[page_name] = image_path contents.append((page_name, image_path))
return contents, file_settings return contents, file_settings

View File

@@ -81,7 +81,7 @@ class ParseCBZ:
'images_only': True} 'images_only': True}
extract_path = os.path.join(self.temp_dir, self.file_md5) extract_path = os.path.join(self.temp_dir, self.file_md5)
contents = collections.OrderedDict() contents = []
# I'm currently choosing not to keep multiple files in memory # I'm currently choosing not to keep multiple files in memory
self.book.extractall(extract_path) self.book.extractall(extract_path)
@@ -104,6 +104,6 @@ class ParseCBZ:
page_name = 'Page ' + str(count + 1) page_name = 'Page ' + str(count + 1)
image_path = os.path.join(extract_path, i) image_path = os.path.join(extract_path, i)
contents[page_name] = image_path contents.append((page_name, image_path))
return contents, file_settings return contents, file_settings

View File

@@ -35,7 +35,10 @@ class ParseEPUB:
def read_book(self): def read_book(self):
self.book_ref = EPUB(self.filename) self.book_ref = EPUB(self.filename)
self.book_ref.read_epub() contents_found = self.book_ref.read_epub()
if not contents_found:
print('Cannot process: ' + self.filename)
return
self.book = self.book_ref.book self.book = self.book_ref.book
def get_title(self): def get_title(self):
@@ -45,19 +48,16 @@ class ParseEPUB:
return self.book['author'] return self.book['author']
def get_year(self): def get_year(self):
return 9999 return self.book['year']
def get_cover_image(self): def get_cover_image(self):
try:
return self.book['cover'] return self.book['cover']
except KeyError:
return None
def get_isbn(self): def get_isbn(self):
return self.book['isbn'] return self.book['isbn']
def get_tags(self): def get_tags(self):
return None return self.book['tags']
def get_contents(self): def get_contents(self):
extract_path = os.path.join(self.temp_dir, self.file_md5) extract_path = os.path.join(self.temp_dir, self.file_md5)

View File

@@ -214,8 +214,8 @@ class BookSorter:
content = all_content[0] content = all_content[0]
images_only = all_content[1]['images_only'] images_only = all_content[1]['images_only']
if not content.keys(): if not content:
content['Invalid'] = 'Possible Parse Error' content = [('Invalid', 'Something went horribly wrong')]
book_data = self.database_entry_for_book(file_md5) book_data = self.database_entry_for_book(file_md5)
position = book_data[0] position = book_data[0]

View File

@@ -53,8 +53,7 @@ class Tab(QtWidgets.QWidget):
self.generate_position() self.generate_position()
current_chapter = 1 current_chapter = 1
chapter_name = list(self.metadata['content'])[current_chapter - 1] chapter_content = self.metadata['content'][current_chapter - 1][1]
chapter_content = self.metadata['content'][chapter_name]
# The content display widget is, by default a QTextBrowser. # The content display widget is, by default a QTextBrowser.
# In case the incoming data is only images # In case the incoming data is only images
@@ -190,7 +189,7 @@ class Tab(QtWidgets.QWidget):
# TODO # TODO
# Calculate lines to incorporate into progress # Calculate lines to incorporate into progress
total_chapters = len(self.metadata['content'].keys()) total_chapters = len(self.metadata['content'])
current_chapter = 1 current_chapter = 1
scroll_value = 0 scroll_value = 0
@@ -250,8 +249,8 @@ class Tab(QtWidgets.QWidget):
self.contentView.show() self.contentView.show()
def change_chapter_tocBox(self): def change_chapter_tocBox(self):
chapter_name = self.window().bookToolBar.tocBox.currentText() chapter_number = self.window().bookToolBar.tocBox.currentIndex()
required_content = self.metadata['content'][chapter_name] required_content = self.metadata['content'][chapter_number][1]
if self.are_we_doing_images_only: if self.are_we_doing_images_only:
self.contentView.loadImage(required_content) self.contentView.loadImage(required_content)
@@ -447,7 +446,7 @@ class PliantQGraphicsView(QtWidgets.QGraphicsView):
# Image panning with mouse # Image panning with mouse
content = self.parent.metadata['content'] content = self.parent.metadata['content']
image_paths = [i[1] for i in content.items()] image_paths = [i[1] for i in content]
def generate_image_cache(current_image): def generate_image_cache(current_image):
print('Building image cache') print('Building image cache')