Major improvements to epub parser
@@ -586,7 +586,7 @@ class MainUI(QtWidgets.QMainWindow, mainwindow.Ui_MainWindow):
         current_title = current_metadata['title']
         current_author = current_metadata['author']
         current_position = current_metadata['position']
-        current_toc = current_metadata['content'].keys()
+        current_toc = [i[0] for i in current_metadata['content']]

         self.bookToolBar.tocBox.blockSignals(True)
         self.bookToolBar.tocBox.clear()
@@ -97,8 +97,11 @@ class DatabaseFunctions:
             isbn = i[1]['isbn']
             tags = i[1]['tags']
             if tags:
-                # Is a tuple. Needs to be a string
-                tags = ', '.join([j for j in tags if j])
+                # Is a list. Needs to be a string
+                tags = ', '.join([str(j) for j in tags])
+            else:
+                # Is still a list. Needs to be None.
+                tags = None

             sql_command_add = (
                 "INSERT OR REPLACE INTO \
@@ -173,7 +176,6 @@ class DatabaseFunctions:
         return data

     def modify_metadata(self, metadata_dict, book_hash):
-
         def generate_binary(column, data):
             if column in ('Position', 'LastAccessed', 'Bookmarks'):
                 return sqlite3.Binary(pickle.dumps(data))
@@ -19,11 +19,8 @@
 import os
 import sys
 import zipfile
+from urllib.parse import unquote

-import pprint
-import inspect

-import bs4
 from bs4 import BeautifulSoup

-
@@ -39,9 +36,15 @@ class EPUB:
         self.load_zip()
         contents_path = self.get_file_path(
             None, True)
+
+        if not contents_path:
+            return False # No opf was found so processing cannot continue
+
         self.generate_book_metadata(contents_path)
         self.parse_toc()
+
+        return True

     def load_zip(self):
         try:
             self.zip_file = zipfile.ZipFile(
@@ -84,65 +87,106 @@ class EPUB:
             if os.path.basename(i.filename) == os.path.basename(filename):
                 return i.filename
+
+        return None

     def read_from_zip(self, filename):
+        filename = unquote(filename)
         try:
             file_data = self.zip_file.read(filename)
             return file_data
         except KeyError:
             file_path_actual = self.get_file_path(filename)
-            return self.zip_file.read(file_path_actual)
+            if file_path_actual:
+                return self.zip_file.read(file_path_actual)
+            else:
+                print('ePub module can\'t find ' + filename)

     #______________________________________________________

     def generate_book_metadata(self, contents_path):
+        self.book['title'] = 'Unknown'
+        self.book['author'] = 'Unknown'
+        self.book['isbn'] = None
+        self.book['tags'] = None
+        self.book['cover'] = None
+        self.book['toc_file'] = 'toc.ncx' # Overwritten if another one exists
+
+        # Parse XML
+        xml = self.parse_xml(contents_path, 'xml')
+
         # Parse metadata
         item_dict = {
-            'title': 'dc:title',
-            'author': 'dc:creator',
-            'date': 'dc:date'}
+            'title': 'title',
+            'author': 'creator',
+            'year': 'date'}

-        xml = self.parse_xml(contents_path, 'lxml')
-
         for i in item_dict.items():
             item = xml.find(i[1])
             if item:
                 self.book[i[0]] = item.text

-        # Get identifier
-        xml = self.parse_xml(contents_path, 'xml')
-
-        metadata_items = xml.find('metadata')
-        for i in metadata_items.children:
-            if isinstance(i, bs4.element.Tag):
-                try:
-                    if i.get('opf:scheme').lower() == 'isbn':
-                        self.book['isbn'] = i.text
-                        break
-                except AttributeError:
-                    self.book['isbn'] = None
+        try:
+            self.book['year'] = int(self.book['year'][:4])
+        except (TypeError, KeyError, IndexError):
+            self.book['year'] = 9999
+
+        # Get identifier
+        identifier_items = xml.find_all('identifier')
+        for i in identifier_items:
+            scheme = i.get('scheme')
+            try:
+                if scheme.lower() == 'isbn':
+                    self.book['isbn'] = i.text
+            except AttributeError:
+                self.book['isbn'] = None
+
+        # Tags
+        tag_items = xml.find_all('subject')
+        tag_list = [i.text for i in tag_items]
+        self.book['tags'] = tag_list

         # Get items
         self.book['content_dict'] = {}
         all_items = xml.find_all('item')
         for i in all_items:
             media_type = i.get('media-type')
+            this_id = i.get('id')

             if media_type == 'application/xhtml+xml':
-                self.book['content_dict'][i.get('id')] = i.get('href')
+                self.book['content_dict'][this_id] = i.get('href')

             if media_type == 'application/x-dtbncx+xml':
                 self.book['toc_file'] = i.get('href')

             # Cover image
-            # if i.get('id') == 'cover':
-            # cover_href = i.get('href')
-            # try:
-            # self.book['cover'] = self.zip_file.read(cover_href)
-            # except KeyError:
-            # # The cover cannot be found according to the
-            # # path specified in the content reference
-            # self.book['cover'] = self.zip_file.read(
-            # self.get_file_path(cover_href))
+            if this_id.startswith('cover') and media_type.split('/')[0] == 'image':
+                cover_href = i.get('href')
+                try:
+                    self.book['cover'] = self.zip_file.read(cover_href)
+                except KeyError:
+                    # The cover cannot be found according to the
+                    # path specified in the content reference
+                    self.book['cover'] = self.zip_file.read(
+                        self.get_file_path(cover_href))
+
+        if not self.book['cover']:
+            # If no cover is located the conventioanl way,
+            # we go looking for the largest image in the book
+            biggest_image_size = 0
+            biggest_image = None
+            for j in self.zip_file.filelist:
+                if os.path.splitext(j.filename)[1] in ['.jpg', '.png', '.gif']:
+                    if j.file_size > biggest_image_size:
+                        biggest_image = j.filename
+                        biggest_image_size = j.file_size
+
+            if biggest_image:
+                self.book['cover'] = self.read_from_zip(biggest_image)
+            else:
+                print('No cover found for: ' + self.filename)
+
+        with open('cover', 'wb') as this_cover:
+            this_cover.write(self.book['cover'])

         # Parse spine and arrange chapter paths acquired from the opf
         # according to the order IN THE SPINE
@@ -157,24 +201,28 @@ class EPUB:
             self.book['chapters_in_order'].append(chapter_path)

     def parse_toc(self):
-        # Try to get chapter names from the toc
         # This has no bearing on the actual order
         # We're just using this to get chapter names
+        self.book['navpoint_dict'] = {}
+
         toc_file = self.book['toc_file']
-        toc_file = self.get_file_path(toc_file)
+        if toc_file:
+            toc_file = self.get_file_path(toc_file)

         xml = self.parse_xml(toc_file, 'xml')
+        if not xml:
+            return

         navpoints = xml.find_all('navPoint')

-        self.book['navpoint_dict'] = {}
         for i in navpoints:
             chapter_title = i.find('text').text
             chapter_source = i.find('content').get('src')
-            chapter_source = chapter_source.split('#')[0]
+            chapter_source = unquote(chapter_source.split('#')[0])
             self.book['navpoint_dict'][chapter_source] = chapter_title

     def parse_chapters(self):
+        no_title_chapter = 1
         self.book['book_list'] = []
         for i in self.book['chapters_in_order']:
             chapter_data = self.read_from_zip(i).decode()
@@ -182,8 +230,10 @@ class EPUB:
                 self.book['book_list'].append(
                     (self.book['navpoint_dict'][i], chapter_data))
             except KeyError:
+                fallback_title = str(no_title_chapter) + ': No Title'
                 self.book['book_list'].append(
-                    (os.path.splitext(i)[0], chapter_data))
+                    (fallback_title, chapter_data))
+                no_title_chapter += 1

 def main():
     book = EPUB(sys.argv[1])
@@ -75,9 +75,15 @@ class Library:
             author = i[1]
             year = i[2]
             path = i[4]
-            tags = i[7]
             last_accessed = i[9]

+            tags = i[7]
+            if isinstance(tags, list): # When files are added for the first time
+                if tags:
+                    tags = ', '.join(str(this_tag) for this_tag in tags)
+                else:
+                    tags = None
+
             try:
                 date_added = pickle.loads(i[3])
             except TypeError: # Because of datetime.datetime.now() above
@@ -78,7 +78,7 @@ class ParseCBR:
             'images_only': True}

         extract_path = os.path.join(self.temp_dir, self.file_md5)
-        contents = collections.OrderedDict()
+        contents = []

         # I'm currently choosing not to keep multiple files in memory
         self.book.extractall(extract_path)
@@ -101,6 +101,6 @@ class ParseCBR:
             page_name = 'Page ' + str(count + 1)
             image_path = os.path.join(extract_path, i)

-            contents[page_name] = image_path
+            contents.append((page_name, image_path))

         return contents, file_settings
@@ -81,7 +81,7 @@ class ParseCBZ:
             'images_only': True}

         extract_path = os.path.join(self.temp_dir, self.file_md5)
-        contents = collections.OrderedDict()
+        contents = []

         # I'm currently choosing not to keep multiple files in memory
         self.book.extractall(extract_path)
@@ -104,6 +104,6 @@ class ParseCBZ:
             page_name = 'Page ' + str(count + 1)
             image_path = os.path.join(extract_path, i)

-            contents[page_name] = image_path
+            contents.append((page_name, image_path))

         return contents, file_settings
@@ -35,7 +35,10 @@ class ParseEPUB:

     def read_book(self):
         self.book_ref = EPUB(self.filename)
-        self.book_ref.read_epub()
+        contents_found = self.book_ref.read_epub()
+        if not contents_found:
+            print('Cannot process: ' + self.filename)
+            return
         self.book = self.book_ref.book

     def get_title(self):
@@ -45,19 +48,16 @@ class ParseEPUB:
         return self.book['author']

     def get_year(self):
-        return 9999
+        return self.book['year']

     def get_cover_image(self):
-        try:
-            return self.book['cover']
-        except KeyError:
-            return None
+        return self.book['cover']

     def get_isbn(self):
         return self.book['isbn']

     def get_tags(self):
-        return None
+        return self.book['tags']

     def get_contents(self):
         extract_path = os.path.join(self.temp_dir, self.file_md5)
@@ -214,8 +214,8 @@ class BookSorter:
         content = all_content[0]
         images_only = all_content[1]['images_only']

-        if not content.keys():
-            content['Invalid'] = 'Possible Parse Error'
+        if not content:
+            content = [('Invalid', 'Something went horribly wrong')]

         book_data = self.database_entry_for_book(file_md5)
         position = book_data[0]
widgets.py
@@ -53,8 +53,7 @@ class Tab(QtWidgets.QWidget):
         self.generate_position()
         current_chapter = 1

-        chapter_name = list(self.metadata['content'])[current_chapter - 1]
-        chapter_content = self.metadata['content'][chapter_name]
+        chapter_content = self.metadata['content'][current_chapter - 1][1]

         # The content display widget is, by default a QTextBrowser.
         # In case the incoming data is only images
@@ -190,7 +189,7 @@ class Tab(QtWidgets.QWidget):
         # TODO
         # Calculate lines to incorporate into progress

-        total_chapters = len(self.metadata['content'].keys())
+        total_chapters = len(self.metadata['content'])

         current_chapter = 1
         scroll_value = 0
@@ -250,8 +249,8 @@ class Tab(QtWidgets.QWidget):
         self.contentView.show()

     def change_chapter_tocBox(self):
-        chapter_name = self.window().bookToolBar.tocBox.currentText()
-        required_content = self.metadata['content'][chapter_name]
+        chapter_number = self.window().bookToolBar.tocBox.currentIndex()
+        required_content = self.metadata['content'][chapter_number][1]

         if self.are_we_doing_images_only:
             self.contentView.loadImage(required_content)
@@ -447,7 +446,7 @@ class PliantQGraphicsView(QtWidgets.QGraphicsView):
         # Image panning with mouse

         content = self.parent.metadata['content']
-        image_paths = [i[1] for i in content.items()]
+        image_paths = [i[1] for i in content]

         def generate_image_cache(current_image):
             print('Building image cache')