diff --git a/__main__.py b/__main__.py index 98a028f..d03b7d7 100755 --- a/__main__.py +++ b/__main__.py @@ -586,7 +586,7 @@ class MainUI(QtWidgets.QMainWindow, mainwindow.Ui_MainWindow): current_title = current_metadata['title'] current_author = current_metadata['author'] current_position = current_metadata['position'] - current_toc = current_metadata['content'].keys() + current_toc = [i[0] for i in current_metadata['content']] self.bookToolBar.tocBox.blockSignals(True) self.bookToolBar.tocBox.clear() diff --git a/database.py b/database.py index 29eaa85..7901107 100644 --- a/database.py +++ b/database.py @@ -97,8 +97,11 @@ class DatabaseFunctions: isbn = i[1]['isbn'] tags = i[1]['tags'] if tags: - # Is a tuple. Needs to be a string - tags = ', '.join([j for j in tags if j]) + # Is a list. Needs to be a string + tags = ', '.join([str(j) for j in tags]) + else: + # Is still a list. Needs to be None. + tags = None sql_command_add = ( "INSERT OR REPLACE INTO \ @@ -173,7 +176,6 @@ class DatabaseFunctions: return data def modify_metadata(self, metadata_dict, book_hash): - def generate_binary(column, data): if column in ('Position', 'LastAccessed', 'Bookmarks'): return sqlite3.Binary(pickle.dumps(data)) diff --git a/ePub/read_epub.py b/ePub/read_epub.py index 70c6c4e..9104e62 100644 --- a/ePub/read_epub.py +++ b/ePub/read_epub.py @@ -19,11 +19,8 @@ import os import sys import zipfile +from urllib.parse import unquote -import pprint -import inspect - -import bs4 from bs4 import BeautifulSoup @@ -39,9 +36,15 @@ class EPUB: self.load_zip() contents_path = self.get_file_path( None, True) + + if not contents_path: + return False # No opf was found so processing cannot continue + self.generate_book_metadata(contents_path) self.parse_toc() + return True + def load_zip(self): try: self.zip_file = zipfile.ZipFile( @@ -84,65 +87,106 @@ class EPUB: if os.path.basename(i.filename) == os.path.basename(filename): return i.filename + return None + def read_from_zip(self, filename): + filename = unquote(filename) try: file_data = self.zip_file.read(filename) return file_data except KeyError: file_path_actual = self.get_file_path(filename) - return self.zip_file.read(file_path_actual) + if file_path_actual: + return self.zip_file.read(file_path_actual) + else: + print('ePub module can\'t find ' + filename) #______________________________________________________ def generate_book_metadata(self, contents_path): + self.book['title'] = 'Unknown' + self.book['author'] = 'Unknown' + self.book['isbn'] = None + self.book['tags'] = None + self.book['cover'] = None + self.book['toc_file'] = 'toc.ncx' # Overwritten if another one exists + + # Parse XML + xml = self.parse_xml(contents_path, 'xml') + # Parse metadata item_dict = { - 'title': 'dc:title', - 'author': 'dc:creator', - 'date': 'dc:date'} - - xml = self.parse_xml(contents_path, 'lxml') + 'title': 'title', + 'author': 'creator', + 'year': 'date'} for i in item_dict.items(): item = xml.find(i[1]) if item: self.book[i[0]] = item.text - # Get identifier - xml = self.parse_xml(contents_path, 'xml') + try: + self.book['year'] = int(self.book['year'][:4]) + except (TypeError, KeyError, IndexError): + self.book['year'] = 9999 - metadata_items = xml.find('metadata') - for i in metadata_items.children: - if isinstance(i, bs4.element.Tag): - try: - if i.get('opf:scheme').lower() == 'isbn': - self.book['isbn'] = i.text - break - except AttributeError: - self.book['isbn'] = None + # Get identifier + identifier_items = xml.find_all('identifier') + for i in identifier_items: + scheme = i.get('scheme') + try: + if scheme.lower() == 'isbn': + self.book['isbn'] = i.text + except AttributeError: + self.book['isbn'] = None + + # Tags + tag_items = xml.find_all('subject') + tag_list = [i.text for i in tag_items] + self.book['tags'] = tag_list # Get items self.book['content_dict'] = {} all_items = xml.find_all('item') for i in all_items: media_type = i.get('media-type') + this_id = i.get('id') if media_type == 'application/xhtml+xml': - self.book['content_dict'][i.get('id')] = i.get('href') + self.book['content_dict'][this_id] = i.get('href') if media_type == 'application/x-dtbncx+xml': self.book['toc_file'] = i.get('href') # Cover image - # if i.get('id') == 'cover': - # cover_href = i.get('href') - # try: - # self.book['cover'] = self.zip_file.read(cover_href) - # except KeyError: - # # The cover cannot be found according to the - # # path specified in the content reference - # self.book['cover'] = self.zip_file.read( - # self.get_file_path(cover_href)) + if this_id.startswith('cover') and media_type.split('/')[0] == 'image': + cover_href = i.get('href') + try: + self.book['cover'] = self.zip_file.read(cover_href) + except KeyError: + # The cover cannot be found according to the + # path specified in the content reference + self.book['cover'] = self.zip_file.read( + self.get_file_path(cover_href)) + + if not self.book['cover']: + # If no cover is located the conventioanl way, + # we go looking for the largest image in the book + biggest_image_size = 0 + biggest_image = None + for j in self.zip_file.filelist: + if os.path.splitext(j.filename)[1] in ['.jpg', '.png', '.gif']: + if j.file_size > biggest_image_size: + biggest_image = j.filename + biggest_image_size = j.file_size + + if biggest_image: + self.book['cover'] = self.read_from_zip(biggest_image) + else: + print('No cover found for: ' + self.filename) + + with open('cover', 'wb') as this_cover: + this_cover.write(self.book['cover']) # Parse spine and arrange chapter paths acquired from the opf # according to the order IN THE SPINE @@ -157,24 +201,28 @@ class EPUB: self.book['chapters_in_order'].append(chapter_path) def parse_toc(self): - # Try to get chapter names from the toc # This has no bearing on the actual order # We're just using this to get chapter names + self.book['navpoint_dict'] = {} toc_file = self.book['toc_file'] - toc_file = self.get_file_path(toc_file) + if toc_file: + toc_file = self.get_file_path(toc_file) xml = self.parse_xml(toc_file, 'xml') + if not xml: + return + navpoints = xml.find_all('navPoint') - self.book['navpoint_dict'] = {} for i in navpoints: chapter_title = i.find('text').text chapter_source = i.find('content').get('src') - chapter_source = chapter_source.split('#')[0] + chapter_source = unquote(chapter_source.split('#')[0]) self.book['navpoint_dict'][chapter_source] = chapter_title def parse_chapters(self): + no_title_chapter = 1 self.book['book_list'] = [] for i in self.book['chapters_in_order']: chapter_data = self.read_from_zip(i).decode() @@ -182,8 +230,10 @@ class EPUB: self.book['book_list'].append( (self.book['navpoint_dict'][i], chapter_data)) except KeyError: + fallback_title = str(no_title_chapter) + ': No Title' self.book['book_list'].append( - (os.path.splitext(i)[0], chapter_data)) + (fallback_title, chapter_data)) + no_title_chapter += 1 def main(): book = EPUB(sys.argv[1]) diff --git a/library.py b/library.py index 4f274dd..7ee40de 100644 --- a/library.py +++ b/library.py @@ -75,9 +75,15 @@ class Library: author = i[1] year = i[2] path = i[4] - tags = i[7] last_accessed = i[9] + tags = i[7] + if isinstance(tags, list): # When files are added for the first time + if tags: + tags = ', '.join(str(this_tag) for this_tag in tags) + else: + tags = None + try: date_added = pickle.loads(i[3]) except TypeError: # Because of datetime.datetime.now() above diff --git a/parsers/cbr.py b/parsers/cbr.py index ac851f9..c98f594 100644 --- a/parsers/cbr.py +++ b/parsers/cbr.py @@ -78,7 +78,7 @@ class ParseCBR: 'images_only': True} extract_path = os.path.join(self.temp_dir, self.file_md5) - contents = collections.OrderedDict() + contents = [] # I'm currently choosing not to keep multiple files in memory self.book.extractall(extract_path) @@ -101,6 +101,6 @@ class ParseCBR: page_name = 'Page ' + str(count + 1) image_path = os.path.join(extract_path, i) - contents[page_name] = image_path + contents.append((page_name, image_path)) return contents, file_settings diff --git a/parsers/cbz.py b/parsers/cbz.py index a512a9a..cc7b50d 100644 --- a/parsers/cbz.py +++ b/parsers/cbz.py @@ -81,7 +81,7 @@ class ParseCBZ: 'images_only': True} extract_path = os.path.join(self.temp_dir, self.file_md5) - contents = collections.OrderedDict() + contents = [] # I'm currently choosing not to keep multiple files in memory self.book.extractall(extract_path) @@ -104,6 +104,6 @@ class ParseCBZ: page_name = 'Page ' + str(count + 1) image_path = os.path.join(extract_path, i) - contents[page_name] = image_path + contents.append((page_name, image_path)) return contents, file_settings diff --git a/parsers/epub.py b/parsers/epub.py index f06fb1a..db2719c 100644 --- a/parsers/epub.py +++ b/parsers/epub.py @@ -35,7 +35,10 @@ class ParseEPUB: def read_book(self): self.book_ref = EPUB(self.filename) - self.book_ref.read_epub() + contents_found = self.book_ref.read_epub() + if not contents_found: + print('Cannot process: ' + self.filename) + return self.book = self.book_ref.book def get_title(self): @@ -45,19 +48,16 @@ class ParseEPUB: return self.book['author'] def get_year(self): - return 9999 + return self.book['year'] def get_cover_image(self): - try: - return self.book['cover'] - except KeyError: - return None + return self.book['cover'] def get_isbn(self): return self.book['isbn'] def get_tags(self): - return None + return self.book['tags'] def get_contents(self): extract_path = os.path.join(self.temp_dir, self.file_md5) diff --git a/sorter.py b/sorter.py index f4bbe0d..feedb39 100644 --- a/sorter.py +++ b/sorter.py @@ -214,8 +214,8 @@ class BookSorter: content = all_content[0] images_only = all_content[1]['images_only'] - if not content.keys(): - content['Invalid'] = 'Possible Parse Error' + if not content: + content = [('Invalid', 'Something went horribly wrong')] book_data = self.database_entry_for_book(file_md5) position = book_data[0] diff --git a/widgets.py b/widgets.py index b38616c..0bd1f2c 100644 --- a/widgets.py +++ b/widgets.py @@ -53,8 +53,7 @@ class Tab(QtWidgets.QWidget): self.generate_position() current_chapter = 1 - chapter_name = list(self.metadata['content'])[current_chapter - 1] - chapter_content = self.metadata['content'][chapter_name] + chapter_content = self.metadata['content'][current_chapter - 1][1] # The content display widget is, by default a QTextBrowser. # In case the incoming data is only images @@ -190,7 +189,7 @@ class Tab(QtWidgets.QWidget): # TODO # Calculate lines to incorporate into progress - total_chapters = len(self.metadata['content'].keys()) + total_chapters = len(self.metadata['content']) current_chapter = 1 scroll_value = 0 @@ -250,8 +249,8 @@ class Tab(QtWidgets.QWidget): self.contentView.show() def change_chapter_tocBox(self): - chapter_name = self.window().bookToolBar.tocBox.currentText() - required_content = self.metadata['content'][chapter_name] + chapter_number = self.window().bookToolBar.tocBox.currentIndex() + required_content = self.metadata['content'][chapter_number][1] if self.are_we_doing_images_only: self.contentView.loadImage(required_content) @@ -447,7 +446,7 @@ class PliantQGraphicsView(QtWidgets.QGraphicsView): # Image panning with mouse content = self.parent.metadata['content'] - image_paths = [i[1] for i in content.items()] + image_paths = [i[1] for i in content] def generate_image_cache(current_image): print('Building image cache')