Major improvements to epub parser

2018-03-10 15:56:04 +05:30
parent 51d00bb9b5
commit ed8f676a05
9 changed files with 117 additions and 60 deletions
--- a/main.py
+++ b/main.py
@@ -586,7 +586,7 @@ class MainUI(QtWidgets.QMainWindow, mainwindow.Ui_MainWindow):
            current_title = current_metadata['title']
            current_author = current_metadata['author']
            current_position = current_metadata['position']
-            current_toc = current_metadata['content'].keys()
+            current_toc = [i[0] for i in current_metadata['content']]
            self.bookToolBar.tocBox.blockSignals(True)
            self.bookToolBar.tocBox.clear()
--- a/database.py
+++ b/database.py
@@ -97,8 +97,11 @@ class DatabaseFunctions:
            isbn = i[1]['isbn']
            tags = i[1]['tags']
            if tags:
-                # Is a tuple. Needs to be a string
+                # Is a list. Needs to be a string
-                tags = ', '.join([j for j in tags if j])
+                tags = ', '.join([str(j) for j in tags])
            else:
                # Is still a list. Needs to be None.
                tags = None
            sql_command_add = (
                "INSERT OR REPLACE INTO \
@@ -173,7 +176,6 @@ class DatabaseFunctions:
        return data
    def modify_metadata(self, metadata_dict, book_hash):
        def generate_binary(column, data):
            if column in ('Position', 'LastAccessed', 'Bookmarks'):
                return sqlite3.Binary(pickle.dumps(data))
--- a/ePub/read_epub.py
+++ b/ePub/read_epub.py
@@ -19,11 +19,8 @@
 import os
 import sys
 import zipfile
 from urllib.parse import unquote
 import pprint
 import inspect
 import bs4
 from bs4 import BeautifulSoup
@@ -39,9 +36,15 @@ class EPUB:
        self.load_zip()
        contents_path = self.get_file_path(
            None, True)
        if not contents_path:
            return False  # No opf was found so processing cannot continue
        self.generate_book_metadata(contents_path)
        self.parse_toc()
        return True
    def load_zip(self):
        try:
            self.zip_file = zipfile.ZipFile(
@@ -84,65 +87,106 @@ class EPUB:
            if os.path.basename(i.filename) == os.path.basename(filename):
                return i.filename
        return None
    def read_from_zip(self, filename):
        """Return the bytes of the archive member named by ``filename``.

        ``filename`` is percent-decoded first. The decoded name is read
        straight from ``self.zip_file``; if the archive raises KeyError
        for it, ``self.get_file_path`` is asked for an alternative path
        and that one is read instead. When no alternative is reported a
        message is printed and None is returned.
        """
        decoded_name = unquote(filename)
        try:
            return self.zip_file.read(decoded_name)
        except KeyError:
            # The literal path is not an archive member; fall back to
            # whatever path get_file_path reports for this name.
            resolved_path = self.get_file_path(decoded_name)
            if not resolved_path:
                print('ePub module can\'t find ' + decoded_name)
                return None
            return self.zip_file.read(resolved_path)
    #______________________________________________________
    def generate_book_metadata(self, contents_path):
        self.book['title'] = 'Unknown'
        self.book['author'] = 'Unknown'
        self.book['isbn'] = None
        self.book['tags'] = None
        self.book['cover'] = None
        self.book['toc_file'] = 'toc.ncx'  # Overwritten if another one exists
        # Parse XML
        xml = self.parse_xml(contents_path, 'xml')
        # Parse metadata
        item_dict = {
-            'title': 'dc:title',
+            'title': 'title',
-            'author': 'dc:creator',
+            'author': 'creator',
-            'date': 'dc:date'}
+            'year': 'date'}
        xml = self.parse_xml(contents_path, 'lxml')
        for i in item_dict.items():
            item = xml.find(i[1])
            if item:
                self.book[i[0]] = item.text
        # Get identifier
        xml = self.parse_xml(contents_path, 'xml')
        metadata_items = xml.find('metadata')
        for i in metadata_items.children:
            if isinstance(i, bs4.element.Tag):
        try:
-                    if i.get('opf:scheme').lower() == 'isbn':
+            self.book['year'] = int(self.book['year'][:4])
        except (TypeError, KeyError, IndexError):
            self.book['year'] = 9999
        # Get identifier
        identifier_items = xml.find_all('identifier')
        for i in identifier_items:
            scheme = i.get('scheme')
            try:
                if scheme.lower() == 'isbn':
                    self.book['isbn'] = i.text
                        break
            except AttributeError:
                self.book['isbn'] = None
        # Tags
        tag_items = xml.find_all('subject')
        tag_list = [i.text for i in tag_items]
        self.book['tags'] = tag_list
        # Get items
        self.book['content_dict'] = {}
        all_items = xml.find_all('item')
        for i in all_items:
            media_type = i.get('media-type')
            this_id = i.get('id')
            if media_type == 'application/xhtml+xml':
-                self.book['content_dict'][i.get('id')] = i.get('href')
+                self.book['content_dict'][this_id] = i.get('href')
            if media_type == 'application/x-dtbncx+xml':
                self.book['toc_file'] = i.get('href')
            # Cover image
-            # if i.get('id') == 'cover':
+            if this_id.startswith('cover') and media_type.split('/')[0] == 'image':
-            #     cover_href = i.get('href')
+                cover_href = i.get('href')
-            #     try:
+                try:
-            #         self.book['cover'] = self.zip_file.read(cover_href)
+                    self.book['cover'] = self.zip_file.read(cover_href)
-            #     except KeyError:
+                except KeyError:
-            #         # The cover cannot be found according to the
+                    # The cover cannot be found according to the
-            #         # path specified in the content reference
+                    # path specified in the content reference
-            #         self.book['cover'] = self.zip_file.read(
+                    self.book['cover'] = self.zip_file.read(
-            #             self.get_file_path(cover_href))
+                        self.get_file_path(cover_href))
        if not self.book['cover']:
            # If no cover is located the conventional way,
            # we go looking for the largest image in the book
            biggest_image_size = 0
            biggest_image = None
            for j in self.zip_file.filelist:
                if os.path.splitext(j.filename)[1] in ['.jpg', '.png', '.gif']:
                    if j.file_size > biggest_image_size:
                        biggest_image = j.filename
                        biggest_image_size = j.file_size
            if biggest_image:
                self.book['cover'] = self.read_from_zip(biggest_image)
            else:
                print('No cover found for: ' + self.filename)
        with open('cover', 'wb') as this_cover:
            this_cover.write(self.book['cover'])
        # Parse spine and arrange chapter paths acquired from the opf
        # according to the order IN THE SPINE
@@ -157,24 +201,28 @@ class EPUB:
            self.book['chapters_in_order'].append(chapter_path)
    def parse_toc(self):
        # Try to get chapter names from the toc
        # This has no bearing on the actual order
        # We're just using this to get chapter names
        self.book['navpoint_dict'] = {}
        toc_file = self.book['toc_file']
        if toc_file:
            toc_file = self.get_file_path(toc_file)
        xml = self.parse_xml(toc_file, 'xml')
        if not xml:
            return
        navpoints = xml.find_all('navPoint')
        self.book['navpoint_dict'] = {}
        for i in navpoints:
            chapter_title = i.find('text').text
            chapter_source = i.find('content').get('src')
-            chapter_source = chapter_source.split('#')[0]
+            chapter_source = unquote(chapter_source.split('#')[0])
            self.book['navpoint_dict'][chapter_source] = chapter_title
    def parse_chapters(self):
        no_title_chapter = 1
        self.book['book_list'] = []
        for i in self.book['chapters_in_order']:
            chapter_data = self.read_from_zip(i).decode()
@@ -182,8 +230,10 @@ class EPUB:
                self.book['book_list'].append(
                    (self.book['navpoint_dict'][i], chapter_data))
            except KeyError:
                fallback_title = str(no_title_chapter) + ': No Title'
                self.book['book_list'].append(
-                    (os.path.splitext(i)[0], chapter_data))
+                    (fallback_title, chapter_data))
            no_title_chapter += 1
 def main():
    book = EPUB(sys.argv[1])
--- a/library.py
+++ b/library.py
@@ -75,9 +75,15 @@ class Library:
            author = i[1]
            year = i[2]
            path = i[4]
            tags = i[7]
            last_accessed = i[9]
            tags = i[7]
            if isinstance(tags, list):  # When files are added for the first time
                if tags:
                    tags = ', '.join(str(this_tag) for this_tag in tags)
                else:
                    tags = None
            try:
                date_added = pickle.loads(i[3])
            except TypeError:  # Because of datetime.datetime.now() above
--- a/parsers/cbr.py
+++ b/parsers/cbr.py
@@ -78,7 +78,7 @@ class ParseCBR:
            'images_only': True}
        extract_path = os.path.join(self.temp_dir, self.file_md5)
-        contents = collections.OrderedDict()
+        contents = []
        # I'm currently choosing not to keep multiple files in memory
        self.book.extractall(extract_path)
@@ -101,6 +101,6 @@ class ParseCBR:
            page_name = 'Page ' + str(count + 1)
            image_path = os.path.join(extract_path, i)
-            contents[page_name] = image_path
+            contents.append((page_name, image_path))
        return contents, file_settings
--- a/parsers/cbz.py
+++ b/parsers/cbz.py
@@ -81,7 +81,7 @@ class ParseCBZ:
            'images_only': True}
        extract_path = os.path.join(self.temp_dir, self.file_md5)
-        contents = collections.OrderedDict()
+        contents = []
        # I'm currently choosing not to keep multiple files in memory
        self.book.extractall(extract_path)
@@ -104,6 +104,6 @@ class ParseCBZ:
            page_name = 'Page ' + str(count + 1)
            image_path = os.path.join(extract_path, i)
-            contents[page_name] = image_path
+            contents.append((page_name, image_path))
        return contents, file_settings
--- a/parsers/epub.py
+++ b/parsers/epub.py
@@ -35,7 +35,10 @@ class ParseEPUB:
    def read_book(self):
        self.book_ref = EPUB(self.filename)
-        self.book_ref.read_epub()
+        contents_found = self.book_ref.read_epub()
        if not contents_found:
            print('Cannot process: ' + self.filename)
            return
        self.book = self.book_ref.book
    def get_title(self):
@@ -45,19 +48,16 @@ class ParseEPUB:
        return self.book['author']
    def get_year(self):
-        return 9999
+        return self.book['year']
    def get_cover_image(self):
        try:
        return self.book['cover']
        except KeyError:
            return None
    def get_isbn(self):
        """Return the value stored under the 'isbn' key of ``self.book``."""
        isbn = self.book['isbn']
        return isbn
    def get_tags(self):
-        return None
+        return self.book['tags']
    def get_contents(self):
        extract_path = os.path.join(self.temp_dir, self.file_md5)
--- a/sorter.py
+++ b/sorter.py
@@ -214,8 +214,8 @@ class BookSorter:
                content = all_content[0]
                images_only = all_content[1]['images_only']
-                if not content.keys():
+                if not content:
-                    content['Invalid'] = 'Possible Parse Error'
+                    content = [('Invalid', 'Something went horribly wrong')]
                book_data = self.database_entry_for_book(file_md5)
                position = book_data[0]
--- a/widgets.py
+++ b/widgets.py
@@ -53,8 +53,7 @@ class Tab(QtWidgets.QWidget):
            self.generate_position()
            current_chapter = 1
-        chapter_name = list(self.metadata['content'])[current_chapter - 1]
+        chapter_content = self.metadata['content'][current_chapter - 1][1]
        chapter_content = self.metadata['content'][chapter_name]
        # The content display widget is, by default a QTextBrowser.
        # In case the incoming data is only images
@@ -190,7 +189,7 @@ class Tab(QtWidgets.QWidget):
        # TODO
        # Calculate lines to incorporate into progress
-        total_chapters = len(self.metadata['content'].keys())
+        total_chapters = len(self.metadata['content'])
        current_chapter = 1
        scroll_value = 0
@@ -250,8 +249,8 @@ class Tab(QtWidgets.QWidget):
        self.contentView.show()
    def change_chapter_tocBox(self):
-        chapter_name = self.window().bookToolBar.tocBox.currentText()
+        chapter_number = self.window().bookToolBar.tocBox.currentIndex()
-        required_content = self.metadata['content'][chapter_name]
+        required_content = self.metadata['content'][chapter_number][1]
        if self.are_we_doing_images_only:
            self.contentView.loadImage(required_content)
@@ -447,7 +446,7 @@ class PliantQGraphicsView(QtWidgets.QGraphicsView):
        # Image panning with mouse
        content = self.parent.metadata['content']
-        image_paths = [i[1] for i in content.items()]
+        image_paths = [i[1] for i in content]
        def generate_image_cache(current_image):
            print('Building image cache')