Cleanup parsers

2019-02-10 09:03:12 +05:30
parent c6e30b67ad
commit f6f9d01060
8 changed files with 271 additions and 316 deletions
@@ -98,6 +98,7 @@ TODO
        Better recursion needed for fb2 toc
        Initial sort by author in tableview
        Last column not filling up tableview
        Comic view mode changing does not work for newly added books
    Secondary:
        Tab tooltip
@@ -21,6 +21,7 @@ import os
 import time
 import logging
 import zipfile
 import collections
 from lector.rarfile import rarfile
@@ -35,54 +36,36 @@ class ParseCOMIC:
        self.book_extension = os.path.splitext(self.filename)
    def read_book(self):
-        try:
+        if self.book_extension[1] == '.cbz':
-            if self.book_extension[1] == '.cbz':
+            self.book = zipfile.ZipFile(
-                self.book = zipfile.ZipFile(
+                self.filename, mode='r', allowZip64=True)
-                    self.filename, mode='r', allowZip64=True)
+            self.image_list = [
-                self.image_list = [
+                i.filename for i in self.book.infolist()
-                    i.filename for i in self.book.infolist()
+                if not i.is_dir() and is_image(i.filename)]
                    if not i.is_dir() and is_image(i.filename)]
-            elif self.book_extension[1] == '.cbr':
+        elif self.book_extension[1] == '.cbr':
-                self.book = rarfile.RarFile(self.filename)
+            self.book = rarfile.RarFile(self.filename)
-                self.image_list = [
+            self.image_list = [
-                    i.filename for i in self.book.infolist()
+                i.filename for i in self.book.infolist()
-                    if not i.isdir() and is_image(i.filename)]
+                if not i.isdir() and is_image(i.filename)]
-            self.image_list.sort()
+        self.image_list.sort()
            if not self.image_list:
                return False
-            return True
+    def generate_metadata(self):
        except: # Specifying no exception here is warranted
            return False
    def get_title(self):
        title = os.path.basename(self.book_extension[0]).strip(' ')
-        return title
+        author = '<Unknown>'
        isbn = None
        tags = []
        cover = self.book.read(self.image_list[0])
    def get_author(self):
        return 'Unknown'
    def get_year(self):
        creation_time = time.ctime(os.path.getctime(self.filename))
-        creation_year = creation_time.split()[-1]
+        year = creation_time.split()[-1]
        return creation_year
-    def get_cover_image(self):
+        Metadata = collections.namedtuple(
-        # The first image in the archive may not be the cover
+            'Metadata', ['title', 'author', 'year', 'isbn', 'tags', 'cover'])
-        # It is implied, however, that the first image in order
+        return Metadata(title, author, year, isbn, tags, cover)
        # will be the cover
        return self.book.read(self.image_list[0])
-    def get_isbn(self):
+    def generate_content(self):
        return None
    def get_tags(self):
        return None
    def get_contents(self):
        image_number = len(self.image_list)
        toc = [(1, f'Page {i + 1}', i + 1) for i in range(image_number)]
@@ -14,6 +14,9 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 # TODO
 # Maybe also include book description
 import os
 import zipfile
 import logging
@@ -25,47 +28,27 @@ logger = logging.getLogger(__name__)
 class ParseEPUB:
    def __init__(self, filename, temp_dir, file_md5):
        # TODO
        # Maybe also include book description
        self.book_ref = None
        self.book = None
        self.temp_dir = temp_dir
        self.filename = filename
        self.temp_dir = temp_dir
        self.extract_path = os.path.join(temp_dir, file_md5)
    def read_book(self):
-        self.book_ref = EPUB(self.filename, self.temp_dir)
+        self.book = EPUB(self.filename, self.temp_dir)
        self.book_ref.generate_metadata()
        self.book = self.book_ref.book
        return True
-    def get_title(self):
+    def generate_metadata(self):
-        return self.book['title']
+        self.book.generate_metadata()
        return self.book.metadata
-    def get_author(self):
+    def generate_content(self):
        return self.book['author']
    def get_year(self):
        return self.book['year']
    def get_cover_image(self):
        return self.book['cover']
    def get_isbn(self):
        return self.book['isbn']
    def get_tags(self):
        return self.book['tags']
    def get_contents(self):
        zipfile.ZipFile(self.filename).extractall(self.extract_path)
-        self.book_ref.generate_toc()
+        self.book.generate_toc()
-        self.book_ref.generate_content()
+        self.book.generate_content()
        toc = []
        content = []
-        for count, i in enumerate(self.book['content']):
+        for count, i in enumerate(self.book.content):
            toc.append((i[0], i[1], count + 1))
            content.append(i[2])
@@ -14,6 +14,9 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 # TODO
 # Maybe also include book description
 import os
 import logging
@@ -24,46 +27,24 @@ logger = logging.getLogger(__name__)
 class ParseFB2:
    def __init__(self, filename, temp_dir, file_md5):
        # TODO
        # Maybe also include book description
        self.book_ref = None
        self.book = None
        self.filename = filename
        self.extract_path = os.path.join(temp_dir, file_md5)
    def read_book(self):
-        self.book_ref = FB2(self.filename)
+        self.book = FB2(self.filename)
        contents_found = self.book_ref.read_fb2()
        if not contents_found:
            return False
        self.book = self.book_ref.book
        return True
-    def get_title(self):
+    def generate_metadata(self):
-        return self.book['title']
+        self.book.generate_metadata()
        return self.book.metadata
-    def get_author(self):
+    def generate_content(self):
        return self.book['author']
    def get_year(self):
        return self.book['year']
    def get_cover_image(self):
        return self.book['cover']
    def get_isbn(self):
        return self.book['isbn']
    def get_tags(self):
        return self.book['tags']
    def get_contents(self):
        os.makedirs(self.extract_path, exist_ok=True)  # Manual creation is required here
-        self.book_ref.parse_chapters(temp_dir=self.extract_path)
+        self.book.generate_content(temp_dir=self.extract_path)
        toc = []
        content = []
-        for count, i in enumerate(self.book['book_list']):
+        for count, i in enumerate(self.book.content):
            toc.append((i[0], i[1], count + 1))
            content.append(i[2])
@@ -14,11 +14,8 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 # TODO
 # Error handling
 # TOC parsing
 import os
 import collections
 import fitz
 from PyQt5 import QtGui
@@ -36,43 +33,39 @@ class ParsePDF:
        except RuntimeError:
            return False
-    def get_title(self):
+    def generate_metadata(self):
        title = self.book.metadata['title']
        if not title:
            title = os.path.splitext(os.path.basename(self.filename))[0]
        return title
    def get_author(self):
        author = self.book.metadata['author']
        if not author:
            author = 'Unknown'
        return author
    def get_year(self):
        creation_date = self.book.metadata['creationDate']
        try:
            year = creation_date.split(':')[1][:4]
        except (ValueError, AttributeError):
            year = 9999
        return year
-    def get_cover_image(self):
+        isbn = None
        tags = self.book.metadata['keywords']
        if not tags:
            tags = []
        # This is a little roundabout for the cover
        # and I'm sure it's taking a performance hit
        # But it is simple. So there's that.
        cover_page = self.book.loadPage(0)
        # Disabling scaling gets the covers much faster
-        return render_pdf_page(cover_page, True)
+        cover = render_pdf_page(cover_page, True)
-    def get_isbn(self):
+        Metadata = collections.namedtuple(
-        return None
+            'Metadata', ['title', 'author', 'year', 'isbn', 'tags', 'cover'])
        return Metadata(title, author, year, isbn, tags, cover)
-    def get_tags(self):
+    def generate_content(self):
        tags = self.book.metadata['keywords']
        return tags  # Fine if it returns None
    def get_contents(self):
        content = list(range(self.book.pageCount))
        toc = self.book.getToC()
        if not toc:
@@ -37,32 +37,17 @@ class EPUB:
    def __init__(self, book_filename, temp_dir):
        self.book_filename = book_filename
        self.temp_dir = temp_dir
        self.zip_file = None
        self.file_list = None
        self.opf_dict = None
-        self.book = {}
+        self.split_chapters = {}
        self.metadata = None
        self.content = []
        self.generate_references()
    def find_file(self, filename):
        # Get rid of special characters
        filename = unquote(filename)
        # First, look for the file in the root of the book
        if filename in self.file_list:
            return filename
        # Then search for it elsewhere
        else:
            file_basename = os.path.basename(filename)
            for i in self.file_list:
                if os.path.basename(i) == file_basename:
                    return i
        # If the file isn't found
        logging.error(filename + ' not found in ' + self.book_filename)
        return False
    def generate_references(self):
        self.zip_file = zipfile.ZipFile(
            self.book_filename, mode='r', allowZip64=True)
@@ -88,9 +73,26 @@ class EPUB:
        packagefile_data = self.zip_file.read(packagefile)
        self.opf_dict = xmltodict.parse(packagefile_data)
-    def generate_toc(self):
+    def find_file(self, filename):
-        self.book['content'] = []
+        # Get rid of special characters
        filename = unquote(filename)
        # First, look for the file in the root of the book
        if filename in self.file_list:
            return filename
        # Then search for it elsewhere
        else:
            file_basename = os.path.basename(filename)
            for i in self.file_list:
                if os.path.basename(i) == file_basename:
                    return i
        # If the file isn't found
        logging.error(filename + ' not found in ' + self.book_filename)
        return False
    def generate_toc(self):
        def find_alternative_toc():
            toc_filename = None
            toc_filename_alternative = None
@@ -134,14 +136,14 @@ class EPUB:
                    level + 1,
                    i['navLabel']['text'],
                    i['content']['@src']] for i in nav_node]
-                self.book['content'].extend(these_contents)
+                self.content.extend(these_contents)
                return
            if 'navPoint' in nav_node.keys():
                recursor(level, nav_node['navPoint'])
            else:
-                self.book['content'].append([
+                self.content.append([
                    level + 1,
                    nav_node['navLabel']['text'],
                    nav_node['content']['@src']])
@@ -150,14 +152,14 @@ class EPUB:
        for top_level_nav in navpoints:
            # Just one chapter
            if isinstance(top_level_nav, str):
-                self.book['content'].append([
+                self.content.append([
                    1,
                    navpoints['navLabel']['text'],
                    navpoints['content']['@src']])
                break
            # Multiple chapters
-            self.book['content'].append([
+            self.content.append([
                1,
                top_level_nav['navLabel']['text'],
                top_level_nav['content']['@src']])
@@ -183,14 +185,12 @@ class EPUB:
            return 'Possible parse error: ' + chapter_file
    def parse_split_chapters(self, chapters_with_split_content):
        self.book['split_chapters'] = {}
        # For split chapters, get the whole chapter first, then split
        # between ids using their anchors, then "heal" the resultant text
        # by creating a BeautifulSoup object. Write its str to the content
        for i in chapters_with_split_content.items():
            chapter_file = i[0]
-            self.book['split_chapters'][chapter_file] = {}
+            self.split_chapters[chapter_file] = {}
            chapter_content = self.get_chapter_content(chapter_file)
            soup = BeautifulSoup(chapter_content, 'lxml')
@@ -208,10 +208,10 @@ class EPUB:
                if this_tag:
                    this_markup = BeautifulSoup(
                        str(this_tag).strip() + markup_split[1], 'lxml')
-                    self.book['split_chapters'][chapter_file][this_anchor] = str(this_markup)
+                    self.split_chapters[chapter_file][this_anchor] = str(this_markup)
            # Remaining markup is assigned here
-            self.book['split_chapters'][chapter_file]['top_level'] = str(soup)
+            self.split_chapters[chapter_file]['top_level'] = str(soup)
    def generate_content(self):
        # Find all the chapters mentioned in the opf spine
@@ -238,7 +238,7 @@ class EPUB:
        chapter_title = 1
        toc_chapters = [
-            unquote(i[2].split('#')[0]) for i in self.book['content']]
+            unquote(i[2].split('#')[0]) for i in self.content]
        last_valid_index = -2  # Yes, but why?
        for i in spine_final:
@@ -251,7 +251,7 @@ class EPUB:
                except ValueError:
                    last_valid_index += 1
-                self.book['content'].insert(
+                self.content.insert(
                    last_valid_index + 1,
                    [1, str(chapter_title), i])
                chapter_title += 1
@@ -259,7 +259,7 @@ class EPUB:
        # Parse split chapters as below
        # They can be picked up during the iteration through the toc
        chapters_with_split_content = {}
-        for i in self.book['content']:
+        for i in self.content:
            if '#' in i[2]:
                this_split = i[2].split('#')
                chapter = this_split[0]
@@ -278,8 +278,7 @@ class EPUB:
        # In case a split chapter is encountered, get its content
        # from the split_chapters dictionary
        # What could possibly go wrong?
-        split_chapters = self.book['split_chapters']
+        toc_copy = self.content[:]
        toc_copy = self.book['content'][:]
        # Put the book into the book
        for count, i in enumerate(toc_copy):
@@ -293,7 +292,7 @@ class EPUB:
                try:
                    chapter_content = (
-                        split_chapters[chapter_file_proper][this_anchor])
+                        self.split_chapters[chapter_file_proper][this_anchor])
                except KeyError:
                    chapter_content = 'Parse Error'
                    error_string = (
@@ -301,9 +300,9 @@ class EPUB:
                    logger.error(error_string)
            # Get content that remained at the end of the pillaging above
-            elif chapter_file in split_chapters.keys():
+            elif chapter_file in self.split_chapters.keys():
                try:
-                    chapter_content = split_chapters[chapter_file]['top_level']
+                    chapter_content = self.split_chapters[chapter_file]['top_level']
                except KeyError:
                    chapter_content = 'Parse Error'
                    error_string = (
@@ -314,26 +313,26 @@ class EPUB:
            else:
                chapter_content = self.get_chapter_content(chapter_file)
-            self.book['content'][count][2] = chapter_content
+            self.content[count][2] = chapter_content
        # Cleanup content by removing null chapters
-        self.book['content'] = [
+        self.content = [
-            i for i in self.book['content'] if i[2]]
+            i for i in self.content if i[2]]
-        self.generate_book_cover()
+        cover_image = self.generate_book_cover()
-        if self.book['cover']:
+        if cover_image:
            cover_path = os.path.join(
                self.temp_dir, os.path.basename(self.book_filename)) + '- cover'
            with open(cover_path, 'wb') as cover_temp:
-                cover_temp.write(self.book['cover'])
+                cover_temp.write(cover_image)
            # There's probably some rationale to doing an insert here
            # But a replacement seems... neater
-            self.book['content'].insert(
+            self.content.insert(
                0, (1, 'Cover', f'<center><img src="{cover_path}" alt="Cover"></center>'))
    def generate_metadata(self):
-        metadata = self.opf_dict['package']['metadata']
+        book_metadata = self.opf_dict['package']['metadata']
        def flattener(this_object):
            if isinstance(this_object, collections.OrderedDict):
@@ -354,67 +353,76 @@ class EPUB:
        # Book title
        try:
-            self.book['title'] = flattener(metadata['dc:title'])
+            title = flattener(book_metadata['dc:title'])
        except:
-            self.book['title'] = os.path.splitext(
+            logger.warning('Title not found: ' + self.book_filename)
            title = os.path.splitext(
                os.path.basename(self.book_filename))[0]
        # Book author
        try:
-            self.book['author'] = flattener(metadata['dc:creator'])
+            author = flattener(book_metadata['dc:creator'])
        except:
-            self.book['author'] = 'Unknown'
+            logger.warning('Author not found: ' + self.book_filename)
            author = 'Unknown'
        # Book year
        try:
-            self.book['year'] = int(flattener(metadata['dc:date'])[:4])
+            year = int(flattener(book_metadata['dc:date'])[:4])
        except:
-            self.book['year'] = 9999
+            logger.warning('Year not found: ' + self.book_filename)
            year = 9999
        # Book isbn
        # Both one and multiple schema
-        self.book['isbn'] = None
+        isbn = None
        try:
-            scheme = metadata['dc:identifier']['@opf:scheme'].lower()
+            scheme = book_metadata['dc:identifier']['@opf:scheme'].lower()
            if scheme.lower() == 'isbn':
-                self.book['isbn'] = metadata['dc:identifier']['#text']
+                isbn = book_metadata['dc:identifier']['#text']
        except (TypeError, KeyError):
            try:
-                for i in metadata['dc:identifier']:
+                for i in book_metadata['dc:identifier']:
                    if i['@opf:scheme'].lower() == 'isbn':
-                        self.book['isbn'] = i['#text']
+                        isbn = i['#text']
                    break
            except:
                logger.warning('ISBN not found: ' + self.book_filename)
                pass
        # Book tags
        try:
-            self.book['tags'] = metadata['dc:subject']
+            tags = book_metadata['dc:subject']
-            if isinstance(self.book['tags'], str):
+            if isinstance(tags, str):
-                self.book['tags'] = [self.book['tags']]
+                tags = [tags]
        except:
-            self.book['tags'] = []
+            tags = []
        # Book cover
-        self.generate_book_cover()
+        cover = self.generate_book_cover()
        # Named tuple? Named tuple.
        Metadata = collections.namedtuple(
            'Metadata', ['title', 'author', 'year', 'isbn', 'tags', 'cover'])
        self.metadata = Metadata(title, author, year, isbn, tags, cover)
    def generate_book_cover(self):
        # This is separate because the book cover needs to
        # be found and extracted both during addition / reading
-        self.book['cover'] = None
+        book_cover = None
        try:
            cover_image = [
                i['@href'] for i in self.opf_dict['package']['manifest']['item']
                if i['@media-type'].split('/')[0] == 'image' and
                'cover' in i['@id']][0]
-            self.book['cover'] = self.zip_file.read(
+            book_cover = self.zip_file.read(self.find_file(cover_image))
                self.find_file(cover_image))
        except:
            pass
        # Find book cover the hard way
-        if not self.book['cover']:
+        if not book_cover:
            biggest_image_size = 0
            biggest_image = None
            for j in self.zip_file.filelist:
@@ -424,5 +432,10 @@ class EPUB:
                        biggest_image_size = j.file_size
            if biggest_image:
-                self.book['cover'] = self.zip_file.read(
+                book_cover = self.zip_file.read(
                    self.find_file(biggest_image))
        if not book_cover:
            logger.warning('Cover not found: ' + self.book_filename)
        return book_cover
@@ -18,6 +18,7 @@ import os
 import base64
 import zipfile
 import logging
 import collections
 from bs4 import BeautifulSoup
@@ -28,70 +29,59 @@ class FB2:
    def __init__(self, filename):
        self.filename = filename
        self.zip_file = None
        self.book = {}
        self.xml = None
-    def read_fb2(self):
+        self.metadata = None
-        try:
+        self.content = []
            if self.filename.endswith('.fb2.zip'):
                this_book = zipfile.ZipFile(
                    self.filename, mode='r', allowZip64=True)
                for i in this_book.filelist:
                    if os.path.splitext(i.filename)[1] == '.fb2':
                        book_text = this_book.read(i.filename)
                        break
            else:
                with open(self.filename, 'r') as book_file:
                    book_text = book_file.read()
-            self.xml = BeautifulSoup(book_text, 'lxml')
+        self.generate_references()
            self.generate_book_metadata()
        except:  # Not specifying an exception type here may be justified
            return False
-        return True
+    def generate_references(self):
        if self.filename.endswith('.fb2.zip'):
            this_book = zipfile.ZipFile(
                self.filename, mode='r', allowZip64=True)
            for i in this_book.filelist:
                if os.path.splitext(i.filename)[1] == '.fb2':
                    book_text = this_book.read(i.filename)
                    break
-    def generate_book_metadata(self):
+        else:
-        self.book['isbn'] = None
+            with open(self.filename, 'r') as book_file:
-        self.book['tags'] = None
+                book_text = book_file.read()
        self.book['book_list'] = []
        self.xml = BeautifulSoup(book_text, 'lxml')
    def generate_metadata(self):
        # All metadata can be parsed in one pass
        all_tags = self.xml.find('description')
-        self.book['title'] = all_tags.find('book-title').text
+        title = all_tags.find('book-title').text
-        if self.book['title'] == '' or self.book['title'] is None:
+        if title == '' or title is None:
-            self.book['title'] = os.path.splitext(
+            title = os.path.splitext(
                os.path.basename(self.filename))[0]
-        self.book['author'] = all_tags.find(
+        author = all_tags.find(
            'author').getText(separator=' ').replace('\n', ' ')
-        if self.book['author'] == '' or self.book['author'] is None:
+        if author == '' or author is None:
-            self.book['author'] = 'Unknown'
+            author = '<Unknown>'
        # TODO
        # Account for other date formats
        try:
-            self.book['year'] = int(all_tags.find('date').text)
+            year = int(all_tags.find('date').text)
        except ValueError:
-            self.book['year'] = 9999
+            year = 9999
-        # Cover Image
+        isbn = None
-        try:
+        tags = None
            cover_image_xml = self.xml.find('coverpage')
            for i in cover_image_xml:
                cover_image_name = i.get('l:href')
-            cover_image_data = self.xml.find_all('binary')
+        cover = self.generate_book_cover()
            for i in cover_image_data:
                if cover_image_name.endswith(i.get('id')):
                    self.book['cover'] = base64.decodebytes(i.text.encode())
        except (AttributeError, TypeError):
            # Catch TypeError in case no images exist in the book
            logger.error('No cover found for: ' + self.filename)
            self.book['cover'] = None
-    def parse_chapters(self, temp_dir):
+        Metadata = collections.namedtuple(
            'Metadata', ['title', 'author', 'year', 'isbn', 'tags', 'cover'])
        self.metadata = Metadata(title, author, year, isbn, tags, cover)
    def generate_content(self, temp_dir):
        # TODO
        # Check what's up with recursion levels
        # Why is the TypeError happening in get_title
@@ -114,7 +104,7 @@ class FB2:
            children = element.findChildren('section', recursive=False)
            if not children and level != 1:
                this_title, title_xml = get_title(element)
-                self.book['book_list'].append(
+                self.content.append(
                    [level, this_title, title_xml + str(element)])
            else:
                for i in children:
@@ -134,7 +124,7 @@ class FB2:
            if section_children:
                chapter_text = this_title
-            self.book['book_list'].append([1, this_title, chapter_text])
+            self.content.append([1, this_title, chapter_text])
            recursor(1, this_element)
        # Extract all images to the temp_dir
@@ -144,7 +134,7 @@ class FB2:
            image_string = f'<image l:href="#{image_name}"'
            replacement_string = f'<p></p><img src=\"{image_path}\"'
-            for j in self.book['book_list']:
+            for j in self.content:
                j[2] = j[2].replace(
                    image_string, replacement_string)
            try:
@@ -155,9 +145,30 @@ class FB2:
                pass
        # Insert the book cover at the beginning
-        if self.book['cover']:
+        cover_image = self.generate_book_cover()
-            cover_path = os.path.join(temp_dir, 'cover')
+        if cover_image:
-            with open(cover_path, 'wb') as outimage:
+            cover_path = os.path.join(
-                outimage.write(self.book['cover'])
+                temp_dir, os.path.basename(self.filename)) + '- cover'
-            self.book['book_list'].insert(
+            with open(cover_path, 'wb') as cover_temp:
                cover_temp.write(cover_image)
            self.content.insert(
                0, (1, 'Cover', f'<center><img src="{cover_path}" alt="Cover"></center>'))
    def generate_book_cover(self):
        cover = None
        try:
            cover_image_xml = self.xml.find('coverpage')
            for i in cover_image_xml:
                cover_image_name = i.get('l:href')
            cover_image_data = self.xml.find_all('binary')
            for i in cover_image_data:
                if cover_image_name.endswith(i.get('id')):
                    cover = base64.decodebytes(i.text.encode())
        except (AttributeError, TypeError):
            # Catch TypeError in case no images exist in the book
            logger.warning('Cover not found: ' + self.filename)
        return cover
@@ -16,20 +16,9 @@
 # INSTRUCTIONS
 # Every parser is supposed to have the following methods. None returns are not allowed.
-# read_book()
+# read_book() - Initialize book
-# get_title()
+# generate_metadata() - For addition
-# get_author()
+# generate_content() - For reading
 # get_year()
 # get_cover_image()
 # get_isbn()
 # get_tags()
 # get_contents() - Should return a tuple with 0: TOC 1: special_settings (dict)
 # Parsers for files containing only images need to return only images_only = True
 # TODO
 # Maybe shift to insert or replace instead of hash checking
 # See if you want to include a hash of the book's name and author
 # Change thread niceness
 import io
 import os
@@ -211,87 +200,88 @@ class BookSorter:
                break
        if not valid_extension:
-            logger.error(filename + ' has an unsupported extension')
+            logger.error('Unsupported extension: ' + filename)
            return
        book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)
-        # Everything following this is standard
+        try:
-        # None values are accounted for here
+            book_ref.read_book()
-        is_valid = book_ref.read_book()
+        except:
-        if not is_valid:
+            logger.error('Error initializing: ' + filename)
            logger.error('Cannot parse:' + filename)
            return
-        if book_ref.book:
+        this_book = {}
-            # TODO
+        this_book[file_md5] = {
-            # For the love of God clean this up. It's junk.
+            'hash': file_md5,
            'path': filename}
-            this_book = {}
+        # Different modes require different values
-            this_book[file_md5] = {
+        if self.work_mode == 'addition':
-                'hash': file_md5,
+            try:
-                'path': filename}
+                metadata = book_ref.generate_metadata()
            except:
                logger.error('Metadata generation error: ' + filename)
                return
-            # Different modes require different values
+            title = metadata.title
-            if self.work_mode == 'addition':
+            author = metadata.author
-                # Reduce the size of the incoming image
+            year = metadata.year
-                # if one is found
+            isbn = metadata.isbn
                title = book_ref.get_title()
                author = book_ref.get_author()
                year = book_ref.get_year()
                isbn = book_ref.get_isbn()
-                tags = None
+            tags = None
-                if self.auto_tags:
+            if self.auto_tags:
-                    tags = book_ref.get_tags()
+                tags = metadata.tags
-                cover_image_raw = book_ref.get_cover_image()
+            cover_image_raw = metadata.cover
-                if cover_image_raw:
+            if cover_image_raw:
-                    cover_image = resize_image(cover_image_raw)
+                cover_image = resize_image(cover_image_raw)
-                else:
+            else:
-                    # TODO
+                # TODO
-                    # Needs an option
+                # Needs an option
-                    # cover_image = fetch_cover(title, author)
+                # cover_image = fetch_cover(title, author)
-                    cover_image = None
+                cover_image = None
-                this_book[file_md5]['cover_image'] = cover_image
+            this_book[file_md5]['cover_image'] = cover_image
-                this_book[file_md5]['addition_mode'] = self.addition_mode
+            this_book[file_md5]['addition_mode'] = self.addition_mode
-            if self.work_mode == 'reading':
+        if self.work_mode == 'reading':
-                # All books must return the following list
+            try:
-                # Indices are as described below
+                book_breakdown = book_ref.generate_content()
-                book_breakdown = book_ref.get_contents()
+            except:
                logger.error('Content generation error: ' + filename)
                return
-                toc = book_breakdown[0]
+            toc = book_breakdown[0]
-                content = book_breakdown[1]
+            content = book_breakdown[1]
-                images_only = book_breakdown[2]
+            images_only = book_breakdown[2]
-                book_data = self.database_entry_for_book(file_md5)
+            book_data = self.database_entry_for_book(file_md5)
-                title = book_data[0]
+            title = book_data[0]
-                author = book_data[1]
+            author = book_data[1]
-                year = book_data[2]
+            year = book_data[2]
-                isbn = book_data[3]
+            isbn = book_data[3]
-                tags = book_data[4]
+            tags = book_data[4]
-                position = book_data[5]
+            position = book_data[5]
-                bookmarks = book_data[6]
+            bookmarks = book_data[6]
-                cover = book_data[7]
+            cover = book_data[7]
-                annotations = book_data[8]
+            annotations = book_data[8]
-                this_book[file_md5]['position'] = position
+            this_book[file_md5]['position'] = position
-                this_book[file_md5]['bookmarks'] = bookmarks
+            this_book[file_md5]['bookmarks'] = bookmarks
-                this_book[file_md5]['toc'] = toc
+            this_book[file_md5]['toc'] = toc
-                this_book[file_md5]['content'] = content
+            this_book[file_md5]['content'] = content
-                this_book[file_md5]['images_only'] = images_only
+            this_book[file_md5]['images_only'] = images_only
-                this_book[file_md5]['cover'] = cover
+            this_book[file_md5]['cover'] = cover
-                this_book[file_md5]['annotations'] = annotations
+            this_book[file_md5]['annotations'] = annotations
-            this_book[file_md5]['title'] = title
+        this_book[file_md5]['title'] = title
-            this_book[file_md5]['author'] = author
+        this_book[file_md5]['author'] = author
-            this_book[file_md5]['year'] = year
+        this_book[file_md5]['year'] = year
-            this_book[file_md5]['isbn'] = isbn
+        this_book[file_md5]['isbn'] = isbn
-            this_book[file_md5]['tags'] = tags
+        this_book[file_md5]['tags'] = tags
-            return this_book
+        return this_book
    def read_progress(self):
        while True: