Overhaul EPUB parsing and ToC generation

2019-02-09 04:21:22 +05:30
parent 1e004774c9
commit e4be239bf0
4 changed files with 312 additions and 285 deletions
--- a/11
+++ b/11
@@ -36,6 +36,7 @@ TODO
        Set focus to newly added file
    Reading:
        ✓ Drop down for TOC
        ✓ Treeview navigation for TOC
        ✓ Override the keypress event of the textedit
        ✓ Use format* icons for toolbar buttons
        ✓ Implement book view settings with a(nother) toolbar
@@ -86,7 +87,6 @@ TODO
                Have them save to memory
        ✓ fb2 support
            ✓ Images need to show up in their placeholders
        djvu support
    Other:
        ✓ Define every widget in code
    Bugs:
@@ -98,8 +98,11 @@ TODO
        Better recursion needed for fb2 toc
    Secondary:
        Additional Settings:
            Disable progressbar - 20% book addition speed improvement
            Disable cover loading when reading - Saves ~2M / book
            Create covers for books without them - VERY SLOW
        Special formatting for each chapter's title
        Create covers for books without them
        Signal end of chapter with some text
        Graphical themes
        Change focus rectangle dimensions
@@ -108,7 +111,7 @@ TODO
        Goodreads API: Ratings, Read, Recommendations
        Get ISBN using python-isbnlib
        Use embedded fonts + CSS
-        txt, doc, chm support
+        txt, doc, chm, djvu support
        Include icons for filetype emblems
        Comic view modes
            Continuous paging
@@ -116,7 +119,7 @@ TODO
        ? Add only one file type if multiple are present
        ? Create emblem per filetype
        In application notifications
-        Notification in case the filter is filtering out all files with no option in place
+            Notification in case the filter is filtering out all files with no option in place
        Option to fit images to viewport
    Need help with:
--- a/lector/parsers/epub.py
+++ b/lector/parsers/epub.py
@@ -29,14 +29,13 @@ class ParseEPUB:
        # Maybe also include book description
        self.book_ref = None
        self.book = None
        self.temp_dir = temp_dir
        self.filename = filename
        self.extract_path = os.path.join(temp_dir, file_md5)
    def read_book(self):
-        self.book_ref = EPUB(self.filename)
+        self.book_ref = EPUB(self.filename, self.temp_dir)
-        contents_found = self.book_ref.read_epub()
+        self.book_ref.generate_metadata()
        if not contents_found:
            return False
        self.book = self.book_ref.book
        return True
@@ -61,14 +60,8 @@ class ParseEPUB:
    def get_contents(self):
        zipfile.ZipFile(self.filename).extractall(self.extract_path)
-        self.book_ref.parse_toc()
+        self.book_ref.generate_toc()
-        self.book_ref.parse_chapters(temp_dir=self.extract_path)
+        self.book_ref.generate_content()
        toc = []
        content = []
        for count, i in enumerate(self.book['book_list']):
            toc.append((1, i[0], count + 1))
            content.append(i[1])
        # Return toc, content, images_only
-        return toc, content, False
+        return self.book['toc'], self.book['content'], False
--- a/lector/readers/read_epub.py
+++ b/lector/readers/read_epub.py
@@ -14,175 +14,333 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
-import os
+# TODO
-import logging
+# See if inserting chapters not in the toc.ncx can be avoided
-import zipfile
+# Missing file order is messed up
-from urllib.parse import unquote
+# Account for stylesheets... eventually
 # Everything needs logging
 # Mobipocket files
 import os
 import zipfile
 import logging
 import collections
 import xmltodict
 from bs4 import BeautifulSoup
 logger = logging.getLogger(__name__)
 class EPUB:
-    def __init__(self, filename):
+    def __init__(self, book_filename, temp_dir):
-        self.filename = filename
+        self.book_filename = book_filename
        self.temp_dir = temp_dir
        self.zip_file = None
        self.file_list = None
        self.opf_dict = None
        self.book = {}
        self.generate_references()
    def find_file(self, filename):
        # First, look for the file in the root of the book
        if filename in self.file_list:
            return filename
        # Then, search for it elsewhere
        else:
            file_basename = os.path.basename(filename)
            for i in self.file_list:
                if os.path.basename(i) == file_basename:
                    return i
        # If the file isn't found
        logger.error(filename + ' not found')
        return False
    def generate_references(self):
        """Open the archive and parse its OPF package document.

        Sets ``self.zip_file`` (kept open for later reads),
        ``self.file_list`` (the archive's member names) and
        ``self.opf_dict`` (the package document parsed by xmltodict).

        Raises:
            KeyError: If no package document can be located, or if the
                located name is not a member of the archive.
        """
        self.zip_file = zipfile.ZipFile(
            self.book_filename, mode='r', allowZip64=True)
        self.file_list = self.zip_file.namelist()

        # Book structure relies on parsing the .opf file
        # in the book. Now that might be the usual content.opf
        # or package.opf or it might be named after your favorite
        # eldritch abomination. The point is we have to check
        # the container.xml
        packagefile = None
        container = self.find_file('container.xml')
        if container:
            container_xml = self.zip_file.read(container)
            container_dict = xmltodict.parse(container_xml)
            rootfile = container_dict['container']['rootfiles']['rootfile']
            # xmltodict returns a list when the element is repeated;
            # use the first <rootfile> listed in that case
            if isinstance(rootfile, list):
                rootfile = rootfile[0]
            packagefile = rootfile['@full-path']
        else:
            presumptive_names = ('content.opf', 'package.opf')
            for i in presumptive_names:
                packagefile = self.find_file(i)
                if packagefile:
                    break

        # Fail with a readable message instead of handing False / None
        # to ZipFile.read (which also surfaces as a KeyError)
        if not packagefile:
            raise KeyError(
                'No package (.opf) file found in ' + str(self.book_filename))

        packagefile_data = self.zip_file.read(packagefile)
        self.opf_dict = xmltodict.parse(packagefile_data)
    def generate_toc(self):
        """Build ``self.book['toc']`` from the book's toc.ncx.

        Every navPoint becomes one ``[level, title, src]`` list, appended
        in document order (a parent is followed by its descendants).
        Top-level navPoints have level 1 and each nesting step adds 1.
        ``src`` is the raw ``content/@src`` value and may carry a
        ``#anchor`` suffix.

        If no toc.ncx can be found, or its navMap has no navPoints,
        ``self.book['toc']`` is left as an empty list.
        """
        self.book['toc'] = []

        # I'm currently going with the file always being named toc.ncx
        # But this is epub. The wild west of ebook formats.
        tocfile = self.find_file('toc.ncx')
        if not tocfile:
            # find_file has already logged the miss; an empty toc is
            # more useful to the caller than a failed archive read
            return

        tocfile_data = self.zip_file.read(tocfile)
        toc_dict = xmltodict.parse(tocfile_data)

        def recursor(level, nav_node):
            # xmltodict yields a bare mapping for a single <navPoint>
            # and a list for several - normalise to a list
            if not isinstance(nav_node, list):
                nav_node = [nav_node]

            for this_nav in nav_node:
                self.book['toc'].append([
                    level,
                    this_nav['navLabel']['text'],
                    this_nav['content']['@src']])
                # Descend so that entries nested at any depth are kept
                if 'navPoint' in this_nav:
                    recursor(level + 1, this_nav['navPoint'])

        nav_map = toc_dict['ncx']['navMap']
        # An empty <navMap/> is parsed as None
        if nav_map and 'navPoint' in nav_map:
            recursor(1, nav_map['navPoint'])
    def get_chapter_content(self, chapter_file):
        this_file = self.find_file(chapter_file)
        if this_file:
            return self.zip_file.read(this_file).decode()
        else:
            print('Not found: ' + chapter_file)
            return chapter_file
    def parse_split_chapters(self, chapters_with_split_content):
        self.book['split_chapters'] = {}
-    def read_epub(self):
+        # For split chapters, get the whole chapter first, then split
-        # This is the function that should error out in
+        # between ids using their anchors, then "heal" the resultant text
-        # case the module cannot process the file
+        # by creating a BeautifulSoup object. Write its str to the content
-        try:
+        for i in chapters_with_split_content.items():
-            self.load_zip()
+            chapter_file = i[0]
-            contents_path = self.get_file_path(
+            self.book['split_chapters'][chapter_file] = {}
                None, True)
-            if not contents_path:
+            chapter_content = self.get_chapter_content(chapter_file)
-                return False  # No (valid) opf was found so processing cannot continue
+            soup = BeautifulSoup(chapter_content, 'lxml')
-            self.generate_book_metadata(contents_path)
+            split_anchors = i[1]
-        except:  # Not specifying an exception type here may be justified
+            for this_anchor in reversed(split_anchors):
-            return False
+                this_tag = soup.find(
                    attrs={"id":lambda x: x == this_anchor})
-        return True
+                markup_split = str(soup).split(str(this_tag))
                soup = BeautifulSoup(markup_split[0], 'lxml')
                this_markup = BeautifulSoup(
                    str(this_tag) + markup_split[1], 'lxml')
-    def load_zip(self):
+                self.book['split_chapters'][chapter_file][this_anchor] = str(this_markup)
        try:
            self.zip_file = zipfile.ZipFile(
                self.filename, mode='r', allowZip64=True)
        except (KeyError, AttributeError, zipfile.BadZipFile):
            logger.error('Malformed zip file ' + self.filename)
            return
-    def parse_xml(self, filename, parser):
+            # Remaining markup is assigned here
-        try:
+            self.book['split_chapters'][chapter_file]['top_level'] = str(soup)
            this_xml = self.zip_file.read(filename).decode()
        except KeyError:
            short_filename = os.path.basename(self.filename)
            warning_string = f'{str(filename)} not found in {short_filename}'
            logger.warning(warning_string)
            return
-        root = BeautifulSoup(this_xml, parser)
+    def generate_content(self):
-        return root
+        # Find all the chapters mentioned in the opf spine
        # These are simply ids that correspond to the actual item
        # as mentioned in the manifest - which is a comprehensive
        # list of files
        chapters_in_spine = [
            i['@idref']
            for i in self.opf_dict['package']['spine']['itemref']]
-    def get_file_path(self, filename, is_content_file=False):
+        # Next, find items and ids from the manifest
-        # Use this to get the location of the content.opf file
+        chapters_from_manifest = {
-        # And maybe some other file that has a more well formatted
+            i['@id']: i['@href']
-        # idea of the TOC
+            for i in self.opf_dict['package']['manifest']['item']}
        # We're going to all this trouble because there really is
        # no going forward without a toc
        if is_content_file:
            container_location = self.get_file_path('container.xml')
            xml = self.parse_xml(container_location, 'xml')
-            if xml:
+        # Finally, check which items are supposed to be in the spine
-                root_item = xml.find('rootfile')
+        # on the basis of the id and change the toc accordingly
        spine_final = []
        for i in chapters_in_spine:
            try:
                spine_final.append(chapters_from_manifest.pop(i))
            except KeyError:
                pass
        # TODO
        # Check what happens in case missing chapters are either
        # at the beginning or the end of the book
        chapter_title = 1
        toc_chapters = [i[2] for i in self.book['toc']]
        last_valid_index = 0
        for i in spine_final:
            if not i in toc_chapters:
                previous_chapter = spine_final[spine_final.index(i) - 1]
                try:
-                    return root_item.get('full-path')
+                    previous_chapter_toc_index = toc_chapters.index(previous_chapter)
-                except AttributeError:
+                    # In case of 2+ consecutive missing chapters
-                    error_string = f'ePub module: {self.filename} has a malformed container.xml'
+                    last_valid_index = previous_chapter_toc_index
                except ValueError:
                    last_valid_index += 1
                self.book['toc'].insert(
                    last_valid_index + 1,
                    [1, str(chapter_title), i])
                chapter_title += 1
        # Parse split chapters as below
        # They can be picked up during the iteration through the toc
        chapters_with_split_content = {}
        for i in self.book['toc']:
            if '#' in i[2]:
                this_split = i[2].split('#')
                chapter = this_split[0]
                anchor = this_split[1]
                try:
                    chapters_with_split_content[chapter].append(anchor)
                except KeyError:
                    chapters_with_split_content[chapter] = []
                    chapters_with_split_content[chapter].append(anchor)
        self.parse_split_chapters(chapters_with_split_content)
        # Now we iterate over the ToC as presented in the toc.ncx
        # and add chapters to the content list
        # In case a split chapter is encountered, get its content
        # from the split_chapters dictionary
        # What could possibly go wrong?
        # The content list is separated from the toc list because
        # the mupdf library returns its own toc a certain way and
        # this keeps things uniform
        split_chapters = self.book['split_chapters']
        toc_copy = self.book['toc'][:]
        self.book['content'] = []
        # Put the book into the book
        for count, i in enumerate(toc_copy):
            chapter_file = i[2]
            # Get split content according to its corresponding id attribute
            if '#' in chapter_file:
                this_split = chapter_file.split('#')
                chapter_file_proper = this_split[0]
                this_anchor = this_split[1]
                try:
                    chapter_content = (
                        split_chapters[chapter_file_proper][this_anchor])
                except KeyError:
                    chapter_content = 'Parse Error'
                    error_string = (
                        f'Error parsing {self.book_filename}: {chapter_file_proper}')
                    logger.error(error_string)
                    return None
-            possible_filenames = ('content.opf', 'package.opf')
+            # Get content that remained at the end of the pillaging above
-            for i in possible_filenames:
+            elif chapter_file in split_chapters.keys():
-                presumptive_location = self.get_file_path(i)
+                try:
-                if presumptive_location:
+                    chapter_content = split_chapters[chapter_file]['top_level']
-                    return presumptive_location
+                except KeyError:
                    chapter_content = 'Parse Error'
                    error_string = (
                        f'Error parsing {self.book_filename}: {chapter_file}')
                    logger.error(error_string)
-        for i in self.zip_file.filelist:
+            # Vanilla non split chapters
            if os.path.basename(i.filename) == os.path.basename(filename):
                return i.filename
        return None
    def read_from_zip(self, filename):
        filename = unquote(filename)
        try:
            file_data = self.zip_file.read(filename)
            return file_data
        except KeyError:
            file_path_actual = self.get_file_path(filename)
            if file_path_actual:
                return self.zip_file.read(file_path_actual)
            else:
-                logger.error('ePub module can\'t find ' + filename)
+                chapter_content = self.get_chapter_content(chapter_file)
-    #______________________________________________________
+            # The count + 2 is an adjustment due to the cover being inserted below
            self.book['toc'][count][2] = count + 2
            self.book['content'].append(chapter_content)
-    def generate_book_metadata(self, contents_path):
+        self.generate_book_cover()
-        self.book['title'] = os.path.splitext(
+        if self.book['cover']:
-            os.path.basename(self.filename))[0]
+            cover_path = os.path.join(
-        self.book['author'] = 'Unknown'
+                self.temp_dir, os.path.basename(self.book_filename)) + '- cover'
-        self.book['isbn'] = None
+            with open(cover_path, 'wb') as cover_temp:
-        self.book['tags'] = None
+                cover_temp.write(self.book['cover'])
        self.book['cover'] = None
        self.book['toc_file'] = 'toc.ncx'  # Overwritten if another one exists
-        # Parse XML
+            self.book['toc'].insert(0, (1, 'Cover', 1))
-        xml = self.parse_xml(contents_path, 'xml')
+            self.book['content'].insert(
                0, (f'<center><img src="{cover_path}" alt="Cover"></center>'))
-        # Parse metadata
+    def generate_metadata(self):
-        item_dict = {
+        metadata = self.opf_dict['package']['metadata']
            'title': 'title',
            'author': 'creator',
            'year': 'date'}
-        for i in item_dict.items():
+        # There are no exception types specified below
-            item = xml.find(i[1])
+        # This is on purpose and makes me long for the days
-            if item:
+        # of simpler, happier things.
                self.book[i[0]] = item.text
        # Book title
        try:
-            self.book['year'] = int(self.book['year'][:4])
+            self.book['title'] = metadata['dc:title']
-        except (TypeError, KeyError, IndexError, ValueError):
+            if isinstance(self.book['title'], collections.OrderedDict):
                self.book['title'] = metadata['dc:title']['#text']
        except:
            print('Title parse error')
            self.book['title'] = os.path.splitext(
                os.path.basename(self.book_filename))[0]
        # Book author
        try:
            self.book['author'] = metadata['dc:creator']['#text']
        except:
            self.book['author'] = 'Unknown'
        # Book year
        try:
            self.book['year'] = int(metadata['dc:date'][:4])
        except:
            self.book['year'] = 9999
-        # Get identifier
+        # Book isbn
-        identifier_items = xml.find_all('identifier')
+        self.book['isbn'] = None
-        for i in identifier_items:
+        try:
-            scheme = i.get('scheme')
+            for i in metadata['dc:identifier']:
-            try:
+                if i['@opf:scheme'].lower() == 'isbn':
-                if scheme.lower() == 'isbn':
+                    self.book['isbn'] = i['#text']
-                    self.book['isbn'] = i.text
+        except:
-            except AttributeError:
+            pass
                self.book['isbn'] = None
-        # Tags
+        # Book tags
-        tag_items = xml.find_all('subject')
+        try:
-        tag_list = [i.text for i in tag_items]
+            self.book['tags'] = metadata['dc:subject']
-        self.book['tags'] = tag_list
+        except:
            self.book['tags'] = []
-        # Get items
+        # Book cover
-        self.book['content_dict'] = {}
+        self.generate_book_cover()
        all_items = xml.find_all('item')
        for i in all_items:
            media_type = i.get('media-type')
            this_id = i.get('id')
-            if media_type == 'application/xhtml+xml' or media_type == 'text/html':
+    def generate_book_cover(self):
-                self.book['content_dict'][this_id] = i.get('href')
+        # This is separate because the book cover needs to
-
+        # be found and extracted both during addition / reading
-            if media_type == 'application/x-dtbncx+xml':
+        self.book['cover'] = None
-                self.book['toc_file'] = i.get('href')
+        try:
-
+            cover_image = [
-            # Cover image
+                i['@href'] for i in self.opf_dict['package']['manifest']['item']
-            if 'cover' in this_id and media_type.split('/')[0] == 'image':
+                if i['@media-type'].split('/')[0] == 'image' and
-                cover_href = i.get('href')
+                'cover' in i['@id']][0]
-                try:
+            self.book['cover'] = self.zip_file.read(
-                    self.book['cover'] = self.zip_file.read(cover_href)
+                self.find_file(cover_image))
-                except KeyError:
+        except:
-                    # The cover cannot be found according to the
+            pass
                    # path specified in the content reference
                    self.book['cover'] = self.zip_file.read(
                        self.get_file_path(cover_href))
        # Find book cover the hard way
        if not self.book['cover']:
            # If no cover is located the conventional way,
            # we go looking for the largest image in the book
            biggest_image_size = 0
            biggest_image = None
            for j in self.zip_file.filelist:
@@ -192,139 +350,5 @@ class EPUB:
                        biggest_image_size = j.file_size
            if biggest_image:
-                self.book['cover'] = self.read_from_zip(biggest_image)
+                self.book['cover'] = self.zip_file.read(
-            else:
+                    self.find_file(biggest_image))
                logger.error('No cover found for: ' + self.filename)
        # Parse spine and arrange chapter paths acquired from the opf
        # according to the order IN THE SPINE
        spine_items = xml.find_all('itemref')
        spine_order = []
        for i in spine_items:
            spine_order.append(i.get('idref'))
        self.book['chapters_in_order'] = []
        for i in spine_order:
            chapter_path = self.book['content_dict'][i]
            self.book['chapters_in_order'].append(chapter_path)
    def parse_toc(self):
        """Collect chapter titles from the book's toc file.

        Fills ``self.book['navpoint_dict']`` with a mapping of unquoted
        source file name to chapter title. For every navPoint whose
        source carries a ``#anchor``, an ``(anchor, title)`` tuple is
        also appended to ``self.book['split_chapters'][source_file]``.

        The reading order is not derived from this - only chapter names.
        Returns early, leaving ``navpoint_dict`` empty, when the toc file
        cannot be parsed.
        """
        chapter_names = {}
        self.book['navpoint_dict'] = chapter_names

        ncx_path = self.book['toc_file']
        if ncx_path:
            ncx_path = self.get_file_path(ncx_path)

        ncx = self.parse_xml(ncx_path, 'xml')
        if not ncx:
            return

        for nav_point in ncx.find_all('navPoint'):
            title = nav_point.find('text').text
            source = nav_point.find('content').get('src')
            source_parts = source.split('#')
            source_file = unquote(source_parts[0])

            # A '#' in the source means several chapters share one file
            if '#' in source:
                self.book['split_chapters'].setdefault(source_file, []).append(
                    (source_parts[1], title))

            chapter_names[source_file] = title
    def parse_chapters(self, temp_dir=None, split_large_xml=False):
        """Build ``self.book['book_list']`` as a list of (title, markup) tuples.

        Iterates ``self.book['chapters_in_order']``, reads each chapter
        from the archive and, per chapter, takes one of three paths:
        split by toc anchors, split by <pagebreak> elements, or append
        the chapter whole.

        Args:
            temp_dir: Directory the cover image is written to. It is passed
                straight to ``os.path.join``, so the default of None is
                not usable in practice.
            split_large_xml: When true, chapters are split on <pagebreak>
                elements and the anchor-based split is skipped.
        """
        # Counter that supplies titles for chapters without a toc entry
        no_title_chapter = 0
        self.book['book_list'] = []
        for i in self.book['chapters_in_order']:
            chapter_data = self.read_from_zip(i).decode()
            # Chapter file is referenced with anchors in the toc:
            # split it into one entry per anchor
            if i in self.book['split_chapters'] and not split_large_xml:
                split_chapters = get_split_content(
                    chapter_data, self.book['split_chapters'][i])
                self.book['book_list'].extend(split_chapters)
            elif split_large_xml:
                # https://stackoverflow.com/questions/14444732/how-to-split-a-html-page-to-multiple-pages-using-python-and-beautiful-soup
                markup = BeautifulSoup(chapter_data, 'xml')
                chapters = []
                pagebreaks = markup.find_all('pagebreak')
                # Step to the next sibling that has a ``name`` attribute.
                # Falls off the loop (returning None) once elem is None.
                def next_element(elem):
                    while elem is not None:
                        elem = elem.next_sibling
                        if hasattr(elem, 'name'):
                            return elem
                # Each pagebreak starts a chunk that runs up to, but does
                # not include, the next pagebreak sibling
                for pbreak in pagebreaks:
                    chapter = [str(pbreak)]
                    elem = next_element(pbreak)
                    while elem and elem.name != 'pagebreak':
                        chapter.append(str(elem))
                        elem = next_element(elem)
                    chapters.append('\n'.join(chapter))
                # Chunks have no toc title, so they are numbered instead
                for this_chapter in chapters:
                    fallback_title = str(no_title_chapter)
                    self.book['book_list'].append(
                        (fallback_title, this_chapter + ('<br/>' * 8)))
                    no_title_chapter += 1
            else:
                # Whole chapter: titled from the toc when it is listed
                # there, otherwise titled with the running counter
                try:
                    self.book['book_list'].append(
                        (self.book['navpoint_dict'][i], chapter_data + ('<br/>' * 8)))
                except KeyError:
                    fallback_title = str(no_title_chapter)
                    self.book['book_list'].append(
                        (fallback_title, chapter_data))
                # Incremented for every whole chapter, titled or not
                no_title_chapter += 1
        cover_path = os.path.join(temp_dir, os.path.basename(self.filename)) + '- cover'
        if self.book['cover']:
            # Write the cover bytes to disk so the markup below can
            # reference them by path
            with open(cover_path, 'wb') as cover_temp:
                cover_temp.write(self.book['cover'])
            # The cover REPLACES the first entry of the book list;
            # an empty book list is left as it is
            try:
                self.book['book_list'][0] = (
                    'Cover', f'<center><img src="{cover_path}" alt="Cover"></center>')
            except IndexError:
                pass
 def get_split_content(chapter_data, split_by):
    """Split one chapter file into several chapters at the given anchors.

    Args:
        chapter_data: Markup of the whole chapter file, as a string.
        split_by: Sequence of (anchor, title) pairs; must not be empty
            because the last title is read unconditionally.

    Returns:
        A list of (title, markup) tuples. Each markup string ends with
        eight ``<br/>`` tags.
    """
    split_anchors = [i[0] for i in split_by]
    chapter_titles = [i[1] for i in split_by]
    return_list = []
    xml = BeautifulSoup(chapter_data, 'lxml')
    # Work on the prettified <body> as plain text; the anchors are
    # searched for as substrings of this text
    xml_string = xml.body.prettify()
    for count, i in enumerate(split_anchors):
        this_split = xml_string.split(i)
        # Text that precedes the first occurrence of this anchor
        current_chapter = this_split[0]
        bs_obj = BeautifulSoup(current_chapter, 'lxml')
        # Since tags correspond to data following them, the first
        # chunk will be ignored
        # As will all empty chapters
        # NOTE: on `continue`, xml_string is not advanced, so the skipped
        # text is still part of the string split in the next iteration
        if bs_obj.text == '\n' or bs_obj.text == '' or count == 0:
            continue
        # Removes the first '"&gt;' from the chunk - presumably the tail
        # of the id attribute left over from the previous split (confirm)
        bs_obj_string = str(bs_obj).replace('"&gt;', '', 1) + ('<br/>' * 8)
        # The chunk before anchor N belongs to the title of anchor N - 1
        return_list.append(
            (chapter_titles[count - 1], bs_obj_string))
        # Carry the remainder forward; joining with '' drops any further
        # occurrences of the anchor text
        xml_string = ''.join(this_split[1:])
    # Whatever remains after the loop is stored under the last title
    bs_obj = BeautifulSoup(xml_string, 'lxml')
    bs_obj_string = str(bs_obj).replace('"&gt;', '', 1) + ('<br/>' * 8)
    return_list.append(
        (chapter_titles[-1], bs_obj_string))
    return return_list
--- a/lector/sorter.py
+++ b/lector/sorter.py
@@ -150,6 +150,9 @@ class BookSorter:
                i[0]: i[1] for i in all_hashes_and_paths}
    def database_entry_for_book(self, file_hash):
        # TODO
        # This will probably look a whole lot better with a namedtuple
        database_return = database.DatabaseFunctions(
            self.database_path).fetch_data(
                ('Title', 'Author', 'Year', 'ISBN', 'Tags',
@@ -246,7 +249,10 @@ class BookSorter:
                if cover_image_raw:
                    cover_image = resize_image(cover_image_raw)
                else:
-                    cover_image = fetch_cover(title, author)
+                    # TODO
                    # Needs an option
                    # cover_image = fetch_cover(title, author)
                    cover_image = None
                this_book[file_md5]['cover_image'] = cover_image
                this_book[file_md5]['addition_mode'] = self.addition_mode
@@ -408,3 +414,4 @@ def fetch_cover(title, author):
    except:
        logger.error(f'Couldn\'t find cover for ' + title)
        return None