Lector/lector/readers/read_epub.py

# This file is a part of Lector, a Qt based ebook reader
# Copyright (C) 2017-2019 BasioMeusPuga

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# TODO
# See if inserting chapters not in the toc.ncx can be avoided
# Account for stylesheets... eventually

import os
import zipfile
import logging
import collections
from urllib.parse import unquote

import xmltodict
from PyQt5 import QtGui
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


class EPUB:
    def __init__(self, book_filename, temp_dir):
        self.book_filename = book_filename
        self.temp_dir = temp_dir

        self.zip_file = None
        self.file_list = None
        self.opf_dict = None
        self.cover_image_name = None
        self.split_chapters = {}

        self.metadata = None
        self.content = []

        self.generate_references()

    def generate_references(self):
        self.zip_file = zipfile.ZipFile(
            self.book_filename, mode='r', allowZip64=True)
        self.file_list = self.zip_file.namelist()

        # Book structure relies on parsing the .opf file
        # in the book. Now that might be the usual content.opf
        # or package.opf or it might be named after your favorite
        # eldritch abomination. The point is we have to check
        # the container.xml
        container = self.find_file('container.xml')
        if container:
            container_xml = self.zip_file.read(container)
            container_dict = xmltodict.parse(container_xml)
            packagefile = container_dict['container']['rootfiles']['rootfile']['@full-path']
        else:
            presumptive_names = ('content.opf', 'package.opf', 'volume.opf')
            for i in presumptive_names:
                packagefile = self.find_file(i)
                if packagefile:
                    logger.info('Using presumptive package file: ' + self.book_filename)
                    break

        packagefile_data = self.zip_file.read(packagefile)
        self.opf_dict = xmltodict.parse(packagefile_data)

    def find_file(self, filename):
        # Get rid of special characters
        filename = unquote(filename)

        # First, look for the file in the root of the book
        if filename in self.file_list:
            return filename

        # Then search for it elsewhere
        else:
            file_basename = os.path.basename(filename)
            for i in self.file_list:
                if os.path.basename(i) == file_basename:
                    return i

        # If the file isn't found
        logger.warning(filename + ' not found in ' + self.book_filename)
        return False

    def generate_toc(self):
        def find_alternative_toc():
            toc_filename = None
            toc_filename_alternative = None
            manifest = self.opf_dict['package']['manifest']['item']

            for i in manifest:
                # Behold the burning hoops we're jumping through
                if i['@id'] == 'ncx':
                    toc_filename = i['@href']
                if ('ncx' in i['@id']) or ('toc' in i['@id']):
                    toc_filename_alternative = i['@href']
                if toc_filename and toc_filename_alternative:
                    break

            if not toc_filename:
                if not toc_filename_alternative:
                    logger.warning('No ToC found for: ' + self.book_filename)
                else:
                    toc_filename = toc_filename_alternative

            logger.info('Using alternate ToC for: ' + self.book_filename)
            return toc_filename

        # Find the toc.ncx file from the manifest
        # EPUBs will name literally anything, anything so try
        # a less stringent approach if the first one doesn't work
        # The idea is to prioritize 'toc.ncx' since this should work
        # for the vast majority of books
        toc_filename = 'toc.ncx'
        does_toc_exist = self.find_file(toc_filename)
        if not does_toc_exist:
            toc_filename = find_alternative_toc()

        tocfile = self.find_file(toc_filename)
        tocfile_data = self.zip_file.read(tocfile)
        toc_dict = xmltodict.parse(tocfile_data)

        def recursor(level, nav_node):
            if isinstance(nav_node, list):
                these_contents = [[
                    level + 1,
                    i['navLabel']['text'],
                    i['content']['@src']] for i in nav_node]
                self.content.extend(these_contents)
                return

            if 'navPoint' in nav_node.keys():
                recursor(level, nav_node['navPoint'])

            else:
                self.content.append([
                    level + 1,
                    nav_node['navLabel']['text'],
                    nav_node['content']['@src']])

        navpoints = toc_dict['ncx']['navMap']['navPoint']
        for top_level_nav in navpoints:
            # Just one chapter
            if isinstance(top_level_nav, str):
                self.content.append([
                    1,
                    navpoints['navLabel']['text'],
                    navpoints['content']['@src']])
                break

            # Multiple chapters
            self.content.append([
                1,
                top_level_nav['navLabel']['text'],
                top_level_nav['content']['@src']])

            if 'navPoint' in top_level_nav.keys():
                recursor(1, top_level_nav)

    def get_chapter_content(self, chapter_file):
        this_file = self.find_file(chapter_file)
        if this_file:
            chapter_content = self.zip_file.read(this_file).decode()

            # Generate a None return for a blank chapter
            # These will be removed from the contents later
            contentDocument = QtGui.QTextDocument(None)
            contentDocument.setHtml(chapter_content)
            contentText = contentDocument.toPlainText().replace('\n', '')
            if contentText == '':
                chapter_content = None

            return chapter_content
        else:
            return 'Possible parse error: ' + chapter_file

    def parse_split_chapters(self, chapters_with_split_content):
        # For split chapters, get the whole chapter first, then split
        # between ids using their anchors, then "heal" the resultant text
        # by creating a BeautifulSoup object. Write its str to the content
        for i in chapters_with_split_content.items():
            chapter_file = i[0]
            self.split_chapters[chapter_file] = {}

            chapter_content = self.get_chapter_content(chapter_file)
            soup = BeautifulSoup(chapter_content, 'lxml')

            split_anchors = i[1]
            for this_anchor in reversed(split_anchors):
                this_tag = soup.find(
                    attrs={"id":lambda x: x == this_anchor})

                markup_split = str(soup).split(str(this_tag))
                soup = BeautifulSoup(markup_split[0], 'lxml')

                # If the tag is None, it probably means the content is overlapping
                # Skipping the insert is the way forward
                if this_tag:
                    this_markup = BeautifulSoup(
                        str(this_tag).strip() + markup_split[1], 'lxml')
                    self.split_chapters[chapter_file][this_anchor] = str(this_markup)

            # Remaining markup is assigned here
            self.split_chapters[chapter_file]['top_level'] = str(soup)

    def generate_content(self):
        # Find all the chapters mentioned in the opf spine
        # These are simply ids that correspond to the actual item
        # as mentioned in the manifest - which is a comprehensive
        # list of files
        try:
            # Multiple chapters
            chapters_in_spine = [
                i['@idref']
                for i in self.opf_dict['package']['spine']['itemref']]
        except TypeError:
            # Single chapter - Large xml
            chapters_in_spine = [
                self.opf_dict['package']['spine']['itemref']['@idref']]

        # Next, find items and ids from the manifest
        # This might error out in case there's only one item in
        # the manifest. Remember that for later.
        chapters_from_manifest = {
            i['@id']: i['@href']
            for i in self.opf_dict['package']['manifest']['item']}

        # Finally, check which items are supposed to be in the spine
        # on the basis of the id and change the toc accordingly
        spine_final = []
        for i in chapters_in_spine:
            try:
                spine_final.append(chapters_from_manifest.pop(i))
            except KeyError:
                pass

        toc_chapters = [
            unquote(i[2].split('#')[0]) for i in self.content]

        for i in spine_final:
            if not i in toc_chapters:
                spine_index = spine_final.index(i)
                if spine_index == 0:  # Or chapter insertion circles back to the end
                    previous_chapter_toc_index = -1
                else:
                    previous_chapter = spine_final[spine_final.index(i) - 1]
                    previous_chapter_toc_index = toc_chapters.index(previous_chapter)

                toc_chapters.insert(
                    previous_chapter_toc_index + 1, i)
                self.content.insert(
                    previous_chapter_toc_index + 1, [1, None, i])

        # Parse split chapters as below
        # They can be picked up during the iteration through the toc
        chapters_with_split_content = {}
        for i in self.content:
            if '#' in i[2]:
                this_split = i[2].split('#')
                chapter = this_split[0]
                anchor = this_split[1]

                try:
                    chapters_with_split_content[chapter].append(anchor)
                except KeyError:
                    chapters_with_split_content[chapter] = []
                    chapters_with_split_content[chapter].append(anchor)

        self.parse_split_chapters(chapters_with_split_content)

        # Now we iterate over the ToC as presented in the toc.ncx
        # and add chapters to the content list
        # In case a split chapter is encountered, get its content
        # from the split_chapters dictionary
        # What could possibly go wrong?
        toc_copy = self.content[:]

        # Put the book into the book
        for count, i in enumerate(toc_copy):
            chapter_file = i[2]

            # Get split content according to its corresponding id attribute
            if '#' in chapter_file:
                this_split = chapter_file.split('#')
                chapter_file_proper = this_split[0]
                this_anchor = this_split[1]

                try:
                    chapter_content = (
                        self.split_chapters[chapter_file_proper][this_anchor])
                except KeyError:
                    chapter_content = 'Parse Error'
                    error_string = (
                        f'Error parsing {self.book_filename}: {chapter_file_proper}')
                    logger.error(error_string)

            # Get content that remained at the end of the pillaging above
            elif chapter_file in self.split_chapters.keys():
                try:
                    chapter_content = self.split_chapters[chapter_file]['top_level']
                except KeyError:
                    chapter_content = 'Parse Error'
                    error_string = (
                        f'Error parsing {self.book_filename}: {chapter_file}')
                    logger.error(error_string)

            # Vanilla non split chapters
            else:
                chapter_content = self.get_chapter_content(chapter_file)

            self.content[count][2] = chapter_content

        # Cleanup content by removing null chapters
        unnamed_chapter_title = 1
        content_copy = []
        for i in self.content:
            if i[2]:
                chapter_title = i[1]
                if not chapter_title:
                    chapter_title = unnamed_chapter_title
                content_copy.append((
                    i[0], str(chapter_title), i[2]))
            unnamed_chapter_title += 1
        self.content = content_copy

        # Get cover image and put it in its place
        # I imagine this involves saying nasty things to it
        # There's no point shifting this to the parser
        # The performance increase is negligible
        cover_image = self.generate_book_cover()

        if cover_image:
            cover_path = os.path.join(
                self.temp_dir, os.path.basename(self.book_filename)) + ' - cover'
            with open(cover_path, 'wb') as cover_temp:
                cover_temp.write(cover_image)

            # This is probably stupid, but I can't stand the idea of
            # having to look at two book covers
            cover_replacement_conditions = (
                self.cover_image_name.lower() + '.jpg' in self.content[0][2].lower(),
                self.cover_image_name.lower() + '.png' in self.content[0][2].lower(),
                'cover' in self.content[0][1].lower())

            if True in cover_replacement_conditions:
                logger.info(
                    f'Replacing cover {cover_replacement_conditions}: {self.book_filename}')
                self.content[0] = (
                    1, 'Cover',
                    f'<center><img src="{cover_path}" alt="Cover"></center>')
            else:
                logger.info('Adding cover: ' + self.book_filename)
                self.content.insert(
                    0,
                    (1, 'Cover',
                     f'<center><img src="{cover_path}" alt="Cover"></center>'))

    def generate_metadata(self):
        book_metadata = self.opf_dict['package']['metadata']

        def flattener(this_object):
            if isinstance(this_object, collections.OrderedDict):
                return this_object['#text']

            if isinstance(this_object, list):
                if isinstance(this_object[0], collections.OrderedDict):
                    return this_object[0]['#text']
                else:
                    return this_object[0]

            if isinstance(this_object, str):
                return this_object

        # There are no exception types specified below
        # This is on purpose and makes me long for the days
        # of simpler, happier things.

        # Book title
        try:
            title = flattener(book_metadata['dc:title'])
        except:
            logger.warning('Title not found: ' + self.book_filename)
            title = os.path.splitext(
                os.path.basename(self.book_filename))[0]

        # Book author
        try:
            author = flattener(book_metadata['dc:creator'])
        except:
            logger.warning('Author not found: ' + self.book_filename)
            author = 'Unknown'

        # Book year
        try:
            year = int(flattener(book_metadata['dc:date'])[:4])
        except:
            logger.warning('Year not found: ' + self.book_filename)
            year = 9999

        # Book isbn
        # Both one and multiple schema
        isbn = None
        try:
            scheme = book_metadata['dc:identifier']['@opf:scheme'].lower()
            if scheme.lower() == 'isbn':
                isbn = book_metadata['dc:identifier']['#text']

        except (TypeError, KeyError):
            try:
                for i in book_metadata['dc:identifier']:
                    if i['@opf:scheme'].lower() == 'isbn':
                        isbn = i['#text']
                    break
            except:
                logger.warning('ISBN not found: ' + self.book_filename)

        # Book tags
        try:
            tags = book_metadata['dc:subject']
            if isinstance(tags, str):
                tags = [tags]
        except:
            tags = []

        # Book cover
        cover = self.generate_book_cover()

        # Named tuple? Named tuple.
        Metadata = collections.namedtuple(
            'Metadata', ['title', 'author', 'year', 'isbn', 'tags', 'cover'])
        self.metadata = Metadata(title, author, year, isbn, tags, cover)

    def generate_book_cover(self):
        # This is separate because the book cover needs to
        # be found and extracted both during addition / reading
        book_cover = None

        try:
            cover_image = [
                i['@href'] for i in self.opf_dict['package']['manifest']['item']
                if i['@media-type'].split('/')[0] == 'image' and
                'cover' in i['@id']][0]
            book_cover = self.zip_file.read(self.find_file(cover_image))
        except:
            logger.warning('Cover not found in opf: ' + self.book_filename)

        # Find book cover the hard way
        if not book_cover:
            biggest_image_size = 0
            cover_image = None
            for j in self.zip_file.filelist:
                if os.path.splitext(j.filename)[1] in ['.jpg', '.jpeg', '.png', '.gif']:
                    if j.file_size > biggest_image_size:
                        cover_image = j.filename
                        biggest_image_size = j.file_size

            if cover_image:
                book_cover = self.zip_file.read(
                    self.find_file(cover_image))

        if not book_cover:
            self.cover_image_name = ''
            logger.warning('Cover not found: ' + self.book_filename)
        else:
            self.cover_image_name = os.path.splitext(
                os.path.basename(cover_image))[0]

        return book_cover