Preliminary pdf support

Consolidate comicbook modules. Do not write to temp dir for comics any longer.
2018-03-16 18:46:38 +05:30
parent fc2fcb5361
commit 5b3759afe6
9 changed files with 242 additions and 289 deletions
@@ -24,7 +24,7 @@ TODO
        ✓ Context menu: Cache, Read, Edit database, delete, Mark read/unread
        ✓ Information dialog widget
        ✓ Allow editing of database data through the UI + for Bookmarks
-        Include (action) icons with the applications
+        ✓ Include (action) icons with the applications
        Set focus to newly added file
    Reading:
        ✓ Drop down for TOC
@@ -54,6 +54,8 @@ TODO
        Search document using QTextCursor?
        Comic view keyboard shortcuts
    Filetypes:
+        ✓ pdf support
+            Parse TOC
        ✓ epub support
            ✓ Homegrown solution please
        ✓ cbz, cbr support
@@ -67,9 +69,9 @@ TODO
        If there are files open and the database is deleted, TypeErrors result
        Cover culling does not occur if some other tab has initial focus
        Slider position change might be acting up too
+        Take metadata from the database when opening the file

    Secondary:
-        pdf support
        Annotations
        Graphical themes
        Change focus rectangle dimensions
@@ -44,20 +44,21 @@ from PyQt5 import QtCore, QtGui

 from lector import database

-from parsers.cbz import ParseCBZ
-from parsers.cbr import ParseCBR
+from parsers.pdf import ParsePDF
 from parsers.epub import ParseEPUB
 from parsers.mobi import ParseMOBI
+from parsers.comicbooks import ParseCOMIC

 sorter = {
+    'pdf': ParsePDF,
    'epub': ParseEPUB,
    'mobi': ParseMOBI,
    'azw': ParseMOBI,
    'azw3': ParseMOBI,
    'azw4': ParseMOBI,
    'prc': ParseMOBI,
-    'cbz': ParseCBZ,
-    'cbr': ParseCBR,}
+    'cbz': ParseCOMIC,
+    'cbr': ParseCOMIC}

 available_parsers = [i for i in sorter]
 progressbar = None  # This is populated by __main__
@@ -24,8 +24,12 @@

 import os
 import uuid
+import zipfile
 from PyQt5 import QtWidgets, QtGui, QtCore

+import popplerqt5
+from rarfile import rarfile
+
 from lector.models import BookmarkProxyModel
 from lector.sorter import resize_image
 from lector.delegates import BookmarkDelegate
@@ -61,7 +65,8 @@ class Tab(QtWidgets.QWidget):
        # we want a QGraphicsView widget doing all the heavy lifting
        # instead of a QTextBrowser
        if self.are_we_doing_images_only:  # Boolean
-            self.contentView = PliantQGraphicsView(self.window(), self)
+            self.contentView = PliantQGraphicsView(
+                self.metadata['path'], self.window(), self)
            self.contentView.loadImage(chapter_content)
        else:
            self.contentView = PliantQTextBrowser(self.window(), self)
@@ -465,78 +470,57 @@ class Tab(QtWidgets.QWidget):


 class PliantQGraphicsView(QtWidgets.QGraphicsView):
-    def __init__(self, main_window, parent=None):
+    def __init__(self, filepath, main_window, parent=None):
+        """Set up the view and open the book found at filepath.
+
+        The file extension (compared case-insensitively) decides how
+        the book is opened: cbz through zipfile, cbr through rarfile
+        and pdf through poppler. self.book stays None for any other
+        extension, or when poppler cannot load the pdf.
+        """
        super(PliantQGraphicsView, self).__init__(parent)
        self.main_window = main_window
        self.parent = parent
+
+        self.qimage = None  # Will be needed to resize pdf
        self.image_pixmap = None
-        self.ignore_wheel_event = False
-        self.ignore_wheel_event_number = 0
-        self.setDragMode(QtWidgets.QGraphicsView.ScrollHandDrag)
-        self.viewport().setCursor(QtCore.Qt.ArrowCursor)
+
+        self.filepath = filepath
+        self.filetype = os.path.splitext(self.filepath)[1][1:].lower()
+        self.book = None
+
+        if self.filetype == 'cbz':
+            self.book = zipfile.ZipFile(self.filepath)
+
+        elif self.filetype == 'cbr':
+            self.book = rarfile.RarFile(self.filepath)
+
+        elif self.filetype == 'pdf':
+            self.book = popplerqt5.Poppler.Document.load(self.filepath)
+            if self.book is not None:
+                # Each hint needs its own call: joining two hints
+                # with `and` evaluates to just one of them
+                for hint in (
+                        popplerqt5.Poppler.Document.Antialiasing,
+                        popplerqt5.Poppler.Document.TextAntialiasing):
+                    self.book.setRenderHint(hint)
+
        self.common_functions = PliantWidgetsCommonFunctions(
            self, self.main_window)
-        self.setMouseTracking(True)
-        self.image_cache = [None for _ in range(4)]

-    def loadImage(self, current_image):
        # TODO
-        # For double page view: 1 before, 1 after
        # Image panning with mouse
+        self.ignore_wheel_event = False
+        self.ignore_wheel_event_number = 0
+        self.setMouseTracking(True)
+        self.setDragMode(QtWidgets.QGraphicsView.ScrollHandDrag)
+        self.viewport().setCursor(QtCore.Qt.ArrowCursor)

-        content = self.parent.metadata['content']
-        image_paths = [i[1] for i in content]
+    def loadImage(self, current_page):
+        # TODO
+        # Threaded caching will still work here
+        # Look at a commit where it's not been deleted
+        # For double page view: 1 before, 1 after

-        def generate_image_cache(current_image):
-            print('Building image cache')
-            current_image_index = image_paths.index(current_image)
+        self.image_pixmap = QtGui.QPixmap()

-            for i in (-1, 0, 1, 2):
-                try:
-                    this_path = image_paths[current_image_index + i]
-                    this_pixmap = QtGui.QPixmap()
-                    this_pixmap.load(this_path)
-                    self.image_cache[i + 1] = (this_path, this_pixmap)
-                except IndexError:
-                    self.image_cache[i + 1] = None
+        if self.filetype in ('cbz', 'cbr'):
+            page_data = self.book.read(current_page)
+            self.image_pixmap.loadFromData(page_data)

-        def refill_cache(remove_value):
-            remove_index = self.image_cache.index(remove_value)
-            refill_pixmap = QtGui.QPixmap()
+        if self.filetype == 'pdf':
+            page_data = self.book.page(current_page)
+            page_qimage = page_data.renderToImage(350, 350)
+            self.image_pixmap.convertFromImage(page_qimage)

-            if remove_index == 1:
-                first_path = self.image_cache[0][0]
-                self.image_cache.pop(3)
-                previous_path = image_paths[image_paths.index(first_path) - 1]
-                refill_pixmap.load(previous_path)
-                self.image_cache.insert(0, (previous_path, refill_pixmap))
-            else:
-                self.image_cache[0] = self.image_cache[1]
-                self.image_cache.pop(1)
-                try:
-                    last_path = self.image_cache[2][0]
-                    next_path = image_paths[image_paths.index(last_path) + 1]
-                    refill_pixmap.load(next_path)
-                    self.image_cache.append((next_path, refill_pixmap))
-                except (IndexError, TypeError):
-                    self.image_cache.append(None)
-
-        def check_cache(current_image):
-            for i in self.image_cache:
-                if i:
-                    if i[0] == current_image:
-                        return_pixmap = i[1]
-                        refill_cache(i)
-                        return return_pixmap
-
-            # No return happened so the image isn't in the cache
-            generate_image_cache(current_image)
-
-        return_pixmap = None
-        while not return_pixmap:
-            return_pixmap = check_cache(current_image)
-
-        self.image_pixmap = return_pixmap
        self.resizeEvent()

    def resizeEvent(self, *args):
@@ -1,106 +0,0 @@
-#!/usr/bin/env python3
-
-# This file is a part of Lector, a Qt based ebook reader
-# Copyright (C) 2017 BasioMeusPuga
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-# TODO
-# Account for files with passwords
-
-import os
-import time
-import collections
-from rarfile import rarfile
-
-
-class ParseCBR:
-    def __init__(self, filename, temp_dir, file_md5):
-        self.filename = filename
-        self.book = None
-        self.temp_dir = temp_dir
-        self.file_md5 = file_md5
-
-    def read_book(self):
-        try:
-            self.book = rarfile.RarFile(self.filename)
-        except:  # Specifying no exception types might be warranted here
-            print('Cannot parse ' + self.filename)
-            return
-
-    def get_title(self):
-        filename = os.path.basename(self.filename)
-        filename_proper = os.path.splitext(filename)[0]
-        return filename_proper
-
-    def get_author(self):
-        return None
-
-    def get_year(self):
-        creation_time = time.ctime(os.path.getctime(self.filename))
-        creation_year = creation_time.split()[-1]
-        return creation_year
-
-    def get_cover_image(self):
-        # The first image in the archive may not be the cover
-        # It is implied, however, that the first image in order
-        # will be the cover
-
-        image_list = [i.filename for i in self.book.infolist() if not i.isdir()]
-        image_list.sort()
-        cover_image_filename = image_list[0]
-
-        for i in self.book.infolist():
-            if not i.isdir():
-                if i.filename == cover_image_filename:
-                    cover_image = self.book.read(i)
-                    return cover_image
-
-    def get_isbn(self):
-        return
-
-    def get_tags(self):
-        return
-
-    def get_contents(self):
-        file_settings = {
-            'images_only': True}
-
-        extract_path = os.path.join(self.temp_dir, self.file_md5)
-        contents = []
-
-        # I'm currently choosing not to keep multiple files in memory
-        self.book.extractall(extract_path)
-
-        found_images = []
-        for i in os.walk(extract_path):
-            if i[2]:  # Implies files were found
-                image_dir = i[0]
-                add_path_to_file = [
-                    os.path.join(image_dir, j) for j in i[2]]
-                found_images.extend(add_path_to_file)
-
-        if not found_images:
-            print('Found nothing in ' + self.filename)
-            return None, file_settings
-
-        found_images.sort()
-
-        for count, i in enumerate(found_images):
-            page_name = 'Page ' + str(count + 1)
-            image_path = os.path.join(extract_path, i)
-
-            contents.append((page_name, image_path))
-
-        return contents, file_settings
@@ -1,109 +0,0 @@
-#!/usr/bin/env python3
-
-# This file is a part of Lector, a Qt based ebook reader
-# Copyright (C) 2017 BasioMeusPuga
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-# TODO
-# Account for files with passwords
-
-import os
-import time
-import zipfile
-import collections
-
-
-class ParseCBZ:
-    def __init__(self, filename, temp_dir, file_md5):
-        self.filename = filename
-        self.book = None
-        self.temp_dir = temp_dir
-        self.file_md5 = file_md5
-
-    def read_book(self):
-        try:
-            self.book = zipfile.ZipFile(self.filename, mode='r', allowZip64=True)
-        except FileNotFoundError:
-            print('Invalid path for ' + self.filename)
-            return
-        except (KeyError, AttributeError, zipfile.BadZipFile):
-            print('Cannot parse ' + self.filename)
-            return
-
-    def get_title(self):
-        filename = os.path.basename(self.book.filename)
-        filename_proper = os.path.splitext(filename)[0]
-        return filename_proper
-
-    def get_author(self):
-        return None
-
-    def get_year(self):
-        creation_time = time.ctime(os.path.getctime(self.filename))
-        creation_year = creation_time.split()[-1]
-        return creation_year
-
-    def get_cover_image(self):
-        # The first image in the archive may not be the cover
-        # It is implied, however, that the first image in order
-        # will be the cover
-
-        image_list = [i.filename for i in self.book.infolist() if not i.is_dir()]
-        image_list.sort()
-        cover_image_filename = image_list[0]
-
-        for i in self.book.infolist():
-            if not i.is_dir():
-                if i.filename == cover_image_filename:
-                    cover_image = self.book.read(i)
-                    return cover_image
-
-    def get_isbn(self):
-        return
-
-    def get_tags(self):
-        return
-
-    def get_contents(self):
-        file_settings = {
-            'images_only': True}
-
-        extract_path = os.path.join(self.temp_dir, self.file_md5)
-        contents = []
-
-        # I'm currently choosing not to keep multiple files in memory
-        self.book.extractall(extract_path)
-
-        found_images = []
-        for i in os.walk(extract_path):
-            if i[2]:  # Implies files were found
-                image_dir = i[0]
-                add_path_to_file = [
-                    os.path.join(image_dir, j) for j in i[2]]
-                found_images.extend(add_path_to_file)
-
-        if not found_images:
-            print('Found nothing in ' + self.filename)
-            return None, file_settings
-
-        found_images.sort()
-
-        for count, i in enumerate(found_images):
-            page_name = 'Page ' + str(count + 1)
-            image_path = os.path.join(extract_path, i)
-
-            contents.append((page_name, image_path))
-
-        return contents, file_settings
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+
+# This file is a part of Lector, a Qt based ebook reader
+# Copyright (C) 2017-18 BasioMeusPuga
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# TODO
+# Account for files with passwords
+
+import os
+import time
+import zipfile
+from rarfile import rarfile
+
+
+class ParseCOMIC:
+    """Parser for comic book archives (.cbz and .cbr).
+
+    Pages are identified by their names inside the archive; this
+    class does not extract anything to disk.
+    """
+
+    def __init__(self, filename, *args):
+        # *args swallows extra positional arguments, which this
+        # parser does not use
+        self.filename = filename
+        self.book = None
+        self.image_list = None
+        # (path without extension, extension including the dot)
+        self.book_extension = os.path.splitext(self.filename)
+
+    def read_book(self):
+        """Open the archive and build the sorted list of page names.
+
+        The extension is matched case-insensitively. On any failure
+        a message is printed and both self.book and self.image_list
+        are reset to None, so a half-opened book is never exposed.
+        """
+        extension = self.book_extension[1].lower()
+        try:
+            if extension == '.cbz':
+                self.book = zipfile.ZipFile(
+                    self.filename, mode='r', allowZip64=True)
+                self.image_list = [
+                    i.filename for i in self.book.infolist()
+                    if not i.is_dir()]
+
+            elif extension == '.cbr':
+                self.book = rarfile.RarFile(self.filename)
+                self.image_list = [
+                    i.filename for i in self.book.infolist()
+                    if not i.isdir()]
+
+            else:
+                raise ValueError('Unsupported comic type: ' + extension)
+
+            self.image_list.sort()
+        except Exception:  # The archive libraries raise unrelated types
+            print('Cannot parse ' + self.filename)
+            self.book = None
+            self.image_list = None
+
+    def get_title(self):
+        """Return the file's base name without directory or extension."""
+        return os.path.basename(self.book_extension[0])
+
+    def get_author(self):
+        """Return None."""
+        return None
+
+    def get_year(self):
+        """Return the year of the file's ctime as a string."""
+        creation_time = time.ctime(os.path.getctime(self.filename))
+        creation_year = creation_time.split()[-1]
+        return creation_year
+
+    def get_cover_image(self):
+        """Return the raw data of the first page, or None without pages."""
+        # The first image in the archive may not be the cover
+        # It is implied, however, that the first image in order
+        # will be the cover
+        if not self.image_list:
+            return None
+        return self.book.read(self.image_list[0])
+
+    def get_isbn(self):
+        """Return None."""
+        return None
+
+    def get_tags(self):
+        """Return None."""
+        return None
+
+    def get_contents(self):
+        """Return ([(page title, name inside the archive), ...], settings).
+
+        The list is empty when no pages have been read.
+        """
+        file_settings = {'images_only': True}
+        contents = [
+            (f'Page {count + 1}', i)
+            for count, i in enumerate(self.image_list or [])]
+
+        return contents, file_settings
@@ -28,9 +28,8 @@ class ParseEPUB:
        # Maybe also include book description
        self.book_ref = None
        self.book = None
-        self.temp_dir = temp_dir
        self.filename = filename
-        self.file_md5 = file_md5
+        self.extract_path = os.path.join(temp_dir, file_md5)

    def read_book(self):
        self.book_ref = EPUB(self.filename)
@@ -59,10 +58,9 @@ class ParseEPUB:
        return self.book['tags']

    def get_contents(self):
-        extract_path = os.path.join(self.temp_dir, self.file_md5)
-        zipfile.ZipFile(self.filename).extractall(extract_path)
+        zipfile.ZipFile(self.filename).extractall(self.extract_path)

-        self.book_ref.parse_chapters(temp_dir=self.temp_dir)
+        self.book_ref.parse_chapters(temp_dir=self.extract_path)
        file_settings = {
            'images_only': False}
        return self.book['book_list'], file_settings
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+
+# This file is a part of Lector, a Qt based ebook reader
+# Copyright (C) 2018 BasioMeusPuga
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import io
+from PyQt5 import QtCore
+from bs4 import BeautifulSoup
+
+proceed = True
+try:
+    import popplerqt5
+except ImportError:
+    print('python-poppler-qt5 is not installed. Pdf files will not work.')
+    proceed = False
+
+class ParsePDF:
+    """Parser for pdf files, backed by poppler.
+
+    Every getter tolerates a book that could not be loaded, which
+    happens when popplerqt5 is missing or the file is rejected.
+    """
+
+    def __init__(self, filename, *args):
+        # *args swallows extra positional arguments, which this
+        # parser does not use
+        self.filename = filename
+        self.book = None
+        self.metadata = None
+
+    def read_book(self):
+        """Load the document and parse its metadata as XML.
+
+        Leaves self.book and self.metadata as None when popplerqt5
+        is unavailable or the document cannot be loaded.
+        """
+        if not proceed:
+            return
+
+        self.book = popplerqt5.Poppler.Document.load(self.filename)
+        if not self.book:
+            return
+
+        self.metadata = BeautifulSoup(self.book.metadata(), 'xml')
+
+    def _metadata_text(self, tag):
+        """Return the text of the first tag element, newlines removed.
+
+        Raises AttributeError when there is no metadata or no such
+        element.
+        """
+        return self.metadata.find(tag).text.replace('\n', '')
+
+    def get_title(self):
+        """Return the title from the metadata, or 'Unknown'."""
+        try:
+            return self._metadata_text('title')
+        except AttributeError:
+            return 'Unknown'
+
+    def get_author(self):
+        """Return the creator from the metadata, or 'Unknown'."""
+        try:
+            return self._metadata_text('creator')
+        except AttributeError:
+            return 'Unknown'
+
+    def get_year(self):
+        """Return the year of the metadata date, or 9999 if missing.
+
+        The date text is returned unchanged when it does not start
+        with four digits.
+        """
+        try:
+            date = self._metadata_text('MetadataDate')
+        except AttributeError:
+            return 9999
+
+        # The date is expected to be ISO 8601 style (YYYY-MM-DD...),
+        # so the leading four digits are the year
+        year = date.strip()[:4]
+        if len(year) == 4 and year.isdigit():
+            return year
+        return date
+
+    def get_cover_image(self):
+        """Return the first page rendered as a resized jpg.
+
+        Returns None when no document is loaded or it has no first
+        page.
+        """
+        if not self.book:
+            return None
+
+        # Each hint needs its own call: joining two hints with `and`
+        # evaluates to just one of them
+        for hint in (
+                popplerqt5.Poppler.Document.Antialiasing,
+                popplerqt5.Poppler.Document.TextAntialiasing):
+            self.book.setRenderHint(hint)
+
+        cover_page = self.book.page(0)
+        if cover_page is None:
+            return None
+
+        cover_image = cover_page.renderToImage(300, 300)
+        return resize_image(cover_image)
+
+    def get_isbn(self):
+        """Return None."""
+        return None
+
+    def get_tags(self):
+        """Return the keywords from the metadata, or None."""
+        try:
+            return self._metadata_text('Keywords')
+        except AttributeError:
+            return None
+
+    def get_contents(self):
+        """Return ([(page title, zero based page index), ...], settings).
+
+        The list is empty when no document is loaded.
+        """
+        file_settings = {'images_only': True}
+        page_count = self.book.numPages() if self.book else 0
+        contents = [(f'Page {i + 1}', i) for i in range(page_count)]
+
+        return contents, file_settings
+
+
+def resize_image(cover_image):
+    """Scale cover_image to 420x600 and return it as jpg encoded bytes.
+
+    The aspect ratio is ignored, so every cover ends up with the
+    same dimensions. The image is saved at quality 75.
+    """
+    scaled_cover = cover_image.scaled(
+        420, 600, QtCore.Qt.IgnoreAspectRatio)
+
+    # Save into an in-memory Qt buffer backed by jpg_data
+    jpg_data = QtCore.QByteArray()
+    jpg_buffer = QtCore.QBuffer(jpg_data)
+    jpg_buffer.open(QtCore.QIODevice.WriteOnly)
+    scaled_cover.save(jpg_buffer, 'jpg', 75)
+
+    # getvalue() returns the whole content whatever the position
+    return io.BytesIO(jpg_data).getvalue()
@@ -5,8 +5,8 @@ from setuptools import setup, find_packages
 HERE = path.abspath(path.dirname(__file__))

 MAJOR_VERSION = '0'
-MINOR_VERSION = '1'
-MICRO_VERSION = '2'
+MINOR_VERSION = '2'
+MICRO_VERSION = '0'
 VERSION = "{}.{}.{}".format(MAJOR_VERSION, MINOR_VERSION, MICRO_VERSION)

 # Get the long description from the README file
@@ -15,7 +15,8 @@ with codecs.open(path.join(HERE, 'README.md'), encoding='utf-8') as f:

 INSTALL_DEPS = ['PyQt5>=5.10.1',
                'requests>=2.18.4',
-                'beautifulsoup4>=4.6.0']
+                'beautifulsoup4>=4.6.0',
+                'python-poppler-qt5>=0.24.2']
 TEST_DEPS = ['pytest',
             'unittest2']
 DEV_DEPS = []
@@ -46,7 +47,7 @@ setup(
    ],

    # What does your project relate to?
-    keywords='qt ebook epub kindle mobi',
+    keywords='qt ebook epub kindle mobi comic cbz cbr pdf',

    packages=find_packages(),