Preliminary PDF support

Consolidate comicbook modules. Do not write to temp dir for comics any longer.
2018-03-16 18:46:38 +05:30
parent fc2fcb5361
commit 5b3759afe6
9 changed files with 242 additions and 289 deletions
--- a/parsers/cbr.py
+++ b/parsers/cbr.py
@@ -1,106 +0,0 @@
-#!/usr/bin/env python3
-
-# This file is a part of Lector, a Qt based ebook reader
-# Copyright (C) 2017 BasioMeusPuga
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-# TODO
-# Account for files with passwords
-
-import os
-import time
-import collections
-from rarfile import rarfile
-
-
-class ParseCBR:
-    def __init__(self, filename, temp_dir, file_md5):
-        self.filename = filename
-        self.book = None
-        self.temp_dir = temp_dir
-        self.file_md5 = file_md5
-
-    def read_book(self):
-        try:
-            self.book = rarfile.RarFile(self.filename)
-        except:  # Specifying no exception types might be warranted here
-            print('Cannot parse ' + self.filename)
-            return
-
-    def get_title(self):
-        filename = os.path.basename(self.filename)
-        filename_proper = os.path.splitext(filename)[0]
-        return filename_proper
-
-    def get_author(self):
-        return None
-
-    def get_year(self):
-        creation_time = time.ctime(os.path.getctime(self.filename))
-        creation_year = creation_time.split()[-1]
-        return creation_year
-
-    def get_cover_image(self):
-        # The first image in the archive may not be the cover
-        # It is implied, however, that the first image in order
-        # will be the cover
-
-        image_list = [i.filename for i in self.book.infolist() if not i.isdir()]
-        image_list.sort()
-        cover_image_filename = image_list[0]
-
-        for i in self.book.infolist():
-            if not i.isdir():
-                if i.filename == cover_image_filename:
-                    cover_image = self.book.read(i)
-                    return cover_image
-
-    def get_isbn(self):
-        return
-
-    def get_tags(self):
-        return
-
-    def get_contents(self):
-        file_settings = {
-            'images_only': True}
-
-        extract_path = os.path.join(self.temp_dir, self.file_md5)
-        contents = []
-
-        # I'm currently choosing not to keep multiple files in memory
-        self.book.extractall(extract_path)
-
-        found_images = []
-        for i in os.walk(extract_path):
-            if i[2]:  # Implies files were found
-                image_dir = i[0]
-                add_path_to_file = [
-                    os.path.join(image_dir, j) for j in i[2]]
-                found_images.extend(add_path_to_file)
-
-        if not found_images:
-            print('Found nothing in ' + self.filename)
-            return None, file_settings
-
-        found_images.sort()
-
-        for count, i in enumerate(found_images):
-            page_name = 'Page ' + str(count + 1)
-            image_path = os.path.join(extract_path, i)
-
-            contents.append((page_name, image_path))
-
-        return contents, file_settings
--- a/parsers/cbz.py
+++ b/parsers/cbz.py
@@ -1,109 +0,0 @@
-#!/usr/bin/env python3
-
-# This file is a part of Lector, a Qt based ebook reader
-# Copyright (C) 2017 BasioMeusPuga
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-# TODO
-# Account for files with passwords
-
-import os
-import time
-import zipfile
-import collections
-
-
-class ParseCBZ:
-    def __init__(self, filename, temp_dir, file_md5):
-        self.filename = filename
-        self.book = None
-        self.temp_dir = temp_dir
-        self.file_md5 = file_md5
-
-    def read_book(self):
-        try:
-            self.book = zipfile.ZipFile(self.filename, mode='r', allowZip64=True)
-        except FileNotFoundError:
-            print('Invalid path for ' + self.filename)
-            return
-        except (KeyError, AttributeError, zipfile.BadZipFile):
-            print('Cannot parse ' + self.filename)
-            return
-
-    def get_title(self):
-        filename = os.path.basename(self.book.filename)
-        filename_proper = os.path.splitext(filename)[0]
-        return filename_proper
-
-    def get_author(self):
-        return None
-
-    def get_year(self):
-        creation_time = time.ctime(os.path.getctime(self.filename))
-        creation_year = creation_time.split()[-1]
-        return creation_year
-
-    def get_cover_image(self):
-        # The first image in the archive may not be the cover
-        # It is implied, however, that the first image in order
-        # will be the cover
-
-        image_list = [i.filename for i in self.book.infolist() if not i.is_dir()]
-        image_list.sort()
-        cover_image_filename = image_list[0]
-
-        for i in self.book.infolist():
-            if not i.is_dir():
-                if i.filename == cover_image_filename:
-                    cover_image = self.book.read(i)
-                    return cover_image
-
-    def get_isbn(self):
-        return
-
-    def get_tags(self):
-        return
-
-    def get_contents(self):
-        file_settings = {
-            'images_only': True}
-
-        extract_path = os.path.join(self.temp_dir, self.file_md5)
-        contents = []
-
-        # I'm currently choosing not to keep multiple files in memory
-        self.book.extractall(extract_path)
-
-        found_images = []
-        for i in os.walk(extract_path):
-            if i[2]:  # Implies files were found
-                image_dir = i[0]
-                add_path_to_file = [
-                    os.path.join(image_dir, j) for j in i[2]]
-                found_images.extend(add_path_to_file)
-
-        if not found_images:
-            print('Found nothing in ' + self.filename)
-            return None, file_settings
-
-        found_images.sort()
-
-        for count, i in enumerate(found_images):
-            page_name = 'Page ' + str(count + 1)
-            image_path = os.path.join(extract_path, i)
-
-            contents.append((page_name, image_path))
-
-        return contents, file_settings
--- a/parsers/comicbooks.py
+++ b/parsers/comicbooks.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+
+# This file is a part of Lector, a Qt based ebook reader
+# Copyright (C) 2017-18 BasioMeusPuga
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# TODO
+# Account for files with passwords
+
+import os
+import time
+import zipfile
+from rarfile import rarfile
+
+
+class ParseCOMIC:
+    def __init__(self, filename, *args):
+        self.filename = filename
+        self.book = None
+        self.image_list = None
+        self.book_extension = os.path.splitext(self.filename)
+
+    def read_book(self):
+        try:
+            if self.book_extension[1] == '.cbz':
+                self.book = zipfile.ZipFile(
+                    self.filename, mode='r', allowZip64=True)
+                self.image_list = [i.filename for i in self.book.infolist() if not i.is_dir()]
+
+            elif self.book_extension[1] == '.cbr':
+                self.book = rarfile.RarFile(self.filename)
+                self.image_list = [i.filename for i in self.book.infolist() if not i.isdir()]
+
+            self.image_list.sort()
+        except:  # Specifying no exception here is warranted
+            print('Cannot parse ' + self.filename)
+            return
+
+    def get_title(self):
+        return self.book_extension[0]
+
+    def get_author(self):
+        return None
+
+    def get_year(self):
+        creation_time = time.ctime(os.path.getctime(self.filename))
+        creation_year = creation_time.split()[-1]
+        return creation_year
+
+    def get_cover_image(self):
+        # The first image in the archive may not be the cover
+        # It is implied, however, that the first image in order
+        # will be the cover
+        return self.book.read(self.image_list[0])
+
+    def get_isbn(self):
+        return None
+
+    def get_tags(self):
+        return None
+
+    def get_contents(self):
+        file_settings = {'images_only': True}
+        contents = [(f'Page {count + 1}', i) for count, i in enumerate(self.image_list)]
+
+        return contents, file_settings
--- a/parsers/epub.py
+++ b/parsers/epub.py
@@ -28,9 +28,8 @@ class ParseEPUB:
        # Maybe also include book description
        self.book_ref = None
        self.book = None
-        self.temp_dir = temp_dir
        self.filename = filename
-        self.file_md5 = file_md5
+        self.extract_path = os.path.join(temp_dir, file_md5)

    def read_book(self):
        self.book_ref = EPUB(self.filename)
@@ -59,10 +58,9 @@ class ParseEPUB:
        return self.book['tags']

    def get_contents(self):
-        extract_path = os.path.join(self.temp_dir, self.file_md5)
-        zipfile.ZipFile(self.filename).extractall(extract_path)
+        zipfile.ZipFile(self.filename).extractall(self.extract_path)

-        self.book_ref.parse_chapters(temp_dir=self.temp_dir)
+        self.book_ref.parse_chapters(temp_dir=self.extract_path)
        file_settings = {
            'images_only': False}
        return self.book['book_list'], file_settings
--- a/parsers/pdf.py
+++ b/parsers/pdf.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+
+# This file is a part of Lector, a Qt based ebook reader
+# Copyright (C) 2018 BasioMeusPuga
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import io
+from PyQt5 import QtCore
+from bs4 import BeautifulSoup
+
+proceed = True
+try:
+    import popplerqt5
+except ImportError:
+    print('python-poppler-qt5 is not installed. Pdf files will not work.')
+    proceed = False
+
+class ParsePDF:
+    def __init__(self, filename, *args):
+        self.filename = filename
+        self.book = None
+        self.metadata = None
+
+    def read_book(self):
+        if not proceed:
+            return
+
+        self.book = popplerqt5.Poppler.Document.load(self.filename)
+        if not self.book:
+            return
+
+        self.metadata = BeautifulSoup(self.book.metadata(), 'xml')
+
+    def get_title(self):
+        try:
+            title = self.metadata.find('title').text
+            return title.replace('\n', '')
+        except AttributeError:
+            return 'Unknown'
+
+    def get_author(self):
+        try:
+            author = self.metadata.find('creator').text
+            return author.replace('\n', '')
+        except AttributeError:
+            return 'Unknown'
+
+    def get_year(self):
+        try:
+            year = self.metadata.find('MetadataDate').text
+            return year.replace('\n', '')
+        except AttributeError:
+            return 9999
+
+    def get_cover_image(self):
+        self.book.setRenderHint(
+            popplerqt5.Poppler.Document.Antialiasing
+            and popplerqt5.Poppler.Document.TextAntialiasing)
+
+        cover_page = self.book.page(0)
+        cover_image = cover_page.renderToImage(300, 300)
+        return resize_image(cover_image)
+
+    def get_isbn(self):
+        return None
+
+    def get_tags(self):
+        try:
+            tags = self.metadata.find('Keywords').text
+            return tags.replace('\n', '')
+        except AttributeError:
+            return None
+
+    def get_contents(self):
+        file_settings = {'images_only': True}
+        contents = [(f'Page {i + 1}', i) for i in range(self.book.numPages())]
+
+        return contents, file_settings
+
+
+def resize_image(cover_image):
+    cover_image = cover_image.scaled(
+        420, 600, QtCore.Qt.IgnoreAspectRatio)
+
+    byte_array = QtCore.QByteArray()
+    buffer = QtCore.QBuffer(byte_array)
+    buffer.open(QtCore.QIODevice.WriteOnly)
+    cover_image.save(buffer, 'jpg', 75)
+
+    cover_image_final = io.BytesIO(byte_array)
+    cover_image_final.seek(0)
+    return cover_image_final.getvalue()