Files
Lector/lector/sorter.py
BasioMeusPuga 7977bde410 Multiple fixes
2018-04-29 08:11:46 -04:00

326 lines
11 KiB
Python

# This file is a part of Lector, a Qt based ebook reader
# Copyright (C) 2017-2018 BasioMeusPuga
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# INSTRUCTIONS
# Every parser is supposed to have the following methods. None returns are not allowed.
# read_book()
# get_title()
# get_author()
# get_year()
# get_cover_image()
# get_isbn()
# get_tags()
# get_contents() - Should return a tuple with 0: TOC 1: special_settings (dict)
# Parsers for files containing only images need to return only images_only = True
# TODO
# Maybe shift to insert or replace instead of hash checking
# See if you want to include a hash of the book's name and author
# Change thread niceness
import io
import os
import sys
import time
import pickle
import hashlib
import threading
# The multiprocessing module does not work correctly on Windows
if sys.platform.startswith('win'):
from multiprocessing.dummy import Pool, Manager
else:
from multiprocessing import Pool, Manager
from PyQt5 import QtCore, QtGui
from lector import database
from lector.parsers.epub import ParseEPUB
from lector.parsers.mobi import ParseMOBI
from lector.parsers.comicbooks import ParseCOMIC
sorter = {
'epub': ParseEPUB,
'mobi': ParseMOBI,
'azw': ParseMOBI,
'azw3': ParseMOBI,
'azw4': ParseMOBI,
'prc': ParseMOBI,
'cbz': ParseCOMIC,
'cbr': ParseCOMIC}
# The following imports are for optional dependencies
try:
from lector.parsers.pdf import ParsePDF
sorter['pdf'] = ParsePDF
except ImportError:
print('python-poppler-qt5 is not installed. Pdf files will not work.')
available_parsers = [i for i in sorter]
progressbar = None # This is populated by __main__
progress_emitter = None # This is to be made into a global variable
class UpdateProgress(QtCore.QObject):
# This is for thread safety
update_signal = QtCore.pyqtSignal(int)
def connect_to_progressbar(self):
self.update_signal.connect(progressbar.setValue)
def update_progress(self, progress_percent):
self.update_signal.emit(progress_percent)
class BookSorter:
def __init__(self, file_list, mode, database_path, auto_tags=True, temp_dir=None):
# Have the GUI pass a list of files straight to here
# Then, on the basis of what is needed, pass the
# filenames to the requisite functions
# This includes getting file info for the database
# Parsing for the reader proper
# Caching upon closing
self.file_list = [i for i in file_list if os.path.exists(i)]
self.statistics = [0, (len(file_list))]
self.hashes_and_paths = {}
self.work_mode = mode[0]
self.addition_mode = mode[1]
self.database_path = database_path
self.auto_tags = auto_tags
self.temp_dir = temp_dir
if database_path:
self.database_hashes()
self.threading_completed = []
self.queue = Manager().Queue()
self.processed_books = []
if self.work_mode == 'addition':
progress_object_generator()
def database_hashes(self):
all_hashes_and_paths = database.DatabaseFunctions(
self.database_path).fetch_data(
('Hash', 'Path'),
'books',
{'Hash': ''},
'LIKE')
if all_hashes_and_paths:
# self.hashes = [i[0] for i in all_hashes]
self.hashes_and_paths = {
i[0]: i[1] for i in all_hashes_and_paths}
def database_entry_for_book(self, file_hash):
database_return = database.DatabaseFunctions(
self.database_path).fetch_data(
('Title', 'Author', 'Year', 'ISBN', 'Tags',
'Position', 'Bookmarks', 'CoverImage', 'Annotations'),
'books',
{'Hash': file_hash},
'EQUALS')[0]
book_data = []
for count, i in enumerate(database_return):
if count in (5, 6, 8): # Position, Bookmarks, and Annotations are pickled
if i:
book_data.append(pickle.loads(i))
else:
book_data.append(None)
else:
book_data.append(i)
return book_data
def read_book(self, filename):
# filename is expected as a string containg the
# full path of the ebook file
with open(filename, 'rb') as current_book:
# This should speed up addition for larger files
# without compromising the integrity of the process
first_bytes = current_book.read(1024 * 32) # First 32KB of the file
file_md5 = hashlib.md5(first_bytes).hexdigest()
# Update the progress queue
self.queue.put(filename)
# This should not get triggered in reading mode
# IF the file is NOT being loaded into the reader,
# Do not allow addition in case the file
# is already in the database and it remains at its original path
if self.work_mode == 'addition' and file_md5 in self.hashes_and_paths:
if (self.hashes_and_paths[file_md5] == filename
or os.path.exists(self.hashes_and_paths[file_md5])):
if not self.hashes_and_paths[file_md5] == filename:
print(f'{os.path.basename(filename)} is already in database')
return
file_extension = os.path.splitext(filename)[1][1:]
try:
# Get the requisite parser from the sorter dict
book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)
except KeyError:
print(filename + ' has an unsupported extension')
return
# Everything following this is standard
# None values are accounted for here
book_ref.read_book()
if book_ref.book:
this_book = {}
this_book[file_md5] = {
'hash': file_md5,
'path': filename}
# Different modes require different values
if self.work_mode == 'addition':
# Reduce the size of the incoming image
# if one is found
title = book_ref.get_title()
author = book_ref.get_author()
year = book_ref.get_year()
isbn = book_ref.get_isbn()
tags = None
if self.auto_tags:
tags = book_ref.get_tags()
cover_image_raw = book_ref.get_cover_image()
if cover_image_raw:
cover_image = resize_image(cover_image_raw)
else:
cover_image = None
this_book[file_md5]['cover_image'] = cover_image
this_book[file_md5]['addition_mode'] = self.addition_mode
if self.work_mode == 'reading':
all_content = book_ref.get_contents()
# get_contents() returns a tuple. Index 1 is a collection of
# special settings that depend on the kind of data being parsed.
# Currently, this includes:
# Only images included images_only BOOL Book contains only images
content = all_content[0]
images_only = all_content[1]['images_only']
if not content:
content = [('Invalid', 'Something went horribly wrong')]
book_data = self.database_entry_for_book(file_md5)
title = book_data[0]
author = book_data[1]
year = book_data[2]
isbn = book_data[3]
tags = book_data[4]
position = book_data[5]
bookmarks = book_data[6]
cover = book_data[7]
annotations = book_data[8]
this_book[file_md5]['position'] = position
this_book[file_md5]['bookmarks'] = bookmarks
this_book[file_md5]['content'] = content
this_book[file_md5]['images_only'] = images_only
this_book[file_md5]['cover'] = cover
this_book[file_md5]['annotations'] = annotations
this_book[file_md5]['title'] = title
this_book[file_md5]['author'] = author
this_book[file_md5]['year'] = year
this_book[file_md5]['isbn'] = isbn
this_book[file_md5]['tags'] = tags
return this_book
def read_progress(self):
while True:
processed_file = self.queue.get()
self.threading_completed.append(processed_file)
total_number = len(self.file_list)
completed_number = len(self.threading_completed)
if progress_emitter: # Skip update in reading mode
progress_emitter.update_progress(
completed_number * 100 // total_number)
if total_number == completed_number:
break
def initiate_threads(self):
if not self.file_list:
return None
def pool_creator():
_pool = Pool(5)
self.processed_books = _pool.map(
self.read_book, self.file_list)
_pool.close()
_pool.join()
start_time = time.time()
worker_thread = threading.Thread(target=pool_creator)
progress_thread = threading.Thread(target=self.read_progress)
worker_thread.start()
progress_thread.start()
worker_thread.join()
progress_thread.join(timeout=.5)
return_books = {}
# Exclude None returns generated in case of duplication / parse errors
self.processed_books = [i for i in self.processed_books if i]
for i in self.processed_books:
for j in i:
return_books[j] = i[j]
del self.processed_books
print('Finished processing in', time.time() - start_time)
return return_books
def progress_object_generator():
# This has to be kept separate from the BookSorter class because
# the QtObject inheritance disallows pickling
global progress_emitter
progress_emitter = UpdateProgress()
progress_emitter.connect_to_progressbar()
def resize_image(cover_image_raw):
cover_image = QtGui.QImage()
cover_image.loadFromData(cover_image_raw)
cover_image = cover_image.scaled(
420, 600, QtCore.Qt.IgnoreAspectRatio)
byte_array = QtCore.QByteArray()
buffer = QtCore.QBuffer(byte_array)
buffer.open(QtCore.QIODevice.WriteOnly)
cover_image.save(buffer, 'jpg', 75)
cover_image_final = io.BytesIO(byte_array)
cover_image_final.seek(0)
return cover_image_final.getvalue()