Preliminary pdf support

Consolidate comicbook modules
Do not write to temp dir for comics any longer
This commit is contained in:
BasioMeusPuga
2018-03-16 18:46:38 +05:30
parent fc2fcb5361
commit 5b3759afe6
9 changed files with 242 additions and 289 deletions

View File

@@ -1,106 +0,0 @@
#!/usr/bin/env python3
# This file is a part of Lector, a Qt based ebook reader
# Copyright (C) 2017 BasioMeusPuga
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# TODO
# Account for files with passwords
import os
import time
import collections
from rarfile import rarfile
class ParseCBR:
def __init__(self, filename, temp_dir, file_md5):
self.filename = filename
self.book = None
self.temp_dir = temp_dir
self.file_md5 = file_md5
def read_book(self):
try:
self.book = rarfile.RarFile(self.filename)
except: # Specifying no exception types might be warranted here
print('Cannot parse ' + self.filename)
return
def get_title(self):
filename = os.path.basename(self.filename)
filename_proper = os.path.splitext(filename)[0]
return filename_proper
def get_author(self):
return None
def get_year(self):
creation_time = time.ctime(os.path.getctime(self.filename))
creation_year = creation_time.split()[-1]
return creation_year
def get_cover_image(self):
# The first image in the archive may not be the cover
# It is implied, however, that the first image in order
# will be the cover
image_list = [i.filename for i in self.book.infolist() if not i.isdir()]
image_list.sort()
cover_image_filename = image_list[0]
for i in self.book.infolist():
if not i.isdir():
if i.filename == cover_image_filename:
cover_image = self.book.read(i)
return cover_image
def get_isbn(self):
return
def get_tags(self):
return
def get_contents(self):
file_settings = {
'images_only': True}
extract_path = os.path.join(self.temp_dir, self.file_md5)
contents = []
# I'm currently choosing not to keep multiple files in memory
self.book.extractall(extract_path)
found_images = []
for i in os.walk(extract_path):
if i[2]: # Implies files were found
image_dir = i[0]
add_path_to_file = [
os.path.join(image_dir, j) for j in i[2]]
found_images.extend(add_path_to_file)
if not found_images:
print('Found nothing in ' + self.filename)
return None, file_settings
found_images.sort()
for count, i in enumerate(found_images):
page_name = 'Page ' + str(count + 1)
image_path = os.path.join(extract_path, i)
contents.append((page_name, image_path))
return contents, file_settings

View File

@@ -1,109 +0,0 @@
#!/usr/bin/env python3
# This file is a part of Lector, a Qt based ebook reader
# Copyright (C) 2017 BasioMeusPuga
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# TODO
# Account for files with passwords
import os
import time
import zipfile
import collections
class ParseCBZ:
def __init__(self, filename, temp_dir, file_md5):
self.filename = filename
self.book = None
self.temp_dir = temp_dir
self.file_md5 = file_md5
def read_book(self):
try:
self.book = zipfile.ZipFile(self.filename, mode='r', allowZip64=True)
except FileNotFoundError:
print('Invalid path for ' + self.filename)
return
except (KeyError, AttributeError, zipfile.BadZipFile):
print('Cannot parse ' + self.filename)
return
def get_title(self):
filename = os.path.basename(self.book.filename)
filename_proper = os.path.splitext(filename)[0]
return filename_proper
def get_author(self):
return None
def get_year(self):
creation_time = time.ctime(os.path.getctime(self.filename))
creation_year = creation_time.split()[-1]
return creation_year
def get_cover_image(self):
# The first image in the archive may not be the cover
# It is implied, however, that the first image in order
# will be the cover
image_list = [i.filename for i in self.book.infolist() if not i.is_dir()]
image_list.sort()
cover_image_filename = image_list[0]
for i in self.book.infolist():
if not i.is_dir():
if i.filename == cover_image_filename:
cover_image = self.book.read(i)
return cover_image
def get_isbn(self):
return
def get_tags(self):
return
def get_contents(self):
file_settings = {
'images_only': True}
extract_path = os.path.join(self.temp_dir, self.file_md5)
contents = []
# I'm currently choosing not to keep multiple files in memory
self.book.extractall(extract_path)
found_images = []
for i in os.walk(extract_path):
if i[2]: # Implies files were found
image_dir = i[0]
add_path_to_file = [
os.path.join(image_dir, j) for j in i[2]]
found_images.extend(add_path_to_file)
if not found_images:
print('Found nothing in ' + self.filename)
return None, file_settings
found_images.sort()
for count, i in enumerate(found_images):
page_name = 'Page ' + str(count + 1)
image_path = os.path.join(extract_path, i)
contents.append((page_name, image_path))
return contents, file_settings

78
parsers/comicbooks.py Normal file
View File

@@ -0,0 +1,78 @@
#!/usr/bin/env python3
# This file is a part of Lector, a Qt based ebook reader
# Copyright (C) 2017-18 BasioMeusPuga
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# TODO
# Account for files with passwords
import os
import time
import zipfile
from rarfile import rarfile
class ParseCOMIC:
def __init__(self, filename, *args):
self.filename = filename
self.book = None
self.image_list = None
self.book_extension = os.path.splitext(self.filename)
def read_book(self):
try:
if self.book_extension[1] == '.cbz':
self.book = zipfile.ZipFile(
self.filename, mode='r', allowZip64=True)
self.image_list = [i.filename for i in self.book.infolist() if not i.is_dir()]
elif self.book_extension[1] == '.cbr':
self.book = rarfile.RarFile(self.filename)
self.image_list = [i.filename for i in self.book.infolist() if not i.isdir()]
self.image_list.sort()
except: # Specifying no exception here is warranted
print('Cannot parse ' + self.filename)
return
def get_title(self):
return self.book_extension[0]
def get_author(self):
return None
def get_year(self):
creation_time = time.ctime(os.path.getctime(self.filename))
creation_year = creation_time.split()[-1]
return creation_year
def get_cover_image(self):
# The first image in the archive may not be the cover
# It is implied, however, that the first image in order
# will be the cover
return self.book.read(self.image_list[0])
def get_isbn(self):
return None
def get_tags(self):
return None
def get_contents(self):
file_settings = {'images_only': True}
contents = [(f'Page {count + 1}', i) for count, i in enumerate(self.image_list)]
return contents, file_settings

View File

@@ -28,9 +28,8 @@ class ParseEPUB:
# Maybe also include book description
self.book_ref = None
self.book = None
self.temp_dir = temp_dir
self.filename = filename
self.file_md5 = file_md5
self.extract_path = os.path.join(temp_dir, file_md5)
def read_book(self):
self.book_ref = EPUB(self.filename)
@@ -59,10 +58,9 @@ class ParseEPUB:
return self.book['tags']
def get_contents(self):
extract_path = os.path.join(self.temp_dir, self.file_md5)
zipfile.ZipFile(self.filename).extractall(extract_path)
zipfile.ZipFile(self.filename).extractall(self.extract_path)
self.book_ref.parse_chapters(temp_dir=self.temp_dir)
self.book_ref.parse_chapters(temp_dir=self.extract_path)
file_settings = {
'images_only': False}
return self.book['book_list'], file_settings

104
parsers/pdf.py Normal file
View File

@@ -0,0 +1,104 @@
#!/usr/bin/env python3
# This file is a part of Lector, a Qt based ebook reader
# Copyright (C) 2018 BasioMeusPuga
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import io
from PyQt5 import QtCore
from bs4 import BeautifulSoup
proceed = True
try:
import popplerqt5
except ImportError:
print('python-poppler-qt5 is not installed. Pdf files will not work.')
proceed = False
class ParsePDF:
def __init__(self, filename, *args):
self.filename = filename
self.book = None
self.metadata = None
def read_book(self):
if not proceed:
return
self.book = popplerqt5.Poppler.Document.load(self.filename)
if not self.book:
return
self.metadata = BeautifulSoup(self.book.metadata(), 'xml')
def get_title(self):
try:
title = self.metadata.find('title').text
return title.replace('\n', '')
except AttributeError:
return 'Unknown'
def get_author(self):
try:
author = self.metadata.find('creator').text
return author.replace('\n', '')
except AttributeError:
return 'Unknown'
def get_year(self):
try:
year = self.metadata.find('MetadataDate').text
return year.replace('\n', '')
except AttributeError:
return 9999
def get_cover_image(self):
self.book.setRenderHint(
popplerqt5.Poppler.Document.Antialiasing
and popplerqt5.Poppler.Document.TextAntialiasing)
cover_page = self.book.page(0)
cover_image = cover_page.renderToImage(300, 300)
return resize_image(cover_image)
def get_isbn(self):
return None
def get_tags(self):
try:
tags = self.metadata.find('Keywords').text
return tags.replace('\n', '')
except AttributeError:
return None
def get_contents(self):
file_settings = {'images_only': True}
contents = [(f'Page {i + 1}', i) for i in range(self.book.numPages())]
return contents, file_settings
def resize_image(cover_image):
cover_image = cover_image.scaled(
420, 600, QtCore.Qt.IgnoreAspectRatio)
byte_array = QtCore.QByteArray()
buffer = QtCore.QBuffer(byte_array)
buffer.open(QtCore.QIODevice.WriteOnly)
cover_image.save(buffer, 'jpg', 75)
cover_image_final = io.BytesIO(byte_array)
cover_image_final.seek(0)
return cover_image_final.getvalue()