Cleanup parsers

This commit is contained in:
BasioMeusPuga
2019-02-10 09:03:12 +05:30
parent c6e30b67ad
commit f6f9d01060
8 changed files with 271 additions and 316 deletions

1
TODO
View File

@@ -98,6 +98,7 @@ TODO
Better recursion needed for fb2 toc
Initial sort by author in tableview
Last column not filling up tableview
Comic view mode changing does not work for newly added books
Secondary:
Tab tooltip

View File

@@ -21,6 +21,7 @@ import os
import time
import logging
import zipfile
import collections
from lector.rarfile import rarfile
@@ -35,7 +36,6 @@ class ParseCOMIC:
self.book_extension = os.path.splitext(self.filename)
def read_book(self):
try:
if self.book_extension[1] == '.cbz':
self.book = zipfile.ZipFile(
self.filename, mode='r', allowZip64=True)
@@ -50,39 +50,22 @@ class ParseCOMIC:
if not i.isdir() and is_image(i.filename)]
self.image_list.sort()
if not self.image_list:
return False
return True
except: # Specifying no exception here is warranted
return False
def get_title(self):
def generate_metadata(self):
title = os.path.basename(self.book_extension[0]).strip(' ')
return title
author = '<Unknown>'
isbn = None
tags = []
cover = self.book.read(self.image_list[0])
def get_author(self):
return 'Unknown'
def get_year(self):
creation_time = time.ctime(os.path.getctime(self.filename))
creation_year = creation_time.split()[-1]
return creation_year
year = creation_time.split()[-1]
def get_cover_image(self):
# The first image in the archive may not be the cover
# It is implied, however, that the first image in order
# will be the cover
return self.book.read(self.image_list[0])
Metadata = collections.namedtuple(
'Metadata', ['title', 'author', 'year', 'isbn', 'tags', 'cover'])
return Metadata(title, author, year, isbn, tags, cover)
def get_isbn(self):
return None
def get_tags(self):
return None
def get_contents(self):
def generate_content(self):
image_number = len(self.image_list)
toc = [(1, f'Page {i + 1}', i + 1) for i in range(image_number)]

View File

@@ -14,6 +14,9 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# TODO
# Maybe also include book description
import os
import zipfile
import logging
@@ -25,47 +28,27 @@ logger = logging.getLogger(__name__)
class ParseEPUB:
def __init__(self, filename, temp_dir, file_md5):
# TODO
# Maybe also include book description
self.book_ref = None
self.book = None
self.temp_dir = temp_dir
self.filename = filename
self.temp_dir = temp_dir
self.extract_path = os.path.join(temp_dir, file_md5)
def read_book(self):
self.book_ref = EPUB(self.filename, self.temp_dir)
self.book_ref.generate_metadata()
self.book = self.book_ref.book
return True
self.book = EPUB(self.filename, self.temp_dir)
def get_title(self):
return self.book['title']
def generate_metadata(self):
self.book.generate_metadata()
return self.book.metadata
def get_author(self):
return self.book['author']
def get_year(self):
return self.book['year']
def get_cover_image(self):
return self.book['cover']
def get_isbn(self):
return self.book['isbn']
def get_tags(self):
return self.book['tags']
def get_contents(self):
def generate_content(self):
zipfile.ZipFile(self.filename).extractall(self.extract_path)
self.book_ref.generate_toc()
self.book_ref.generate_content()
self.book.generate_toc()
self.book.generate_content()
toc = []
content = []
for count, i in enumerate(self.book['content']):
for count, i in enumerate(self.book.content):
toc.append((i[0], i[1], count + 1))
content.append(i[2])

View File

@@ -14,6 +14,9 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# TODO
# Maybe also include book description
import os
import logging
@@ -24,46 +27,24 @@ logger = logging.getLogger(__name__)
class ParseFB2:
def __init__(self, filename, temp_dir, file_md5):
# TODO
# Maybe also include book description
self.book_ref = None
self.book = None
self.filename = filename
self.extract_path = os.path.join(temp_dir, file_md5)
def read_book(self):
self.book_ref = FB2(self.filename)
contents_found = self.book_ref.read_fb2()
if not contents_found:
return False
self.book = self.book_ref.book
return True
self.book = FB2(self.filename)
def get_title(self):
return self.book['title']
def generate_metadata(self):
self.book.generate_metadata()
return self.book.metadata
def get_author(self):
return self.book['author']
def get_year(self):
return self.book['year']
def get_cover_image(self):
return self.book['cover']
def get_isbn(self):
return self.book['isbn']
def get_tags(self):
return self.book['tags']
def get_contents(self):
def generate_content(self):
os.makedirs(self.extract_path, exist_ok=True) # Manual creation is required here
self.book_ref.parse_chapters(temp_dir=self.extract_path)
self.book.generate_content(temp_dir=self.extract_path)
toc = []
content = []
for count, i in enumerate(self.book['book_list']):
for count, i in enumerate(self.book.content):
toc.append((i[0], i[1], count + 1))
content.append(i[2])

View File

@@ -14,11 +14,8 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# TODO
# Error handling
# TOC parsing
import os
import collections
import fitz
from PyQt5 import QtGui
@@ -36,43 +33,39 @@ class ParsePDF:
except RuntimeError:
return False
def get_title(self):
def generate_metadata(self):
title = self.book.metadata['title']
if not title:
title = os.path.splitext(os.path.basename(self.filename))[0]
return title
def get_author(self):
author = self.book.metadata['author']
if not author:
author = 'Unknown'
return author
def get_year(self):
creation_date = self.book.metadata['creationDate']
try:
year = creation_date.split(':')[1][:4]
except (ValueError, AttributeError):
year = 9999
return year
def get_cover_image(self):
isbn = None
tags = self.book.metadata['keywords']
if not tags:
tags = []
# This is a little roundabout for the cover
# and I'm sure it's taking a performance hit
# But it is simple. So there's that.
cover_page = self.book.loadPage(0)
# Disabling scaling gets the covers much faster
return render_pdf_page(cover_page, True)
cover = render_pdf_page(cover_page, True)
def get_isbn(self):
return None
Metadata = collections.namedtuple(
'Metadata', ['title', 'author', 'year', 'isbn', 'tags', 'cover'])
return Metadata(title, author, year, isbn, tags, cover)
def get_tags(self):
tags = self.book.metadata['keywords']
return tags # Fine if it returns None
def get_contents(self):
def generate_content(self):
content = list(range(self.book.pageCount))
toc = self.book.getToC()
if not toc:

View File

@@ -37,32 +37,17 @@ class EPUB:
def __init__(self, book_filename, temp_dir):
self.book_filename = book_filename
self.temp_dir = temp_dir
self.zip_file = None
self.file_list = None
self.opf_dict = None
self.book = {}
self.split_chapters = {}
self.metadata = None
self.content = []
self.generate_references()
def find_file(self, filename):
# Get rid of special characters
filename = unquote(filename)
# First, look for the file in the root of the book
if filename in self.file_list:
return filename
# Then search for it elsewhere
else:
file_basename = os.path.basename(filename)
for i in self.file_list:
if os.path.basename(i) == file_basename:
return i
# If the file isn't found
logging.error(filename + ' not found in ' + self.book_filename)
return False
def generate_references(self):
self.zip_file = zipfile.ZipFile(
self.book_filename, mode='r', allowZip64=True)
@@ -88,9 +73,26 @@ class EPUB:
packagefile_data = self.zip_file.read(packagefile)
self.opf_dict = xmltodict.parse(packagefile_data)
def generate_toc(self):
self.book['content'] = []
def find_file(self, filename):
# Get rid of special characters
filename = unquote(filename)
# First, look for the file in the root of the book
if filename in self.file_list:
return filename
# Then search for it elsewhere
else:
file_basename = os.path.basename(filename)
for i in self.file_list:
if os.path.basename(i) == file_basename:
return i
# If the file isn't found
logging.error(filename + ' not found in ' + self.book_filename)
return False
def generate_toc(self):
def find_alternative_toc():
toc_filename = None
toc_filename_alternative = None
@@ -134,14 +136,14 @@ class EPUB:
level + 1,
i['navLabel']['text'],
i['content']['@src']] for i in nav_node]
self.book['content'].extend(these_contents)
self.content.extend(these_contents)
return
if 'navPoint' in nav_node.keys():
recursor(level, nav_node['navPoint'])
else:
self.book['content'].append([
self.content.append([
level + 1,
nav_node['navLabel']['text'],
nav_node['content']['@src']])
@@ -150,14 +152,14 @@ class EPUB:
for top_level_nav in navpoints:
# Just one chapter
if isinstance(top_level_nav, str):
self.book['content'].append([
self.content.append([
1,
navpoints['navLabel']['text'],
navpoints['content']['@src']])
break
# Multiple chapters
self.book['content'].append([
self.content.append([
1,
top_level_nav['navLabel']['text'],
top_level_nav['content']['@src']])
@@ -183,14 +185,12 @@ class EPUB:
return 'Possible parse error: ' + chapter_file
def parse_split_chapters(self, chapters_with_split_content):
self.book['split_chapters'] = {}
# For split chapters, get the whole chapter first, then split
# between ids using their anchors, then "heal" the resultant text
# by creating a BeautifulSoup object. Write its str to the content
for i in chapters_with_split_content.items():
chapter_file = i[0]
self.book['split_chapters'][chapter_file] = {}
self.split_chapters[chapter_file] = {}
chapter_content = self.get_chapter_content(chapter_file)
soup = BeautifulSoup(chapter_content, 'lxml')
@@ -208,10 +208,10 @@ class EPUB:
if this_tag:
this_markup = BeautifulSoup(
str(this_tag).strip() + markup_split[1], 'lxml')
self.book['split_chapters'][chapter_file][this_anchor] = str(this_markup)
self.split_chapters[chapter_file][this_anchor] = str(this_markup)
# Remaining markup is assigned here
self.book['split_chapters'][chapter_file]['top_level'] = str(soup)
self.split_chapters[chapter_file]['top_level'] = str(soup)
def generate_content(self):
# Find all the chapters mentioned in the opf spine
@@ -238,7 +238,7 @@ class EPUB:
chapter_title = 1
toc_chapters = [
unquote(i[2].split('#')[0]) for i in self.book['content']]
unquote(i[2].split('#')[0]) for i in self.content]
last_valid_index = -2 # Yes, but why?
for i in spine_final:
@@ -251,7 +251,7 @@ class EPUB:
except ValueError:
last_valid_index += 1
self.book['content'].insert(
self.content.insert(
last_valid_index + 1,
[1, str(chapter_title), i])
chapter_title += 1
@@ -259,7 +259,7 @@ class EPUB:
# Parse split chapters as below
# They can be picked up during the iteration through the toc
chapters_with_split_content = {}
for i in self.book['content']:
for i in self.content:
if '#' in i[2]:
this_split = i[2].split('#')
chapter = this_split[0]
@@ -278,8 +278,7 @@ class EPUB:
# In case a split chapter is encountered, get its content
# from the split_chapters dictionary
# What could possibly go wrong?
split_chapters = self.book['split_chapters']
toc_copy = self.book['content'][:]
toc_copy = self.content[:]
# Put the book into the book
for count, i in enumerate(toc_copy):
@@ -293,7 +292,7 @@ class EPUB:
try:
chapter_content = (
split_chapters[chapter_file_proper][this_anchor])
self.split_chapters[chapter_file_proper][this_anchor])
except KeyError:
chapter_content = 'Parse Error'
error_string = (
@@ -301,9 +300,9 @@ class EPUB:
logger.error(error_string)
# Get content that remained at the end of the pillaging above
elif chapter_file in split_chapters.keys():
elif chapter_file in self.split_chapters.keys():
try:
chapter_content = split_chapters[chapter_file]['top_level']
chapter_content = self.split_chapters[chapter_file]['top_level']
except KeyError:
chapter_content = 'Parse Error'
error_string = (
@@ -314,26 +313,26 @@ class EPUB:
else:
chapter_content = self.get_chapter_content(chapter_file)
self.book['content'][count][2] = chapter_content
self.content[count][2] = chapter_content
# Cleanup content by removing null chapters
self.book['content'] = [
i for i in self.book['content'] if i[2]]
self.content = [
i for i in self.content if i[2]]
self.generate_book_cover()
if self.book['cover']:
cover_image = self.generate_book_cover()
if cover_image:
cover_path = os.path.join(
self.temp_dir, os.path.basename(self.book_filename)) + '- cover'
with open(cover_path, 'wb') as cover_temp:
cover_temp.write(self.book['cover'])
cover_temp.write(cover_image)
# There's probably some rationale to doing an insert here
# But a replacement seems... neater
self.book['content'].insert(
self.content.insert(
0, (1, 'Cover', f'<center><img src="{cover_path}" alt="Cover"></center>'))
def generate_metadata(self):
metadata = self.opf_dict['package']['metadata']
book_metadata = self.opf_dict['package']['metadata']
def flattener(this_object):
if isinstance(this_object, collections.OrderedDict):
@@ -354,67 +353,76 @@ class EPUB:
# Book title
try:
self.book['title'] = flattener(metadata['dc:title'])
title = flattener(book_metadata['dc:title'])
except:
self.book['title'] = os.path.splitext(
logger.warning('Title not found: ' + self.book_filename)
title = os.path.splitext(
os.path.basename(self.book_filename))[0]
# Book author
try:
self.book['author'] = flattener(metadata['dc:creator'])
author = flattener(book_metadata['dc:creator'])
except:
self.book['author'] = 'Unknown'
logger.warning('Author not found: ' + self.book_filename)
author = 'Unknown'
# Book year
try:
self.book['year'] = int(flattener(metadata['dc:date'])[:4])
year = int(flattener(book_metadata['dc:date'])[:4])
except:
self.book['year'] = 9999
logger.warning('Year not found: ' + self.book_filename)
year = 9999
# Book isbn
# Both one and multiple schema
self.book['isbn'] = None
isbn = None
try:
scheme = metadata['dc:identifier']['@opf:scheme'].lower()
scheme = book_metadata['dc:identifier']['@opf:scheme'].lower()
if scheme.lower() == 'isbn':
self.book['isbn'] = metadata['dc:identifier']['#text']
isbn = book_metadata['dc:identifier']['#text']
except (TypeError, KeyError):
try:
for i in metadata['dc:identifier']:
for i in book_metadata['dc:identifier']:
if i['@opf:scheme'].lower() == 'isbn':
self.book['isbn'] = i['#text']
isbn = i['#text']
break
except:
logger.warning('ISBN not found: ' + self.book_filename)
pass
# Book tags
try:
self.book['tags'] = metadata['dc:subject']
if isinstance(self.book['tags'], str):
self.book['tags'] = [self.book['tags']]
tags = book_metadata['dc:subject']
if isinstance(tags, str):
tags = [tags]
except:
self.book['tags'] = []
tags = []
# Book cover
self.generate_book_cover()
cover = self.generate_book_cover()
# Named tuple? Named tuple.
Metadata = collections.namedtuple(
'Metadata', ['title', 'author', 'year', 'isbn', 'tags', 'cover'])
self.metadata = Metadata(title, author, year, isbn, tags, cover)
def generate_book_cover(self):
# This is separate because the book cover needs to
# be found and extracted both during addition / reading
self.book['cover'] = None
book_cover = None
try:
cover_image = [
i['@href'] for i in self.opf_dict['package']['manifest']['item']
if i['@media-type'].split('/')[0] == 'image' and
'cover' in i['@id']][0]
self.book['cover'] = self.zip_file.read(
self.find_file(cover_image))
book_cover = self.zip_file.read(self.find_file(cover_image))
except:
pass
# Find book cover the hard way
if not self.book['cover']:
if not book_cover:
biggest_image_size = 0
biggest_image = None
for j in self.zip_file.filelist:
@@ -424,5 +432,10 @@ class EPUB:
biggest_image_size = j.file_size
if biggest_image:
self.book['cover'] = self.zip_file.read(
book_cover = self.zip_file.read(
self.find_file(biggest_image))
if not book_cover:
logger.warning('Cover not found: ' + self.book_filename)
return book_cover

View File

@@ -18,6 +18,7 @@ import os
import base64
import zipfile
import logging
import collections
from bs4 import BeautifulSoup
@@ -28,11 +29,14 @@ class FB2:
def __init__(self, filename):
self.filename = filename
self.zip_file = None
self.book = {}
self.xml = None
def read_fb2(self):
try:
self.metadata = None
self.content = []
self.generate_references()
def generate_references(self):
if self.filename.endswith('.fb2.zip'):
this_book = zipfile.ZipFile(
self.filename, mode='r', allowZip64=True)
@@ -40,58 +44,44 @@ class FB2:
if os.path.splitext(i.filename)[1] == '.fb2':
book_text = this_book.read(i.filename)
break
else:
with open(self.filename, 'r') as book_file:
book_text = book_file.read()
self.xml = BeautifulSoup(book_text, 'lxml')
self.generate_book_metadata()
except: # Not specifying an exception type here may be justified
return False
return True
def generate_book_metadata(self):
self.book['isbn'] = None
self.book['tags'] = None
self.book['book_list'] = []
def generate_metadata(self):
# All metadata can be parsed in one pass
all_tags = self.xml.find('description')
self.book['title'] = all_tags.find('book-title').text
if self.book['title'] == '' or self.book['title'] is None:
self.book['title'] = os.path.splitext(
title = all_tags.find('book-title').text
if title == '' or title is None:
title = os.path.splitext(
os.path.basename(self.filename))[0]
self.book['author'] = all_tags.find(
author = all_tags.find(
'author').getText(separator=' ').replace('\n', ' ')
if self.book['author'] == '' or self.book['author'] is None:
self.book['author'] = 'Unknown'
if author == '' or author is None:
author = '<Unknown>'
# TODO
# Account for other date formats
try:
self.book['year'] = int(all_tags.find('date').text)
year = int(all_tags.find('date').text)
except ValueError:
self.book['year'] = 9999
year = 9999
# Cover Image
try:
cover_image_xml = self.xml.find('coverpage')
for i in cover_image_xml:
cover_image_name = i.get('l:href')
isbn = None
tags = None
cover_image_data = self.xml.find_all('binary')
for i in cover_image_data:
if cover_image_name.endswith(i.get('id')):
self.book['cover'] = base64.decodebytes(i.text.encode())
except (AttributeError, TypeError):
# Catch TypeError in case no images exist in the book
logger.error('No cover found for: ' + self.filename)
self.book['cover'] = None
cover = self.generate_book_cover()
def parse_chapters(self, temp_dir):
Metadata = collections.namedtuple(
'Metadata', ['title', 'author', 'year', 'isbn', 'tags', 'cover'])
self.metadata = Metadata(title, author, year, isbn, tags, cover)
def generate_content(self, temp_dir):
# TODO
# Check what's up with recursion levels
# Why is the TypeError happening in get_title
@@ -114,7 +104,7 @@ class FB2:
children = element.findChildren('section', recursive=False)
if not children and level != 1:
this_title, title_xml = get_title(element)
self.book['book_list'].append(
self.content.append(
[level, this_title, title_xml + str(element)])
else:
for i in children:
@@ -134,7 +124,7 @@ class FB2:
if section_children:
chapter_text = this_title
self.book['book_list'].append([1, this_title, chapter_text])
self.content.append([1, this_title, chapter_text])
recursor(1, this_element)
# Extract all images to the temp_dir
@@ -144,7 +134,7 @@ class FB2:
image_string = f'<image l:href="#{image_name}"'
replacement_string = f'<p></p><img src=\"{image_path}\"'
for j in self.book['book_list']:
for j in self.content:
j[2] = j[2].replace(
image_string, replacement_string)
try:
@@ -155,9 +145,30 @@ class FB2:
pass
# Insert the book cover at the beginning
if self.book['cover']:
cover_path = os.path.join(temp_dir, 'cover')
with open(cover_path, 'wb') as outimage:
outimage.write(self.book['cover'])
self.book['book_list'].insert(
cover_image = self.generate_book_cover()
if cover_image:
cover_path = os.path.join(
temp_dir, os.path.basename(self.filename)) + '- cover'
with open(cover_path, 'wb') as cover_temp:
cover_temp.write(cover_image)
self.content.insert(
0, (1, 'Cover', f'<center><img src="{cover_path}" alt="Cover"></center>'))
def generate_book_cover(self):
cover = None
try:
cover_image_xml = self.xml.find('coverpage')
for i in cover_image_xml:
cover_image_name = i.get('l:href')
cover_image_data = self.xml.find_all('binary')
for i in cover_image_data:
if cover_image_name.endswith(i.get('id')):
cover = base64.decodebytes(i.text.encode())
except (AttributeError, TypeError):
# Catch TypeError in case no images exist in the book
logger.warning('Cover not found: ' + self.filename)
return cover

View File

@@ -16,20 +16,9 @@
# INSTRUCTIONS
# Every parser is supposed to have the following methods. None returns are not allowed.
# read_book()
# get_title()
# get_author()
# get_year()
# get_cover_image()
# get_isbn()
# get_tags()
# get_contents() - Should return a tuple with 0: TOC 1: special_settings (dict)
# Parsers for files containing only images need to return only images_only = True
# TODO
# Maybe shift to insert or replace instead of hash checking
# See if you want to include a hash of the book's name and author
# Change thread niceness
# read_book() - Initialize book
# generate_metadata() - For addition
# generate_content() - For reading
import io
import os
@@ -211,22 +200,17 @@ class BookSorter:
break
if not valid_extension:
logger.error(filename + ' has an unsupported extension')
logger.error('Unsupported extension: ' + filename)
return
book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)
# Everything following this is standard
# None values are accounted for here
is_valid = book_ref.read_book()
if not is_valid:
logger.error('Cannot parse:' + filename)
try:
book_ref.read_book()
except:
logger.error('Error initializing: ' + filename)
return
if book_ref.book:
# TODO
# For the love of God clean this up. It's junk.
this_book = {}
this_book[file_md5] = {
'hash': file_md5,
@@ -234,18 +218,22 @@ class BookSorter:
# Different modes require different values
if self.work_mode == 'addition':
# Reduce the size of the incoming image
# if one is found
title = book_ref.get_title()
author = book_ref.get_author()
year = book_ref.get_year()
isbn = book_ref.get_isbn()
try:
metadata = book_ref.generate_metadata()
except:
logger.error('Metadata generation error: ' + filename)
return
title = metadata.title
author = metadata.author
year = metadata.year
isbn = metadata.isbn
tags = None
if self.auto_tags:
tags = book_ref.get_tags()
tags = metadata.tags
cover_image_raw = book_ref.get_cover_image()
cover_image_raw = metadata.cover
if cover_image_raw:
cover_image = resize_image(cover_image_raw)
else:
@@ -258,9 +246,11 @@ class BookSorter:
this_book[file_md5]['addition_mode'] = self.addition_mode
if self.work_mode == 'reading':
# All books must return the following list
# Indices are as described below
book_breakdown = book_ref.get_contents()
try:
book_breakdown = book_ref.generate_content()
except:
logger.error('Content generation error: ' + filename)
return
toc = book_breakdown[0]
content = book_breakdown[1]