Cleanup parsers

This commit is contained in:
BasioMeusPuga
2019-02-10 09:03:12 +05:30
parent c6e30b67ad
commit f6f9d01060
8 changed files with 271 additions and 316 deletions

1
TODO
View File

@@ -98,6 +98,7 @@ TODO
Better recursion needed for fb2 toc Better recursion needed for fb2 toc
Initial sort by author in tableview Initial sort by author in tableview
Last column not filling up tableview Last column not filling up tableview
Comic view mode changing does not work for newly added books
Secondary: Secondary:
Tab tooltip Tab tooltip

View File

@@ -21,6 +21,7 @@ import os
import time import time
import logging import logging
import zipfile import zipfile
import collections
from lector.rarfile import rarfile from lector.rarfile import rarfile
@@ -35,54 +36,36 @@ class ParseCOMIC:
self.book_extension = os.path.splitext(self.filename) self.book_extension = os.path.splitext(self.filename)
def read_book(self): def read_book(self):
try: if self.book_extension[1] == '.cbz':
if self.book_extension[1] == '.cbz': self.book = zipfile.ZipFile(
self.book = zipfile.ZipFile( self.filename, mode='r', allowZip64=True)
self.filename, mode='r', allowZip64=True) self.image_list = [
self.image_list = [ i.filename for i in self.book.infolist()
i.filename for i in self.book.infolist() if not i.is_dir() and is_image(i.filename)]
if not i.is_dir() and is_image(i.filename)]
elif self.book_extension[1] == '.cbr': elif self.book_extension[1] == '.cbr':
self.book = rarfile.RarFile(self.filename) self.book = rarfile.RarFile(self.filename)
self.image_list = [ self.image_list = [
i.filename for i in self.book.infolist() i.filename for i in self.book.infolist()
if not i.isdir() and is_image(i.filename)] if not i.isdir() and is_image(i.filename)]
self.image_list.sort() self.image_list.sort()
if not self.image_list:
return False
return True def generate_metadata(self):
except: # Specifying no exception here is warranted
return False
def get_title(self):
title = os.path.basename(self.book_extension[0]).strip(' ') title = os.path.basename(self.book_extension[0]).strip(' ')
return title author = '<Unknown>'
isbn = None
tags = []
cover = self.book.read(self.image_list[0])
def get_author(self):
return 'Unknown'
def get_year(self):
creation_time = time.ctime(os.path.getctime(self.filename)) creation_time = time.ctime(os.path.getctime(self.filename))
creation_year = creation_time.split()[-1] year = creation_time.split()[-1]
return creation_year
def get_cover_image(self): Metadata = collections.namedtuple(
# The first image in the archive may not be the cover 'Metadata', ['title', 'author', 'year', 'isbn', 'tags', 'cover'])
# It is implied, however, that the first image in order return Metadata(title, author, year, isbn, tags, cover)
# will be the cover
return self.book.read(self.image_list[0])
def get_isbn(self): def generate_content(self):
return None
def get_tags(self):
return None
def get_contents(self):
image_number = len(self.image_list) image_number = len(self.image_list)
toc = [(1, f'Page {i + 1}', i + 1) for i in range(image_number)] toc = [(1, f'Page {i + 1}', i + 1) for i in range(image_number)]

View File

@@ -14,6 +14,9 @@
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
# TODO
# Maybe also include book description
import os import os
import zipfile import zipfile
import logging import logging
@@ -25,47 +28,27 @@ logger = logging.getLogger(__name__)
class ParseEPUB: class ParseEPUB:
def __init__(self, filename, temp_dir, file_md5): def __init__(self, filename, temp_dir, file_md5):
# TODO
# Maybe also include book description
self.book_ref = None
self.book = None self.book = None
self.temp_dir = temp_dir
self.filename = filename self.filename = filename
self.temp_dir = temp_dir
self.extract_path = os.path.join(temp_dir, file_md5) self.extract_path = os.path.join(temp_dir, file_md5)
def read_book(self): def read_book(self):
self.book_ref = EPUB(self.filename, self.temp_dir) self.book = EPUB(self.filename, self.temp_dir)
self.book_ref.generate_metadata()
self.book = self.book_ref.book
return True
def get_title(self): def generate_metadata(self):
return self.book['title'] self.book.generate_metadata()
return self.book.metadata
def get_author(self): def generate_content(self):
return self.book['author']
def get_year(self):
return self.book['year']
def get_cover_image(self):
return self.book['cover']
def get_isbn(self):
return self.book['isbn']
def get_tags(self):
return self.book['tags']
def get_contents(self):
zipfile.ZipFile(self.filename).extractall(self.extract_path) zipfile.ZipFile(self.filename).extractall(self.extract_path)
self.book_ref.generate_toc() self.book.generate_toc()
self.book_ref.generate_content() self.book.generate_content()
toc = [] toc = []
content = [] content = []
for count, i in enumerate(self.book['content']): for count, i in enumerate(self.book.content):
toc.append((i[0], i[1], count + 1)) toc.append((i[0], i[1], count + 1))
content.append(i[2]) content.append(i[2])

View File

@@ -14,6 +14,9 @@
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
# TODO
# Maybe also include book description
import os import os
import logging import logging
@@ -24,46 +27,24 @@ logger = logging.getLogger(__name__)
class ParseFB2: class ParseFB2:
def __init__(self, filename, temp_dir, file_md5): def __init__(self, filename, temp_dir, file_md5):
# TODO
# Maybe also include book description
self.book_ref = None
self.book = None self.book = None
self.filename = filename self.filename = filename
self.extract_path = os.path.join(temp_dir, file_md5) self.extract_path = os.path.join(temp_dir, file_md5)
def read_book(self): def read_book(self):
self.book_ref = FB2(self.filename) self.book = FB2(self.filename)
contents_found = self.book_ref.read_fb2()
if not contents_found:
return False
self.book = self.book_ref.book
return True
def get_title(self): def generate_metadata(self):
return self.book['title'] self.book.generate_metadata()
return self.book.metadata
def get_author(self): def generate_content(self):
return self.book['author']
def get_year(self):
return self.book['year']
def get_cover_image(self):
return self.book['cover']
def get_isbn(self):
return self.book['isbn']
def get_tags(self):
return self.book['tags']
def get_contents(self):
os.makedirs(self.extract_path, exist_ok=True) # Manual creation is required here os.makedirs(self.extract_path, exist_ok=True) # Manual creation is required here
self.book_ref.parse_chapters(temp_dir=self.extract_path) self.book.generate_content(temp_dir=self.extract_path)
toc = [] toc = []
content = [] content = []
for count, i in enumerate(self.book['book_list']): for count, i in enumerate(self.book.content):
toc.append((i[0], i[1], count + 1)) toc.append((i[0], i[1], count + 1))
content.append(i[2]) content.append(i[2])

View File

@@ -14,11 +14,8 @@
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
# TODO
# Error handling
# TOC parsing
import os import os
import collections
import fitz import fitz
from PyQt5 import QtGui from PyQt5 import QtGui
@@ -36,43 +33,39 @@ class ParsePDF:
except RuntimeError: except RuntimeError:
return False return False
def get_title(self): def generate_metadata(self):
title = self.book.metadata['title'] title = self.book.metadata['title']
if not title: if not title:
title = os.path.splitext(os.path.basename(self.filename))[0] title = os.path.splitext(os.path.basename(self.filename))[0]
return title
def get_author(self):
author = self.book.metadata['author'] author = self.book.metadata['author']
if not author: if not author:
author = 'Unknown' author = 'Unknown'
return author
def get_year(self):
creation_date = self.book.metadata['creationDate'] creation_date = self.book.metadata['creationDate']
try: try:
year = creation_date.split(':')[1][:4] year = creation_date.split(':')[1][:4]
except (ValueError, AttributeError): except (ValueError, AttributeError):
year = 9999 year = 9999
return year
def get_cover_image(self): isbn = None
tags = self.book.metadata['keywords']
if not tags:
tags = []
# This is a little roundabout for the cover # This is a little roundabout for the cover
# and I'm sure it's taking a performance hit # and I'm sure it's taking a performance hit
# But it is simple. So there's that. # But it is simple. So there's that.
cover_page = self.book.loadPage(0) cover_page = self.book.loadPage(0)
# Disabling scaling gets the covers much faster # Disabling scaling gets the covers much faster
return render_pdf_page(cover_page, True) cover = render_pdf_page(cover_page, True)
def get_isbn(self): Metadata = collections.namedtuple(
return None 'Metadata', ['title', 'author', 'year', 'isbn', 'tags', 'cover'])
return Metadata(title, author, year, isbn, tags, cover)
def get_tags(self): def generate_content(self):
tags = self.book.metadata['keywords']
return tags # Fine if it returns None
def get_contents(self):
content = list(range(self.book.pageCount)) content = list(range(self.book.pageCount))
toc = self.book.getToC() toc = self.book.getToC()
if not toc: if not toc:

View File

@@ -37,32 +37,17 @@ class EPUB:
def __init__(self, book_filename, temp_dir): def __init__(self, book_filename, temp_dir):
self.book_filename = book_filename self.book_filename = book_filename
self.temp_dir = temp_dir self.temp_dir = temp_dir
self.zip_file = None self.zip_file = None
self.file_list = None self.file_list = None
self.opf_dict = None self.opf_dict = None
self.book = {} self.split_chapters = {}
self.metadata = None
self.content = []
self.generate_references() self.generate_references()
def find_file(self, filename):
# Get rid of special characters
filename = unquote(filename)
# First, look for the file in the root of the book
if filename in self.file_list:
return filename
# Then search for it elsewhere
else:
file_basename = os.path.basename(filename)
for i in self.file_list:
if os.path.basename(i) == file_basename:
return i
# If the file isn't found
logging.error(filename + ' not found in ' + self.book_filename)
return False
def generate_references(self): def generate_references(self):
self.zip_file = zipfile.ZipFile( self.zip_file = zipfile.ZipFile(
self.book_filename, mode='r', allowZip64=True) self.book_filename, mode='r', allowZip64=True)
@@ -88,9 +73,26 @@ class EPUB:
packagefile_data = self.zip_file.read(packagefile) packagefile_data = self.zip_file.read(packagefile)
self.opf_dict = xmltodict.parse(packagefile_data) self.opf_dict = xmltodict.parse(packagefile_data)
def generate_toc(self): def find_file(self, filename):
self.book['content'] = [] # Get rid of special characters
filename = unquote(filename)
# First, look for the file in the root of the book
if filename in self.file_list:
return filename
# Then search for it elsewhere
else:
file_basename = os.path.basename(filename)
for i in self.file_list:
if os.path.basename(i) == file_basename:
return i
# If the file isn't found
logging.error(filename + ' not found in ' + self.book_filename)
return False
def generate_toc(self):
def find_alternative_toc(): def find_alternative_toc():
toc_filename = None toc_filename = None
toc_filename_alternative = None toc_filename_alternative = None
@@ -134,14 +136,14 @@ class EPUB:
level + 1, level + 1,
i['navLabel']['text'], i['navLabel']['text'],
i['content']['@src']] for i in nav_node] i['content']['@src']] for i in nav_node]
self.book['content'].extend(these_contents) self.content.extend(these_contents)
return return
if 'navPoint' in nav_node.keys(): if 'navPoint' in nav_node.keys():
recursor(level, nav_node['navPoint']) recursor(level, nav_node['navPoint'])
else: else:
self.book['content'].append([ self.content.append([
level + 1, level + 1,
nav_node['navLabel']['text'], nav_node['navLabel']['text'],
nav_node['content']['@src']]) nav_node['content']['@src']])
@@ -150,14 +152,14 @@ class EPUB:
for top_level_nav in navpoints: for top_level_nav in navpoints:
# Just one chapter # Just one chapter
if isinstance(top_level_nav, str): if isinstance(top_level_nav, str):
self.book['content'].append([ self.content.append([
1, 1,
navpoints['navLabel']['text'], navpoints['navLabel']['text'],
navpoints['content']['@src']]) navpoints['content']['@src']])
break break
# Multiple chapters # Multiple chapters
self.book['content'].append([ self.content.append([
1, 1,
top_level_nav['navLabel']['text'], top_level_nav['navLabel']['text'],
top_level_nav['content']['@src']]) top_level_nav['content']['@src']])
@@ -183,14 +185,12 @@ class EPUB:
return 'Possible parse error: ' + chapter_file return 'Possible parse error: ' + chapter_file
def parse_split_chapters(self, chapters_with_split_content): def parse_split_chapters(self, chapters_with_split_content):
self.book['split_chapters'] = {}
# For split chapters, get the whole chapter first, then split # For split chapters, get the whole chapter first, then split
# between ids using their anchors, then "heal" the resultant text # between ids using their anchors, then "heal" the resultant text
# by creating a BeautifulSoup object. Write its str to the content # by creating a BeautifulSoup object. Write its str to the content
for i in chapters_with_split_content.items(): for i in chapters_with_split_content.items():
chapter_file = i[0] chapter_file = i[0]
self.book['split_chapters'][chapter_file] = {} self.split_chapters[chapter_file] = {}
chapter_content = self.get_chapter_content(chapter_file) chapter_content = self.get_chapter_content(chapter_file)
soup = BeautifulSoup(chapter_content, 'lxml') soup = BeautifulSoup(chapter_content, 'lxml')
@@ -208,10 +208,10 @@ class EPUB:
if this_tag: if this_tag:
this_markup = BeautifulSoup( this_markup = BeautifulSoup(
str(this_tag).strip() + markup_split[1], 'lxml') str(this_tag).strip() + markup_split[1], 'lxml')
self.book['split_chapters'][chapter_file][this_anchor] = str(this_markup) self.split_chapters[chapter_file][this_anchor] = str(this_markup)
# Remaining markup is assigned here # Remaining markup is assigned here
self.book['split_chapters'][chapter_file]['top_level'] = str(soup) self.split_chapters[chapter_file]['top_level'] = str(soup)
def generate_content(self): def generate_content(self):
# Find all the chapters mentioned in the opf spine # Find all the chapters mentioned in the opf spine
@@ -238,7 +238,7 @@ class EPUB:
chapter_title = 1 chapter_title = 1
toc_chapters = [ toc_chapters = [
unquote(i[2].split('#')[0]) for i in self.book['content']] unquote(i[2].split('#')[0]) for i in self.content]
last_valid_index = -2 # Yes, but why? last_valid_index = -2 # Yes, but why?
for i in spine_final: for i in spine_final:
@@ -251,7 +251,7 @@ class EPUB:
except ValueError: except ValueError:
last_valid_index += 1 last_valid_index += 1
self.book['content'].insert( self.content.insert(
last_valid_index + 1, last_valid_index + 1,
[1, str(chapter_title), i]) [1, str(chapter_title), i])
chapter_title += 1 chapter_title += 1
@@ -259,7 +259,7 @@ class EPUB:
# Parse split chapters as below # Parse split chapters as below
# They can be picked up during the iteration through the toc # They can be picked up during the iteration through the toc
chapters_with_split_content = {} chapters_with_split_content = {}
for i in self.book['content']: for i in self.content:
if '#' in i[2]: if '#' in i[2]:
this_split = i[2].split('#') this_split = i[2].split('#')
chapter = this_split[0] chapter = this_split[0]
@@ -278,8 +278,7 @@ class EPUB:
# In case a split chapter is encountered, get its content # In case a split chapter is encountered, get its content
# from the split_chapters dictionary # from the split_chapters dictionary
# What could possibly go wrong? # What could possibly go wrong?
split_chapters = self.book['split_chapters'] toc_copy = self.content[:]
toc_copy = self.book['content'][:]
# Put the book into the book # Put the book into the book
for count, i in enumerate(toc_copy): for count, i in enumerate(toc_copy):
@@ -293,7 +292,7 @@ class EPUB:
try: try:
chapter_content = ( chapter_content = (
split_chapters[chapter_file_proper][this_anchor]) self.split_chapters[chapter_file_proper][this_anchor])
except KeyError: except KeyError:
chapter_content = 'Parse Error' chapter_content = 'Parse Error'
error_string = ( error_string = (
@@ -301,9 +300,9 @@ class EPUB:
logger.error(error_string) logger.error(error_string)
# Get content that remained at the end of the pillaging above # Get content that remained at the end of the pillaging above
elif chapter_file in split_chapters.keys(): elif chapter_file in self.split_chapters.keys():
try: try:
chapter_content = split_chapters[chapter_file]['top_level'] chapter_content = self.split_chapters[chapter_file]['top_level']
except KeyError: except KeyError:
chapter_content = 'Parse Error' chapter_content = 'Parse Error'
error_string = ( error_string = (
@@ -314,26 +313,26 @@ class EPUB:
else: else:
chapter_content = self.get_chapter_content(chapter_file) chapter_content = self.get_chapter_content(chapter_file)
self.book['content'][count][2] = chapter_content self.content[count][2] = chapter_content
# Cleanup content by removing null chapters # Cleanup content by removing null chapters
self.book['content'] = [ self.content = [
i for i in self.book['content'] if i[2]] i for i in self.content if i[2]]
self.generate_book_cover() cover_image = self.generate_book_cover()
if self.book['cover']: if cover_image:
cover_path = os.path.join( cover_path = os.path.join(
self.temp_dir, os.path.basename(self.book_filename)) + '- cover' self.temp_dir, os.path.basename(self.book_filename)) + '- cover'
with open(cover_path, 'wb') as cover_temp: with open(cover_path, 'wb') as cover_temp:
cover_temp.write(self.book['cover']) cover_temp.write(cover_image)
# There's probably some rationale to doing an insert here # There's probably some rationale to doing an insert here
# But a replacement seems... neater # But a replacement seems... neater
self.book['content'].insert( self.content.insert(
0, (1, 'Cover', f'<center><img src="{cover_path}" alt="Cover"></center>')) 0, (1, 'Cover', f'<center><img src="{cover_path}" alt="Cover"></center>'))
def generate_metadata(self): def generate_metadata(self):
metadata = self.opf_dict['package']['metadata'] book_metadata = self.opf_dict['package']['metadata']
def flattener(this_object): def flattener(this_object):
if isinstance(this_object, collections.OrderedDict): if isinstance(this_object, collections.OrderedDict):
@@ -354,67 +353,76 @@ class EPUB:
# Book title # Book title
try: try:
self.book['title'] = flattener(metadata['dc:title']) title = flattener(book_metadata['dc:title'])
except: except:
self.book['title'] = os.path.splitext( logger.warning('Title not found: ' + self.book_filename)
title = os.path.splitext(
os.path.basename(self.book_filename))[0] os.path.basename(self.book_filename))[0]
# Book author # Book author
try: try:
self.book['author'] = flattener(metadata['dc:creator']) author = flattener(book_metadata['dc:creator'])
except: except:
self.book['author'] = 'Unknown' logger.warning('Author not found: ' + self.book_filename)
author = 'Unknown'
# Book year # Book year
try: try:
self.book['year'] = int(flattener(metadata['dc:date'])[:4]) year = int(flattener(book_metadata['dc:date'])[:4])
except: except:
self.book['year'] = 9999 logger.warning('Year not found: ' + self.book_filename)
year = 9999
# Book isbn # Book isbn
# Both one and multiple schema # Both one and multiple schema
self.book['isbn'] = None isbn = None
try: try:
scheme = metadata['dc:identifier']['@opf:scheme'].lower() scheme = book_metadata['dc:identifier']['@opf:scheme'].lower()
if scheme.lower() == 'isbn': if scheme.lower() == 'isbn':
self.book['isbn'] = metadata['dc:identifier']['#text'] isbn = book_metadata['dc:identifier']['#text']
except (TypeError, KeyError): except (TypeError, KeyError):
try: try:
for i in metadata['dc:identifier']: for i in book_metadata['dc:identifier']:
if i['@opf:scheme'].lower() == 'isbn': if i['@opf:scheme'].lower() == 'isbn':
self.book['isbn'] = i['#text'] isbn = i['#text']
break break
except: except:
logger.warning('ISBN not found: ' + self.book_filename)
pass pass
# Book tags # Book tags
try: try:
self.book['tags'] = metadata['dc:subject'] tags = book_metadata['dc:subject']
if isinstance(self.book['tags'], str): if isinstance(tags, str):
self.book['tags'] = [self.book['tags']] tags = [tags]
except: except:
self.book['tags'] = [] tags = []
# Book cover # Book cover
self.generate_book_cover() cover = self.generate_book_cover()
# Named tuple? Named tuple.
Metadata = collections.namedtuple(
'Metadata', ['title', 'author', 'year', 'isbn', 'tags', 'cover'])
self.metadata = Metadata(title, author, year, isbn, tags, cover)
def generate_book_cover(self): def generate_book_cover(self):
# This is separate because the book cover needs to # This is separate because the book cover needs to
# be found and extracted both during addition / reading # be found and extracted both during addition / reading
self.book['cover'] = None book_cover = None
try: try:
cover_image = [ cover_image = [
i['@href'] for i in self.opf_dict['package']['manifest']['item'] i['@href'] for i in self.opf_dict['package']['manifest']['item']
if i['@media-type'].split('/')[0] == 'image' and if i['@media-type'].split('/')[0] == 'image' and
'cover' in i['@id']][0] 'cover' in i['@id']][0]
self.book['cover'] = self.zip_file.read( book_cover = self.zip_file.read(self.find_file(cover_image))
self.find_file(cover_image))
except: except:
pass pass
# Find book cover the hard way # Find book cover the hard way
if not self.book['cover']: if not book_cover:
biggest_image_size = 0 biggest_image_size = 0
biggest_image = None biggest_image = None
for j in self.zip_file.filelist: for j in self.zip_file.filelist:
@@ -424,5 +432,10 @@ class EPUB:
biggest_image_size = j.file_size biggest_image_size = j.file_size
if biggest_image: if biggest_image:
self.book['cover'] = self.zip_file.read( book_cover = self.zip_file.read(
self.find_file(biggest_image)) self.find_file(biggest_image))
if not book_cover:
logger.warning('Cover not found: ' + self.book_filename)
return book_cover

View File

@@ -18,6 +18,7 @@ import os
import base64 import base64
import zipfile import zipfile
import logging import logging
import collections
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@@ -28,70 +29,59 @@ class FB2:
def __init__(self, filename): def __init__(self, filename):
self.filename = filename self.filename = filename
self.zip_file = None self.zip_file = None
self.book = {}
self.xml = None self.xml = None
def read_fb2(self): self.metadata = None
try: self.content = []
if self.filename.endswith('.fb2.zip'):
this_book = zipfile.ZipFile(
self.filename, mode='r', allowZip64=True)
for i in this_book.filelist:
if os.path.splitext(i.filename)[1] == '.fb2':
book_text = this_book.read(i.filename)
break
else:
with open(self.filename, 'r') as book_file:
book_text = book_file.read()
self.xml = BeautifulSoup(book_text, 'lxml') self.generate_references()
self.generate_book_metadata()
except: # Not specifying an exception type here may be justified
return False
return True def generate_references(self):
if self.filename.endswith('.fb2.zip'):
this_book = zipfile.ZipFile(
self.filename, mode='r', allowZip64=True)
for i in this_book.filelist:
if os.path.splitext(i.filename)[1] == '.fb2':
book_text = this_book.read(i.filename)
break
def generate_book_metadata(self): else:
self.book['isbn'] = None with open(self.filename, 'r') as book_file:
self.book['tags'] = None book_text = book_file.read()
self.book['book_list'] = []
self.xml = BeautifulSoup(book_text, 'lxml')
def generate_metadata(self):
# All metadata can be parsed in one pass # All metadata can be parsed in one pass
all_tags = self.xml.find('description') all_tags = self.xml.find('description')
self.book['title'] = all_tags.find('book-title').text title = all_tags.find('book-title').text
if self.book['title'] == '' or self.book['title'] is None: if title == '' or title is None:
self.book['title'] = os.path.splitext( title = os.path.splitext(
os.path.basename(self.filename))[0] os.path.basename(self.filename))[0]
self.book['author'] = all_tags.find( author = all_tags.find(
'author').getText(separator=' ').replace('\n', ' ') 'author').getText(separator=' ').replace('\n', ' ')
if self.book['author'] == '' or self.book['author'] is None: if author == '' or author is None:
self.book['author'] = 'Unknown' author = '<Unknown>'
# TODO # TODO
# Account for other date formats # Account for other date formats
try: try:
self.book['year'] = int(all_tags.find('date').text) year = int(all_tags.find('date').text)
except ValueError: except ValueError:
self.book['year'] = 9999 year = 9999
# Cover Image isbn = None
try: tags = None
cover_image_xml = self.xml.find('coverpage')
for i in cover_image_xml:
cover_image_name = i.get('l:href')
cover_image_data = self.xml.find_all('binary') cover = self.generate_book_cover()
for i in cover_image_data:
if cover_image_name.endswith(i.get('id')):
self.book['cover'] = base64.decodebytes(i.text.encode())
except (AttributeError, TypeError):
# Catch TypeError in case no images exist in the book
logger.error('No cover found for: ' + self.filename)
self.book['cover'] = None
def parse_chapters(self, temp_dir): Metadata = collections.namedtuple(
'Metadata', ['title', 'author', 'year', 'isbn', 'tags', 'cover'])
self.metadata = Metadata(title, author, year, isbn, tags, cover)
def generate_content(self, temp_dir):
# TODO # TODO
# Check what's up with recursion levels # Check what's up with recursion levels
# Why is the TypeError happening in get_title # Why is the TypeError happening in get_title
@@ -114,7 +104,7 @@ class FB2:
children = element.findChildren('section', recursive=False) children = element.findChildren('section', recursive=False)
if not children and level != 1: if not children and level != 1:
this_title, title_xml = get_title(element) this_title, title_xml = get_title(element)
self.book['book_list'].append( self.content.append(
[level, this_title, title_xml + str(element)]) [level, this_title, title_xml + str(element)])
else: else:
for i in children: for i in children:
@@ -134,7 +124,7 @@ class FB2:
if section_children: if section_children:
chapter_text = this_title chapter_text = this_title
self.book['book_list'].append([1, this_title, chapter_text]) self.content.append([1, this_title, chapter_text])
recursor(1, this_element) recursor(1, this_element)
# Extract all images to the temp_dir # Extract all images to the temp_dir
@@ -144,7 +134,7 @@ class FB2:
image_string = f'<image l:href="#{image_name}"' image_string = f'<image l:href="#{image_name}"'
replacement_string = f'<p></p><img src=\"{image_path}\"' replacement_string = f'<p></p><img src=\"{image_path}\"'
for j in self.book['book_list']: for j in self.content:
j[2] = j[2].replace( j[2] = j[2].replace(
image_string, replacement_string) image_string, replacement_string)
try: try:
@@ -155,9 +145,30 @@ class FB2:
pass pass
# Insert the book cover at the beginning # Insert the book cover at the beginning
if self.book['cover']: cover_image = self.generate_book_cover()
cover_path = os.path.join(temp_dir, 'cover') if cover_image:
with open(cover_path, 'wb') as outimage: cover_path = os.path.join(
outimage.write(self.book['cover']) temp_dir, os.path.basename(self.filename)) + '- cover'
self.book['book_list'].insert( with open(cover_path, 'wb') as cover_temp:
cover_temp.write(cover_image)
self.content.insert(
0, (1, 'Cover', f'<center><img src="{cover_path}" alt="Cover"></center>')) 0, (1, 'Cover', f'<center><img src="{cover_path}" alt="Cover"></center>'))
def generate_book_cover(self):
cover = None
try:
cover_image_xml = self.xml.find('coverpage')
for i in cover_image_xml:
cover_image_name = i.get('l:href')
cover_image_data = self.xml.find_all('binary')
for i in cover_image_data:
if cover_image_name.endswith(i.get('id')):
cover = base64.decodebytes(i.text.encode())
except (AttributeError, TypeError):
# Catch TypeError in case no images exist in the book
logger.warning('Cover not found: ' + self.filename)
return cover

View File

@@ -16,20 +16,9 @@
# INSTRUCTIONS # INSTRUCTIONS
# Every parser is supposed to have the following methods. None returns are not allowed. # Every parser is supposed to have the following methods. None returns are not allowed.
# read_book() # read_book() - Initialize book
# get_title() # generate_metadata() - For addition
# get_author() # generate_content() - For reading
# get_year()
# get_cover_image()
# get_isbn()
# get_tags()
# get_contents() - Should return a tuple with 0: TOC 1: special_settings (dict)
# Parsers for files containing only images need to return only images_only = True
# TODO
# Maybe shift to insert or replace instead of hash checking
# See if you want to include a hash of the book's name and author
# Change thread niceness
import io import io
import os import os
@@ -211,87 +200,88 @@ class BookSorter:
break break
if not valid_extension: if not valid_extension:
logger.error(filename + ' has an unsupported extension') logger.error('Unsupported extension: ' + filename)
return return
book_ref = sorter[file_extension](filename, self.temp_dir, file_md5) book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)
# Everything following this is standard try:
# None values are accounted for here book_ref.read_book()
is_valid = book_ref.read_book() except:
if not is_valid: logger.error('Error initializing: ' + filename)
logger.error('Cannot parse:' + filename)
return return
if book_ref.book: this_book = {}
# TODO this_book[file_md5] = {
# For the love of God clean this up. It's junk. 'hash': file_md5,
'path': filename}
this_book = {} # Different modes require different values
this_book[file_md5] = { if self.work_mode == 'addition':
'hash': file_md5, try:
'path': filename} metadata = book_ref.generate_metadata()
except:
logger.error('Metadata generation error: ' + filename)
return
# Different modes require different values title = metadata.title
if self.work_mode == 'addition': author = metadata.author
# Reduce the size of the incoming image year = metadata.year
# if one is found isbn = metadata.isbn
title = book_ref.get_title()
author = book_ref.get_author()
year = book_ref.get_year()
isbn = book_ref.get_isbn()
tags = None tags = None
if self.auto_tags: if self.auto_tags:
tags = book_ref.get_tags() tags = metadata.tags
cover_image_raw = book_ref.get_cover_image() cover_image_raw = metadata.cover
if cover_image_raw: if cover_image_raw:
cover_image = resize_image(cover_image_raw) cover_image = resize_image(cover_image_raw)
else: else:
# TODO # TODO
# Needs an option # Needs an option
# cover_image = fetch_cover(title, author) # cover_image = fetch_cover(title, author)
cover_image = None cover_image = None
this_book[file_md5]['cover_image'] = cover_image this_book[file_md5]['cover_image'] = cover_image
this_book[file_md5]['addition_mode'] = self.addition_mode this_book[file_md5]['addition_mode'] = self.addition_mode
if self.work_mode == 'reading': if self.work_mode == 'reading':
# All books must return the following list try:
# Indices are as described below book_breakdown = book_ref.generate_content()
book_breakdown = book_ref.get_contents() except:
logger.error('Content generation error: ' + filename)
return
toc = book_breakdown[0] toc = book_breakdown[0]
content = book_breakdown[1] content = book_breakdown[1]
images_only = book_breakdown[2] images_only = book_breakdown[2]
book_data = self.database_entry_for_book(file_md5) book_data = self.database_entry_for_book(file_md5)
title = book_data[0] title = book_data[0]
author = book_data[1] author = book_data[1]
year = book_data[2] year = book_data[2]
isbn = book_data[3] isbn = book_data[3]
tags = book_data[4] tags = book_data[4]
position = book_data[5] position = book_data[5]
bookmarks = book_data[6] bookmarks = book_data[6]
cover = book_data[7] cover = book_data[7]
annotations = book_data[8] annotations = book_data[8]
this_book[file_md5]['position'] = position this_book[file_md5]['position'] = position
this_book[file_md5]['bookmarks'] = bookmarks this_book[file_md5]['bookmarks'] = bookmarks
this_book[file_md5]['toc'] = toc this_book[file_md5]['toc'] = toc
this_book[file_md5]['content'] = content this_book[file_md5]['content'] = content
this_book[file_md5]['images_only'] = images_only this_book[file_md5]['images_only'] = images_only
this_book[file_md5]['cover'] = cover this_book[file_md5]['cover'] = cover
this_book[file_md5]['annotations'] = annotations this_book[file_md5]['annotations'] = annotations
this_book[file_md5]['title'] = title this_book[file_md5]['title'] = title
this_book[file_md5]['author'] = author this_book[file_md5]['author'] = author
this_book[file_md5]['year'] = year this_book[file_md5]['year'] = year
this_book[file_md5]['isbn'] = isbn this_book[file_md5]['isbn'] = isbn
this_book[file_md5]['tags'] = tags this_book[file_md5]['tags'] = tags
return this_book return this_book
def read_progress(self): def read_progress(self):
while True: while True: