#!/usr/bin/env python3
"""Extract metadata and cover images from EPUB files for the library."""

import os
import re
import hashlib
from multiprocessing.dummy import Pool

import ebooklib.epub

# Metadata namespaces used as keys of ``EpubBook.metadata``.
_DC_NS = 'http://purl.org/dc/elements/1.1/'
_OPF_NS = 'http://www.idpf.org/2007/opf'
_OPF_SCHEME_ATTR = '{http://www.idpf.org/2007/opf}scheme'

# Guide titles (lower-cased) that are treated as pointing at the cover.
_COVER_TITLES = ('cover', 'cover-image', 'coverimage')

# Finds the first ``src="..."/`` attribute in an XHTML document.
_IMG_SRC_RE = re.compile(r"src=\"(.*)\"\/")


class ParseEPUB:
    """Thin wrapper around an ``ebooklib`` book exposing selected metadata.

    Call :meth:`read_epub` first; the ``get_*`` methods read ``self.book``
    and expect it to have been loaded successfully.
    """

    def __init__(self, filename):
        # TODO
        # Maybe also include book description
        self.filename = filename
        # Stays None until read_epub() succeeds.
        self.book = None

    def read_epub(self):
        """Load the EPUB at ``self.filename`` into ``self.book``.

        On a parse failure (``KeyError`` / ``AttributeError``) a message is
        printed and ``self.book`` is left as ``None``.
        """
        try:
            self.book = ebooklib.epub.read_epub(self.filename)
        except (KeyError, AttributeError):
            print('Cannot parse ' + self.filename)

    def get_title(self):
        """Return the book title with surrounding whitespace removed."""
        return self.book.title.strip()

    def get_author(self):
        """Return the first Dublin Core ``creator`` value, or ``None``."""
        try:
            return self.book.metadata[_DC_NS]['creator'][0][0]
        except (KeyError, IndexError, TypeError):
            # Missing namespace/key, empty creator list, or malformed entry.
            return None

    def get_year(self):
        """Return the first four characters of the ``date`` value, or ``None``."""
        try:
            return self.book.metadata[_DC_NS]['date'][0][0][:4]
        except (KeyError, IndexError, TypeError):
            # TypeError covers a date entry whose value is None.
            return None

    def get_cover_image(self):
        """Return the cover image content, or ``None`` if none can be found.

        Lookup order:
          1. the item referenced by the OPF ``cover`` metadata entry,
          2. an image whose file name matches a cover entry in the guide,
          3. an image whose file name matches the first ``src`` attribute
             found in an XHTML item.
        """
        # TODO
        # Generate a cover image in case one isn't found
        # This has to be done or the database module will
        # error out
        cover_item = self._cover_item_from_metadata()
        if cover_item:
            return cover_item.get_content()

        image_path = self._cover_path_from_guide() or self._first_image_path()
        if not image_path:
            return None
        return self._image_content_by_basename(image_path)

    def _cover_item_from_metadata(self):
        """Return the item named by the OPF ``cover`` metadata, or ``None``."""
        try:
            cover_id = self.book.metadata[_OPF_NS]['cover'][0][1]['content']
        except (KeyError, IndexError, TypeError):
            # No usable cover declaration; callers fall back to other sources.
            return None
        return self.book.get_item_with_id(cover_id)

    def _cover_path_from_guide(self):
        """Return the href of the first guide entry that looks like a cover."""
        for entry in self.book.guide:
            title = (entry.get('title') or '').lower()
            if title in _COVER_TITLES or entry.get('type') == 'coverimagestandard':
                href = entry.get('href')
                if href:
                    return href
        return None

    def _first_image_path(self):
        """Return the first ``src`` value found in any XHTML item, or ``None``."""
        for item in self.book.items:
            if item.media_type != 'application/xhtml+xml':
                continue
            # Undecodable bytes are dropped rather than aborting the search.
            match = _IMG_SRC_RE.search(item.content.decode('utf-8', errors='ignore'))
            if match:
                return match[1]
        return None

    def _image_content_by_basename(self, image_path):
        """Return content of the image item whose base name matches ``image_path``."""
        wanted = os.path.basename(image_path)
        for image in self.book.get_items_of_type(ebooklib.ITEM_IMAGE):
            if os.path.basename(image.file_name) == wanted:
                return image.get_content()
        # No image item matched the referenced path.
        return None

    def get_isbn(self):
        """Return the first identifier whose OPF scheme is ``isbn``, or ``None``."""
        try:
            identifiers = self.book.metadata[_DC_NS]['identifier']
        except KeyError:
            return None

        for entry in identifiers:
            try:
                value, attributes = entry[0], entry[1]
            except (IndexError, TypeError):
                continue
            # Identifiers without a scheme attribute are skipped instead of
            # ending the search, so a later ISBN entry is still found.
            scheme = (attributes or {}).get(_OPF_SCHEME_ATTR)
            if scheme and scheme.lower() == 'isbn':
                return value
        return None


class BookSorter:
    """Collect metadata for a list of EPUB files, keyed by file MD5."""

    def __init__(self, file_list):
        # Have the GUI pass a list of files straight to here
        # Then, on the basis of what is needed, pass the
        # filenames to the requisite functions
        # This includes getting file info for the database
        # Parsing for the reader proper
        # Caching upon closing
        self.file_list = file_list
        # Maps file MD5 hex digest -> dict of book details.
        self.all_books = {}

    def read_book(self, filename):
        """Parse one book and record its details in ``self.all_books``.

        ``filename`` is expected to be a string containing the full path of
        the ebook file. Files whose MD5 is already recorded, and files that
        fail to parse, are skipped.
        """
        # TODO
        # See if you want to include a hash of the book's name and author
        with open(filename, 'rb') as current_book:
            file_md5 = hashlib.md5(current_book.read()).hexdigest()

        # Membership is tested against the dict keys; testing against
        # .items() compares with (key, value) tuples and never matches.
        if file_md5 in self.all_books:
            return

        # TODO
        # See if tags can be generated from book content
        book_ref = ParseEPUB(filename)
        book_ref.read_epub()
        if not book_ref.book:
            return

        self.all_books[file_md5] = {
            'title': book_ref.get_title(),
            'author': book_ref.get_author(),
            'year': book_ref.get_year(),
            'isbn': book_ref.get_isbn(),
            'path': filename,
            'cover_image': book_ref.get_cover_image()}

    def initiate_threads(self):
        """Run ``read_book`` over ``self.file_list`` on 5 threads.

        Returns ``self.all_books`` once every file has been processed.
        """
        _pool = Pool(5)
        _pool.map(self.read_book, self.file_list)
        _pool.close()
        _pool.join()

        return self.all_books