diff --git a/TODO b/TODO index d5bc4c8..b39684f 100644 --- a/TODO +++ b/TODO @@ -36,6 +36,7 @@ TODO Set focus to newly added file Reading: ✓ Drop down for TOC + ✓ Treeview navigation for TOC ✓ Override the keypress event of the textedit ✓ Use format* icons for toolbar buttons ✓ Implement book view settings with a(nother) toolbar @@ -86,7 +87,6 @@ TODO Have them save to memory ✓ fb2 support ✓ Images need to show up in their placeholders - djvu support Other: ✓ Define every widget in code Bugs: @@ -98,8 +98,11 @@ TODO Better recursion needed for fb2 toc Secondary: + Additional Settings: + Disable progressbar - 20% book addition speed improvement + Disable cover loading when reading - Saves ~2M / book + Create covers for books without them - VERY SLOW Special formatting for each chapter's title - Create covers for books without them Signal end of chapter with some text Graphical themes Change focus rectangle dimensions @@ -108,7 +111,7 @@ TODO Goodreads API: Ratings, Read, Recommendations Get ISBN using python-isbnlib Use embedded fonts + CSS - txt, doc, chm support + txt, doc, chm, djvu support Include icons for filetype emblems Comic view modes Continuous paging @@ -116,7 +119,7 @@ TODO ? Add only one file type if multiple are present ? Create emblem per filetype In application notifications - Notification in case the filter is filtering out all files with no option in place + Notification in case the filter is filtering out all files with no option in place Option to fit images to viewport Need help with: diff --git a/lector/parsers/epub.py b/lector/parsers/epub.py index c9eb96d..69a3309 100644 --- a/lector/parsers/epub.py +++ b/lector/parsers/epub.py @@ -29,14 +29,13 @@ class ParseEPUB: # Maybe also include book description self.book_ref = None self.book = None + self.temp_dir = temp_dir self.filename = filename self.extract_path = os.path.join(temp_dir, file_md5) def read_book(self): - self.book_ref = EPUB(self.filename) - contents_found = self.book_ref.read_epub() - if not contents_found: - return False + self.book_ref = EPUB(self.filename, self.temp_dir) + self.book_ref.generate_metadata() self.book = self.book_ref.book return True @@ -61,14 +60,8 @@ class ParseEPUB: def get_contents(self): zipfile.ZipFile(self.filename).extractall(self.extract_path) - self.book_ref.parse_toc() - self.book_ref.parse_chapters(temp_dir=self.extract_path) - - toc = [] - content = [] - for count, i in enumerate(self.book['book_list']): - toc.append((1, i[0], count + 1)) - content.append(i[1]) + self.book_ref.generate_toc() + self.book_ref.generate_content() # Return toc, content, images_only - return toc, content, False + return self.book['toc'], self.book['content'], False diff --git a/lector/readers/read_epub.py b/lector/readers/read_epub.py index 40aaeab..3ba2eba 100644 --- a/lector/readers/read_epub.py +++ b/lector/readers/read_epub.py @@ -14,175 +14,333 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -import os -import logging -import zipfile -from urllib.parse import unquote +# TODO +# See if inserting chapters not in the toc.ncx can be avoided +# Missing file order is messed up +# Account for stylesheets... eventually +# Everything needs logging +# Mobipocket files +import os +import zipfile +import logging +import collections + +import xmltodict from bs4 import BeautifulSoup logger = logging.getLogger(__name__) class EPUB: - def __init__(self, filename): - self.filename = filename + def __init__(self, book_filename, temp_dir): + self.book_filename = book_filename + self.temp_dir = temp_dir self.zip_file = None + self.file_list = None + self.opf_dict = None self.book = {} + + self.generate_references() + + def find_file(self, filename): + # First, look for the file in the root of the book + if filename in self.file_list: + return filename + + # Then, search for it elsewhere + else: + file_basename = os.path.basename(filename) + for i in self.file_list: + if os.path.basename(i) == file_basename: + return i + + # If the file isn't found + logger.error(filename + ' not found') + return False + + def generate_references(self): + self.zip_file = zipfile.ZipFile( + self.book_filename, mode='r', allowZip64=True) + self.file_list = self.zip_file.namelist() + + # Book structure relies on parsing the .opf file + # in the book. Now that might be the usual content.opf + # or package.opf or it might be named after your favorite + # eldritch abomination. The point is we have to check + # the container.xml + container = self.find_file('container.xml') + if container: + container_xml = self.zip_file.read(container) + container_dict = xmltodict.parse(container_xml) + packagefile = container_dict['container']['rootfiles']['rootfile']['@full-path'] + else: + presumptive_names = ('content.opf', 'package.opf') + for i in presumptive_names: + packagefile = self.find_file(i) + if packagefile: + break + + packagefile_data = self.zip_file.read(packagefile) + self.opf_dict = xmltodict.parse(packagefile_data) + + def generate_toc(self): + self.book['toc'] = [] + + # I'm currently going with the file always being named toc.ncx + # But this is epub. The wild west of ebook formats. + tocfile = self.find_file('toc.ncx') + tocfile_data = self.zip_file.read(tocfile) + toc_dict = xmltodict.parse(tocfile_data) + + def recursor(level, nav_node): + if isinstance(nav_node, list): + these_contents = [[ + level + 1, + i['navLabel']['text'], + i['content']['@src']] for i in nav_node] + self.book['toc'].extend(these_contents) + return + + if 'navPoint' in nav_node.keys(): + recursor(level, nav_node['navPoint']) + + else: + self.book['toc'].append([ + level + 1, + nav_node['navLabel']['text'], + nav_node['content']['@src']]) + + navpoints = toc_dict['ncx']['navMap']['navPoint'] + for top_level_nav in navpoints: + self.book['toc'].append([ + 1, + top_level_nav['navLabel']['text'], + top_level_nav['content']['@src']]) + + if 'navPoint' in top_level_nav.keys(): + recursor(1, top_level_nav) + + def get_chapter_content(self, chapter_file): + this_file = self.find_file(chapter_file) + if this_file: + return self.zip_file.read(this_file).decode() + else: + print('Not found: ' + chapter_file) + return chapter_file + + def parse_split_chapters(self, chapters_with_split_content): self.book['split_chapters'] = {} - def read_epub(self): - # This is the function that should error out in - # case the module cannot process the file - try: - self.load_zip() - contents_path = self.get_file_path( - None, True) + # For split chapters, get the whole chapter first, then split + # between ids using their anchors, then "heal" the resultant text + # by creating a BeautifulSoup object. Write its str to the content + for i in chapters_with_split_content.items(): + chapter_file = i[0] + self.book['split_chapters'][chapter_file] = {} - if not contents_path: - return False # No (valid) opf was found so processing cannot continue + chapter_content = self.get_chapter_content(chapter_file) + soup = BeautifulSoup(chapter_content, 'lxml') - self.generate_book_metadata(contents_path) - except: # Not specifying an exception type here may be justified - return False + split_anchors = i[1] + for this_anchor in reversed(split_anchors): + this_tag = soup.find( + attrs={"id":lambda x: x == this_anchor}) - return True + markup_split = str(soup).split(str(this_tag)) + soup = BeautifulSoup(markup_split[0], 'lxml') + this_markup = BeautifulSoup( + str(this_tag) + markup_split[1], 'lxml') - def load_zip(self): - try: - self.zip_file = zipfile.ZipFile( - self.filename, mode='r', allowZip64=True) - except (KeyError, AttributeError, zipfile.BadZipFile): - logger.error('Malformed zip file ' + self.filename) - return + self.book['split_chapters'][chapter_file][this_anchor] = str(this_markup) - def parse_xml(self, filename, parser): - try: - this_xml = self.zip_file.read(filename).decode() - except KeyError: - short_filename = os.path.basename(self.filename) - warning_string = f'{str(filename)} not found in {short_filename}' - logger.warning(warning_string) - return + # Remaining markup is assigned here + self.book['split_chapters'][chapter_file]['top_level'] = str(soup) - root = BeautifulSoup(this_xml, parser) - return root + def generate_content(self): + # Find all the chapters mentioned in the opf spine + # These are simply ids that correspond to the actual item + # as mentioned in the manifest - which is a comprehensive + # list of files + chapters_in_spine = [ + i['@idref'] + for i in self.opf_dict['package']['spine']['itemref']] - def get_file_path(self, filename, is_content_file=False): - # Use this to get the location of the content.opf file - # And maybe some other file that has a more well formatted - # idea of the TOC - # We're going to all this trouble because there really is - # no going forward without a toc - if is_content_file: - container_location = self.get_file_path('container.xml') - xml = self.parse_xml(container_location, 'xml') + # Next, find items and ids from the manifest + chapters_from_manifest = { + i['@id']: i['@href'] + for i in self.opf_dict['package']['manifest']['item']} - if xml: - root_item = xml.find('rootfile') + # Finally, check which items are supposed to be in the spine + # on the basis of the id and change the toc accordingly + spine_final = [] + for i in chapters_in_spine: + try: + spine_final.append(chapters_from_manifest.pop(i)) + except KeyError: + pass + + # TODO + # Check what happens in case missing chapters are either + # at the beginning or the end of the book + chapter_title = 1 + toc_chapters = [i[2] for i in self.book['toc']] + last_valid_index = 0 + for i in spine_final: + if not i in toc_chapters: + previous_chapter = spine_final[spine_final.index(i) - 1] try: - return root_item.get('full-path') - except AttributeError: - error_string = f'ePub module: {self.filename} has a malformed container.xml' + previous_chapter_toc_index = toc_chapters.index(previous_chapter) + # In case of 2+ consecutive missing chapters + last_valid_index = previous_chapter_toc_index + except ValueError: + last_valid_index += 1 + self.book['toc'].insert( + last_valid_index + 1, + [1, str(chapter_title), i]) + chapter_title += 1 + + # Parse split chapters as below + # They can be picked up during the iteration through the toc + chapters_with_split_content = {} + for i in self.book['toc']: + if '#' in i[2]: + this_split = i[2].split('#') + chapter = this_split[0] + anchor = this_split[1] + + try: + chapters_with_split_content[chapter].append(anchor) + except KeyError: + chapters_with_split_content[chapter] = [] + chapters_with_split_content[chapter].append(anchor) + + self.parse_split_chapters(chapters_with_split_content) + + # Now we iterate over the ToC as presented in the toc.ncx + # and add chapters to the content list + # In case a split chapter is encountered, get its content + # from the split_chapters dictionary + # What could possibly go wrong? + + # The content list is separated from the toc list because + # the mupdf library returns its own toc a certain way and + # this keeps things uniform + split_chapters = self.book['split_chapters'] + toc_copy = self.book['toc'][:] + self.book['content'] = [] + + # Put the book into the book + for count, i in enumerate(toc_copy): + chapter_file = i[2] + + # Get split content according to its corresponding id attribute + if '#' in chapter_file: + this_split = chapter_file.split('#') + chapter_file_proper = this_split[0] + this_anchor = this_split[1] + + try: + chapter_content = ( + split_chapters[chapter_file_proper][this_anchor]) + except KeyError: + chapter_content = 'Parse Error' + error_string = ( + f'Error parsing {self.book_filename}: {chapter_file_proper}') logger.error(error_string) - return None - possible_filenames = ('content.opf', 'package.opf') - for i in possible_filenames: - presumptive_location = self.get_file_path(i) - if presumptive_location: - return presumptive_location + # Get content that remained at the end of the pillaging above + elif chapter_file in split_chapters.keys(): + try: + chapter_content = split_chapters[chapter_file]['top_level'] + except KeyError: + chapter_content = 'Parse Error' + error_string = ( + f'Error parsing {self.book_filename}: {chapter_file}') + logger.error(error_string) - for i in self.zip_file.filelist: - if os.path.basename(i.filename) == os.path.basename(filename): - return i.filename - - return None - - def read_from_zip(self, filename): - filename = unquote(filename) - try: - file_data = self.zip_file.read(filename) - return file_data - except KeyError: - file_path_actual = self.get_file_path(filename) - if file_path_actual: - return self.zip_file.read(file_path_actual) + # Vanilla non split chapters else: - logger.error('ePub module can\'t find ' + filename) + chapter_content = self.get_chapter_content(chapter_file) - #______________________________________________________ + # The count + 2 is an adjustment due to the cover being inserted below + self.book['toc'][count][2] = count + 2 + self.book['content'].append(chapter_content) - def generate_book_metadata(self, contents_path): - self.book['title'] = os.path.splitext( - os.path.basename(self.filename))[0] - self.book['author'] = 'Unknown' - self.book['isbn'] = None - self.book['tags'] = None - self.book['cover'] = None - self.book['toc_file'] = 'toc.ncx' # Overwritten if another one exists + self.generate_book_cover() + if self.book['cover']: + cover_path = os.path.join( + self.temp_dir, os.path.basename(self.book_filename)) + '- cover' + with open(cover_path, 'wb') as cover_temp: + cover_temp.write(self.book['cover']) - # Parse XML - xml = self.parse_xml(contents_path, 'xml') + self.book['toc'].insert(0, (1, 'Cover', 1)) + self.book['content'].insert( + 0, (f'
Cover
')) - # Parse metadata - item_dict = { - 'title': 'title', - 'author': 'creator', - 'year': 'date'} + def generate_metadata(self): + metadata = self.opf_dict['package']['metadata'] - for i in item_dict.items(): - item = xml.find(i[1]) - if item: - self.book[i[0]] = item.text + # There are no exception types specified below + # This is on purpose and makes me long for the days + # of simpler, happier things. + # Book title try: - self.book['year'] = int(self.book['year'][:4]) - except (TypeError, KeyError, IndexError, ValueError): + self.book['title'] = metadata['dc:title'] + if isinstance(self.book['title'], collections.OrderedDict): + self.book['title'] = metadata['dc:title']['#text'] + except: + print('Title parse error') + self.book['title'] = os.path.splitext( + os.path.basename(self.book_filename))[0] + + # Book author + try: + self.book['author'] = metadata['dc:creator']['#text'] + except: + self.book['author'] = 'Unknown' + + # Book year + try: + self.book['year'] = int(metadata['dc:date'][:4]) + except: self.book['year'] = 9999 - # Get identifier - identifier_items = xml.find_all('identifier') - for i in identifier_items: - scheme = i.get('scheme') - try: - if scheme.lower() == 'isbn': - self.book['isbn'] = i.text - except AttributeError: - self.book['isbn'] = None + # Book isbn + self.book['isbn'] = None + try: + for i in metadata['dc:identifier']: + if i['@opf:scheme'].lower() == 'isbn': + self.book['isbn'] = i['#text'] + except: + pass - # Tags - tag_items = xml.find_all('subject') - tag_list = [i.text for i in tag_items] - self.book['tags'] = tag_list + # Book tags + try: + self.book['tags'] = metadata['dc:subject'] + except: + self.book['tags'] = [] - # Get items - self.book['content_dict'] = {} - all_items = xml.find_all('item') - for i in all_items: - media_type = i.get('media-type') - this_id = i.get('id') + # Book cover + self.generate_book_cover() - if media_type == 'application/xhtml+xml' or media_type == 'text/html': - self.book['content_dict'][this_id] = i.get('href') - - if media_type == 'application/x-dtbncx+xml': - self.book['toc_file'] = i.get('href') - - # Cover image - if 'cover' in this_id and media_type.split('/')[0] == 'image': - cover_href = i.get('href') - try: - self.book['cover'] = self.zip_file.read(cover_href) - except KeyError: - # The cover cannot be found according to the - # path specified in the content reference - self.book['cover'] = self.zip_file.read( - self.get_file_path(cover_href)) + def generate_book_cover(self): + # This is separate because the book cover needs to + # be found and extracted both during addition / reading + self.book['cover'] = None + try: + cover_image = [ + i['@href'] for i in self.opf_dict['package']['manifest']['item'] + if i['@media-type'].split('/')[0] == 'image' and + 'cover' in i['@id']][0] + self.book['cover'] = self.zip_file.read( + self.find_file(cover_image)) + except: + pass + # Find book cover the hard way if not self.book['cover']: - # If no cover is located the conventional way, - # we go looking for the largest image in the book biggest_image_size = 0 biggest_image = None for j in self.zip_file.filelist: @@ -192,139 +350,5 @@ class EPUB: biggest_image_size = j.file_size if biggest_image: - self.book['cover'] = self.read_from_zip(biggest_image) - else: - logger.error('No cover found for: ' + self.filename) - - # Parse spine and arrange chapter paths acquired from the opf - # according to the order IN THE SPINE - spine_items = xml.find_all('itemref') - spine_order = [] - for i in spine_items: - spine_order.append(i.get('idref')) - - self.book['chapters_in_order'] = [] - for i in spine_order: - chapter_path = self.book['content_dict'][i] - self.book['chapters_in_order'].append(chapter_path) - - def parse_toc(self): - # This has no bearing on the actual order - # We're just using this to get chapter names - self.book['navpoint_dict'] = {} - - toc_file = self.book['toc_file'] - if toc_file: - toc_file = self.get_file_path(toc_file) - - xml = self.parse_xml(toc_file, 'xml') - if not xml: - return - - navpoints = xml.find_all('navPoint') - - for i in navpoints: - chapter_title = i.find('text').text - chapter_source = i.find('content').get('src') - chapter_source_file = unquote(chapter_source.split('#')[0]) - - if '#' in chapter_source: - try: - self.book['split_chapters'][chapter_source_file].append( - (chapter_source.split('#')[1], chapter_title)) - except KeyError: - self.book['split_chapters'][chapter_source_file] = [] - self.book['split_chapters'][chapter_source_file].append( - (chapter_source.split('#')[1], chapter_title)) - - self.book['navpoint_dict'][chapter_source_file] = chapter_title - - def parse_chapters(self, temp_dir=None, split_large_xml=False): - no_title_chapter = 0 - self.book['book_list'] = [] - - for i in self.book['chapters_in_order']: - chapter_data = self.read_from_zip(i).decode() - - if i in self.book['split_chapters'] and not split_large_xml: - split_chapters = get_split_content( - chapter_data, self.book['split_chapters'][i]) - self.book['book_list'].extend(split_chapters) - - elif split_large_xml: - # https://stackoverflow.com/questions/14444732/how-to-split-a-html-page-to-multiple-pages-using-python-and-beautiful-soup - markup = BeautifulSoup(chapter_data, 'xml') - chapters = [] - pagebreaks = markup.find_all('pagebreak') - - def next_element(elem): - while elem is not None: - elem = elem.next_sibling - if hasattr(elem, 'name'): - return elem - - for pbreak in pagebreaks: - chapter = [str(pbreak)] - elem = next_element(pbreak) - while elem and elem.name != 'pagebreak': - chapter.append(str(elem)) - elem = next_element(elem) - chapters.append('\n'.join(chapter)) - - for this_chapter in chapters: - fallback_title = str(no_title_chapter) - self.book['book_list'].append( - (fallback_title, this_chapter + ('
' * 8))) - no_title_chapter += 1 - else: - try: - self.book['book_list'].append( - (self.book['navpoint_dict'][i], chapter_data + ('
' * 8))) - except KeyError: - fallback_title = str(no_title_chapter) - self.book['book_list'].append( - (fallback_title, chapter_data)) - no_title_chapter += 1 - - cover_path = os.path.join(temp_dir, os.path.basename(self.filename)) + '- cover' - if self.book['cover']: - with open(cover_path, 'wb') as cover_temp: - cover_temp.write(self.book['cover']) - - try: - self.book['book_list'][0] = ( - 'Cover', f'
Cover
') - except IndexError: - pass - -def get_split_content(chapter_data, split_by): - split_anchors = [i[0] for i in split_by] - chapter_titles = [i[1] for i in split_by] - return_list = [] - - xml = BeautifulSoup(chapter_data, 'lxml') - xml_string = xml.body.prettify() - - for count, i in enumerate(split_anchors): - this_split = xml_string.split(i) - current_chapter = this_split[0] - - bs_obj = BeautifulSoup(current_chapter, 'lxml') - # Since tags correspond to data following them, the first - # chunk will be ignored - # As will all empty chapters - if bs_obj.text == '\n' or bs_obj.text == '' or count == 0: - continue - bs_obj_string = str(bs_obj).replace('">', '', 1) + ('
' * 8) - - return_list.append( - (chapter_titles[count - 1], bs_obj_string)) - - xml_string = ''.join(this_split[1:]) - - bs_obj = BeautifulSoup(xml_string, 'lxml') - bs_obj_string = str(bs_obj).replace('">', '', 1) + ('
' * 8) - return_list.append( - (chapter_titles[-1], bs_obj_string)) - - return return_list + self.book['cover'] = self.zip_file.read( + self.find_file(biggest_image)) diff --git a/lector/sorter.py b/lector/sorter.py index ecfe90a..5230a3e 100644 --- a/lector/sorter.py +++ b/lector/sorter.py @@ -150,6 +150,9 @@ class BookSorter: i[0]: i[1] for i in all_hashes_and_paths} def database_entry_for_book(self, file_hash): + # TODO + # This will probably look a whole lot better with a namedtuple + database_return = database.DatabaseFunctions( self.database_path).fetch_data( ('Title', 'Author', 'Year', 'ISBN', 'Tags', @@ -246,7 +249,10 @@ class BookSorter: if cover_image_raw: cover_image = resize_image(cover_image_raw) else: - cover_image = fetch_cover(title, author) + # TODO + # Needs an option + # cover_image = fetch_cover(title, author) + cover_image = None this_book[file_md5]['cover_image'] = cover_image this_book[file_md5]['addition_mode'] = self.addition_mode @@ -408,3 +414,4 @@ def fetch_cover(title, author): except: logger.error(f'Couldn\'t find cover for ' + title) + return None