#!/usr/bin/env python3

# This file is a part of Lector, a Qt based ebook reader
# Copyright (C) 2017 BasioMeusPuga

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""Read metadata, the NCX table of contents and chapter text from an EPUB."""

import os
import posixpath
import sys
import zipfile
import pprint
import inspect

import bs4
from bs4 import BeautifulSoup


class EPUB:
    """Extract metadata and chapter contents from an EPUB archive.

    Everything that is extracted is stored in the ``book`` dictionary:
    ``title``, ``author``, ``date``, ``isbn``, ``cover`` (raw bytes),
    ``toc_file`` and ``navpoint_dict`` (chapter title -> chapter source,
    replaced by the decoded chapter text once parse_chapters() has run).
    """

    def __init__(self, filename):
        """Remember *filename*; nothing is opened until read_book()."""
        self.filename = filename
        # zipfile.ZipFile handle; stays None until load_zip() succeeds
        self.zip_file = None
        self.book = {}

    def read_book(self):
        """Open the archive, then parse its metadata and table of contents.

        This is the function that should error out in case the module
        cannot process the file.

        Returns:
            True when the archive was opened and a content.opf was found
            and processed, False otherwise.
        """
        self.load_zip()
        if self.zip_file is None:
            return False

        contents_path = self.get_file_path('content.opf')
        if contents_path is None:
            print('Cannot parse ' + self.filename)
            return False

        self.generate_book_metadata(contents_path)
        self.parse_toc()
        return True

    def load_zip(self):
        """Open ``self.filename`` as a zip archive.

        On failure a message is printed and ``self.zip_file`` stays None.
        """
        try:
            self.zip_file = zipfile.ZipFile(
                self.filename, mode='r', allowZip64=True)
        except (KeyError, AttributeError, zipfile.BadZipFile):
            print('Cannot parse ' + self.filename)

    def close(self):
        """Close the underlying zip archive if it is open."""
        if self.zip_file is not None:
            self.zip_file.close()
            self.zip_file = None

    def parse_xml(self, filename, parser):
        """Parse the archive member *filename* with BeautifulSoup.

        Args:
            filename: Name of the member inside the zip archive.
            parser: BeautifulSoup parser name (this module uses 'lxml'
                and 'xml').

        Returns:
            The BeautifulSoup tree, or None when the member is missing.
        """
        try:
            this_xml = self.zip_file.read(filename).decode()
        except KeyError:
            print('File not found in zip')
            return None

        return BeautifulSoup(this_xml, parser)

    def get_file_path(self, filename):
        """Return the first archive member whose basename is *filename*.

        Use this to get the location of the content.opf file, and maybe
        some other file that has a more well formatted idea of the TOC.

        Returns:
            The full member name, or None when nothing matches.
        """
        for member in self.zip_file.filelist:
            if os.path.basename(member.filename) == filename:
                return member.filename
        return None

    def _resolve_href(self, href, base_path=None):
        """Map an href found inside the archive to an archive member name.

        The href is tried relative to the directory of *base_path* (the
        file that contains the reference), then verbatim, and finally by
        basename anywhere in the archive as a last resort.

        Returns:
            The member name, or None when no member matches.
        """
        if not href:
            return None

        candidates = []
        if base_path:
            candidates.append(posixpath.normpath(
                posixpath.join(posixpath.dirname(base_path), href)))
        candidates.append(href)

        for candidate in candidates:
            try:
                self.zip_file.getinfo(candidate)
            except KeyError:
                continue
            return candidate

        return self.get_file_path(posixpath.basename(href))

    def generate_book_metadata(self, contents_path):
        """Fill ``self.book`` from the package document at *contents_path*.

        Sets ``title``, ``author`` and ``date`` when the matching Dublin
        Core tag exists, always sets ``isbn`` (None when no identifier
        with an ISBN scheme is present), and sets ``toc_file`` / ``cover``
        when the manifest lists them.
        """
        item_dict = {
            'title': 'dc:title',
            'author': 'dc:creator',
            'date': 'dc:date'}

        # Parse metadata
        # The 'lxml' parser is used here so that the prefixed names above
        # can be looked up as plain tag names
        xml = self.parse_xml(contents_path, 'lxml')
        if xml is not None:
            for key, tag_name in item_dict.items():
                tag = xml.find(tag_name)
                if tag is not None:
                    self.book[key] = tag.text

        # Get identifier
        xml = self.parse_xml(contents_path, 'xml')
        if xml is None:
            return

        self.book['isbn'] = None
        metadata_items = xml.find('metadata')
        if metadata_items is not None:
            for child in metadata_items.children:
                if not isinstance(child, bs4.element.Tag):
                    continue
                scheme = child.get('opf:scheme')
                if isinstance(scheme, str) and scheme.lower() == 'isbn':
                    self.book['isbn'] = child.text
                    break

        # Get items
        for item in xml.find_all('item'):
            href = item.get('href')

            if item.get('media-type') == 'application/x-dtbncx+xml':
                # hrefs are written relative to the package document, so
                # turn them into archive member names before storing
                self.book['toc_file'] = (
                    self._resolve_href(href, contents_path) or href)

            if item.get('id') == 'cover':
                cover_path = self._resolve_href(href, contents_path)
                if cover_path is not None:
                    self.book['cover'] = self.zip_file.read(cover_path)

        # TODO
        # The spine (itemref order) is not used yet; chapters are taken
        # from the navPoints of the toc in parse_toc()

    def parse_toc(self):
        """Build ``self.book['navpoint_dict']`` from the NCX navPoints.

        Maps each chapter title to the archive member holding that
        chapter. The dictionary is left empty when no toc can be read.
        """
        # NOTE: chapters sharing a title overwrite each other because the
        # title is the dictionary key
        self.book['navpoint_dict'] = {}

        # Try to get chapter names from the toc
        toc_file = self.book.get('toc_file') or self.get_file_path('toc.ncx')
        if toc_file is None:
            print('File not found in zip')
            return

        xml = self.parse_xml(toc_file, 'xml')
        if xml is None:
            return

        for navpoint in xml.find_all('navPoint'):
            label = navpoint.find('text')
            content = navpoint.find('content')
            if label is None or content is None:
                continue

            chapter_source = content.get('src')
            if chapter_source is None:
                continue

            # Drop the fragment; only the file itself can be read
            chapter_source = chapter_source.split('#')[0]
            # src is relative to the toc file; fall back to the raw value
            # so that parse_chapters() can still report it as skipped
            resolved = self._resolve_href(chapter_source, toc_file)
            self.book['navpoint_dict'][label.text] = resolved or chapter_source

    def parse_chapters(self):
        """Replace every chapter source in ``navpoint_dict`` by its text.

        Entries whose source is missing from the archive are reported and
        keep their source string as value.
        """
        navpoint_dict = self.book.get('navpoint_dict', {})
        # Iterate over a snapshot since the values are replaced in place
        for title, source in list(navpoint_dict.items()):
            try:
                navpoint_dict[title] = self.zip_file.read(source).decode()
            except KeyError:
                print(source + ' skipped')


def main():
    """Parse the EPUB given as first command line argument."""
    if len(sys.argv) < 2:
        print('Usage: ' + sys.argv[0] + ' <file.epub>')
        sys.exit(1)

    book = EPUB(sys.argv[1])
    try:
        if not book.read_book():
            sys.exit(1)
        book.parse_chapters()
    finally:
        book.close()


if __name__ == '__main__':
    main()