#!/usr/bin/env python3

# This file is a part of Lector, a Qt based ebook reader
# Copyright (C) 2017 BasioMeusPuga

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import os
import sys
import zipfile

import pprint
import inspect

import bs4
from bs4 import BeautifulSoup


class EPUB:
    """Reader for EPUB archives built on ``zipfile`` and BeautifulSoup.

    Call ``read_epub()`` to fill ``self.book`` with metadata, the spine
    order and chapter titles, then ``parse_chapters()`` to load the chapter
    markup into ``self.book['book_list']``.  Call ``close()`` when done.
    """

    def __init__(self, filename):
        self.filename = filename
        # Set to a zipfile.ZipFile by load_zip(); stays None if opening fails
        self.zip_file = None
        # Everything parsed from the archive is collected in this dict
        self.book = {}

    def read_epub(self):
        """Open the archive and parse its metadata, spine and toc.

        This is the function that should error out in case the module
        cannot process the file.

        Raises:
            ValueError: if the archive could not be opened, or if its
                package document (.opf) cannot be located or read.
        """
        self.load_zip()
        if self.zip_file is None:
            # load_zip() has already reported the problem; stop here with
            # an explicit error instead of failing later on a None archive
            raise ValueError('Cannot parse {}'.format(self.filename))

        contents_path = self.get_file_path(None, True)
        self.generate_book_metadata(contents_path)
        self.parse_toc()

    def load_zip(self):
        """Open ``self.filename`` as a zip archive.

        On failure a message is printed and ``self.zip_file`` is left as
        None; the exceptions listed in the ``except`` clause are not
        propagated.
        """
        try:
            self.zip_file = zipfile.ZipFile(
                self.filename, mode='r', allowZip64=True)
        except (KeyError, AttributeError, zipfile.BadZipFile):
            print('Cannot parse ' + self.filename)
            return

    def close(self):
        """Close the underlying zip archive, if one is open."""
        if self.zip_file is not None:
            self.zip_file.close()
            self.zip_file = None

    def parse_xml(self, filename, parser):
        """Read ``filename`` from the archive and parse it.

        Args:
            filename: Exact name of the archive member.
            parser: Parser name handed to BeautifulSoup ('xml', 'lxml', ...).

        Returns:
            The parsed BeautifulSoup tree, or None (after printing a
            message) when the member does not exist in the archive.
        """
        try:
            this_xml = self.zip_file.read(filename).decode()
        except KeyError:
            print('File not found in zip')
            return

        root = BeautifulSoup(this_xml, parser)
        return root

    def get_file_path(self, filename, is_content_file=False):
        """Return the full archive path of a file, or None if not found.

        Args:
            filename: Name (or path) of the wanted file.  Only its basename
                is compared against the basenames of the archive members.
                May be None when ``is_content_file`` is True.
            is_content_file: When True, locate the package document first:
                via the ``rootfile`` entry of container.xml, then by
                looking for 'content.opf' / 'package.opf'.

        Returns:
            The member's path inside the archive, or None when nothing
            matches.
        """
        # We're going to all this trouble because there really is
        # no going forward without a toc
        if is_content_file:
            xml = None
            container_location = self.get_file_path('container.xml')
            if container_location is not None:
                xml = self.parse_xml(container_location, 'xml')

            # A missing or unreadable container.xml is not fatal: fall
            # through to the well-known file names below
            root_item = xml.find('rootfile') if xml is not None else None
            if root_item is not None:
                full_path = root_item.get('full-path')
                if full_path:
                    return full_path

            for candidate in ('content.opf', 'package.opf'):
                presumptive_location = self.get_file_path(candidate)
                if presumptive_location:
                    return presumptive_location

        if filename is None:
            # Nothing left to search for
            return None

        wanted = os.path.basename(filename)
        for entry in self.zip_file.filelist:
            if os.path.basename(entry.filename) == wanted:
                return entry.filename

        return None

    def read_from_zip(self, filename):
        """Return the raw bytes of ``filename`` from the archive.

        If the exact path is not present, the archive is searched for a
        member with the same basename.

        Raises:
            KeyError: if no matching member exists.
        """
        try:
            return self.zip_file.read(filename)
        except KeyError:
            file_path_actual = self.get_file_path(filename)
            if file_path_actual is None:
                # Re-raise the original error, which names the file that
                # was actually requested
                raise
            return self.zip_file.read(file_path_actual)

    #______________________________________________________

    def generate_book_metadata(self, contents_path):
        """Populate ``self.book`` from the package document.

        Sets 'title', 'author' and 'date' (each only when present), 'isbn'
        (None when absent), 'content_dict' (manifest id -> href for XHTML
        items), 'toc_file' (only when an NCX item is listed) and
        'chapters_in_order' (hrefs in spine order).

        Args:
            contents_path: Archive path of the .opf file.

        Raises:
            ValueError: if ``contents_path`` is None or cannot be read.
        """
        if contents_path is None:
            raise ValueError(
                'No package document (.opf) found in {}'.format(
                    self.filename))

        # Parse metadata
        # NOTE(review): the Dublin Core fields are looked up in an 'lxml'
        # parse while everything else uses 'xml'; presumably this is about
        # how each parser exposes namespaced tags - confirm before merging
        # the two parses
        item_dict = {
            'title': 'dc:title',
            'author': 'dc:creator',
            'date': 'dc:date'}

        xml = self.parse_xml(contents_path, 'lxml')
        if xml is None:
            raise ValueError(
                'Cannot read package document {}'.format(contents_path))

        for key, tag_name in item_dict.items():
            item = xml.find(tag_name)
            if item is not None:
                self.book[key] = item.text

        # Get identifier
        xml = self.parse_xml(contents_path, 'xml')
        if xml is None:
            raise ValueError(
                'Cannot read package document {}'.format(contents_path))

        # Always define the key so consumers need not check for it
        self.book['isbn'] = None
        metadata_items = xml.find('metadata')
        if metadata_items is not None:
            for child in metadata_items.children:
                if not isinstance(child, bs4.element.Tag):
                    continue
                scheme = child.get('opf:scheme')
                if isinstance(scheme, str) and scheme.lower() == 'isbn':
                    self.book['isbn'] = child.text
                    break

        # Get items
        self.book['content_dict'] = {}
        for item in xml.find_all('item'):
            media_type = item.get('media-type')
            if media_type == 'application/xhtml+xml':
                self.book['content_dict'][item.get('id')] = item.get('href')

            if media_type == 'application/x-dtbncx+xml':
                self.book['toc_file'] = item.get('href')

            # Cover image (extraction is currently disabled)
            # if i.get('id') == 'cover':
            #     cover_href = i.get('href')
            #     try:
            #         self.book['cover'] = self.zip_file.read(cover_href)
            #     except KeyError:
            #         # The cover cannot be found according to the
            #         # path specified in the content reference
            #         self.book['cover'] = self.zip_file.read(
            #             self.get_file_path(cover_href))

        # Parse spine and arrange chapter paths acquired from the opf
        # according to the order IN THE SPINE
        spine_order = [ref.get('idref') for ref in xml.find_all('itemref')]

        # Spine entries that do not refer to an XHTML manifest item are
        # skipped instead of aborting the whole book with a KeyError
        content_dict = self.book['content_dict']
        self.book['chapters_in_order'] = [
            content_dict[idref]
            for idref in spine_order
            if idref in content_dict]

    def parse_toc(self):
        """Build ``self.book['navpoint_dict']`` (chapter href -> title).

        This has no bearing on the actual order; the toc is only used to
        get chapter names.  When the book lists no NCX file, or the file
        cannot be found or read, the dict is left empty and
        ``parse_chapters`` falls back to file names.
        """
        self.book['navpoint_dict'] = {}

        toc_file = self.book.get('toc_file')
        if not toc_file:
            return

        toc_path = self.get_file_path(toc_file)
        if toc_path is None:
            return

        xml = self.parse_xml(toc_path, 'xml')
        if xml is None:
            return

        for navpoint in xml.find_all('navPoint'):
            label = navpoint.find('text')
            content = navpoint.find('content')
            if label is None or content is None:
                continue

            chapter_source = content.get('src')
            if not chapter_source:
                continue

            # Drop the fragment so the key matches the chapter's file path
            chapter_source = chapter_source.split('#')[0]

            # Several navPoints (e.g. nested sub-sections) may point into
            # the same file; keep the first label, which is the entry for
            # the file itself, rather than the last sub-section's
            self.book['navpoint_dict'].setdefault(chapter_source, label.text)

    def parse_chapters(self):
        """Load every chapter into ``self.book['book_list']``.

        Each entry is a ``(title, markup)`` tuple.  The title comes from
        the toc when available, otherwise it is the chapter's path without
        its extension.
        """
        navpoint_dict = self.book.get('navpoint_dict', {})

        self.book['book_list'] = []
        for chapter_path in self.book['chapters_in_order']:
            chapter_data = self.read_from_zip(chapter_path).decode()

            try:
                chapter_title = navpoint_dict[chapter_path]
            except KeyError:
                chapter_title = os.path.splitext(chapter_path)[0]

            self.book['book_list'].append((chapter_title, chapter_data))


def main():
    """Parse the EPUB given as the first command line argument."""
    if len(sys.argv) < 2:
        sys.exit('Usage: {} <file.epub>'.format(
            os.path.basename(sys.argv[0])))

    book = EPUB(sys.argv[1])
    try:
        book.read_epub()
        book.parse_chapters()
    finally:
        book.close()


if __name__ == '__main__':
    main()