Files
Lector/ePub/read_epub.py
2018-03-10 12:28:14 +05:30

195 lines
6.5 KiB
Python

#!/usr/bin/env python3
# This file is a part of Lector, a Qt based ebook reader
# Copyright (C) 2017 BasioMeusPuga
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import sys
import zipfile
import pprint
import inspect
import bs4
from bs4 import BeautifulSoup
class EPUB:
    """Reader for the structure and chapter text of an EPUB archive.

    Results accumulate in ``self.book``. The keys written by this class are
    ``title``, ``author`` and ``date`` (only when present in the OPF),
    ``isbn``, ``content_dict``, ``toc_file`` (only when the manifest lists
    an NCX item), ``chapters_in_order``, ``navpoint_dict`` and ``book_list``.
    """

    def __init__(self, filename):
        """Remember *filename*; nothing is opened until ``load_zip`` runs."""
        self.filename = filename
        self.zip_file = None
        self.book = {}

    def read_epub(self):
        """Open the archive and parse its metadata, manifest, spine and toc."""
        # This is the function that should error out in
        # case the module cannot process the file
        self.load_zip()
        contents_path = self.get_file_path(None, True)
        self.generate_book_metadata(contents_path)
        self.parse_toc()

    def load_zip(self):
        """Open ``self.filename`` as a zip archive into ``self.zip_file``.

        On KeyError, AttributeError or zipfile.BadZipFile a message is
        printed and ``self.zip_file`` is left unchanged.
        """
        try:
            self.zip_file = zipfile.ZipFile(
                self.filename, mode='r', allowZip64=True)
        except (KeyError, AttributeError, zipfile.BadZipFile):
            print('Cannot parse ' + self.filename)
            return

    def parse_xml(self, filename, parser):
        """Return archive member *filename* parsed by BeautifulSoup.

        *parser* is handed to BeautifulSoup unchanged. The member is decoded
        with ``bytes.decode()``'s default codec. Returns None (after printing
        a message) when the archive has no member with that exact name.
        """
        try:
            this_xml = self.zip_file.read(filename).decode()
        except KeyError:
            print('File not found in zip')
            return None

        return BeautifulSoup(this_xml, parser)

    def get_file_path(self, filename, is_content_file=False):
        """Return the full archive path of a file, or None if not found.

        With ``is_content_file`` set, the OPF location is looked up first:
        the ``full-path`` of the ``rootfile`` element in container.xml is
        returned when that element exists, otherwise the archive is searched
        for 'content.opf' and then 'package.opf'.

        In every other case the archive is searched for the first entry
        whose basename equals the basename of *filename*.
        """
        # Use this to get the location of the content.opf file, and of any
        # other file whose path inside the archive is not known exactly.
        # We're going to all this trouble because there really is
        # no going forward without a toc
        if is_content_file:
            container_location = self.get_file_path('container.xml')

            # A missing container.xml must not abort the search; the
            # conventional OPF names below are still worth trying.
            xml = None
            if container_location is not None:
                xml = self.parse_xml(container_location, 'xml')

            root_item = xml.find('rootfile') if xml is not None else None
            if root_item is not None:
                return root_item.get('full-path')

            for candidate in ('content.opf', 'package.opf'):
                presumptive_location = self.get_file_path(candidate)
                if presumptive_location:
                    return presumptive_location

        # Without a name there is nothing to search for; this also keeps
        # os.path.basename() from being handed None.
        if filename is None:
            return None

        wanted = os.path.basename(filename)
        for entry in self.zip_file.filelist:
            if os.path.basename(entry.filename) == wanted:
                return entry.filename

        return None

    def read_from_zip(self, filename):
        """Return the raw bytes of *filename* from the archive.

        If no member has that exact name, the lookup is retried with the
        path returned by ``get_file_path`` (a match on basename).
        """
        try:
            return self.zip_file.read(filename)
        except KeyError:
            file_path_actual = self.get_file_path(filename)
            return self.zip_file.read(file_path_actual)

    #______________________________________________________

    def generate_book_metadata(self, contents_path):
        """Fill ``self.book`` from the OPF file at *contents_path*.

        Writes title/author/date (when found), ``isbn`` (None when no
        identifier with an ISBN scheme exists), ``content_dict`` mapping
        manifest ids to hrefs for XHTML items, ``toc_file`` (when an NCX
        item is listed) and ``chapters_in_order`` following the spine.
        """
        # Parse metadata
        item_dict = {
            'title': 'dc:title',
            'author': 'dc:creator',
            'date': 'dc:date'}

        # NOTE(review): the dc:* lookups use the 'lxml' parser while the rest
        # uses 'xml'; presumably the namespaced names resolve differently
        # under the two parsers - confirm before merging the two passes.
        xml = self.parse_xml(contents_path, 'lxml')
        for book_key, tag_name in item_dict.items():
            item = xml.find(tag_name)
            if item is not None:
                self.book[book_key] = item.text

        # Get identifier
        xml = self.parse_xml(contents_path, 'xml')

        # Always define the key so consumers never have to guess
        # whether it exists.
        self.book['isbn'] = None
        metadata_items = xml.find('metadata')
        if metadata_items is not None:
            for child in metadata_items.children:
                if not isinstance(child, bs4.element.Tag):
                    continue
                scheme = child.get('opf:scheme')
                if scheme is not None and scheme.lower() == 'isbn':
                    self.book['isbn'] = child.text
                    break

        # Get items
        self.book['content_dict'] = {}
        for manifest_item in xml.find_all('item'):
            media_type = manifest_item.get('media-type')

            if media_type == 'application/xhtml+xml':
                self.book['content_dict'][
                    manifest_item.get('id')] = manifest_item.get('href')

            if media_type == 'application/x-dtbncx+xml':
                self.book['toc_file'] = manifest_item.get('href')

            # Cover image
            # if i.get('id') == 'cover':
            #     cover_href = i.get('href')
            #     try:
            #         self.book['cover'] = self.zip_file.read(cover_href)
            #     except KeyError:
            #         # The cover cannot be found according to the
            #         # path specified in the content reference
            #         self.book['cover'] = self.zip_file.read(
            #             self.get_file_path(cover_href))

        # Parse spine and arrange chapter paths acquired from the opf
        # according to the order IN THE SPINE
        spine_order = [
            spine_item.get('idref') for spine_item in xml.find_all('itemref')]

        # content_dict only holds XHTML items, so a spine entry pointing at
        # any other kind of manifest item is skipped instead of raising.
        content_dict = self.book['content_dict']
        self.book['chapters_in_order'] = [
            content_dict[idref] for idref in spine_order
            if idref in content_dict]

    def parse_toc(self):
        """Build ``self.book['navpoint_dict']``: chapter source -> title.

        The dictionary is left empty when the book has no ``toc_file`` or
        the file cannot be located or read, so ``parse_chapters`` falls back
        to naming chapters after their file names.
        """
        # Try to get chapter names from the toc
        # This has no bearing on the actual order
        # We're just using this to get chapter names
        self.book['navpoint_dict'] = {}

        toc_file = self.book.get('toc_file')
        if not toc_file:
            return

        toc_path = self.get_file_path(toc_file)
        if toc_path is None:
            return

        xml = self.parse_xml(toc_path, 'xml')
        if xml is None:
            return

        for navpoint in xml.find_all('navPoint'):
            label = navpoint.find('text')
            content = navpoint.find('content')
            if label is None or content is None:
                continue

            chapter_source = content.get('src')
            if not chapter_source:
                continue

            # Drop the fragment so the key is the file itself
            chapter_source = chapter_source.split('#')[0]
            self.book['navpoint_dict'][chapter_source] = label.text

    def parse_chapters(self):
        """Build ``self.book['book_list']`` as (title, content) tuples.

        Chapters follow ``chapters_in_order``. The title comes from
        ``navpoint_dict`` when the chapter path is a key there, otherwise it
        is the chapter path without its extension. Content is the decoded
        text of the chapter file.
        """
        self.book['book_list'] = []
        navpoint_dict = self.book.get('navpoint_dict', {})

        for chapter_path in self.book['chapters_in_order']:
            chapter_data = self.read_from_zip(chapter_path).decode()
            try:
                chapter_title = navpoint_dict[chapter_path]
            except KeyError:
                chapter_title = os.path.splitext(chapter_path)[0]
            self.book['book_list'].append((chapter_title, chapter_data))
def main():
    """Parse the EPUB named by the first command-line argument.

    Runs ``read_epub`` and then ``parse_chapters`` on the file. Exits with
    a usage message when no path was supplied.
    """
    if len(sys.argv) < 2:
        # A string argument makes sys.exit print it to stderr and exit
        # with a failure status.
        sys.exit('Usage: read_epub.py <path to epub file>')

    book = EPUB(sys.argv[1])
    book.read_epub()
    book.parse_chapters()


# Only run when executed as a script, not when imported
if __name__ == '__main__':
    main()