From 51d00bb9b5d503fb786b19925b5e3b64a2c7c1e2 Mon Sep 17 00:00:00 2001 From: BasioMeusPuga Date: Sat, 10 Mar 2018 12:28:14 +0530 Subject: [PATCH] Improve epub parsing --- ePub/read_epub.py | 108 ++++++++++++++++++++++++++++------------------ parsers/epub.py | 6 +-- 2 files changed, 68 insertions(+), 46 deletions(-) diff --git a/ePub/read_epub.py b/ePub/read_epub.py index 3df4048..70c6c4e 100644 --- a/ePub/read_epub.py +++ b/ePub/read_epub.py @@ -33,11 +33,12 @@ class EPUB: self.zip_file = None self.book = {} - def read_book(self): + def read_epub(self): # This is the function that should error out in # case the module cannot process the file self.load_zip() - contents_path = self.get_file_path('content.opf') + contents_path = self.get_file_path( + None, True) self.generate_book_metadata(contents_path) self.parse_toc() @@ -59,22 +60,47 @@ class EPUB: root = BeautifulSoup(this_xml, parser) return root - def get_file_path(self, filename): + def get_file_path(self, filename, is_content_file=False): # Use this to get the location of the content.opf file # And maybe some other file that has a more well formatted - # idea of the TOC + + # We're going to all this trouble because there really is + # no going forward without a toc + if is_content_file: + container_location = self.get_file_path('container.xml') + xml = self.parse_xml(container_location, 'xml') + + root_item = xml.find('rootfile') + if root_item: + return root_item.get('full-path') + else: + possible_filenames = ('content.opf', 'package.opf') + for i in possible_filenames: + presumptive_location = self.get_file_path(i) + if presumptive_location: + return presumptive_location + for i in self.zip_file.filelist: - if os.path.basename(i.filename) == filename: + if os.path.basename(i.filename) == os.path.basename(filename): return i.filename + def read_from_zip(self, filename): + try: + file_data = self.zip_file.read(filename) + return file_data + except KeyError: + file_path_actual = self.get_file_path(filename) + return self.zip_file.read(file_path_actual) + + #______________________________________________________ def generate_book_metadata(self, contents_path): + # Parse metadata item_dict = { 'title': 'dc:title', 'author': 'dc:creator', 'date': 'dc:date'} - # Parse metadata xml = self.parse_xml(contents_path, 'lxml') for i in item_dict.items(): @@ -96,39 +122,47 @@ class EPUB: self.book['isbn'] = None # Get items - book_items = {} + self.book['content_dict'] = {} all_items = xml.find_all('item') for i in all_items: media_type = i.get('media-type') if media_type == 'application/xhtml+xml': - book_items[i.get('id')] = i.get('href') + self.book['content_dict'][i.get('id')] = i.get('href') + if media_type == 'application/x-dtbncx+xml': self.book['toc_file'] = i.get('href') - if i.get('id') == 'cover': - self.book['cover'] = self.zip_file.read(i.get('href')) - # Parse spine + # Cover image + # if i.get('id') == 'cover': + # cover_href = i.get('href') + # try: + # self.book['cover'] = self.zip_file.read(cover_href) + # except KeyError: + # # The cover cannot be found according to the + # # path specified in the content reference + # self.book['cover'] = self.zip_file.read( + # self.get_file_path(cover_href)) + + # Parse spine and arrange chapter paths acquired from the opf + # according to the order IN THE SPINE spine_items = xml.find_all('itemref') spine_order = [] for i in spine_items: spine_order.append(i.get('idref')) - # book_order = [] - # for i in spine_order: - # try: - # book_order.append(book_items[i]) - # except KeyError: - # pass - - # self.book['book_order'] = book_order + self.book['chapters_in_order'] = [] + for i in spine_order: + chapter_path = self.book['content_dict'][i] + self.book['chapters_in_order'].append(chapter_path) def parse_toc(self): # Try to get chapter names from the toc - try: - toc_file = self.book['toc_file'] - except KeyError: - toc_file = self.get_file_path('toc.ncx') + # This has no bearing on the actual order + # We're just using this to get chapter names + + toc_file = self.book['toc_file'] + toc_file = self.get_file_path(toc_file) xml = self.parse_xml(toc_file, 'xml') navpoints = xml.find_all('navPoint') @@ -138,32 +172,22 @@ class EPUB: chapter_title = i.find('text').text chapter_source = i.find('content').get('src') chapter_source = chapter_source.split('#')[0] - self.book['navpoint_dict'][chapter_title] = chapter_source - - # self.book['navpoint_dict'] = {} - # for i in self.book['book_order']: - # try: - # self.book['navpoint_dict'][i] = navpoint_dict[i] - # except: - # # TODO - # # Create title - # self.book['navpoint_dict'][i] = 'Unspecified' - - # # Reverse the dict - # reverse_dict = {i[1]: i[0] for i in self.book['navpoint_dict'].items()} - # self.book['navpoint_dict'] = reverse_dict + self.book['navpoint_dict'][chapter_source] = chapter_title def parse_chapters(self): - for i in self.book['navpoint_dict'].items(): + self.book['book_list'] = [] + for i in self.book['chapters_in_order']: + chapter_data = self.read_from_zip(i).decode() try: - self.book['navpoint_dict'][i[0]] = self.zip_file.read(i[1]).decode() + self.book['book_list'].append( + (self.book['navpoint_dict'][i], chapter_data)) except KeyError: - print(i[1] + ' skipped') - + self.book['book_list'].append( + (os.path.splitext(i)[0], chapter_data)) def main(): book = EPUB(sys.argv[1]) - book.read_book() + book.read_epub() book.parse_chapters() if __name__ == '__main__': diff --git a/parsers/epub.py b/parsers/epub.py index 42a8122..f06fb1a 100644 --- a/parsers/epub.py +++ b/parsers/epub.py @@ -35,9 +35,7 @@ class ParseEPUB: def read_book(self): self.book_ref = EPUB(self.filename) - contents_path = self.book_ref.get_file_path('content.opf') - self.book_ref.generate_book(contents_path) - self.book_ref.parse_toc() + self.book_ref.read_epub() self.book = self.book_ref.book def get_title(self): @@ -68,7 +66,7 @@ class ParseEPUB: self.book_ref.parse_chapters() file_settings = { 'images_only': False} - return self.book['navpoint_dict'], file_settings + return self.book['book_list'], file_settings class HidePrinting: def __enter__(self):