From 4a30c8bdc7295938ffe2553e051b6ad3c74ce97b Mon Sep 17 00:00:00 2001 From: BasioMeusPuga Date: Sat, 10 Mar 2018 09:43:41 +0530 Subject: [PATCH] Begin work on ePub parser --- TODO | 1 + ePub/__init__.py | 1 + ePub/read_epub.py | 170 ++++++++++++++++++++++++++++++++++++++++++++++ parsers/epub.py | 79 +++++++++++++++++++++ sorter.py | 3 +- 5 files changed, 253 insertions(+), 1 deletion(-) create mode 100644 ePub/__init__.py create mode 100644 ePub/read_epub.py create mode 100644 parsers/epub.py diff --git a/TODO b/TODO index a7c2b02..e244e1a 100644 --- a/TODO +++ b/TODO @@ -58,6 +58,7 @@ TODO Limit the extra files produced by KindleUnpack Have them save to memory epub support + Homegrown solution please Other: ✓ Define every widget in code Bugs: diff --git a/ePub/__init__.py b/ePub/__init__.py new file mode 100644 index 0000000..8d1c8b6 --- /dev/null +++ b/ePub/__init__.py @@ -0,0 +1 @@ + diff --git a/ePub/read_epub.py b/ePub/read_epub.py new file mode 100644 index 0000000..3df4048 --- /dev/null +++ b/ePub/read_epub.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 + +# This file is a part of Lector, a Qt based ebook reader +# Copyright (C) 2017 BasioMeusPuga + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import os +import sys +import zipfile + +import pprint +import inspect + +import bs4 +from bs4 import BeautifulSoup + + +class EPUB: + def __init__(self, filename): + self.filename = filename + self.zip_file = None + self.book = {} + + def read_book(self): + # This is the function that should error out in + # case the module cannot process the file + self.load_zip() + contents_path = self.get_file_path('content.opf') + self.generate_book_metadata(contents_path) + self.parse_toc() + + def load_zip(self): + try: + self.zip_file = zipfile.ZipFile( + self.filename, mode='r', allowZip64=True) + except (KeyError, AttributeError, zipfile.BadZipFile): + print('Cannot parse ' + self.filename) + return + + def parse_xml(self, filename, parser): + try: + this_xml = self.zip_file.read(filename).decode() + except KeyError: + print('File not found in zip') + return + + root = BeautifulSoup(this_xml, parser) + return root + + def get_file_path(self, filename): + # Use this to get the location of the content.opf file + # And maybe some other file that has a more well formatted + # idea of the TOC + for i in self.zip_file.filelist: + if os.path.basename(i.filename) == filename: + return i.filename + + + def generate_book_metadata(self, contents_path): + item_dict = { + 'title': 'dc:title', + 'author': 'dc:creator', + 'date': 'dc:date'} + + # Parse metadata + xml = self.parse_xml(contents_path, 'lxml') + + for i in item_dict.items(): + item = xml.find(i[1]) + if item: + self.book[i[0]] = item.text + + # Get identifier + xml = self.parse_xml(contents_path, 'xml') + + metadata_items = xml.find('metadata') + for i in metadata_items.children: + if isinstance(i, bs4.element.Tag): + try: + if i.get('opf:scheme').lower() == 'isbn': + self.book['isbn'] = i.text + break + except AttributeError: + self.book['isbn'] = None + + # Get items + book_items = {} + all_items = xml.find_all('item') + for i in all_items: + media_type = i.get('media-type') + + if media_type == 'application/xhtml+xml': + book_items[i.get('id')] = i.get('href') + if media_type == 'application/x-dtbncx+xml': + self.book['toc_file'] = i.get('href') + if i.get('id') == 'cover': + self.book['cover'] = self.zip_file.read(i.get('href')) + + # Parse spine + spine_items = xml.find_all('itemref') + spine_order = [] + for i in spine_items: + spine_order.append(i.get('idref')) + + # book_order = [] + # for i in spine_order: + # try: + # book_order.append(book_items[i]) + # except KeyError: + # pass + + # self.book['book_order'] = book_order + + def parse_toc(self): + # Try to get chapter names from the toc + try: + toc_file = self.book['toc_file'] + except KeyError: + toc_file = self.get_file_path('toc.ncx') + + xml = self.parse_xml(toc_file, 'xml') + navpoints = xml.find_all('navPoint') + + self.book['navpoint_dict'] = {} + for i in navpoints: + chapter_title = i.find('text').text + chapter_source = i.find('content').get('src') + chapter_source = chapter_source.split('#')[0] + self.book['navpoint_dict'][chapter_title] = chapter_source + + # self.book['navpoint_dict'] = {} + # for i in self.book['book_order']: + # try: + # self.book['navpoint_dict'][i] = navpoint_dict[i] + # except: + # # TODO + # # Create title + # self.book['navpoint_dict'][i] = 'Unspecified' + + # # Reverse the dict + # reverse_dict = {i[1]: i[0] for i in self.book['navpoint_dict'].items()} + # self.book['navpoint_dict'] = reverse_dict + + def parse_chapters(self): + for i in self.book['navpoint_dict'].items(): + try: + self.book['navpoint_dict'][i[0]] = self.zip_file.read(i[1]).decode() + except KeyError: + print(i[1] + ' skipped') + + +def main(): + book = EPUB(sys.argv[1]) + book.read_book() + book.parse_chapters() + +if __name__ == '__main__': + main() diff --git a/parsers/epub.py b/parsers/epub.py new file mode 100644 index 0000000..42a8122 --- /dev/null +++ b/parsers/epub.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 + +# This file is a part of Lector, a Qt based ebook reader +# Copyright (C) 2017 BasioMeusPuga + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import os +import sys +import zipfile + +from ePub.read_epub import EPUB + + +class ParseEPUB: + def __init__(self, filename, temp_dir, file_md5): + # TODO + # Maybe also include book description + self.book_ref = None + self.book = None + self.temp_dir = temp_dir + self.filename = filename + self.file_md5 = file_md5 + + def read_book(self): + self.book_ref = EPUB(self.filename) + contents_path = self.book_ref.get_file_path('content.opf') + self.book_ref.generate_book(contents_path) + self.book_ref.parse_toc() + self.book = self.book_ref.book + + def get_title(self): + return self.book['title'] + + def get_author(self): + return self.book['author'] + + def get_year(self): + return 9999 + + def get_cover_image(self): + try: + return self.book['cover'] + except KeyError: + return None + + def get_isbn(self): + return self.book['isbn'] + + def get_tags(self): + return None + + def get_contents(self): + extract_path = os.path.join(self.temp_dir, self.file_md5) + zipfile.ZipFile(self.filename).extractall(extract_path) + + self.book_ref.parse_chapters() + file_settings = { + 'images_only': False} + return self.book['navpoint_dict'], file_settings + +class HidePrinting: + def __enter__(self): + self._original_stdout = sys.stdout + sys.stdout = None + + def __exit__(self, exc_type, exc_val, exc_tb): + sys.stdout = self._original_stdout diff --git a/sorter.py b/sorter.py index 5ed3f24..f4bbe0d 100644 --- a/sorter.py +++ b/sorter.py @@ -47,9 +47,10 @@ import database from parsers.ebook import ParseEBook from parsers.cbz import ParseCBZ from parsers.cbr import ParseCBR +from parsers.epub import ParseEPUB sorter = { - 'epub': ParseEBook, + 'epub': ParseEPUB, 'mobi': ParseEBook, 'azw': ParseEBook, 'cbz': ParseCBZ,