diff --git a/ePub/read_epub.py b/ePub/read_epub.py index 9104e62..514445b 100644 --- a/ePub/read_epub.py +++ b/ePub/read_epub.py @@ -57,7 +57,7 @@ class EPUB: try: this_xml = self.zip_file.read(filename).decode() except KeyError: - print('File not found in zip') + print(str(filename) + ' not found in zip') return root = BeautifulSoup(this_xml, parser) @@ -73,8 +73,8 @@ class EPUB: container_location = self.get_file_path('container.xml') xml = self.parse_xml(container_location, 'xml') - root_item = xml.find('rootfile') - if root_item: + if xml: + root_item = xml.find('rootfile') return root_item.get('full-path') else: possible_filenames = ('content.opf', 'package.opf') @@ -152,14 +152,14 @@ class EPUB: media_type = i.get('media-type') this_id = i.get('id') - if media_type == 'application/xhtml+xml': + if media_type == 'application/xhtml+xml' or media_type == 'text/html': self.book['content_dict'][this_id] = i.get('href') if media_type == 'application/x-dtbncx+xml': self.book['toc_file'] = i.get('href') # Cover image - if this_id.startswith('cover') and media_type.split('/')[0] == 'image': + if 'cover' in this_id and media_type.split('/')[0] == 'image': cover_href = i.get('href') try: self.book['cover'] = self.zip_file.read(cover_href) @@ -175,7 +175,7 @@ class EPUB: biggest_image_size = 0 biggest_image = None for j in self.zip_file.filelist: - if os.path.splitext(j.filename)[1] in ['.jpg', '.png', '.gif']: + if os.path.splitext(j.filename)[1] in ['.jpg', '.jpeg', '.png', '.gif']: if j.file_size > biggest_image_size: biggest_image = j.filename biggest_image_size = j.file_size @@ -185,9 +185,6 @@ class EPUB: else: print('No cover found for: ' + self.filename) - with open('cover', 'wb') as this_cover: - this_cover.write(self.book['cover']) - # Parse spine and arrange chapter paths acquired from the opf # according to the order IN THE SPINE spine_items = xml.find_all('itemref') @@ -221,19 +218,47 @@ class EPUB: chapter_source = unquote(chapter_source.split('#')[0]) 
def parse_chapters(self, split_large_xml=False):
    """Fill ``self.book['book_list']`` with ``(title, content)`` tuples.

    Spine entries are taken in order from ``self.book['chapters_in_order']``
    and read through ``self.read_from_zip``.

    Args:
        split_large_xml: When False, every spine entry becomes one chapter,
            titled from ``self.book['navpoint_dict']`` when it has an entry
            for that path and with a running number otherwise. When True,
            every spine entry is split at its ``pagebreak`` elements and
            each piece becomes its own numbered chapter.
    """
    def next_element(elem):
        # Walk forward through the siblings until one exposes a ``name``
        # attribute; falls off the end (returning None) when none is left.
        while elem is not None:
            elem = elem.next_sibling
            if hasattr(elem, 'name'):
                return elem

    no_title_chapter = 1
    self.book['book_list'] = []
    for i in self.book['chapters_in_order']:
        chapter_data = self.read_from_zip(i).decode()

        if not split_large_xml:
            try:
                chapter_title = self.book['navpoint_dict'][i]
            except KeyError:
                # No TOC entry for this file: fall back to a running number
                chapter_title = str(no_title_chapter)
                no_title_chapter += 1
            self.book['book_list'].append((chapter_title, chapter_data))
            continue

        # https://stackoverflow.com/questions/14444732/how-to-split-a-html-page-to-multiple-pages-using-python-and-beautiful-soup
        markup = BeautifulSoup(chapter_data, 'xml')
        pagebreaks = markup.find_all('pagebreak')

        if pagebreaks:
            # NOTE(review): siblings that precede the first pagebreak are
            # not part of any piece collected below - confirm that is intended.
            chapters = []
            for pbreak in pagebreaks:
                chapter = [str(pbreak)]
                elem = next_element(pbreak)
                while elem and elem.name != 'pagebreak':
                    chapter.append(str(elem))
                    elem = next_element(elem)
                chapters.append('\n'.join(chapter))
        else:
            # Nothing to split on: keep the document as a single chapter
            # instead of silently dropping its content.
            chapters = [chapter_data]

        for this_chapter in chapters:
            self.book['book_list'].append(
                (str(no_title_chapter), this_chapter))
            no_title_chapter += 1
class ParseMOBI:
    """Parse Amazon ebooks by unpacking them with KindleUnpack and then
    reading the resulting archive through the EPUB reader."""

    def __init__(self, filename, temp_dir, file_md5):
        """Remember the source file and derive its extraction directory.

        Args:
            filename: Path of the ebook to parse.
            temp_dir: Directory under which the book gets unpacked.
            file_md5: Name of the per-book subdirectory inside ``temp_dir``.
        """
        self.book_ref = None  # EPUB reader instance, set by read_book()
        self.book = None  # Metadata dictionary, set by read_book() on success
        self.filename = filename
        self.epub_filepath = None
        self.split_large_xml = False
        self.extract_dir = os.path.join(temp_dir, file_md5)

    def read_book(self):
        """Unpack the book and load it through ``EPUB``.

        On success ``self.book`` holds the reader's ``book`` dictionary.
        When the reader reports no contents, a message is printed and
        ``self.book`` is left untouched.
        """
        with HidePrinting():
            KindleUnpack.unpackBook(self.filename, self.extract_dir)

        epub_filename = os.path.splitext(
            os.path.basename(self.filename))[0] + '.epub'

        self.epub_filepath = os.path.join(
            self.extract_dir, 'mobi8', epub_filename)
        if not os.path.exists(self.epub_filepath):
            # No epub under 'mobi8': zip up the 'mobi7' directory instead
            # and ask parse_chapters() to split its documents later on.
            zip_dir = os.path.join(self.extract_dir, 'mobi7')
            zip_file = os.path.join(
                self.extract_dir, epub_filename)
            self.epub_filepath = shutil.make_archive(zip_file, 'zip', zip_dir)
            self.split_large_xml = True

        self.book_ref = EPUB(self.epub_filepath)
        contents_found = self.book_ref.read_epub()
        if not contents_found:
            print('Cannot process: ' + self.filename)
            return
        self.book = self.book_ref.book

    def get_title(self):
        """Return the 'title' entry of the parsed book."""
        return self.book['title']

    def get_author(self):
        """Return the 'author' entry of the parsed book."""
        return self.book['author']

    def get_year(self):
        """Return the 'year' entry of the parsed book."""
        return self.book['year']

    def get_cover_image(self):
        """Return the 'cover' entry of the parsed book."""
        return self.book['cover']

    def get_isbn(self):
        """Return the 'isbn' entry of the parsed book."""
        return self.book['isbn']

    def get_tags(self):
        """Return the 'tags' entry of the parsed book."""
        return self.book['tags']

    def get_contents(self):
        """Extract the archive and return ``(book_list, file_settings)``.

        The archive at ``self.epub_filepath`` is extracted into
        ``self.extract_dir`` before the chapters are parsed.
        """
        # Context manager so the archive handle is closed after extraction
        with zipfile.ZipFile(self.epub_filepath) as archive:
            archive.extractall(self.extract_dir)

        self.book_ref.parse_chapters(self.split_large_xml)
        file_settings = {
            'images_only': False}
        return self.book['book_list'], file_settings


class HidePrinting:
    """Context manager that discards everything written to ``sys.stdout``."""

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Use a real sink rather than None so that code calling
        # sys.stdout.write() / flush() directly keeps working.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first so stdout is usable even if closing the sink fails
        sys.stdout = self._original_stdout
        self._devnull.close()