Implement .mobi parser

Eliminate dependency on ebooklib
2018-03-10 17:35:02 +05:30
parent ed8f676a05
commit d7d49897f1
4 changed files with 145 additions and 29 deletions
@@ -57,7 +57,7 @@ class EPUB:
        try:
            this_xml = self.zip_file.read(filename).decode()
        except KeyError:
-            print('File not found in zip')
+            print(str(filename) + ' not found in zip')
            return

        root = BeautifulSoup(this_xml, parser)
@@ -73,8 +73,8 @@ class EPUB:
            container_location = self.get_file_path('container.xml')
            xml = self.parse_xml(container_location, 'xml')

-            root_item = xml.find('rootfile')
-            if root_item:
+            if xml:
+                root_item = xml.find('rootfile')
                return root_item.get('full-path')
            else:
                possible_filenames = ('content.opf', 'package.opf')
@@ -152,14 +152,14 @@ class EPUB:
            media_type = i.get('media-type')
            this_id = i.get('id')

-            if media_type == 'application/xhtml+xml':
+            if media_type == 'application/xhtml+xml' or media_type == 'text/html':
                self.book['content_dict'][this_id] = i.get('href')

            if media_type == 'application/x-dtbncx+xml':
                self.book['toc_file'] = i.get('href')

            # Cover image
-            if this_id.startswith('cover') and media_type.split('/')[0] == 'image':
+            if 'cover' in this_id and media_type.split('/')[0] == 'image':
                cover_href = i.get('href')
                try:
                    self.book['cover'] = self.zip_file.read(cover_href)
@@ -175,7 +175,7 @@ class EPUB:
            biggest_image_size = 0
            biggest_image = None
            for j in self.zip_file.filelist:
-                if os.path.splitext(j.filename)[1] in ['.jpg', '.png', '.gif']:
+                if os.path.splitext(j.filename)[1] in ['.jpg', '.jpeg', '.png', '.gif']:
                    if j.file_size > biggest_image_size:
                        biggest_image = j.filename
                        biggest_image_size = j.file_size
@@ -185,9 +185,6 @@ class EPUB:
            else:
                print('No cover found for: ' + self.filename)

-        with open('cover', 'wb') as this_cover:
-            this_cover.write(self.book['cover'])
-
        # Parse spine and arrange chapter paths acquired from the opf
        # according to the order IN THE SPINE
        spine_items = xml.find_all('itemref')
@@ -221,19 +218,47 @@ class EPUB:
            chapter_source = unquote(chapter_source.split('#')[0])
            self.book['navpoint_dict'][chapter_source] = chapter_title

-    def parse_chapters(self):
+    def parse_chapters(self, split_large_xml=False):
        no_title_chapter = 1
        self.book['book_list'] = []
        for i in self.book['chapters_in_order']:
            chapter_data = self.read_from_zip(i).decode()
-            try:
-                self.book['book_list'].append(
-                    (self.book['navpoint_dict'][i], chapter_data))
-            except KeyError:
-                fallback_title = str(no_title_chapter) + ': No Title'
-                self.book['book_list'].append(
-                    (fallback_title, chapter_data))
-            no_title_chapter += 1
+
+            if not split_large_xml:
+                try:
+                    self.book['book_list'].append(
+                        (self.book['navpoint_dict'][i], chapter_data))
+                except KeyError:
+                    fallback_title = str(no_title_chapter)
+                    self.book['book_list'].append(
+                        (fallback_title, chapter_data))
+                no_title_chapter += 1
+
+            else:
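+                # Split the markup at each <pagebreak> tag, collecting the siblings up to the next one as a chapter; see https://stackoverflow.com/questions/14444732/how-to-split-a-html-page-to-multiple-pages-using-python-and-beautiful-soup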
+                markup = BeautifulSoup(chapter_data, 'xml')
+                chapters = []
+                pagebreaks = markup.find_all('pagebreak')
+
+                def next_element(elem):
+                    while elem is not None:
+                        elem = elem.next_sibling
+                        if hasattr(elem, 'name'):
+                            return elem
+
+                for pbreak in pagebreaks:
+                    chapter = [str(pbreak)]
+                    elem = next_element(pbreak)
+                    while elem and elem.name != 'pagebreak':
+                        chapter.append(str(elem))
+                        elem = next_element(elem)
+                    chapters.append('\n'.join(chapter))
+
+                for this_chapter in chapters:
+                    fallback_title = str(no_title_chapter)
+                    self.book['book_list'].append(
+                        (fallback_title, this_chapter))
+                    no_title_chapter += 1

 def main():
    book = EPUB(sys.argv[1])
@@ -67,11 +67,3 @@ class ParseEPUB:
        file_settings = {
            'images_only': False}
        return self.book['book_list'], file_settings
-
-class HidePrinting:
-    def __enter__(self):
-        self._original_stdout = sys.stdout
-        sys.stdout = None
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        sys.stdout = self._original_stdout
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+# This file is a part of Lector, a Qt based ebook reader
+# Copyright (C) 2017 BasioMeusPuga
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# This module parses Amazon ebooks by unpacking them with KindleUnpack into an
+# epub (or a zip of the mobi7 output) that is then read by the EPUB class
+
+import os
+import sys
+import shutil
+import zipfile
+
+from ePub.read_epub import EPUB
+import KindleUnpack.kindleunpack as KindleUnpack
+
+
+class ParseMOBI:
+    def __init__(self, filename, temp_dir, file_md5):
+        self.book_ref = None
+        self.book = None
+        self.filename = filename
+        self.epub_filepath = None
+        self.split_large_xml = False
+        self.extract_dir = os.path.join(temp_dir, file_md5)
+
+    def read_book(self):
+        with HidePrinting():
+            KindleUnpack.unpackBook(self.filename, self.extract_dir)
+
+        epub_filename = os.path.splitext(
+            os.path.basename(self.filename))[0] + '.epub'
+
+        self.epub_filepath = os.path.join(
+            self.extract_dir, 'mobi8', epub_filename)
+        if not os.path.exists(self.epub_filepath):
+            zip_dir = os.path.join(self.extract_dir, 'mobi7')
+            zip_file = os.path.join(
+                self.extract_dir, epub_filename)
+            self.epub_filepath = shutil.make_archive(zip_file, 'zip', zip_dir)
+            self.split_large_xml = True
+
+        self.book_ref = EPUB(self.epub_filepath)
+        contents_found = self.book_ref.read_epub()
+        if not contents_found:
+            print('Cannot process: ' + self.filename)
+            return
+        self.book = self.book_ref.book
+
+    def get_title(self):
+        return self.book['title']
+
+    def get_author(self):
+        return self.book['author']
+
+    def get_year(self):
+        return self.book['year']
+
+    def get_cover_image(self):
+        return self.book['cover']
+
+    def get_isbn(self):
+        return self.book['isbn']
+
+    def get_tags(self):
+        return self.book['tags']
+
+    def get_contents(self):
+        extract_path = os.path.join(self.extract_dir)
+        zipfile.ZipFile(self.epub_filepath).extractall(extract_path)
+
+        self.book_ref.parse_chapters(self.split_large_xml)
+        file_settings = {
+            'images_only': False}
+        return self.book['book_list'], file_settings
+
+class HidePrinting:
+    def __enter__(self):
+        self._original_stdout = sys.stdout
+        sys.stdout = None
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        sys.stdout = self._original_stdout
@@ -44,15 +44,18 @@ from PyQt5 import QtCore, QtGui

 import database

-from parsers.ebook import ParseEBook
 from parsers.cbz import ParseCBZ
 from parsers.cbr import ParseCBR
 from parsers.epub import ParseEPUB
+from parsers.mobi import ParseMOBI

 sorter = {
    'epub': ParseEPUB,
-    'mobi': ParseEBook,
-    'azw': ParseEBook,
+    'mobi': ParseMOBI,
+    'azw': ParseMOBI,
+    'azw3': ParseMOBI,
+    'azw4': ParseMOBI,
+    'prc': ParseMOBI,
    'cbz': ParseCBZ,
    'cbr': ParseCBR,}