Implement .mobi parser

Eliminate dependency on ebooklib
This commit is contained in:
BasioMeusPuga
2018-03-10 17:35:02 +05:30
parent ed8f676a05
commit d7d49897f1
4 changed files with 145 additions and 29 deletions

View File

@@ -57,7 +57,7 @@ class EPUB:
try: try:
this_xml = self.zip_file.read(filename).decode() this_xml = self.zip_file.read(filename).decode()
except KeyError: except KeyError:
print('File not found in zip') print(str(filename) + ' not found in zip')
return return
root = BeautifulSoup(this_xml, parser) root = BeautifulSoup(this_xml, parser)
@@ -73,8 +73,8 @@ class EPUB:
container_location = self.get_file_path('container.xml') container_location = self.get_file_path('container.xml')
xml = self.parse_xml(container_location, 'xml') xml = self.parse_xml(container_location, 'xml')
root_item = xml.find('rootfile') if xml:
if root_item: root_item = xml.find('rootfile')
return root_item.get('full-path') return root_item.get('full-path')
else: else:
possible_filenames = ('content.opf', 'package.opf') possible_filenames = ('content.opf', 'package.opf')
@@ -152,14 +152,14 @@ class EPUB:
media_type = i.get('media-type') media_type = i.get('media-type')
this_id = i.get('id') this_id = i.get('id')
if media_type == 'application/xhtml+xml': if media_type == 'application/xhtml+xml' or media_type == 'text/html':
self.book['content_dict'][this_id] = i.get('href') self.book['content_dict'][this_id] = i.get('href')
if media_type == 'application/x-dtbncx+xml': if media_type == 'application/x-dtbncx+xml':
self.book['toc_file'] = i.get('href') self.book['toc_file'] = i.get('href')
# Cover image # Cover image
if this_id.startswith('cover') and media_type.split('/')[0] == 'image': if 'cover' in this_id and media_type.split('/')[0] == 'image':
cover_href = i.get('href') cover_href = i.get('href')
try: try:
self.book['cover'] = self.zip_file.read(cover_href) self.book['cover'] = self.zip_file.read(cover_href)
@@ -175,7 +175,7 @@ class EPUB:
biggest_image_size = 0 biggest_image_size = 0
biggest_image = None biggest_image = None
for j in self.zip_file.filelist: for j in self.zip_file.filelist:
if os.path.splitext(j.filename)[1] in ['.jpg', '.png', '.gif']: if os.path.splitext(j.filename)[1] in ['.jpg', '.jpeg', '.png', '.gif']:
if j.file_size > biggest_image_size: if j.file_size > biggest_image_size:
biggest_image = j.filename biggest_image = j.filename
biggest_image_size = j.file_size biggest_image_size = j.file_size
@@ -185,9 +185,6 @@ class EPUB:
else: else:
print('No cover found for: ' + self.filename) print('No cover found for: ' + self.filename)
with open('cover', 'wb') as this_cover:
this_cover.write(self.book['cover'])
# Parse spine and arrange chapter paths acquired from the opf # Parse spine and arrange chapter paths acquired from the opf
# according to the order IN THE SPINE # according to the order IN THE SPINE
spine_items = xml.find_all('itemref') spine_items = xml.find_all('itemref')
@@ -221,19 +218,47 @@ class EPUB:
chapter_source = unquote(chapter_source.split('#')[0]) chapter_source = unquote(chapter_source.split('#')[0])
self.book['navpoint_dict'][chapter_source] = chapter_title self.book['navpoint_dict'][chapter_source] = chapter_title
def parse_chapters(self): def parse_chapters(self, split_large_xml=False):
no_title_chapter = 1 no_title_chapter = 1
self.book['book_list'] = [] self.book['book_list'] = []
for i in self.book['chapters_in_order']: for i in self.book['chapters_in_order']:
chapter_data = self.read_from_zip(i).decode() chapter_data = self.read_from_zip(i).decode()
try:
self.book['book_list'].append( if not split_large_xml:
(self.book['navpoint_dict'][i], chapter_data)) try:
except KeyError: self.book['book_list'].append(
fallback_title = str(no_title_chapter) + ': No Title' (self.book['navpoint_dict'][i], chapter_data))
self.book['book_list'].append( except KeyError:
(fallback_title, chapter_data)) fallback_title = str(no_title_chapter)
no_title_chapter += 1 self.book['book_list'].append(
(fallback_title, chapter_data))
no_title_chapter += 1
else:
# https://stackoverflow.com/questions/14444732/how-to-split-a-html-page-to-multiple-pages-using-python-and-beautiful-soup
markup = BeautifulSoup(chapter_data, 'xml')
chapters = []
pagebreaks = markup.find_all('pagebreak')
def next_element(elem):
while elem is not None:
elem = elem.next_sibling
if hasattr(elem, 'name'):
return elem
for pbreak in pagebreaks:
chapter = [str(pbreak)]
elem = next_element(pbreak)
while elem and elem.name != 'pagebreak':
chapter.append(str(elem))
elem = next_element(elem)
chapters.append('\n'.join(chapter))
for this_chapter in chapters:
fallback_title = str(no_title_chapter)
self.book['book_list'].append(
(fallback_title, this_chapter))
no_title_chapter += 1
def main(): def main():
book = EPUB(sys.argv[1]) book = EPUB(sys.argv[1])

View File

@@ -67,11 +67,3 @@ class ParseEPUB:
file_settings = { file_settings = {
'images_only': False} 'images_only': False}
return self.book['book_list'], file_settings return self.book['book_list'], file_settings
class HidePrinting:
def __enter__(self):
self._original_stdout = sys.stdout
sys.stdout = None
def __exit__(self, exc_type, exc_val, exc_tb):
sys.stdout = self._original_stdout

96
parsers/mobi.py Normal file
View File

@@ -0,0 +1,96 @@
#!/usr/bin/env python3
# This file is a part of Lector, a Qt based ebook reader
# Copyright (C) 2017 BasioMeusPuga
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# This module parses Amazon ebooks using KindleUnpack to first create an
# epub that is then read the usual way
import os
import sys
import shutil
import zipfile
from ePub.read_epub import EPUB
import KindleUnpack.kindleunpack as KindleUnpack
class ParseMOBI:
    """Parse an Amazon ebook by unpacking it with KindleUnpack and then
    reading the resulting epub through the EPUB reader.

    Call ``read_book()`` first; the ``get_*`` accessors and
    ``get_contents()`` read from the ``book`` dictionary it populates.
    """

    def __init__(self, filename, temp_dir, file_md5):
        """Store the source path and derive the extraction directory.

        Args:
            filename: Path of the ebook file to parse.
            temp_dir: Directory under which the book is unpacked.
            file_md5: Name of the per-book subdirectory inside temp_dir.
        """
        self.book_ref = None
        self.book = None
        self.filename = filename
        self.epub_filepath = None
        self.split_large_xml = False
        self.extract_dir = os.path.join(temp_dir, file_md5)

    def read_book(self):
        """Unpack the book and load its metadata into ``self.book``.

        If the EPUB reader reports that no contents were found, a message
        is printed and ``self.book`` is left as None.
        """
        # Silence whatever KindleUnpack prints while unpacking
        with HidePrinting():
            KindleUnpack.unpackBook(self.filename, self.extract_dir)

        epub_filename = os.path.splitext(
            os.path.basename(self.filename))[0] + '.epub'
        self.epub_filepath = os.path.join(
            self.extract_dir, 'mobi8', epub_filename)

        if not os.path.exists(self.epub_filepath):
            # No epub under mobi8: zip up the mobi7 directory instead and
            # read that archive. make_archive returns the path of the
            # archive it created, which is what gets opened below.
            zip_dir = os.path.join(self.extract_dir, 'mobi7')
            zip_file = os.path.join(
                self.extract_dir, epub_filename)
            self.epub_filepath = shutil.make_archive(zip_file, 'zip', zip_dir)
            # Passed on to EPUB.parse_chapters() by get_contents()
            self.split_large_xml = True

        self.book_ref = EPUB(self.epub_filepath)
        contents_found = self.book_ref.read_epub()
        if not contents_found:
            print('Cannot process: ' + self.filename)
            return
        self.book = self.book_ref.book

    def get_title(self):
        """Return the 'title' entry of the parsed book."""
        return self.book['title']

    def get_author(self):
        """Return the 'author' entry of the parsed book."""
        return self.book['author']

    def get_year(self):
        """Return the 'year' entry of the parsed book."""
        return self.book['year']

    def get_cover_image(self):
        """Return the 'cover' entry of the parsed book."""
        return self.book['cover']

    def get_isbn(self):
        """Return the 'isbn' entry of the parsed book."""
        return self.book['isbn']

    def get_tags(self):
        """Return the 'tags' entry of the parsed book."""
        return self.book['tags']

    def get_contents(self):
        """Extract the archive and return the chapter list.

        Returns:
            A tuple of (``self.book['book_list']``, file settings dict).
        """
        # Use a context manager so the archive's file handle is closed
        # as soon as extraction finishes
        with zipfile.ZipFile(self.epub_filepath) as epub_archive:
            epub_archive.extractall(self.extract_dir)

        self.book_ref.parse_chapters(self.split_large_xml)
        file_settings = {
            'images_only': False}
        return self.book['book_list'], file_settings
class HidePrinting:
    """Context manager that discards everything written to sys.stdout.

    stdout is pointed at the null device rather than set to None, so that
    wrapped code calling sys.stdout.write() or flush() directly keeps
    working instead of raising AttributeError.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore stdout first so it is back in place even if closing fails;
        # returning None lets any exception from the body propagate
        sys.stdout = self._original_stdout
        self._devnull.close()

View File

@@ -44,15 +44,18 @@ from PyQt5 import QtCore, QtGui
import database import database
from parsers.ebook import ParseEBook
from parsers.cbz import ParseCBZ from parsers.cbz import ParseCBZ
from parsers.cbr import ParseCBR from parsers.cbr import ParseCBR
from parsers.epub import ParseEPUB from parsers.epub import ParseEPUB
from parsers.mobi import ParseMOBI
sorter = { sorter = {
'epub': ParseEPUB, 'epub': ParseEPUB,
'mobi': ParseEBook, 'mobi': ParseMOBI,
'azw': ParseEBook, 'azw': ParseMOBI,
'azw3': ParseMOBI,
'azw4': ParseMOBI,
'prc': ParseMOBI,
'cbz': ParseCBZ, 'cbz': ParseCBZ,
'cbr': ParseCBR,} 'cbr': ParseCBR,}