Implement .mobi parser
Eliminate dependency on ebooklib
This commit is contained in:
@@ -57,7 +57,7 @@ class EPUB:
|
|||||||
try:
|
try:
|
||||||
this_xml = self.zip_file.read(filename).decode()
|
this_xml = self.zip_file.read(filename).decode()
|
||||||
except KeyError:
|
except KeyError:
|
||||||
print('File not found in zip')
|
print(str(filename) + ' not found in zip')
|
||||||
return
|
return
|
||||||
|
|
||||||
root = BeautifulSoup(this_xml, parser)
|
root = BeautifulSoup(this_xml, parser)
|
||||||
@@ -73,8 +73,8 @@ class EPUB:
|
|||||||
container_location = self.get_file_path('container.xml')
|
container_location = self.get_file_path('container.xml')
|
||||||
xml = self.parse_xml(container_location, 'xml')
|
xml = self.parse_xml(container_location, 'xml')
|
||||||
|
|
||||||
|
if xml:
|
||||||
root_item = xml.find('rootfile')
|
root_item = xml.find('rootfile')
|
||||||
if root_item:
|
|
||||||
return root_item.get('full-path')
|
return root_item.get('full-path')
|
||||||
else:
|
else:
|
||||||
possible_filenames = ('content.opf', 'package.opf')
|
possible_filenames = ('content.opf', 'package.opf')
|
||||||
@@ -152,14 +152,14 @@ class EPUB:
|
|||||||
media_type = i.get('media-type')
|
media_type = i.get('media-type')
|
||||||
this_id = i.get('id')
|
this_id = i.get('id')
|
||||||
|
|
||||||
if media_type == 'application/xhtml+xml':
|
if media_type == 'application/xhtml+xml' or media_type == 'text/html':
|
||||||
self.book['content_dict'][this_id] = i.get('href')
|
self.book['content_dict'][this_id] = i.get('href')
|
||||||
|
|
||||||
if media_type == 'application/x-dtbncx+xml':
|
if media_type == 'application/x-dtbncx+xml':
|
||||||
self.book['toc_file'] = i.get('href')
|
self.book['toc_file'] = i.get('href')
|
||||||
|
|
||||||
# Cover image
|
# Cover image
|
||||||
if this_id.startswith('cover') and media_type.split('/')[0] == 'image':
|
if 'cover' in this_id and media_type.split('/')[0] == 'image':
|
||||||
cover_href = i.get('href')
|
cover_href = i.get('href')
|
||||||
try:
|
try:
|
||||||
self.book['cover'] = self.zip_file.read(cover_href)
|
self.book['cover'] = self.zip_file.read(cover_href)
|
||||||
@@ -175,7 +175,7 @@ class EPUB:
|
|||||||
biggest_image_size = 0
|
biggest_image_size = 0
|
||||||
biggest_image = None
|
biggest_image = None
|
||||||
for j in self.zip_file.filelist:
|
for j in self.zip_file.filelist:
|
||||||
if os.path.splitext(j.filename)[1] in ['.jpg', '.png', '.gif']:
|
if os.path.splitext(j.filename)[1] in ['.jpg', '.jpeg', '.png', '.gif']:
|
||||||
if j.file_size > biggest_image_size:
|
if j.file_size > biggest_image_size:
|
||||||
biggest_image = j.filename
|
biggest_image = j.filename
|
||||||
biggest_image_size = j.file_size
|
biggest_image_size = j.file_size
|
||||||
@@ -185,9 +185,6 @@ class EPUB:
|
|||||||
else:
|
else:
|
||||||
print('No cover found for: ' + self.filename)
|
print('No cover found for: ' + self.filename)
|
||||||
|
|
||||||
with open('cover', 'wb') as this_cover:
|
|
||||||
this_cover.write(self.book['cover'])
|
|
||||||
|
|
||||||
# Parse spine and arrange chapter paths acquired from the opf
|
# Parse spine and arrange chapter paths acquired from the opf
|
||||||
# according to the order IN THE SPINE
|
# according to the order IN THE SPINE
|
||||||
spine_items = xml.find_all('itemref')
|
spine_items = xml.find_all('itemref')
|
||||||
@@ -221,20 +218,48 @@ class EPUB:
|
|||||||
chapter_source = unquote(chapter_source.split('#')[0])
|
chapter_source = unquote(chapter_source.split('#')[0])
|
||||||
self.book['navpoint_dict'][chapter_source] = chapter_title
|
self.book['navpoint_dict'][chapter_source] = chapter_title
|
||||||
|
|
||||||
def parse_chapters(self, split_large_xml=False):
    """Fill self.book['book_list'] with (title, content) tuples.

    Each path in self.book['chapters_in_order'] is read through
    self.read_from_zip() and decoded to text.

    split_large_xml=False (default):
        One entry per file. The title is looked up in
        self.book['navpoint_dict']; files without a navpoint get a
        running number as their title.

    split_large_xml=True:
        Each file is parsed with BeautifulSoup and cut into sections at
        every 'pagebreak' tag. Every section gets a running number as
        its title. A file that has no pagebreak at all is kept whole as
        a single section instead of being dropped.
    """
    def next_element(elem):
        # Step to the following sibling that exposes a 'name' attribute;
        # falls through (returning None) once the siblings run out
        while elem is not None:
            elem = elem.next_sibling
            if hasattr(elem, 'name'):
                return elem

    no_title_chapter = 1
    self.book['book_list'] = []

    for i in self.book['chapters_in_order']:
        chapter_data = self.read_from_zip(i).decode()

        if not split_large_xml:
            try:
                chapter_title = self.book['navpoint_dict'][i]
            except KeyError:
                chapter_title = str(no_title_chapter)
                no_title_chapter += 1
            self.book['book_list'].append((chapter_title, chapter_data))
            continue

        # https://stackoverflow.com/questions/14444732/how-to-split-a-html-page-to-multiple-pages-using-python-and-beautiful-soup
        markup = BeautifulSoup(chapter_data, 'xml')
        pagebreaks = markup.find_all('pagebreak')

        if pagebreaks:
            chapters = []
            for pbreak in pagebreaks:
                # A section is the pagebreak itself plus every sibling
                # up to (not including) the next pagebreak
                chapter = [str(pbreak)]
                elem = next_element(pbreak)
                while elem and elem.name != 'pagebreak':
                    chapter.append(str(elem))
                    elem = next_element(elem)
                chapters.append('\n'.join(chapter))
        else:
            # Nothing to split on: keep the file's content rather than
            # silently losing it
            chapters = [chapter_data]

        for this_chapter in chapters:
            fallback_title = str(no_title_chapter)
            self.book['book_list'].append(
                (fallback_title, this_chapter))
            no_title_chapter += 1
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
book = EPUB(sys.argv[1])
|
book = EPUB(sys.argv[1])
|
||||||
book.read_epub()
|
book.read_epub()
|
||||||
|
@@ -67,11 +67,3 @@ class ParseEPUB:
|
|||||||
file_settings = {
|
file_settings = {
|
||||||
'images_only': False}
|
'images_only': False}
|
||||||
return self.book['book_list'], file_settings
|
return self.book['book_list'], file_settings
|
||||||
|
|
||||||
class HidePrinting:
|
|
||||||
def __enter__(self):
|
|
||||||
self._original_stdout = sys.stdout
|
|
||||||
sys.stdout = None
|
|
||||||
|
|
||||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
||||||
sys.stdout = self._original_stdout
|
|
||||||
|
96
parsers/mobi.py
Normal file
96
parsers/mobi.py
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
# This file is a part of Lector, a Qt based ebook reader
|
||||||
|
# Copyright (C) 2017 BasioMeusPuga
|
||||||
|
|
||||||
|
# This program is free software: you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation, either version 3 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
# This module parses Amazon ebooks using KindleUnpack to first create an
|
||||||
|
# epub that is then read the usual way
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import shutil
|
||||||
|
import zipfile
|
||||||
|
|
||||||
|
from ePub.read_epub import EPUB
|
||||||
|
import KindleUnpack.kindleunpack as KindleUnpack
|
||||||
|
|
||||||
|
|
||||||
|
class ParseMOBI:
    """Parser for Amazon ebooks.

    The book is unpacked with KindleUnpack into
    <temp_dir>/<file_md5>, and the resulting epub is then read through
    the EPUB class.
    """

    def __init__(self, filename, temp_dir, file_md5):
        """Store the book's path and derive its extraction directory.

        filename -- path of the book that will be unpacked
        temp_dir -- directory under which extracted files are placed
        file_md5 -- name of this book's subdirectory inside temp_dir
        """
        self.book_ref = None
        self.book = None
        self.filename = filename
        self.epub_filepath = None
        self.split_large_xml = False
        self.extract_dir = os.path.join(temp_dir, file_md5)

    def read_book(self):
        """Unpack the book and load its metadata into self.book.

        self.book stays None when EPUB.read_epub() reports that no
        contents were found; the get_* methods must not be called in
        that case.
        """
        # KindleUnpack's console output is suppressed while it runs
        with HidePrinting():
            KindleUnpack.unpackBook(self.filename, self.extract_dir)

        epub_filename = os.path.splitext(
            os.path.basename(self.filename))[0] + '.epub'

        self.epub_filepath = os.path.join(
            self.extract_dir, 'mobi8', epub_filename)
        if not os.path.exists(self.epub_filepath):
            # No epub under mobi8: zip up the mobi7 directory instead and
            # have the chapters split later on
            zip_dir = os.path.join(self.extract_dir, 'mobi7')
            zip_file = os.path.join(
                self.extract_dir, epub_filename)
            self.epub_filepath = shutil.make_archive(zip_file, 'zip', zip_dir)
            self.split_large_xml = True

        self.book_ref = EPUB(self.epub_filepath)
        contents_found = self.book_ref.read_epub()
        if not contents_found:
            print('Cannot process: ' + self.filename)
            return
        self.book = self.book_ref.book

    def get_title(self):
        """Return self.book['title']."""
        return self.book['title']

    def get_author(self):
        """Return self.book['author']."""
        return self.book['author']

    def get_year(self):
        """Return self.book['year']."""
        return self.book['year']

    def get_cover_image(self):
        """Return self.book['cover']."""
        return self.book['cover']

    def get_isbn(self):
        """Return self.book['isbn']."""
        return self.book['isbn']

    def get_tags(self):
        """Return self.book['tags']."""
        return self.book['tags']

    def get_contents(self):
        """Extract the epub into self.extract_dir and parse its chapters.

        Returns a tuple of (self.book['book_list'], file_settings), where
        file_settings is {'images_only': False}.
        """
        # The context manager closes the archive once extraction is done
        with zipfile.ZipFile(self.epub_filepath) as epub_zip:
            epub_zip.extractall(self.extract_dir)

        self.book_ref.parse_chapters(self.split_large_xml)
        file_settings = {
            'images_only': False}
        return self.book['book_list'], file_settings
|
||||||
|
|
||||||
|
class HidePrinting:
    """Context manager that discards everything written to sys.stdout.

    Output is sent to os.devnull rather than replacing sys.stdout with
    None, so that code calling sys.stdout.write() or sys.stdout.flush()
    directly keeps working inside the block. The previous sys.stdout is
    put back on exit, whether or not an exception was raised; exceptions
    are not suppressed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout = self._original_stdout
        self._devnull.close()
|
@@ -44,15 +44,18 @@ from PyQt5 import QtCore, QtGui
|
|||||||
|
|
||||||
import database
|
import database
|
||||||
|
|
||||||
from parsers.ebook import ParseEBook
|
|
||||||
from parsers.cbz import ParseCBZ
|
from parsers.cbz import ParseCBZ
|
||||||
from parsers.cbr import ParseCBR
|
from parsers.cbr import ParseCBR
|
||||||
from parsers.epub import ParseEPUB
|
from parsers.epub import ParseEPUB
|
||||||
|
from parsers.mobi import ParseMOBI
|
||||||
|
|
||||||
sorter = {
|
sorter = {
|
||||||
'epub': ParseEPUB,
|
'epub': ParseEPUB,
|
||||||
'mobi': ParseEBook,
|
'mobi': ParseMOBI,
|
||||||
'azw': ParseEBook,
|
'azw': ParseMOBI,
|
||||||
|
'azw3': ParseMOBI,
|
||||||
|
'azw4': ParseMOBI,
|
||||||
|
'prc': ParseMOBI,
|
||||||
'cbz': ParseCBZ,
|
'cbz': ParseCBZ,
|
||||||
'cbr': ParseCBR,}
|
'cbr': ParseCBR,}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user