Implement support for Amazon ebooks

2018-03-09 17:25:44 +05:30
parent 2cbd2df9a5
commit a70838348b
5 changed files with 51 additions and 10 deletions
--- a/parsers/epub.py
+++ b/parsers/epub.py
@@ -1,164 +0,0 @@
-#!/usr/bin/env python3
-
-# This file is a part of Lector, a Qt based ebook reader
-# Copyright (C) 2017 BasioMeusPuga
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-import os
-import re
-import zipfile
-import collections
-from urllib.parse import unquote
-
-import ebooklib.epub
-
-
-class ParseEPUB:
-    def __init__(self, filename, temp_dir, file_md5):
-        # TODO
-        # Maybe also include book description
-        self.filename = filename
-        self.book = None
-        self.temp_dir = temp_dir
-        self.file_md5 = file_md5
-
-    def read_book(self):
-        try:
-            self.book = ebooklib.epub.read_epub(self.filename)
-        except (KeyError, AttributeError, FileNotFoundError):
-            print('Cannot parse ' + self.filename)
-            return
-
-    def get_title(self):
-        return self.book.title.strip()
-
-    def get_author(self):
-        try:
-            return self.book.metadata['http://purl.org/dc/elements/1.1/']['creator'][0][0]
-        except KeyError:
-            return
-
-    def get_year(self):
-        try:
-            return self.book.metadata['http://purl.org/dc/elements/1.1/']['date'][0][0][:4]
-        except KeyError:
-            return
-
-    def get_cover_image(self):
-        # Get cover image
-        # This seems hack-ish, but that's never stopped me before
-        image_path = None
-        try:
-            cover = self.book.metadata['http://www.idpf.org/2007/opf']['cover'][0][1]['content']
-            cover_item = self.book.get_item_with_id(cover)
-            if cover_item:
-                return cover_item.get_content()
-        except KeyError:
-            pass
-
-        # In case no cover_item is returned, we look for a cover in the guide
-        for i in self.book.guide:
-            try:
-                if (i['title'].lower in ['cover', 'cover-image', 'coverimage'] or
-                        i['type'] == 'coverimagestandard'):
-                    image_path = i['href']
-                break
-            except KeyError:
-                pass
-
-        # If that fails, we find the first image referenced in the book
-        if not image_path:
-            for i in self.book.items:
-                if i.media_type == 'application/xhtml+xml':
-                    _regex = re.search(r"src=\"(.*)\"\/", i.content.decode('utf-8'))
-                    if _regex:
-                        image_path = _regex[1]
-                    break
-
-        if image_path:
-            for i in self.book.get_items_of_type(ebooklib.ITEM_IMAGE):
-                if os.path.basename(i.file_name) == os.path.basename(image_path):
-                    return i.get_content()
-
-        # And if that too fails, we get the first image referenced in the file
-        for i in self.book.items:
-            if i.media_type == 'image/jpeg' or i.media_type == 'image/png':
-                return i.get_content()
-
-    def get_isbn(self):
-        try:
-            identifier = self.book.metadata['http://purl.org/dc/elements/1.1/']['identifier']
-            for i in identifier:
-                identifier_provider = i[1]['{http://www.idpf.org/2007/opf}scheme']
-                if identifier_provider.lower() == 'isbn':
-                    isbn = i[0]
-                    return isbn
-        except KeyError:
-            return
-
-    def get_tags(self):
-        try:
-            subject = self.book.metadata['http://purl.org/dc/elements/1.1/']['subject']
-            tags = [i[0] for i in subject]
-            return tags
-        except KeyError:
-            return
-
-    def get_contents(self):
-        extract_path = os.path.join(self.temp_dir, self.file_md5)
-        zipfile.ZipFile(self.filename).extractall(extract_path)
-
-        contents = collections.OrderedDict()
-
-        def flatten_section(toc_element):
-            output_list = []
-            for i in toc_element:
-                if isinstance(i, (tuple, list)):
-                    output_list.extend(flatten_section(i))
-                else:
-                    output_list.append(i)
-            return output_list
-
-        for i in self.book.toc:
-            if isinstance(i, (tuple, list)):
-                flattened = flatten_section(i)
-
-                for j in flattened:
-                    title = j.title
-                    href = unquote(j.href)
-                    try:
-                        content = self.book.get_item_with_href(href).get_content()
-                        contents[title] = content.decode()
-                    except AttributeError:
-                        pass
-
-            else:
-                title = i.title
-                href = unquote(i.href)
-                try:
-                    content = self.book.get_item_with_href(href).get_content()
-                    if content:
-                        contents[title] = content.decode()
-                    else:
-                        raise AttributeError
-                except AttributeError:
-                    contents[title] = 'Parse Error'
-
-        # Special settings that have to be returned with the file
-        # Referenced in sorter.py
-        file_settings = {
-            'images_only': False}
-
-        return contents, file_settings