Start of ebook parser

2017-11-06 13:32:17 +05:30
parent f3d748f5dd
commit 263b3742ee
2 changed files with 67 additions and 0 deletions
@@ -25,3 +25,6 @@ class DatabaseFunctions:
        # database at time of closing
        self.database.commit()
    def add_to_database(self, book_data, image_data):
        pass
@@ -0,0 +1,64 @@
 #!/usr/bin/env python3
 import os
 import re
 import collections
 import ebooklib.epub
 def get_book_essentials(filename):
    book = ebooklib.epub.read_epub(filename)
    # Get book title
    title = book.title.strip()
    # Get cover image
    # This seems hack-ish, but that's never stopped me before
    image_path = None
    try:
        cover = book.metadata['http://www.idpf.org/2007/opf']['cover'][0][1]['content']
        cover_item = book.get_item_with_id(cover)
        # In case no cover_item is returned, we search the items
        # in the book and get the first referenced image
        if not cover_item:
            for j in book.guide:
                try:
                    if (j['title'].lower in ['cover', 'cover-image', 'coverimage'] or j['type'] == 'coverimagestandard'):
                        image_path = j['href']
                    break
                except KeyError:
                    pass
            if not image_path:
                for j in book.items:
                    if j.media_type == 'application/xhtml+xml':
                        _regex = re.search(r"src=\"(.*)\"\/", j.content.decode('utf-8'))
                        if _regex:
                            image_path = _regex[1]
                        break
            for k in book.get_items_of_type(ebooklib.ITEM_IMAGE):
                if os.path.basename(k.file_name) == os.path.basename(image_path):
                    image_content = k.get_content()
        else:
            image_content = cover_item.get_content()
    except KeyError:
        print('Cannot parse ' + filename)
    # Get ISBN ID
    isbn_id = None
    try:
        identifier = book.metadata['http://purl.org/dc/elements/1.1/']['identifier']
        for i in identifier:
            identifier_provider = i[1]['{http://www.idpf.org/2007/opf}scheme']
            if identifier_provider.lower() == 'isbn':
                isbn_id = i[0]
                break
    except KeyError:
        pass
    with open('/home/akhil/aa.jpg', 'bw') as myimg:
        myimg.write(image_content)