From 263b3742eef55a5f9891dc89decf7502ceac18e7 Mon Sep 17 00:00:00 2001
From: BasioMeusPuga
Date: Mon, 6 Nov 2017 13:32:17 +0530
Subject: [PATCH] Start of ebook parser

---
 database.py |  3 ++
 parser.py   | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+)
 create mode 100644 parser.py

diff --git a/database.py b/database.py
index d0ce4c4..98de04e 100644
--- a/database.py
+++ b/database.py
@@ -25,3 +25,6 @@ class DatabaseFunctions:
         # database at time of closing
 
         self.database.commit()
+
+    def add_to_database(self, book_data, image_data):
+        pass
diff --git a/parser.py b/parser.py
new file mode 100644
index 0000000..6116926
--- /dev/null
+++ b/parser.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+
+import os
+import re
+import collections
+import ebooklib.epub
+
+
+def get_book_essentials(filename, cover_output_path='/home/akhil/aa.jpg'):
+    """Read the title, ISBN and cover image of an epub.
+
+    The cover image, when one is found, is written to cover_output_path.
+
+    Args:
+        filename: Path of the epub file to read.
+        cover_output_path: File the cover image bytes are written to.
+            Defaults to the scratch path that used to be hard-coded.
+
+    Returns:
+        A dict with the keys 'title', 'isbn' and 'cover_image'. 'isbn'
+        and 'cover_image' are None when the book does not provide them.
+    """
+    book = ebooklib.epub.read_epub(filename)
+
+    # Get book title
+    title = book.title.strip()
+
+    # Get cover image
+    # This seems hack-ish, but that's never stopped me before
+    try:
+        cover = book.metadata['http://www.idpf.org/2007/opf']['cover'][0][1]['content']
+        cover_item = book.get_item_with_id(cover)
+    except (KeyError, IndexError):
+        # No cover declared in the metadata; fall back to the search below
+        cover_item = None
+
+    image_content = None
+    if cover_item:
+        image_content = cover_item.get_content()
+    else:
+        # In case no cover_item is returned, we search the items
+        # in the book and get the first referenced image
+        image_path = None
+        for j in book.guide:
+            try:
+                # lower() has to be called: the bound method itself never
+                # equals any of the names. A missing title counts as ''.
+                if ((j['title'] or '').lower() in ['cover', 'cover-image', 'coverimage']
+                        or j['type'] == 'coverimagestandard'):
+                    image_path = j['href']
+                    break
+            except KeyError:
+                pass
+
+        if not image_path:
+            for j in book.items:
+                if j.media_type == 'application/xhtml+xml':
+                    # [^"]+ keeps the match inside a single attribute value
+                    _regex = re.search(r'src="([^"]+)"', j.content.decode('utf-8'))
+                    if _regex:
+                        image_path = _regex[1]
+                        break
+
+        # Without a path there is nothing to compare the images against
+        if image_path:
+            for k in book.get_items_of_type(ebooklib.ITEM_IMAGE):
+                if os.path.basename(k.file_name) == os.path.basename(image_path):
+                    image_content = k.get_content()
+
+    if image_content is None:
+        print('Cannot parse ' + filename)
+
+    # Get ISBN ID
+    isbn_id = None
+    try:
+        identifier = book.metadata['http://purl.org/dc/elements/1.1/']['identifier']
+    except KeyError:
+        identifier = []
+
+    for i in identifier:
+        # Identifiers without a scheme attribute are skipped instead of
+        # aborting the search for the ones that follow
+        scheme = (i[1] or {}).get('{http://www.idpf.org/2007/opf}scheme', '')
+        if scheme.lower() == 'isbn':
+            isbn_id = i[0]
+            break
+
+    if image_content is not None:
+        with open(cover_output_path, 'wb') as myimg:
+            myimg.write(image_content)
+
+    return {'title': title, 'isbn': isbn_id, 'cover_image': image_content}