From 263b3742eef55a5f9891dc89decf7502ceac18e7 Mon Sep 17 00:00:00 2001
From: BasioMeusPuga <disgruntled.mob@gmail.com>
Date: Mon, 6 Nov 2017 13:32:17 +0530
Subject: [PATCH] Start of ebook parser

---
 database.py |  3 +++
 parser.py   | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 parser.py

diff --git a/database.py b/database.py
index d0ce4c4..98de04e 100644
--- a/database.py
+++ b/database.py
@@ -25,3 +25,6 @@ class DatabaseFunctions:
         # database at time of closing
 
         self.database.commit()
+    
+    def add_to_database(self, book_data, image_data):
+        pass  # NOTE(review): stub, no-op for now; both parameters are unused
diff --git a/parser.py b/parser.py
new file mode 100644
index 0000000..6116926
--- /dev/null
+++ b/parser.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+
+import os
+import re
+import collections
+import ebooklib.epub
+
+
+def get_book_essentials(filename, cover_path='/home/akhil/aa.jpg'):
+    """Read an EPUB and write its cover image to ``cover_path``.
+
+    The title and ISBN are looked up as well but not returned yet.
+    No file is written when no cover image can be located.
+    """
+    book = ebooklib.epub.read_epub(filename)
+
+    # Get book title; guard against a missing (None) title
+    title = (book.title or '').strip()
+
+    # Get cover image
+    image_path = None
+    image_content = None
+    try:
+        cover = book.metadata['http://www.idpf.org/2007/opf']['cover'][0][1]['content']
+        cover_item = book.get_item_with_id(cover)
+
+        # In case no cover_item is returned, we search the items
+        # in the book and get the first referenced image
+        if not cover_item:
+            for j in book.guide:
+                guide_title = (j.get('title') or '').lower()
+                if (guide_title in ('cover', 'cover-image', 'coverimage')
+                        or j.get('type') == 'coverimagestandard'):
+                    image_path = j.get('href')
+                    break
+
+            if not image_path:
+                for j in book.items:
+                    if j.media_type == 'application/xhtml+xml':
+                        _regex = re.search(r'src="([^"]+)"', j.content.decode('utf-8'))
+                        if _regex:
+                            image_path = _regex.group(1)
+                            break
+
+            if image_path:
+                for k in book.get_items_of_type(ebooklib.ITEM_IMAGE):
+                    if os.path.basename(k.file_name) == os.path.basename(image_path):
+                        image_content = k.get_content()
+        else:
+            image_content = cover_item.get_content()
+    except (KeyError, IndexError):
+        print('Cannot parse ' + filename)
+
+    # Get ISBN ID; identifiers lacking a scheme are skipped, not fatal
+    isbn_id = None
+    dc_meta = book.metadata.get('http://purl.org/dc/elements/1.1/', {})
+    for i in dc_meta.get('identifier', []):
+        if (i[1].get('{http://www.idpf.org/2007/opf}scheme') or '').lower() == 'isbn':
+            isbn_id = i[0]
+            break
+
+    if image_content is not None:
+        with open(cover_path, 'wb') as myimg:
+            myimg.write(image_content)