cbz parsing

2017-11-12 09:37:39 +05:30
parent 7fbea194c0
commit 405ea3547c
6 changed files with 117 additions and 17 deletions
--- a/parsers/cbz.py
+++ b/parsers/cbz.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+
+import os
+import time
+import zipfile
+import tempfile
+import collections
+
+
+class ParseCBZ:
+    def __init__(self, filename):
+        # TODO
+        # Maybe also include book description
+        self.filename = filename
+        self.book = None
+
+    def read_book(self):
+        try:
+            self.book = zipfile.ZipFile(self.filename, mode='r', allowZip64=True)
+        except (KeyError, AttributeError, FileNotFoundError, zipfile.BadZipFile):
+            print('Cannot parse ' + self.filename)
+            return
+
+    def get_title(self):
+        filename = os.path.basename(self.book.filename)
+        filename_proper = os.path.splitext(filename)[0]
+        return filename_proper
+
+    def get_author(self):
+        return None
+
+    def get_year(self):
+        creation_time = time.ctime(os.path.getctime(self.filename))
+        creation_year = creation_time.split()[-1]
+        return creation_year
+
+    def get_cover_image(self):
+        cover_image_info = self.book.infolist()[0]
+        cover_image = self.book.read(cover_image_info)
+        return cover_image
+
+    def get_isbn(self):
+        return None
+
+    def get_contents(self):
+        contents = collections.OrderedDict()
+        # This is a brute-force approach: every archive member is
+        # extracted to a temporary directory. Consider reading images
+        # straight from the archive once the rest of the code matures.
+        tmp_dir = tempfile.mkdtemp()
+
+        contents = collections.OrderedDict()
+        for count, i in enumerate(self.book.infolist()):
+            self.book.extract(i, path=tmp_dir)
+            page_name = 'Page ' + str(count + 1)
+            image_path = os.path.join(tmp_dir, i.filename)
+            # Each page is an <img> tag pointing at the extracted file.
+            # TODO
+            # Image resizing, formatting
+            # Cleanup after exit
+            contents[page_name] = "<img src='%s'/>" % image_path
+        return contents, tmp_dir
--- a/parsers/epub.py
+++ b/parsers/epub.py
@@ -5,8 +5,8 @@
 # get_author()
 # get_year()
 # get_cover_image()
-# get_isbn
-# TODO More for get contents, get TOC
+# get_isbn()
+# get_contents() - Should return a tuple of (TOC, temporary directory to delete or None)

 import os
 import re
@@ -89,7 +89,7 @@ class ParseEPUB:
            return image_content

        except KeyError:
-            return
+            return None

    def get_isbn(self):
        try:
@@ -100,7 +100,7 @@ class ParseEPUB:
                    isbn = i[0]
                    return isbn
        except KeyError:
-            return
+            return None

    def get_contents(self):
        contents = collections.OrderedDict()
@@ -137,4 +137,6 @@ class ParseEPUB:
                        raise AttributeError
                except AttributeError:
                    contents[title] = ''
-        return contents
+
+        # Index 1 of the returned tuple is a directory for the caller to clean up, if any (None here)
+        return contents, None