Parse cbr files

2017-11-17 00:30:56 +05:30
parent 3f09c5afb9
commit 10dcc14fd0
12 changed files with 4347 additions and 16 deletions
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+
+import os
+import time
+import collections
+from rarfile import rarfile
+
+
+class ParseCBR:
+    """Read metadata and page images from a RAR-packed comic (.cbr).
+
+    read_book() must be called before get_cover_image() or
+    get_contents(): those methods use self.book, which is None
+    until read_book() has opened the archive.
+    """
+
+    def __init__(self, filename, temp_dir, file_md5):
+        """Store the archive path and where it should be extracted.
+
+        filename is the path of the .cbr file. get_contents() extracts
+        it into os.path.join(temp_dir, file_md5).
+        """
+        self.filename = filename
+        self.book = None  # Set by read_book()
+        self.temp_dir = temp_dir
+        self.file_md5 = file_md5
+
+    def read_book(self):
+        """Open the archive and keep the handle in self.book.
+
+        If the archive cannot be opened, a message is printed and
+        self.book keeps its previous value.
+        """
+        try:
+            self.book = rarfile.RarFile(self.filename)
+        except Exception:
+            # Still deliberately broad: every failure to open the file
+            # is reported the same way. Unlike a bare except, this lets
+            # KeyboardInterrupt and SystemExit propagate.
+            print('Cannot parse ' + self.filename)
+
+    def get_title(self):
+        """Return the file's base name without its extension."""
+        filename = os.path.basename(self.filename)
+        return os.path.splitext(filename)[0]
+
+    def get_author(self):
+        """Return None; no author is read from the archive."""
+        return None
+
+    def get_year(self):
+        """Return the year of the file's ctime as a string."""
+        # The year is the last whitespace separated field of the
+        # string produced by time.ctime
+        creation_time = time.ctime(os.path.getctime(self.filename))
+        return creation_time.split()[-1]
+
+    def get_cover_image(self):
+        """Return the bytes of the archive's first file, by sorted name.
+
+        Returns None when the archive contains no file entries.
+        """
+        # The first image in the archive may not be the cover
+        # It is implied, however, that the first image in order
+        # will be the cover
+
+        file_entries = [i for i in self.book.infolist() if not i.isdir()]
+        if not file_entries:
+            return None
+
+        # min() returns the first entry carrying the smallest filename,
+        # which is the entry a sort followed by a rescan would pick
+        cover_entry = min(file_entries, key=lambda i: i.filename)
+        return self.book.read(cover_entry)
+
+    def get_isbn(self):
+        """Return None; no ISBN is read from the archive."""
+        return None
+
+    def get_contents(self):
+        """Extract the archive and map page names to image paths.
+
+        Returns a (contents, file_settings) tuple. contents is an
+        OrderedDict mapping 'Page 1', 'Page 2', ... to the paths of the
+        files, sorted by name, in the first directory yielded by
+        os.walk that contains any files. contents is None when no
+        files were extracted.
+        """
+        # TODO
+        # CBR files containing multiple directories for multiple chapters
+
+        file_settings = {
+            'images_only': True}
+
+        extract_path = os.path.join(self.temp_dir, self.file_md5)
+        contents = collections.OrderedDict()
+
+        # This is a brute force approach
+        # Maybe try reading from the file as everything
+        # matures a little bit more
+        # I'm currently choosing not to keep multiple files in memory
+        self.book.extractall(extract_path)
+
+        image_dir = None
+        found_images = []
+        for dirpath, _, filenames in os.walk(extract_path):
+            if filenames:  # Implies files were found
+                image_dir = dirpath
+                found_images = sorted(filenames)
+                break
+
+        if not found_images:
+            print('Found nothing in ' + self.filename)
+            return None, file_settings
+
+        for count, image_name in enumerate(found_images, 1):
+            page_name = 'Page ' + str(count)
+            # image_dir comes from os.walk and already starts with
+            # extract_path; joining extract_path in front of it again
+            # would duplicate the prefix when temp_dir is relative
+            contents[page_name] = os.path.join(image_dir, image_name)
+
+        return contents, file_settings
@@ -56,13 +56,9 @@ class ParseCBZ:

    def get_contents(self):
        # TODO
-        # Image resizing, formatting
-        # Include this as a collection of absolute paths only
-        # Post processing can be carried out by the program
        # CBZ files containing multiple directories for multiple chapters

        file_settings = {
-            'temp_dir': self.temp_dir,
            'images_only': True}

        extract_path = os.path.join(self.temp_dir, self.file_md5)
@@ -90,10 +86,9 @@ class ParseCBZ:
        found_images.sort()

        for count, i in enumerate(found_images):
-            page_name = 'Page ' + str(count)
+            page_name = 'Page ' + str(count + 1)
            image_path = os.path.join(extract_path, image_dir, i)

            contents[page_name] = image_path
-            # contents[page_name] = "<img src='%s' align='middle'/>" % image_path

        return contents, file_settings
@@ -144,7 +144,6 @@ class ParseEPUB:
        # Special settings that have to be returned with the file
        # Referenced in sorter.py
        file_settings = {
-            'temp_dir': extract_path,
            'images_only': False}

        return contents, file_settings