Speed up file addition

Improve fb2 parser; fix extension checking
2018-06-14 16:10:27 -04:00
parent 4a2da61b51
commit a0e463bc58
6 changed files with 56 additions and 32 deletions
@@ -58,6 +58,7 @@ class ParseEPUB:
    def get_contents(self):
        zipfile.ZipFile(self.filename).extractall(self.extract_path)

+        self.book_ref.parse_toc()
        self.book_ref.parse_chapters(temp_dir=self.extract_path)
        file_settings = {
            'images_only': False}
@@ -15,7 +15,6 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.

 import os
-import zipfile

 from lector.readers.read_fb2 import FB2

@@ -56,12 +55,8 @@ class ParseFB2:
        return self.book['tags']

    def get_contents(self):
-        # TODO
-        # Make this save images to the temp path
-        # Relative file paths should then point there
-        # zipfile.ZipFile(self.filename).extractall(self.extract_path)
-
-        # self.book_ref.parse_chapters(temp_dir=self.extract_path)
+        os.makedirs(self.extract_path, exist_ok=True)  # Manual creation is required here
+        self.book_ref.parse_chapters(temp_dir=self.extract_path)
        file_settings = {
            'images_only': False}
        return self.book['book_list'], file_settings
@@ -40,7 +40,6 @@ class EPUB:
                return False  # No (valid) opf was found so processing cannot continue

            self.generate_book_metadata(contents_path)
-            self.parse_toc()
        except:  # Not specifying an exception type here may be justified
            return False

@@ -48,25 +48,29 @@ class FB2:
        return True

    def generate_book_metadata(self):
-        self.book['title'] = os.path.splitext(
-            os.path.basename(self.filename))[0]
-        self.book['author'] = 'Unknown'
        self.book['isbn'] = None
        self.book['tags'] = None
        self.book['cover'] = None
-        self.book['year'] = 9999
        self.book['book_list'] = []

-        # TODO
-        # Look for other components of book metadata here
-        for i in self.xml.find_all():
+        # All metadata can be parsed in one pass
+        all_tags = self.xml.find('description')

-            if i.name == 'section':
-                for j in i:
-                    if j.name == 'title':
-                        this_title = j.text
-                self.book['book_list'].append(
-                    (this_title, str(i)))
+        self.book['title'] = all_tags.find('book-title').text
+        if self.book['title'] == '' or self.book['title'] is None:
+            self.book['title'] = os.path.splitext(
+                os.path.basename(self.filename))[0]
+
+        self.book['author'] = all_tags.find('author').getText(separator=' ').replace('\n', ' ')
+        if self.book['author'] == '' or self.book['author'] is None:
+            self.book['author'] = 'Unknown'
+
+        # TODO
+        # Account for other date formats
+        try:
+            self.book['year'] = int(all_tags.find('date').text)
+        except ValueError:
+            self.book['year'] = 9999

        # Cover Image
        cover_image_xml = self.xml.find('coverpage')
@@ -75,8 +79,26 @@ class FB2:

        cover_image_data = self.xml.find_all('binary')
        for i in cover_image_data:
-
-            # TODO
-            # Account for other images as well
            if cover_image_name.endswith(i.get('id')):
                self.book['cover'] = base64.decodebytes(i.text.encode())
+
+    def parse_chapters(self, temp_dir):
+        """Fill self.book['book_list'] and extract embedded images.
+
+        Every <section> element found in self.xml is appended to
+        self.book['book_list'] as a (title, markup) tuple, in document
+        order. Every <binary> element is base64-decoded and written into
+        temp_dir under the name given by its 'id' attribute.
+
+        temp_dir must already exist; it is not created here.
+        """
+        # There's no need to parse the TOC separately because
+        # everything is linear
+        for section in self.xml.find_all('section'):
+            # Reset for every section: without this a title-less first
+            # section raises UnboundLocalError, and a later one silently
+            # inherits the previous section's title
+            this_title = 'Untitled'
+            for child in section:
+                if child.name == 'title':
+                    this_title = child.getText(separator=' ')
+            self.book['book_list'].append(
+                (this_title, str(section)))
+
+        # Extract all images to the temp_dir
+        for binary in self.xml.find_all('binary'):
+            image_id = binary.get('id')
+            if not image_id:
+                # os.path.join would raise TypeError on a missing id
+                continue
+
+            # The id comes from the book file itself, so keep only its
+            # final component to make sure the write stays inside temp_dir
+            image_name = os.path.basename(image_id)
+            if image_name in ('', '.', '..'):
+                continue
+
+            try:
+                # binascii.Error (invalid base64) is a ValueError subclass
+                image_data = base64.decodebytes(binary.text.encode())
+            except (AttributeError, ValueError):
+                # One undecodable image should not abort the whole book
+                continue
+
+            image_path = os.path.join(temp_dir, image_name)
+            with open(image_path, 'wb') as outimage:
+                outimage.write(image_data)
@@ -175,15 +175,21 @@ class BookSorter:
                    print(f'{os.path.basename(filename)} is already in database')
                return

-        # Using os.extsep like so allows for file extensions with multiple dots
-        file_extension = os.path.basename(filename).split(os.extsep, 1)[1]
-        try:
-            # Get the requisite parser from the sorter dict
-            book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)
-        except KeyError:
+        # This allows for eliminating issues with filenames that have
+        # a dot in them. All hail the roundabout fix.
+        valid_extension = False
+        for i in sorter:
+            if os.path.basename(filename).endswith(i):
+                file_extension = i
+                valid_extension = True
+                break
+
+        if not valid_extension:
            print(filename + ' has an unsupported extension')
            return

+        book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)
+
        # Everything following this is standard
        # None values are accounted for here
        book_ref.read_book()