Speed up file addition

Improve fb2 parser
Fix extension checking
This commit is contained in:
BasioMeusPuga
2018-06-14 16:10:27 -04:00
parent 4a2da61b51
commit a0e463bc58
6 changed files with 56 additions and 32 deletions

View File

@@ -58,6 +58,7 @@ class ParseEPUB:
def get_contents(self):
zipfile.ZipFile(self.filename).extractall(self.extract_path)
self.book_ref.parse_toc()
self.book_ref.parse_chapters(temp_dir=self.extract_path)
file_settings = {
'images_only': False}

View File

@@ -15,7 +15,6 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import zipfile
from lector.readers.read_fb2 import FB2
@@ -56,12 +55,8 @@ class ParseFB2:
return self.book['tags']
def get_contents(self):
# Ask the underlying reader to parse the chapters, then return the
# resulting chapter list together with the per-file reader settings
# as a (book_list, file_settings) tuple.
#
# NOTE(review): the TODO and the commented-out calls below look stale -
# the parse_chapters(temp_dir=...) call further down already passes the
# extraction directory to the reader. Confirm and remove them.
# TODO
# Make this save images to the temp path
# Relative file paths should then point there
# zipfile.ZipFile(self.filename).extractall(self.extract_path)
# self.book_ref.parse_chapters(temp_dir=self.extract_path)
# Nothing in this method extracts an archive into extract_path, so the
# directory has to be created explicitly; exist_ok=True makes repeated
# calls harmless.
os.makedirs(self.extract_path, exist_ok=True) # Manual creation is required here
self.book_ref.parse_chapters(temp_dir=self.extract_path)
# 'images_only' is False here; presumably it tells the caller this is a
# text book rather than an image-only one - verify against the consumer.
file_settings = {
'images_only': False}
return self.book['book_list'], file_settings

View File

@@ -40,7 +40,6 @@ class EPUB:
return False # No (valid) opf was found so processing cannot continue
self.generate_book_metadata(contents_path)
self.parse_toc()
except: # Not specifying an exception type here may be justified
return False

View File

@@ -48,25 +48,29 @@ class FB2:
return True
def generate_book_metadata(self):
self.book['title'] = os.path.splitext(
os.path.basename(self.filename))[0]
self.book['author'] = 'Unknown'
self.book['isbn'] = None
self.book['tags'] = None
self.book['cover'] = None
self.book['year'] = 9999
self.book['book_list'] = []
# TODO
# Look for other components of book metadata here
for i in self.xml.find_all():
# All metadata can be parsed in one pass
all_tags = self.xml.find('description')
if i.name == 'section':
for j in i:
if j.name == 'title':
this_title = j.text
self.book['book_list'].append(
(this_title, str(i)))
self.book['title'] = all_tags.find('book-title').text
if self.book['title'] == '' or self.book['title'] is None:
self.book['title'] = os.path.splitext(
os.path.basename(self.filename))[0]
self.book['author'] = all_tags.find('author').getText(separator=' ').replace('\n', ' ')
if self.book['author'] == '' or self.book['author'] is None:
self.book['author'] = 'Unknown'
# TODO
# Account for other date formats
try:
self.book['year'] = int(all_tags.find('date').text)
except ValueError:
self.book['year'] = 9999
# Cover Image
cover_image_xml = self.xml.find('coverpage')
@@ -75,8 +79,26 @@ class FB2:
cover_image_data = self.xml.find_all('binary')
for i in cover_image_data:
# TODO
# Account for other images as well
if cover_image_name.endswith(i.get('id')):
self.book['cover'] = base64.decodebytes(i.text.encode())
def parse_chapters(self, temp_dir):
    """Populate the chapter list and write embedded images to temp_dir.

    There's no need to parse the TOC separately because everything is
    linear: each <section> adds one (title, section markup) tuple to
    self.book['book_list'] for every direct <title> child it has.

    Every <binary> element is then base64-decoded and written into
    temp_dir under its ``id``. Only the final path component of the id
    is used, so a crafted book cannot write outside temp_dir. Binaries
    with no usable id or an undecodable payload are skipped instead of
    aborting the whole book.

    Args:
        temp_dir: Existing directory that receives the extracted images.
    """
    for section in self.xml.find_all('section'):
        for child in section:
            if child.name == 'title':
                this_title = child.getText(separator=' ')
                self.book['book_list'].append(
                    (this_title, str(section)))

    # Extract all images to the temp_dir
    for binary in self.xml.find_all('binary'):
        image_id = binary.get('id')
        if not image_id:
            # Without an id there is no file name to write to
            continue

        # The id comes from the book file, so strip any directory part
        # to keep the output inside temp_dir
        image_name = os.path.basename(image_id)
        if image_name in ('', os.curdir, os.pardir):
            continue

        try:
            image_data = base64.decodebytes(binary.text.encode())
        except (AttributeError, ValueError):
            # binascii.Error (malformed base64) is a ValueError subclass;
            # image extraction is best-effort, so skip the broken entry
            continue

        with open(os.path.join(temp_dir, image_name), 'wb') as outimage:
            outimage.write(image_data)

View File

@@ -175,15 +175,21 @@ class BookSorter:
print(f'{os.path.basename(filename)} is already in database')
return
# Using os.extsep like so allows for file extensions with multiple dots
file_extension = os.path.basename(filename).split(os.extsep, 1)[1]
try:
# Get the requisite parser from the sorter dict
book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)
except KeyError:
# This allows for eliminating issues with filenames that have
# a dot in them. All hail the roundabout fix.
valid_extension = False
for i in sorter:
if os.path.basename(filename).endswith(i):
file_extension = i
valid_extension = True
break
if not valid_extension:
print(filename + ' has an unsupported extension')
return
book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)
# Everything following this is standard
# None values are accounted for here
book_ref.read_book()