Speed up file addition

Improve fb2 parser
Fix extension checking
This commit is contained in:
BasioMeusPuga
2018-06-14 16:10:27 -04:00
parent 4a2da61b51
commit a0e463bc58
6 changed files with 56 additions and 32 deletions

5
TODO
View File

@@ -77,6 +77,8 @@ TODO
✓ mobi, azw support
Limit the extra files produced by KindleUnpack
Have them save to memory
✓ fb2 support
Images need to show up in their placeholders
Other:
✓ Define every widget in code
Bugs:
@@ -84,7 +86,6 @@ TODO
Clean up 'switch' page layout
Colors aren't loaded properly for annotation previews
Cover page shouldn't be scrolled midway
It's possible the addition function is also parsing the whole book.
Secondary:
Graphical themes
@@ -98,7 +99,7 @@ TODO
Use embedded fonts + CSS
Scrolling: Smooth / By Line
Shift to logging instead of print statements
txt, doc, chm, djvu, fb2 support
txt, doc, chm, djvu support
Include icons for filetype emblems
Comic view modes
Continuous paging

View File

@@ -58,6 +58,7 @@ class ParseEPUB:
def get_contents(self):
zipfile.ZipFile(self.filename).extractall(self.extract_path)
self.book_ref.parse_toc()
self.book_ref.parse_chapters(temp_dir=self.extract_path)
file_settings = {
'images_only': False}

View File

@@ -15,7 +15,6 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import zipfile
from lector.readers.read_fb2 import FB2
@@ -56,12 +55,8 @@ class ParseFB2:
return self.book['tags']
def get_contents(self):
    """Parse the book's chapters and return them with the reader settings.

    Returns:
        tuple: ``(self.book['book_list'], file_settings)`` where
        ``file_settings`` is a dict with ``'images_only'`` set to ``False``.
    """
    # TODO
    # Relative file paths in the chapter markup should point at the
    # extraction directory (presumably where the parser saves images;
    # confirm against parse_chapters)
    target_dir = self.extract_path

    # The directory has to be created manually before the parser uses it
    os.makedirs(target_dir, exist_ok=True)
    self.book_ref.parse_chapters(temp_dir=target_dir)

    return self.book['book_list'], {'images_only': False}

View File

@@ -40,7 +40,6 @@ class EPUB:
return False # No (valid) opf was found so processing cannot continue
self.generate_book_metadata(contents_path)
self.parse_toc()
except: # Not specifying an exception type here may be justified
return False

View File

@@ -48,25 +48,29 @@ class FB2:
return True
def generate_book_metadata(self):
self.book['title'] = os.path.splitext(
os.path.basename(self.filename))[0]
self.book['author'] = 'Unknown'
self.book['isbn'] = None
self.book['tags'] = None
self.book['cover'] = None
self.book['year'] = 9999
self.book['book_list'] = []
# TODO
# Look for other components of book metadata here
for i in self.xml.find_all():
# All metadata can be parsed in one pass
all_tags = self.xml.find('description')
if i.name == 'section':
for j in i:
if j.name == 'title':
this_title = j.text
self.book['book_list'].append(
(this_title, str(i)))
self.book['title'] = all_tags.find('book-title').text
if self.book['title'] == '' or self.book['title'] is None:
self.book['title'] = os.path.splitext(
os.path.basename(self.filename))[0]
self.book['author'] = all_tags.find('author').getText(separator=' ').replace('\n', ' ')
if self.book['author'] == '' or self.book['author'] is None:
self.book['author'] = 'Unknown'
# TODO
# Account for other date formats
try:
self.book['year'] = int(all_tags.find('date').text)
except ValueError:
self.book['year'] = 9999
# Cover Image
cover_image_xml = self.xml.find('coverpage')
@@ -75,8 +79,26 @@ class FB2:
cover_image_data = self.xml.find_all('binary')
for i in cover_image_data:
# TODO
# Account for other images as well
if cover_image_name.endswith(i.get('id')):
self.book['cover'] = base64.decodebytes(i.text.encode())
def parse_chapters(self, temp_dir):
    """Collect the book's chapters and extract its embedded images.

    Every ``title`` child of every ``section`` element appends a
    ``(title_text, str(section))`` tuple to ``self.book['book_list']``.
    Every ``binary`` element is then base64-decoded and written into
    ``temp_dir`` under the final path component of its ``id``.

    Args:
        temp_dir: Directory the decoded images are written to. It is not
            created here, so it must already exist.

    Raises:
        OSError: If an image file cannot be written.
    """
    # There's no need to parse the TOC separately because
    # everything is linear
    for section in self.xml.find_all('section'):
        for child in section:
            if child.name == 'title':
                self.book['book_list'].append(
                    (child.getText(separator=' '), str(section)))

    # Extract all images to the temp_dir
    for binary in self.xml.find_all('binary'):
        image_id = binary.get('id')
        if not image_id:
            # Without an id there is no file name to write to
            continue

        # The id comes from the book file itself; keep only its last
        # component so a crafted id cannot write outside temp_dir
        image_name = os.path.basename(image_id)
        if image_name in ('', '.', '..'):
            continue

        try:
            image_data = base64.decodebytes(binary.text.encode())
        except (AttributeError, ValueError):
            # Missing text or malformed base64 (binascii.Error is a
            # ValueError): skip this image rather than fail the book
            continue

        with open(os.path.join(temp_dir, image_name), 'wb') as outimage:
            outimage.write(image_data)

View File

@@ -175,15 +175,21 @@ class BookSorter:
print(f'{os.path.basename(filename)} is already in database')
return
# Using os.extsep like so allows for file extensions with multiple dots
file_extension = os.path.basename(filename).split(os.extsep, 1)[1]
try:
# Get the requisite parser from the sorter dict
book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)
except KeyError:
# This allows for eliminating issues with filenames that have
# a dot in them. All hail the roundabout fix.
valid_extension = False
for i in sorter:
if os.path.basename(filename).endswith(i):
file_extension = i
valid_extension = True
break
if not valid_extension:
print(filename + ' has an unsupported extension')
return
book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)
# Everything following this is standard
# None values are accounted for here
book_ref.read_book()