Splitting for ePubs with anchor navigation
This commit is contained in:
@@ -28,6 +28,7 @@ class EPUB:
|
|||||||
self.filename = filename
|
self.filename = filename
|
||||||
self.zip_file = None
|
self.zip_file = None
|
||||||
self.book = {}
|
self.book = {}
|
||||||
|
self.book['split_chapters'] = {}
|
||||||
|
|
||||||
def read_epub(self):
|
def read_epub(self):
|
||||||
# This is the function that should error out in
|
# This is the function that should error out in
|
||||||
@@ -215,26 +216,32 @@ class EPUB:
|
|||||||
for i in navpoints:
|
for i in navpoints:
|
||||||
chapter_title = i.find('text').text
|
chapter_title = i.find('text').text
|
||||||
chapter_source = i.find('content').get('src')
|
chapter_source = i.find('content').get('src')
|
||||||
chapter_source = unquote(chapter_source.split('#')[0])
|
chapter_source_file = unquote(chapter_source.split('#')[0])
|
||||||
self.book['navpoint_dict'][chapter_source] = chapter_title
|
|
||||||
|
if '#' in chapter_source:
|
||||||
|
try:
|
||||||
|
self.book['split_chapters'][chapter_source_file].append(
|
||||||
|
(chapter_source.split('#')[1], chapter_title))
|
||||||
|
except KeyError:
|
||||||
|
self.book['split_chapters'][chapter_source_file] = []
|
||||||
|
self.book['split_chapters'][chapter_source_file].append(
|
||||||
|
(chapter_source.split('#')[1], chapter_title))
|
||||||
|
|
||||||
|
self.book['navpoint_dict'][chapter_source_file] = chapter_title
|
||||||
|
|
||||||
def parse_chapters(self, temp_dir=None, split_large_xml=False):
|
def parse_chapters(self, temp_dir=None, split_large_xml=False):
|
||||||
no_title_chapter = 0
|
no_title_chapter = 0
|
||||||
self.book['book_list'] = []
|
self.book['book_list'] = []
|
||||||
|
|
||||||
for i in self.book['chapters_in_order']:
|
for i in self.book['chapters_in_order']:
|
||||||
chapter_data = self.read_from_zip(i).decode()
|
chapter_data = self.read_from_zip(i).decode()
|
||||||
|
|
||||||
if not split_large_xml:
|
if i in self.book['split_chapters']:
|
||||||
try:
|
split_chapters = get_split_content(
|
||||||
self.book['book_list'].append(
|
chapter_data, self.book['split_chapters'][i])
|
||||||
(self.book['navpoint_dict'][i], chapter_data))
|
self.book['book_list'].extend(split_chapters)
|
||||||
except KeyError:
|
|
||||||
fallback_title = str(no_title_chapter)
|
|
||||||
self.book['book_list'].append(
|
|
||||||
(fallback_title, chapter_data))
|
|
||||||
no_title_chapter += 1
|
|
||||||
|
|
||||||
else:
|
elif split_large_xml:
|
||||||
# https://stackoverflow.com/questions/14444732/how-to-split-a-html-page-to-multiple-pages-using-python-and-beautiful-soup
|
# https://stackoverflow.com/questions/14444732/how-to-split-a-html-page-to-multiple-pages-using-python-and-beautiful-soup
|
||||||
markup = BeautifulSoup(chapter_data, 'xml')
|
markup = BeautifulSoup(chapter_data, 'xml')
|
||||||
chapters = []
|
chapters = []
|
||||||
@@ -259,6 +266,15 @@ class EPUB:
|
|||||||
self.book['book_list'].append(
|
self.book['book_list'].append(
|
||||||
(fallback_title, this_chapter))
|
(fallback_title, this_chapter))
|
||||||
no_title_chapter += 1
|
no_title_chapter += 1
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
self.book['book_list'].append(
|
||||||
|
(self.book['navpoint_dict'][i], chapter_data))
|
||||||
|
except KeyError:
|
||||||
|
fallback_title = str(no_title_chapter)
|
||||||
|
self.book['book_list'].append(
|
||||||
|
(fallback_title, chapter_data))
|
||||||
|
no_title_chapter += 1
|
||||||
|
|
||||||
cover_path = os.path.join(temp_dir, os.path.basename(self.filename)) + '- cover'
|
cover_path = os.path.join(temp_dir, os.path.basename(self.filename)) + '- cover'
|
||||||
if self.book['cover']:
|
if self.book['cover']:
|
||||||
@@ -267,3 +283,35 @@ class EPUB:
|
|||||||
|
|
||||||
self.book['book_list'][0] = (
|
self.book['book_list'][0] = (
|
||||||
'Cover', f'<center><img src="{cover_path}" alt="Cover"></center>')
|
'Cover', f'<center><img src="{cover_path}" alt="Cover"></center>')
|
||||||
|
|
||||||
|
|
||||||
|
def get_split_content(chapter_data, split_by):
|
||||||
|
split_anchors = [i[0] for i in split_by]
|
||||||
|
chapter_titles = [i[1] for i in split_by]
|
||||||
|
return_list = []
|
||||||
|
|
||||||
|
xml = BeautifulSoup(chapter_data, 'lxml')
|
||||||
|
xml_string = xml.body.prettify()
|
||||||
|
|
||||||
|
for count, i in enumerate(split_anchors):
|
||||||
|
this_split = xml_string.split(i)
|
||||||
|
current_chapter = this_split[0]
|
||||||
|
|
||||||
|
bs_obj = BeautifulSoup(current_chapter, 'lxml')
|
||||||
|
# Since tags correspond to data following them, the first
|
||||||
|
# chunk will be ignored
|
||||||
|
# As will all empty chapters
|
||||||
|
if bs_obj.text == '\n' or bs_obj.text == '' or count == 0:
|
||||||
|
continue
|
||||||
|
bs_obj_string = str(bs_obj).replace('">', '', 1)
|
||||||
|
|
||||||
|
return_list.append(
|
||||||
|
(chapter_titles[count - 1], bs_obj_string))
|
||||||
|
xml_string = this_split[1]
|
||||||
|
|
||||||
|
bs_obj = BeautifulSoup(xml_string, 'lxml')
|
||||||
|
bs_obj_string = str(bs_obj).replace('">', '', 1)
|
||||||
|
return_list.append(
|
||||||
|
(chapter_titles[-1], bs_obj_string))
|
||||||
|
|
||||||
|
return return_list
|
||||||
|
@@ -189,7 +189,6 @@ class MainUI(QtWidgets.QMainWindow, mainwindow.Ui_MainWindow):
|
|||||||
|
|
||||||
# The library refresh button on the Library tab
|
# The library refresh button on the Library tab
|
||||||
self.reloadLibrary.setIcon(self.QImageFactory.get_image('reload'))
|
self.reloadLibrary.setIcon(self.QImageFactory.get_image('reload'))
|
||||||
self.reloadLibrary.setIconSize(QtCore.QSize(22, 22))
|
|
||||||
self.reloadLibrary.setObjectName('reloadLibrary')
|
self.reloadLibrary.setObjectName('reloadLibrary')
|
||||||
self.reloadLibrary.setToolTip('Scan library')
|
self.reloadLibrary.setToolTip('Scan library')
|
||||||
self.reloadLibrary.setAutoRaise(True)
|
self.reloadLibrary.setAutoRaise(True)
|
||||||
|
Reference in New Issue
Block a user