"""Download books in bulk from a web viewer and bundle each one as a PDF.

The script reads the ``BooksToDownload`` text file.  Every non-blank line
that does not start with ``#`` is one entry, either ``<url>`` or
``<url>````<output directory>`` (four backticks as the separator).

For every entry a headless Firefox session is opened, the viewer is paged
through, the image URL of every ``div.BV_oImage`` element is collected, the
images are stored as ``ignore/<title>/<NNNN>.jpg``, the images are converted
to one PDF, and finally the entry is removed from ``BooksToDownload``.
"""
import glob
import os
import re
import threading

import bs4
import img2pdf
import selenium
import urllib3
from selenium import webdriver
from selenium.common import exceptions
from selenium.webdriver.common import action_chains
from selenium.webdriver.common import keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.actions import interaction

# Text file that lists the books still waiting to be downloaded.
BOOKS_FILE = "BooksToDownload"

# Parallel lists: position ``i`` in each of them describes the same book.
ACTS = []        # ActionChains sending PAGE_DOWN to the book's browser
LAST_ACTS = []   # ActionChains sending END to the book's browser
SOURCES = []     # {page element id: image URL, or "" while still unknown}
Books = []
THREADS = []
PATHS = []       # optional output directory of the PDF (None = default)
OLD_REMOVE = []  # the raw BooksToDownload line of the book

# Progress counters shared by the ``act_now`` worker threads.
couters = 0
treads = 0

# ``act_now`` runs once per book in its own thread, so the shared counters
# and the read-modify-write of BOOKS_FILE have to be serialised.
_STATE_LOCK = threading.Lock()
_FILE_LOCK = threading.Lock()

# One shared connection pool instead of a new PoolManager per request.
_HTTP = urllib3.PoolManager()

# Characters that are replaced when a page title becomes a folder name.
_NAME_TRANSLATION = str.maketrans({
    '"': "''",
    "\\": "||",
    ":": "׃",
    "/": "|",
    "\n": "",
    "?": ";;",
})


def remove_text(text: str):
    """Remove the first line equal to ``text`` from ``BooksToDownload``.

    Lines are compared without their trailing newline, so the entry is found
    whether or not it is the last line of the file.  When no line matches,
    the file is left untouched (the previous implementation recursed without
    bound in that case).

    :param text: entry to remove, with or without a trailing newline
    :type text: str
    """
    target = text.rstrip("\n")
    with _FILE_LOCK:
        with open(BOOKS_FILE, "r", encoding="utf_8") as file:
            data_text = file.readlines()
        for position, line in enumerate(data_text):
            if line.rstrip("\n") == target:
                del data_text[position]
                break
        else:
            return
        with open(BOOKS_FILE, "w", encoding="utf_8") as file:
            file.writelines(data_text)


def set_folder_name(html: bs4.BeautifulSoup):
    """Derive a folder name from the ``<title>`` of a parsed page.

    Everything up to and including the first ``" - "`` is dropped when that
    separator is present; a title without it is used in full.  Characters
    listed in ``_NAME_TRANSLATION`` are then replaced.

    :param html: parsed page
    :type html: bs4.BeautifulSoup
    :return: the folder name, ``"None"`` when the page has no title/text
    :rtype: str
    """
    title = html.find("title")
    name = title.text if title is not None and html.text else "None"
    _, separator, tail = name.partition(" - ")
    if separator:
        name = tail
    return name.translate(_NAME_TRANSLATION)


def down_to_list(url: str):
    """Fetch ``url`` with a GET request.

    :param url: address to download
    :type url: str
    :return: the response body, or ``None`` when the body is empty
    """
    data = _HTTP.request("GET", url)
    return data.data if data.data else None


def _split_styler(style: str):
    """Return the text between the first and the last ``"`` of ``style``.

    NOTE(review): assumes the image URL is double-quoted inside the style
    attribute; a style without double quotes yields a meaningless slice.
    """
    begin = style.find('"') + 1
    end = style.rfind('"')
    return style[begin:end]


def update_SOURCES(index: int):
    """Record the image URLs currently present in the page of book ``index``.

    Every ``div.BV_oImage`` whose ``style`` attribute contains ``http`` adds
    or overwrites the entry ``{id: url}`` in ``SOURCES[index]``.  Elements
    without an ``id`` or a ``style`` attribute are ignored.

    :param index: position of the book in the shared lists
    :type index: int
    """
    soup = bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser")
    found = {}
    for element in soup.find_all("div", attrs={"class": "BV_oImage"}):
        style = element.attrs.get("style", "")
        if "http" in style and "id" in element.attrs:
            found[element.attrs["id"]] = _split_styler(style)
    SOURCES[index].update(found)


def do_action_now(index: int):
    """Page down once in book ``index`` and download newly known images.

    Images are stored as ``ignore/<title>/<NNNN>.jpg`` where ``NNNN`` is the
    position of the page id inside ``SOURCES[index]``.  Files that already
    exist are not downloaded again.

    :param index: position of the book in the shared lists
    :type index: int
    :return: the page ids of the book, in ``SOURCES[index]`` order
    :rtype: list
    """
    ACTS[index].perform()
    name = set_folder_name(
        bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser"))
    update_SOURCES(index)
    folder = f"ignore/{name}"
    # Also creates "ignore" itself and tolerates an existing folder.
    os.makedirs(folder, exist_ok=True)
    files = list(SOURCES[index])
    for position, source in enumerate(SOURCES[index].values()):
        if not source:
            continue
        target = f"{folder}/{position:04}.jpg"
        if os.path.exists(target):
            continue
        # Fetch before opening the file: a failed request must not leave an
        # empty image behind that later looks like a finished download.
        image = _HTTP.request("GET", source).data
        with open(target, "wb") as image_file:
            image_file.write(image)
    return files


def get_first_empty(index: int):
    """Return the first page id of book ``index`` without an image URL.

    :param index: position of the book in the shared lists
    :type index: int
    :return: the page id, or ``None`` when every page has a URL
    """
    return next(
        (page for page, source in SOURCES[index].items() if not source), None)


def act_now(index: int, path: str = None):
    """Download every page of book ``index`` and write its PDF.

    The browser is closed when the function ends, whether it succeeded or
    not.  The book is removed from ``BooksToDownload`` only after the PDF has
    been written.  A book for which no page was discovered is skipped.

    :param index: position of the book in the shared lists
    :type index: int
    :param path: directory for the PDF; ``ignore/<title>`` when empty
    :type path: str
    """
    global couters
    global treads
    driver = ACTS[index]._driver
    pages = SOURCES[index]
    try:
        if not pages:
            return
        name = set_folder_name(
            bs4.BeautifulSoup(driver.page_source, "html.parser"))
        last = list(pages)[-1]
        jumped_to_end = False
        while "" in pages.values():
            if not jumped_to_end:
                # Jump to the end of the book once before paging.
                LAST_ACTS[index].perform()
                jumped_to_end = True
            first_empty = get_first_empty(index)
            url_now = driver.current_url
            url_now = url_now[:url_now.find("#") + 1] + first_empty
            if pages[last]:
                # The end was reached but gaps remain: restart from the first
                # missing page and forget the last page so that paging goes
                # on until the end is reached again.
                driver.get(url_now)
                do_action_now(index)
                pages[last] = ""
            else:
                do_action_now(index)
        with _STATE_LOCK:
            couters += 1
        folder = f"ignore/{name}"
        if path:
            os.makedirs(path, exist_ok=True)
            pathus = f"{path}/{name}.pdf"
        else:
            pathus = f"{folder}/{name}.pdf"
        # glob gives no ordering guarantee; the zero-padded file names sort
        # into page order.  The title is escaped so that characters such as
        # "[" are not read as glob patterns.
        images = sorted(glob.glob(f"{glob.escape(folder)}/*.jpg"))
        document = img2pdf.convert(images)
        with open(pathus, "wb") as file:
            file.write(document)
    finally:
        driver.quit()
        with _STATE_LOCK:
            treads -= 1
    remove_text(OLD_REMOVE[index])


def open_firefox(url: str, slot: int = None):
    """Open a headless Firefox on ``url`` and register its action chains.

    Nothing happens for an empty ``url`` or one that starts with ``#``.

    :param url: url to run the firefox on
    :type url: str
    :param slot: existing position in ``ACTS``/``LAST_ACTS``/``SOURCES`` to
        fill; when omitted the new entries are appended.  Callers that start
        several threads should pass a slot so that the position does not
        depend on which browser finishes loading first.
    :type slot: int
    """
    if not url or url.startswith("#"):
        return
    web = give_me_web()
    book = webdriver.Firefox(web[0], executable_path=web[1], options=web[2])
    try:
        if not url.endswith("#1.undefined.8.none"):
            url = f'{url}#1.undefined.8.none'
        book.get(url)
        # The chains are built through the public API only.  Overwriting the
        # private ``_actions`` list with the chain object itself has no
        # effect in W3C mode and breaks ``perform()`` otherwise.
        act = action_chains.ActionChains(book)
        act.send_keys(keys.Keys.PAGE_DOWN)
        lst_act = action_chains.ActionChains(book)
        lst_act.key_down(keys.Keys.END).pause(3).key_up(keys.Keys.END)
    except Exception:
        book.quit()
        raise
    # The three lists must change together, otherwise two threads could
    # interleave their updates and misalign the lists.
    with _STATE_LOCK:
        if slot is None:
            LAST_ACTS.append(lst_act)
            ACTS.append(act)
            SOURCES.append({})
        else:
            LAST_ACTS[slot] = lst_act
            ACTS[slot] = act
            SOURCES[slot] = {}


def give_me_web():
    """Build the Firefox settings used for every session.

    :return: ``(profile, geckodriver executable name, options)``
    :rtype: tuple
    """
    options = webdriver.FirefoxOptions()
    fp = webdriver.FirefoxProfile()
    fp.set_preference("browser.download.folderList", 2)
    fp.set_preference("browser.download.manager.showWhenStarting", False)
    fp.set_preference("browser.download.dir", "ignore")
    fp.set_preference(
        "browser.helperApps.neverAsk.saveToDisk",
        "attachment/csv, text/plain, application/octet-stream, "
        "application/binary, text/csv, application/csv, application/excel, "
        "text/comma-separated-values, text/xml, application/xml, "
        "application/xls, excel/xls, application/excel 97-2003,"
        "application/Microsoft Excel 97-2003 Worksheet, "
        "application/vnd.ms-excel")
    # The comma between application/POT and application/PPS was missing.
    fp.set_preference(
        "browser.helperApps.neverAsk.openFile",
        "application/PDF, application/FDF, application/XFDF, "
        "application/LSL, application/LSO, application/LSS, "
        "application/IQY, application/RQY, application/XLK, "
        "application/XLS, application/XLT, application/POT, "
        "application/PPS, application/PPT, application/DOS, "
        "application/DOT, application/WKS, application/BAT, "
        "application/PS, application/EPS, application/WCH, "
        "application/WCM, application/WB1, application/WB3, "
        "application/RTF, application/DOC, application/MDB, "
        "application/MDE, application/WBK, application/WB1, "
        "application/WCH, application/WCM, application/AD, "
        "application/ADP, application/vnd.ms-excel")
    fp.set_preference("browser.download.panel.shown", False)
    options.add_argument('--lang=EN')
    options.headless = True
    fire = "geckodriver"
    return (fp, fire, options)


with open(BOOKS_FILE, "r", encoding="utf_8") as file:
    books = file.read().split("\n")

# Blank lines and "#" comments are skipped up front so that PATHS and
# OLD_REMOVE only hold entries for which a browser is actually requested.
urls = []
for b in books:
    if not b or b.startswith("#"):
        continue
    OLD_REMOVE.append(b)
    if b.find('````') > -1:
        PATHS.append(b[b.rfind('`') + 1:])
        b = b[:b.find('`')]
    else:
        PATHS.append(None)
    urls.append(b)

# Reserve one slot per entry before any thread starts; each thread fills
# its own slot, which keeps all parallel lists in file order.
ACTS.extend([None] * len(urls))
LAST_ACTS.extend([None] * len(urls))
SOURCES.extend([None] * len(urls))
for slot, b in enumerate(urls):
    t1 = threading.Thread(None, open_firefox, args=(b, slot))
    t1.start()
    THREADS.append(t1)
for t in THREADS:
    t.join()

# Drop the entries whose browser could not be opened, from every list.
opened = [i for i, act in enumerate(ACTS) if act is not None]
for shared in (ACTS, LAST_ACTS, SOURCES, PATHS, OLD_REMOVE):
    shared[:] = [shared[i] for i in opened]

lasts = []
for i, act in enumerate(ACTS):
    soup = bs4.BeautifulSoup(act._driver.page_source, "html.parser")
    # Seed every page id with an empty URL; act_now fills them in.
    SOURCES[i].update({
        key.attrs["id"]: ""
        for key in soup.find_all("div", attrs={"class": "BV_oImage"})
        if "id" in key.attrs})
    lasts.append(list(SOURCES[i])[-1] if SOURCES[i] else None)
couters = 0
treads = len(ACTS) - 1
for i in range(len(ACTS)):
    T = threading.Thread(None, act_now, args=(i, PATHS[i]))
    T.start()