from selenium.webdriver.common import action_chains
import urllib3
import bs4
import os
import glob
from selenium import webdriver
import img2pdf
import threading
from selenium.webdriver.common import keys
# from parser import ArgumentParser

ACTS = []        # one ActionChains (page-down scroller) per book
LAST_ACTS = []   # one ActionChains (jump-to-end scroller) per book
SOURCES = []     # one {page id: image url} dict per book
THREADS = []     # browser-opening threads
PATHS = []       # optional output directory per book (None = default)
OLD_REMOVE = []  # original input lines, removed from the file once a book is done


def remove_text(text: str):
    '''remove_text removes the url from the text file

    :param text: url to remove
    :type text: str
    '''
    with open("BooksToDownload", "r", encoding="utf_8") as file:
        data_text = file.readlines()
    if text in data_text:
        data_text.remove(text)
        with open("BooksToDownload", "w", encoding="utf_8") as file:
            file.writelines(data_text)
    elif not text.endswith("\n"):
        # The line in the file may still carry its trailing newline.
        remove_text(f"{text}\n")


def set_folder_name(html: bs4.BeautifulSoup):
    '''Derive a filesystem-safe folder name from the page title.'''
    name = html.find("title").text if html.find("title") else " None"
    # Keep the part after " - " and replace characters that are illegal in file names.
    return (name[name.find(" - ") + 3:]
            .replace('"', "''")
            .replace("\\", "||")
            .replace(":", "׃")
            .replace("/", "|")
            .replace("\n", "")
            .replace("?", ";;"))


def down_to_list(url: str):
    '''Fetch a url and return the raw response body, or None if it is empty.'''
    data = urllib3.PoolManager().request("GET", url)
    return data.data if data.data else None


def _split_styler(style: str):
    '''Return the substring between the first and last double quote of a style attribute.'''
    begin = style.find('"') + 1
    end = style.rfind('"')
    return style[begin:end]


def update_SOURCES(index: int):
    '''Scan the current page source and record the image url of every loaded page.'''
    global SOURCES, ACTS
    keys_html = bs4.BeautifulSoup(
        ACTS[index]._driver.page_source, "html.parser").find_all(
        "div", attrs={"class": "BV_oImage"})
    dic_update = {key.attrs["id"]: _split_styler(key.attrs["style"])
                  for key in keys_html if "http" in key.attrs["style"]}
    SOURCES[index].update(dic_update)
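# For reference, an assumed example of the viewer's markup (not taken from the
# original source): each loaded page appears to be rendered roughly as
#   <div class="BV_oImage" id="..." style='background-image: url("http://.../page.jpg")'>
# so _split_styler() simply pulls the quoted url out of the style attribute, e.g.
#   _split_styler('background-image: url("http://host/page1.jpg")')  ->  'http://host/page1.jpg'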
def do_action_now(index: int):
    '''Scroll the viewer one step, harvest the visible page images and save them as jpg files.'''
    global SOURCES
    global ACTS
    ACTS[index].perform()
    name = set_folder_name(bs4.BeautifulSoup(
        ACTS[index]._driver.page_source, "html.parser"))
    update_SOURCES(index)
    os.makedirs(f"ignore/{name}", exist_ok=True)
    # files is the ordered list of page ids; its index gives the page number used in the file name.
    files = list(SOURCES[index])
    for s in SOURCES[index]:
        if SOURCES[index][s]:
            if not os.path.exists(f"ignore/{name}/{files.index(s):04}.jpg"):
                with open(f"ignore/{name}/{files.index(s):04}.jpg", "wb") as F:
                    F.write(urllib3.PoolManager().request(
                        "GET", SOURCES[index][s]).data)
            # files[files.index(s)] = urllib3.PoolManager().request("GET", SOURCES[index][s]).data
    return files


# def check_and_act(index: int, last):


def get_first_empty(index: int):
    '''Return the id of the first page whose image url is still unknown, or None.'''
    global SOURCES
    for s in SOURCES[index]:
        if not SOURCES[index][s]:
            return s
    return None


def act_now(index: int, path: str = None):
    '''Keep scrolling and downloading until every page of book *index* is saved,
    then bundle the jpg files into a single pdf.

    :param index: position of the book in the global lists
    :param path: optional output directory for the pdf
    '''
    global SOURCES
    global ACTS
    global counters
    global OLD_REMOVE
    global LAST_ACTS
    global threads_left
    jumped_to_end = False
    name = set_folder_name(bs4.BeautifulSoup(
        ACTS[index]._driver.page_source, "html.parser"))
    last = list(SOURCES[index].keys())[-1]
    while "" in SOURCES[index].values():
        if not jumped_to_end:
            # Jump to the end of the book once so the viewer registers every page id.
            LAST_ACTS[index].perform()
            jumped_to_end = True
        save_first = get_first_empty(index)
        url_now = ACTS[index]._driver.current_url
        # The fragment after "#" addresses a page id; point it at the first missing page.
        url_now = url_now[:url_now.find("#") + 1] + save_first
        if SOURCES[index][last] and "" in SOURCES[index].values():
            # The last page is already known but gaps remain: jump straight to the gap.
            ACTS[index]._driver.get(url_now)
            do_action_now(index)
            SOURCES[index][last] = ""
        else:
            do_action_now(index)
    if SOURCES[index] and "" not in SOURCES[index].values():
        counters += 1
        pathus = f"{path}/{name}.pdf" if path else f"ignore/{name}/{name}.pdf"
        with open(pathus, "wb") as file:
            # Sort the page files so the pdf keeps the original page order.
            file.write(img2pdf.convert(sorted(glob.glob(f"ignore/{name}/*.jpg"))))
        ACTS[index]._driver.quit()
        remove_text(OLD_REMOVE[index])
        threads_left -= 1


def open_firefox(url: str):
    '''open_firefox opens a Firefox browser on the given url and prepares the
    action chains used to scroll through the book.

    :param url: url to run Firefox on
    :type url: str
    '''
    global SOURCES
    global ACTS
    global LAST_ACTS
    web = give_me_web()
    if url and not url.startswith("#"):
        # Selenium 3 style construction: profile, geckodriver path and options.
        book = webdriver.Firefox(web[0], executable_path=web[1], options=web[2])
        url = url if url.endswith("#1.undefined.8.none") else f"{url}#1.undefined.8.none"
        book.get(url)
        # One chain scrolls a single page down, the other jumps to the end of the book.
        act = action_chains.ActionChains(book)
        act.send_keys(keys.Keys.PAGE_DOWN)
        lst_act = action_chains.ActionChains(book)
        lst_act.key_down(keys.Keys.END).pause(3).key_up(keys.Keys.END)
        LAST_ACTS.append(lst_act)
        ACTS.append(act)
        SOURCES.append({})


def give_me_web():
    '''Build the Firefox profile, driver path and options for a headless session.'''
    options = webdriver.FirefoxOptions()
    fp = webdriver.FirefoxProfile()
    fp.set_preference("browser.download.folderList", 2)
    fp.set_preference("browser.download.manager.showWhenStarting", False)
    fp.set_preference("browser.download.dir", "ignore")
    fp.set_preference(
        "browser.helperApps.neverAsk.saveToDisk",
        "attachment/csv, text/plain, application/octet-stream, application/binary, "
        "text/csv, application/csv, application/excel, text/comma-separated-values, "
        "text/xml, application/xml, application/xls, excel/xls, application/excel 97-2003, "
        "application/Microsoft Excel 97-2003 Worksheet, application/vnd.ms-excel")
    fp.set_preference(
        "browser.helperApps.neverAsk.openFile",
        "application/PDF, application/FDF, application/XFDF, application/LSL, "
        "application/LSO, application/LSS, application/IQY, application/RQY, "
        "application/XLK, application/XLS, application/XLT, application/POT, "
        "application/PPS, application/PPT, application/DOS, application/DOT, "
        "application/WKS, application/BAT, application/PS, application/EPS, "
        "application/WCH, application/WCM, application/WB1, application/WB3, "
        "application/RTF, application/DOC, application/MDB, application/MDE, "
        "application/WBK, application/WB1, application/WCH, application/WCM, "
        "application/AD, application/ADP, application/vnd.ms-excel")
    fp.set_preference("browser.download.panel.shown", False)
    options.add_argument("--lang=EN")
    options.headless = True
    fire = "geckodriver"
    return (fp, fire, options)


with open("BooksToDownload", "r", encoding="utf_8") as file:
    books = file.read().split("\n")

# Open one browser per requested book; "#" lines and empty lines are filtered inside open_firefox.
for b in books:
    if b.find("````") > -1 and not b.startswith("#"):
        OLD_REMOVE.append(b)
        PATHS.append(b[b.rfind("`") + 1:])
        b = b[:b.find("`")]
    elif not b.startswith("#"):
        OLD_REMOVE.append(b)
        PATHS.append(None)
    t1 = threading.Thread(target=open_firefox, args=(b,))
    t1.start()
    THREADS.append(t1)
for t in THREADS:
    t.join()

# Register every page id already present in each viewer, with an empty url until it is resolved.
lasts = []
for i in range(len(ACTS)):
    SOURCES[i].update({key.attrs["id"]: "" for key in bs4.BeautifulSoup(
        ACTS[i]._driver.page_source, "html.parser").find_all(
        "div", attrs={"class": "BV_oImage"})})
    lasts.append(list(SOURCES[i].keys())[-1])

counters = 0
threads_left = len(ACTS) - 1
for i in range(len(ACTS)):
    T = threading.Thread(target=act_now, args=(i, PATHS[i]))
    T.start()
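# Example "BooksToDownload" input file (a sketch inferred from the parsing above;
# the URLs and path are made up): one book URL per line, lines starting with "#"
# are ignored, and an optional output directory can follow the URL after four backticks.
#
#   # books queued for download
#   https://example.com/viewer/book-one
#   https://example.com/viewer/book-two````/home/user/Documents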