diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b05ae27
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.vscode/
+venv/
\ No newline at end of file
diff --git a/BooksToDownload b/BooksToDownload
new file mode 100644
index 0000000..4b3e77d
--- /dev/null
+++ b/BooksToDownload
@@ -0,0 +1 @@
+https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=105179261
\ No newline at end of file
diff --git a/download books in bulks.py b/download books in bulks.py
new file mode 100644
index 0000000..c97bc32
--- /dev/null
+++ b/download books in bulks.py
@@ -0,0 +1,334 @@
+'''Download books from the kotar.cet.ac.il viewer and save each as a PDF.
+
+Each non-empty line of ``BooksToDownload`` that does not start with ``#``
+holds a viewer URL, optionally followed by four backticks and an output
+directory.  Every book gets its own Firefox session: the viewer is scrolled
+until the image URL of every page is known, the images are downloaded and
+merged into one PDF, and the line is then removed from ``BooksToDownload``.
+'''
+import glob
+import logging
+import os
+import re
+import threading
+
+import bs4
+import img2pdf
+import selenium
+import urllib3
+from selenium import webdriver
+from selenium.common import exceptions
+from selenium.webdriver.common import action_chains
+from selenium.webdriver.common import keys
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.actions import interaction
+
+BOOKS_FILE = "BooksToDownload"
+DEFAULT_DIR = "ignore"
+PATH_SEPARATOR = "````"
+START_FRAGMENT = "#1.undefined.8.none"
+
+# Per-book state.  open_firefox() appends to all of these in one locked step,
+# so the same index refers to the same book in every list.
+DRIVERS = []     # Firefox sessions
+ACTS = []        # action chains that press PAGE_DOWN
+LAST_ACTS = []   # action chains that press END
+SOURCES = []     # {page div id: image URL, "" while still unknown}
+PAGE_CACHE = []  # {image URL: downloaded bytes}
+PATHS = []       # output directory, or None for the default one
+OLD_REMOVE = []  # BooksToDownload line to delete once the book is saved
+THREADS = []     # threads that open the browsers
+
+_REGISTRY_LOCK = threading.Lock()    # guards the per-book lists above
+_BOOKS_FILE_LOCK = threading.Lock()  # guards rewrites of BOOKS_FILE
+_HTTP = urllib3.PoolManager()        # shared by every download
+
+# Characters replaced when a page title is turned into a file name.
+_NAME_TRANSLATION = str.maketrans(
+    {'"': "''", "\\": "||", ":": "׃", "/": "|", "\n": "", "?": ";;"})
+
+_SAVE_TO_DISK_TYPES = (
+    "attachment/csv, text/plain, application/octet-stream, "
+    "application/binary, text/csv, application/csv, application/excel, "
+    "text/comma-separated-values, text/xml, application/xml, "
+    "application/xls, excel/xls, application/excel 97-2003,"
+    "application/Microsoft Excel 97-2003 Worksheet, "
+    "application/vnd.ms-excel")
+
+# NOTE(review): "application/POT application/PPS" has no comma between the
+# two entries; kept exactly as found -- confirm whether that is intended.
+_OPEN_FILE_TYPES = (
+    "application/PDF, application/FDF, application/XFDF, application/LSL, "
+    "application/LSO, application/LSS, application/IQY, application/RQY, "
+    "application/XLK, application/XLS, application/XLT, "
+    "application/POT application/PPS, application/PPT, application/DOS, "
+    "application/DOT, application/WKS, application/BAT, application/PS, "
+    "application/EPS, application/WCH, application/WCM, application/WB1, "
+    "application/WB3, application/RTF, application/DOC, application/MDB, "
+    "application/MDE, application/WBK, application/WB1, application/WCH, "
+    "application/WCM, application/AD, application/ADP, "
+    "application/vnd.ms-excel")
+
+
+def remove_text(text: str):
+    '''remove_text removes the url from the text file
+
+    The first line equal to ``text`` (ignoring a trailing newline) is
+    deleted.  The file is left untouched when no line matches.
+
+    :param text: url to remove
+    :type text: str
+    '''
+    wanted = text.rstrip("\n")
+    # Books finish on different threads; serialize the read-modify-write.
+    with _BOOKS_FILE_LOCK:
+        with open(BOOKS_FILE, "r", encoding="utf_8") as file:
+            lines = file.readlines()
+        for position, line in enumerate(lines):
+            if line.rstrip("\n") == wanted:
+                del lines[position]
+                break
+        else:
+            return
+        with open(BOOKS_FILE, "w", encoding="utf_8") as file:
+            file.writelines(lines)
+
+
+def set_folder_name(html: bs4.BeautifulSoup):
+    '''set_folder_name builds a file name from the title of a page
+
+    Only the text after the first " - " is kept when the title has one, and
+    the characters listed in ``_NAME_TRANSLATION`` are replaced.
+
+    :param html: parsed page
+    :type html: bs4.BeautifulSoup
+    :return: the cleaned name, or "None" when no usable title is found
+    :rtype: str
+    '''
+    title_tag = html.find("title")
+    title = title_tag.text if title_tag is not None else ""
+    _, separator, tail = title.partition(" - ")
+    name = (tail if separator else title).translate(_NAME_TRANSLATION)
+    return name or "None"
+
+
+def down_to_list(url: str):
+    '''down_to_list downloads a url with a GET request
+
+    :param url: address to fetch
+    :type url: str
+    :return: the response body, or None when the body is empty
+    '''
+    return _HTTP.request("GET", url).data or None
+
+
+def _split_styler(style: str):
+    '''_split_styler returns the text between the first and last double quote
+
+    :param style: value of a ``style`` attribute
+    :type style: str
+    :return: the quoted text, or "" when there are fewer than two quotes
+    :rtype: str
+    '''
+    begin = style.find('"') + 1
+    end = style.rfind('"')
+    return style[begin:end] if end >= begin else ""
+
+
+def _page_divs(index: int):
+    '''_page_divs returns the ``BV_oImage`` divs currently in the book's page
+
+    :param index: position of the book in the per-book lists
+    :type index: int
+    '''
+    html = bs4.BeautifulSoup(DRIVERS[index].page_source, "html.parser")
+    return html.find_all("div", attrs={"class": "BV_oImage"})
+
+
+def update_SOURCES(index: int):
+    '''update_SOURCES records the image url of every page div that shows one
+
+    :param index: position of the book in the per-book lists
+    :type index: int
+    '''
+    for div in _page_divs(index):
+        style = div.attrs.get("style", "")
+        if "http" in style:
+            SOURCES[index][div.attrs["id"]] = _split_styler(style)
+
+
+def do_action_now(index: int):
+    '''do_action_now presses PAGE_DOWN once and collects the pages known so far
+
+    Each image url is downloaded only the first time it is seen.
+
+    :param index: position of the book in the per-book lists
+    :type index: int
+    :return: one entry per page div: the image bytes, or the div id while
+        the image url is still unknown
+    :rtype: list
+    '''
+    ACTS[index].perform()
+    update_SOURCES(index)
+    cache = PAGE_CACHE[index]
+    pages = []
+    for page_id, url in SOURCES[index].items():
+        if not url:
+            pages.append(page_id)
+            continue
+        if url not in cache:
+            cache[url] = _HTTP.request("GET", url).data
+        pages.append(cache[url])
+    return pages
+
+
+def get_first_empty(index: int):
+    '''get_first_empty returns the id of the first page without an image url
+
+    :param index: position of the book in the per-book lists
+    :type index: int
+    :return: the page div id, or None when every url is known
+    '''
+    for page_id, url in SOURCES[index].items():
+        if not url:
+            return page_id
+    return None
+
+
+def act_now(index: int, path: str = None):
+    '''act_now scrolls through one book, saves it as a PDF and closes firefox
+
+    The PDF is written into ``path`` when given, otherwise into a folder
+    named after the book under ``DEFAULT_DIR``.  The book's line is removed
+    from the books file only after the PDF has been written.
+
+    :param index: position of the book in the per-book lists
+    :type index: int
+    :param path: output directory, defaults to None
+    :type path: str, optional
+    '''
+    driver = DRIVERS[index]
+    sources = SOURCES[index]
+    try:
+        if not sources:
+            logging.warning("no pages found for %s", OLD_REMOVE[index])
+            return
+        name = set_folder_name(
+            bs4.BeautifulSoup(driver.page_source, "html.parser"))
+        last = list(sources)[-1]
+        pages = []
+        LAST_ACTS[index].perform()
+        while "" in sources.values():
+            if sources[last]:
+                # The last page is known but gaps remain: jump to the first
+                # gap, then mark the last page unknown again so the plain
+                # PAGE_DOWN branch runs until its url is seen once more.
+                base = driver.current_url.partition("#")[0]
+                driver.get(f"{base}#{get_first_empty(index)}")
+                pages = do_action_now(index)
+                sources[last] = ""
+            else:
+                pages = do_action_now(index)
+        folder = path if path else os.path.join(DEFAULT_DIR, name)
+        os.makedirs(folder, exist_ok=True)
+        with open(os.path.join(folder, f"{name}.pdf"), "wb") as file:
+            file.write(img2pdf.convert(pages))
+        remove_text(OLD_REMOVE[index])
+    finally:
+        # Close the browser even when scraping or writing fails.
+        driver.quit()
+        PAGE_CACHE[index].clear()
+
+
+def open_firefox(url: str, path: str = None, line: str = None):
+    '''open_firefox opens the firefox browser on the specific url, and sets all the settings for the specific session
+
+    Empty urls and urls starting with ``#`` are skipped.
+
+    :param url: url to run the firefox on
+    :type url: str
+    :param path: output directory for the book, defaults to None
+    :type path: str, optional
+    :param line: books-file line to delete once the book is saved,
+        defaults to ``url``
+    :type line: str, optional
+    '''
+    if not url or url.startswith("#"):
+        return
+    profile, driver_path, options = give_me_web()
+    # NOTE(review): this is the constructor call the script was written
+    # with; confirm it is still accepted by the installed Selenium version.
+    book = webdriver.Firefox(
+        profile, executable_path=driver_path, options=options)
+    try:
+        book.get(url if url.endswith(START_FRAGMENT)
+                 else f"{url}{START_FRAGMENT}")
+    except Exception:
+        book.quit()  # do not leave a stray browser behind
+        raise
+    page_down = action_chains.ActionChains(book).send_keys(
+        keys.Keys.PAGE_DOWN)
+    jump_to_end = action_chains.ActionChains(book).key_down(
+        keys.Keys.END).pause(3).key_up(keys.Keys.END)
+    # Browsers open on several threads at once; register the book in every
+    # list under one lock so that its index is the same everywhere.
+    with _REGISTRY_LOCK:
+        DRIVERS.append(book)
+        ACTS.append(page_down)
+        LAST_ACTS.append(jump_to_end)
+        SOURCES.append({})
+        PAGE_CACHE.append({})
+        PATHS.append(path)
+        OLD_REMOVE.append(url if line is None else line)
+
+
+def give_me_web():
+    '''give_me_web builds the settings for a new firefox session
+
+    :return: firefox profile, geckodriver executable name, firefox options
+    :rtype: tuple
+    '''
+    options = webdriver.FirefoxOptions()
+    options.add_argument('--lang=EN')
+    profile = webdriver.FirefoxProfile()
+    preferences = (
+        ("browser.download.folderList", 2),
+        ("browser.download.manager.showWhenStarting", False),
+        ("browser.download.dir", DEFAULT_DIR),
+        ("browser.helperApps.neverAsk.saveToDisk", _SAVE_TO_DISK_TYPES),
+        ("browser.helperApps.neverAsk.openFile", _OPEN_FILE_TYPES),
+        ("browser.download.panel.shown", False),
+    )
+    for preference, value in preferences:
+        profile.set_preference(preference, value)
+    return (profile, "geckodriver", options)
+
+
+def main():
+    '''main downloads every book listed in the books file'''
+    with open(BOOKS_FILE, "r", encoding="utf_8") as file:
+        lines = file.read().split("\n")
+    # Open one browser per book, all at the same time.
+    for line in lines:
+        if not line or line.startswith("#"):
+            continue
+        url, separator, path = line.partition(PATH_SEPARATOR)
+        opener = threading.Thread(
+            None, open_firefox, args=(url, path if separator else None, line))
+        opener.start()
+        THREADS.append(opener)
+    for opener in THREADS:
+        opener.join()
+    # Register every page div of every book with a still-unknown url.
+    for index, sources in enumerate(SOURCES):
+        sources.update({div.attrs["id"]: "" for div in _page_divs(index)})
+    workers = [threading.Thread(None, act_now, args=(index, path))
+               for index, path in enumerate(PATHS)]
+    for worker in workers:
+        worker.start()
+    for worker in workers:
+        worker.join()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/geckodriver b/geckodriver
new file mode 100755
index 0000000..8f84330
Binary files /dev/null and b/geckodriver differ
diff --git a/geckodriver.log b/geckodriver.log
new file mode 100644
index 0000000..b512a93
--- /dev/null
+++ b/geckodriver.log
@@ -0,0 +1,13 @@
+1682411282110 geckodriver INFO Listening on 127.0.0.1:60235
+1682411282158 mozrunner::runner INFO Running command: MOZ_CRASHREPORTER="1" MOZ_CRASHREPORTER_NO_REPORT="1" MOZ_CRASHREPORTER_SHUTDOWN="1" MOZ_NO_REMOTE="1" "/usr ... EN" "--remote-debugging-port" "52929" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "/tmp/rust_mozprofileyJU3uD"
+1682411283746 Marionette INFO Marionette enabled
+1682411283750 Marionette INFO Listening on port 41397
+WebDriver BiDi listening on ws://localhost:52929
+Read port: 41397
+1682411283910 RemoteAgent WARN TLS certificate errors will be ignored for this session
+console.warn: SearchSettings: "get: No settings file exists, new profile?" (new NotFoundError("Could not open the file at /tmp/rust_mozprofileyJU3uD/search.json.mozlz4", (void 0)))
+Missing chrome or resource URL: resource://gre/modules/UpdateListener.jsm
+Missing chrome or resource URL: resource://gre/modules/UpdateListener.sys.mjs
+DevTools listening on ws://localhost:52929/devtools/browser/6a936edc-696b-4097-8331-7af0838c543e
+JavaScript warning: https://cdn.cet.ac.il/libs/cet.editorManager/1.0/cetEditorManager.js, line 733: unreachable code after return statement
+JavaScript warning: https://kotar.cet.ac.il/ClientResourcesServingHandler.ashx?h=420b0f7c6bfad886e847426ac12334514f5f4fc6&t=javascript&minify=True, line 100: unreachable code after return statement