update bookbulk

2023-04-25 11:32:01 +03:00
parent 4debd8ebfa
commit 116858bf48
5 changed files with 210 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.vscode/
+venv/
--- a/1
+++ b/1
@@ -0,0 +1 @@
+https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=105179261
--- a/bulks.py
+++ b/bulks.py
@@ -0,0 +1,194 @@
+import selenium
+from selenium.webdriver.common import action_chains
+import urllib3
+import bs4
+import re
+import os
+import glob
+from selenium.common import exceptions
+from selenium import webdriver
+import img2pdf
+import threading
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.actions import interaction
+from selenium.webdriver.common import keys
+# from parser import ArgumentParser
+
+# Module-level registries shared by the functions below. The per-book lists are
+# addressed by the same index (position i is meant to describe book i).
+ACTS = []  # one ActionChains per opened book, queued with PAGE_DOWN (appended by open_firefox)
+LAST_ACTS = []  # one ActionChains per opened book, queued with the END key (appended by open_firefox)
+SOURCES = []  # one dict per opened book: id of a BV_oImage div -> text taken from its style ("" until found)
+Books = []  # not referenced anywhere else in the visible code
+THREADS = []  # threads started on open_firefox; joined before the scraping starts
+PATHS = []  # per-line output folder read from the books file, or None for the default location
+OLD_REMOVE = []  # raw line read from the books file, later handed to remove_text
+
+
+def remove_text(text: str):
+    '''remove_text removes the url from the text file
+
+    Only the first line of "BooksToDownload" that equals ``text`` is dropped. Trailing newlines are
+    ignored on both sides, so the last line of a file without a final newline still matches.
+    When no line matches, the file is left untouched (the function used to recurse without end
+    in that case).
+
+    :param text: url to remove
+    :type text: str
+    '''
+    wanted = text.rstrip("\n")
+    with open("BooksToDownload", "r", encoding="utf_8") as file:
+        data_text = file.readlines()
+    for position, line in enumerate(data_text):
+        if line.rstrip("\n") == wanted:
+            del data_text[position]
+            # rewrite the file only when something was really removed
+            with open("BooksToDownload", 'w', encoding='utf_8') as file:
+                file.writelines(data_text)
+            return
+
+
+def set_folder_name(html: bs4.BeautifulSoup):
+    '''set_folder_name builds the folder / file name of a book from the page title
+
+    The text after the first " - " of the <title> is kept (the whole title when there is no " - "),
+    then characters that are troublesome in file names are replaced by substitute text.
+
+    :param html: parsed page of the book
+    :type html: bs4.BeautifulSoup
+    :return: the cleaned name, "None" when the page has no <title>
+    :rtype: str
+    '''
+    title = html.find("title")
+    # a page without <title> used to raise AttributeError here
+    name = title.text if title is not None else "None"
+    # cut the prefix only when the separator is really present; a blind find()+3 would
+    # silently drop the first two characters of a title without " - "
+    _, separator, tail = name.partition(" - ")
+    if separator:
+        name = tail
+    return (name.replace('"', "''")
+            .replace("\\", '||')
+            .replace(r':', r'׃')
+            .replace(r"/", r"|")
+            .replace("\n", "")
+            .replace('?', ';;'))
+
+
+def down_to_list(url: str):
+    '''down_to_list downloads the url with a GET request
+
+    :param url: address to fetch
+    :type url: str
+    :return: body of the response, or None when the body is empty
+    '''
+    response = urllib3.PoolManager().request("GET", url)
+    return response.data or None
+
+
+def _split_styler(style: str):
+    '''_split_styler returns the text between the first and the last double quote of the style
+
+    :param style: content of a style attribute
+    :type style: str
+    :return: the text enclosed by the outermost double quotes
+    :rtype: str
+    '''
+    opening_quote = style.find('"')
+    closing_quote = style.rfind('"')
+    return style[opening_quote + 1:closing_quote]
+
+
+def update_SOURCES(index: int):
+    '''update_SOURCES records the style content of every BV_oImage div that already holds a link
+
+    The current page of the book's browser is parsed; each div of class "BV_oImage" whose style
+    contains "http" is stored in SOURCES[index] under the id of the div.
+
+    :param index: position of the book in ACTS / SOURCES
+    :type index: int
+    '''
+    global SOURCES, ACTS
+    page = bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser")
+    found = {}
+    for div in page.find_all("div", attrs={"class": "BV_oImage"}):
+        if "http" in div.attrs["style"]:
+            found[div.attrs["id"]] = _split_styler(div.attrs["style"])
+    SOURCES[index].update(found)
+
+
+def do_action_now(index: int):
+    '''do_action_now performs the queued action of the book, then downloads every page known so far
+
+    :param index: position of the book in ACTS / SOURCES
+    :type index: int
+    :return: one entry per key of SOURCES[index], in the same order: the downloaded bytes when
+        the key has a value, otherwise the key itself
+    :rtype: list
+    '''
+    global SOURCES
+    global ACTS
+    ACTS[index].perform()
+    name = set_folder_name(bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser"))
+    update_SOURCES(index)
+    # makedirs also creates a missing "ignore" parent, and exist_ok keeps two threads that
+    # work on the same name from failing between the check and the creation
+    os.makedirs("ignore/"+f"{name}", exist_ok=True)
+    # a single pool for the whole batch, so connections are reused between the requests
+    pool = urllib3.PoolManager()
+    return [pool.request("GET", page_url).data if page_url else page_id
+            for page_id, page_url in SOURCES[index].items()]
+# def check_and_act(index: int,last):
+
+
+def get_first_empty(index: int):
+    '''get_first_empty finds the first key of SOURCES[index] that still has no value
+
+    :param index: position of the book in SOURCES
+    :type index: int
+    :return: the first key whose value is empty, or None when every key has a value
+    '''
+    global SOURCES
+    return next((page_id for page_id, page_url in SOURCES[index].items() if not page_url), None)
+
+
+def act_now(index: int, path: str = None):
+    '''act_now keeps driving one book until every entry of SOURCES[index] has a value, then writes the pdf
+
+    While at least one value of SOURCES[index] is "", the browser of the book is driven through
+    do_action_now. Once no value is "" the result of the last do_action_now call is converted with
+    img2pdf and written to disk, the browser is closed and the line of the book is removed from
+    the books file with remove_text.
+
+    :param index: position of the book in ACTS / LAST_ACTS / SOURCES / OLD_REMOVE
+    :type index: int
+    :param path: folder for the pdf; when empty or None the pdf goes to "ignore/<name>/<name>.pdf"
+    :type path: str
+    '''
+    global SOURCES
+    global ACTS
+    global couters
+    # NOTE(review): TREADS is never assigned or read in the visible code; the module-level list is
+    # called THREADS - looks like a typo, confirm before relying on it
+    global TREADS
+    global OLD_REMOVE
+    global LAST_ACTS
+    global treads
+    # one-shot flag: 0 until LAST_ACTS[index] has been performed once
+    s = 0
+    name = set_folder_name(bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser"))
+    save_first = ""
+    # last key of the book at call time; raises IndexError when SOURCES[index] is empty
+    last = list(SOURCES[index].keys())[-1]
+    while "" in SOURCES[index].values():
+        if s == 0:
+            # only on the first pass: perform the END-key chain built in open_firefox
+            LAST_ACTS[index].perform()
+            s = 1
+        save_first = get_first_empty(index)
+        url_now = ACTS[index]._driver.current_url
+        # keep the url up to and including "#", then append the key of the first empty entry
+        url_now = url_now[:url_now.find("#")+1] + save_first
+        if SOURCES[index][last] and "" in SOURCES[index].values():
+            # the last entry is filled but earlier ones are not: load the url built above
+            ACTS[index]._driver.get(url_now)
+            pages = do_action_now(index)
+            # blanking the last entry keeps the loop condition true for another pass
+            # (presumably so the end of the book is visited again - TODO confirm)
+            SOURCES[index][last] = ""
+        else:
+            pages = do_action_now(index)
+    # NOTE(review): pages is only bound inside the loop above; if the loop body never runs while
+    # SOURCES[index] is non-empty, the convert call below raises NameError
+    if SOURCES[index] and "" not in SOURCES[index].values():
+        # NOTE(review): couters and treads are plain module globals changed from several threads
+        # without a lock
+        couters += 1
+        pathus = f'{path}/{name}.pdf' if path else f"ignore/{name}/{name}.pdf"
+        with open(pathus, "wb") as file:
+            file.write(img2pdf.convert(pages))
+            ACTS[index]._driver.quit()
+            remove_text(OLD_REMOVE[index])
+            treads -= 1
+
+
+# open_firefox is started in several threads at once: this lock keeps the three per-book appends
+# together, so position i of LAST_ACTS, ACTS and SOURCES always belongs to the same browser
+_REGISTER_LOCK = threading.Lock()
+
+
+def open_firefox(url: str):
+    '''open_firefox opens the firefox browser on the specific url, and sets all the settings for the specific session
+
+    Empty urls and urls starting with "#" are skipped without opening a browser. For any other
+    url the suffix "#1.undefined.8.none" is appended when missing, the page is loaded, and two action
+    chains (PAGE_DOWN and END key) plus an empty dict are registered in ACTS, LAST_ACTS and SOURCES.
+
+    NOTE(review): the position a book gets in those lists depends on which thread reaches the
+    registration first, not on the order in which the threads were started.
+
+    :param url: url to run the firefox on
+    :type url: str
+    '''
+    global SOURCES
+    global ACTS
+    global LAST_ACTS
+    if not url or url.startswith("#"):
+        return
+    # build the profile only for lines that really open a browser
+    web = give_me_web()
+    book = webdriver.Firefox(web[0], executable_path=web[1], options=web[2])
+    url = url if url.endswith("#1.undefined.8.none") else f'{url}#1.undefined.8.none'
+    book.get(url)
+    act = action_chains.ActionChains(book)
+    lst_act = action_chains.ActionChains(book)
+    lst_act._actions = [lst_act.key_down(keys.Keys.END), lst_act.pause(3), lst_act.key_up(keys.Keys.END)]
+    act._actions = [act.send_keys(keys.Keys.PAGE_DOWN)]
+    with _REGISTER_LOCK:
+        LAST_ACTS.append(lst_act)
+        ACTS.append(act)
+        SOURCES.append({})
+
+
+def give_me_web():
+    '''give_me_web prepares what is needed to start a firefox session
+
+    :return: the firefox profile with the download preferences, the name of the driver
+        executable and the firefox options
+    :rtype: tuple
+    '''
+    options = webdriver.FirefoxOptions()
+    fp = webdriver.FirefoxProfile()
+    preferences = (
+        ("browser.download.folderList", 2),
+        ("browser.download.manager.showWhenStarting", False),
+        ("browser.download.dir", "ignore"),
+        ("browser.helperApps.neverAsk.saveToDisk",
+         "attachment/csv, text/plain, application/octet-stream, application/binary, text/csv, application/csv, application/excel, text/comma-separated-values, text/xml, application/xml, application/xls, excel/xls, application/excel 97-2003,application/Microsoft Excel 97-2003 Worksheet, application/vnd.ms-excel"),
+        ("browser.helperApps.neverAsk.openFile",
+         "application/PDF, application/FDF, application/XFDF, application/LSL, application/LSO, application/LSS, application/IQY, application/RQY, application/XLK, application/XLS, application/XLT, application/POT application/PPS, application/PPT, application/DOS, application/DOT, application/WKS, application/BAT, application/PS, application/EPS, application/WCH, application/WCM, application/WB1, application/WB3, application/RTF, application/DOC, application/MDB, application/MDE, application/WBK, application/WB1, application/WCH, application/WCM, application/AD, application/ADP, application/vnd.ms-excel"),
+        ("browser.download.panel.shown", False),
+    )
+    # preferences are applied in the order listed above
+    for pref_name, pref_value in preferences:
+        fp.set_preference(pref_name, pref_value)
+    options.add_argument('--lang=EN')
+    return (fp, "geckodriver", options)
+
+
+# Read the list of books: one entry per line. A line starting with "#" is a comment, and a line
+# may carry an output folder after the url, separated by four backticks (url````folder).
+with open("BooksToDownload", "r", encoding="utf_8") as file:
+    books = file.read().split("\n")
+
+for b in books:
+    if b.startswith("#"):
+        continue
+    raw_line = b
+    if b.find('````') > -1:
+        out_dir = b[b.rfind('`')+1:]
+        b = b[:b.find('`')]
+    else:
+        out_dir = None
+    # a line without a url opens no browser, so it must not get an entry in OLD_REMOVE / PATHS
+    # either; otherwise position i of those lists stops matching position i of ACTS
+    if not b.strip():
+        continue
+    OLD_REMOVE.append(raw_line)
+    PATHS.append(out_dir)
+    t1 = threading.Thread(None, open_firefox, args=(b,))
+    t1.start()
+    THREADS.append(t1)
+# NOTE(review): the browsers register themselves in ACTS in the order the threads finish, which is
+# not guaranteed to be the order of OLD_REMOVE / PATHS built above - confirm before relying on it
+for t in THREADS:
+    t.join()
+lasts = []
+for i in range(len(ACTS)):
+    # start every page of the book with an empty value; the ids come from the BV_oImage divs
+    SOURCES[i].update({key.attrs["id"]: ""
+                       for key in bs4.BeautifulSoup(ACTS[i]._driver.page_source, "html.parser").find_all(
+        "div", attrs={"class": "BV_oImage"})})
+    lasts.append(list(SOURCES[i].keys())[-1])
+# counters shared with act_now through "global"
+couters = 0
+treads = len(ACTS)-1
+for i in range(len(ACTS)):
+    T = threading.Thread(None, act_now, args=(i, PATHS[i]))
+    T.start()
--- a/BIN
+++ b/BIN
--- a/geckodriver.log
+++ b/geckodriver.log
@@ -0,0 +1,13 @@
+1682411282110	geckodriver	INFO	Listening on 127.0.0.1:60235
+1682411282158	mozrunner::runner	INFO	Running command: MOZ_CRASHREPORTER="1" MOZ_CRASHREPORTER_NO_REPORT="1" MOZ_CRASHREPORTER_SHUTDOWN="1" MOZ_NO_REMOTE="1" "/usr ... EN" "--remote-debugging-port" "52929" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "/tmp/rust_mozprofileyJU3uD"
+1682411283746	Marionette	INFO	Marionette enabled
+1682411283750	Marionette	INFO	Listening on port 41397
+WebDriver BiDi listening on ws://localhost:52929
+Read port: 41397
+1682411283910	RemoteAgent	WARN	TLS certificate errors will be ignored for this session
+console.warn: SearchSettings: "get: No settings file exists, new profile?" (new NotFoundError("Could not open the file at /tmp/rust_mozprofileyJU3uD/search.json.mozlz4", (void 0)))
+Missing chrome or resource URL: resource://gre/modules/UpdateListener.jsm
+Missing chrome or resource URL: resource://gre/modules/UpdateListener.sys.mjs
+DevTools listening on ws://localhost:52929/devtools/browser/6a936edc-696b-4097-8331-7af0838c543e
+JavaScript warning: https://cdn.cet.ac.il/libs/cet.editorManager/1.0/cetEditorManager.js, line 733: unreachable code after return statement
+JavaScript warning: https://kotar.cet.ac.il/ClientResourcesServingHandler.ashx?h=420b0f7c6bfad886e847426ac12334514f5f4fc6&t=javascript&minify=True, line 100: unreachable code after return statement
				`@@ -0,0 +1 @@`
				`https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=105179261`