Merge branch 'master' of https://git.saret.tk/saret/BulkBooks

books to download
Merge pull request 'master' (#3 ) from Mooooooooo/BulkBooks:master into master
2023-05-02 22:33:40 +03:00 · 2023-05-02 22:33:31 +03:00 · 2023-05-02 14:47:44 +03:00 · 2023-05-02 14:46:22 +03:00 · 2023-05-01 23:08:46 +03:00 · 2023-05-01 22:56:39 +03:00
5 changed files with 262 additions and 197 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .vscode/
-venv/
+venv/
 ignore/
--- a/34
+++ b/34
@@ -1 +1,33 @@
-https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=105179261
+https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110994706
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110918661
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110930833
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110933810
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110938002
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110938002
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110942621
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110942621
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110948032
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110948032
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110959329
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=105256337
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=105018830
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=109444642
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=109400325
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=109392561
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=107884166
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=97645077
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=97645077
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=102594097
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=102591827
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=102588217
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=102589202
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=101052334
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=101048613
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=101986400
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=100976710
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=100974786
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=108426718
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=108236946
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=106246523
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=104502712
 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=103818662
--- a/README.md
+++ b/README.md
@@ -1,2 +1,12 @@
-# BulkBooks
+# Bulk Books:
 This script's goal is to help you to download books from [Kotar](https://kotar.cet.ac.il/).
 ## How To?
 1. You need academic access to Kotar.
 1. You need to have Python >= 3.9.
 1. Download the requirements (preferably inside a venv).
 1. Add the links to the __BooksToDownload__ file.
 1. Run the script.
 Enjoy.
--- a/bulks.py
+++ b/bulks.py
@@ -1,194 +1,191 @@
-import selenium
+import selenium
-from selenium.webdriver.common import action_chains
+from selenium.webdriver.common import action_chains
-import urllib3
+import urllib3
-import bs4
+import bs4
-import re
+import re
-import os
+import os
-import glob
+import glob
-from selenium.common import exceptions
+from selenium.common import exceptions
-from selenium import webdriver
+from selenium import webdriver
-import img2pdf
+import img2pdf
-import threading
+import threading
-from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.action_chains import ActionChains
-from selenium.webdriver.common.actions import interaction
+from selenium.webdriver.common.actions import interaction
-from selenium.webdriver.common import keys
+from selenium.webdriver.common import keys
-# from parser import ArgumentParser
+# from parser import ArgumentParser
-
+
-ACTS = []
+ACTS = []
-LAST_ACTS = []
+LAST_ACTS = []
-SOURCES = []
+SOURCES = []
-Books = []
+Books = []
-THREADS = []
+THREADS = []
-PATHS = []
+PATHS = []
-OLD_REMOVE = []
+OLD_REMOVE = []
-
+BROWSER_PREFENCES = {"browser.download.folderList": 2, "browser.download.manager.showWhenStarting": False, "browser.download.dir": "ignore", "browser.helperApps.neverAsk.saveToDisk": "attachment/csv, text/plain, application/octet-stream, application/binary, text/csv, application/csv, application/excel, text/comma-separated-values, text/xml, application/xml, application/xls, excel/xls, application/excel 97-2003,application/Microsoft Excel 97-2003 Worksheet, application/vnd.ms-excel", "browser.helperApps.neverAsk.openFile":
-
+                     "application/PDF, application/FDF, application/XFDF, application/LSL, application/LSO, application/LSS, application/IQY, application/RQY, application/XLK, application/XLS, application/XLT, application/POT application/PPS, application/PPT, application/DOS, application/DOT, application/WKS, application/BAT, application/PS, application/EPS, application/WCH, application/WCM, application/WB1, application/WB3, application/RTF, application/DOC, application/MDB, application/MDE, application/WBK, application/WB1, application/WCH, application/WCM, application/AD, application/ADP, application/vnd.ms-excel", "browser.download.panel.shown": False}
-def remove_text(text: str):
+
-    '''remove_text removes the url from the text file
+
-
+def remove_text(text: str):
-    :param text: url to remove
+    '''remove_text removes the url from the text file
-    :type text: str
+
-    '''
+    :param text: url to remove
-    with open("BooksToDownload", "r", encoding="utf_8") as file:
+    :type text: str
-        data_text = file.readlines()
+    '''
-    if text in data_text:
+    with open("BooksToDownload", "r", encoding="utf_8") as file:
-        data_text.pop(data_text.index(text))
+        data_text = file.readlines()
-        with open("BooksToDownload", 'w', encoding='utf_8') as file:
+    if text in data_text:
-            file.writelines(data_text)
+        data_text.pop(data_text.index(text))
-    else:
+        with open("BooksToDownload", 'w', encoding='utf_8') as file:
-        remove_text(f"{text}\n")
+            file.writelines(data_text)
-
+    else:
-
+        remove_text(f"{text}\n")
-def set_folder_name(html: bs4.BeautifulSoup):
+
-    name = html.find("title").text if html.text else " None"
+
-    return name[name.find(" - ")+3:].replace('"', "''").replace("\\", '||').replace(r':', r'׃').replace(r"/", r"|").replace("\n", "").replace('?', ';;')
+def set_folder_name(html: bs4.BeautifulSoup):
-
+    name = html.find("title").text if html.text else " None"
-
+    return name[name.find(" - ")+3:].replace('"', "''").replace("\\", '||').replace(r':', r'׃').replace(r"/", r"|").replace("\n", "").replace('?', ';;')
-def down_to_list(url: str):
+
-    data = urllib3.PoolManager().request("GET", url)
+
-    return data.data if data.data else None
+def down_to_list(url: str):
-
+    data = urllib3.PoolManager().request("GET", url)
-
+    return data.data if data.data else None
-def _split_styler(style: str):
+
-    begin = style.find('"')+1
+
-    end = style.rfind('"')
+def _split_styler(style: str):
-    return style[begin:end]
+    begin = style.find('"')+1
-
+    end = style.rfind('"')
-
+    return style[begin:end]
-def update_SOURCES(index: int):
+
-    global SOURCES, ACTS
+
-    keys_html = bs4.BeautifulSoup(
+def update_SOURCES(index: int):
-        ACTS[index]._driver.page_source, "html.parser").find_all(
+    global SOURCES, ACTS
-        "div", attrs={"class": "BV_oImage"})
+    keys_html = bs4.BeautifulSoup(
-    dic_update = {key.attrs["id"]: _split_styler(key.attrs["style"])
+        ACTS[index]._driver.page_source, "html.parser").find_all(
-                  for key in keys_html if "http" in key.attrs["style"]}
+        "div", attrs={"class": "BV_oImage"})
-    SOURCES[index].update(dic_update)
+    dic_update = {key.attrs["id"]: _split_styler(key.attrs["style"])
-
+                  for key in keys_html if "http" in key.attrs["style"]}
-
+    SOURCES[index].update(dic_update)
-def do_action_now(index: int):
+
-    global SOURCES
+
-    global ACTS
+def do_action_now(index: int):
-    ACTS[index].perform()
+    global SOURCES
-    name = set_folder_name(bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser"))
+    global ACTS
-    update_SOURCES(index)
+    ACTS[index].perform()
-    if not os.path.exists("ignore/"+f"{name}"):
+    name = set_folder_name(bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser"))
-        os.mkdir("ignore/"+f"{name}")
+    update_SOURCES(index)
-    files = list(SOURCES[index])
+    if not os.path.exists("ignore/"+f"{name}"):
-    for s in SOURCES[index]:
+        os.mkdir("ignore/"+f"{name}")
-        if SOURCES[index][s]:
+    files = list(SOURCES[index])
-            files[files.index(s)] = urllib3.PoolManager().request("GET", SOURCES[index][s]).data
+    for s in SOURCES[index]:
-    return files
+        if SOURCES[index][s]:
-# def check_and_act(index: int,last):
+            if not os.path.exists(f"ignore/{name}/{files.index(s):04}.jpg"):
-
+                with open(f"ignore/{name}/{files.index(s):04}.jpg", "wb") as F:
-
+                    F.write(urllib3.PoolManager().request("GET", SOURCES[index][s]).data)
-def get_first_empty(index: int):
+            # files[files.index(s)] = urllib3.PoolManager().request("GET", SOURCES[index][s]).data
-    global SOURCES
+    return files
-    for s in SOURCES[index]:
+# def check_and_act(index: int,last):
-        if not SOURCES[index][s]:
+
-            return s
+
-    return None
+def get_first_empty(index: int):
-
+    global SOURCES
-
+    for s in SOURCES[index]:
-def act_now(index: int, path: str = None):
+        if not SOURCES[index][s]:
-    global SOURCES
+            return s
-    global ACTS
+    return None
-    global couters
+
-    global TREADS
+
-    global OLD_REMOVE
+def act_now(index: int, path: str = None):
-    global LAST_ACTS
+    global SOURCES
-    global treads
+    global ACTS
-    s = 0
+    global couters
-    name = set_folder_name(bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser"))
+    global TREADS
-    save_first = ""
+    global OLD_REMOVE
-    last = list(SOURCES[index].keys())[-1]
+    global LAST_ACTS
-    while "" in SOURCES[index].values():
+    global treads
-        if s == 0:
+    s = 0
-            LAST_ACTS[index].perform()
+    name = set_folder_name(bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser"))
-            s = 1
+    save_first = ""
-        save_first = get_first_empty(index)
+    last = list(SOURCES[index].keys())[-1]
-        url_now = ACTS[index]._driver.current_url
+    while "" in SOURCES[index].values():
-        url_now = url_now[:url_now.find("#")+1] + save_first
+        if s == 0:
-        if SOURCES[index][last] and "" in SOURCES[index].values():
+            LAST_ACTS[index].perform()
-            ACTS[index]._driver.get(url_now)
+            s = 1
-            pages = do_action_now(index)
+        save_first = get_first_empty(index)
-            SOURCES[index][last] = ""
+        url_now = ACTS[index]._driver.current_url
-        else:
+        url_now = url_now[:url_now.find("#")+1] + save_first
-            pages = do_action_now(index)
+        if SOURCES[index][last] and "" in SOURCES[index].values():
-    if SOURCES[index] and "" not in SOURCES[index].values():
+            ACTS[index]._driver.get(url_now)
-        couters += 1
+            do_action_now(index)
-        pathus = f'{path}/{name}.pdf' if path else f"ignore/{name}/{name}.pdf"
+            SOURCES[index][last] = ""
-        with open(pathus, "wb") as file:
+        else:
-            file.write(img2pdf.convert(pages))
+            do_action_now(index)
-            ACTS[index]._driver.quit()
+    if SOURCES[index] and "" not in SOURCES[index].values():
-            remove_text(OLD_REMOVE[index])
+        couters += 1
-            treads -= 1
+        pathus = f'{path}/{name}.pdf' if path else f"ignore/{name}/{name}.pdf"
-
+        with open(pathus, "wb") as file:
-
+            file.write(img2pdf.convert(glob.glob(f"ignore/{name}/*.jpg")))
-def open_firefox(url: str):
+            ACTS[index]._driver.quit()
-    '''open_firefox opens the firefox browser on the specific url, and sets all the settings for the specific session
+            remove_text(OLD_REMOVE[index])
-
+            treads -= 1
-    :param url: url to run the firefox on
+
-    :type url: str
+
-    '''
+def open_firefox(url: str):
-    web = give_me_web()
+    '''open_firefox opens the firefox browser on the specific url, and sets all the settings for the specific session
-    global SOURCES
+
-    global ACTS
+    :param url: url to run the firefox on
-    global LAST_ACTS
+    :type url: str
-    if not url.startswith("#") and url:
+    '''
-        book = webdriver.Firefox(web[0], executable_path=web[1], options=web[2])
+    web = give_me_web()
-        url = url if url.endswith("#1.undefined.8.none") else f'{url}#1.undefined.8.none'
+    global SOURCES
-        book.get(url)
+    global ACTS
-        act = action_chains.ActionChains(book)
+    global LAST_ACTS
-        lst_act = action_chains.ActionChains(book)
+    if not url.startswith("#") and url:
-        lst_act._actions = [lst_act.key_down(keys.Keys.END), lst_act.pause(3), lst_act.key_up(keys.Keys.END)]
+        book = webdriver.Firefox(web[0], executable_path=web[1], options=web[2])
-        act._actions = [act.send_keys(keys.Keys.PAGE_DOWN)]
+        url = url if url.endswith("#1.undefined.8.none") else f'{url}#1.undefined.8.none'
-        LAST_ACTS.append(lst_act)
+        book.get(url)
-        ACTS.append(act)
+        act = action_chains.ActionChains(book)
-        SOURCES.append({})
+        lst_act = action_chains.ActionChains(book)
-
+        lst_act._actions = [lst_act.key_down(keys.Keys.END), lst_act.pause(3), lst_act.key_up(keys.Keys.END)]
-
+        act._actions = [act.send_keys(keys.Keys.PAGE_DOWN)]
-def give_me_web():
+        LAST_ACTS.append(lst_act)
-    options = webdriver.FirefoxOptions()
+        ACTS.append(act)
-    fp = webdriver.FirefoxProfile()
+        SOURCES.append({})
-    fp.set_preference("browser.download.folderList", 2)
+
-    fp.set_preference("browser.download.manager.showWhenStarting", False)
+
-    fp.set_preference("browser.download.dir", "ignore")
+def give_me_web():
-    fp.set_preference(
+    options = webdriver.FirefoxOptions()
-        "browser.helperApps.neverAsk.saveToDisk",
+    fp = webdriver.FirefoxProfile()
-        "attachment/csv, text/plain, application/octet-stream, application/binary, text/csv, application/csv, application/excel, text/comma-separated-values, text/xml, application/xml, application/xls, excel/xls, application/excel 97-2003,application/Microsoft Excel 97-2003 Worksheet, application/vnd.ms-excel")
+    for key,val in BROWSER_PREFENCES:
-    fp.set_preference(
+        fp.set_preference(key, val)
-        "browser.helperApps.neverAsk.openFile",
+    options.add_argument('--lang=EN')
-        "application/PDF, application/FDF, application/XFDF, application/LSL, application/LSO, application/LSS, application/IQY, application/RQY, application/XLK, application/XLS, application/XLT, application/POT application/PPS, application/PPT, application/DOS, application/DOT, application/WKS, application/BAT, application/PS, application/EPS, application/WCH, application/WCM, application/WB1, application/WB3, application/RTF, application/DOC, application/MDB, application/MDE, application/WBK, application/WB1, application/WCH, application/WCM, application/AD, application/ADP, application/vnd.ms-excel")
+    options.headless = True
-    fp.set_preference("browser.download.panel.shown", False)
+    fire = "geckodriver"
-    options.add_argument('--lang=EN')
+    return (fp, fire, options)
-    fire = "geckodriver"
+
-    return (fp, fire, options)
+if __name__ == "__main__":
-
+    with open("BooksToDownload", "r", encoding="utf_8") as file:
-
+        books = file.read().split("\n")
-with open("BooksToDownload", "r", encoding="utf_8") as file:
+    for b in books:
-    books = file.read().split("\n")
+        if b.find('````') > -1 and not b.startswith("#"):
-
+            OLD_REMOVE.append(b)
-for b in books:
+            PATHS.append(b[b.rfind('`')+1:])
-    if b.find('````') > -1 and not b.startswith("#"):
+            b = b[:b.find('`')]
-        OLD_REMOVE.append(b)
+        elif not b.startswith("#"):
-        PATHS.append(b[b.rfind('`')+1:])
+            OLD_REMOVE.append(b)
-        b = b[:b.find('`')]
+            PATHS.append(None)
-    elif not b.startswith("#"):
+        t1 = threading.Thread(None, open_firefox, args=(b,))
-        OLD_REMOVE.append(b)
+        t1.start()
-        PATHS.append(None)
+        THREADS.append(t1)
-    t1 = threading.Thread(None, open_firefox, args=(b,))
+    for t in THREADS:
-    t1.start()
+        t.join()
-    THREADS.append(t1)
+    lasts = []
-for t in THREADS:
+    for i in range(len(ACTS)):
-    t.join()
+        SOURCES[i].update({key.attrs["id"]: ""
-lasts = []
+                        for key in bs4.BeautifulSoup(ACTS[i]._driver.page_source, "html.parser").find_all(
-for i in range(len(ACTS)):
+            "div", attrs={"class": "BV_oImage"})})
-    SOURCES[i].update({key.attrs["id"]: ""
+        lasts.append(list(SOURCES[i].keys())[-1])
-                       for key in bs4.BeautifulSoup(ACTS[i]._driver.page_source, "html.parser").find_all(
+    couters = 0
-        "div", attrs={"class": "BV_oImage"})})
+    treads = len(ACTS)-1
-    lasts.append(list(SOURCES[i].keys())[-1])
+    for i in range(len(ACTS)):
-couters = 0
+        T = threading.Thread(None, act_now, args=(i, PATHS[i]))
-treads = len(ACTS)-1
+        T.start()
 for i in range(len(ACTS)):
    T = threading.Thread(None, act_now, args=(i, PATHS[i]))
    T.start()
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,25 @@
 async-generator==1.10
 attrs==23.1.0
 beautifulsoup4==4.12.2
 bs4==0.0.1
 certifi==2022.12.7
 chromedriver-autoinstaller==0.4.0
 deprecation==2.1.0
 exceptiongroup==1.1.1
 h11==0.14.0
 idna==3.4
 img2pdf==0.4.4
 lxml==4.9.2
 outcome==1.2.0
 packaging==23.1
 pikepdf==7.2.0
 Pillow==9.5.0
 PySocks==1.7.1
 selenium==4.9.0
 sniffio==1.3.0
 sortedcontainers==2.4.0
 soupsieve==2.4.1
 trio==0.22.0
 trio-websocket==0.10.2
 urllib3==1.26.15
 wsproto==1.2.0
Author	SHA1	Message	Date
1kamma	520303cd03	Merge branch 'master' of https://git.saret.tk/saret/BulkBooks	2023-05-02 22:33:40 +03:00
1kamma	14bd12ee8f	books to download	2023-05-02 22:33:31 +03:00
Benny Saret	6e0446af3f	Merge pull request 'master' (#3 ) from Mooooooooo/BulkBooks:master into master Reviewed-on: https://git.saret.tk/saret/BulkBooks/pulls/3	2023-05-02 14:47:44 +03:00
saret	18f4876291	Merge branch 'master' of https://git.saret.tk/Mooooooooo/BulkBooks	2023-05-02 14:46:22 +03:00
Benny Saret	8163dd903d	Modify The Readme Added explanations of how to use the script.	2023-05-01 23:08:46 +03:00
1kamma	6d19d28f6e	prettified the code	2023-05-01 22:56:39 +03:00
work	d73c5c579f	Merge branch 'master' of https://git.saret.tk/Mooooooooo/BulkBooks	2023-04-25 12:54:33 +03:00
work	4eefe79bab	added requirements	2023-04-25 12:52:57 +03:00
Benny Saret	864ab7ad1e	Merge pull request 'master' (#1 ) from saret/BulkBooks:master into master Reviewed-on: https://git.saret.tk/Mooooooooo/BulkBooks/pulls/1	2023-04-25 12:44:39 +03:00
Benny Saret	3055dc05f9	typo	2023-04-25 12:28:23 +03:00
Benny Saret	29f6324bc2	Merge pull request 'master' (#2 ) from Mooooooooo/BulkBooks:master into master Reviewed-on: https://git.saret.tk/saret/BulkBooks/pulls/2	2023-04-25 12:22:46 +03:00
work	7fc8519e4a	path location	2023-04-25 12:20:06 +03:00
work	8cbdcee256	headless	2023-04-25 12:17:03 +03:00
benny	d9dfaaa68c	preformence improvement	2023-04-25 12:11:12 +03:00