Compare commits

...

10 Commits

4 changed files with 260 additions and 200 deletions

View File

@@ -1 +1,33 @@
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=105179261 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110994706
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110918661
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110930833
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110933810
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110938002
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110938002
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110942621
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110942621
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110948032
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110948032
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110959329
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=105256337
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=105018830
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=109444642
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=109400325
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=109392561
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=107884166
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=97645077
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=97645077
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=102594097
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=102591827
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=102588217
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=102589202
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=101052334
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=101048613
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=101986400
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=100976710
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=100974786
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=108426718
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=108236946
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=106246523
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=104502712
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=103818662

View File

@@ -1,2 +1,12 @@
# BulkBooks # Bulk Books:
This script helps you download books from [Kotar](https://kotar.cet.ac.il/).
## How To?
1. You need academic access to Kotar.
1. You need Python 3.9 or newer.
1. Install the requirements with `pip install -r requirements.txt` (preferably inside a venv).
1. Add the links to the __BooksToDownload__ file.
1. Run the script.
Enjoy.

View File

@@ -1,198 +1,191 @@
import glob
import os
import re
import threading

import bs4
import img2pdf
import selenium
import urllib3
from selenium import webdriver
from selenium.common import exceptions
from selenium.webdriver.common import action_chains
from selenium.webdriver.common import keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.actions import interaction
# from parser import ArgumentParser
# Shared module state. The lists below are filled at runtime and are indexed
# by the position of the opened browser.
ACTS = []        # ActionChains that send PAGE_DOWN to a book's browser
LAST_ACTS = []   # ActionChains that press END in a book's browser
SOURCES = []     # per book: dict mapping page element id -> image url ("" = not loaded yet)
Books = []
THREADS = []     # threads started to open the browsers
PATHS = []       # per book: output folder for the pdf, or None
OLD_REMOVE = []  # per book: the original line of the BooksToDownload file

# Firefox profile preferences applied to every browser session.
BROWSER_PREFENCES = {
    "browser.download.folderList": 2,
    "browser.download.manager.showWhenStarting": False,
    "browser.download.dir": "ignore",
    "browser.helperApps.neverAsk.saveToDisk": "attachment/csv, text/plain, application/octet-stream, application/binary, text/csv, application/csv, application/excel, text/comma-separated-values, text/xml, application/xml, application/xls, excel/xls, application/excel 97-2003,application/Microsoft Excel 97-2003 Worksheet, application/vnd.ms-excel",
    "browser.helperApps.neverAsk.openFile": "application/PDF, application/FDF, application/XFDF, application/LSL, application/LSO, application/LSS, application/IQY, application/RQY, application/XLK, application/XLS, application/XLT, application/POT application/PPS, application/PPT, application/DOS, application/DOT, application/WKS, application/BAT, application/PS, application/EPS, application/WCH, application/WCM, application/WB1, application/WB3, application/RTF, application/DOC, application/MDB, application/MDE, application/WBK, application/WB1, application/WCH, application/WCM, application/AD, application/ADP, application/vnd.ms-excel",
    "browser.download.panel.shown": False,
}
def remove_text(text: str, path: str = "BooksToDownload"):
    '''remove_text removes the url from the text file

    The line is matched either exactly as given or with a trailing newline
    appended; only the first matching line is removed. When no line matches
    the file is left untouched.

    :param text: url to remove
    :type text: str
    :param path: file holding the urls, one per line
    :type path: str
    '''
    with open(path, "r", encoding="utf_8") as file:
        data_text = file.readlines()
    # readlines() keeps the line terminator, so try the bare text first (a
    # last line without a newline) and then the newline-terminated form.
    for candidate in (text, f"{text}\n"):
        if candidate in data_text:
            data_text.remove(candidate)
            with open(path, 'w', encoding='utf_8') as file:
                file.writelines(data_text)
            return
# Characters that are replaced before the title is used as a folder name.
_FOLDER_NAME_TRANSLATION = str.maketrans({
    '"': "''",
    "\\": "||",
    ":": "׃",
    "/": "|",
    "\n": "",
    "?": ";;",
})


def set_folder_name(html: "bs4.BeautifulSoup"):
    '''set_folder_name builds the folder name of a book from the page title

    The part of the title after the first " - " is used (the whole title when
    there is no " - "), and the characters listed in
    _FOLDER_NAME_TRANSLATION are replaced.

    :param html: parsed page of the book
    :type html: bs4.BeautifulSoup
    :return: the folder name, or "None" when the page has no text or no title
    :rtype: str
    '''
    title = html.find("title") if html.text else None
    if title is None:
        return "None"
    name = title.text
    # partition() leaves the title intact when the separator is missing,
    # instead of slicing off its first characters.
    _, separator, tail = name.partition(" - ")
    if separator:
        name = tail
    return name.translate(_FOLDER_NAME_TRANSLATION)
def down_to_list(url: str):
    '''down_to_list downloads the url with a GET request

    :param url: address to download
    :type url: str
    :return: the response body, or None when the body is empty
    '''
    response = urllib3.PoolManager().request("GET", url)
    return response.data or None
def _split_styler(style: str):
begin = style.find('"')+1
end = style.rfind('"') def _split_styler(style: str):
return style[begin:end] begin = style.find('"')+1
end = style.rfind('"')
return style[begin:end]
def update_SOURCES(index: int):
    '''update_SOURCES records the image urls that are currently present in the page

    Every div of class "BV_oImage" whose style contains "http" is stored in
    SOURCES[index] as id -> quoted part of the style. Divs without an id or
    without such a style are ignored.

    :param index: position of the book in ACTS and SOURCES
    :type index: int
    '''
    page = bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser")
    loaded = {}
    for div in page.find_all("div", attrs={"class": "BV_oImage"}):
        style = div.attrs.get("style", "")
        if "id" in div.attrs and "http" in style:
            loaded[div.attrs["id"]] = _split_styler(style)
    SOURCES[index].update(loaded)
def do_action_now(index: int):
    '''do_action_now performs the book's action chain and saves the images found so far

    After the action the known image urls are refreshed and every url that
    has no file yet is downloaded to ignore/<folder name>/<position>.jpg,
    where position is the zero-padded position of the page id in
    SOURCES[index].

    :param index: position of the book in ACTS and SOURCES
    :type index: int
    :return: the page ids of the book, in SOURCES[index] order
    :rtype: list
    '''
    ACTS[index].perform()
    name = set_folder_name(bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser"))
    update_SOURCES(index)
    folder = f"ignore/{name}"
    os.makedirs(folder, exist_ok=True)
    files = list(SOURCES[index])
    http = urllib3.PoolManager()
    for position, source in enumerate(SOURCES[index].values()):
        if not source:
            continue
        target = f"{folder}/{position:04}.jpg"
        if os.path.exists(target):
            continue
        # Download before opening the file, so a failed request does not
        # leave an empty file that would later be taken for a finished page.
        data = http.request("GET", source).data
        with open(target, "wb") as image:
            image.write(data)
    return files
# def check_and_act(index: int,last):
def get_first_empty(index: int):
    '''get_first_empty finds the first page of the book that has no image url yet

    :param index: position of the book in SOURCES
    :type index: int
    :return: the id of the first page whose url is empty, or None when every page has one
    '''
    return next((key for key, value in SOURCES[index].items() if not value), None)
def act_now(index: int, path: str = None):
    '''act_now pages through one opened book until every page has an image, then builds the pdf

    While SOURCES[index] still holds an empty url the book is scrolled and
    the new images are saved with do_action_now. When all urls are known the
    saved jpg files are merged into one pdf, the browser is closed and the
    book's line is removed from the BooksToDownload file.

    :param index: position of the book in ACTS, LAST_ACTS, SOURCES and OLD_REMOVE
    :type index: int
    :param path: folder for the pdf; when empty the pdf is written to ignore/<folder name>/
    :type path: str
    '''
    global couters
    global treads
    # NOTE(review): the nesting below was reconstructed from a listing that
    # had lost its indentation -- confirm the loop body boundaries.
    pressed_end = False
    name = set_folder_name(bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser"))
    last = list(SOURCES[index].keys())[-1]
    while "" in SOURCES[index].values():
        if not pressed_end:
            LAST_ACTS[index].perform()
            pressed_end = True
        save_first = get_first_empty(index)
        base_url = ACTS[index]._driver.current_url.partition("#")[0]
        url_now = f"{base_url}#{save_first}"
        if SOURCES[index][last]:
            # The last page is loaded but earlier ones are not: go to the
            # first missing page and mark the last one as missing again so
            # the loop keeps going from there.
            ACTS[index]._driver.get(url_now)
            do_action_now(index)
            SOURCES[index][last] = ""
        else:
            do_action_now(index)
    if SOURCES[index] and "" not in SOURCES[index].values():
        couters += 1
        pathus = f'{path}/{name}.pdf' if path else f"ignore/{name}/{name}.pdf"
        # glob gives no ordering guarantee; the files are named by zero-padded
        # page position, so sorting restores the page order.
        pages = sorted(glob.glob(f"ignore/{name}/*.jpg"))
        with open(pathus, "wb") as file:
            file.write(img2pdf.convert(pages))
        ACTS[index]._driver.quit()
        remove_text(OLD_REMOVE[index])
    treads -= 1
# Serialises the registration of a new browser in ACTS/LAST_ACTS/SOURCES.
_BROWSER_REGISTRY_LOCK = threading.Lock()


def open_firefox(url: str):
    '''open_firefox opens the firefox browser on the specific url, and sets all the settings for the specific session

    Empty urls and urls starting with "#" are skipped. For every opened
    browser one entry is appended to LAST_ACTS, ACTS and SOURCES.

    :param url: url to run the firefox on
    :type url: str
    '''
    if not url or url.startswith("#"):
        return
    web = give_me_web()
    book = webdriver.Firefox(web[0], executable_path=web[1], options=web[2])
    url = url if url.endswith("#1.undefined.8.none") else f'{url}#1.undefined.8.none'
    try:
        book.get(url)
    except Exception:
        # Do not leave a browser behind when the page cannot be opened.
        book.quit()
        raise
    act = action_chains.ActionChains(book)
    lst_act = action_chains.ActionChains(book)
    lst_act._actions = [lst_act.key_down(keys.Keys.END), lst_act.pause(3), lst_act.key_up(keys.Keys.END)]
    act._actions = [act.send_keys(keys.Keys.PAGE_DOWN)]
    # Append all three entries as one step so that index i refers to the same
    # browser in every list even when several threads finish together.
    # NOTE(review): entries are appended in completion order, which may
    # differ from the order in which the callers started their threads.
    with _BROWSER_REGISTRY_LOCK:
        LAST_ACTS.append(lst_act)
        ACTS.append(act)
        SOURCES.append({})
def give_me_web():
    '''give_me_web prepares the settings of one firefox session

    :return: (profile with BROWSER_PREFENCES applied, driver executable name, headless options)
    :rtype: tuple
    '''
    options = webdriver.FirefoxOptions()
    fp = webdriver.FirefoxProfile()
    # .items() is required: iterating the dict itself yields only the keys.
    for key, val in BROWSER_PREFENCES.items():
        fp.set_preference(key, val)
    options.add_argument('--lang=EN')
    options.headless = True
    fire = "geckodriver"
    return (fp, fire, options)


if __name__ == "__main__":
    with open("BooksToDownload", "r", encoding="utf_8") as file:
        books = file.read().split("\n")
    for b in books:
        # Blank lines and "#" comments open no browser, so they must not get
        # PATHS/OLD_REMOVE entries either (that would shift the indices).
        if not b.strip() or b.startswith("#"):
            continue
        # A line may carry an output folder after four backticks.
        url, out_path = b, None
        if '````' in b:
            url, out_path = b[:b.find('`')], b[b.rfind('`')+1:]
        if not url:
            continue
        OLD_REMOVE.append(b)
        PATHS.append(out_path)
        t1 = threading.Thread(None, open_firefox, args=(url,))
        t1.start()
        THREADS.append(t1)
    for t in THREADS:
        t.join()
    # NOTE(review): PATHS/OLD_REMOVE follow the file order while ACTS follows
    # the order in which the browsers finished opening -- confirm they match.
    lasts = []
    for i in range(len(ACTS)):
        page = bs4.BeautifulSoup(ACTS[i]._driver.page_source, "html.parser")
        # Register every page of the book with an empty url.
        SOURCES[i].update({key.attrs["id"]: ""
                           for key in page.find_all("div", attrs={"class": "BV_oImage"})})
        lasts.append(list(SOURCES[i].keys())[-1])
    couters = 0
    treads = len(ACTS)-1
    for i in range(len(ACTS)):
        T = threading.Thread(None, act_now, args=(i, PATHS[i]))
        T.start()

25
requirements.txt Normal file
View File

@@ -0,0 +1,25 @@
async-generator==1.10
attrs==23.1.0
beautifulsoup4==4.12.2
bs4==0.0.1
certifi==2022.12.7
chromedriver-autoinstaller==0.4.0
deprecation==2.1.0
exceptiongroup==1.1.1
h11==0.14.0
idna==3.4
img2pdf==0.4.4
lxml==4.9.2
outcome==1.2.0
packaging==23.1
pikepdf==7.2.0
Pillow==9.5.0
PySocks==1.7.1
selenium==4.9.0
sniffio==1.3.0
sortedcontainers==2.4.0
soupsieve==2.4.1
trio==0.22.0
trio-websocket==0.10.2
urllib3==1.26.15
wsproto==1.2.0