Compare commits

..

14 Commits

5 changed files with 262 additions and 197 deletions

3
.gitignore vendored
View File

@@ -1,2 +1,3 @@
.vscode/ .vscode/
venv/ venv/
ignore/

View File

@@ -1 +1,33 @@
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=105179261 https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110994706
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110918661
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110930833
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110933810
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110938002
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110938002
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110942621
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110942621
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110948032
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110948032
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=110959329
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=105256337
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=105018830
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=109444642
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=109400325
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=109392561
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=107884166
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=97645077
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=97645077
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=102594097
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=102591827
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=102588217
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=102589202
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=101052334
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=101048613
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=101986400
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=100976710
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=100974786
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=108426718
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=108236946
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=106246523
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=104502712
https://kotar.cet.ac.il/KotarApp/Viewer.aspx?nBookID=103818662

View File

@@ -1,2 +1,12 @@
# BulkBooks # Bulk Books:
This script helps you download books from [Kotar](https://kotar.cet.ac.il/).
## How To?
1. You need academic access to Kotar.
1. You need Python 3.9 or later.
1. Install the requirements with `pip install -r requirements.txt` (preferably inside a virtual environment).
1. Add the links to the __BooksToDownload__ file.
1. Run the script.
Enjoy.

View File

@@ -1,194 +1,191 @@
import selenium import selenium
from selenium.webdriver.common import action_chains from selenium.webdriver.common import action_chains
import urllib3 import urllib3
import bs4 import bs4
import re import re
import os import os
import glob import glob
from selenium.common import exceptions from selenium.common import exceptions
from selenium import webdriver from selenium import webdriver
import img2pdf import img2pdf
import threading import threading
from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.actions import interaction from selenium.webdriver.common.actions import interaction
from selenium.webdriver.common import keys from selenium.webdriver.common import keys
# from parser import ArgumentParser # from parser import ArgumentParser
# Shared module-level state. The lists below are filled by open_firefox and by
# the start-up script and are indexed by book position.
ACTS = []        # one PAGE_DOWN ActionChains per opened book
LAST_ACTS = []   # one END-key ActionChains per opened book
SOURCES = []     # one {page id: image url or ""} dict per opened book
Books = []
THREADS = []     # browser start-up threads, joined before downloading begins
PATHS = []       # optional output directory per book (None when not given)
OLD_REMOVE = []  # original BooksToDownload line per book, removed once done

# Firefox profile preferences applied to every browser session.
BROWSER_PREFENCES = {
    "browser.download.folderList": 2,
    "browser.download.manager.showWhenStarting": False,
    "browser.download.dir": "ignore",
    "browser.helperApps.neverAsk.saveToDisk": "attachment/csv, text/plain, application/octet-stream, application/binary, text/csv, application/csv, application/excel, text/comma-separated-values, text/xml, application/xml, application/xls, excel/xls, application/excel 97-2003,application/Microsoft Excel 97-2003 Worksheet, application/vnd.ms-excel",
    "browser.helperApps.neverAsk.openFile": "application/PDF, application/FDF, application/XFDF, application/LSL, application/LSO, application/LSS, application/IQY, application/RQY, application/XLK, application/XLS, application/XLT, application/POT application/PPS, application/PPT, application/DOS, application/DOT, application/WKS, application/BAT, application/PS, application/EPS, application/WCH, application/WCM, application/WB1, application/WB3, application/RTF, application/DOC, application/MDB, application/MDE, application/WBK, application/WB1, application/WCH, application/WCM, application/AD, application/ADP, application/vnd.ms-excel",
    "browser.download.panel.shown": False,
}
def remove_text(text: str):
    '''remove_text removes the url from the text file

    Only the first line equal to ``text`` (ignoring the trailing newline) is
    dropped. When no line matches, the file is left untouched.

    :param text: url to remove
    :type text: str
    '''
    wanted = text.rstrip("\n")
    with open("BooksToDownload", "r", encoding="utf_8") as file:
        data_text = file.readlines()
    for position, line in enumerate(data_text):
        # readlines() keeps the newline on every line except possibly the
        # last one, so compare without it.
        if line.rstrip("\n") == wanted:
            del data_text[position]
            break
    else:
        # Nothing to remove. Retrying with an extra newline appended, as the
        # previous implementation did, never terminates in this case.
        return
    with open("BooksToDownload", 'w', encoding='utf_8') as file:
        file.writelines(data_text)
def set_folder_name(html: bs4.BeautifulSoup):
    '''set_folder_name builds a folder name from the page title

    The part of the title after the first " - " is kept; a title without that
    separator is kept whole. Characters that are awkward in file names are
    then replaced.

    :param html: parsed page of the book viewer
    :type html: bs4.BeautifulSoup
    :return: the name to use for the book folder, "None" when the page has
        no text or no title
    :rtype: str
    '''
    title_tag = html.find("title") if html.text else None
    name = title_tag.text if title_tag is not None else "None"
    # Slicing at find(" - ") + 3 would start at index 2 when the separator is
    # absent and silently drop the first two characters of the title.
    _, separator, remainder = name.partition(" - ")
    if separator:
        name = remainder
    replacements = {
        '"': "''",
        "\\": "||",
        ":": "׃",
        "/": "|",
        "\n": "",
        "?": ";;",
    }
    return name.translate(str.maketrans(replacements))
def down_to_list(url: str):
    '''down_to_list fetches the given url with a GET request

    :param url: address to download
    :type url: str
    :return: the response body, or None when the body is empty
    '''
    response = urllib3.PoolManager().request("GET", url)
    return response.data or None
def _split_styler(style: str):
begin = style.find('"')+1
end = style.rfind('"') def _split_styler(style: str):
return style[begin:end] begin = style.find('"')+1
end = style.rfind('"')
return style[begin:end]
def update_SOURCES(index: int):
    '''update_SOURCES records the image urls currently present in the page

    Every div of class "BV_oImage" whose style contains "http" contributes
    one entry, keyed by the div id, to SOURCES[index].

    :param index: position of the book in ACTS and SOURCES
    :type index: int
    '''
    global SOURCES, ACTS
    page = bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser")
    resolved = {}
    for tile in page.find_all("div", attrs={"class": "BV_oImage"}):
        style = tile.attrs["style"]
        if "http" in style:
            resolved[tile.attrs["id"]] = _split_styler(style)
    SOURCES[index].update(resolved)
def do_action_now(index: int):
    '''do_action_now scrolls one page down and saves every image known so far

    Images are written to ignore/<book name>/<position>.jpg, where position
    is the place of the page id in SOURCES[index]. Files that already exist
    are not downloaded again.

    :param index: position of the book in ACTS and SOURCES
    :type index: int
    :return: the page ids of the book, in SOURCES[index] order
    :rtype: list
    '''
    ACTS[index].perform()
    page = bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser")
    name = set_folder_name(page)
    update_SOURCES(index)
    folder = f"ignore/{name}"
    # makedirs also creates the parent "ignore" directory, which is
    # git-ignored and therefore absent on a fresh checkout.
    os.makedirs(folder, exist_ok=True)
    files = list(SOURCES[index])
    pool = urllib3.PoolManager()
    for position, image_url in enumerate(SOURCES[index].values()):
        if not image_url:
            continue
        target = f"{folder}/{position:04}.jpg"
        if not os.path.exists(target):
            with open(target, "wb") as image_file:
                image_file.write(pool.request("GET", image_url).data)
    return files


def get_first_empty(index: int):
    '''get_first_empty finds the first page that has no image url yet

    :param index: position of the book in SOURCES
    :type index: int
    :return: the id of the first page with an empty url, None when all are set
    '''
    for page_id, image_url in SOURCES[index].items():
        if not image_url:
            return page_id
    return None


def act_now(index: int, path: str = None):
    '''act_now downloads all the pages of one book and writes them to a pdf

    The browser is driven until every entry of SOURCES[index] has a url, then
    the saved images are converted to <name>.pdf, the browser is closed and
    the book is removed from the BooksToDownload file.

    :param index: position of the book in ACTS, LAST_ACTS, SOURCES and OLD_REMOVE
    :type index: int
    :param path: directory for the pdf, defaults to the book's own folder
    :type path: str
    '''
    global couters
    global treads
    name = set_folder_name(bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser"))
    last = list(SOURCES[index].keys())[-1]
    jumped_to_end = False
    while "" in SOURCES[index].values():
        if not jumped_to_end:
            # The END key is sent only once, before the first pass.
            LAST_ACTS[index].perform()
            jumped_to_end = True
        save_first = get_first_empty(index)
        url_now = ACTS[index]._driver.current_url
        url_now = url_now[:url_now.find("#")+1] + save_first
        if SOURCES[index][last] and "" in SOURCES[index].values():
            # The last page is loaded but earlier ones are missing: reopen the
            # viewer at the first missing page and clear the last entry so the
            # loop keeps going.
            ACTS[index]._driver.get(url_now)
            do_action_now(index)
            SOURCES[index][last] = ""
        else:
            do_action_now(index)
    if SOURCES[index] and "" not in SOURCES[index].values():
        couters += 1
        pathus = f'{path}/{name}.pdf' if path else f"ignore/{name}/{name}.pdf"
        # glob returns files in arbitrary order; the zero-padded names sort
        # into page order.
        pages = sorted(glob.glob(f"ignore/{name}/*.jpg"))
        with open(pathus, "wb") as file:
            file.write(img2pdf.convert(pages))
        ACTS[index]._driver.quit()
        remove_text(OLD_REMOVE[index])
        treads -= 1


def open_firefox(url: str):
    '''open_firefox opens the firefox browser on the specific url, and sets all the settings for the specific session

    Empty urls and urls starting with "#" are ignored.

    :param url: url to run the firefox on
    :type url: str
    '''
    if not url or url.startswith("#"):
        return
    profile, driver_path, options = give_me_web()
    book = webdriver.Firefox(profile, executable_path=driver_path, options=options)
    url = url if url.endswith("#1.undefined.8.none") else f'{url}#1.undefined.8.none'
    book.get(url)
    act = action_chains.ActionChains(book)
    lst_act = action_chains.ActionChains(book)
    lst_act._actions = [lst_act.key_down(keys.Keys.END), lst_act.pause(3), lst_act.key_up(keys.Keys.END)]
    act._actions = [act.send_keys(keys.Keys.PAGE_DOWN)]
    # NOTE(review): this runs in one thread per book, so the append order
    # follows browser start-up time and may differ from the order of PATHS and
    # OLD_REMOVE, which follow the file - confirm and pass an explicit index
    # if so.
    LAST_ACTS.append(lst_act)
    ACTS.append(act)
    SOURCES.append({})


def give_me_web():
    '''give_me_web prepares the firefox settings shared by every session

    :return: the firefox profile, the geckodriver executable name and the
        firefox options
    :rtype: tuple
    '''
    options = webdriver.FirefoxOptions()
    fp = webdriver.FirefoxProfile()
    # Iterating the dict itself yields only the keys; items() gives the pairs.
    for key, val in BROWSER_PREFENCES.items():
        fp.set_preference(key, val)
    options.add_argument('--lang=EN')
    options.headless = True
    fire = "geckodriver"
    return (fp, fire, options)


if __name__ == "__main__":
    with open("BooksToDownload", "r", encoding="utf_8") as file:
        books = file.read().split("\n")
    for b in books:
        # Blank and commented lines open no browser, so they must not add
        # entries to OLD_REMOVE / PATHS either, or the indices drift.
        if not b or b.startswith("#"):
            continue
        if b.find('````') > -1:
            # "<url>````<path>": the text after the backticks is the output path.
            OLD_REMOVE.append(b)
            PATHS.append(b[b.rfind('`')+1:])
            b = b[:b.find('`')]
        else:
            OLD_REMOVE.append(b)
            PATHS.append(None)
        t1 = threading.Thread(None, open_firefox, args=(b,))
        t1.start()
        THREADS.append(t1)
    for t in THREADS:
        t.join()
    lasts = []
    for i, act in enumerate(ACTS):
        page = bs4.BeautifulSoup(act._driver.page_source, "html.parser")
        # Register every page id with an empty url; act_now fills them in.
        SOURCES[i].update({key.attrs["id"]: ""
                           for key in page.find_all("div", attrs={"class": "BV_oImage"})})
        lasts.append(list(SOURCES[i].keys())[-1])
    couters = 0
    treads = len(ACTS)-1
    for i in range(len(ACTS)):
        T = threading.Thread(None, act_now, args=(i, PATHS[i]))
        T.start()

25
requirements.txt Normal file
View File

@@ -0,0 +1,25 @@
async-generator==1.10
attrs==23.1.0
beautifulsoup4==4.12.2
bs4==0.0.1
certifi==2022.12.7
chromedriver-autoinstaller==0.4.0
deprecation==2.1.0
exceptiongroup==1.1.1
h11==0.14.0
idna==3.4
img2pdf==0.4.4
lxml==4.9.2
outcome==1.2.0
packaging==23.1
pikepdf==7.2.0
Pillow==9.5.0
PySocks==1.7.1
selenium==4.9.0
sniffio==1.3.0
sortedcontainers==2.4.0
soupsieve==2.4.1
trio==0.22.0
trio-websocket==0.10.2
urllib3==1.26.15
wsproto==1.2.0