# BulkBooks/download books in bulks.py
import selenium
from selenium.webdriver.common import action_chains
import urllib3
import bs4
import re
import os
import glob
from selenium.common import exceptions
from selenium import webdriver
import img2pdf
import threading
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.actions import interaction
from selenium.webdriver.common import keys
# from parser import ArgumentParser
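# Bulk-downloads the online book viewers listed in BooksToDownload: each URL is
# opened in its own headless Firefox session, the viewer is scrolled until every
# page image (div.BV_oImage) exposes its source URL, the images are saved under
# ignore/<book title>/ and finally merged into a single PDF with img2pdf.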
# Shared state, one entry per book/browser session.
ACTS = []         # ActionChains that page through each open book
LAST_ACTS = []    # ActionChains that jump to the end of each book
SOURCES = []      # per-book dict: page-element id -> image URL ("" while unknown)
Books = []
THREADS = []      # browser-opening threads
PATHS = []        # optional output folder for each book's PDF
OLD_REMOVE = []   # original lines to delete from BooksToDownload once a book is done
# Firefox preferences that silence download prompts for common document MIME types.
BROWSER_PREFERENCES = {
    "browser.download.folderList": 2,
    "browser.download.manager.showWhenStarting": False,
    "browser.download.dir": "ignore",
    "browser.helperApps.neverAsk.saveToDisk": "attachment/csv, text/plain, application/octet-stream, application/binary, text/csv, application/csv, application/excel, text/comma-separated-values, text/xml, application/xml, application/xls, excel/xls, application/excel 97-2003,application/Microsoft Excel 97-2003 Worksheet, application/vnd.ms-excel",
    "browser.helperApps.neverAsk.openFile": "application/PDF, application/FDF, application/XFDF, application/LSL, application/LSO, application/LSS, application/IQY, application/RQY, application/XLK, application/XLS, application/XLT, application/POT application/PPS, application/PPT, application/DOS, application/DOT, application/WKS, application/BAT, application/PS, application/EPS, application/WCH, application/WCM, application/WB1, application/WB3, application/RTF, application/DOC, application/MDB, application/MDE, application/WBK, application/WB1, application/WCH, application/WCM, application/AD, application/ADP, application/vnd.ms-excel",
    "browser.download.panel.shown": False,
}
def remove_text(text: str):
    '''remove_text removes the url from the text file
    :param text: url to remove
    :type text: str
    '''
    with open("BooksToDownload", "r", encoding="utf_8") as file:
        data_text = file.readlines()
    # The stored line may or may not carry a trailing newline, so try both
    # forms instead of recursing indefinitely.
    if text not in data_text:
        text = f"{text}\n"
    if text in data_text:
        data_text.remove(text)
        with open("BooksToDownload", "w", encoding="utf_8") as file:
            file.writelines(data_text)
def set_folder_name(html: bs4.BeautifulSoup):
    '''Derive a filesystem-safe folder name from the page title.'''
    title = html.find("title")
    name = title.text if title else " None"
    # Strip the site prefix and replace characters that are illegal in file names.
    return name[name.find(" - ") + 3:].replace('"', "''").replace("\\", '||').replace(':', '׃').replace("/", "|").replace("\n", "").replace('?', ';;')
def down_to_list(url: str):
    '''Fetch a URL and return the raw response body, or None if it is empty.'''
    data = urllib3.PoolManager().request("GET", url)
    return data.data if data.data else None


def _split_styler(style: str):
    '''Extract the quoted URL from an inline CSS style attribute.'''
    begin = style.find('"') + 1
    end = style.rfind('"')
    return style[begin:end]
def update_SOURCES(index: int):
    '''Scan the current page source and record the image URL of every loaded page element.'''
    global SOURCES, ACTS
    keys_html = bs4.BeautifulSoup(
        ACTS[index]._driver.page_source, "html.parser").find_all(
        "div", attrs={"class": "BV_oImage"})
    dic_update = {key.attrs["id"]: _split_styler(key.attrs["style"])
                  for key in keys_html if "http" in key.attrs["style"]}
    SOURCES[index].update(dic_update)
def do_action_now(index: int):
    '''Scroll the viewer once, rescan the page and save every newly exposed page image.'''
    global SOURCES
    global ACTS
    ACTS[index].perform()
    name = set_folder_name(bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser"))
    update_SOURCES(index)
    os.makedirs(f"ignore/{name}", exist_ok=True)
    files = list(SOURCES[index])
    for s in SOURCES[index]:
        if SOURCES[index][s]:
            if not os.path.exists(f"ignore/{name}/{files.index(s):04}.jpg"):
                with open(f"ignore/{name}/{files.index(s):04}.jpg", "wb") as F:
                    F.write(urllib3.PoolManager().request("GET", SOURCES[index][s]).data)
                    # files[files.index(s)] = urllib3.PoolManager().request("GET", SOURCES[index][s]).data
    return files
# def check_and_act(index: int,last):
def get_first_empty(index: int):
    '''Return the id of the first page whose image URL has not been captured yet.'''
    global SOURCES
    for s in SOURCES[index]:
        if not SOURCES[index][s]:
            return s
    return None
def act_now(index: int, path: str = None):
    '''Keep scrolling a book until every page image is captured, then build its PDF.'''
    global SOURCES
    global ACTS
    global counters
    global OLD_REMOVE
    global LAST_ACTS
    global active_threads
    s = 0
    name = set_folder_name(bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser"))
    save_first = ""
    last = list(SOURCES[index].keys())[-1]
    while "" in SOURCES[index].values():
        if s == 0:
            # Jump to the end of the book once so the viewer registers every page element.
            LAST_ACTS[index].perform()
            s = 1
        save_first = get_first_empty(index)
        url_now = ACTS[index]._driver.current_url
        url_now = url_now[:url_now.find("#") + 1] + save_first
        if SOURCES[index][last] and "" in SOURCES[index].values():
            # Navigate straight to the first missing page and rescan from there.
            ACTS[index]._driver.get(url_now)
            do_action_now(index)
            SOURCES[index][last] = ""
        else:
            do_action_now(index)
    if SOURCES[index] and "" not in SOURCES[index].values():
        counters += 1
        pdf_path = f'{path}/{name}.pdf' if path else f"ignore/{name}/{name}.pdf"
        with open(pdf_path, "wb") as file:
            # Sort the saved page images so the PDF keeps the original page order.
            file.write(img2pdf.convert(sorted(glob.glob(f"ignore/{name}/*.jpg"))))
        ACTS[index]._driver.quit()
        remove_text(OLD_REMOVE[index])
        active_threads -= 1
def open_firefox(url: str):
    '''open_firefox opens the firefox browser on the specific url, and sets all the settings for the specific session
    :param url: url to run the firefox on
    :type url: str
    '''
    web = give_me_web()
    global SOURCES
    global ACTS
    global LAST_ACTS
    if not url.startswith("#") and url:
        book = webdriver.Firefox(web[0], executable_path=web[1], options=web[2])
        url = url if url.endswith("#1.undefined.8.none") else f'{url}#1.undefined.8.none'
        book.get(url)
        # One chain pages through the book, the other jumps to its end.
        act = action_chains.ActionChains(book)
        lst_act = action_chains.ActionChains(book)
        lst_act.key_down(keys.Keys.END).pause(3).key_up(keys.Keys.END)
        act.send_keys(keys.Keys.PAGE_DOWN)
        LAST_ACTS.append(lst_act)
        ACTS.append(act)
        SOURCES.append({})
def give_me_web():
    '''Build the Firefox profile, driver path and options used for every session.'''
    options = webdriver.FirefoxOptions()
    fp = webdriver.FirefoxProfile()
    for key, val in BROWSER_PREFERENCES.items():
        fp.set_preference(key, val)
    options.add_argument('--lang=EN')
    options.headless = True
    fire = "geckodriver"
    return (fp, fire, options)
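# Note: FirefoxProfile, executable_path and options.headless above follow the
# Selenium 3.x API; Selenium 4 replaced executable_path with Service objects and
# deprecated FirefoxProfile, so this script assumes a Selenium 3 installation.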
if __name__ == "__main__":
    # Each non-comment line in BooksToDownload holds a book URL, optionally
    # followed by '````' and a target folder for the finished PDF.
    with open("BooksToDownload", "r", encoding="utf_8") as file:
        books = file.read().split("\n")
    for b in books:
        if b.find('````') > -1 and not b.startswith("#"):
            OLD_REMOVE.append(b)
            PATHS.append(b[b.rfind('`') + 1:])
            b = b[:b.find('`')]
        elif not b.startswith("#"):
            OLD_REMOVE.append(b)
            PATHS.append(None)
        # open_firefox ignores commented-out and empty lines on its own.
        t1 = threading.Thread(None, open_firefox, args=(b,))
        t1.start()
        THREADS.append(t1)
    for t in THREADS:
        t.join()
    lasts = []
    for i in range(len(ACTS)):
        # Pre-fill every page id with an empty URL so act_now knows which pages are still missing.
        SOURCES[i].update({key.attrs["id"]: ""
                           for key in bs4.BeautifulSoup(ACTS[i]._driver.page_source, "html.parser").find_all(
                               "div", attrs={"class": "BV_oImage"})})
        lasts.append(list(SOURCES[i].keys())[-1])
    counters = 0
    active_threads = len(ACTS) - 1
    for i in range(len(ACTS)):
        T = threading.Thread(None, act_now, args=(i, PATHS[i]))
        T.start()
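# Illustrative BooksToDownload layout (hypothetical URLs, not from the real file):
# https://example.com/viewer/book-one
# https://example.com/viewer/book-two````C:/Users/me/Books
# # lines starting with '#' are skipped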