# Book-reader scraper: downloads the page images of each online book listed
# in "BooksToDownload" and bundles every finished book into a single PDF.
# Third-party: browser automation
import selenium
from selenium.webdriver.common import action_chains
# Third-party: HTTP client used to download the page images
import urllib3
# Third-party: HTML parsing of the reader pages
import bs4
import re
import os
import glob
from selenium.common import exceptions
from selenium import webdriver
# Third-party: converts the downloaded jpg pages into one PDF
import img2pdf
import threading
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.actions import interaction
from selenium.webdriver.common import keys
# from parser import ArgumentParser
# Shared mutable state for the worker threads.  The lists are parallel:
# index i in ACTS / LAST_ACTS / SOURCES / PATHS / OLD_REMOVE all describe
# the same book / browser session.
ACTS = []        # per-book ActionChains that advances the reader one page
LAST_ACTS = []   # per-book ActionChains that jumps to the end of the book
SOURCES = []     # per-book dict: image-div id -> image URL ("" = unknown)
Books = []       # NOTE(review): appears unused in this file — confirm before removing
THREADS = []     # browser-opening threads (joined before downloading starts)
PATHS = []       # per-book output directory for the PDF, or None for default
OLD_REMOVE = []  # per-book original line from BooksToDownload (removed when done)

# Firefox preferences that disable every download/open prompt so files are
# saved silently.  NOTE(review): name is misspelled ("PREFENCES") but it is
# referenced elsewhere in this file, so it is kept as-is.
BROWSER_PREFENCES = {
    "browser.download.folderList": 2,
    "browser.download.manager.showWhenStarting": False,
    "browser.download.dir": "ignore",
    "browser.helperApps.neverAsk.saveToDisk": "attachment/csv, text/plain, application/octet-stream, application/binary, text/csv, application/csv, application/excel, text/comma-separated-values, text/xml, application/xml, application/xls, excel/xls, application/excel 97-2003,application/Microsoft Excel 97-2003 Worksheet, application/vnd.ms-excel",
    "browser.helperApps.neverAsk.openFile": "application/PDF, application/FDF, application/XFDF, application/LSL, application/LSO, application/LSS, application/IQY, application/RQY, application/XLK, application/XLS, application/XLT, application/POT application/PPS, application/PPT, application/DOS, application/DOT, application/WKS, application/BAT, application/PS, application/EPS, application/WCH, application/WCM, application/WB1, application/WB3, application/RTF, application/DOC, application/MDB, application/MDE, application/WBK, application/WB1, application/WCH, application/WCM, application/AD, application/ADP, application/vnd.ms-excel",
    "browser.download.panel.shown": False}
def remove_text(text: str):
    '''remove_text removes the url from the text file "BooksToDownload".

    The line may be stored with or without a trailing newline; both forms
    are tried.  If the url is not present at all, nothing happens.
    (The original recursed with an extra "\\n" appended on every retry,
    which was unbounded recursion for an absent url.)

    :param text: url to remove
    :type text: str
    '''
    with open("BooksToDownload", "r", encoding="utf_8") as file:
        data_text = file.readlines()
    # Try the exact text first, then the newline-terminated form.
    for candidate in (text, f"{text}\n"):
        if candidate in data_text:
            data_text.remove(candidate)
            with open("BooksToDownload", 'w', encoding='utf_8') as file:
                file.writelines(data_text)
            return
def set_folder_name(html: bs4.BeautifulSoup):
    '''Derive a filesystem-safe folder name from the page <title>.

    Takes the part of the title after the first " - " and replaces
    characters that are illegal or awkward in file names with look-alikes.
    Falls back to " None" when the page has no usable title.
    (The original guarded on ``html.text`` but still dereferenced
    ``html.find("title")``, which is ``None`` for title-less pages and
    raised AttributeError.)

    :param html: parsed page source of the reader
    '''
    title = html.find("title")
    name = title.text if title is not None and html.text else " None"
    return (name[name.find(" - ") + 3:]
            .replace('"', "''")
            .replace("\\", '||')
            .replace(r':', r'׃')
            .replace(r"/", r"|")
            .replace("\n", "")
            .replace('?', ';;'))
def down_to_list(url: str):
    '''Fetch *url* with a plain GET request and return the raw body.

    :param url: address to download
    :returns: the response bytes, or ``None`` when the body is empty
    '''
    response = urllib3.PoolManager().request("GET", url)
    body = response.data
    if body:
        return body
    return None
def _split_styler(style: str):
|
||
begin = style.find('"')+1
|
||
end = style.rfind('"')
|
||
return style[begin:end]
|
||
|
||
|
||
def update_SOURCES(index: int):
    '''Scan the current page source of session *index* for image divs and
    record the image URL of every div whose style already references one.

    :param index: position of the book in the parallel module lists
    '''
    global SOURCES, ACTS
    page = bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser")
    resolved = {}
    for div in page.find_all("div", attrs={"class": "BV_oImage"}):
        style = div.attrs["style"]
        # Only divs that have lazy-loaded their image carry an http url.
        if "http" in style:
            resolved[div.attrs["id"]] = _split_styler(style)
    SOURCES[index].update(resolved)
def do_action_now(index: int):
    '''Perform the queued page-advance action for session *index*, then
    download every image URL discovered so far into ``ignore/<book name>/``.

    Images are written as zero-padded ``NNNN.jpg`` named by the div's
    position, and are skipped when already on disk.
    (The original called ``files.index(s)`` on every iteration — O(n^2) —
    and built a fresh PoolManager per image; both are hoisted.)

    :param index: position of the book in the parallel module lists
    :returns: ordered list of image-div ids (one per page)
    '''
    global SOURCES
    global ACTS
    # Advance/scroll the reader so more page images get lazy-loaded.
    ACTS[index].perform()
    name = set_folder_name(bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser"))
    update_SOURCES(index)
    if not os.path.exists(f"ignore/{name}"):
        os.mkdir(f"ignore/{name}")
    files = list(SOURCES[index])
    http = urllib3.PoolManager()  # one pool reused for all downloads
    for position, key in enumerate(files):
        url = SOURCES[index][key]
        if url:
            target = f"ignore/{name}/{position:04}.jpg"
            if not os.path.exists(target):
                with open(target, "wb") as out:
                    out.write(http.request("GET", url).data)
    return files
def get_first_empty(index: int):
    '''Return the id of the first page whose image URL is still unknown
    (empty string), or ``None`` when every page has been resolved.

    :param index: position of the book in the parallel module lists
    '''
    global SOURCES
    return next(
        (page_id for page_id, url in SOURCES[index].items() if not url),
        None)
def act_now(index: int, path: str = None):
    '''Drive reader session *index* until every page image is on disk,
    then bundle the downloaded jpgs into a single PDF.

    Fixes vs. original: the bogus ``global TREADS`` declaration (that name
    is never defined anywhere; the real counter is ``treads``) is dropped,
    and the jpgs are sorted before conversion — ``glob.glob`` order is not
    guaranteed, and the zero-padded file names make lexical order equal
    page order.

    :param index: position of the book in the parallel module lists
    :param path: optional output directory for the PDF; defaults to the
                 book's own ``ignore/<name>/`` folder
    '''
    global SOURCES
    global ACTS
    global couters
    global OLD_REMOVE
    global LAST_ACTS
    global treads
    name = set_folder_name(bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser"))
    last = list(SOURCES[index].keys())[-1]
    jumped_to_end = False
    while "" in SOURCES[index].values():
        if not jumped_to_end:
            # Jump to the end of the book once so the last pages load too.
            LAST_ACTS[index].perform()
            jumped_to_end = True
        first_empty = get_first_empty(index)
        url_now = ACTS[index]._driver.current_url
        url_now = url_now[:url_now.find("#") + 1] + first_empty
        if SOURCES[index][last] and "" in SOURCES[index].values():
            # The last page resolved but earlier ones are still missing:
            # navigate straight to the first unresolved page.
            ACTS[index]._driver.get(url_now)
            do_action_now(index)
            # Reset the sentinel so this branch is re-evaluated next round.
            SOURCES[index][last] = ""
        else:
            do_action_now(index)
    if SOURCES[index] and "" not in SOURCES[index].values():
        couters += 1
        pathus = f'{path}/{name}.pdf' if path else f"ignore/{name}/{name}.pdf"
        with open(pathus, "wb") as file:
            file.write(img2pdf.convert(sorted(glob.glob(f"ignore/{name}/*.jpg"))))
        ACTS[index]._driver.quit()
        remove_text(OLD_REMOVE[index])
        treads -= 1
def open_firefox(url: str):
    '''open_firefox opens the firefox browser on the specific url, and sets all the settings for the specific session

    Comment lines (starting with "#") and empty urls are ignored.  For a
    real url it appends one entry each to LAST_ACTS, ACTS and SOURCES so
    the lists stay index-aligned.

    :param url: url to run the firefox on
    :type url: str
    '''
    web = give_me_web()
    global SOURCES
    global ACTS
    global LAST_ACTS
    if not url.startswith("#") and url:
        book = webdriver.Firefox(web[0], executable_path=web[1], options=web[2])
        # Ensure the reader fragment is present so the viewer opens on page 1.
        url = url if url.endswith("#1.undefined.8.none") else f'{url}#1.undefined.8.none'
        book.get(url)
        act = action_chains.ActionChains(book)
        lst_act = action_chains.ActionChains(book)
        # NOTE(review): key_down/pause/key_up/send_keys each return the chain
        # itself, so these lists hold references to the chain object rather
        # than individual actions — the queued actions were already appended
        # by the calls themselves.  Looks fragile; confirm against the
        # selenium ActionChains implementation before touching.
        lst_act._actions = [lst_act.key_down(keys.Keys.END), lst_act.pause(3), lst_act.key_up(keys.Keys.END)]
        act._actions = [act.send_keys(keys.Keys.PAGE_DOWN)]
        LAST_ACTS.append(lst_act)
        ACTS.append(act)
        SOURCES.append({})
def give_me_web():
    '''Build the pieces needed to launch a silent, headless Firefox.

    :returns: tuple of (FirefoxProfile with download prompts disabled,
              geckodriver executable name, headless FirefoxOptions)
    '''
    options = webdriver.FirefoxOptions()
    fp = webdriver.FirefoxProfile()
    # BUG FIX: iterating the dict directly yields only the keys, and the
    # (key, val) unpacking of a string raised ValueError; .items() is needed.
    for key, val in BROWSER_PREFENCES.items():
        fp.set_preference(key, val)
    options.add_argument('--lang=EN')
    options.headless = True
    fire = "geckodriver"
    return (fp, fire, options)
if __name__ == "__main__":
    with open("BooksToDownload", "r", encoding="utf_8") as file:
        books = file.read().split("\n")
    for b in books:
        # Skip comments and blank lines entirely.  (The original appended a
        # PATHS/OLD_REMOVE entry for blank lines even though open_firefox
        # ignores them, desynchronizing PATHS[i] from ACTS[i], and it also
        # spawned no-op threads for comment lines.)
        if not b or b.startswith("#"):
            continue
        if b.find('````') > -1:
            # "url````/output/dir" form: PDF goes to a custom folder.
            OLD_REMOVE.append(b)
            PATHS.append(b[b.rfind('`') + 1:])
            b = b[:b.find('`')]
        else:
            OLD_REMOVE.append(b)
            PATHS.append(None)
        t1 = threading.Thread(None, open_firefox, args=(b,))
        t1.start()
        THREADS.append(t1)
    # Wait for every browser session to be fully opened before scraping.
    for t in THREADS:
        t.join()
    lasts = []
    for i in range(len(ACTS)):
        # Seed SOURCES[i] with every page-image div id, URL still unknown ("").
        SOURCES[i].update({key.attrs["id"]: ""
                           for key in bs4.BeautifulSoup(ACTS[i]._driver.page_source, "html.parser").find_all(
                               "div", attrs={"class": "BV_oImage"})})
        lasts.append(list(SOURCES[i].keys())[-1])
    couters = 0              # finished-book counter, incremented by act_now
    treads = len(ACTS) - 1   # remaining-book counter, decremented by act_now
    for i in range(len(ACTS)):
        T = threading.Thread(None, act_now, args=(i, PATHS[i]))
        T.start()