work
2023-04-25 12:54:33 +03:00


@@ -1,198 +1,198 @@
import selenium
from selenium.webdriver.common import action_chains
import urllib3
import bs4
import re
import os
import glob
from selenium.common import exceptions
from selenium import webdriver
import img2pdf
import threading
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.actions import interaction
from selenium.webdriver.common import keys
# from parser import ArgumentParser
ACTS = []
LAST_ACTS = []
SOURCES = []
Books = []
THREADS = []
PATHS = []
OLD_REMOVE = []
def remove_text(text: str):
    '''remove_text removes the url from the text file
    :param text: url to remove
    :type text: str
    '''
    with open("BooksToDownload", "r", encoding="utf_8") as file:
        data_text = file.readlines()
    if text in data_text:
        data_text.pop(data_text.index(text))
        with open("BooksToDownload", 'w', encoding='utf_8') as file:
            file.writelines(data_text)
    else:
        # lines read with readlines() keep their trailing newline, so retry with one appended
        remove_text(f"{text}\n")
def set_folder_name(html: bs4.BeautifulSoup):
    # take the part of <title> after " - " and replace characters that are not valid in folder names
    name = html.find("title").text if html.text else " None"
    return name[name.find(" - ")+3:].replace('"', "''").replace("\\", '||').replace(r':', r'׃').replace(r"/", r"|").replace("\n", "").replace('?', ';;')
def down_to_list(url: str):
    data = urllib3.PoolManager().request("GET", url)
    return data.data if data.data else None
def _split_styler(style: str):
    # return the text between the first and last double quote, i.e. the image url inside the style attribute
    begin = style.find('"')+1
    end = style.rfind('"')
    return style[begin:end]
def update_SOURCES(index: int):
    global SOURCES, ACTS
    # collect every page <div class="BV_oImage"> whose style already carries an image url
    keys_html = bs4.BeautifulSoup(
        ACTS[index]._driver.page_source, "html.parser").find_all(
        "div", attrs={"class": "BV_oImage"})
    dic_update = {key.attrs["id"]: _split_styler(key.attrs["style"])
                  for key in keys_html if "http" in key.attrs["style"]}
    SOURCES[index].update(dic_update)
def do_action_now(index: int):
    global SOURCES
    global ACTS
    # scroll the viewer one page down, then download every image url that is now known
    ACTS[index].perform()
    name = set_folder_name(bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser"))
    update_SOURCES(index)
    if not os.path.exists("ignore/"+f"{name}"):
        os.mkdir("ignore/"+f"{name}")
    files = list(SOURCES[index])
    for s in SOURCES[index]:
        if SOURCES[index][s]:
            if not os.path.exists(f"ignore/{name}/{files.index(s):04}.jpg"):
                with open(f"ignore/{name}/{files.index(s):04}.jpg", "wb") as F:
                    F.write(urllib3.PoolManager().request("GET", SOURCES[index][s]).data)
                # files[files.index(s)] = urllib3.PoolManager().request("GET", SOURCES[index][s]).data
    return files
# def check_and_act(index: int,last):
def get_first_empty(index: int):
    global SOURCES
    for s in SOURCES[index]:
        if not SOURCES[index][s]:
            return s
    return None
def act_now(index: int, path: str = None):
    global SOURCES
    global ACTS
    global couters
    global TREADS
    global OLD_REMOVE
    global LAST_ACTS
    global treads
    s = 0
    name = set_folder_name(bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser"))
    save_first = ""
    last = list(SOURCES[index].keys())[-1]
    # keep paging through the viewer until every page id has an image url
    while "" in SOURCES[index].values():
        if s == 0:
            # on the first pass jump to the last page so the viewer registers the full page range
            LAST_ACTS[index].perform()
            s = 1
        save_first = get_first_empty(index)
        url_now = ACTS[index]._driver.current_url
        url_now = url_now[:url_now.find("#")+1] + save_first
        if SOURCES[index][last] and "" in SOURCES[index].values():
            # the last page is loaded but earlier ones are missing: jump straight to the first empty page
            ACTS[index]._driver.get(url_now)
            do_action_now(index)
            SOURCES[index][last] = ""
        else:
            do_action_now(index)
    if SOURCES[index] and "" not in SOURCES[index].values():
        couters += 1
        pathus = f'{path}/{name}.pdf' if path else f"ignore/{name}/{name}.pdf"
        with open(pathus, "wb") as file:
-            file.write(img2pdf.convert(glob.glob(f"ignore/{name}/*jpg")))
+            file.write(img2pdf.convert(glob.glob(f"ignore/{name}/*.jpg")))
        ACTS[index]._driver.quit()
        remove_text(OLD_REMOVE[index])
        treads -= 1
def open_firefox(url: str):
    '''open_firefox opens a Firefox browser on the given url and applies the session settings
    :param url: url to run the firefox on
    :type url: str
    '''
    web = give_me_web()
    global SOURCES
    global ACTS
    global LAST_ACTS
    if not url.startswith("#") and url:
        book = webdriver.Firefox(web[0], executable_path=web[1], options=web[2])
        url = url if url.endswith("#1.undefined.8.none") else f'{url}#1.undefined.8.none'
        book.get(url)
        # one action chain that pages down, and one that holds END to jump to the last page
        act = action_chains.ActionChains(book)
        lst_act = action_chains.ActionChains(book)
        lst_act._actions = [lst_act.key_down(keys.Keys.END), lst_act.pause(3), lst_act.key_up(keys.Keys.END)]
        act._actions = [act.send_keys(keys.Keys.PAGE_DOWN)]
        LAST_ACTS.append(lst_act)
        ACTS.append(act)
        SOURCES.append({})
def give_me_web():
    options = webdriver.FirefoxOptions()
    fp = webdriver.FirefoxProfile()
    fp.set_preference("browser.download.folderList", 2)
    fp.set_preference("browser.download.manager.showWhenStarting", False)
    fp.set_preference("browser.download.dir", "ignore")
    fp.set_preference(
        "browser.helperApps.neverAsk.saveToDisk",
        "attachment/csv, text/plain, application/octet-stream, application/binary, text/csv, application/csv, application/excel, text/comma-separated-values, text/xml, application/xml, application/xls, excel/xls, application/excel 97-2003,application/Microsoft Excel 97-2003 Worksheet, application/vnd.ms-excel")
    fp.set_preference(
        "browser.helperApps.neverAsk.openFile",
        "application/PDF, application/FDF, application/XFDF, application/LSL, application/LSO, application/LSS, application/IQY, application/RQY, application/XLK, application/XLS, application/XLT, application/POT application/PPS, application/PPT, application/DOS, application/DOT, application/WKS, application/BAT, application/PS, application/EPS, application/WCH, application/WCM, application/WB1, application/WB3, application/RTF, application/DOC, application/MDB, application/MDE, application/WBK, application/WB1, application/WCH, application/WCM, application/AD, application/ADP, application/vnd.ms-excel")
    fp.set_preference("browser.download.panel.shown", False)
    options.add_argument('--lang=EN')
    options.headless = True
    fire = "geckodriver"
    return (fp, fire, options)
with open("BooksToDownload", "r", encoding="utf_8") as file: with open("BooksToDownload", "r", encoding="utf_8") as file:
books = file.read().split("\n") books = file.read().split("\n")
for b in books: for b in books:
if b.find('````') > -1 and not b.startswith("#"): if b.find('````') > -1 and not b.startswith("#"):
OLD_REMOVE.append(b) OLD_REMOVE.append(b)
PATHS.append(b[b.rfind('`')+1:]) PATHS.append(b[b.rfind('`')+1:])
b = b[:b.find('`')] b = b[:b.find('`')]
elif not b.startswith("#"): elif not b.startswith("#"):
OLD_REMOVE.append(b) OLD_REMOVE.append(b)
PATHS.append(None) PATHS.append(None)
t1 = threading.Thread(None, open_firefox, args=(b,)) t1 = threading.Thread(None, open_firefox, args=(b,))
t1.start() t1.start()
THREADS.append(t1) THREADS.append(t1)
for t in THREADS: for t in THREADS:
t.join() t.join()
lasts = [] lasts = []
for i in range(len(ACTS)): for i in range(len(ACTS)):
SOURCES[i].update({key.attrs["id"]: "" SOURCES[i].update({key.attrs["id"]: ""
for key in bs4.BeautifulSoup(ACTS[i]._driver.page_source, "html.parser").find_all( for key in bs4.BeautifulSoup(ACTS[i]._driver.page_source, "html.parser").find_all(
"div", attrs={"class": "BV_oImage"})}) "div", attrs={"class": "BV_oImage"})})
lasts.append(list(SOURCES[i].keys())[-1]) lasts.append(list(SOURCES[i].keys())[-1])
couters = 0 couters = 0
treads = len(ACTS)-1 treads = len(ACTS)-1
for i in range(len(ACTS)): for i in range(len(ACTS)):
T = threading.Thread(None, act_now, args=(i, PATHS[i])) T = threading.Thread(None, act_now, args=(i, PATHS[i]))
T.start() T.start()
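For context, the main loop above splits each non-comment line of BooksToDownload on the '````' marker and treats the text after the last backtick as the output directory for the finished PDF. A minimal sketch of that parsing, with a purely illustrative URL and path (neither appears in the source):

# Illustrative only: how one BooksToDownload line with an output directory is split,
# mirroring b[b.rfind('`')+1:] and b[:b.find('`')] in the loop above.
line = "https://example.invalid/viewer/book123````/home/user/pdfs"
if '````' in line and not line.startswith("#"):
    out_dir = line[line.rfind('`') + 1:]   # "/home/user/pdfs" -> passed to act_now as path
    url = line[:line.find('`')]            # "https://example.invalid/viewer/book123" -> opened by open_firefox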