@@ -1,198 +1,191 @@
import selenium
import selenium
from selenium . webdriver . common import action_chains
from selenium . webdriver . common import action_chains
import urllib3
import urllib3
import bs4
import bs4
import re
import re
import os
import os
import glob
import glob
from selenium . common import exceptions
from selenium . common import exceptions
from selenium import webdriver
from selenium import webdriver
import img2pdf
import img2pdf
import threading
import threading
from selenium . webdriver . common . action_chains import ActionChains
from selenium . webdriver . common . action_chains import ActionChains
from selenium . webdriver . common . actions import interaction
from selenium . webdriver . common . actions import interaction
from selenium . webdriver . common import keys
from selenium . webdriver . common import keys
# from parser import ArgumentParser
# from parser import ArgumentParser
# Per-book state, indexed by the position at which open_firefox registered
# the book (ACTS[i], LAST_ACTS[i] and SOURCES[i] describe the same browser).
ACTS = []       # ActionChains performed before each scrape pass
LAST_ACTS = []  # ActionChains performed once at the start of act_now
SOURCES = []    # one dict per book: page-div id -> image URL ("" = not loaded yet)
Books = []
THREADS = []    # browser start-up threads
PATHS = []      # optional output directory per book (None = default location)
OLD_REMOVE = [] # original "BooksToDownload" line per book, removed once finished

# Firefox profile preferences applied by give_me_web.
# The preference names and values must not carry surrounding whitespace,
# otherwise Firefox treats them as unrelated (unknown) preferences.
BROWSER_PREFENCES = {
    "browser.download.folderList": 2,
    "browser.download.manager.showWhenStarting": False,
    "browser.download.dir": "ignore",
    "browser.helperApps.neverAsk.saveToDisk": "attachment/csv, text/plain, application/octet-stream, application/binary, text/csv, application/csv, application/excel, text/comma-separated-values, text/xml, application/xml, application/xls, excel/xls, application/excel 97-2003,application/Microsoft Excel 97-2003 Worksheet, application/vnd.ms-excel",
    "browser.helperApps.neverAsk.openFile": "application/PDF, application/FDF, application/XFDF, application/LSL, application/LSO, application/LSS, application/IQY, application/RQY, application/XLK, application/XLS, application/XLT, application/POT application/PPS, application/PPT, application/DOS, application/DOT, application/WKS, application/BAT, application/PS, application/EPS, application/WCH, application/WCM, application/WB1, application/WB3, application/RTF, application/DOC, application/MDB, application/MDE, application/WBK, application/WB1, application/WCH, application/WCM, application/AD, application/ADP, application/vnd.ms-excel",
    "browser.download.panel.shown": False,
}
def remove_text(text: str):
    '''remove_text removes the url from the "BooksToDownload" text file

    The first line equal to ``text`` is dropped; when no line matches
    exactly, the same lookup is retried once with a trailing newline
    appended (lines read from the file keep their line ending).  If neither
    form is present the file is left untouched.

    :param text: url to remove
    :type text: str
    '''
    with open("BooksToDownload", "r", encoding="utf_8") as file:
        data_text = file.readlines()

    # Try the exact text first, then the newline-terminated form.  A bounded
    # loop replaces the former unconditional recursion, which never
    # terminated when the url was missing from the file.
    for candidate in (text, f"{text}\n"):
        if candidate in data_text:
            data_text.remove(candidate)
            break
    else:
        return

    with open("BooksToDownload", "w", encoding="utf_8") as file:
        file.writelines(data_text)
# Characters that are replaced when a page title is turned into a folder name.
_FOLDER_NAME_TRANSLATION = str.maketrans({
    '"': "''",
    "\\": "||",
    ":": "׃",
    "/": "|",
    "\n": "",
    "?": ";;",
})


def set_folder_name(html: bs4.BeautifulSoup):
    '''set_folder_name derives a folder name from the page's <title>

    The part of the title after the first " - " is used (the whole title
    when there is no such separator), and the characters listed in
    ``_FOLDER_NAME_TRANSLATION`` are substituted.

    :param html: parsed page
    :type html: bs4.BeautifulSoup
    :return: the sanitised name; derived from the literal "None" when the
        page has no text or no <title> tag
    :rtype: str
    '''
    title_tag = html.find("title") if html.text else None
    name = title_tag.text if title_tag is not None else "None"

    # partition() keeps the full title when " - " is absent; the previous
    # find(...) + 3 arithmetic silently dropped the first two characters.
    _, separator, remainder = name.partition(" - ")
    if separator:
        name = remainder
    return name.translate(_FOLDER_NAME_TRANSLATION)
def down_to_list(url: str):
    '''down_to_list fetches ``url`` with an HTTP GET request

    :param url: address to download
    :type url: str
    :return: the response body, or None when the body is empty
    '''
    response = urllib3.PoolManager().request("GET", url)
    return response.data or None
def _split_styler ( style : str ) :
begin = style . find ( ' " ' ) + 1
end = style . rfind ( ' " ' )
def _split_styler ( style : str ) :
return style [ beg in: end ]
begin = style . f ind ( ' " ' ) + 1
end = style . rfind ( ' " ' )
return style [ begin : end ]
def update_SOURCES(index: int):
    '''update_SOURCES records the image urls currently present in a book's page

    Parses the page source of the browser behind ``ACTS[index]``, looks at
    every ``div`` with class ``BV_oImage`` and, for those whose ``style``
    attribute contains "http", stores ``id -> quoted part of the style`` in
    ``SOURCES[index]``.

    :param index: position of the book in ACTS / SOURCES
    :type index: int
    '''
    page = bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser")
    page_divs = page.find_all("div", attrs={"class": "BV_oImage"})

    dic_update = {}
    for div in page_divs:
        # A div without a style or an id cannot be mapped to an image; skip
        # it instead of failing the whole pass with a KeyError.
        style = div.attrs.get("style", "")
        if "http" in style and "id" in div.attrs:
            dic_update[div.attrs["id"]] = _split_styler(style)

    SOURCES[index].update(dic_update)
def do_action_now(index: int):
    '''do_action_now performs one scrape pass for a book

    Performs ``ACTS[index]``, refreshes ``SOURCES[index]`` from the page and
    downloads every image whose url is known and whose file does not exist
    yet.  Images are stored as ``ignore/<name>/<NNNN>.jpg`` where ``NNNN`` is
    the zero-padded position of the page id in ``SOURCES[index]``.

    :param index: position of the book in ACTS / SOURCES
    :type index: int
    :return: the page ids of the book, in SOURCES order
    :rtype: list
    '''
    ACTS[index].perform()
    name = set_folder_name(bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser"))
    update_SOURCES(index)

    folder = f"ignore/{name}"
    # makedirs also creates a missing "ignore" parent and does not fail
    # when the folder already exists.
    os.makedirs(folder, exist_ok=True)

    files = list(SOURCES[index])
    http = urllib3.PoolManager()
    for position, page_id in enumerate(files):
        source = SOURCES[index][page_id]
        if not source:
            continue
        target = f"{folder}/{position:04}.jpg"
        if os.path.exists(target):
            continue
        # Download before opening the file: a failed request must not leave
        # an empty file behind, because existing files are never retried.
        payload = http.request("GET", source).data
        with open(target, "wb") as image_file:
            image_file.write(payload)
    return files
def get_first_empty(index: int):
    '''get_first_empty finds the first page of a book that has no image url yet

    :param index: position of the book in SOURCES
    :type index: int
    :return: the first key of ``SOURCES[index]`` whose value is falsy, or
        None when every key has a value
    '''
    for page_id, source in SOURCES[index].items():
        if not source:
            return page_id
    return None
def act_now(index: int, path: str = None):
    '''act_now downloads every page of a book and bundles the pages into a pdf

    Repeats scrape passes (do_action_now) until no value of
    ``SOURCES[index]`` is the empty string.  Afterwards the downloaded
    ``ignore/<name>/*.jpg`` files are converted into one pdf, the browser is
    closed and the book's line is removed from "BooksToDownload".

    :param index: position of the book in ACTS / LAST_ACTS / SOURCES / OLD_REMOVE
    :type index: int
    :param path: directory for the pdf; when omitted the pdf is written
        next to the images as ``ignore/<name>/<name>.pdf``
    :type path: str
    '''
    global couters
    global treads

    pages = SOURCES[index]
    if not pages:
        # No page ids were collected for this book, so there is nothing to
        # download (and no "last" page to look up below).
        return

    name = set_folder_name(bs4.BeautifulSoup(ACTS[index]._driver.page_source, "html.parser"))
    last = list(pages.keys())[-1]
    first_pass = True

    while "" in pages.values():
        if first_pass:
            # LAST_ACTS is performed exactly once, before the first pass.
            LAST_ACTS[index].perform()
            first_pass = False
        save_first = get_first_empty(index)
        url_now = ACTS[index]._driver.current_url
        # Keep everything up to and including "#", then point the fragment
        # at the first page that is still missing.
        url_now = url_now[:url_now.find("#") + 1] + save_first
        if pages[last] and "" in pages.values():
            # The last page is known while earlier ones are still missing:
            # reload at the first missing page, scrape, and mark the last
            # page as missing again so the loop keeps running.
            ACTS[index]._driver.get(url_now)
            do_action_now(index)
            pages[last] = ""
        else:
            do_action_now(index)

    if pages and "" not in pages.values():
        couters += 1
        pathus = f'{path}/{name}.pdf' if path else f"ignore/{name}/{name}.pdf"
        # glob returns names in arbitrary order; the zero-padded file names
        # sort into page order.
        images = sorted(glob.glob(f"ignore/{name}/*.jpg"))
        with open(pathus, "wb") as file:
            file.write(img2pdf.convert(images))
        ACTS[index]._driver.quit()
        remove_text(OLD_REMOVE[index])
        treads -= 1
# Serialises registration of a new book so that ACTS, LAST_ACTS and SOURCES
# always receive their entries for one browser together.
_REGISTRY_LOCK = threading.Lock()


def open_firefox(url: str):
    '''open_firefox opens the firefox browser on the specific url, and sets all the settings for the specific session

    Empty urls and urls starting with "#" are ignored.  Otherwise a browser
    is started with the settings from give_me_web, the url is loaded (with
    "#1.undefined.8.none" appended when it does not already end with it) and
    one entry is appended to each of ACTS, LAST_ACTS and SOURCES.

    :param url: url to run the firefox on
    :type url: str
    '''
    if not url or url.startswith("#"):
        return

    web = give_me_web()
    book = webdriver.Firefox(web[0], executable_path=web[1], options=web[2])
    url = url if url.endswith("#1.undefined.8.none") else f'{url}#1.undefined.8.none'
    book.get(url)

    act = action_chains.ActionChains(book)
    lst_act = action_chains.ActionChains(book)
    # NOTE(review): _actions is a private attribute of ActionChains and is
    # overwritten here with the return values of the builder calls - confirm
    # against the installed selenium version before changing this.
    lst_act._actions = [lst_act.key_down(keys.Keys.END), lst_act.pause(3), lst_act.key_up(keys.Keys.END)]
    act._actions = [act.send_keys(keys.Keys.PAGE_DOWN)]

    # This function is started in one thread per book; without the lock two
    # threads could interleave their appends and leave the three lists
    # describing different browsers at the same index.
    with _REGISTRY_LOCK:
        LAST_ACTS.append(lst_act)
        ACTS.append(act)
        SOURCES.append({})
def give_me_web():
    '''give_me_web builds the settings used to start a firefox session

    :return: ``(profile, driver executable name, options)``; the profile
        carries every entry of BROWSER_PREFENCES, the options request the
        "EN" language and headless mode
    :rtype: tuple
    '''
    options = webdriver.FirefoxOptions()
    fp = webdriver.FirefoxProfile()
    # Iterating the dict itself yields only the keys; items() is needed to
    # get (name, value) pairs.
    for key, val in BROWSER_PREFENCES.items():
        fp.set_preference(key, val)
    options.add_argument('--lang=EN')
    options.headless = True
    fire = "geckodriver"
    return (fp, fire, options)


if __name__ == "__main__":
    with open("BooksToDownload", "r", encoding="utf_8") as file:
        books = file.read().split("\n")

    for b in books:
        # A line is either "<url>" or "<url>````<output directory>".
        url, target_dir = b, None
        if '````' in b:
            target_dir = b[b.rfind('`') + 1:]
            url = b[:b.find('`')]
        # open_firefox ignores empty urls and "#" comment lines, so they
        # must not get PATHS / OLD_REMOVE entries either; otherwise every
        # later book would be paired with the wrong entry.
        if not url or url.startswith("#"):
            continue
        OLD_REMOVE.append(b)
        PATHS.append(target_dir)
        t1 = threading.Thread(None, open_firefox, args=(url,))
        t1.start()
        THREADS.append(t1)

    for t in THREADS:
        t.join()

    # NOTE(review): ACTS is filled in the order the browser threads finish,
    # while PATHS / OLD_REMOVE follow file order - confirm the pairing by
    # index below is acceptable when several books are listed.
    lasts = []
    for i in range(len(ACTS)):
        page = bs4.BeautifulSoup(ACTS[i]._driver.page_source, "html.parser")
        # Every page div starts out with an empty url ("" = not loaded yet).
        SOURCES[i].update({key.attrs["id"]: ""
                           for key in page.find_all("div", attrs={"class": "BV_oImage"})})
        page_ids = list(SOURCES[i].keys())
        lasts.append(page_ids[-1] if page_ids else None)

    couters = 0
    treads = len(ACTS) - 1
    for i in range(len(ACTS)):
        T = threading.Thread(None, act_now, args=(i, PATHS[i]))
        T.start()