232 lines
6.9 KiB
Python
232 lines
6.9 KiB
Python
import sys
|
|
import re
|
|
import socket
|
|
from threading import Lock, Timer
|
|
from contextlib import contextmanager
|
|
|
|
try:
|
|
# Python 3
|
|
from urllib.parse import urlparse
|
|
except (ImportError):
|
|
# Python 2
|
|
from urlparse import urlparse
|
|
|
|
from . import __version__
|
|
|
|
from .show_error import show_error
|
|
from .console_write import console_write
|
|
from .cache import set_cache, get_cache
|
|
from .unicode import unicode_from_os
|
|
|
|
from .downloaders import DOWNLOADERS
|
|
from .downloaders.binary_not_found_error import BinaryNotFoundError
|
|
from .downloaders.rate_limit_exception import RateLimitException
|
|
from .downloaders.no_ca_cert_exception import NoCaCertException
|
|
from .downloaders.downloader_exception import DownloaderException
|
|
from .http_cache import HttpCache
|
|
|
|
|
|
# A dict of domains - each points to a list of downloaders
|
|
_managers = {}
|
|
|
|
# How many managers are currently checked out
|
|
_in_use = 0
|
|
|
|
# Make sure connection management doesn't run into threading issues
|
|
_lock = Lock()
|
|
|
|
# A timer used to disconnect all managers after a period of no usage
|
|
_timer = None
|
|
|
|
|
|
@contextmanager
def downloader(url, settings):
    """
    Context manager that checks out a download manager for the duration of
    a ``with`` block and hands it back afterwards

    :param url:
        The URL that is going to be downloaded - passed to _grab() and
        _release()

    :param settings:
        The settings passed on to _grab()

    :return:
        Yields the manager returned by _grab()
    """

    # The checkout happens before the try block on purpose: if _grab() raises
    # there is nothing to give back, and running _release() with an unbound
    # "manager" would replace the real error with an UnboundLocalError.
    manager = _grab(url, settings)
    try:
        yield manager
    finally:
        _release(url, manager)
|
|
|
|
|
|
def _grab(url, settings):
    """
    Checks a manager out of the pool kept for the hostname of a URL,
    constructing a new DownloadManager when that pool has nothing to offer

    :param url:
        The URL whose lower-cased hostname selects the pool

    :param settings:
        Passed to DownloadManager() when a new manager has to be created

    :return:
        The manager that was checked out
    """

    global _managers, _in_use, _timer

    with _lock:
        # A checkout counts as usage, so call off any pending disconnect
        if _timer:
            _timer.cancel()
            _timer = None

        domain = urlparse(url).hostname.lower()
        pool = _managers.setdefault(domain, [])

        if pool:
            manager = pool.pop()
        else:
            manager = DownloadManager(settings)

        _in_use += 1
        return manager
|
|
|
|
|
|
def _release(url, manager):
    """
    Returns a manager to the pool for the hostname of a URL and, once no
    managers are checked out any more, schedules close_all_connections()
    to run after 5 seconds

    :param url:
        The URL the manager was checked out for - its lower-cased hostname
        selects the pool

    :param manager:
        The manager to put back into the pool
    """

    global _managers, _in_use, _timer

    with _lock:
        hostname = urlparse(url).hostname.lower()

        # close_all_connections() rebinds _managers to an empty dict, and it
        # can run while this manager is checked out. Indexing the dict
        # directly would then raise KeyError and skip the bookkeeping below,
        # so the pool is re-created on demand instead.
        _managers.setdefault(hostname, []).insert(0, manager)

        _in_use -= 1

        if _timer:
            _timer.cancel()
            _timer = None

        # Only arm the disconnect timer when nothing is checked out
        if _in_use == 0:
            _timer = Timer(5.0, close_all_connections)
            _timer.start()
|
|
|
|
|
|
def close_all_connections():
    """
    Cancels the pending disconnect timer, if there is one, calls close() on
    every manager currently sitting in a pool and then empties the registry
    of pools
    """

    global _managers, _timer

    with _lock:
        pending = _timer
        if pending:
            pending.cancel()
            _timer = None

        for pool in _managers.values():
            for idle_manager in pool:
                idle_manager.close()

        # Start over with a fresh registry
        _managers = {}
|
|
|
|
|
|
class DownloadManager(object):
    """
    Downloads URLs through the first usable class listed in DOWNLOADERS,
    keeping the chosen downloader around so it can be re-used by later
    fetch() calls

    :param settings:
        A dict of settings. The keys read by this class are ``user_agent``,
        ``http_cache``, ``http_cache_length``, ``timeout``, ``debug`` and
        ``cache_length``. A shallow copy is stored as ``self.settings`` and
        handed to each downloader class.
    """

    def __init__(self, settings):
        # Cache the downloader for re-use
        self.downloader = None

        # Work on a shallow copy so that filling in the user agent and the
        # cache below does not modify the dict owned by the caller
        self.settings = dict(settings)

        # A "%s" placeholder in the user agent is replaced by the version
        user_agent = self.settings.get('user_agent')
        if user_agent and '%s' in user_agent:
            self.settings['user_agent'] = user_agent % __version__

        if self.settings.get('http_cache'):
            # NOTE(review): 604800 is presumably a number of seconds
            # (7 days) - confirm against HttpCache
            cache_length = self.settings.get('http_cache_length', 604800)
            self.settings['cache'] = HttpCache(cache_length)

    def close(self):
        """
        Closes the cached downloader, if there is one, and forgets it
        """

        if self.downloader:
            self.downloader.close()
            self.downloader = None

    def fetch(self, url, error_message, prefer_cached=False):
        """
        Downloads a URL and returns the contents

        :param url:
            The string URL to download

        :param error_message:
            The error message to include if the download fails

        :param prefer_cached:
            If cached version of the URL content is preferred over a new request

        :raises:
            DownloaderException: if there was an error downloading the URL.
            RateLimitException and NoCaCertException raised by the
            downloader are recorded in the cache and then re-raised.

        :return:
            The string contents of the URL
        """

        is_ssl = url.startswith('https://')

        # Make sure we have a downloader, and it supports SSL if we need it
        self._ensure_downloader(url, is_ssl)

        url = url.replace(' ', '%20')
        hostname = urlparse(url).hostname
        if hostname:
            hostname = hostname.lower()
        timeout = self.settings.get('timeout', 3)

        rate_limited_domains = get_cache('rate_limited_domains', [])
        no_ca_cert_domains = get_cache('no_ca_cert_domains', [])

        debug = self.settings.get('debug')

        if debug:
            try:
                ip = socket.gethostbyname(hostname)
            except (socket.gaierror) as e:
                ip = unicode_from_os(e)
            except (TypeError):
                # hostname is None when the URL has no host part
                ip = None

            console_write(u"Download Debug", True)
            console_write(u" URL: %s" % url)
            console_write(u" Resolved IP: %s" % ip)
            console_write(u" Timeout: %s" % str(timeout))

        # Domains that failed earlier are skipped without a new request
        if hostname in rate_limited_domains:
            error_string = u"Skipping due to hitting rate limit for %s" % hostname
            if debug:
                console_write(u" %s" % error_string)
            raise DownloaderException(error_string)

        if hostname in no_ca_cert_domains:
            error_string = u" Skipping since there are no CA certs for %s" % hostname
            if debug:
                console_write(u" %s" % error_string)
            raise DownloaderException(error_string)

        try:
            # NOTE(review): the literal 3 is presumably the number of
            # attempts - confirm against the downloader classes
            return self.downloader.download(url, error_message, timeout, 3, prefer_cached)

        except (RateLimitException) as e:

            # Remember the domain so later requests are skipped up front
            rate_limited_domains.append(hostname)
            set_cache('rate_limited_domains', rate_limited_domains, self.settings.get('cache_length'))

            error_string = (u'Hit rate limit of %s for %s, skipping all futher ' +
                u'download requests for this domain') % (e.limit, e.domain)
            console_write(error_string, True)
            raise

        except (NoCaCertException) as e:

            # Remember the domain so later requests are skipped up front
            no_ca_cert_domains.append(hostname)
            set_cache('no_ca_cert_domains', no_ca_cert_domains, self.settings.get('cache_length'))

            error_string = (u'No CA certs available for %s, skipping all futher ' +
                u'download requests for this domain. If you are on a trusted ' +
                u'network, you can add the CA certs by running the "Grab ' +
                u'CA Certs" command from the command palette.') % e.domain
            console_write(error_string, True)
            raise

    def _ensure_downloader(self, url, is_ssl):
        """
        Makes sure self.downloader is set to a downloader that can handle
        the request, picking a new one from DOWNLOADERS when necessary

        :param url:
            The URL about to be downloaded - only used in the error message

        :param is_ssl:
            If the URL requires a downloader that supports SSL

        :raises:
            DownloaderException: if no class in DOWNLOADERS is usable
        """

        current = self.downloader
        if current and not (is_ssl and not current.supports_ssl()):
            return

        replacement = None
        for downloader_class in DOWNLOADERS:
            try:
                candidate = downloader_class(self.settings)
                if is_ssl and not candidate.supports_ssl():
                    continue
                replacement = candidate
                break
            except (BinaryNotFoundError):
                # This downloader is not usable here, try the next one
                pass

        # Checking the replacement, rather than self.downloader, keeps a
        # cached downloader without SSL support from being silently used
        # for an https:// URL when nothing better was found
        if replacement is None:
            error_string = u'Unable to download %s due to no ssl module available and no capable program found. Please install curl or wget.' % url
            show_error(error_string)
            raise DownloaderException(error_string)

        # Close the downloader that is being replaced so it is not leaked
        if current:
            current.close()
        self.downloader = replacement
|