348 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			348 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import tempfile
 | |
| import re
 | |
| import os
 | |
| 
 | |
| from ..console_write import console_write
 | |
| from ..unicode import unicode_from_os
 | |
| from ..open_compat import open_compat, read_compat
 | |
| from .cli_downloader import CliDownloader
 | |
| from .non_http_error import NonHttpError
 | |
| from .non_clean_exit_error import NonCleanExitError
 | |
| from .rate_limit_exception import RateLimitException
 | |
| from .downloader_exception import DownloaderException
 | |
| from .cert_provider import CertProvider
 | |
| from .decoding_downloader import DecodingDownloader
 | |
| from .limiting_downloader import LimitingDownloader
 | |
| from .caching_downloader import CachingDownloader
 | |
| 
 | |
| 
 | |
class WgetDownloader(CliDownloader, CertProvider, DecodingDownloader, LimitingDownloader, CachingDownloader):
    """
    A downloader that uses the command line program wget

    wget is told to send the response body to stdout ("-O -") and to write
    its log - response headers, errors and debug output - to a temporary
    file ("-o"). That log file is read back and removed by parse_output().

    :param settings:
        A dict of the various Package Control settings. The Sublime Text
        Settings API is not used because this code is run in a thread.

    :raises:
        BinaryNotFoundError: when wget can not be found
    """

    # Log lines that carry extraneous or duplicate information. They are
    # compiled once here since skippable_line() runs for every log line.
    _SKIPPABLE_PATTERNS = (
        # Date lines
        re.compile(r'--\d{4}-\d{2}-\d{2}'),
        re.compile(r'\d{4}-\d{2}-\d{2}'),
        # HTTP status code lines since we already have that info
        re.compile(r'\d{3} '),
        # Saving to and progress lines
        re.compile(r'(Saving to:|\s*\d+K)'),
        # Notice about ignoring body on HTTP error
        re.compile(r'Skipping \d+ byte'),
    )

    # Matches the status line of an HTTP response, e.g. "HTTP/1.1 200 OK"
    _STATUS_LINE_PATTERN = re.compile(r'HTTP/(\d\.\d)\s+(\d+)(?:\s+(.*))?$')

    def __init__(self, settings):
        self.settings = settings
        self.debug = settings.get('debug')
        self.wget = self.find_binary('wget')

    def close(self):
        """
        No-op for compatibility with UrllibDownloader and WinINetDownloader
        """

        pass

    def download(self, url, error_message, timeout, tries, prefer_cached=False):
        """
        Downloads a URL and returns the contents

        :param url:
            The URL to download

        :param error_message:
            A string to include in the console error that is printed
            when an error occurs

        :param timeout:
            The int number of seconds to set the timeout to

        :param tries:
            The int number of times to try and download the URL in the case of
            a timeout or HTTP 503 error

        :param prefer_cached:
            If a cached version should be returned instead of trying a new request

        :raises:
            NoCaCertException: when no CA certs can be found for the url
            RateLimitException: when a rate limit is hit
            DownloaderException: when any other download error occurs

        :return:
            The string contents of the URL
        """

        if prefer_cached:
            cached = self.retrieve_cached(url)
            if cached:
                return cached

        # mkstemp() atomically creates the log file with owner-only
        # permissions. Taking just the name of a NamedTemporaryFile() would
        # delete the file immediately and leave a predictable, unreserved
        # path behind for wget to write to.
        log_handle, self.tmp_file = tempfile.mkstemp()
        os.close(log_handle)

        try:
            command = self._build_command(url, timeout)

            http_proxy = self.settings.get('http_proxy')
            https_proxy = self.settings.get('https_proxy')

            if self.debug:
                # NOTE(review): the proxy password is written to the console
                # in clear text - confirm that this is acceptable
                console_write(u"Wget Debug Proxy", True)
                console_write(u"  http_proxy: %s" % http_proxy)
                console_write(u"  https_proxy: %s" % https_proxy)
                console_write(u"  proxy_username: %s" % self.settings.get('proxy_username'))
                console_write(u"  proxy_password: %s" % self.settings.get('proxy_password'))

            # The proxy servers are handed to wget through the environment
            if http_proxy:
                os.putenv('http_proxy', http_proxy)
            if https_proxy:
                os.putenv('https_proxy', https_proxy)

            return self._run_with_retries(command, url, error_message, tries)

        finally:
            # parse_output() normally removes the log, this covers the code
            # paths that exit before the log was ever parsed
            self._remove_log_file()

    def _build_command(self, url, timeout):
        """
        Assembles the wget command line for a URL

        :param url:
            The URL to download

        :param timeout:
            The int number of seconds to set the connect timeout to

        :raises:
            NoCaCertException: when no CA certs can be found for the url

        :return:
            A list of the wget binary path followed by its arguments
        """

        command = [self.wget, '--connect-timeout=' + str(int(timeout)), '-o',
            self.tmp_file, '-O', '-', '-U', self.settings.get('user_agent')]

        request_headers = {
            # Don't be alarmed if the response from the server does not select
            # one of these since the server runs a relatively new version of
            # OpenSSL which supports compression on the SSL layer, and Apache
            # will use that instead of HTTP-level encoding.
            'Accept-Encoding': 'gzip,deflate'
        }
        request_headers = self.add_conditional_headers(url, request_headers)

        for name, value in request_headers.items():
            command.extend(['--header', "%s: %s" % (name, value)])

        secure_url_match = re.match('^https://([^/]+)', url)
        if secure_url_match is not None:
            secure_domain = secure_url_match.group(1)
            bundle_path = self.check_certs(secure_domain, timeout)
            command.append(u'--ca-certificate=' + bundle_path)

        # -d logs the full request and response, -S only the response
        # headers. parse_output() understands both formats.
        command.append('-d' if self.debug else '-S')

        proxy_username = self.settings.get('proxy_username')
        proxy_password = self.settings.get('proxy_password')
        if proxy_username:
            command.append(u"--proxy-user=%s" % proxy_username)
        if proxy_password:
            command.append(u"--proxy-password=%s" % proxy_password)

        command.append(url)
        return command

    def _run_with_retries(self, command, url, error_message, tries):
        """
        Executes wget, retrying on timeouts and HTTP 503 responses

        :param command:
            The wget command line, as returned by _build_command()

        :param url:
            The URL being downloaded

        :param error_message:
            A string to include in the error when the download fails

        :param tries:
            The int number of times to try the download

        :raises:
            RateLimitException: when a rate limit is hit
            DownloaderException: when any other download error occurs

        :return:
            The string contents of the URL
        """

        error_string = None
        while tries > 0:
            tries -= 1
            try:
                result = self.execute(command)

                general, headers = self.parse_output()
                encoding = headers.get('content-encoding')
                if encoding:
                    result = self.decode_response(encoding, result)

                return self.cache_result('get', url, general['status'],
                    headers, result)

            except NonCleanExitError:

                try:
                    general, headers = self.parse_output()
                    self.handle_rate_limit(headers, url)

                    if general['status'] == 304:
                        return self.cache_result('get', url, general['status'],
                            headers, None)

                    if general['status'] == 503 and tries != 0:
                        # GitHub and BitBucket seem to rate limit via 503
                        error_string = u'Downloading %s was rate limited' % url
                        error_string += ', trying again'
                        if self.debug:
                            console_write(error_string, True)
                        continue

                    download_error = 'HTTP error %s' % general['status']

                except NonHttpError as e:

                    download_error = unicode_from_os(e)

                    # GitHub and BitBucket seem to time out a lot
                    if download_error.find('timed out') != -1:
                        error_string = u'Downloading %s timed out' % url
                        if tries:
                            error_string += ', trying again'
                            if self.debug:
                                console_write(error_string, True)
                        # When no tries are left the loop ends and the
                        # timeout message above is what gets raised
                        continue

                error_string = u'%s %s downloading %s.' % (error_message, download_error, url)

            break

        raise DownloaderException(error_string)

    def _remove_log_file(self):
        """
        Removes the wget log file, if it still exists
        """

        try:
            os.remove(self.tmp_file)
        except OSError:
            # Best effort - the log has usually been removed already
            pass

    def supports_ssl(self):
        """
        Indicates if the object can handle HTTPS requests

        :return:
            If the object supports HTTPS requests
        """

        return True

    def parse_output(self):
        """
        Parses the wget output file, prints debug information and returns headers

        The output file is removed once it has been read.

        :raises:
            NonHttpError: when the log contains an error reported by wget

        :return:
            A tuple of (general, headers) where general is a dict with the keys:
              `version` - HTTP version number (string)
              `status` - HTTP status code (integer)
              `message` - HTTP status message (string)
            And headers is a dict with the keys being lower-case version of the
            HTTP header names.
        """

        with open_compat(self.tmp_file, 'r') as f:
            output = read_compat(f).splitlines()
        self.clean_tmp_file()

        if self.debug:
            error, header_lines = self._scan_debug_log(output)
        else:
            error, header_lines = self._scan_standard_log(output)

        if error:
            raise NonHttpError(error)

        return self.parse_headers(header_lines)

    def _error_from_line(self, line):
        """
        Extracts the error message from a "wget:" or "failed:" log line

        :param line:
            The log line to check

        :return:
            The error message string, or None if the line is not an error line
        """

        if line.startswith('wget:'):
            return line[5:].strip()
        if line.startswith('failed:'):
            return line[7:].strip()
        return None

    def _scan_debug_log(self, output):
        """
        Scans the log of a "wget -d" run, echoing it to the console

        :param output:
            A list of the lines of the log

        :return:
            A tuple of (error, header_lines) where error is the last error
            message found, or None, and header_lines is a list of the lines
            of the response section
        """

        error = None
        header_lines = []
        section = 'General'
        last_section = None

        for line in output:
            if section == 'General' and self.skippable_line(line):
                continue

            # Skip blank lines
            if line.strip() == '':
                continue

            line_error = self._error_from_line(line)
            if line_error is not None:
                error = line_error

            # The request and response are wrapped in marker lines
            if line == '---request begin---':
                section = 'Write'
                continue
            if line == '---request end---':
                section = 'General'
                continue
            if line == '---response begin---':
                section = 'Read'
                continue
            if line == '---response end---':
                section = 'General'
                continue

            # Print a heading whenever the output moves to another section
            if section != last_section:
                console_write(u"Wget HTTP Debug %s" % section, True)

            if section == 'Read':
                header_lines.append(line)

            console_write(u'  ' + line)
            last_section = section

        return (error, header_lines)

    def _scan_standard_log(self, output):
        """
        Scans the log of a "wget -S" run

        :param output:
            A list of the lines of the log

        :return:
            A tuple of (error, header_lines) where error is the last error
            message found, or None, and header_lines is a list of the
            response header lines with the leading whitespace removed
        """

        error = None
        header_lines = []

        for line in output:
            if self.skippable_line(line):
                continue

            # Check the resolving and connecting to lines for errors
            if re.match('(Resolving |Connecting to )', line):
                failed_match = re.search(' failed: (.*)$', line)
                if failed_match:
                    error = failed_match.group(1).strip()

            line_error = self._error_from_line(line)
            if line_error is not None:
                error = line_error

            # With the -S option, headers have two spaces before them
            if line.startswith('  '):
                header_lines.append(line.lstrip())

        return (error, header_lines)

    def skippable_line(self, line):
        """
        Determines if a debug line is skippable - usually because of extraneous
        or duplicate information.

        :param line:
            The debug line to check

        :return:
            True if the line is skippable, otherwise None
        """

        for pattern in self._SKIPPABLE_PATTERNS:
            if pattern.match(line):
                return True
        return None

    def parse_headers(self, output=None):
        """
        Parses HTTP headers into two dict objects

        Lines that are neither a valid status line nor a "name: value" header
        are ignored.

        :param output:
            An array of header lines, if None, loads from temp output file

        :return:
            A tuple of (general, headers) where general is a dict with the keys:
              `version` - HTTP version number (string)
              `status` - HTTP status code (integer)
              `message` - HTTP status message (string)
            And headers is a dict with the keys being lower-case version of the
            HTTP header names.
        """

        # Only load the file when no lines were passed in. An empty list means
        # the log was already read, and removed, without any headers in it.
        if output is None:
            with open_compat(self.tmp_file, 'r') as f:
                output = read_compat(f).splitlines()
            self.clean_tmp_file()

        # The defaults describe a response that came without a status line
        general = {
            'version': '0.9',
            'status':  200,
            'message': 'OK'
        }
        headers = {}
        for line in output:
            # When using the -S option, headers have two spaces before them,
            # additionally, valid headers won't have spaces, so this is always
            # a safe operation to perform
            line = line.lstrip()

            if line.startswith('HTTP/'):
                match = self._STATUS_LINE_PATTERN.match(line)
                if match is None:
                    continue
                # With redirects there are multiple status lines, the values
                # of the last one are kept
                general['version'] = match.group(1)
                general['status'] = int(match.group(2))
                general['message'] = match.group(3) or ''

            elif ':' in line:
                name, value = line.split(':', 1)
                headers[name.lower()] = value.strip()

        return (general, headers)