186 lines
		
	
	
		
			5.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			186 lines
		
	
	
		
			5.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import sys
 | |
| import re
 | |
| import json
 | |
| import hashlib
 | |
| 
 | |
| from ..console_write import console_write
 | |
| 
 | |
| 
 | |
class CachingDownloader(object):
    """
    A base downloader that will use a caching backend to cache HTTP requests
    and make conditional requests.

    The class reads `self.settings`, a dict-like object. The optional
    `cache` entry must provide `has(key)`, `get(key)` and `set(key, value)`;
    the optional `debug` entry enables console messages.
    """

    def add_conditional_headers(self, url, headers):
        """
        Add `If-Modified-Since` and `If-None-Match` headers to a request if a
        cached copy exists

        :param url:
            The URL that is about to be requested

        :param headers:
            A dict with the request headers

        :return:
            The request headers dict, possibly with new headers added
        """

        cache = self.settings.get('cache')
        if not cache:
            return headers

        info_json = cache.get(self.generate_key(url, '.info'))
        if not info_json:
            return headers

        # Make sure we have the cached content to use if we get a 304
        if not cache.has(self.generate_key(url)):
            return headers

        try:
            # UnicodeDecodeError is a subclass of ValueError, so both a bad
            # encoding and malformed JSON are handled here
            info = json.loads(info_json.decode('utf-8'))
        except ValueError:
            return headers

        # A corrupt entry may decode to valid JSON that is not an object
        if not isinstance(info, dict):
            return headers

        etag = info.get('etag')
        if etag:
            headers['If-None-Match'] = etag

        last_modified = info.get('last-modified')
        if last_modified:
            headers['If-Modified-Since'] = last_modified

        return headers

    def cache_result(self, method, url, status, headers, content):
        """
        Processes a request result, either caching the result, or returning
        the cached version of the url.

        :param method:
            The HTTP method used for the request

        :param url:
            The url of the request

        :param status:
            The numeric response status of the request

        :param headers:
            A dict of response headers, with keys being lowercase

        :param content:
            The response content

        :return:
            The response content, or the cached content when the status is
            304 and a cached copy exists
        """

        debug = self.settings.get('debug', False)

        cache = self.settings.get('cache')
        if not cache:
            if debug:
                console_write(u"Skipping cache since there is no cache object", True)
            return content

        if method.lower() != 'get':
            if debug:
                console_write(u"Skipping cache since the HTTP method != GET", True)
            return content

        status = int(status)

        # Don't do anything unless it was successful or not modified
        if status not in [200, 304]:
            if debug:
                console_write(u"Skipping cache since the HTTP status code not one of: 200, 304", True)
            return content

        key = self.generate_key(url)

        if status == 304:
            cached_content = cache.get(key)
            if cached_content:
                if debug:
                    console_write(u"Using cached content for %s" % url, True)
                return cached_content

            # If we got a 304, but did not have the cached content
            # stop here so we don't cache an empty response
            return content

        # If we got here, the status is 200

        # Respect some basic cache control headers. Directive names are
        # case-insensitive and may be surrounded by whitespace.
        cache_control = headers.get('cache-control') or ''
        directives = [field.strip().lower() for field in cache_control.split(',')]
        if 'no-store' in directives:
            if debug:
                console_write(u"Skipping cache since the response specified no-store", True)
            return content

        # Don't ever cache zip/binary files for the sake of hard drive space.
        # Only the media type is compared, so parameters such as a charset
        # do not defeat the check.
        content_type = (headers.get('content-type') or '').split(';')[0].strip().lower()
        if content_type in ['application/zip', 'application/octet-stream']:
            if debug:
                console_write(u"Skipping cache since the response is a zip file", True)
            return content

        etag = headers.get('etag')
        last_modified = headers.get('last-modified')

        # Without a validator a conditional request can never be made, so
        # there is no point in storing the response
        if not etag and not last_modified:
            if debug:
                console_write(u"Skipping cache since the response has no etag or last-modified header", True)
            return content

        struct = {'etag': etag, 'last-modified': last_modified}
        struct_json = json.dumps(struct, indent=4)

        info_key = self.generate_key(url, '.info')
        if debug:
            console_write(u"Caching %s in %s" % (url, key), True)

        cache.set(info_key, struct_json.encode('utf-8'))
        cache.set(key, content)

        return content

    def generate_key(self, url, suffix=''):
        """
        Generates a key to store the cache under

        :param url:
            The URL being cached, as either a unicode or a byte string

        :param suffix:
            A string to append to the key

        :return:
            A string key for the URL
        """

        # hashlib needs bytes. On Python 2 `bytes` is `str`, so only unicode
        # values are encoded; on Python 3 only `str` values are encoded.
        if not isinstance(url, bytes):
            url = url.encode('utf-8')

        return hashlib.md5(url).hexdigest() + suffix

    def retrieve_cached(self, url):
        """
        Tries to return the cached content for a URL

        :param url:
            The URL to get the cached content for

        :return:
            The cached content, or False if there is no cache object or the
            URL has not been cached
        """

        cache = self.settings.get('cache')
        if not cache:
            return False

        key = self.generate_key(url)
        if not cache.has(key):
            return False

        if self.settings.get('debug'):
            console_write(u"Using cached content for %s" % url, True)

        return cache.get(key)
 |