186 lines
5.2 KiB
Python
186 lines
5.2 KiB
Python
import sys
|
|
import re
|
|
import json
|
|
import hashlib
|
|
|
|
from ..console_write import console_write
|
|
|
|
|
|
class CachingDownloader(object):
    """
    A base downloader that will use a caching backend to cache HTTP requests
    and make conditional requests.

    This class does not set `self.settings` itself; the methods below read
    the following keys from it:

     - `cache`: optional object providing `get(key)`, `has(key)` and
       `set(key, value)`. When missing or falsy, caching is skipped.
     - `debug`: optional flag. When truthy, cache decisions are reported
       via `console_write()`.
    """

    def add_conditional_headers(self, url, headers):
        """
        Add `If-Modified-Since` and `If-None-Match` headers to a request if a
        cached copy exists

        :param url:
            The URL that is about to be requested

        :param headers:
            A dict with the request headers

        :return:
            The request headers dict, possibly with new headers added
        """

        cache = self.settings.get('cache')
        if not cache:
            return headers

        info_json = cache.get(self.generate_key(url, '.info'))
        if not info_json:
            return headers

        # Make sure we have the cached content to use if we get a 304
        if not cache.has(self.generate_key(url)):
            return headers

        try:
            info = json.loads(info_json.decode('utf-8'))
        except ValueError:
            # Corrupt metadata (bad UTF-8 or bad JSON): do a plain request
            return headers

        # Valid JSON that is not an object is just as unusable as corrupt
        # JSON, and would otherwise fail on the .get() calls below
        if not isinstance(info, dict):
            return headers

        etag = info.get('etag')
        if etag:
            headers['If-None-Match'] = etag

        last_modified = info.get('last-modified')
        if last_modified:
            headers['If-Modified-Since'] = last_modified

        return headers

    def cache_result(self, method, url, status, headers, content):
        """
        Processes a request result, either caching the result, or returning
        the cached version of the url.

        :param method:
            The HTTP method used for the request

        :param url:
            The url of the request

        :param status:
            The numeric response status of the request

        :param headers:
            A dict of response headers, with keys being lowercase

        :param content:
            The response content

        :return:
            The response content
        """

        debug = self.settings.get('debug', False)
        cache = self.settings.get('cache')

        if not cache:
            if debug:
                console_write(u"Skipping cache since there is no cache object", True)
            return content

        if method.lower() != 'get':
            if debug:
                console_write(u"Skipping cache since the HTTP method != GET", True)
            return content

        status = int(status)

        # Don't do anything unless it was successful or not modified
        if status not in (200, 304):
            if debug:
                console_write(u"Skipping cache since the HTTP status code not one of: 200, 304", True)
            return content

        key = self.generate_key(url)

        if status == 304:
            cached_content = cache.get(key)
            if cached_content:
                if debug:
                    console_write(u"Using cached content for %s" % url, True)
                return cached_content

            # If we got a 304, but did not have the cached content
            # stop here so we don't cache an empty response
            return content

        # If we got here, the status is 200

        # Respect some basic cache control headers. Directive names are
        # case-insensitive and may be surrounded by optional whitespace.
        cache_control = headers.get('cache-control') or ''
        directives = [field.strip().lower() for field in cache_control.split(',')]
        if 'no-store' in directives:
            return content

        # Don't ever cache zip/binary files for the sake of hard drive space.
        # Only the media type is compared, so a value carrying parameters
        # (e.g. "application/zip; charset=binary") is recognized as well.
        content_type = headers.get('content-type') or ''
        media_type = content_type.split(';')[0].strip().lower()
        if media_type in ('application/zip', 'application/octet-stream'):
            if debug:
                console_write(u"Skipping cache since the response is a zip file", True)
            return content

        etag = headers.get('etag')
        last_modified = headers.get('last-modified')

        # Without a validator there is nothing to build a conditional
        # request from later, so storing the content would be pointless
        if not etag and not last_modified:
            return content

        struct = {'etag': etag, 'last-modified': last_modified}
        struct_json = json.dumps(struct, indent=4)

        info_key = self.generate_key(url, '.info')
        if debug:
            console_write(u"Caching %s in %s" % (url, key), True)

        cache.set(info_key, struct_json.encode('utf-8'))
        cache.set(key, content)

        return content

    def generate_key(self, url, suffix=''):
        """
        Generates a key to store the cache under

        :param url:
            The URL being cached, either as text or as an already encoded
            byte string

        :param suffix:
            A string to append to the key

        :return:
            A string key for the URL: the hex MD5 digest of the UTF-8 encoded
            URL followed by the suffix
        """

        # hashlib needs bytes. Text (unicode on Python 2, str on Python 3) is
        # encoded; byte strings are hashed as-is instead of failing on
        # a missing .encode() method under Python 3.
        if not isinstance(url, bytes):
            url = url.encode('utf-8')

        key = hashlib.md5(url).hexdigest()
        return key + suffix

    def retrieve_cached(self, url):
        """
        Tries to return the cached content for a URL

        :param url:
            The URL to get the cached content for

        :return:
            The cached content, or False if no cache is configured or the
            cache has no entry for the URL
        """

        # Behave like the other methods when no cache backend is configured
        cache = self.settings.get('cache')
        if not cache:
            return False

        key = self.generate_key(url)
        if not cache.has(key):
            return False

        if self.settings.get('debug'):
            console_write(u"Using cached content for %s" % url, True)

        return cache.get(key)