[z3-checkins] r37695 - z3/deliverance/branches/cache_aware/deliverance
ltucker at codespeak.net
ltucker at codespeak.net
Wed Jan 31 22:53:07 CET 2007
Author: ltucker
Date: Wed Jan 31 22:53:04 2007
New Revision: 37695
Added:
z3/deliverance/branches/cache_aware/deliverance/cache_utils.py
z3/deliverance/branches/cache_aware/deliverance/resource_fetcher.py
Modified:
z3/deliverance/branches/cache_aware/deliverance/wsgimiddleware.py
Log:
primary changes to support caching
Added: z3/deliverance/branches/cache_aware/deliverance/cache_utils.py
==============================================================================
--- (empty file)
+++ z3/deliverance/branches/cache_aware/deliverance/cache_utils.py Wed Jan 31 22:53:04 2007
@@ -0,0 +1,487 @@
+import re
+from paste.response import header_value, replace_header
+from sets import Set
+
+
+"""
+utilities for fusing cache related HTTP headers from
+multiple sources
+
+XXX there is probably a good amount of work in here
+that Paste could simplify
+
+TODO:
+handle expires
+handle last-modified
+"""
+
+
def merge_cache_headers(self, response_info, new_headers):
    """
    overwrites the cache related headers (cache-control, etag, vary)
    in new_headers with values merged from every response recorded
    in response_info.

    self          -- not used by this function (the call sites in
                     this changeset pass the WSGI environ here)
    response_info -- map of uris to wsgi response triples; only the
                     header list (index 1) of each triple is read
    new_headers   -- wsgi-style header list, modified in place

    a header is only replaced when the corresponding merge produced
    a value. returns None.
    """
    headers_by_uri = dict([(uri, triple[1])
                           for uri, triple in response_info.items()])

    merged_cache_control = merge_cache_control(headers_by_uri.values())
    if merged_cache_control:
        replace_header(new_headers, 'cache-control', merged_cache_control)

    merged_etag = merge_etags_from_headers(headers_by_uri)
    if merged_etag is not None:
        replace_header(new_headers, 'etag', merged_etag)

    merged_vary = merge_vary_from_headers(headers_by_uri)
    if merged_vary is not None:
        replace_header(new_headers, 'vary', merged_vary)

    # XXX Expires
+
+
+
+
def merge_cache_control(header_sets):
    """
    computes a value for the cache-control header based on the
    values of the cache-control headers found in the list of
    wsgi-style response header lists.

    merging policy:
      public                       -- kept only if every response has it
      private, no-cache, no-store,
      no-transform, must-revalidate,
      proxy-revalidate             -- kept if any response has it
      max-age, s-maxage            -- minimum over all responses, and
                                      only if every response has it

    returns the merged directives flattened into a header value
    (an empty string when nothing was merged).

    the examples re-parse the result so that they do not depend on
    the order in which the directives are emitted:

    >>> headerses = []
    >>> headerses.append([ ('cache-control', "public, max-age = 10") ])
    >>> headerses.append([ ('cache-control', "public, max-age = 5") ])
    >>> headerses.append([ ('cache-control', "public, max-age = 2") ])
    >>> print_sorted_dict(parse_cache_directives(merge_cache_control(headerses)))
    {'max-age': '2', 'public': None}

    >>> headerses = []
    >>> headerses.append([ ('cache-control', "public, max-age = 10") ])
    >>> headerses.append([ ('cache-control', "private, max-age = 5") ])
    >>> headerses.append([ ('cache-control', "public, max-age = 2") ])
    >>> print_sorted_dict(parse_cache_directives(merge_cache_control(headerses)))
    {'max-age': '2', 'private': None}

    """
    parsed = [parse_cache_directives(header_value(headers, 'cache-control'))
              for headers in header_sets]

    # apply cache-control merging policies
    merged = {}
    merge_if_all('public', merged, parsed)
    for directive in ('private', 'no-cache', 'no-store', 'no-transform',
                      'must-revalidate', 'proxy-revalidate'):
        merge_if_any(directive, merged, parsed)
    # the shared-cache lifetime directive is spelled "s-maxage" in HTTP
    for directive in ('max-age', 's-maxage'):
        merge_minimum(directive, merged, parsed)

    return flatten_directive_map(merged)
+
+
def merge_etags_from_headers(headers_map):
    """
    builds a composite etag from a map of uris to wsgi-style
    header lists.

    every uri whose header list carries a non-empty etag header
    contributes that etag; the collected uri -> etag map is handed
    to merge_etags and its result is returned unchanged.
    """
    found = {}
    for uri, headers in headers_map.items():
        value = header_value(headers, 'etag')
        if value is None or len(value) == 0:
            # no usable etag for this resource
            continue
        found[uri] = value
    return merge_etags(found)
+
+
def merge_vary_from_headers(headers_map):
    """
    accepts a map from uris to wsgi-style header lists and returns
    the union of all field names listed in their vary headers as a
    single vary header value, or None if no field names were found.

    the field names are emitted in sorted order so the result does
    not depend on dict or set ordering, and the value is a plain
    comma separated list (not a quoted string). if any response
    varies on '*', the merged value is '*'.

    >>> d = {'a': [ ('Vary', 'foo, bar') ], 'b': [ ('Vary', 'bar, quux') ]}
    >>> merge_vary_from_headers(d)
    'bar, foo, quux'

    >>> d = {'a': [ ('Vary', 'foo') ], 'b': [ ('Vary', '*') ]}
    >>> merge_vary_from_headers(d)
    '*'

    >>> d = {}
    >>> v = merge_vary_from_headers(d)
    >>> v is None
    True

    """
    vary_fields = set()
    for headers in headers_map.values():
        vary_fields.update(parse_fieldname_list(header_value(headers, 'vary')))

    if not vary_fields:
        return None
    if '*' in vary_fields:
        # "*" already means "varies on everything"
        return '*'
    return ', '.join(sorted(vary_fields))
+
+
def parse_merged_etag(composite_tag):
    """
    given a composite etag computed by merge_etags,
    computes a map from resource identifiers to
    respective etags

    the expected layout is 'deliverance:' followed by comma
    separated (resource, length, etag) groups, where length is the
    number of characters in the etag that follows it.

    returns {} when the tag does not start with 'deliverance:',
    when a length is not a non-negative integer, when a length
    runs past the end of the tag, or when an etag is not followed
    by a comma or the end of the tag. if the tag simply ends in the
    middle of a group, the groups parsed so far are returned.

    >>> d = parse_merged_etag('deliverance:apple,15,some_apple_etag,orange,16,some_orange_etag')
    >>> print_sorted_dict(d)
    {'apple': 'some_apple_etag', 'orange': 'some_orange_etag'}


    >>> d = parse_merged_etag('some_raND0m_g0bbl7+yGook')
    >>> d
    {}

    >>> d = parse_merged_etag('deliverance:some_,99,ra,ND0m_g0,bb,l7+yGook')
    >>> d
    {}

    """
    prefix = 'deliverance:'
    if not composite_tag.startswith(prefix):
        return {}

    tags = {}
    rest = composite_tag[len(prefix):]
    while rest:
        # resource identifier
        sep = rest.find(',')
        if sep == -1:
            return tags
        resource, rest = rest[:sep], rest[sep + 1:]

        # declared length of the etag that follows
        sep = rest.find(',')
        if sep == -1:
            return tags
        length_field, rest = rest[:sep], rest[sep + 1:]
        try:
            tag_len = int(length_field)
        except ValueError:
            return {}

        # a negative length would silently slice from the wrong end
        if tag_len < 0 or len(rest) < tag_len:
            return {}
        # the etag must be followed by a separator or the end of input
        if len(rest) > tag_len and rest[tag_len] != ',':
            return {}

        tags[resource] = rest[:tag_len]
        rest = rest[tag_len + 1:]

    return tags
+
+
+
+#############
+# helpers
+#############
+
+
def pop_et_token(ctag):
    """
    splits ctag at its first comma and returns a tuple of the text
    before the comma and the text after it. when ctag contains no
    comma the result is (None, ctag).

    >>> pop_et_token("abc,def,ghi")
    ('abc', 'def,ghi')
    """
    pieces = ctag.split(',', 1)
    if len(pieces) < 2:
        # no separator at all
        return (None, ctag)
    return (pieces[0], pieces[1])
+
+
+
+
# no longer used by parse_header_list; kept in case other code
# refers to it by name
CSL_QUOTE_PAT = '".*?"'
def parse_header_list(hval):
    """
    split comma separated list into elements, ignoring quoted
    commas. each element is returned with surrounding whitespace
    stripped; quoted strings are kept verbatim (quotes included)
    and stay attached to the element they appeared in. inside a
    quoted string a backslash escapes the following character.
    eg:

    >>> parse_header_list('max-age = 10, public')
    ['max-age = 10', 'public']

    >>> parse_header_list('max-age = 10, public = "foo, bar"')
    ['max-age = 10', 'public = "foo, bar"']

    >>> parse_header_list('private = "a, b", no-cache = "c, d"')
    ['private = "a, b"', 'no-cache = "c, d"']

    >>> parse_header_list('public')
    ['public']
    """
    items = []
    current = []
    in_quotes = False
    escaped = False

    for ch in hval:
        if escaped:
            # character following a backslash inside quotes
            current.append(ch)
            escaped = False
        elif in_quotes and ch == '\\':
            current.append(ch)
            escaped = True
        elif ch == ',' and not in_quotes:
            # an unquoted comma ends the current element
            items.append(''.join(current).strip())
            current = []
        else:
            if ch == '"':
                in_quotes = not in_quotes
            current.append(ch)

    items.append(''.join(current).strip())
    return items
+
+
def parse_cache_directive(directive):
    """
    returns a tuple for the directive containing the name of
    the directive and its raw (unparsed) value, or None as the
    value when the directive has no '='.

    the name is stripped and lower-cased so that lookups by the
    lower-case directive names used elsewhere in this module match
    regardless of how the header was capitalised. the value is
    only stripped. eg:

    >>> parse_cache_directive('foo = 10')
    ('foo', '10')

    >>> parse_cache_directive('foo = "bar"')
    ('foo', '"bar"')

    >>> parse_cache_directive('foo = "bar, quux, baz"')
    ('foo', '"bar, quux, baz"')

    >>> parse_cache_directive("foo")
    ('foo', None)

    >>> parse_cache_directive("Max-Age=10")
    ('max-age', '10')
    """
    split = directive.find('=')
    if split == -1:
        return (directive.strip().lower(), None)

    name = directive[:split].strip().lower()
    value = directive[split + 1:].strip()
    return (name, value)
+
def parse_fieldname_list(val):
    """
    parses directive value(s) into a list of lower-cased field
    names. surrounding whitespace and one pair of enclosing double
    quotes are removed first; empty names (eg. from 'foo,,bar')
    are dropped. None yields an empty list. eg:

    >>> parse_fieldname_list('foo')
    ['foo']

    >>> parse_fieldname_list('"foo"')
    ['foo']

    >>> parse_fieldname_list('"foo, bar,quux"')
    ['foo', 'bar', 'quux']

    >>> parse_fieldname_list(' "Foo, bar" ')
    ['foo', 'bar']

    >>> parse_fieldname_list('""')
    []

    >>> parse_fieldname_list(None)
    []
    """
    if val is None:
        return []

    # strip first, otherwise padding hides the enclosing quotes
    val = val.strip()
    if val.startswith('"'):
        val = val[1:]
    if val.endswith('"'):
        val = val[:-1]

    names = [name.strip().lower() for name in val.split(',')]
    return [name for name in names if name]
+
+
def parse_cache_directives(hval):
    """
    turns a cache-control header value into a dict that maps each
    directive name to its raw value (None for directives without a
    value). a header value of None gives an empty dict. when a
    directive is repeated the last occurrence wins.

    >>> print_sorted_dict(parse_cache_directives('max-age = 10, public'))
    {'max-age': '10', 'public': None}

    >>> print_sorted_dict(parse_cache_directives('max-age = 10, public = "foo, bar"'))
    {'max-age': '10', 'public': '"foo, bar"'}
    """
    if hval is None:
        return {}

    # each element parses to a (name, value) pair
    pairs = [parse_cache_directive(element)
             for element in parse_header_list(hval)]
    return dict(pairs)
+
def merge_expire_header(cc, headers):
    """
    placeholder, not implemented yet: intended to reformulate any
    expire header found in headers as an equivalent cache-control
    directive placed in cc.

    currently does nothing with either argument and returns None.
    """
    return None
+
def merge_etags(etag_map):
    """
    given a map of resource identifiers to etags,
    computes a composite etag of the form

      deliverance:<resource>,<len(etag)>,<etag>,<resource>,...

    resources are emitted in sorted order, so the same map always
    produces the same composite etag regardless of dict ordering.
    returns None if etag_map is None or empty.

    NOTE(review): a resource identifier that itself contains a
    comma cannot be recovered by parse_merged_etag, which splits
    the identifier off at the first comma.

    >>> d = {'apple': 'some_apple_etag', 'orange': 'some_orange_etag'}
    >>> merge_etags(d)
    'deliverance:apple,15,some_apple_etag,orange,16,some_orange_etag'
    """
    if not etag_map:
        return None

    parts = []
    for resource in sorted(etag_map):
        etag = etag_map[resource]
        parts.append("%s,%d,%s" % (resource, len(etag), etag))
    return "deliverance:" + ",".join(parts)
+
+
+
def merge_if_all(directive, newcc, cc):
    """
    puts the directive given in the new cache-control
    directives newcc (with value None) if the directive appears
    in all sets of directives cc. newcc is modified in place and
    nothing is returned.

    if cc is empty nothing is added: with no responses at all
    there is nothing to justify the directive.

    expects cc is a list of dicts of the form produced by
    parse_cache_directives
    eg:

    >>> d = dict()
    >>> ccs = [{'public': None, 'max-age': '10'}, {'public': None, 'max-age': '20'}]
    >>> merge_if_all('public',d,ccs)
    >>> d
    {'public': None}

    >>> d = dict()
    >>> ccs = [{'public': None, 'max-age': '10'}, {'max-age': '20'}]
    >>> merge_if_all('public', d, ccs)
    >>> d
    {}

    >>> d = dict()
    >>> merge_if_all('public', d, [])
    >>> d
    {}
    """
    if not cc:
        return
    for c in cc:
        if directive not in c:
            return
    newcc[directive] = None
+
def merge_if_any(directive, newcc, cc):
    """
    puts the directive given in the new cache-control
    directives newcc if the directive appears in any of
    the sets of directives cc. merges any fieldname
    lists that appear in cc for the directive. if any
    instance has no fieldnames, no fieldnames are used
    in the output.

    merged fieldnames are written as one double-quoted, comma
    separated list in sorted order, so the result does not depend
    on set ordering. newcc is modified in place and nothing is
    returned.

    expects cc is a list of dicts of the form produced by
    parse_cache_directives

    >>> d = dict()
    >>> ccs = [{'private': None, 'max-age': '10'}, {'max-age': '20'}]
    >>> merge_if_any('private', d, ccs)
    >>> d
    {'private': None}

    >>> d = dict()
    >>> ccs = [{'private': '"foo, bar"', 'max-age': '10'}, {'max-age': '9'}, {'private': '"quux, bar"'}]
    >>> merge_if_any('private', d, ccs)
    >>> d
    {'private': '"bar, foo, quux"'}

    >>> d = dict()
    >>> ccs = [{'private': '"foo, bar"', 'max-age': '10'}, {'max-age': '20'}, {'private': None}]
    >>> merge_if_any('private', d, ccs)
    >>> d
    {'private': None}

    """
    present = False
    unqualified = False   # some instance carried no fieldname list
    fields = set()

    for c in cc:
        if directive not in c:
            continue
        present = True
        raw = c[directive]
        if raw is None:
            unqualified = True
        elif not unqualified:
            fields.update(parse_fieldname_list(raw))

    if not present:
        return

    if fields and not unqualified:
        newcc[directive] = '"' + ', '.join(sorted(fields)) + '"'
    else:
        newcc[directive] = None
+
def merge_minimum(directive, newcc, cc):
    """
    puts the minimum value specified for the directive
    among all instances of the directive in the set cc
    into the dict newcc (as a string).

    nothing is placed in newcc if cc is empty, if the directive
    does not appear in a particular set, or if any of its values
    is missing or is not an integer. newcc is modified in place
    and nothing is returned.

    expects cc is a list of dicts of the form produced by
    parse_cache_directives

    >>> d = dict()
    >>> ccs = [{'max-age': '10'}, {'max-age': '20'} ]
    >>> merge_minimum('max-age', d, ccs)
    >>> d
    {'max-age': '10'}

    >>> d = dict()
    >>> ccs = [{'max-age': '10'}, {'smax-age': '20'} ]
    >>> merge_minimum('max-age', d, ccs)
    >>> d
    {}

    >>> d = dict()
    >>> ccs = [{'max-age': '10'}, {'max-age': 'soon'} ]
    >>> merge_minimum('max-age', d, ccs)
    >>> d
    {}
    """
    if not cc:
        return

    values = []
    for c in cc:
        if directive not in c:
            return
        try:
            values.append(int(c[directive]))
        except (TypeError, ValueError):
            # valueless or non-numeric: there is no minimum to
            # compute, so leave the directive out rather than fail
            return

    newcc[directive] = str(min(values))
+
def flatten_directive_map(d):
    """
    turns a map of directive -> raw value back into the comma
    separated list form used as the value of the cache-control
    header.

    directives are emitted in the dict's own key order. a
    directive whose value is empty or None is written as the bare
    name, any other as 'name = value'. an empty map gives ''.

    NOTE(review): the spaces around '=' are kept because doctests
    elsewhere in this module expect them -- confirm that consumers
    of the header accept this spelling.
    """
    pieces = []
    for name in d.keys():
        value = d[name]
        if value:
            pieces.append(name + ' = %s' % value)
        else:
            pieces.append(name)
    return ', '.join(pieces)
+
+
+#########################
+# just test support
+#########################
+
def print_sorted_dict(d):
    """
    prints d in dict-literal form with its keys in sorted order,
    so that doctests can show a dict without depending on dict
    ordering. keys and values are shown using their repr.
    returns None.
    """
    items = ["%s: %s" % (repr(k), repr(d[k])) for k in sorted(d)]
    # single-argument call form works as statement and as function
    print('{' + ', '.join(items) + '}')
+
+
def _test():
    """runs the doctests embedded in this module's docstrings."""
    from doctest import testmod
    testmod()


if __name__ == "__main__":
    _test()
Added: z3/deliverance/branches/cache_aware/deliverance/resource_fetcher.py
==============================================================================
--- (empty file)
+++ z3/deliverance/branches/cache_aware/deliverance/resource_fetcher.py Wed Jan 31 22:53:04 2007
@@ -0,0 +1,128 @@
+import deliverance.wsgimiddleware
+from StringIO import StringIO
+from paste.wsgilib import intercept_output
+from paste.proxy import TransparentProxy
+from paste.request import construct_url
+from paste.response import header_value
+import urlparse
+from deliverance.utils import DeliveranceError
+
+
class InternalResourceFetcher(object):
    """
    fetches a resource by issuing a request against a WSGI
    application in-process.

    the request environ is built in __init__ and exposed as
    self.environ so callers can adjust it (eg. add validation
    headers) before calling wsgi_get() or get().
    """

    def __init__(self, in_environ, uri, app, headers_only=False):
        """
        in_environ   -- WSGI environ of the request being handled;
                        it is copied, never modified
        uri          -- path of the resource; a leading '/' is
                        added if missing
        app          -- WSGI application used by wsgi_get when
                        paste.recursive is not in play
        headers_only -- issue a HEAD request instead of a GET
        """
        if not uri.startswith('/'):
            uri = '/' + uri
        self.uri = uri
        self.app = app

        if 'paste.recursive.include' in in_environ:
            self.environ = in_environ['paste.recursive.include'].original_environ.copy()
        else:
            self.environ = in_environ.copy()

        # use the normalized path, not the raw argument
        self.environ['PATH_INFO'] = self.uri

        base_url = in_environ.get('deliverance.base-url')
        if base_url is not None:
            # SCRIPT_NAME is the path component of the base url
            self.environ['SCRIPT_NAME'] = urlparse.urlparse(base_url)[2]
        else:
            self.environ['SCRIPT_NAME'] = ''

        if headers_only:
            self.environ['REQUEST_METHOD'] = 'HEAD'
        else:
            self.environ['REQUEST_METHOD'] = 'GET'

        # the sub-request carries no body
        self.environ['CONTENT_LENGTH'] = '0'
        self.environ['wsgi.input'] = StringIO('')
        self.environ['CONTENT_TYPE'] = ''
        # the middleware passes requests whose query string contains
        # 'notheme' straight through to the wrapped application
        self.environ['QUERY_STRING'] = 'notheme'

        if 'HTTP_ACCEPT_ENCODING' in self.environ:
            self.environ['HTTP_ACCEPT_ENCODING'] = ''

    def wsgi_get(self):
        """
        performs the request described by self.environ and returns
        a (status, headers, body) triple.
        """
        print("Internal Resource get: %s" % self.uri)
        if 'paste.recursive.include' in self.environ:
            print("Doing paste.recursive.include")
            # Try to do the redirect this way...
            includer = self.environ['paste.recursive.include']
            res = includer(self.uri, self.environ)
            return (res.status, res.headers, res.body)
        else:
            print("Doing intercept")
            return intercept_output(self.environ, self.app)

    def get(self):
        """
        performs the request and returns the response body.
        raises DeliveranceError if the status is not 200.
        """
        path_info = self.environ['PATH_INFO']
        status, headers, body = self.wsgi_get()

        if not status.startswith('200'):
            loc = header_value(headers, 'location')
            if loc:
                loc = ' location=%r' % loc
            else:
                loc = ''
            raise DeliveranceError(
                "Request for internal resource at %s (%r) failed with status code %r%s"
                % (construct_url(self.environ), path_info, status,
                   loc))
        return body
+
+
class ExternalResourceFetcher(object):
    """
    fetches a resource from an absolute url by sending a freshly
    built WSGI environ through paste's TransparentProxy.

    the request environ is built in __init__ and exposed as
    self.environ so callers can adjust it (eg. add validation
    headers) before calling wsgi_get() or get().
    """

    def __init__(self, uri, headers_only=False):
        """
        uri          -- absolute url of the resource
        headers_only -- issue a HEAD request instead of a GET
        """
        self.uri = uri

        # (scheme, netloc, path, query, fragment)
        loc = urlparse.urlsplit(uri)

        self.environ = {}

        if headers_only:
            self.environ['REQUEST_METHOD'] = 'HEAD'
        else:
            self.environ['REQUEST_METHOD'] = 'GET'

        # the request carries no body
        self.environ['CONTENT_LENGTH'] = '0'
        self.environ['wsgi.input'] = StringIO('')

        self.environ['wsgi.url_scheme'] = loc[0]
        self.environ['wsgi.version'] = (1, 0)
        self.environ['HTTP_HOST'] = loc[1]
        # a url without a path still needs one to request
        self.environ['PATH_INFO'] = loc[2] or '/'
        self.environ['QUERY_STRING'] = loc[3]

        self.environ['SCRIPT_NAME'] = ''
        # historical misspelling of SCRIPT_NAME, kept in case
        # anything reads it
        self.environ['SCRIPT_INFO'] = ''

        # NOTE: SERVER_NAME / SERVER_PORT are not set here

    def wsgi_get(self):
        """
        performs the request described by self.environ and returns
        a (status, headers, body) triple.
        """
        print("External Resource get: %s" % self.uri)
        proxy_app = TransparentProxy()
        return intercept_output(self.environ, proxy_app)

    def get(self):
        """
        performs the request and returns the response body.
        raises DeliveranceError if the status is not 200.
        """
        status, headers, body = self.wsgi_get()

        if not status.startswith('200'):
            loc = header_value(headers, 'location')
            if loc:
                loc = ' location=%r' % loc
            else:
                loc = ''
            raise DeliveranceError(
                "Request for external resource at %s failed with status code %r%s"
                % (construct_url(self.environ), status,
                   loc))

        return body
Modified: z3/deliverance/branches/cache_aware/deliverance/wsgimiddleware.py
==============================================================================
--- z3/deliverance/branches/cache_aware/deliverance/wsgimiddleware.py (original)
+++ z3/deliverance/branches/cache_aware/deliverance/wsgimiddleware.py Wed Jan 31 22:53:04 2007
@@ -13,14 +13,22 @@
from htmlserialize import tostring
from deliverance.utils import DeliveranceError
from deliverance.utils import DELIVERANCE_ERROR_PAGE
+from deliverance.resource_fetcher import InternalResourceFetcher, ExternalResourceFetcher
+from deliverance import cache_utils
import sys
import datetime
import threading
import traceback
from StringIO import StringIO
+from sets import Set
DELIVERANCE_BASE_URL = 'deliverance.base-url'
+DELIVERANCE_CACHE = 'deliverance.cache'
+IGNORE_EXTENSIONS = ['js','css','gif','jpg','jpeg','pdf','ps','doc','png','ico','mov','mpg','mpeg', 'mp3','m4a',
+ 'txt','rtf']
+
+IGNORE_URL_PATTERN = re.compile("^.*\.(%s)$" % '|'.join(IGNORE_EXTENSIONS))
class DeliveranceMiddleware(object):
"""
@@ -58,7 +66,7 @@
else:
self._rendererType = renderer
- def get_renderer(self,environ):
+ def get_renderer(self, environ):
"""
retrieve the deliverance Renderer representing the transformation this
middlware represents. Renderer may change according to caching rules.
@@ -72,7 +80,7 @@
finally:
self._lock.release()
- def create_renderer(self,environ):
+ def create_renderer(self, environ):
"""
construct a new deliverance Renderer from the
information passed to the initializer. A new copy
@@ -85,7 +93,7 @@
self.theme_uri)
def reference_resolver(href, parse, encoding=None):
- text = self.get_resource(environ,href)
+ text = self.get_resource(environ, href)
if parse == "xml":
return etree.XML(text)
if parse == "html":
@@ -132,7 +140,7 @@
initializer.
"""
try:
- return self.get_resource(environ,self.rule_uri)
+ return self.get_resource(environ, self.rule_uri)
except Exception, message:
newmessage = "Unable to retrieve rules from " + self.rule_uri
if message:
@@ -146,7 +154,7 @@
initializer.
"""
try:
- return self.get_resource(environ,self.theme_uri)
+ return self.get_resource(environ, self.theme_uri)
except Exception, message:
newmessage = "Unable to retrieve theme page from " + self.theme_uri
if message:
@@ -165,27 +173,37 @@
try:
qs = environ.get('QUERY_STRING', '')
environ[DELIVERANCE_BASE_URL] = construct_url(environ, with_path_info=False, with_query_string=False)
+ environ[DELIVERANCE_CACHE] = {}
notheme = 'notheme' in qs
if notheme:
return self.app(environ, start_response)
- if 'HTTP_ACCEPT_ENCODING' in environ:
- del environ['HTTP_ACCEPT_ENCODING']
- status, headers, body = intercept_output(
- environ, self.app,
- self.should_intercept,
- start_response)
+ # unsupported
+ if 'HTTP_ACCEPT_ENCODING' in environ:
+ environ['HTTP_ACCEPT_ENCODING'] = ''
+ if 'HTTP_IF_MATCH' in environ:
+ environ['HTTP_IF_MATCH'] = ''
+ if 'HTTP_IF_UNMODIFIED_SINCE' in environ:
+ environ['HTTP_IF_UNMODIFIED_SINCE'] = ''
+
+ status, headers, body = self.rebuild_check(environ, start_response)
- # ignore non-html responses
+ # non-html responses, or rebuild is not necessary: bail out
if status is None:
return body
- # don't theme html snippets
- if self.hasHTMLTag(body):
- body = self.filter_body(environ, body)
+ # perform actual themeing
+ print "Doing themeing"
+
+ body = self.filter_body(environ, body)
replace_header(headers, 'content-length', str(len(body)))
replace_header(headers, 'content-type', 'text/html; charset=utf-8')
+
+ cache_utils.merge_cache_headers(environ,
+ environ[DELIVERANCE_CACHE],
+ headers)
+
start_response(status, headers)
return [body]
@@ -209,7 +227,7 @@
"""
type = header_value(headers, 'content-type')
if type is None:
- return False
+ return True # yerg, 304s can have no content-type
return type.startswith('text/html') or type.startswith('application/xhtml+xml')
def filter_body(self, environ, body):
@@ -220,76 +238,133 @@
content = self.get_renderer(environ).render(parseHTML(body))
return tostring(content)
- def get_resource(self, environ, uri):
- """
- retrieve the data referred to by the uri given.
- """
- internalBaseURL = environ.get(DELIVERANCE_BASE_URL,None)
- uri = urlparse.urljoin(internalBaseURL, uri)
-
- if internalBaseURL and uri.startswith(internalBaseURL):
- return self.get_internal_resource(environ, uri[len(internalBaseURL):])
- else:
- return self.get_external_resource(uri)
- def relative_uri(self, uri):
- """
- returns true if uri is relative, false if
- the uri is absolute.
- """
- if re.search(r'^[a-zA-Z]+:', uri):
- return False
- else:
- return True
+ def rebuild_check(self, environ, start_response):
+ print "===== rebuild check ====="
+ # perform the request for content
- def get_external_resource(self, uri):
- """
- get the data referred to by the uri given
- using urllib (not through the wrapped app)
- """
- f = urllib.urlopen(uri)
- content = f.read()
- f.close()
- return content
+ content_url = construct_url(environ)
- def get_internal_resource(self, in_environ, uri):
- """
- get the data referred to by the uri given
- by using the wrapped WSGI application
- """
+ status, headers, body = intercept_output(environ, self.app,
+ self.should_intercept,
+ start_response)
+
+
+ if status is None:
+ # should_intercept says this isn't HTML, we're done
+ print "ignore non-html: %s" % construct_url(environ)
+ return (None, None, body)
+ if self.should_ignore_url(content_url):
+ print "ignore blacklisted url: %s" % construct_url(environ)
+ start_response(status, headers)
+ return (None, None, [body])
+
+ # cache the response so we can look at its headers later
+ environ[DELIVERANCE_CACHE][content_url] = (status, headers, body)
- if 'paste.recursive.include' in in_environ:
- environ = in_environ['paste.recursive.include'].original_environ.copy()
- else:
- environ = in_environ.copy()
+ # it was modified or an error, give it back for themeing
+ if not status.startswith('304'):
+ print "Content %s modified, continue..." % content_url
+
+ # if it's not a full HTML page, skip it
+ if not self.hasHTMLTag(body):
+ print "ignore non-html-tagged: %s" % construct_url(environ)
+ start_response(status, headers)
+ return (None, None, [body])
+
+ # send it back for rebuild
+ return (status, headers, body)
- if not uri.startswith('/'):
- uri = '/' + uri
- environ['PATH_INFO'] = uri
- environ['SCRIPT_NAME'] = in_environ[DELIVERANCE_BASE_URL]
- environ['REQUEST_METHOD'] = 'GET'
- environ['CONTENT_LENGTH'] = '0'
- environ['wsgi.input'] = StringIO('')
- environ['CONTENT_TYPE'] = ''
- if environ['QUERY_STRING']:
- environ['QUERY_STRING'] += '¬heme'
- else:
- environ['QUERY_STRING'] = 'notheme'
+ # got 304 Not Modified for content, check other resources
+ rules = etree.XML(self.rule(environ))
+ resources = self.get_resource_uris(rules)
+ if self.any_modified(environ, resources):
+ # something changed,
+ # get the content explicitly and give it back
+ print "explicitly requesting %s" % construct_url(environ)
+ if 'HTTP_IF_MODIFIED_SINCE' in environ:
+ environ['HTTP_IF_MODIFIED_SINCE'] = ''
+ if 'HTTP_IF_NONE_MATCH' in environ:
+ environ['HTTP_IF_NONE_MATCH'] = ''
+ environ['CACHE-CONTROL'] = 'no-cache'
+
+ status, headers, body = intercept_output(environ, self.app)
+
+ if not self.hasHTMLTag(body):
+ # XXX yarg, we didn't care about it!
+ print "ARG ignore non-html: status: %s, %s" % (status, construct_url(environ))
+ #print "Environ: " , environ , " Headers: ", headers
+ start_response(status, headers)
+ return (None, None, [body])
+
+ environ[DELIVERANCE_CACHE][content_url] = (status, headers, body)
+ return (status, headers, body)
+
+ # nothing was modified, give back a 304
+ print "giving back 304: %s" % construct_url(environ)
+ cache_utils.merge_cache_headers(environ,
+ environ[DELIVERANCE_CACHE],
+ headers)
+ start_response('304 Not Modified', headers)
- if 'HTTP_ACCEPT_ENCODING' in environ:
- environ['HTTP_ACCEPT_ENCODING'] = ''
+ return (None,None,[])
+
+ def any_modified(self, environ, resources):
+ """
+ returns a tuple containing a boolean and map of uris to HTTP response headers.
+ the first value represents whether any resource in resources has been
+ modified based on the checks contained in environ. The uris in the list
+ resources are associated with their respective response headers in the
+ second element of the tuple.
+ """
+
+ print "====== rebuild check ======"
+ moddate = None
+ etag_map = {}
+
+ if 'HTTP_IF_MODIFIED_SINCE' in environ:
+ print "using modification date: %s" % environ['HTTP_IF_MODIFIED_SINCE']
+ moddate = environ['HTTP_IF_MODIFIED_SINCE']
+ if 'HTTP_IF_NONE_MATCH' in environ:
+ print "using composite etag: %s" % environ['HTTP_IF_NONE_MATCH']
+ etag_map = cache_utils.parse_merged_etag(environ['HTTP_IF_NONE_MATCH'])
+
+ for uri in resources:
+ if (self.check_modification(environ, uri,
+ moddate,
+ etag_map.get(uri,None))):
+ return True
- if 'paste.recursive.include' in in_environ:
- # Try to do the redirect this way...
- includer = in_environ['paste.recursive.include']
- res = includer(uri,environ)
- return res.body
+ return False
- path_info = environ['PATH_INFO']
- status, headers, body = intercept_output(environ, self.app)
- if not status.startswith('200'):
+ def get_resource(self, environ, uri):
+ """
+ retrieve the content from the uri given,
+ uses cache if possible. throws exception if
+ response is not 200
+ """
+ if uri in environ[DELIVERANCE_CACHE]:
+ response = environ[DELIVERANCE_CACHE][uri]
+ if response[0].startswith('200'):
+ print "using previously fetched content for %s" % uri
+ return response[2]
+
+ print "fetching resource from scratch: %s" % uri
+ fetcher = self.get_fetcher(environ, uri)
+
+ # eliminate validation headers, we want the content
+ if 'HTTP_IF_MODIFIED_SINCE' in fetcher.environ:
+ fetcher.environ['HTTP_IF_MODIFIED_SINCE'] = ''
+ if 'HTTP_IF_NONE_MATCH' in fetcher.environ:
+ fetcher.environ['HTTP_IF_NONE_MATCH'] = ''
+ fetcher.environ['CACHE-CONTROL'] = 'no-cache'
+
+ status, headers, body = fetcher.wsgi_get()
+
+ if not status.startswith('200'):
+ path_info = uri
loc = header_value(headers, 'location')
if loc:
loc = ' location=%r' % loc
@@ -299,7 +374,79 @@
"Request for internal resource at %s (%r) failed with status code %r%s"
% (construct_url(environ), path_info, status,
loc))
+
+ environ[DELIVERANCE_CACHE][uri] = (status, headers, body)
+
return body
+
+
+ def get_fetcher(self, environ, uri):
+ internalBaseURL = environ.get(DELIVERANCE_BASE_URL,None)
+ uri = urlparse.urljoin(internalBaseURL, uri)
+
+ if internalBaseURL and uri.startswith(internalBaseURL):
+ return InternalResourceFetcher(environ, uri[len(internalBaseURL):],
+ self.app)
+ else:
+ return ExternalResourceFetcher(uri)
+
+
+ def get_resource_uris(self, rules):
+ """
+ retrieves a list of uris pointing to the resources that
+ are components of rendering (excluding content)
+ """
+ resources = Set()
+ resources.add(self.rule_uri)
+ resources.add(self.theme_uri)
+
+ for rule in rules:
+ href = rule.get("href",None)
+ if href is not None:
+ resources.add(href)
+
+ return list(resources)
+
+
+ def check_modification(self, environ, uri, httpdate_since=None, etag=None):
+ """
+ if httpdate_since is set to an httpdate the If-Modified-Since HTTP header
+ is used to check for modification
+
+ if etag is set to an etag for the resource, the If-None-Match HTTP header
+ is used to check for modification
+
+ """
+
+ print "[!] Checking modification for: [%s] w/ [%s,%s]" % (uri, httpdate_since, etag)
+
+ fetcher = self.get_fetcher(environ, uri)
+
+ if httpdate_since:
+ fetcher.environ['HTTP_IF_MODIFIED_SINCE'] = httpdate_since
+ else:
+ fetcher.environ['HTTP_IF_MODIFIED_SINCE'] = ''
+
+
+ if etag:
+ fetcher.environ['HTTP_IF_NONE_MATCH'] = etag
+ else:
+ fetcher.environ['HTTP_IF_NONE_MATCH'] = ''
+
+
+ status, headers, body = fetcher.wsgi_get()
+ environ[DELIVERANCE_CACHE][uri] = (status, headers, body)
+
+ print "status was: [%s]" % status
+ if not (status.startswith('200') or status.startswith('304')):
+ print "status(%s), environ => %s, headers => %s" % (status, fetcher.environ, headers)
+
+ if status.startswith('304'): # Not Modified
+ return False
+
+ return True
+
+
HTML_DOC_PAT = re.compile(r"^.*<\s*html(\s*|>).*$",re.I|re.M)
def hasHTMLTag(self, body):
@@ -311,6 +458,11 @@
"""
return self.HTML_DOC_PAT.search(body) is not None
+
+ def should_ignore_url(self, url):
+ # blacklisting can happen here as well
+ return re.match(IGNORE_URL_PATTERN, url) is not None
+
def make_filter(app, global_conf,
theme_uri=None, rule_uri=None):
assert theme_uri is not None, (
More information about the z3-checkins
mailing list