[z3-checkins] r37695 - z3/deliverance/branches/cache_aware/deliverance

ltucker at codespeak.net ltucker at codespeak.net
Wed Jan 31 22:53:07 CET 2007


Author: ltucker
Date: Wed Jan 31 22:53:04 2007
New Revision: 37695

Added:
   z3/deliverance/branches/cache_aware/deliverance/cache_utils.py
   z3/deliverance/branches/cache_aware/deliverance/resource_fetcher.py
Modified:
   z3/deliverance/branches/cache_aware/deliverance/wsgimiddleware.py
Log:
primary changes to support caching

Added: z3/deliverance/branches/cache_aware/deliverance/cache_utils.py
==============================================================================
--- (empty file)
+++ z3/deliverance/branches/cache_aware/deliverance/cache_utils.py	Wed Jan 31 22:53:04 2007
@@ -0,0 +1,487 @@
+import re
+from paste.response import header_value, replace_header
+from sets import Set
+
+
+"""
+utilities for fusing cache related HTTP headers from 
+multiple sources 
+
+XXX there is probably a good amount of work in here 
+that Paste could simplify 
+
+TODO: 
+handle expires 
+handle last-modified
+"""
+
+
+def merge_cache_headers(self, response_info, new_headers): 
+    """
+    replaces cache related headers in new_headers 
+    with caching info calculated from response_info
+    (a map of urls to wsgi response triples) 
+    """
+
+    cache_info = {}
+    for uri, response in response_info.items(): 
+        cache_info[uri] = response[1]
+
+    cache_control = merge_cache_control(cache_info.values())
+    if cache_control:         
+        replace_header(new_headers, 'cache-control', cache_control)
+
+    etag = merge_etags_from_headers(cache_info)
+    if etag is not None: 
+        replace_header(new_headers, 'etag', etag )
+
+    vary = merge_vary_from_headers(cache_info)
+    if vary is not None: 
+        replace_header(new_headers, 'vary', vary)
+
+    # XXX Expires 
+
+
+
+
+def merge_cache_control(header_sets): 
+    """
+    computes a value for the cache-control header based on the 
+    values of the cache-control headers found in the list of 
+    wsgi-style response header lists. 
+
+    >>> headerses = []
+    >>> headerses.append([ ('cache-control', "public, max-age = 10") ])
+    >>> headerses.append([ ('cache-control', "public, max-age = 5") ])
+    >>> headerses.append([ ('cache-control', "public, max-age = 2") ])
+    >>> merge_cache_control(headerses)
+    'public, max-age = 2'
+
+    >>> headerses = []
+    >>> headerses.append([ ('cache-control', "public, max-age = 10") ])
+    >>> headerses.append([ ('cache-control', "private, max-age = 5") ])
+    >>> headerses.append([ ('cache-control', "public, max-age = 2") ])
+    >>> merge_cache_control(headerses)
+    'private, max-age = 2'
+
+    """
+    
+    cache_ctls = [parse_cache_directives(header_value(x,'cache-control')) for x in header_sets]
+        
+    # apply cache-control merging policies 
+    new_cache_ctl = dict() 
+    merge_if_all('public',new_cache_ctl, cache_ctls)
+    merge_if_any('private',new_cache_ctl, cache_ctls) 
+    merge_if_any('private',new_cache_ctl, cache_ctls) 
+    merge_if_any('no-cache',new_cache_ctl, cache_ctls)
+    merge_if_any('no-store',new_cache_ctl, cache_ctls)
+    merge_if_any('no-transform', new_cache_ctl, cache_ctls)
+    merge_if_any('must-revalidate', new_cache_ctl, cache_ctls)
+    merge_if_any('proxy-revalidate', new_cache_ctl, cache_ctls) 
+    merge_minimum('max-age', new_cache_ctl, cache_ctls)
+    merge_minimum('smax-age', new_cache_ctl, cache_ctls)
+
+    return flatten_directive_map(new_cache_ctl)
+
+    
+def merge_etags_from_headers(headers_map): 
+    """
+    accepts a map from uris to wsgi-style header lists 
+    returns the value for the etag merged from all 
+    etag headers present in the header lists 
+    """
+    etag_map = {}
+    for uri, headers in headers_map.items(): 
+        etag = header_value(headers,'etag')
+        if etag is not None and len(etag) != 0: 
+            etag_map[uri] = etag
+    return merge_etags(etag_map)
+    
+
+def merge_vary_from_headers(headers_map): 
+    """
+    XXX set ordering 
+    >>> d = {'a': [ ('Vary', '"foo, bar"') ], 'b': [ ('Vary', '"bar, quux"') ]}
+    >>> merge_vary_from_headers(d)
+    '"quux, foo, bar"'
+
+    >>> d = {}
+    >>> v = merge_vary_from_headers(d)
+    >>> v is None
+    True
+
+    """
+    vary_fields = Set()
+    for val in [ header_value(x, 'vary') for x in headers_map.values() ]: 
+        vary_fields.update(parse_fieldname_list(val))
+
+    if len(vary_fields): 
+        return '\"%s\"' % ', '.join(vary_fields)
+    else: 
+        return None
+
+    
+def parse_merged_etag(composite_tag): 
+    """
+    given a composite etag computed by merge_etags, 
+    computes a map from resource identifiers to 
+    respective etags 
+
+    >>> d = parse_merged_etag('deliverance:apple,15,some_apple_etag,orange,16,some_orange_etag')
+    >>> print_sorted_dict(d)
+    {'apple': 'some_apple_etag', 'orange': 'some_orange_etag'}
+
+
+    >>> d = parse_merged_etag('some_raND0m_g0bbl7+yGook')
+    >>> d
+    {}
+
+    >>> d = parse_merged_etag('deliverance:some_,99,ra,ND0m_g0,bb,l7+yGook')
+    >>> d
+    {}
+
+    """
+    if not composite_tag.startswith('deliverance:'): 
+        return {}
+
+    tags = dict(); 
+
+    composite_tag = composite_tag[len('deliverance:'):]
+    while len(composite_tag) > 0: 
+        resource,composite_tag = pop_et_token(composite_tag)
+        if resource is None:
+            return tags 
+        tag_len, composite_tag = pop_et_token(composite_tag)
+        if tag_len is None:             
+            return tags
+        try:
+            tag_len = int(tag_len)
+        except: 
+            return {}
+        
+        if len(composite_tag) >= tag_len: 
+            tags[resource] = composite_tag[:tag_len]
+            composite_tag = composite_tag[tag_len+1:]
+        else:
+            return {}
+
+    return tags 
+    
+
+    
+#############
+# helpers 
+############# 
+
+
+def pop_et_token(ctag): 
+    """
+    finds the first comma separated token, returns a tuple 
+    containing the token and the rest of the string given 
+    
+    >>> pop_et_token("abc,def,ghi")
+    ('abc', 'def,ghi')
+    """
+    sep = ctag.find(',')
+    if sep == -1:    
+        return (None,ctag)
+    else:
+        return (ctag[:sep],ctag[sep+1:])
+
+
+
+
+CSL_QUOTE_PAT = '".*?"'
+def parse_header_list(hval): 
+    """
+    split comma separated list into elements, ignoring quoted 
+    commas. 
+    eg: 
+    
+    >>> parse_header_list('max-age = 10, public')
+    ['max-age = 10', 'public']
+    
+    >>> parse_header_list('max-age = 10, public = "foo, bar"')
+    ['max-age = 10', 'public = "foo, bar"']
+    
+    >>> parse_header_list('public')
+    ['public']
+    """
+    quoted_strings = re.findall(CSL_QUOTE_PAT,hval)
+    no_quote_val = re.sub(CSL_QUOTE_PAT,'?',hval)
+    vals = [x.strip() for x in no_quote_val.split(',')]
+    
+    for i,val in enumerate(vals): 
+        qpos = val.find('?')
+        if qpos != -1: 
+            vals[i] = val.replace('?',quoted_strings.pop())
+
+    return vals
+       
+    
+def parse_cache_directive(directive): 
+    """
+    returns a tuple for the directive containing the name of 
+    the directive and its raw value (None if it has no value). eg:
+
+    >>> parse_cache_directive('foo = 10') 
+    ('foo', '10')
+    
+    >>> parse_cache_directive('foo = "bar"')
+    ('foo', '"bar"')
+    
+    >>> parse_cache_directive('foo = "bar, quux, baz"')
+    ('foo', '"bar, quux, baz"')
+    
+    >>> parse_cache_directive("foo")
+    ('foo', None)
+    """
+    split = directive.find('=')
+    if (split == -1): 
+        return (directive,None)
+    else:
+        return (directive[0:split].strip(), 
+                directive[split+1:].strip())
+
+def parse_fieldname_list(val): 
+    """
+    parses directive value(s) into a list, eg: 
+    
+    >>> parse_fieldname_list('foo')
+    ['foo']
+
+    >>> parse_fieldname_list('"foo"')
+    ['foo']
+
+    >>> parse_fieldname_list('"foo, bar,quux"')
+    ['foo', 'bar', 'quux']
+
+    >>> parse_fieldname_list('""')
+    []
+
+    >>> parse_fieldname_list(None)
+    []
+    """
+
+    if val is None: 
+        return [] 
+
+    if val.startswith('"'): 
+        val = val[1:]
+    if val.endswith('"'): 
+        val = val[:-1]
+    val = val.strip()
+
+    if len(val) == 0: 
+        return []    
+
+    return [x.strip().lower() for x in val.split(',')]
+    
+
+def parse_cache_directives(hval): 
+    """
+    returns a dict mapping directives to raw values  
+     
+    >>> print_sorted_dict(parse_cache_directives('max-age = 10, public'))
+    {'max-age': '10', 'public': None}
+    
+    >>> print_sorted_dict(parse_cache_directives('max-age = 10, public = "foo, bar"'))
+    {'max-age': '10', 'public': '"foo, bar"'}
+    """
+    if hval is None: 
+        return {}
+
+    dirs = dict()
+    for (name,val) in [parse_cache_directive(x) for x in parse_header_list(hval)]: 
+        dirs[name] = val
+    return dirs 
+
+def merge_expire_header(cc, headers): 
+    """
+    this reformulates any expire header in headers and 
+    places an equivalent cache-control header in cc 
+    """
+    pass 
+
+def merge_etags(etag_map): 
+    """
+    given a map of resource identifiers to etags, 
+    computes a composite etag 
+
+    XXX dict ordering 
+    >>> d = {'apple': 'some_apple_etag', 'orange': 'some_orange_etag'}
+    >>> merge_etags(d) 
+    'deliverance:orange,16,some_orange_etag,apple,15,some_apple_etag'
+    """
+    if etag_map is None or len(etag_map) == 0:
+        return None
+
+    composite_etag="deliverance:"
+
+    for k,v in etag_map.items(): 
+        composite_etag += "%s,%d,%s," % (k,len(v),v)
+    composite_etag = composite_etag[:-1]
+    return composite_etag 
+    
+
+        
+def merge_if_all(directive, newcc, cc): 
+    """
+    puts the directive given in the new cache-control 
+    directives newcc if the directive appears in all 
+    sets of directives cc 
+
+    expects cc is a list of dicts of the form produced by 
+    parse_cache_directives 
+    eg: 
+
+    >>> d = dict()
+    >>> ccs = [{'public': None, 'max-age': '10'}, {'public': None, 'max-age': '20'}]
+    >>> merge_if_all('public',d,ccs)
+    >>> d
+    {'public': None}
+
+    >>> d = dict()
+    >>> ccs = [{'public': None, 'max-age': '10'}, {'max-age': '20'}]
+    >>> merge_if_all('public', d, ccs)
+    >>> d
+    {}
+    """
+    for c in cc:         
+        if not c.has_key(directive): 
+            return 
+    newcc[directive] = None
+
+def merge_if_any(directive, newcc, cc): 
+    """
+    puts the directive given in the new cache-control 
+    directives newcc if the directive appears in any of 
+    the sets of directives cc. merges any fieldname 
+    lists that appear in cc for the directive. if any 
+    instance has no fieldnames, no fieldnames are used 
+    in the output. 
+
+    expects cc is a list of dicts of the form produced by 
+    parse_cache_directives 
+
+    >>> d = dict()
+    >>> ccs = [{'private': None, 'max-age': '10'}, {'max-age': '20'}]
+    >>> merge_if_any('private', d, ccs)
+    >>> d
+    {'private': None}
+
+    >>> d = dict()
+    >>> ccs = [{'private': '"foo, bar"', 'max-age': '10'}, {'max-age': '9'}, {'private': '"quux, bar"'}]
+    >>> merge_if_any('private', d, ccs)
+    >>> d
+    {'private': '"quux, foo, bar"'}
+
+    >>> d = dict()
+    >>> ccs = [{'private': '"foo, bar"', 'max-age': '10'}, {'max-age': '20'}, {'private': None}]
+    >>> merge_if_any('private', d, ccs)
+    >>> d
+    {'private': None}
+
+    """
+    present = False 
+    field_set = Set()
+
+    for c in cc: 
+        if c.has_key(directive): 
+            present = True
+            if c[directive] is not None:
+                if field_set is not None: 
+                    field_set.update(parse_fieldname_list(c[directive]))
+            else:
+                field_set = None
+
+    if present:
+        if field_set and len(field_set):             
+            newcc[directive] = '"' + ', '.join(field_set) + '"'
+        else:
+            newcc[directive] = None
+
+def merge_minimum(directive, newcc, cc): 
+    """ 
+    puts the minimum value specified for the directive 
+    among all instances of the directive in the set cc
+    into the dict newcc. 
+    if the directive does not appear in a particular 
+    set, the value is not placed in newcc. 
+    
+    expects cc is a list of dicts of the form produced by 
+    parse_cache_directives 
+
+    >>> d = dict()
+    >>> ccs = [{'max-age': '10'}, {'max-age': '20'} ]
+    >>> merge_minimum('max-age', d, ccs)
+    >>> d
+    {'max-age': '10'}
+
+    >>> d = dict()
+    >>> ccs = [{'max-age': '10'}, {'smax-age': '20'} ]
+    >>> merge_minimum('max-age', d, ccs)
+    >>> d
+    {}
+    """
+
+    if len(cc) == 0:
+        return 
+
+    if cc[0].has_key(directive): 
+        min = int(cc[0][directive])
+    else: 
+        return 
+
+    for c in cc: 
+        if c.has_key(directive): 
+            dval = int(c[directive])
+            if dval < min:
+                min = dval 
+        else: 
+            return
+
+    newcc[directive] = str(min)
+
+def flatten_directive_map(d): 
+    """ 
+    flattens a map of directive -> fieldnames 
+    back into the HTTP comma separated list 
+    form suitable as a value for the 
+    cache-control header 
+    """ 
+    dstr = ''
+    last = len(d) -1
+    for i, k in enumerate(d.keys()): 
+        dstr += k 
+        if d[k]: 
+            dstr += ' = %s' % d[k]
+        if (i != last): 
+            dstr += ', '
+
+    return dstr
+
+
+#########################
+# just test support 
+#########################
+
+def print_sorted_dict(d): 
+    keys = d.keys()
+    keys.sort()
+    last = len(keys)-1
+    dstr = '{'
+    for i, k in enumerate(keys): 
+        dstr += "%s: %s" % (k.__repr__(), d[k].__repr__())
+        if i < last: 
+            dstr += ', '
+    dstr += '}'
+    print dstr 
+
+ 
+def _test():
+    import doctest
+    doctest.testmod()
+
+if __name__ == "__main__":
+    _test()

Added: z3/deliverance/branches/cache_aware/deliverance/resource_fetcher.py
==============================================================================
--- (empty file)
+++ z3/deliverance/branches/cache_aware/deliverance/resource_fetcher.py	Wed Jan 31 22:53:04 2007
@@ -0,0 +1,128 @@
+import deliverance.wsgimiddleware 
+from StringIO import StringIO
+from paste.wsgilib import intercept_output
+from paste.proxy import TransparentProxy 
+from paste.request import construct_url
+from paste.response import header_value
+import urlparse
+from deliverance.utils import DeliveranceError
+
+
+class InternalResourceFetcher(object): 
+    def __init__(self, in_environ, uri, app, headers_only=False): 
+        self.uri = uri 
+        self.app = app 
+
+        if 'paste.recursive.include' in in_environ:
+            self.environ = in_environ['paste.recursive.include'].original_environ.copy()
+        else:
+            self.environ = in_environ.copy()
+            
+        if not self.uri.startswith('/'):
+            self.uri = '/' + self.uri
+
+        self.environ['PATH_INFO'] = uri
+
+        base_url = in_environ['deliverance.base-url']
+        if base_url is not None:
+            self.environ['SCRIPT_NAME'] = urlparse.urlparse(base_url)[2]
+        else: 
+            self.environ['SCRIPT_NAME'] = ''
+
+        if headers_only: 
+            self.environ['REQUEST_METHOD'] = 'HEAD'
+        else: 
+            self.environ['REQUEST_METHOD'] = 'GET'
+
+        self.environ['CONTENT_LENGTH'] = '0'
+        self.environ['wsgi.input'] = StringIO('')
+        self.environ['CONTENT_TYPE'] = ''
+        self.environ['QUERY_STRING'] = 'notheme'
+
+        if 'HTTP_ACCEPT_ENCODING' in self.environ:
+	    self.environ['HTTP_ACCEPT_ENCODING'] = '' 
+
+    def wsgi_get(self): 
+        print "Internal Resource get: %s" % self.uri
+        if 'paste.recursive.include' in self.environ: 
+            print "Doing paste.recursive.include"
+            # Try to do the redirect this way...
+            includer = self.environ['paste.recursive.include']
+            res = includer(self.uri, self.environ)
+            return (res.status, res.headers, res.body)
+        else: 
+            print "Doing intercept"
+            return intercept_output(self.environ, self.app)
+
+
+    def get(self): 
+        path_info = self.environ['PATH_INFO']
+        status, headers, body = self.wsgi_get()
+
+        if not status.startswith('200'):
+            loc = header_value(headers, 'location')
+            if loc:
+                loc = ' location=%r' % loc
+            else:
+                loc = ''
+            raise DeliveranceError(
+                "Request for internal resource at %s (%r) failed with status code %r%s"
+                % (construct_url(self.environ), path_info, status,
+                   loc))
+        return body
+
+
+class ExternalResourceFetcher(object): 
+    def __init__(self, uri, headers_only=False): 
+        self.uri = uri 
+        
+        url_chunks = urlparse.urlsplit(uri)
+        loc = urlparse.urlsplit(uri) 
+        
+        self.environ = {}
+        
+        if headers_only: 
+            self.environ['REQUEST_METHOD'] = 'HEAD'
+        else:
+            self.environ['REQUEST_METHOD'] = 'GET'
+
+        self.environ['CONTENT_LENGTH'] = '0'
+        self.environ['wsgi.input'] = StringIO('')
+
+        self.environ['wsgi.url_scheme'] = loc[0]
+        self.environ['wsgi.version'] = (1, 0)
+        self.environ['HTTP_HOST'] = loc[1]
+        self.environ['PATH_INFO'] = loc[2]
+        self.environ['QUERY_STRING'] = loc[3]
+
+        self.environ['SCRIPT_INFO'] = ''
+
+        #if loc[0].find(':') != -1: 
+        #    self.environ['SERVER_NAME'],self.environ['SERVER_PORT'] = loc[0].split(':')
+        #else: 
+        #    self.environ['SERVER_NAME'] = loc[0]
+        #    if loc[0] == 'https': 
+        #        self.environ['SERVER_PORT'] = '443'
+        #    else: 
+        #        self.environ['SERVER_PORT'] = '80'
+
+    def wsgi_get(self): 
+        print "External Resource get: %s" % self.uri
+        proxy_app = TransparentProxy() 
+        return intercept_output(self.environ, proxy_app)
+
+    def get(self): 
+        status, headers, body = self.wsgi_get()
+
+        if not status.startswith('200'):
+            loc = header_value(headers, 'location')
+            if loc:
+                loc = ' location=%r' % loc
+            else:
+                loc = ''
+            raise DeliveranceError(
+                "Request for external resource at %s failed with status code %r%s"
+                % (construct_url(self.environ), status,
+                   loc))
+
+        return body 

Modified: z3/deliverance/branches/cache_aware/deliverance/wsgimiddleware.py
==============================================================================
--- z3/deliverance/branches/cache_aware/deliverance/wsgimiddleware.py	(original)
+++ z3/deliverance/branches/cache_aware/deliverance/wsgimiddleware.py	Wed Jan 31 22:53:04 2007
@@ -13,14 +13,22 @@
 from htmlserialize import tostring
 from deliverance.utils import DeliveranceError
 from deliverance.utils import DELIVERANCE_ERROR_PAGE
+from deliverance.resource_fetcher import InternalResourceFetcher, ExternalResourceFetcher
+from deliverance import cache_utils
 import sys 
 import datetime
 import threading
 import traceback
 from StringIO import StringIO
+from sets import Set
 
 DELIVERANCE_BASE_URL = 'deliverance.base-url'
+DELIVERANCE_CACHE = 'deliverance.cache'
 
+IGNORE_EXTENSIONS = ['js','css','gif','jpg','jpeg','pdf','ps','doc','png','ico','mov','mpg','mpeg', 'mp3','m4a', 
+                     'txt','rtf']
+
+IGNORE_URL_PATTERN = re.compile("^.*\.(%s)$" % '|'.join(IGNORE_EXTENSIONS))
 
 class DeliveranceMiddleware(object):
     """
@@ -58,7 +66,7 @@
         else:
             self._rendererType = renderer
 
-    def get_renderer(self,environ):
+    def get_renderer(self, environ):
         """
         retrieve the deliverance Renderer representing the transformation this 
         middlware represents. Renderer may change according to caching rules. 
@@ -72,7 +80,7 @@
         finally:
             self._lock.release()
 
-    def create_renderer(self,environ):
+    def create_renderer(self, environ):
         """
         construct a new deliverance Renderer from the 
         information passed to the initializer.  A new copy 
@@ -85,7 +93,7 @@
             self.theme_uri)
 
         def reference_resolver(href, parse, encoding=None):
-            text = self.get_resource(environ,href)
+            text = self.get_resource(environ, href)
             if parse == "xml":
                 return etree.XML(text)
             if parse == "html":
@@ -132,7 +140,7 @@
         initializer. 
         """
         try:
-            return self.get_resource(environ,self.rule_uri)
+            return self.get_resource(environ, self.rule_uri)
         except Exception, message:
             newmessage = "Unable to retrieve rules from " + self.rule_uri 
             if message:
@@ -146,7 +154,7 @@
         initializer. 
         """
         try:
-            return self.get_resource(environ,self.theme_uri)
+            return self.get_resource(environ, self.theme_uri)
         except Exception, message:
             newmessage = "Unable to retrieve theme page from " + self.theme_uri 
             if message:
@@ -165,27 +173,37 @@
         try:
             qs = environ.get('QUERY_STRING', '')
             environ[DELIVERANCE_BASE_URL] = construct_url(environ, with_path_info=False, with_query_string=False)
+            environ[DELIVERANCE_CACHE] = {} 
             notheme = 'notheme' in qs
             if notheme:
                 return self.app(environ, start_response)
-            if 'HTTP_ACCEPT_ENCODING' in environ:
-                del environ['HTTP_ACCEPT_ENCODING']
 
-            status, headers, body = intercept_output(
-                environ, self.app,
-                self.should_intercept,
-                start_response)
+            # unsupported 
+            if 'HTTP_ACCEPT_ENCODING' in environ:
+                environ['HTTP_ACCEPT_ENCODING'] = '' 
+            if 'HTTP_IF_MATCH' in environ: 
+                environ['HTTP_IF_MATCH'] = '' 
+            if 'HTTP_IF_UNMODIFIED_SINCE' in environ: 
+                environ['HTTP_IF_UNMODIFIED_SINCE'] = '' 
+            
+            status, headers, body = self.rebuild_check(environ, start_response)
 
-            # ignore non-html responses 
+            # non-html responses, or rebuild is not necessary: bail out 
             if status is None:
                 return body
 
-            # don't theme html snippets 
-            if self.hasHTMLTag(body):
-                body = self.filter_body(environ, body)
+            # perform actual themeing 
+            print "Doing themeing" 
+
+            body = self.filter_body(environ, body)
 
             replace_header(headers, 'content-length', str(len(body)))
             replace_header(headers, 'content-type', 'text/html; charset=utf-8')
+
+            cache_utils.merge_cache_headers(environ, 
+                                            environ[DELIVERANCE_CACHE], 
+                                            headers)
+
             start_response(status, headers)
             return [body]
         
@@ -209,7 +227,7 @@
         """
         type = header_value(headers, 'content-type')
         if type is None:
-            return False
+            return True # yerg, 304s can have no content-type 
         return type.startswith('text/html') or type.startswith('application/xhtml+xml')
 
     def filter_body(self, environ, body):
@@ -220,76 +238,133 @@
         content = self.get_renderer(environ).render(parseHTML(body))
         return tostring(content)
 
-    def get_resource(self, environ, uri):
-        """
-        retrieve the data referred to by the uri given. 
-        """
-        internalBaseURL = environ.get(DELIVERANCE_BASE_URL,None)
-        uri = urlparse.urljoin(internalBaseURL, uri)
-        
-        if  internalBaseURL and uri.startswith(internalBaseURL):
-            return self.get_internal_resource(environ, uri[len(internalBaseURL):])
-        else:
-            return self.get_external_resource(uri)
 
-    def relative_uri(self, uri):
-        """
-        returns true if uri is relative, false if 
-        the uri is absolute. 
-        """
-        if re.search(r'^[a-zA-Z]+:', uri):
-            return False
-        else:
-            return True
+    def rebuild_check(self, environ, start_response): 
+        print "===== rebuild check ====="
+        # perform the request for content  
 
-    def get_external_resource(self, uri):
-        """
-        get the data referred to by the uri given 
-        using urllib (not through the wrapped app)
-        """
-        f = urllib.urlopen(uri)
-        content = f.read()
-        f.close()
-        return content
+        content_url = construct_url(environ)
 
-    def get_internal_resource(self, in_environ, uri):
-        """
-        get the data referred to by the uri given 
-        by using the wrapped WSGI application 
-        """
+        status, headers, body = intercept_output(environ, self.app,
+                                                 self.should_intercept,
+                                                 start_response)            
+
+
+        if status is None: 
+            # should_intercept says this isn't HTML, we're done
+            print "ignore non-html: %s" % construct_url(environ)
+            return (None, None, body)
 
+        if self.should_ignore_url(content_url): 
+            print "ignore blacklisted url: %s" % construct_url(environ)
+            start_response(status, headers)
+            return (None, None, [body])
+
+        # cache the response so we can look at its headers later 
+        environ[DELIVERANCE_CACHE][content_url] = (status, headers, body)
         
-        if 'paste.recursive.include' in in_environ:
-            environ = in_environ['paste.recursive.include'].original_environ.copy()
-        else:
-            environ = in_environ.copy()
+        # it was modified or an error, give it back for themeing 
+        if not status.startswith('304'): 
+            print "Content %s modified, continue..." % content_url 
+
+            # if it's not a full HTML page, skip it 
+            if not self.hasHTMLTag(body): 
+                print "ignore non-html-tagged: %s" % construct_url(environ)
+                start_response(status, headers)
+                return (None, None, [body])
+
+            # send it back for rebuild 
+            return (status, headers, body)
             
-        if not uri.startswith('/'):
-            uri = '/' + uri
-        environ['PATH_INFO'] = uri
-        environ['SCRIPT_NAME'] = in_environ[DELIVERANCE_BASE_URL]
-        environ['REQUEST_METHOD'] = 'GET'
-        environ['CONTENT_LENGTH'] = '0'
-        environ['wsgi.input'] = StringIO('')
-        environ['CONTENT_TYPE'] = ''
-        if environ['QUERY_STRING']:
-            environ['QUERY_STRING'] += '&notheme'
-        else:
-            environ['QUERY_STRING'] = 'notheme'
+        # got 304 Not Modified for content, check other resources 
+        rules = etree.XML(self.rule(environ))
+        resources = self.get_resource_uris(rules)
+        if self.any_modified(environ, resources): 
+            # something changed, 
+            # get the content explicitly and give it back 
+            print "explicitly requesting %s" % construct_url(environ)
+            if 'HTTP_IF_MODIFIED_SINCE' in environ: 
+                environ['HTTP_IF_MODIFIED_SINCE'] = ''
+            if 'HTTP_IF_NONE_MATCH' in environ: 
+                environ['HTTP_IF_NONE_MATCH'] = '' 
+            environ['CACHE-CONTROL'] = 'no-cache'
+
+            status, headers, body = intercept_output(environ, self.app)
+
+            if not self.hasHTMLTag(body): 
+                # XXX yarg, we didn't care about it!
+                print "ARG ignore non-html: status: %s, %s" % (status, construct_url(environ))
+                #print "Environ: " , environ , " Headers: ", headers 
+                start_response(status, headers)
+                return (None, None, [body])
+
+            environ[DELIVERANCE_CACHE][content_url] = (status, headers, body)
+            return (status, headers, body)
+
+        # nothing was modified, give back a 304 
+        print "giving back 304: %s" % construct_url(environ)
+        cache_utils.merge_cache_headers(environ, 
+                                        environ[DELIVERANCE_CACHE], 
+                                        headers)
+        start_response('304 Not Modified', headers)
 
-        if 'HTTP_ACCEPT_ENCODING' in environ:
-	    environ['HTTP_ACCEPT_ENCODING'] = '' 
+        return (None,None,[])
+        
+    def any_modified(self, environ, resources): 
+        """
+        returns True if any resource in the list resources has been modified
+        based on the validators found in environ (If-Modified-Since and the
+        composite etag in If-None-Match), False otherwise.  Stops at the
+        first modified resource; responses for the uris checked are stored
+        in environ[DELIVERANCE_CACHE] by check_modification.
+        """
+
+        print "====== rebuild check ======"
+        moddate = None
+        etag_map = {}
+
+        if 'HTTP_IF_MODIFIED_SINCE' in environ: 
+            print "using modification date: %s" % environ['HTTP_IF_MODIFIED_SINCE']
+            moddate = environ['HTTP_IF_MODIFIED_SINCE']            
+        if 'HTTP_IF_NONE_MATCH' in environ: 
+            print "using composite etag: %s" % environ['HTTP_IF_NONE_MATCH']
+            etag_map = cache_utils.parse_merged_etag(environ['HTTP_IF_NONE_MATCH'])
+            
+        for uri in resources:
+            if (self.check_modification(environ, uri, 
+                                        moddate, 
+                                        etag_map.get(uri,None))): 
+                return True
 
-        if 'paste.recursive.include' in in_environ:
-            # Try to do the redirect this way...
-            includer = in_environ['paste.recursive.include']
-            res = includer(uri,environ)
-            return res.body
+        return False 
 
 
-        path_info = environ['PATH_INFO']
-        status, headers, body = intercept_output(environ, self.app)
-        if not status.startswith('200'):
+    def get_resource(self, environ, uri):
+        """
+        retrieve the content from the uri given, 
+        uses cache if possible. throws exception if 
+        response is not 200 
+        """
+        if uri in environ[DELIVERANCE_CACHE]: 
+            response = environ[DELIVERANCE_CACHE][uri]
+            if response[0].startswith('200'): 
+                print "using previously fetched content for %s" % uri 
+                return response[2]
+
+        print "fetching resource from scratch: %s" % uri 
+        fetcher = self.get_fetcher(environ, uri)
+        
+        # eliminate validation headers, we want the content 
+        if 'HTTP_IF_MODIFIED_SINCE' in fetcher.environ: 
+            fetcher.environ['HTTP_IF_MODIFIED_SINCE'] = ''
+        if 'HTTP_IF_NONE_MATCH' in fetcher.environ: 
+            fetcher.environ['HTTP_IF_NONE_MATCH'] = '' 
+        fetcher.environ['CACHE-CONTROL'] = 'no-cache'
+        
+        status, headers, body = fetcher.wsgi_get()         
+        
+        if not status.startswith('200'): 
+            path_info = uri 
             loc = header_value(headers, 'location')
             if loc:
                 loc = ' location=%r' % loc
@@ -299,7 +374,79 @@
                 "Request for internal resource at %s (%r) failed with status code %r%s"
                 % (construct_url(environ), path_info, status,
                    loc))
+
+        environ[DELIVERANCE_CACHE][uri] = (status, headers, body)
+
         return body
+            
+
+    def get_fetcher(self, environ, uri): 
+        internalBaseURL = environ.get(DELIVERANCE_BASE_URL,None)
+        uri = urlparse.urljoin(internalBaseURL, uri)        
+
+        if  internalBaseURL and uri.startswith(internalBaseURL):
+            return InternalResourceFetcher(environ, uri[len(internalBaseURL):],
+                                           self.app)
+        else:
+            return ExternalResourceFetcher(uri)        
+
+
+    def get_resource_uris(self, rules): 
+        """
+        retrieves a list of uris pointing to the resources that 
+        are components of rendering (excluding content) 
+        """
+        resources = Set()
+        resources.add(self.rule_uri)
+        resources.add(self.theme_uri)
+
+        for rule in rules: 
+            href = rule.get("href",None)
+            if href is not None:
+                resources.add(href)
+
+        return list(resources)
+
+            
+    def check_modification(self, environ, uri, httpdate_since=None, etag=None): 
+        """
+        if httpdate_since is set to an httpdate the If-Modified-Since HTTP header 
+          is used to check for modification 
+
+        if etag is set to an etag for the resource, the If-None-Match HTTP header 
+          is used to check for modification 
+
+        """
+
+        print "[!] Checking modification for: [%s] w/ [%s,%s]" % (uri, httpdate_since, etag)
+
+        fetcher = self.get_fetcher(environ, uri)
+        
+        if httpdate_since: 
+            fetcher.environ['HTTP_IF_MODIFIED_SINCE'] = httpdate_since 
+        else: 
+            fetcher.environ['HTTP_IF_MODIFIED_SINCE'] = ''
+        
+
+        if etag: 
+            fetcher.environ['HTTP_IF_NONE_MATCH'] = etag
+        else: 
+            fetcher.environ['HTTP_IF_NONE_MATCH'] = ''
+
+
+        status, headers, body = fetcher.wsgi_get()
+        environ[DELIVERANCE_CACHE][uri] = (status, headers, body)
+
+        print "status was: [%s]" % status
+        if not (status.startswith('200') or status.startswith('304')): 
+            print "status(%s), environ => %s, headers => %s" % (status, fetcher.environ, headers)
+
+        if status.startswith('304'): # Not Modified 
+            return False 
+
+        return True
+
+
 
     HTML_DOC_PAT = re.compile(r"^.*<\s*html(\s*|>).*$",re.I|re.M)
     def hasHTMLTag(self, body):
@@ -311,6 +458,11 @@
         """
         return self.HTML_DOC_PAT.search(body) is not None
 
+
+    def should_ignore_url(self, url): 
+        # blacklisting can happen here as well 
+        return re.match(IGNORE_URL_PATTERN, url) is not None
+
 def make_filter(app, global_conf,
                 theme_uri=None, rule_uri=None):
     assert theme_uri is not None, (


More information about the z3-checkins mailing list