[lxml-dev] schema validation and resolvers

Michael Ballbach ballbach at rten.net
Wed Jun 25 01:04:40 CEST 2008


On Tue, Jun 24, 2008 at 09:36:37AM +0200, Stefan Behnel wrote:
> I think that's the way to go. And yes, it must be a stack (i.e. a
> list) in this case. I'd just push the right parser (not only its
> context) as a "current thread parser" in cases where we know that
> libxml2 will not provide us with a context (XInclude is another case,
> happy to see that resolved at the same time). Then we can fall back to
> using the top-most parser if the _private pointer is NULL.

How does this look? (Unit tests follow the patch)

Index: src/lxml/xinclude.pxi
===================================================================
--- src/lxml/xinclude.pxi	(revision 56012)
+++ src/lxml/xinclude.pxi	(working copy)
@@ -33,12 +33,14 @@
         # i.e. as a sibling, which does not conflict with traversal.
         cdef int result
         self._error_log.connect()
+        __GLOBAL_PARSER_CONTEXT.pushImpliedContext(node._doc._parser)
         with nogil:
             if node._doc._parser is not None:
                 result = xinclude.xmlXIncludeProcessTreeFlags(
                     node._c_node, node._doc._parser._parse_options)
             else:
                 result = xinclude.xmlXIncludeProcessTree(node._c_node)
+        __GLOBAL_PARSER_CONTEXT.popImpliedContext()
         self._error_log.disconnect()
 
         if result == -1:
Index: src/lxml/xmlschema.pxi
===================================================================
--- src/lxml/xmlschema.pxi	(revision 56012)
+++ src/lxml/xmlschema.pxi	(working copy)
@@ -65,7 +65,13 @@
             raise XMLSchemaParseError, u"No tree or file given"
 
         if parser_ctxt is not NULL:
+            # calling xmlSchemaParse on a schema with imports or includes will
+            # cause libxml2 to create an internal context for parsing, so push
+            # an implied context to route resolve requests to the document's parser
+            __GLOBAL_PARSER_CONTEXT.pushImpliedContext(doc._parser)
             self._c_schema = xmlschema.xmlSchemaParse(parser_ctxt)
+            __GLOBAL_PARSER_CONTEXT.popImpliedContext()
+
             if _LIBXML_VERSION_INT >= 20624:
                 xmlschema.xmlSchemaFreeParserCtxt(parser_ctxt)
 
Index: src/lxml/parser.pxi
===================================================================
--- src/lxml/parser.pxi	(revision 56012)
+++ src/lxml/parser.pxi	(working copy)
@@ -42,6 +42,11 @@
 
     cdef tree.xmlDict* _c_dict
     cdef _BaseParser _default_parser
+    cdef object _implied_parser_contexts
+
+    def __init__(self):
+        self._implied_parser_contexts = []
+
     def __dealloc__(self):
         if self._c_dict is not NULL:
             xmlparser.xmlDictFree(self._c_dict)
@@ -131,6 +136,38 @@
         # otherwise we'd free data that's in use => segfault
         self.initThreadDictRef(&result.dict)
 
+    cdef xmlparser.xmlParserCtxt *findImpliedContext(self) with gil:
+        u"""Return any current implied xml parser context for this thread. This
+        is used when the _local_resolver function is called with a context
+        that was generated from within libxml2 - which happens when parsing
+        schema and xinclude external references."""
+
+        cdef _ParserDictionaryContext thread_context
+        cdef _BaseParser implied_parser
+        cdef Py_ssize_t count
+
+        # see if we have a current implied parser
+        count = python.PyList_GET_SIZE(self._implied_parser_contexts)
+        if count != 0:
+            implied_parser = python.PyList_GET_ITEM(self._implied_parser_contexts, count - 1)
+            python.Py_INCREF(implied_parser) # borrowed reference
+            if implied_parser is not None:
+                return implied_parser._getParserContext()._c_ctxt
+
+        # we don't, so use the thread's default parser context
+        thread_context = __GLOBAL_PARSER_CONTEXT._findThreadParserContext()
+        return thread_context._default_parser._getParserContext()._c_ctxt
+
+    cdef void pushImpliedContext(self, context) with gil:
+        u"Push a new implied context object."
+        if context is not None and not isinstance(context, _BaseParser):
+            raise TypeError, u"implied contexts must be _ParserContext objects"
+        python.PyList_Append(self._implied_parser_contexts, context)
+
+    cdef void popImpliedContext(self) with gil:
+        u"Pop the current implied context object (nothing is returned)."
+        self._implied_parser_contexts.pop()
+
 cdef _ParserDictionaryContext __GLOBAL_PARSER_CONTEXT
 __GLOBAL_PARSER_CONTEXT = _ParserDictionaryContext()
 __GLOBAL_PARSER_CONTEXT.initMainParserContext()
@@ -399,7 +436,13 @@
     # when we declare a Python object, Pyrex will INCREF(None) !
     cdef xmlparser.xmlParserInput* c_input
     cdef int error
+
+    # if there is no _ParserDictionaryContext associated with the xmlParserCtxt
+    # passed, check to see if the thread state object has an implied context.
     if c_context._private is NULL:
+        c_context = __GLOBAL_PARSER_CONTEXT.findImpliedContext()
+
+    if c_context._private is NULL:
         if __DEFAULT_ENTITY_LOADER is NULL:
             return NULL
         return __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)







Unit tests:

Index: src/lxml/tests/test_etree.py
===================================================================
--- src/lxml/tests/test_etree.py	(revision 56012)
+++ src/lxml/tests/test_etree.py	(working copy)
@@ -2342,6 +2342,28 @@
             'a',
             tree.getroot()[1].tag)
 
+    def test_xinclude_resolver(self):
+        """Test that xinclude references can be processed by a resolver."""
+
+        class res(etree.Resolver):
+            def __init__(self, text):
+                self.text = text
+                self.called = False
+
+            def resolve(self, url, Id, context):
+                self.called = True
+                return self.resolve_string(self.text, context)
+ 
+        include_text = open(fileInTestDir('test.xml')).read()
+        parser = etree.XMLParser()
+        res_instance = res(include_text)
+        parser.resolvers.add(res_instance)
+        tree = etree.parse(fileInTestDir('include/test_xinclude.xml'), parser = parser)
+        self.include(tree)
+
+        # make sure the resolver was used
+        self.assert_(res_instance.called)
+
 class ETreeXIncludeTestCase(XIncludeTestCase):
     def include(self, tree):
         tree.xinclude()
Index: src/lxml/tests/test_xmlschema.py
===================================================================
--- src/lxml/tests/test_xmlschema.py	(revision 56012)
+++ src/lxml/tests/test_xmlschema.py	(working copy)
@@ -152,7 +152,108 @@
         self.assert_(tree_valid.xmlschema(schema))
         self.assert_(not tree_invalid.xmlschema(schema))
 
-    
+    #
+    # schema + resolver tests and data:
+    #
+
+    resolver_schema_int = BytesIO("""\
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"
+    xmlns:etype="http://codespeak.net/lxml/test/external"
+    targetNamespace="http://codespeak.net/lxml/test/internal">
+        <xsd:import namespace="http://codespeak.net/lxml/test/external" schemaLocation="XXX.xsd" />
+        <xsd:element name="a" type="etype:AType"/>
+</xsd:schema>""")
+
+    resolver_schema_int2 = BytesIO("""\
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"
+    xmlns:etype="http://codespeak.net/lxml/test/external"
+    targetNamespace="http://codespeak.net/lxml/test/internal">
+        <xsd:import namespace="http://codespeak.net/lxml/test/external" schemaLocation="YYY.xsd" />
+        <xsd:element name="a" type="etype:AType"/>
+</xsd:schema>""")
+
+    resolver_schema_ext = """\
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"
+    targetNamespace="http://codespeak.net/lxml/test/external">
+    <xsd:complexType name="AType">
+      <xsd:sequence><xsd:element name="b" type="xsd:string" minOccurs="0" maxOccurs="unbounded" /></xsd:sequence>
+    </xsd:complexType>
+</xsd:schema>""" 
+
+    class simple_resolver(etree.Resolver):
+        def __init__(self, schema):
+            self.schema = schema
+
+        def resolve(self, url, Id, context):
+            assert(url == 'XXX.xsd')
+            return self.resolve_string(self.schema, context)
+
+    def test_xmlschema_resolvers(self):
+        """Test that resolvers work with schema."""
+
+        parser = etree.XMLParser()
+        parser.resolvers.add(self.simple_resolver(self.resolver_schema_ext))
+        schema_doc = etree.parse(self.resolver_schema_int, parser = parser)
+        schema = etree.XMLSchema(schema_doc)
+
+    def test_xmlschema_resolvers_root(self):
+        """Test that the default resolver will get called if there's no
+        specific parser resolver."""
+
+        root_resolver = self.simple_resolver(self.resolver_schema_ext)
+        etree.get_default_parser().resolvers.add(root_resolver)
+        schema_doc = etree.parse(self.resolver_schema_int)
+        schema = etree.XMLSchema(schema_doc)
+        etree.get_default_parser().resolvers.remove(root_resolver)
+
+    def test_xmlschema_resolvers_noroot(self):
+        """Test that the default resolver will not get called when a more
+        specific resolver is registered."""
+
+        class res_root(etree.Resolver):
+            def resolve(self, url, Id, context):
+                assert(False)
+                return None
+
+        root_resolver = res_root()
+        etree.get_default_parser().resolvers.add(root_resolver)
+        parser = etree.XMLParser()
+        parser.resolvers.add(self.simple_resolver(self.resolver_schema_ext))
+        schema_doc = etree.parse(self.resolver_schema_int, parser = parser)
+        schema = etree.XMLSchema(schema_doc)
+        etree.get_default_parser().resolvers.remove(root_resolver)
+
+    def test_xmlschema_nested_resolvers(self):
+        """Test that resolvers work in a nested fashion."""
+
+        class res_nested(etree.Resolver):
+            def __init__(self, ext_schema):
+                self.ext_schema = ext_schema
+
+            def resolve(self, url, Id, context):
+                assert(url == 'YYY.xsd')
+                return self.resolve_string(self.ext_schema, context)
+
+        class res(etree.Resolver):
+            def __init__(self, ext_schema_1, ext_schema_2):
+                self.ext_schema_1 = ext_schema_1
+                self.ext_schema_2 = ext_schema_2
+
+            def resolve(self, url, Id, context):
+                assert(url == 'XXX.xsd')
+
+                new_parser = etree.XMLParser()
+                new_parser.resolvers.add(res_nested(self.ext_schema_2))
+                new_schema_doc = etree.parse(self.ext_schema_1, parser = new_parser)
+                new_schema = etree.XMLSchema(new_schema_doc)
+
+                return self.resolve_string(ETreeXMLSchemaTestCase.resolver_schema_ext, context)
+
+        parser = etree.XMLParser()
+        parser.resolvers.add(res(self.resolver_schema_int2, self.resolver_schema_ext))
+        schema_doc = etree.parse(self.resolver_schema_int, parser = parser)
+        schema = etree.XMLSchema(schema_doc)
+
 def test_suite():
     suite = unittest.TestSuite()
     suite.addTests([unittest.makeSuite(ETreeXMLSchemaTestCase)])

-- 
Michael Ballbach, N0ZTQ
ballbach at rten.net -- PGP KeyID: 0xA05D5555
http://www.rten.net/
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: not available
Url : http://codespeak.net/pipermail/lxml-dev/attachments/20080624/b0633e38/attachment.pgp 


More information about the lxml-dev mailing list