[Lxml-checkins] r50334 - in lxml/trunk: . src/lxml src/lxml/tests

scoder at codespeak.net scoder at codespeak.net
Fri Jan 4 19:22:01 CET 2008


Author: scoder
Date: Fri Jan  4 19:22:01 2008
New Revision: 50334

Modified:
   lxml/trunk/   (props changed)
   lxml/trunk/CHANGES.txt
   lxml/trunk/src/lxml/apihelpers.pxi
   lxml/trunk/src/lxml/lxml.etree.pyx
   lxml/trunk/src/lxml/tests/test_etree.py
Log:
 r3205 at delle:  sbehnel | 2008-01-04 19:21:48 +0100
 check entity/character references in Entity() factory


Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt	(original)
+++ lxml/trunk/CHANGES.txt	Fri Jan  4 19:22:01 2008
@@ -8,6 +8,9 @@
 Features added
 --------------
 
+* Invalid entity names and character references will now be rejected
+  by the ``Entity()`` factory.
+
 * ``entity.text`` now returns the textual representation of the
   entity, e.g. ``&amp;``.
 

Modified: lxml/trunk/src/lxml/apihelpers.pxi
==============================================================================
--- lxml/trunk/src/lxml/apihelpers.pxi	(original)
+++ lxml/trunk/src/lxml/apihelpers.pxi	Fri Jan  4 19:22:01 2008
@@ -1043,22 +1043,41 @@
         c_name = c_name + 1
     return 1
 
+cdef bint _characterReferenceIsValid(char* c_name):
+    cdef bint is_hex
+    if c_name[0] == c'x':
+        c_name += 1
+        is_hex = 1
+    else:
+        is_hex = 0
+    if c_name[0] == c'\0':
+        return 0
+    while c_name[0] != c'\0':
+        if c_name[0] < c'0' or c_name[0] > c'9':
+            if not is_hex:
+                return 0
+            if not (c_name[0] >= c'a' and c_name[0] <= c'f'):
+                if not (c_name[0] >= c'A' and c_name[0] <= c'F'):
+                    return 0
+        c_name += 1
+    return 1
+
 cdef int _tagValidOrRaise(tag_utf) except -1:
     if not _pyXmlNameIsValid(tag_utf):
-        raise ValueError, "Invalid tag name %r" % \
-              python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', 'strict')
+        raise ValueError("Invalid tag name %r" % \
+              python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', 'strict'))
     return 0
 
 cdef int _htmlTagValidOrRaise(tag_utf) except -1:
     if not _pyHtmlNameIsValid(tag_utf):
-        raise ValueError, "Invalid HTML tag name %r" % \
-              python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', 'strict')
+        raise ValueError("Invalid HTML tag name %r" % \
+              python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', 'strict'))
     return 0
 
 cdef int _attributeValidOrRaise(name_utf) except -1:
     if not _pyXmlNameIsValid(name_utf):
-        raise ValueError, "Invalid attribute name %r" % \
-              python.PyUnicode_FromEncodedObject(name_utf, 'UTF-8', 'strict')
+        raise ValueError("Invalid attribute name %r" % \
+              python.PyUnicode_FromEncodedObject(name_utf, 'UTF-8', 'strict'))
     return 0
 
 cdef object _namespacedName(xmlNode* c_node):

Modified: lxml/trunk/src/lxml/lxml.etree.pyx
==============================================================================
--- lxml/trunk/src/lxml/lxml.etree.pyx	(original)
+++ lxml/trunk/src/lxml/lxml.etree.pyx	Fri Jan  4 19:22:01 2008
@@ -2110,18 +2110,26 @@
 PI = ProcessingInstruction
 
 def Entity(name):
-    """Entity factory.  This factory function creates a special element that
-    will be serialized as an XML entity.  Note, however, that the entity will
-    not be automatically declared in the document.  A document that uses
-    entities requires a DTD.
+    """Entity factory.  This factory function creates a special element
+    that will be serialized as an XML entity reference or character
+    reference.  Note, however, that entities will not be automatically
+    declared in the document.  A document that uses entity references
+    requires a DTD to define the entities.
     """
     cdef _Document doc
     cdef xmlNode*  c_node
     cdef xmlDoc*   c_doc
-    name = _utf8(name)
+    cdef char* c_name
+    name_utf = _utf8(name)
+    c_name = _cstr(name_utf)
+    if c_name[0] == c'#':
+        if not _characterReferenceIsValid(c_name + 1):
+            raise ValueError("Invalid character reference: '%s'" % name)
+    elif not _xmlNameIsValid(c_name):
+        raise ValueError("Invalid entity reference: '%s'" % name)
     c_doc = _newDoc()
     doc = _documentFactory(c_doc, None)
-    c_node = _createEntity(c_doc, _cstr(name))
+    c_node = _createEntity(c_doc, c_name)
     tree.xmlAddChild(<xmlNode*>c_doc, c_node)
     return _elementFactory(doc, c_node)
 

Modified: lxml/trunk/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_etree.py	(original)
+++ lxml/trunk/src/lxml/tests/test_etree.py	Fri Jan  4 19:22:01 2008
@@ -605,6 +605,21 @@
         self.assertEquals('<root>&test;</root>',
                           tostring(root))
 
+    def test_entity_values(self):
+        Entity = self.etree.Entity
+        self.assertEquals(Entity("test").text, '&test;')
+        self.assertEquals(Entity("#17683").text, '&#17683;')
+        self.assertEquals(Entity("#x1768").text, '&#x1768;')
+        self.assertEquals(Entity("#x98AF").text, '&#x98AF;')
+
+    def test_entity_error(self):
+        Entity = self.etree.Entity
+        self.assertRaises(ValueError, Entity, 'a b c')
+        self.assertRaises(ValueError, Entity, 'a,b')
+        self.assertRaises(AssertionError, Entity, 'a\0b')
+        self.assertRaises(ValueError, Entity, '#abc')
+        self.assertRaises(ValueError, Entity, '#xxyz')
+
     # TypeError in etree, AssertionError in ElementTree;
     def test_setitem_assert(self):
         Element = self.etree.Element


More information about the lxml-checkins mailing list