[Lxml-checkins] r44117 - in lxml/trunk: . src/lxml src/lxml/tests

scoder at codespeak.net scoder at codespeak.net
Sat Jun 9 16:55:11 CEST 2007


Author: scoder
Date: Sat Jun  9 16:55:10 2007
New Revision: 44117

Modified:
   lxml/trunk/CHANGES.txt
   lxml/trunk/src/lxml/apihelpers.pxi
   lxml/trunk/src/lxml/tests/test_etree.py
   lxml/trunk/src/lxml/tree.pxd
Log:
check incoming strings for low ASCII characters

Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt	(original)
+++ lxml/trunk/CHANGES.txt	Sat Jun  9 16:55:10 2007
@@ -50,6 +50,9 @@
 Bugs fixed
 ----------
 
+* API functions now check incoming strings for XML conformity.  Zero bytes and
+  ASCII control characters that are invalid in XML are no longer accepted.
+
 * The XML parser did not report undefined entities as error
 
 * The text in exceptions raised by XML parsers, validators and XPath

Modified: lxml/trunk/src/lxml/apihelpers.pxi
==============================================================================
--- lxml/trunk/src/lxml/apihelpers.pxi	(original)
+++ lxml/trunk/src/lxml/apihelpers.pxi	Sat Jun  9 16:55:10 2007
@@ -595,16 +595,20 @@
     cdef char* s
     cdef char* c_end
     cdef char c
+    cdef int is_non_ascii
     s = _cstr(pystring)
     c_end = s + python.PyString_GET_SIZE(pystring)
+    is_non_ascii = 0
     while s < c_end:
         c = s[0]
+        if c & 0x80:
+            is_non_ascii = 1
         if c == c'\0':
             return -1 # invalid!
-        if c & 0x80:
-            return 1  # non-ASCII
+        if is_non_ascii == 0 and not tree.xmlIsChar_ch(c):
+            return -1 # invalid!
         s = s + 1
-    return 0          # plain 7-bit ASCII
+    return is_non_ascii
 
 cdef object funicode(char* s):
     cdef Py_ssize_t slen
@@ -625,12 +629,15 @@
 cdef object _utf8(object s):
     if python.PyString_Check(s):
         assert not isutf8py(s), \
-               "All strings must be Unicode or ASCII"
-        return s
+               "All strings must be XML compatible, either Unicode or ASCII"
     elif python.PyUnicode_Check(s):
-        return python.PyUnicode_AsUTF8String(s)
+        # FIXME: we should test these strings, too ...
+        s = python.PyUnicode_AsUTF8String(s)
+        assert isutf8py(s) != -1, \
+               "All strings must be XML compatible, either Unicode or ASCII"
     else:
         raise TypeError, "Argument must be string or unicode."
+    return s
 
 cdef object _encodeFilename(object filename):
     if filename is None:

Modified: lxml/trunk/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_etree.py	(original)
+++ lxml/trunk/src/lxml/tests/test_etree.py	Sat Jun  9 16:55:10 2007
@@ -1433,6 +1433,41 @@
 
         self.assertRaises(AssertionError, Element, 'ha\0ho')
 
+    def test_unicode_byte_zero(self):
+        Element = self.etree.Element
+
+        a = Element('a')
+        self.assertRaises(AssertionError, setattr, a, "text", u'ha\0ho')
+        self.assertRaises(AssertionError, setattr, a, "tail", u'ha\0ho')
+
+        self.assertRaises(AssertionError, Element, u'ha\0ho')
+
+    def test_byte_invalid(self):
+        Element = self.etree.Element
+
+        a = Element('a')
+        self.assertRaises(AssertionError, setattr, a, "text", 'ha\x07ho')
+        self.assertRaises(AssertionError, setattr, a, "text", 'ha\x02ho')
+
+        self.assertRaises(AssertionError, setattr, a, "tail", 'ha\x07ho')
+        self.assertRaises(AssertionError, setattr, a, "tail", 'ha\x02ho')
+
+        self.assertRaises(AssertionError, Element, 'ha\x07ho')
+        self.assertRaises(AssertionError, Element, 'ha\x02ho')
+
+    def test_unicode_byte_invalid(self):
+        Element = self.etree.Element
+
+        a = Element('a')
+        self.assertRaises(AssertionError, setattr, a, "text", u'ha\x07ho')
+        self.assertRaises(AssertionError, setattr, a, "text", u'ha\x02ho')
+
+        self.assertRaises(AssertionError, setattr, a, "tail", u'ha\x07ho')
+        self.assertRaises(AssertionError, setattr, a, "tail", u'ha\x02ho')
+
+        self.assertRaises(AssertionError, Element, u'ha\x07ho')
+        self.assertRaises(AssertionError, Element, u'ha\x02ho')
+
     def test_encoding_tostring_utf16(self):
         # ElementTree fails to serialize this
         tostring = self.etree.tostring

Modified: lxml/trunk/src/lxml/tree.pxd
==============================================================================
--- lxml/trunk/src/lxml/tree.pxd	(original)
+++ lxml/trunk/src/lxml/tree.pxd	Sat Jun  9 16:55:10 2007
@@ -41,6 +41,9 @@
     cdef xmlCharEncoding xmlDetectCharEncoding(char* text, int len)
     cdef char* xmlGetCharEncodingName(xmlCharEncoding enc)
 
+cdef extern from "libxml/chvalid.h":
+    cdef int xmlIsChar_ch(char c)
+
 cdef extern from "libxml/hash.h":
     ctypedef struct xmlHashTable
     ctypedef void xmlHashScanner(void* payload, void* data, char* name)


More information about the lxml-checkins mailing list