[Lxml-checkins] r44117 - in lxml/trunk: . src/lxml src/lxml/tests
scoder at codespeak.net
scoder at codespeak.net
Sat Jun 9 16:55:11 CEST 2007
Author: scoder
Date: Sat Jun 9 16:55:10 2007
New Revision: 44117
Modified:
lxml/trunk/CHANGES.txt
lxml/trunk/src/lxml/apihelpers.pxi
lxml/trunk/src/lxml/tests/test_etree.py
lxml/trunk/src/lxml/tree.pxd
Log:
check incoming strings for low ASCII characters
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Sat Jun 9 16:55:10 2007
@@ -50,6 +50,9 @@
Bugs fixed
----------
+* API functions now check incoming strings for XML conformity. Zero bytes or
+ low ASCII characters are no longer accepted.
+
* The XML parser did not report undefined entities as error
* The text in exceptions raised by XML parsers, validators and XPath
Modified: lxml/trunk/src/lxml/apihelpers.pxi
==============================================================================
--- lxml/trunk/src/lxml/apihelpers.pxi (original)
+++ lxml/trunk/src/lxml/apihelpers.pxi Sat Jun 9 16:55:10 2007
@@ -595,16 +595,20 @@
cdef char* s
cdef char* c_end
cdef char c
+ cdef int is_non_ascii
s = _cstr(pystring)
c_end = s + python.PyString_GET_SIZE(pystring)
+ is_non_ascii = 0
while s < c_end:
c = s[0]
+ if c & 0x80:
+ is_non_ascii = 1
if c == c'\0':
return -1 # invalid!
- if c & 0x80:
- return 1 # non-ASCII
+ if is_non_ascii == 0 and not tree.xmlIsChar_ch(c):
+ return -1 # invalid!
s = s + 1
- return 0 # plain 7-bit ASCII
+ return is_non_ascii
cdef object funicode(char* s):
cdef Py_ssize_t slen
@@ -625,12 +629,15 @@
cdef object _utf8(object s):
if python.PyString_Check(s):
assert not isutf8py(s), \
- "All strings must be Unicode or ASCII"
- return s
+ "All strings must be XML compatible, either Unicode or ASCII"
elif python.PyUnicode_Check(s):
- return python.PyUnicode_AsUTF8String(s)
+ # FIXME: we should test these strings, too ...
+ s = python.PyUnicode_AsUTF8String(s)
+ assert isutf8py(s) != -1, \
+ "All strings must be XML compatible, either Unicode or ASCII"
else:
raise TypeError, "Argument must be string or unicode."
+ return s
cdef object _encodeFilename(object filename):
if filename is None:
Modified: lxml/trunk/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_etree.py (original)
+++ lxml/trunk/src/lxml/tests/test_etree.py Sat Jun 9 16:55:10 2007
@@ -1433,6 +1433,41 @@
self.assertRaises(AssertionError, Element, 'ha\0ho')
+ def test_unicode_byte_zero(self):
+ Element = self.etree.Element
+
+ a = Element('a')
+ self.assertRaises(AssertionError, setattr, a, "text", u'ha\0ho')
+ self.assertRaises(AssertionError, setattr, a, "tail", u'ha\0ho')
+
+ self.assertRaises(AssertionError, Element, u'ha\0ho')
+
+ def test_byte_invalid(self):
+ Element = self.etree.Element
+
+ a = Element('a')
+ self.assertRaises(AssertionError, setattr, a, "text", 'ha\x07ho')
+ self.assertRaises(AssertionError, setattr, a, "text", 'ha\x02ho')
+
+ self.assertRaises(AssertionError, setattr, a, "tail", 'ha\x07ho')
+ self.assertRaises(AssertionError, setattr, a, "tail", 'ha\x02ho')
+
+ self.assertRaises(AssertionError, Element, 'ha\x07ho')
+ self.assertRaises(AssertionError, Element, 'ha\x02ho')
+
+ def test_unicode_byte_invalid(self):
+ Element = self.etree.Element
+
+ a = Element('a')
+ self.assertRaises(AssertionError, setattr, a, "text", u'ha\x07ho')
+ self.assertRaises(AssertionError, setattr, a, "text", u'ha\x02ho')
+
+ self.assertRaises(AssertionError, setattr, a, "tail", u'ha\x07ho')
+ self.assertRaises(AssertionError, setattr, a, "tail", u'ha\x02ho')
+
+ self.assertRaises(AssertionError, Element, u'ha\x07ho')
+ self.assertRaises(AssertionError, Element, u'ha\x02ho')
+
def test_encoding_tostring_utf16(self):
# ElementTree fails to serialize this
tostring = self.etree.tostring
Modified: lxml/trunk/src/lxml/tree.pxd
==============================================================================
--- lxml/trunk/src/lxml/tree.pxd (original)
+++ lxml/trunk/src/lxml/tree.pxd Sat Jun 9 16:55:10 2007
@@ -41,6 +41,9 @@
cdef xmlCharEncoding xmlDetectCharEncoding(char* text, int len)
cdef char* xmlGetCharEncodingName(xmlCharEncoding enc)
+cdef extern from "libxml/chvalid.h":
+ cdef int xmlIsChar_ch(char c)
+
cdef extern from "libxml/hash.h":
ctypedef struct xmlHashTable
ctypedef void xmlHashScanner(void* payload, void* data, char* name)
More information about the lxml-checkins
mailing list