[Lxml-checkins] r47250 - in lxml/trunk: . src/lxml src/lxml/tests
scoder at codespeak.net
scoder at codespeak.net
Sun Oct 7 06:30:36 CEST 2007
Author: scoder
Date: Sun Oct 7 06:30:34 2007
New Revision: 47250
Modified:
lxml/trunk/CHANGES.txt
lxml/trunk/src/lxml/apihelpers.pxi
lxml/trunk/src/lxml/etree.pyx
lxml/trunk/src/lxml/tests/test_htmlparser.py
lxml/trunk/src/lxml/tests/test_unicode.py
Log:
let tag name validation distinguish HTML/XML tags based on the related parser, allow ':' in HTML tags
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Sun Oct 7 06:30:34 2007
@@ -16,9 +16,11 @@
Other changes
-------------
-* lxml.etree no longer validates unicode characters in tag names to
- avoid rejecting HTML tags. Only special characters like ':' and '>'
- are rejected.
+* Tag name validation in lxml.etree (and lxml.html) now distinguishes
+ between HTML tags and XML tags based on the parser that was used to
+ parse or create them. For HTML tags, non-ASCII characters in tag
+ names are no longer rejected; only spaces and the special
+ characters '<>&/' are.
2.0alpha3 (2007-09-26)
Modified: lxml/trunk/src/lxml/apihelpers.pxi
==============================================================================
--- lxml/trunk/src/lxml/apihelpers.pxi (original)
+++ lxml/trunk/src/lxml/apihelpers.pxi Sun Oct 7 06:30:34 2007
@@ -99,7 +99,10 @@
"""
cdef xmlNode* c_node
ns_utf, name_utf = _getNsTag(tag)
- _tagValidOrRaise(name_utf)
+ if parser is not None and parser._for_html:
+ _htmlTagValidOrRaise(name_utf)
+ else:
+ _tagValidOrRaise(name_utf)
if doc is not None:
c_doc = doc._c_doc
elif c_doc is NULL:
@@ -147,16 +150,22 @@
If 'c_doc' is also NULL, a new xmlDoc will be created.
"""
+ cdef _BaseParser parser
cdef _Document doc
cdef xmlNode* c_node
cdef xmlDoc* c_doc
if parent is None or parent._doc is None:
return None
ns_utf, name_utf = _getNsTag(tag)
- _tagValidOrRaise(name_utf)
doc = parent._doc
c_doc = doc._c_doc
+ parser = doc._parser
+ if parser is not None and parser._for_html:
+ _htmlTagValidOrRaise(name_utf)
+ else:
+ _tagValidOrRaise(name_utf)
+
c_node = _createElement(c_doc, name_utf)
if c_node is NULL:
python.PyErr_NoMemory()
@@ -175,6 +184,7 @@
cdef _initNodeAttributes(xmlNode* c_node, _Document doc, attrib, extra):
"""Initialise the attributes of an element node.
"""
+ cdef bint is_html
cdef xmlNs* c_ns
# 'extra' is not checked here (expected to be a keyword dict)
if attrib is not None and not hasattr(attrib, 'items'):
@@ -185,9 +195,11 @@
else:
attrib.update(extra)
if attrib:
+ is_html = doc._parser._for_html
for name, value in attrib.items():
attr_ns_utf, attr_name_utf = _getNsTag(name)
- _attributeValidOrRaise(attr_name_utf)
+ if not is_html:
+ _attributeValidOrRaise(attr_name_utf)
value_utf = _utf8(value)
if attr_ns_utf is None:
tree.xmlNewProp(c_node, _cstr(attr_name_utf), _cstr(value_utf))
@@ -242,7 +254,8 @@
cdef char* c_value
cdef char* c_tag
ns, tag = _getNsTag(key)
- _attributeValidOrRaise(tag)
+ if not element._doc._parser._for_html:
+ _attributeValidOrRaise(tag)
c_tag = _cstr(tag)
if isinstance(value, QName):
value = _resolveQNameText(element, value)
@@ -790,13 +803,17 @@
cdef int _pyXmlNameIsValid(name_utf8):
return _xmlNameIsValid(_cstr(name_utf8))
+cdef int _pyHtmlNameIsValid(name_utf8):
+ return _htmlNameIsValid(_cstr(name_utf8))
+
cdef int _xmlNameIsValid(char* c_name):
- #return tree.xmlValidateNCName(c_name, 0) == 0
+ return tree.xmlValidateNCName(c_name, 0) == 0
+
+cdef int _htmlNameIsValid(char* c_name):
if c_name is NULL or c_name[0] == c'\0':
return 0
while c_name[0] != c'\0':
- if c_name[0] == c':' or \
- c_name[0] == c'&' or \
+ if c_name[0] == c'&' or \
c_name[0] == c'<' or \
c_name[0] == c'>' or \
c_name[0] == c'/' or \
@@ -815,6 +832,12 @@
python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', 'strict')
return 0
+cdef int _htmlTagValidOrRaise(tag_utf) except -1:
+ if not _pyHtmlNameIsValid(tag_utf):
+ raise ValueError, "Invalid HTML tag name %r" % \
+ python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', 'strict')
+ return 0
+
cdef int _attributeValidOrRaise(name_utf) except -1:
if not _pyXmlNameIsValid(name_utf):
raise ValueError, "Invalid attribute name %r" % \
Modified: lxml/trunk/src/lxml/etree.pyx
==============================================================================
--- lxml/trunk/src/lxml/etree.pyx (original)
+++ lxml/trunk/src/lxml/etree.pyx Sun Oct 7 06:30:34 2007
@@ -707,8 +707,13 @@
return self._tag
def __set__(self, value):
+ cdef _BaseParser parser
ns, name = _getNsTag(value)
- _tagValidOrRaise(name)
+ parser = self._doc._parser
+ if parser is not None and parser._for_html:
+ _htmlTagValidOrRaise(name)
+ else:
+ _tagValidOrRaise(name)
self._tag = value
tree.xmlNodeSetName(self._c_node, _cstr(name))
if ns is None:
Modified: lxml/trunk/src/lxml/tests/test_htmlparser.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_htmlparser.py (original)
+++ lxml/trunk/src/lxml/tests/test_htmlparser.py Sun Oct 7 06:30:34 2007
@@ -39,6 +39,74 @@
self.assertRaises(self.etree.XMLSyntaxError,
parse, f, parser)
+ def test_html_element_name_empty(self):
+ parser = self.etree.HTMLParser()
+ Element = parser.makeelement
+
+ el = Element('name')
+ self.assertRaises(ValueError, Element, '{}')
+ self.assertRaises(ValueError, setattr, el, 'tag', '{}')
+
+ self.assertRaises(ValueError, Element, '{test}')
+ self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
+
+ def test_html_element_name_colon(self):
+ parser = self.etree.HTMLParser()
+ Element = parser.makeelement
+
+ pname = Element('p:name')
+ self.assertEquals(pname.tag, 'p:name')
+
+ pname = Element('{test}p:name')
+ self.assertEquals(pname.tag, '{test}p:name')
+
+ pname = Element('name')
+ pname.tag = 'p:name'
+ self.assertEquals(pname.tag, 'p:name')
+
+ def test_html_element_name_space(self):
+ parser = self.etree.HTMLParser()
+ Element = parser.makeelement
+
+ self.assertRaises(ValueError, Element, ' name ')
+ self.assertRaises(ValueError, Element, 'na me')
+ self.assertRaises(ValueError, Element, '{test} name')
+
+ el = Element('name')
+ self.assertRaises(ValueError, setattr, el, 'tag', ' name ')
+
+ def test_html_subelement_name_empty(self):
+ parser = self.etree.HTMLParser()
+ Element = parser.makeelement
+
+ SubElement = self.etree.SubElement
+
+ el = Element('name')
+ self.assertRaises(ValueError, SubElement, el, '{}')
+ self.assertRaises(ValueError, SubElement, el, '{test}')
+
+ def test_html_subelement_name_colon(self):
+ parser = self.etree.HTMLParser()
+ Element = parser.makeelement
+ SubElement = self.etree.SubElement
+
+ el = Element('name')
+ pname = SubElement(el, 'p:name')
+ self.assertEquals(pname.tag, 'p:name')
+
+ pname = SubElement(el, '{test}p:name')
+ self.assertEquals(pname.tag, '{test}p:name')
+
+ def test_html_subelement_name_space(self):
+ parser = self.etree.HTMLParser()
+ Element = parser.makeelement
+ SubElement = self.etree.SubElement
+
+ el = Element('name')
+ self.assertRaises(ValueError, SubElement, el, ' name ')
+ self.assertRaises(ValueError, SubElement, el, 'na me')
+ self.assertRaises(ValueError, SubElement, el, '{test} name')
+
def test_module_parse_html_norecover(self):
parser = self.etree.HTMLParser(recover=False)
parse = self.etree.parse
Modified: lxml/trunk/src/lxml/tests/test_unicode.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_unicode.py (original)
+++ lxml/trunk/src/lxml/tests/test_unicode.py Sun Oct 7 06:30:34 2007
@@ -5,9 +5,9 @@
ascii_uni = u'a'
-# klingon = u"\uF8D2" # not valid for XML names
+klingon = u"\uF8D2" # not valid for XML names
-invalid_tag = "\u0680:\u3120"
+invalid_tag = "test" + klingon
uni = u'Ã\u0680\u3120' # some non-ASCII characters
More information about the lxml-checkins
mailing list