[Lxml-checkins] r47250 - in lxml/trunk: . src/lxml src/lxml/tests

scoder at codespeak.net
Sun Oct 7 06:30:36 CEST 2007


Author: scoder
Date: Sun Oct  7 06:30:34 2007
New Revision: 47250

Modified:
   lxml/trunk/CHANGES.txt
   lxml/trunk/src/lxml/apihelpers.pxi
   lxml/trunk/src/lxml/etree.pyx
   lxml/trunk/src/lxml/tests/test_htmlparser.py
   lxml/trunk/src/lxml/tests/test_unicode.py
Log:
let tag name validation distinguish HTML/XML tags based on the related parser, allow ':' in HTML tags

Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt	(original)
+++ lxml/trunk/CHANGES.txt	Sun Oct  7 06:30:34 2007
@@ -16,9 +16,11 @@
 Other changes
 -------------
 
-* lxml.etree no longer validates unicode characters in tag names to
-  avoid rejecting HTML tags.  Only special characters like ':' and '>'
-  are rejected.
+* Tag name validation in lxml.etree (and lxml.html) now distinguishes
+  between HTML tags and XML tags based on the parser that was used to
+  parse or create them.  HTML tags no longer reject any non-ASCII
+  characters in tag names but only spaces and the special characters
+  '<>&/'.
 
 
 2.0alpha3 (2007-09-26)

Modified: lxml/trunk/src/lxml/apihelpers.pxi
==============================================================================
--- lxml/trunk/src/lxml/apihelpers.pxi	(original)
+++ lxml/trunk/src/lxml/apihelpers.pxi	Sun Oct  7 06:30:34 2007
@@ -99,7 +99,10 @@
     """
     cdef xmlNode* c_node
     ns_utf, name_utf = _getNsTag(tag)
-    _tagValidOrRaise(name_utf)
+    if parser is not None and parser._for_html:
+        _htmlTagValidOrRaise(name_utf)
+    else:
+        _tagValidOrRaise(name_utf)
     if doc is not None:
         c_doc = doc._c_doc
     elif c_doc is NULL:
@@ -147,16 +150,22 @@
 
     If 'c_doc' is also NULL, a new xmlDoc will be created.
     """
+    cdef _BaseParser parser
     cdef _Document doc
     cdef xmlNode* c_node
     cdef xmlDoc* c_doc
     if parent is None or parent._doc is None:
         return None
     ns_utf, name_utf = _getNsTag(tag)
-    _tagValidOrRaise(name_utf)
     doc = parent._doc
     c_doc = doc._c_doc
 
+    parser = doc._parser
+    if parser is not None and parser._for_html:
+        _htmlTagValidOrRaise(name_utf)
+    else:
+        _tagValidOrRaise(name_utf)
+
     c_node = _createElement(c_doc, name_utf)
     if c_node is NULL:
         python.PyErr_NoMemory()
@@ -175,6 +184,7 @@
 cdef _initNodeAttributes(xmlNode* c_node, _Document doc, attrib, extra):
     """Initialise the attributes of an element node.
     """
+    cdef bint is_html
     cdef xmlNs* c_ns
     # 'extra' is not checked here (expected to be a keyword dict)
     if attrib is not None and not hasattr(attrib, 'items'):
@@ -185,9 +195,11 @@
         else:
             attrib.update(extra)
     if attrib:
+        is_html = doc._parser._for_html
         for name, value in attrib.items():
             attr_ns_utf, attr_name_utf = _getNsTag(name)
-            _attributeValidOrRaise(attr_name_utf)
+            if not is_html:
+                _attributeValidOrRaise(attr_name_utf)
             value_utf = _utf8(value)
             if attr_ns_utf is None:
                 tree.xmlNewProp(c_node, _cstr(attr_name_utf), _cstr(value_utf))
@@ -242,7 +254,8 @@
     cdef char* c_value
     cdef char* c_tag
     ns, tag = _getNsTag(key)
-    _attributeValidOrRaise(tag)
+    if not element._doc._parser._for_html:
+        _attributeValidOrRaise(tag)
     c_tag = _cstr(tag)
     if isinstance(value, QName):
         value = _resolveQNameText(element, value)
@@ -790,13 +803,17 @@
 cdef int _pyXmlNameIsValid(name_utf8):
     return _xmlNameIsValid(_cstr(name_utf8))
 
+cdef int _pyHtmlNameIsValid(name_utf8):
+    return _htmlNameIsValid(_cstr(name_utf8))
+
 cdef int _xmlNameIsValid(char* c_name):
-    #return tree.xmlValidateNCName(c_name, 0) == 0
+    return tree.xmlValidateNCName(c_name, 0) == 0
+
+cdef int _htmlNameIsValid(char* c_name):
     if c_name is NULL or c_name[0] == c'\0':
         return 0
     while c_name[0] != c'\0':
-        if c_name[0] == c':' or \
-                c_name[0] == c'&' or \
+        if c_name[0] == c'&' or \
                 c_name[0] == c'<' or \
                 c_name[0] == c'>' or \
                 c_name[0] == c'/' or \
@@ -815,6 +832,12 @@
               python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', 'strict')
     return 0
 
+cdef int _htmlTagValidOrRaise(tag_utf) except -1:
+    if not _pyHtmlNameIsValid(tag_utf):
+        raise ValueError, "Invalid HTML tag name %r" % \
+              python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', 'strict')
+    return 0
+
 cdef int _attributeValidOrRaise(name_utf) except -1:
     if not _pyXmlNameIsValid(name_utf):
         raise ValueError, "Invalid attribute name %r" % \

Modified: lxml/trunk/src/lxml/etree.pyx
==============================================================================
--- lxml/trunk/src/lxml/etree.pyx	(original)
+++ lxml/trunk/src/lxml/etree.pyx	Sun Oct  7 06:30:34 2007
@@ -707,8 +707,13 @@
             return self._tag
     
         def __set__(self, value):
+            cdef _BaseParser parser
             ns, name = _getNsTag(value)
-            _tagValidOrRaise(name)
+            parser = self._doc._parser
+            if parser is not None and parser._for_html:
+                _htmlTagValidOrRaise(name)
+            else:
+                _tagValidOrRaise(name)
             self._tag = value
             tree.xmlNodeSetName(self._c_node, _cstr(name))
             if ns is None:

Modified: lxml/trunk/src/lxml/tests/test_htmlparser.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_htmlparser.py	(original)
+++ lxml/trunk/src/lxml/tests/test_htmlparser.py	Sun Oct  7 06:30:34 2007
@@ -39,6 +39,74 @@
         self.assertRaises(self.etree.XMLSyntaxError,
                           parse, f, parser)
 
+    def test_html_element_name_empty(self):
+        parser = self.etree.HTMLParser()
+        Element = parser.makeelement
+
+        el = Element('name')
+        self.assertRaises(ValueError, Element, '{}')
+        self.assertRaises(ValueError, setattr, el, 'tag', '{}')
+
+        self.assertRaises(ValueError, Element, '{test}')
+        self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
+
+    def test_html_element_name_colon(self):
+        parser = self.etree.HTMLParser()
+        Element = parser.makeelement
+
+        pname = Element('p:name')
+        self.assertEquals(pname.tag, 'p:name')
+
+        pname = Element('{test}p:name')
+        self.assertEquals(pname.tag, '{test}p:name')
+
+        pname = Element('name')
+        pname.tag = 'p:name'
+        self.assertEquals(pname.tag, 'p:name')
+
+    def test_html_element_name_space(self):
+        parser = self.etree.HTMLParser()
+        Element = parser.makeelement
+
+        self.assertRaises(ValueError, Element, ' name ')
+        self.assertRaises(ValueError, Element, 'na me')
+        self.assertRaises(ValueError, Element, '{test} name')
+
+        el = Element('name')
+        self.assertRaises(ValueError, setattr, el, 'tag', ' name ')
+
+    def test_html_subelement_name_empty(self):
+        parser = self.etree.HTMLParser()
+        Element = parser.makeelement
+
+        SubElement = self.etree.SubElement
+
+        el = Element('name')
+        self.assertRaises(ValueError, SubElement, el, '{}')
+        self.assertRaises(ValueError, SubElement, el, '{test}')
+
+    def test_html_subelement_name_colon(self):
+        parser = self.etree.HTMLParser()
+        Element = parser.makeelement
+        SubElement = self.etree.SubElement
+
+        el = Element('name')
+        pname = SubElement(el, 'p:name')
+        self.assertEquals(pname.tag, 'p:name')
+
+        pname = SubElement(el, '{test}p:name')
+        self.assertEquals(pname.tag, '{test}p:name')
+
+    def test_html_subelement_name_space(self):
+        parser = self.etree.HTMLParser()
+        Element = parser.makeelement
+        SubElement = self.etree.SubElement
+
+        el = Element('name')
+        self.assertRaises(ValueError, SubElement, el, ' name ')
+        self.assertRaises(ValueError, SubElement, el, 'na me')
+        self.assertRaises(ValueError, SubElement, el, '{test} name')
+
     def test_module_parse_html_norecover(self):
         parser = self.etree.HTMLParser(recover=False)
         parse = self.etree.parse

Modified: lxml/trunk/src/lxml/tests/test_unicode.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_unicode.py	(original)
+++ lxml/trunk/src/lxml/tests/test_unicode.py	Sun Oct  7 06:30:34 2007
@@ -5,9 +5,9 @@
 
 ascii_uni = u'a'
 
-# klingon = u"\uF8D2" # not valid for XML names
+klingon = u"\uF8D2" # not valid for XML names
 
-invalid_tag = "\u0680:\u3120"
+invalid_tag = "test" + klingon
 
 uni = u'Ã\u0680\u3120' # some non-ASCII characters
 


More information about the lxml-checkins mailing list