From scoder at codespeak.net Sun Jul 1 15:00:45 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 1 Jul 2007 15:00:45 +0200 (CEST) Subject: [Lxml-checkins] r44649 - lxml/trunk/src/lxml Message-ID: <20070701130045.C5AC780C6@code0.codespeak.net> Author: scoder Date: Sun Jul 1 15:00:43 2007 New Revision: 44649 Modified: lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/python.pxd Log: small optimisation in _Element.nsmap Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Sun Jul 1 15:00:43 2007 @@ -739,15 +739,16 @@ cdef xmlNs* c_ns nsmap = {} c_node = self._c_node - while c_node is not NULL and _isElement(c_node): + while c_node is not NULL and c_node.type == tree.XML_ELEMENT_NODE: c_ns = c_node.nsDef while c_ns is not NULL: if c_ns.prefix is NULL: prefix = None else: prefix = funicode(c_ns.prefix) - if prefix not in nsmap: - nsmap[prefix] = funicode(c_ns.href) + if not python.PyDict_Contains(nsmap, prefix): + python.PyDict_SetItem( + nsmap, prefix, funicode(c_ns.href)) c_ns = c_ns.next c_node = c_node.parent return nsmap Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Sun Jul 1 15:00:43 2007 @@ -55,6 +55,7 @@ cdef int PyDict_DelItem(object d, object key) except -1 cdef void PyDict_Clear(object d) cdef object PyDict_Copy(object d) + cdef int PyDict_Contains(object d, object key) except -1 cdef Py_ssize_t PyDict_Size(object d) cdef object PySequence_List(object o) cdef object PySequence_Tuple(object o) From scoder at codespeak.net Sun Jul 1 15:05:00 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 1 Jul 2007 15:05:00 +0200 (CEST) Subject: [Lxml-checkins] r44650 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20070701130500.93E4380C6@code0.codespeak.net> Author: scoder Date: Sun Jul 1 15:05:00 2007 New Revision: 44650 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/objectify.pyx lxml/trunk/src/lxml/tests/test_objectify.py Log: objectify: loads of test updates by Holger (merged from 1.3 branch), fixes for passing None and Element values to DataElement(), type checking in DataElement() Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sun Jul 1 15:05:00 2007 @@ -8,6 +8,9 @@ Features added -------------- +* objectify.DataElement now supports setting values from existing data + elements (not just plain Python types) and reuses defined namespaces etc. + * E-factory support for lxml.objectify (``objectify.E``) * Entity support through an ``Entity`` factory and element classes. XML @@ -30,6 +33,10 @@ Bugs fixed ---------- +* objectify.DataElement didn't set up None value correctly + +* objectify.DataElement didn't check the value against the provided type hints + * Reference-counting bug in ``Element.attrib.pop()`` * The XML parser did not report undefined entities as error Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Sun Jul 1 15:05:00 2007 @@ -1918,10 +1918,34 @@ cdef python.PyObject* dict_result if nsmap is None: nsmap = _DEFAULT_NSMAP - if attrib is not None: + if attrib is not None and attrib: if python.PyDict_Size(_attributes): + attrib = dict(attrib) attrib.update(_attributes) _attributes = attrib + if isinstance(_value, ObjectifiedDataElement): + # reuse existing nsmap unless redefined in nsmap parameter + temp = _value.nsmap + if temp is not None and temp: + temp = dict(_value.nsmap) + temp.update(nsmap) + nsmap = temp + # reuse existing 
attributes unless redefined in attrib/_attributes + temp = _value.attrib + if temp is not None and temp: + temp = dict(_value.attrib) + temp.update(_attributes) + _attributes = temp + # reuse existing xsi:type or py:pytype attributes, unless provided as + # arguments + if _xsi is None and _pytype is None: + dict_result = python.PyDict_GetItem(_attributes, + XML_SCHEMA_INSTANCE_TYPE_ATTR) + if dict_result is not NULL: + _xsi = dict_result + dict_result = python.PyDict_GetItem(_attributes, PYTYPE_ATTRIBUTE) + if dict_result is not NULL: + _pytype = dict_result if _xsi is not None: if ':' in _xsi: prefix, name = _xsi.split(':', 1) @@ -1956,23 +1980,34 @@ strval = "true" else: strval = "false" + elif _value is None: + strval = None else: strval = str(_value) if _pytype is None: - for type_check, pytype in _TYPE_CHECKS: - try: - type_check(strval) - _pytype = (pytype).name - break - except IGNORABLE_ERRORS: - pass + if strval is not None: + for type_check, pytype in _TYPE_CHECKS: + try: + type_check(strval) + _pytype = (pytype).name + break + except IGNORABLE_ERRORS: + pass if _pytype is None: if _value is None: - _pytype = "none" + python.PyDict_SetItem(_attributes, XML_SCHEMA_INSTANCE_NIL_ATTR, "true") elif python._isString(_value): _pytype = "str" - if _pytype is not None: + else: + # check if type information from arguments is valid + dict_result = python.PyDict_GetItem(_PYTYPE_DICT, _pytype) + if dict_result is not NULL: + type_check = (dict_result).type_check + if type_check is not None: + type_check(strval) + + if _pytype is not None: python.PyDict_SetItem(_attributes, PYTYPE_ATTRIBUTE, _pytype) return _makeElement("value", strval, _attributes, nsmap) Modified: lxml/trunk/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_objectify.py (original) +++ lxml/trunk/src/lxml/tests/test_objectify.py Sun Jul 1 15:05:00 2007 @@ -13,10 +13,32 @@ from lxml import objectify 
+PYTYPE_NAMESPACE = "http://codespeak.net/lxml/objectify/pytype" XML_SCHEMA_NS = "http://www.w3.org/2001/XMLSchema" XML_SCHEMA_INSTANCE_NS = "http://www.w3.org/2001/XMLSchema-instance" XML_SCHEMA_INSTANCE_TYPE_ATTR = "{%s}type" % XML_SCHEMA_INSTANCE_NS XML_SCHEMA_NIL_ATTR = "{%s}nil" % XML_SCHEMA_INSTANCE_NS +DEFAULT_NSMAP = { "py" : PYTYPE_NAMESPACE, + "xsi" : XML_SCHEMA_INSTANCE_NS, + "xsd" : XML_SCHEMA_NS} + +objectclass2xsitype = { + # objectify built-in + objectify.IntElement: ("int", "short", "byte", "unsignedShort", + "unsignedByte",), + objectify.LongElement: ("integer", "nonPositiveInteger", "negativeInteger", + "long", "nonNegativeInteger", "unsignedLong", + "unsignedInt", "positiveInteger",), + objectify.FloatElement: ("float", "double"), + objectify.BoolElement: ("boolean",), + objectify.StringElement: ("string", "normalizedString", "token", "language", + "Name", "NCName", "ID", "IDREF", "ENTITY", + "NMTOKEN", ), + # None: xsi:nil="true" + } + +xsitype2objclass = dict(( (v, k) for k in objectclass2xsitype + for v in objectclass2xsitype[k] )) xml_str = '''\ @@ -52,6 +74,205 @@ self.etree.Namespace("otherNS").clear() objectify.setPytypeAttributeTag() + def test_element_nsmap_default(self): + elt = objectify.Element("test") + self.assertEquals(elt.nsmap, DEFAULT_NSMAP) + + def test_element_nsmap_empty(self): + nsmap = {} + elt = objectify.Element("test", nsmap=nsmap) + self.assertEquals(elt.nsmap.values(), [PYTYPE_NAMESPACE]) + + def test_element_nsmap_custom_prefixes(self): + nsmap = {"mypy": PYTYPE_NAMESPACE, + "myxsi": XML_SCHEMA_INSTANCE_NS, + "myxsd": XML_SCHEMA_NS} + elt = objectify.Element("test", nsmap=nsmap) + self.assertEquals(elt.nsmap, nsmap) + + def test_element_nsmap_custom(self): + nsmap = {"my": "someNS", + "myother": "someOtherNS", + "myxsd": XML_SCHEMA_NS} + elt = objectify.Element("test", nsmap=nsmap) + self.assert_(PYTYPE_NAMESPACE in elt.nsmap.values()) + for prefix, ns in nsmap.items(): + self.assert_(prefix in elt.nsmap) + 
self.assertEquals(nsmap[prefix], elt.nsmap[prefix]) + + def test_sub_element_nsmap_default(self): + root = objectify.Element("root") + root.sub = objectify.Element("test") + self.assertEquals(root.sub.nsmap, DEFAULT_NSMAP) + + def test_sub_element_nsmap_empty(self): + root = objectify.Element("root") + nsmap = {} + root.sub = objectify.Element("test", nsmap=nsmap) + self.assertEquals(root.sub.nsmap, DEFAULT_NSMAP) + + def test_sub_element_nsmap_custom_prefixes(self): + root = objectify.Element("root") + nsmap = {"mypy": PYTYPE_NAMESPACE, + "myxsi": XML_SCHEMA_INSTANCE_NS, + "myxsd": XML_SCHEMA_NS} + root.sub = objectify.Element("test", nsmap=nsmap) + self.assertEquals(root.sub.nsmap, DEFAULT_NSMAP) + + def test_sub_element_nsmap_custom(self): + root = objectify.Element("root") + nsmap = {"my": "someNS", + "myother": "someOtherNS", + "myxsd": XML_SCHEMA_NS,} + root.sub = objectify.Element("test", nsmap=nsmap) + expected = nsmap.copy() + del expected["myxsd"] + expected.update(DEFAULT_NSMAP) + self.assertEquals(root.sub.nsmap, expected) + + def test_data_element_nsmap_default(self): + value = objectify.DataElement("test this") + self.assertEquals(value.nsmap, DEFAULT_NSMAP) + + def test_data_element_nsmap_empty(self): + nsmap = {} + value = objectify.DataElement("test this", nsmap=nsmap) + self.assertEquals(value.nsmap.values(), [PYTYPE_NAMESPACE]) + + def test_data_element_nsmap_custom_prefixes(self): + nsmap = {"mypy": PYTYPE_NAMESPACE, + "myxsi": XML_SCHEMA_INSTANCE_NS, + "myxsd": XML_SCHEMA_NS} + value = objectify.DataElement("test this", nsmap=nsmap) + self.assertEquals(value.nsmap, nsmap) + + def test_data_element_nsmap_custom(self): + nsmap = {"my": "someNS", + "myother": "someOtherNS", + "myxsd": XML_SCHEMA_NS,} + value = objectify.DataElement("test", nsmap=nsmap) + self.assert_(PYTYPE_NAMESPACE in value.nsmap.values()) + for prefix, ns in nsmap.items(): + self.assert_(prefix in value.nsmap) + self.assertEquals(nsmap[prefix], value.nsmap[prefix]) + + def 
test_sub_data_element_nsmap_default(self): + root = objectify.Element("root") + root.value = objectify.DataElement("test this") + self.assertEquals(root.value.nsmap, DEFAULT_NSMAP) + + def test_sub_data_element_nsmap_empty(self): + root = objectify.Element("root") + nsmap = {} + root.value = objectify.DataElement("test this", nsmap=nsmap) + self.assertEquals(root.value.nsmap, DEFAULT_NSMAP) + + def test_sub_data_element_nsmap_custom_prefixes(self): + root = objectify.Element("root") + nsmap = {"mypy": PYTYPE_NAMESPACE, + "myxsi": XML_SCHEMA_INSTANCE_NS, + "myxsd": XML_SCHEMA_NS} + root.value = objectify.DataElement("test this", nsmap=nsmap) + self.assertEquals(root.value.nsmap, DEFAULT_NSMAP) + + def test_sub_data_element_nsmap_custom(self): + root = objectify.Element("root") + nsmap = {"my": "someNS", + "myother": "someOtherNS", + "myxsd": XML_SCHEMA_NS} + root.value = objectify.DataElement("test", nsmap=nsmap) + expected = nsmap.copy() + del expected["myxsd"] + expected.update(DEFAULT_NSMAP) + self.assertEquals(root.value.nsmap, expected) + + def test_data_element_attrib_attributes_precedence(self): + # keyword arguments override attrib entries + value = objectify.DataElement(23, _pytype="str", _xsi="foobar", + attrib={"gnu": "muh", "cat": "meeow", + "dog": "wuff"}, + bird="tchilp", dog="grrr") + self.assertEquals(value.get("gnu"), "muh") + self.assertEquals(value.get("cat"), "meeow") + self.assertEquals(value.get("dog"), "grrr") + self.assertEquals(value.get("bird"), "tchilp") + + def test_data_element_data_element_arg(self): + # Check that DataElement preserves all attributes ObjectifiedDataElement + # arguments + arg = objectify.DataElement(23, _pytype="str", _xsi="foobar", + attrib={"gnu": "muh", "cat": "meeow", + "dog": "wuff"}, + bird="tchilp", dog="grrr") + value = objectify.DataElement(arg) + self.assert_(isinstance(value, objectify.StringElement)) + for attr in arg.attrib: + self.assertEquals(value.get(attr), arg.get(attr)) + + def 
test_data_element_data_element_arg_pytype(self): + # Check that _pytype arg overrides original py:pytype of + # ObjectifiedDataElement + arg = objectify.DataElement(23, _pytype="str", _xsi="foobar", + attrib={"gnu": "muh", "cat": "meeow", + "dog": "wuff"}, + bird="tchilp", dog="grrr") + value = objectify.DataElement(arg, _pytype="int") + self.assert_(isinstance(value, objectify.IntElement)) + self.assertEquals(value.get(objectify.PYTYPE_ATTRIBUTE), "int") + for attr in arg.attrib: + if not attr == objectify.PYTYPE_ATTRIBUTE: + self.assertEquals(value.get(attr), arg.get(attr)) + + def test_data_element_data_element_arg_xsitype(self): + # Check that _xsi arg overrides original xsi:type of given + # ObjectifiedDataElement + arg = objectify.DataElement(23, _pytype="str", _xsi="foobar", + attrib={"gnu": "muh", "cat": "meeow", + "dog": "wuff"}, + bird="tchilp", dog="grrr") + value = objectify.DataElement(arg, _xsi="xsd:int") + self.assert_(isinstance(value, objectify.IntElement)) + self.assertEquals(value.get(XML_SCHEMA_INSTANCE_TYPE_ATTR), "xsd:int") + self.assertEquals(value.get(objectify.PYTYPE_ATTRIBUTE), "int") + for attr in arg.attrib: + if not attr in [objectify.PYTYPE_ATTRIBUTE, + XML_SCHEMA_INSTANCE_TYPE_ATTR]: + self.assertEquals(value.get(attr), arg.get(attr)) + + def test_data_element_data_element_arg_pytype_xsitype(self): + # Check that _pytype and _xsi args override original py:pytype and + # xsi:type attributes of given ObjectifiedDataElement + arg = objectify.DataElement(23, _pytype="str", _xsi="foobar", + attrib={"gnu": "muh", "cat": "meeow", + "dog": "wuff"}, + bird="tchilp", dog="grrr") + value = objectify.DataElement(arg, _pytype="int", _xsi="xsd:int") + self.assert_(isinstance(value, objectify.IntElement)) + self.assertEquals(value.get(objectify.PYTYPE_ATTRIBUTE), "int") + self.assertEquals(value.get(XML_SCHEMA_INSTANCE_TYPE_ATTR), "xsd:int") + for attr in arg.attrib: + if not attr in [objectify.PYTYPE_ATTRIBUTE, + XML_SCHEMA_INSTANCE_TYPE_ATTR]: + 
self.assertEquals(value.get(attr), arg.get(attr)) + + def test_data_element_invalid_pytype(self): + self.assertRaises(ValueError, objectify.DataElement, 3.1415, + _pytype="int") + + def test_data_element_invalid_xsi(self): + self.assertRaises(ValueError, objectify.DataElement, 3.1415, + _xsi="xsd:int") + + def test_data_element_data_element_arg_invalid_pytype(self): + arg = objectify.DataElement(3.1415) + self.assertRaises(ValueError, objectify.DataElement, arg, + _pytype="int") + + def test_data_element_data_element_arg_invalid_xsi(self): + arg = objectify.DataElement(3.1415) + self.assertRaises(ValueError, objectify.DataElement, arg, + _xsi="xsd:int") + def test_root(self): root = self.Element("test") self.assert_(isinstance(root, objectify.ObjectifiedElement)) @@ -268,7 +489,7 @@ Element = self.Element SubElement = self.etree.SubElement - nil_attr = "{http://www.w3.org/2001/XMLSchema-instance}nil" + nil_attr = XML_SCHEMA_NIL_ATTR root = Element("{objectified}root") SubElement(root, "{objectified}none") SubElement(root, "{objectified}none", {nil_attr : "true"}) @@ -282,13 +503,19 @@ value = objectify.DataElement(None) self.assert_(isinstance(value, objectify.NoneElement)) self.assertEquals(value, None) + self.assertEquals(value.get(XML_SCHEMA_NIL_ATTR), "true") def test_type_bool(self): Element = self.Element SubElement = self.etree.SubElement root = Element("{objectified}root") - root.none = 'true' - self.assert_(isinstance(root.none, objectify.BoolElement)) + root.bool = 'true' + self.assert_(isinstance(root.bool, objectify.BoolElement)) + self.assertEquals(root.bool, True) + + root.bool = 'false' + self.assert_(isinstance(root.bool, objectify.BoolElement)) + self.assertEquals(root.bool, False) def test_data_element_bool(self): value = objectify.DataElement(True) @@ -357,6 +584,24 @@ self.assert_(isinstance(value, objectify.FloatElement)) self.assertEquals(value, 5.5) + def test_data_element_xsitypes(self): + for xsi, objclass in xsitype2objclass.iteritems(): + 
# 1 is a valid value for all ObjectifiedDataElement classes + value = objectify.DataElement(1, _xsi=xsi) + self.assert_(isinstance(value, objclass)) + + def test_data_element_xsitypes_xsdprefixed(self): + for xsi, objclass in xsitype2objclass.iteritems(): + # 1 is a valid value for all ObjectifiedDataElement classes + value = objectify.DataElement(1, _xsi="xsd:%s" % xsi) + self.assert_(isinstance(value, objclass)) + + def test_data_element_xsitypes_prefixed(self): + for xsi, objclass in xsitype2objclass.iteritems(): + # 1 is a valid value for all ObjectifiedDataElement classes + self.assertRaises(ValueError, objectify.DataElement, 1, + _xsi="foo:%s" % xsi) + def test_schema_types(self): XML = self.XML root = XML('''\ @@ -401,9 +646,9 @@ for b in root.b: self.assert_(isinstance(b, objectify.BoolElement)) - self.assertEquals(True, root.b[0]) + self.assertEquals(True, root.b[0]) self.assertEquals(False, root.b[1]) - self.assertEquals(True, root.b[2]) + self.assertEquals(True, root.b[2]) self.assertEquals(False, root.b[3]) for f in root.f: @@ -416,7 +661,7 @@ for l in root.l: self.assert_(isinstance(l, objectify.LongElement)) - self.assertEquals(5l, l) + self.assertEquals(5L, l) for i in root.i: self.assert_(isinstance(i, objectify.IntElement)) @@ -425,6 +670,75 @@ self.assert_(isinstance(root.n, objectify.NoneElement)) self.assertEquals(None, root.n) + def test_schema_types_prefixed(self): + XML = self.XML + root = XML('''\ + + true + false + 1 + 0 + + 5 + 5 + + 5 + 5 + 5 + 5 + 5 + 5 + 5 + 5 + 5 + 5 + + 5 + 5 + 5 + 5 + 5 + 5 + 5 + 5 + + 5 + 5 + 5 + 5 + 5 + + + + ''') + + for b in root.b: + self.assert_(isinstance(b, objectify.BoolElement)) + self.assertEquals(True, root.b[0]) + self.assertEquals(False, root.b[1]) + self.assertEquals(True, root.b[2]) + self.assertEquals(False, root.b[3]) + + for f in root.f: + self.assert_(isinstance(f, objectify.FloatElement)) + self.assertEquals(5, f) + + for s in root.s: + self.assert_(isinstance(s, objectify.StringElement)) + 
self.assertEquals("5", s) + + for l in root.l: + self.assert_(isinstance(l, objectify.LongElement)) + self.assertEquals(5L, l) + + for i in root.i: + self.assert_(isinstance(i, objectify.IntElement)) + self.assertEquals(5, i) + + self.assert_(isinstance(root.n, objectify.NoneElement)) + self.assertEquals(None, root.n) + def test_type_str_sequence(self): XML = self.XML root = XML(u'whytry') @@ -539,19 +853,19 @@ child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE) for c in root.iterchildren() ] - self.assertEquals("int", child_types[0]) - self.assertEquals("str", child_types[1]) - self.assertEquals("float", child_types[2]) - self.assertEquals("str", child_types[3]) - self.assertEquals("bool", child_types[4]) - self.assertEquals("none", child_types[5]) - self.assertEquals(None, child_types[6]) - self.assertEquals("float", child_types[7]) - self.assertEquals("float", child_types[8]) - self.assertEquals("str", child_types[9]) - self.assertEquals("int", child_types[10]) - self.assertEquals("int", child_types[11]) - self.assertEquals("int", child_types[12]) + self.assertEquals("int", child_types[ 0]) + self.assertEquals("str", child_types[ 1]) + self.assertEquals("float", child_types[ 2]) + self.assertEquals("str", child_types[ 3]) + self.assertEquals("bool", child_types[ 4]) + self.assertEquals("none", child_types[ 5]) + self.assertEquals(None, child_types[ 6]) + self.assertEquals("float", child_types[ 7]) + self.assertEquals("float", child_types[ 8]) + self.assertEquals("str", child_types[ 9]) + self.assertEquals("int", child_types[10]) + self.assertEquals("int", child_types[11]) + self.assertEquals("int", child_types[12]) self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) @@ -771,7 +1085,8 @@ XML = self.XML root = XML(u'''\ + xmlns:py="http://codespeak.net/lxml/objectify/pytype" + xmlns:xsd="http://www.w3.org/2001/XMLSchema"> 5 test 1.1 @@ -779,9 +1094,9 @@ true - 5 - 5 - 23 + 5 + 5 + 23 42 300 2 @@ -811,6 +1126,51 @@ for c in root.getiterator(): 
self.assertEquals(None, c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR)) + def test_pytype_deannotate(self): + XML = self.XML + root = XML(u'''\ + + 5 + test + 1.1 + \uF8D2 + true + + + 5 + 5 + 23 + 42 + 300 + 2 + + ''') + objectify.annotate(root) + objectify.deannotate(root, xsi=False) + + child_types = [ c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR) + for c in root.iterchildren() ] + self.assertEquals("xsd:int", child_types[ 0]) + self.assertEquals("xsd:string", child_types[ 1]) + self.assertEquals("xsd:float", child_types[ 2]) + self.assertEquals("xsd:string", child_types[ 3]) + self.assertEquals("xsd:boolean", child_types[ 4]) + self.assertEquals(None, child_types[ 5]) + self.assertEquals(None, child_types[ 6]) + self.assertEquals("xsd:double", child_types[ 7]) + self.assertEquals("xsd:float", child_types[ 8]) + self.assertEquals("xsd:string", child_types[ 9]) + self.assertEquals("xsd:string", child_types[10]) + self.assertEquals("xsd:float", child_types[11]) + self.assertEquals("xsd:long", child_types[12]) + + self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) + + for c in root.getiterator(): + self.assertEquals(None, c.get(objectify.PYTYPE_ATTRIBUTE)) + def test_change_pytype_attribute(self): XML = self.XML From scoder at codespeak.net Sun Jul 1 15:15:15 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 1 Jul 2007 15:15:15 +0200 (CEST) Subject: [Lxml-checkins] r44651 - in lxml/branch/lxml-1.3: . 
src/lxml src/lxml/tests Message-ID: <20070701131515.35A5D80C8@code0.codespeak.net> Author: scoder Date: Sun Jul 1 15:15:14 2007 New Revision: 44651 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/src/lxml/objectify.pyx lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py Log: objectify: loads of test updates by Holger (merged from 1.3 branch), fixes for passing None and Element values to DataElement(), type checking in DataElement() Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Sun Jul 1 15:15:14 2007 @@ -8,9 +8,16 @@ Features added -------------- +* objectify.DataElement now supports setting values from existing data + elements (not just plain Python types) and reuses defined namespaces etc. + Bugs fixed ---------- +* objectify.DataElement didn't set up None value correctly + +* objectify.DataElement didn't check the value against the provided type hints + * Reference-counting bug in ``Element.attrib.pop()`` Modified: lxml/branch/lxml-1.3/src/lxml/objectify.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/objectify.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/objectify.pyx Sun Jul 1 15:15:14 2007 @@ -1667,10 +1667,34 @@ cdef python.PyObject* dict_result if nsmap is None: nsmap = _DEFAULT_NSMAP - if attrib is not None: + if attrib is not None and attrib: if python.PyDict_Size(_attributes): + attrib = dict(attrib) attrib.update(_attributes) _attributes = attrib + if isinstance(_value, ObjectifiedDataElement): + # reuse existing nsmap unless redefined in nsmap parameter + temp = _value.nsmap + if temp is not None and temp: + temp = dict(_value.nsmap) + temp.update(nsmap) + nsmap = temp + # reuse existing attributes unless redefined in attrib/_attributes + temp = _value.attrib + if temp is not None and temp: + temp = 
dict(_value.attrib) + temp.update(_attributes) + _attributes = temp + # reuse existing xsi:type or py:pytype attributes, unless provided as + # arguments + if _xsi is None and _pytype is None: + dict_result = python.PyDict_GetItem(_attributes, + XML_SCHEMA_INSTANCE_TYPE_ATTR) + if dict_result is not NULL: + _xsi = dict_result + dict_result = python.PyDict_GetItem(_attributes, PYTYPE_ATTRIBUTE) + if dict_result is not NULL: + _pytype = dict_result if _xsi is not None: if ':' in _xsi: prefix, name = _xsi.split(':', 1) @@ -1693,23 +1717,34 @@ strval = "true" else: strval = "false" + elif _value is None: + strval = None else: strval = str(_value) if _pytype is None: - for type_check, pytype in _TYPE_CHECKS: - try: - type_check(strval) - _pytype = (pytype).name - break - except IGNORABLE_ERRORS: - pass + if strval is not None: + for type_check, pytype in _TYPE_CHECKS: + try: + type_check(strval) + _pytype = (pytype).name + break + except IGNORABLE_ERRORS: + pass if _pytype is None: if _value is None: - _pytype = "none" + python.PyDict_SetItem(_attributes, XML_SCHEMA_INSTANCE_NIL_ATTR, "true") elif python._isString(_value): _pytype = "str" - if _pytype is not None: + else: + # check if type information from arguments is valid + dict_result = python.PyDict_GetItem(_PYTYPE_DICT, _pytype) + if dict_result is not NULL: + type_check = (dict_result).type_check + if type_check is not None: + type_check(strval) + + if _pytype is not None: python.PyDict_SetItem(_attributes, PYTYPE_ATTRIBUTE, _pytype) return _makeElement("value", strval, _attributes, nsmap) Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py Sun Jul 1 15:15:14 2007 @@ -74,116 +74,205 @@ self.etree.Namespace("otherNS").clear() objectify.setPytypeAttributeTag() - def test_element_nsmap(self): - # 
default nsmap - root = objectify.Element("test") - self.assertEquals(root.nsmap, DEFAULT_NSMAP) + def test_element_nsmap_default(self): + elt = objectify.Element("test") + self.assertEquals(elt.nsmap, DEFAULT_NSMAP) - # empty nsmap + def test_element_nsmap_empty(self): nsmap = {} - root = objectify.Element("test", nsmap=nsmap) - self.assertEquals(root.nsmap.values(), [PYTYPE_NAMESPACE]) + elt = objectify.Element("test", nsmap=nsmap) + self.assertEquals(elt.nsmap.values(), [PYTYPE_NAMESPACE]) - # nsmap with custom prefixes - nsmap = {"mypy" : PYTYPE_NAMESPACE, - "myxsi" : XML_SCHEMA_INSTANCE_NS, - "myxsd" : XML_SCHEMA_NS} - root = objectify.Element("test", nsmap=nsmap) - self.assertEquals(root.nsmap, nsmap) + def test_element_nsmap_custom_prefixes(self): + nsmap = {"mypy": PYTYPE_NAMESPACE, + "myxsi": XML_SCHEMA_INSTANCE_NS, + "myxsd": XML_SCHEMA_NS} + elt = objectify.Element("test", nsmap=nsmap) + self.assertEquals(elt.nsmap, nsmap) - # custom nsmap - nsmap = {"my" : "someNS", - "myother" : "someOtherNS", - } - root = objectify.Element("test", nsmap=nsmap) - self.assert_(PYTYPE_NAMESPACE in root.nsmap.values()) + def test_element_nsmap_custom(self): + nsmap = {"my": "someNS", + "myother": "someOtherNS", + "myxsd": XML_SCHEMA_NS} + elt = objectify.Element("test", nsmap=nsmap) + self.assert_(PYTYPE_NAMESPACE in elt.nsmap.values()) for prefix, ns in nsmap.items(): - self.assert_(prefix in root.nsmap) - self.assertEquals(nsmap[prefix], root.nsmap[prefix]) + self.assert_(prefix in elt.nsmap) + self.assertEquals(nsmap[prefix], elt.nsmap[prefix]) - def test_sub_element_nsmap(self): + def test_sub_element_nsmap_default(self): root = objectify.Element("root") - # default nsmap root.sub = objectify.Element("test") self.assertEquals(root.sub.nsmap, DEFAULT_NSMAP) - # empty nsmap + def test_sub_element_nsmap_empty(self): + root = objectify.Element("root") nsmap = {} root.sub = objectify.Element("test", nsmap=nsmap) self.assertEquals(root.sub.nsmap, DEFAULT_NSMAP) - # nsmap 
with custom prefixes - nsmap = {"mypy" : PYTYPE_NAMESPACE, - "myxsi" : XML_SCHEMA_INSTANCE_NS, - "myxsd" : XML_SCHEMA_NS} + def test_sub_element_nsmap_custom_prefixes(self): + root = objectify.Element("root") + nsmap = {"mypy": PYTYPE_NAMESPACE, + "myxsi": XML_SCHEMA_INSTANCE_NS, + "myxsd": XML_SCHEMA_NS} root.sub = objectify.Element("test", nsmap=nsmap) self.assertEquals(root.sub.nsmap, DEFAULT_NSMAP) - # custom nsmap - nsmap = {"my" : "someNS", - "myother" : "someOtherNS", - } + def test_sub_element_nsmap_custom(self): + root = objectify.Element("root") + nsmap = {"my": "someNS", + "myother": "someOtherNS", + "myxsd": XML_SCHEMA_NS,} root.sub = objectify.Element("test", nsmap=nsmap) expected = nsmap.copy() + del expected["myxsd"] expected.update(DEFAULT_NSMAP) self.assertEquals(root.sub.nsmap, expected) - def test_data_element_nsmap(self): - # default nsmap + def test_data_element_nsmap_default(self): value = objectify.DataElement("test this") self.assertEquals(value.nsmap, DEFAULT_NSMAP) - # empty nsmap + def test_data_element_nsmap_empty(self): nsmap = {} value = objectify.DataElement("test this", nsmap=nsmap) self.assertEquals(value.nsmap.values(), [PYTYPE_NAMESPACE]) - # nsmap with custom prefixes - nsmap = {"mypy" : PYTYPE_NAMESPACE, - "myxsi" : XML_SCHEMA_INSTANCE_NS, - "myxsd" : XML_SCHEMA_NS} - + def test_data_element_nsmap_custom_prefixes(self): + nsmap = {"mypy": PYTYPE_NAMESPACE, + "myxsi": XML_SCHEMA_INSTANCE_NS, + "myxsd": XML_SCHEMA_NS} value = objectify.DataElement("test this", nsmap=nsmap) self.assertEquals(value.nsmap, nsmap) - # custom nsmap - nsmap = {"my" : "someNS", - "myother" : "someOtherNS", - } + def test_data_element_nsmap_custom(self): + nsmap = {"my": "someNS", + "myother": "someOtherNS", + "myxsd": XML_SCHEMA_NS,} value = objectify.DataElement("test", nsmap=nsmap) self.assert_(PYTYPE_NAMESPACE in value.nsmap.values()) for prefix, ns in nsmap.items(): self.assert_(prefix in value.nsmap) self.assertEquals(nsmap[prefix], 
value.nsmap[prefix]) - def test_sub_data_element_nsmap(self): + def test_sub_data_element_nsmap_default(self): root = objectify.Element("root") - # default nsmap root.value = objectify.DataElement("test this") self.assertEquals(root.value.nsmap, DEFAULT_NSMAP) - # empty nsmap + def test_sub_data_element_nsmap_empty(self): + root = objectify.Element("root") nsmap = {} root.value = objectify.DataElement("test this", nsmap=nsmap) self.assertEquals(root.value.nsmap, DEFAULT_NSMAP) - # nsmap with custom prefixes - nsmap = {"mypy" : PYTYPE_NAMESPACE, - "myxsi" : XML_SCHEMA_INSTANCE_NS, - "myxsd" : XML_SCHEMA_NS} - + def test_sub_data_element_nsmap_custom_prefixes(self): + root = objectify.Element("root") + nsmap = {"mypy": PYTYPE_NAMESPACE, + "myxsi": XML_SCHEMA_INSTANCE_NS, + "myxsd": XML_SCHEMA_NS} root.value = objectify.DataElement("test this", nsmap=nsmap) self.assertEquals(root.value.nsmap, DEFAULT_NSMAP) - # custom nsmap - nsmap = {"my" : "someNS", - "myother" : "someOtherNS", - } + def test_sub_data_element_nsmap_custom(self): + root = objectify.Element("root") + nsmap = {"my": "someNS", + "myother": "someOtherNS", + "myxsd": XML_SCHEMA_NS} root.value = objectify.DataElement("test", nsmap=nsmap) expected = nsmap.copy() + del expected["myxsd"] expected.update(DEFAULT_NSMAP) self.assertEquals(root.value.nsmap, expected) + def test_data_element_attrib_attributes_precedence(self): + # keyword arguments override attrib entries + value = objectify.DataElement(23, _pytype="str", _xsi="foobar", + attrib={"gnu": "muh", "cat": "meeow", + "dog": "wuff"}, + bird="tchilp", dog="grrr") + self.assertEquals(value.get("gnu"), "muh") + self.assertEquals(value.get("cat"), "meeow") + self.assertEquals(value.get("dog"), "grrr") + self.assertEquals(value.get("bird"), "tchilp") + + def test_data_element_data_element_arg(self): + # Check that DataElement preserves all attributes ObjectifiedDataElement + # arguments + arg = objectify.DataElement(23, _pytype="str", _xsi="foobar", + 
attrib={"gnu": "muh", "cat": "meeow", + "dog": "wuff"}, + bird="tchilp", dog="grrr") + value = objectify.DataElement(arg) + self.assert_(isinstance(value, objectify.StringElement)) + for attr in arg.attrib: + self.assertEquals(value.get(attr), arg.get(attr)) + + def test_data_element_data_element_arg_pytype(self): + # Check that _pytype arg overrides original py:pytype of + # ObjectifiedDataElement + arg = objectify.DataElement(23, _pytype="str", _xsi="foobar", + attrib={"gnu": "muh", "cat": "meeow", + "dog": "wuff"}, + bird="tchilp", dog="grrr") + value = objectify.DataElement(arg, _pytype="int") + self.assert_(isinstance(value, objectify.IntElement)) + self.assertEquals(value.get(objectify.PYTYPE_ATTRIBUTE), "int") + for attr in arg.attrib: + if not attr == objectify.PYTYPE_ATTRIBUTE: + self.assertEquals(value.get(attr), arg.get(attr)) + + def test_data_element_data_element_arg_xsitype(self): + # Check that _xsi arg overrides original xsi:type of given + # ObjectifiedDataElement + arg = objectify.DataElement(23, _pytype="str", _xsi="foobar", + attrib={"gnu": "muh", "cat": "meeow", + "dog": "wuff"}, + bird="tchilp", dog="grrr") + value = objectify.DataElement(arg, _xsi="xsd:int") + self.assert_(isinstance(value, objectify.IntElement)) + self.assertEquals(value.get(XML_SCHEMA_INSTANCE_TYPE_ATTR), "xsd:int") + self.assertEquals(value.get(objectify.PYTYPE_ATTRIBUTE), "int") + for attr in arg.attrib: + if not attr in [objectify.PYTYPE_ATTRIBUTE, + XML_SCHEMA_INSTANCE_TYPE_ATTR]: + self.assertEquals(value.get(attr), arg.get(attr)) + + def test_data_element_data_element_arg_pytype_xsitype(self): + # Check that _pytype and _xsi args override original py:pytype and + # xsi:type attributes of given ObjectifiedDataElement + arg = objectify.DataElement(23, _pytype="str", _xsi="foobar", + attrib={"gnu": "muh", "cat": "meeow", + "dog": "wuff"}, + bird="tchilp", dog="grrr") + value = objectify.DataElement(arg, _pytype="int", _xsi="xsd:int") + self.assert_(isinstance(value, 
objectify.IntElement)) + self.assertEquals(value.get(objectify.PYTYPE_ATTRIBUTE), "int") + self.assertEquals(value.get(XML_SCHEMA_INSTANCE_TYPE_ATTR), "xsd:int") + for attr in arg.attrib: + if not attr in [objectify.PYTYPE_ATTRIBUTE, + XML_SCHEMA_INSTANCE_TYPE_ATTR]: + self.assertEquals(value.get(attr), arg.get(attr)) + + def test_data_element_invalid_pytype(self): + self.assertRaises(ValueError, objectify.DataElement, 3.1415, + _pytype="int") + + def test_data_element_invalid_xsi(self): + self.assertRaises(ValueError, objectify.DataElement, 3.1415, + _xsi="xsd:int") + + def test_data_element_data_element_arg_invalid_pytype(self): + arg = objectify.DataElement(3.1415) + self.assertRaises(ValueError, objectify.DataElement, arg, + _pytype="int") + + def test_data_element_data_element_arg_invalid_xsi(self): + arg = objectify.DataElement(3.1415) + self.assertRaises(ValueError, objectify.DataElement, arg, + _xsi="xsd:int") + def test_root(self): root = self.Element("test") self.assert_(isinstance(root, objectify.ObjectifiedElement)) @@ -400,7 +489,7 @@ Element = self.Element SubElement = self.etree.SubElement - nil_attr = "{http://www.w3.org/2001/XMLSchema-instance}nil" + nil_attr = XML_SCHEMA_NIL_ATTR root = Element("{objectified}root") SubElement(root, "{objectified}none") SubElement(root, "{objectified}none", {nil_attr : "true"}) @@ -414,6 +503,7 @@ value = objectify.DataElement(None) self.assert_(isinstance(value, objectify.NoneElement)) self.assertEquals(value, None) + self.assertEquals(value.get(XML_SCHEMA_NIL_ATTR), "true") def test_type_bool(self): Element = self.Element From scoder at codespeak.net Sun Jul 1 15:16:28 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 1 Jul 2007 15:16:28 +0200 (CEST) Subject: [Lxml-checkins] r44652 - lxml/trunk Message-ID: <20070701131628.2B2EE80C8@code0.codespeak.net> Author: scoder Date: Sun Jul 1 15:16:27 2007 New Revision: 44652 Modified: lxml/trunk/CHANGES.txt Log: changelog update Modified: 
lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sun Jul 1 15:16:27 2007 @@ -33,6 +33,8 @@ Bugs fixed ---------- +* Better way to prevent crashes in Element proxy cleanup code + * objectify.DataElement didn't set up None value correctly * objectify.DataElement didn't check the value against the provided type hints From scoder at codespeak.net Sun Jul 1 15:18:17 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 1 Jul 2007 15:18:17 +0200 (CEST) Subject: [Lxml-checkins] r44653 - lxml/trunk/src/lxml Message-ID: <20070701131817.C179280C6@code0.codespeak.net> Author: scoder Date: Sun Jul 1 15:18:17 2007 New Revision: 44653 Modified: lxml/trunk/src/lxml/proxy.pxi Log: cleanup Modified: lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Sun Jul 1 15:18:17 2007 @@ -173,19 +173,6 @@ tree.END_FOR_EACH_ELEMENT_FROM(c_node) return 1 -## cdef void _deallocDocument(xmlDoc* c_doc): -## """We cannot rely on Python's GC to *always* dealloc the _Document *after* -## all proxies it contains => traverse the document and mark all its proxies -## as dead by deleting their xmlNode* reference. 
-## """ -## cdef xmlNode* c_node -## c_node = c_doc.children -## tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_doc, c_node, 1) -## if c_node._private is not NULL: -## (<_Element>c_node._private)._c_node = NULL -## tree.END_FOR_EACH_ELEMENT_FROM(c_node) -## tree.xmlFreeDoc(c_doc) - ################################################################################ # fix _Document references and namespaces when a node changes documents From scoder at codespeak.net Sun Jul 1 15:19:00 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 1 Jul 2007 15:19:00 +0200 (CEST) Subject: [Lxml-checkins] r44654 - in lxml/branch/lxml-1.3: . src/lxml Message-ID: <20070701131900.1555A80A2@code0.codespeak.net> Author: scoder Date: Sun Jul 1 15:18:59 2007 New Revision: 44654 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/src/lxml/etree.pyx lxml/branch/lxml-1.3/src/lxml/proxy.pxi lxml/branch/lxml-1.3/src/lxml/python.pxd Log: merged in proxy deallocation update from trunk Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Sun Jul 1 15:18:59 2007 @@ -14,6 +14,8 @@ Bugs fixed ---------- +* Better way to prevent crashes in Element proxy cleanup code + * objectify.DataElement didn't set up None value correctly * objectify.DataElement didn't check the value against the provided type hints Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/etree.pyx Sun Jul 1 15:18:59 2007 @@ -243,8 +243,8 @@ #displayNode(self._c_doc, 0) #print self._c_doc, self._c_doc.dict is __GLOBAL_PARSER_CONTEXT._c_dict #print self._c_doc, canDeallocateChildNodes(self._c_doc) - #tree.xmlFreeDoc(c_doc) - _deallocDocument(self._c_doc) + tree.xmlFreeDoc(self._c_doc) + 
#_deallocDocument(self._c_doc) cdef getroot(self): cdef xmlNode* c_node Modified: lxml/branch/lxml-1.3/src/lxml/proxy.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/proxy.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/proxy.pxi Sun Jul 1 15:18:59 2007 @@ -27,6 +27,8 @@ #print "registering for:", proxy._c_node assert c_node._private is NULL, "double registering proxy!" c_node._private = proxy + # additional INCREF to make sure _Document is GC-ed LAST! + python.Py_INCREF(proxy._doc) cdef unregisterProxy(_Element proxy): """Unregister a proxy for the node it's proxying for. @@ -35,6 +37,7 @@ c_node = proxy._c_node assert c_node._private is proxy, "Tried to unregister unknown proxy" c_node._private = NULL + python.Py_DECREF(proxy._doc) ################################################################################ # temporarily make a node the root node of its document @@ -170,19 +173,6 @@ tree.END_FOR_EACH_ELEMENT_FROM(c_node) return 1 -cdef void _deallocDocument(xmlDoc* c_doc): - """We cannot rely on Python's GC to *always* dealloc the _Document *after* - all proxies it contains => traverse the document and mark all its proxies - as dead by deleting their xmlNode* reference. - """ - cdef xmlNode* c_node - c_node = c_doc.children - tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_doc, c_node, 1) - if c_node._private is not NULL: - (<_Element>c_node._private)._c_node = NULL - tree.END_FOR_EACH_ELEMENT_FROM(c_node) - tree.xmlFreeDoc(c_doc) - ################################################################################ # fix _Document references and namespaces when a node changes documents @@ -303,6 +293,8 @@ if c_element._private is not NULL: element = <_Element>c_element._private if element._doc is not doc: + python.Py_INCREF(doc) + python.Py_DECREF(element._doc) element._doc = doc if c_element is c_start_node: @@ -321,7 +313,11 @@ # fix _Document reference (may dealloc the original document!) 
if c_element._private is not NULL: - (<_Element>c_element._private)._doc = doc + element = <_Element>c_element._private + if element._doc is not doc: + python.Py_INCREF(doc) + python.Py_DECREF(element._doc) + element._doc = doc if c_element is c_start_node: break Modified: lxml/branch/lxml-1.3/src/lxml/python.pxd ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/python.pxd (original) +++ lxml/branch/lxml-1.3/src/lxml/python.pxd Sun Jul 1 15:18:59 2007 @@ -9,6 +9,7 @@ cdef int PY_SSIZE_T_MAX cdef void Py_INCREF(object o) + cdef void Py_DECREF(object o) cdef FILE* PyFile_AsFile(object p) cdef int PyFile_Check(object p) From scoder at codespeak.net Mon Jul 2 10:30:20 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 2 Jul 2007 10:30:20 +0200 (CEST) Subject: [Lxml-checkins] r44658 - lxml/trunk/src/lxml Message-ID: <20070702083020.B092A80B0@code0.codespeak.net> Author: scoder Date: Mon Jul 2 10:30:19 2007 New Revision: 44658 Modified: lxml/trunk/src/lxml/xpath.pxi Log: let repr(XPath()) return original path expression Modified: lxml/trunk/src/lxml/xpath.pxi ============================================================================== --- lxml/trunk/src/lxml/xpath.pxi (original) +++ lxml/trunk/src/lxml/xpath.pxi Mon Jul 2 10:30:19 2007 @@ -392,6 +392,9 @@ if self._xpath is not NULL: xpath.xmlXPathFreeCompExpr(self._xpath) + def __repr__(self): + return path + cdef object _replace_strings cdef object _find_namespaces From scoder at codespeak.net Mon Jul 2 10:31:03 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 2 Jul 2007 10:31:03 +0200 (CEST) Subject: [Lxml-checkins] r44659 - lxml/trunk/src/lxml/tests Message-ID: <20070702083103.BD3A980B3@code0.codespeak.net> Author: scoder Date: Mon Jul 2 10:31:03 2007 New Revision: 44659 Modified: lxml/trunk/src/lxml/tests/test_classlookup.py lxml/trunk/src/lxml/tests/test_dtd.py lxml/trunk/src/lxml/tests/test_elementtree.py 
lxml/trunk/src/lxml/tests/test_errors.py lxml/trunk/src/lxml/tests/test_etree.py lxml/trunk/src/lxml/tests/test_htmlparser.py lxml/trunk/src/lxml/tests/test_io.py lxml/trunk/src/lxml/tests/test_nsclasses.py lxml/trunk/src/lxml/tests/test_objectify.py lxml/trunk/src/lxml/tests/test_pyclasslookup.py lxml/trunk/src/lxml/tests/test_relaxng.py lxml/trunk/src/lxml/tests/test_sax.py lxml/trunk/src/lxml/tests/test_xmlschema.py lxml/trunk/src/lxml/tests/test_xpathevaluator.py lxml/trunk/src/lxml/tests/test_xslt.py Log: disable calling unittest scripts directly Modified: lxml/trunk/src/lxml/tests/test_classlookup.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_classlookup.py (original) +++ lxml/trunk/src/lxml/tests/test_classlookup.py Mon Jul 2 10:31:03 2007 @@ -178,4 +178,4 @@ return suite if __name__ == '__main__': - unittest.main() + print 'to test use test.py %s' % __file__ Modified: lxml/trunk/src/lxml/tests/test_dtd.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_dtd.py (original) +++ lxml/trunk/src/lxml/tests/test_dtd.py Mon Jul 2 10:31:03 2007 @@ -72,4 +72,4 @@ return suite if __name__ == '__main__': - unittest.main() + print 'to test use test.py %s' % __file__ Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Mon Jul 2 10:31:03 2007 @@ -2544,4 +2544,4 @@ return suite if __name__ == '__main__': - unittest.main() + print 'to test use test.py %s' % __file__ Modified: lxml/trunk/src/lxml/tests/test_errors.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_errors.py (original) +++ lxml/trunk/src/lxml/tests/test_errors.py Mon Jul 2 10:31:03 2007 @@ -25,4 +25,4 @@ return suite if 
__name__ == '__main__': - unittest.main() + print 'to test use test.py %s' % __file__ Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Mon Jul 2 10:31:03 2007 @@ -1704,4 +1704,4 @@ return suite if __name__ == '__main__': - unittest.main() + print 'to test use test.py %s' % __file__ Modified: lxml/trunk/src/lxml/tests/test_htmlparser.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_htmlparser.py (original) +++ lxml/trunk/src/lxml/tests/test_htmlparser.py Mon Jul 2 10:31:03 2007 @@ -115,4 +115,4 @@ return suite if __name__ == '__main__': - unittest.main() + print 'to test use test.py %s' % __file__ Modified: lxml/trunk/src/lxml/tests/test_io.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_io.py (original) +++ lxml/trunk/src/lxml/tests/test_io.py Mon Jul 2 10:31:03 2007 @@ -172,4 +172,4 @@ return suite if __name__ == '__main__': - unittest.main() + print 'to test use test.py %s' % __file__ Modified: lxml/trunk/src/lxml/tests/test_nsclasses.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_nsclasses.py (original) +++ lxml/trunk/src/lxml/tests/test_nsclasses.py Mon Jul 2 10:31:03 2007 @@ -163,4 +163,4 @@ return suite if __name__ == '__main__': - unittest.main() + print 'to test use test.py %s' % __file__ Modified: lxml/trunk/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_objectify.py (original) +++ lxml/trunk/src/lxml/tests/test_objectify.py Mon Jul 2 10:31:03 2007 @@ -1598,4 +1598,4 @@ return suite if __name__ == '__main__': - unittest.main() + print 'to test use test.py %s' % __file__ Modified: 
lxml/trunk/src/lxml/tests/test_pyclasslookup.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_pyclasslookup.py (original) +++ lxml/trunk/src/lxml/tests/test_pyclasslookup.py Mon Jul 2 10:31:03 2007 @@ -287,4 +287,4 @@ return suite if __name__ == '__main__': - unittest.main() + print 'to test use test.py %s' % __file__ Modified: lxml/trunk/src/lxml/tests/test_relaxng.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_relaxng.py (original) +++ lxml/trunk/src/lxml/tests/test_relaxng.py Mon Jul 2 10:31:03 2007 @@ -142,4 +142,4 @@ return suite if __name__ == '__main__': - unittest.main() + print 'to test use test.py %s' % __file__ Modified: lxml/trunk/src/lxml/tests/test_sax.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_sax.py (original) +++ lxml/trunk/src/lxml/tests/test_sax.py Mon Jul 2 10:31:03 2007 @@ -222,4 +222,4 @@ return suite if __name__ == '__main__': - unittest.main() + print 'to test use test.py %s' % __file__ Modified: lxml/trunk/src/lxml/tests/test_xmlschema.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xmlschema.py (original) +++ lxml/trunk/src/lxml/tests/test_xmlschema.py Mon Jul 2 10:31:03 2007 @@ -81,4 +81,4 @@ return suite if __name__ == '__main__': - unittest.main() + print 'to test use test.py %s' % __file__ Modified: lxml/trunk/src/lxml/tests/test_xpathevaluator.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xpathevaluator.py (original) +++ lxml/trunk/src/lxml/tests/test_xpathevaluator.py Mon Jul 2 10:31:03 2007 @@ -532,4 +532,4 @@ return suite if __name__ == '__main__': - unittest.main() + print 'to test use test.py %s' % __file__ Modified: lxml/trunk/src/lxml/tests/test_xslt.py 
============================================================================== --- lxml/trunk/src/lxml/tests/test_xslt.py (original) +++ lxml/trunk/src/lxml/tests/test_xslt.py Mon Jul 2 10:31:03 2007 @@ -1011,4 +1011,4 @@ return suite if __name__ == '__main__': - unittest.main() + print 'to test use test.py %s' % __file__ From scoder at codespeak.net Mon Jul 2 10:53:42 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 2 Jul 2007 10:53:42 +0200 (CEST) Subject: [Lxml-checkins] r44660 - lxml/trunk/doc Message-ID: <20070702085342.89A9080C1@code0.codespeak.net> Author: scoder Date: Mon Jul 2 10:53:42 2007 New Revision: 44660 Modified: lxml/trunk/doc/objectify.txt Log: doc: writing objectify tag languages with E-factory Modified: lxml/trunk/doc/objectify.txt ============================================================================== --- lxml/trunk/doc/objectify.txt (original) +++ lxml/trunk/doc/objectify.txt Mon Jul 2 10:53:42 2007 @@ -287,7 +287,24 @@ true how - + +This allows you to write up a specific language in tags:: + + >>> ROOT = objectify.E.root + >>> TITLE = objectify.E.title + >>> TYPE = objectify.E.type + + >>> root = ROOT( + ... TITLE("The title"), + ... TYPE(5) + ... ) + + >>> print etree.tostring(root, pretty_print=True) + + The title + 5 + + Namespace handling ------------------ From scoder at codespeak.net Mon Jul 2 10:55:24 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 2 Jul 2007 10:55:24 +0200 (CEST) Subject: [Lxml-checkins] r44661 - in lxml/branch/lxml-1.3: . 
doc src/lxml Message-ID: <20070702085524.D8D7C80D2@code0.codespeak.net> Author: scoder Date: Mon Jul 2 10:55:24 2007 New Revision: 44661 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/doc/objectify.txt lxml/branch/lxml-1.3/src/lxml/builder.py lxml/branch/lxml-1.3/src/lxml/objectify.pyx Log: objectify: merged in E-factory support from trunk Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Mon Jul 2 10:55:24 2007 @@ -11,6 +11,8 @@ * objectify.DataElement now supports setting values from existing data elements (not just plain Python types) and reuses defined namespaces etc. +* E-factory support for lxml.objectify (``objectify.E``) + Bugs fixed ---------- Modified: lxml/branch/lxml-1.3/doc/objectify.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/objectify.txt (original) +++ lxml/branch/lxml-1.3/doc/objectify.txt Mon Jul 2 10:55:24 2007 @@ -266,6 +266,45 @@ notB +Tree generation with the E-factory +---------------------------------- + +To simplify the generation of trees even further, you can use the E-factory:: + + >>> E = objectify.E + >>> root = E.root( + ... E.a(5), + ... E.b(6.1), + ... E.c(True), + ... E.d("how") + ... ) + + >>> print etree.tostring(root, pretty_print=True) + + 5 + 6.1 + true + how + + +This allows you to write up a specific language in tags:: + + >>> ROOT = objectify.E.root + >>> TITLE = objectify.E.title + >>> TYPE = objectify.E.type + + >>> root = ROOT( + ... TITLE("The title"), + ... TYPE(5) + ... 
) + + >>> print etree.tostring(root, pretty_print=True) + + The title + 5 + + + Namespace handling ------------------ Modified: lxml/branch/lxml-1.3/src/lxml/builder.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/builder.py (original) +++ lxml/branch/lxml-1.3/src/lxml/builder.py Mon Jul 2 10:55:24 2007 @@ -140,7 +140,10 @@ elem[-1].tail = (elem[-1].tail or "") + item else: elem.text = (elem.text or "") + item - typemap[str] = typemap[unicode] = add_text + if str not in typemap: + typemap[str] = add_text + if unicode not in typemap: + typemap[unicode] = add_text def add_dict(elem, item): attrib = elem.attrib @@ -149,7 +152,8 @@ attrib[k] = v else: attrib[k] = typemap[type(v)](None, v) - typemap[dict] = add_dict + if dict not in typemap: + typemap[dict] = add_dict self._typemap = typemap Modified: lxml/branch/lxml-1.3/src/lxml/objectify.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/objectify.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/objectify.pyx Mon Jul 2 10:55:24 2007 @@ -65,6 +65,8 @@ cdef object islice from itertools import islice +cdef object _ElementMaker +from builder import ElementMaker as _ElementMaker # namespace/name for "pytype" hint attribute cdef object PYTYPE_NAMESPACE @@ -1633,6 +1635,42 @@ parser = objectify_parser return _parse(f, parser) +class ElementMaker(_ElementMaker): + def __init__(self, typemap=None): + if typemap is None: + typemap = {} + else: + typemap = typemap.copy() + + typemap[__builtin__.str] = __add_text + typemap[__builtin__.unicode] = __add_text + typemap[__builtin__.int] = __add_text + typemap[__builtin__.long] = __add_text + typemap[__builtin__.float] = __add_text + typemap[__builtin__.bool] = __add_text + + _ElementMaker.__init__(self, typemap, objectify_parser.makeelement) + +def __add_text(_Element elem not None, text): + cdef tree.xmlNode* c_child + if isinstance(text, bool): + 
text = str(text).lower() + else: + text = str(text) + c_child = cetree.findChildBackwards(elem._c_node, 0) + if c_child is not NULL: + old = cetree.tailOf(c_child) + if old is not None: + text = old + text + cetree.setTailText(c_child, text) + else: + old = cetree.textOf(elem._c_node) + if old is not None: + text = old + text + cetree.setNodeText(elem._c_node, text) + +E = ElementMaker() + cdef object _DEFAULT_NSMAP _DEFAULT_NSMAP = { "py" : PYTYPE_NAMESPACE, "xsi" : XML_SCHEMA_INSTANCE_NS, From scoder at codespeak.net Mon Jul 2 10:55:57 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 2 Jul 2007 10:55:57 +0200 (CEST) Subject: [Lxml-checkins] r44662 - lxml/branch/lxml-1.3 Message-ID: <20070702085557.B011680E3@code0.codespeak.net> Author: scoder Date: Mon Jul 2 10:55:57 2007 New Revision: 44662 Modified: lxml/branch/lxml-1.3/version.txt Log: version 1.3.1 Modified: lxml/branch/lxml-1.3/version.txt ============================================================================== --- lxml/branch/lxml-1.3/version.txt (original) +++ lxml/branch/lxml-1.3/version.txt Mon Jul 2 10:55:57 2007 @@ -1 +1 @@ -1.3 +1.3.1 From scoder at codespeak.net Mon Jul 2 11:03:35 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 2 Jul 2007 11:03:35 +0200 (CEST) Subject: [Lxml-checkins] r44663 - in lxml/branch/lxml-1.3: . 
doc Message-ID: <20070702090335.CA74880B0@code0.codespeak.net> Author: scoder Date: Mon Jul 2 11:03:35 2007 New Revision: 44663 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/doc/main.txt Log: prepare release of 1.3.1 Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Mon Jul 2 11:03:35 2007 @@ -2,8 +2,8 @@ lxml changelog ============== -Under development -================= +1.3.1 (2007-07-02) +================== Features added -------------- Modified: lxml/branch/lxml-1.3/doc/main.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/main.txt (original) +++ lxml/branch/lxml-1.3/doc/main.txt Mon Jul 2 11:03:35 2007 @@ -129,7 +129,7 @@ .. _`lxml at the Python cheeseshop`: http://cheeseshop.python.org/pypi/lxml/ .. _`this key`: pubkey.asc -The latest version is `lxml 1.3`_, released 2007-06-24 (`changes for 1.3`_). +The latest version is `lxml 1.3.1`_, released 2007-07-02 (`changes for 1.3.1`_). `Older versions`_ are listed below. .. _`Older versions`: #old-versions @@ -179,6 +179,8 @@ Old Versions ------------ +* `lxml 1.3`_, released 2007-06-24 (`changes for 1.3`_) + * `lxml 1.2.1`_, released 2007-02-27 (`changes for 1.2.1`_) * `lxml 1.2`_, released 2007-02-20 (`changes for 1.2`_) @@ -215,6 +217,7 @@ * `lxml 0.5`_, released 2005-04-08 +.. _`lxml 1.3.1`: lxml-1.3.1.tgz .. _`lxml 1.3`: lxml-1.3.tgz .. _`lxml 1.2.1`: lxml-1.2.1.tgz .. _`lxml 1.2`: lxml-1.2.tgz @@ -235,7 +238,8 @@ .. _`lxml 0.5.1`: lxml-0.5.1.tgz .. _`lxml 0.5`: lxml-0.5.tgz -.. _`CHANGES for 1.3`: changes-1.3.html +.. _`changes for 1.3.1`: changes-1.3.1.html +.. _`changes for 1.3`: changes-1.3.html .. _`changes for 1.2.1`: changes-1.2.1.html .. _`changes for 1.2`: changes-1.2.html .. 
_`changes for 1.1.2`: changes-1.1.2.html From scoder at codespeak.net Mon Jul 2 11:05:48 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 2 Jul 2007 11:05:48 +0200 (CEST) Subject: [Lxml-checkins] r44664 - in lxml/trunk: . doc Message-ID: <20070702090548.3FC0E80BB@code0.codespeak.net> Author: scoder Date: Mon Jul 2 11:05:47 2007 New Revision: 44664 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/main.txt Log: merged in release changes for 1.3.1 Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon Jul 2 11:05:47 2007 @@ -8,11 +8,6 @@ Features added -------------- -* objectify.DataElement now supports setting values from existing data - elements (not just plain Python types) and reuses defined namespaces etc. - -* E-factory support for lxml.objectify (``objectify.E``) - * Entity support through an ``Entity`` factory and element classes. XML parsers now have a ``resolve_entities`` keyword argument that can be set to False to keep entities in the document. @@ -33,14 +28,6 @@ Bugs fixed ---------- -* Better way to prevent crashes in Element proxy cleanup code - -* objectify.DataElement didn't set up None value correctly - -* objectify.DataElement didn't check the value against the provided type hints - -* Reference-counting bug in ``Element.attrib.pop()`` - * The XML parser did not report undefined entities as error * The text in exceptions raised by XML parsers, validators and XPath @@ -56,6 +43,29 @@ * major refactoring in XPath/XSLT extension function code +1.3.1 (2007-07-02) +================== + +Features added +-------------- + +* objectify.DataElement now supports setting values from existing data + elements (not just plain Python types) and reuses defined namespaces etc. 
+ +* E-factory support for lxml.objectify (``objectify.E``) + +Bugs fixed +---------- + +* Better way to prevent crashes in Element proxy cleanup code + +* objectify.DataElement didn't set up None value correctly + +* objectify.DataElement didn't check the value against the provided type hints + +* Reference-counting bug in ``Element.attrib.pop()`` + + 1.3 (2007-06-24) ================ Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Mon Jul 2 11:05:47 2007 @@ -129,7 +129,7 @@ .. _`lxml at the Python cheeseshop`: http://cheeseshop.python.org/pypi/lxml/ .. _`this key`: pubkey.asc -The latest version is `lxml 1.3`_, released 2007-06-24 (`changes for 1.3`_). +The latest version is `lxml 1.3.1`_, released 2007-07-02 (`changes for 1.3.1`_). `Older versions`_ are listed below. .. _`Older versions`: #old-versions @@ -179,6 +179,8 @@ Old Versions ------------ +* `lxml 1.3`_, released 2007-06-24 (`changes for 1.3`_) + * `lxml 1.2.1`_, released 2007-02-27 (`changes for 1.2.1`_) * `lxml 1.2`_, released 2007-02-20 (`changes for 1.2`_) @@ -215,6 +217,7 @@ * `lxml 0.5`_, released 2005-04-08 +.. _`lxml 1.3.1`: lxml-1.3.1.tgz .. _`lxml 1.3`: lxml-1.3.tgz .. _`lxml 1.2.1`: lxml-1.2.1.tgz .. _`lxml 1.2`: lxml-1.2.tgz @@ -235,7 +238,8 @@ .. _`lxml 0.5.1`: lxml-0.5.1.tgz .. _`lxml 0.5`: lxml-0.5.tgz -.. _`CHANGES for 1.3`: changes-1.3.html +.. _`changes for 1.3.1`: changes-1.3.1.html +.. _`changes for 1.3`: changes-1.3.html .. _`changes for 1.2.1`: changes-1.2.1.html .. _`changes for 1.2`: changes-1.2.html .. 
_`changes for 1.1.2`: changes-1.1.2.html From scoder at codespeak.net Mon Jul 2 16:49:19 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 2 Jul 2007 16:49:19 +0200 (CEST) Subject: [Lxml-checkins] r44668 - lxml/trunk/src/lxml Message-ID: <20070702144919.81EA780BA@code0.codespeak.net> Author: scoder Date: Mon Jul 2 16:49:18 2007 New Revision: 44668 Modified: lxml/trunk/src/lxml/objectify.pyx lxml/trunk/src/lxml/pyclasslookup.pyx Log: provide __version__ in all Pyrex modules Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Mon Jul 2 16:49:18 2007 @@ -13,6 +13,8 @@ # initialize C-API of lxml.etree import_etree(etree) +__version__ = etree.__version__ + cdef object SubElement SubElement = etree.SubElement Modified: lxml/trunk/src/lxml/pyclasslookup.pyx ============================================================================== --- lxml/trunk/src/lxml/pyclasslookup.pyx (original) +++ lxml/trunk/src/lxml/pyclasslookup.pyx Mon Jul 2 16:49:18 2007 @@ -15,6 +15,8 @@ # initialize C-API of lxml.etree import_etree(etree) +__version__ = etree.__version__ + cdef class _ElementProxy: cdef tree.xmlNode* _c_node cdef object _source_proxy From scoder at codespeak.net Mon Jul 2 16:50:28 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 2 Jul 2007 16:50:28 +0200 (CEST) Subject: [Lxml-checkins] r44669 - lxml/trunk/src/lxml Message-ID: <20070702145028.9E8C780BC@code0.codespeak.net> Author: scoder Date: Mon Jul 2 16:50:28 2007 New Revision: 44669 Modified: lxml/trunk/src/lxml/etree.pyx Log: support dev/alpha/beta versions with number postfix Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Mon Jul 2 16:50:28 2007 @@ -131,12 +131,19 @@ 
try: item = int(item) except ValueError: - if item == 'dev': - item = -3 - elif item == 'alpha': - item = -2 - elif item == 'beta': - item = -1 + if item.startswith('dev'): + count = item[3:] + item = -30 + elif item.startswith('alpha'): + count = item[5:] + item = -20 + elif item.startswith('beta'): + count = item[4:] + item = -10 + else: + count = 0 + if count: + item = item + int(count) version_list.append(item) return tuple(version_list) From scoder at codespeak.net Mon Jul 2 16:52:54 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 2 Jul 2007 16:52:54 +0200 (CEST) Subject: [Lxml-checkins] r44670 - lxml/branch/lxml-1.3/src/lxml Message-ID: <20070702145254.EAD7D80B6@code0.codespeak.net> Author: scoder Date: Mon Jul 2 16:52:54 2007 New Revision: 44670 Modified: lxml/branch/lxml-1.3/src/lxml/objectify.pyx lxml/branch/lxml-1.3/src/lxml/pyclasslookup.pyx Log: provide __version__ in all Pyrex modules Modified: lxml/branch/lxml-1.3/src/lxml/objectify.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/objectify.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/objectify.pyx Mon Jul 2 16:52:54 2007 @@ -13,6 +13,8 @@ # initialize C-API of lxml.etree import_etree(etree) +__version__ = etree.__version__ + cdef object SubElement SubElement = etree.SubElement Modified: lxml/branch/lxml-1.3/src/lxml/pyclasslookup.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/pyclasslookup.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/pyclasslookup.pyx Mon Jul 2 16:52:54 2007 @@ -15,6 +15,8 @@ # initialize C-API of lxml.etree import_etree(etree) +__version__ = etree.__version__ + cdef class _ElementProxy: cdef tree.xmlNode* _c_node cdef object _source_proxy From ianb at codespeak.net Mon Jul 2 19:20:57 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Mon, 2 Jul 2007 19:20:57 +0200 (CEST) Subject: [Lxml-checkins] r44675 - 
lxml/branch/html/src/lxml/html Message-ID: <20070702172057.73F2580BA@code0.codespeak.net> Author: ianb Date: Mon Jul 2 19:20:56 2007 New Revision: 44675 Modified: lxml/branch/html/src/lxml/html/css.py Log: Change up the namespace function a bit Modified: lxml/branch/html/src/lxml/html/css.py ============================================================================== --- lxml/branch/html/src/lxml/html/css.py (original) +++ lxml/branch/html/src/lxml/html/css.py Mon Jul 2 19:20:56 2007 @@ -155,7 +155,9 @@ def _make_lower_case(context, s): return s.lower() -etree.FunctionNamespace("css")['lower-case'] = _make_lower_case +ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/') +ns.prefix = 'css' +ns['lower-case'] = _make_lower_case class Pseudo(object): """ From scoder at codespeak.net Mon Jul 2 19:34:04 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 2 Jul 2007 19:34:04 +0200 (CEST) Subject: [Lxml-checkins] r44676 - lxml/trunk/src/lxml Message-ID: <20070702173404.056A080B2@code0.codespeak.net> Author: scoder Date: Mon Jul 2 19:34:04 2007 New Revision: 44676 Modified: lxml/trunk/src/lxml/etree.pyx Log: allow for a lot of alpha/beta versions :) Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Mon Jul 2 19:34:04 2007 @@ -133,16 +133,16 @@ except ValueError: if item.startswith('dev'): count = item[3:] - item = -30 + item = -300 elif item.startswith('alpha'): count = item[5:] - item = -20 + item = -200 elif item.startswith('beta'): count = item[4:] - item = -10 + item = -100 else: count = 0 - if count: + if count.trim(): item = item + int(count) version_list.append(item) return tuple(version_list) From scoder at codespeak.net Mon Jul 2 19:34:27 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 2 Jul 2007 19:34:27 +0200 (CEST) Subject: [Lxml-checkins] r44677 - 
lxml/trunk/src/lxml Message-ID: <20070702173427.9CC8F80BA@code0.codespeak.net> Author: scoder Date: Mon Jul 2 19:34:27 2007 New Revision: 44677 Modified: lxml/trunk/src/lxml/etree.pyx Log: small fix Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Mon Jul 2 19:34:27 2007 @@ -142,7 +142,7 @@ item = -100 else: count = 0 - if count.trim(): + if count: item = item + int(count) version_list.append(item) return tuple(version_list) From scoder at codespeak.net Mon Jul 2 20:29:52 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 2 Jul 2007 20:29:52 +0200 (CEST) Subject: [Lxml-checkins] r44678 - lxml/trunk/src/lxml/tests Message-ID: <20070702182952.9567380AB@code0.codespeak.net> Author: scoder Date: Mon Jul 2 20:29:50 2007 New Revision: 44678 Modified: lxml/trunk/src/lxml/tests/test_etree.py Log: test script cleanup Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Mon Jul 2 20:29:50 2007 @@ -14,7 +14,7 @@ from common_imports import SillyFileLike, canonicalize, doctest print -print "TESTED VERSION:" +print "TESTED VERSION:", etree.__version__ print " Python: ", sys.version_info print " lxml.etree: ", etree.LXML_VERSION print " libxml used: ", etree.LIBXML_VERSION @@ -24,7 +24,7 @@ print try: - sorted(()) + sorted except NameError: # Python 2.3 def sorted(seq): From scoder at codespeak.net Mon Jul 2 21:44:11 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 2 Jul 2007 21:44:11 +0200 (CEST) Subject: [Lxml-checkins] r44679 - lxml/trunk/doc Message-ID: <20070702194411.F1DB280AA@code0.codespeak.net> Author: scoder Date: Mon Jul 2 21:44:10 2007 New Revision: 44679 Modified: lxml/trunk/doc/tutorial.txt Log: doctest 
readability Modified: lxml/trunk/doc/tutorial.txt ============================================================================== --- lxml/trunk/doc/tutorial.txt (original) +++ lxml/trunk/doc/tutorial.txt Mon Jul 2 21:44:10 2007 @@ -426,7 +426,7 @@ ... E.body( ... E.h1("Hello!", CLASS("title")), ... E.p("This is a paragraph with ", E.b("bold"), " text in it!"), - ... E.p("This is another paragraph, with a ", + ... E.p("This is another paragraph, with a", "\n ", ... E.a("link", href="http://www.python.org"), "."), ... E.p("Here are some reservered characters: ."), ... etree.XML("

And finally an embedded XHTML fragment.

"), @@ -442,7 +442,8 @@

Hello!

This is a paragraph with bold text in it!

-

This is another paragraph, with a link.

+

This is another paragraph, with a + link.

Here are some reservered characters: <spam&egg>.

And finally an embedded XHTML fragment.

From ianb at codespeak.net Tue Jul 3 00:54:34 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Tue, 3 Jul 2007 00:54:34 +0200 (CEST) Subject: [Lxml-checkins] r44681 - in lxml/branch/html/src/lxml/html: . tests Message-ID: <20070702225434.C572980B0@code0.codespeak.net> Author: ianb Date: Tue Jul 3 00:54:33 2007 New Revision: 44681 Modified: lxml/branch/html/src/lxml/html/css.py lxml/branch/html/src/lxml/html/tests/test_css.py lxml/branch/html/src/lxml/html/tests/test_css.txt lxml/branch/html/src/lxml/html/tests/test_css_select.txt Log: Created CSSSelector, added a public __all__ for the css module; renamed the xpath() function; put in some tests for parse_series and fixed the results; added NotImplemented errors for *:something-of-type psuedoclasses, which I can't well implement with XPath; change inheritance of the exceptions Modified: lxml/branch/html/src/lxml/html/css.py ============================================================================== --- lxml/branch/html/src/lxml/html/css.py (original) +++ lxml/branch/html/src/lxml/html/css.py Tue Jul 3 00:54:33 2007 @@ -1,12 +1,31 @@ import re from lxml import etree -class SelectorSyntaxError(Exception): +__all__ = ['SelectorSyntaxError', 'ExpressionError', + 'CSSSelector'] + +class SelectorSyntaxError(SyntaxError): pass -class ExpressionError(Exception): +class ExpressionError(RuntimeError): pass +class CSSSelector(etree.XPath): + + def __init__(self, css): + path = css_to_xpath(css) + etree.XPath.__init__(self, path) + self.css = css + + def __repr__(self): + return '<%s %s for %r>' % ( + self.__class__.__name__, + hex(abs(id(self)))[2:], + self.css) + +############################## +## Token objects: + class _UniToken(unicode): def __new__(cls, contents, pos): obj = unicode.__new__(cls, contents) @@ -91,14 +110,14 @@ def _xpath_nth_child(self, xpath, expr, last=False, add_name_test=True): a, b = parse_series(expr) - if not a: + if not a and not b: # a=0 means nothing is returned... 
xpath.add_condition('false() and position() = 0') return xpath if add_name_test: xpath.add_name_test() xpath.add_star_prefix() - if a == 1: + if a == 0: if last: b = 'last() - %s' % b xpath.add_condition('position() = %s' % b) @@ -111,12 +130,17 @@ b_neg = str(-b) else: b_neg = '+%s' % (-b) - expr = '(position() %s) mod %s = 0' % (b_neg, a) + if a != 1: + expr = ['(position() %s) mod %s = 0' % (b_neg, a)] + else: + expr = [] if b >= 0: - expr += ' and position() >= %s' % b + expr.append('position() >= %s' % b) elif b < 0 and last: - expr += ' and position() < (last() %s)' % b - xpath.add_condition(expr) + expr.append('position() < (last() %s)' % b) + expr = ' and '.join(expr) + if expr: + xpath.add_condition(expr) return xpath # FIXME: handle an+b, odd, even # an+b means every-a, plus b, e.g., 2n+1 means odd @@ -130,6 +154,9 @@ return self._xpath_nth_child(xpath, expr, last=True) def _xpath_nth_of_type(self, xpath, expr): + if xpath.element == '*': + raise NotImplementedError( + "*:nth-of-type() is not implemented") return self._xpath_nth_child(xpath, expr, add_name_test=False) def _xpath_nth_last_of_type(self, xpath, expr): @@ -215,11 +242,17 @@ return xpath def _xpath_first_of_type(self, xpath): + if xpath.element == '*': + raise NotImplementedError( + "*:first-of-type is not implemented") xpath.add_star_prefix() xpath.add_condition('position() = 1') return xpath def _xpath_last_of_type(self, xpath): + if xpath.element == '*': + raise NotImplementedError( + "*:last-of-type is not implemented") xpath.add_star_prefix() xpath.add_condition('position() = last()') return xpath @@ -230,6 +263,9 @@ return xpath def _xpath_only_of_type(self, xpath): + if xpath.element == '*': + raise NotImplementedError( + "*:only-of-type is not implemented") xpath.add_condition('last() = 1') return xpath @@ -343,7 +379,7 @@ else: # FIXME: Should we lowercase here? 
el = '%s:%s' % (self.namespace, self.element) - return XPath(element=el) + return XPathExpr(element=el) class Hash(object): """ @@ -375,7 +411,7 @@ def xpath(self): paths = [item.xpath() for item in self.items] - return XPathOr(paths) + return XPathExprOr(paths) class CombinedSelector(object): @@ -435,9 +471,9 @@ return xpath ############################## -## XPath objects: +## XPathExpr objects: -def xpath(css_expr, prefix='descendant-or-self::'): +def css_to_xpath(css_expr, prefix='descendant-or-self::'): if isinstance(css_expr, basestring): css_expr = parse(css_expr) expr = css_expr.xpath() @@ -447,14 +483,7 @@ expr.add_prefix(prefix) return str(expr) -def run_xpath(doc, xpath): - return [el for el in doc.xpath(xpath) - if isinstance(el, etree.ElementBase)] - -def run_css(doc, css): - return run_xpath(doc, xpath(css)) - -class XPath(object): +class XPathExpr(object): def __init__(self, prefix=None, path=None, element='*', condition=None, star_prefix=False): @@ -529,7 +558,7 @@ self.element = other.element self.condition = other.condition -class XPathOr(XPath): +class XPathExprOr(XPathExpr): """ Represents on |'d expressions. Note that unfortunately it isn't @@ -547,7 +576,9 @@ return ' | '.join([prefix + str(i) for i in self.items]) def xpath_repr(s): - # FIXME: I don't think this is right + # FIXME: I don't think this is right, but lacking any reasonable + # specification on what XPath literals look like (which doesn't seem + # to be in the XPath specification) it is hard to do 'right' if isinstance(s, Element): # This is probably a symbol that looks like an expression... 
s = s._format_element() @@ -703,11 +734,11 @@ if isinstance(s, Element): s = s._format_element() if not s or s == '*': - # Happens when there's nothing, which CSS things of as * - return (1, 0) + # Happens when there's nothing, which the CSS parser thinks of as * + return (0, 0) if isinstance(s, int): # Happens when you just get a number - return (1, s) + return (0, s) if s == 'odd': return (2, 1) elif s == 'even': @@ -716,7 +747,7 @@ return (1, 0) if 'n' not in s: # Just a b - return int(s) + return (0, int(s)) a, b = s.split('n', 1) if not a: a = 1 Modified: lxml/branch/html/src/lxml/html/tests/test_css.py ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_css.py (original) +++ lxml/branch/html/src/lxml/html/tests/test_css.py Tue Jul 3 00:54:33 2007 @@ -69,7 +69,7 @@ body = doc.xpath('//body')[0] bad = [] selector, count = self.selectors[self.index] - xpath = css.xpath(css.parse(selector)) + xpath = css.css_to_xpath(css.parse(selector)) try: results = body.xpath(xpath) except Exception, e: Modified: lxml/branch/html/src/lxml/html/tests/test_css.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_css.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_css.txt Tue Jul 3 00:54:33 2007 @@ -110,6 +110,23 @@ e/following-sibling::f >>> xpath('div#container p') div[@id = 'container']/descendant::p - >>> # FIXME: This isn't right, but I don't know what *is* right >>> xpath('p *:only-of-type') - p/descendant::*[last() = 1] + Traceback (most recent call last): + ... 
+ NotImplementedError: *:only-of-type is not implemented + +Then of parse_series: + + >>> from lxml.html.css import parse_series + >>> parse_series('1n+3') + (1, 3) + >>> parse_series('n-5') + (1, -5) + >>> parse_series('odd') + (2, 1) + >>> parse_series('3n') + (3, 0) + >>> parse_series('n') + (1, 0) + >>> parse_series('5') + (0, 5) \ No newline at end of file Modified: lxml/branch/html/src/lxml/html/tests/test_css_select.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_css_select.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_css_select.txt Tue Jul 3 00:54:33 2007 @@ -1,7 +1,7 @@ This is a test of CSS selectors. We setup a document we'll use for all our selections, and a function make querying simpler: - >>> from lxml.html.css import run_css, xpath + >>> from lxml.html.css import CSSSelector >>> from lxml.html import HTML >>> doc = HTML(''' ... @@ -35,10 +35,10 @@ >>> for count, el in enumerate(doc.getiterator()): ... order[el] = count >>> def select_ids(selector): - ... items = run_css(doc, selector) + ... items = CSSSelector(selector)(doc) ... if not items: ... return 'empty' - ... items = run_css(doc, selector) + ... items = CSSSelector(selector)(doc) ... items.sort(key=lambda el: order[el]) ... return ', '.join([el.get('id', 'nil') for el in items]) >>> def pcss(main, *selectors): @@ -114,7 +114,9 @@ >>> pcss('div *:only-child') foobar-span >>> pcss('p *:only-of-type') - p-em + Traceback (most recent call last): + ... 
+ NotImplementedError: *:only-of-type is not implemented >>> pcss('p:only-of-type') paragraph >>> pcss('a:empty') From ianb at codespeak.net Tue Jul 3 01:26:02 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Tue, 3 Jul 2007 01:26:02 +0200 (CEST) Subject: [Lxml-checkins] r44682 - lxml/branch/html/src/lxml/html Message-ID: <20070702232602.5859580BC@code0.codespeak.net> Author: ianb Date: Tue Jul 3 01:26:00 2007 New Revision: 44682 Modified: lxml/branch/html/src/lxml/html/css.py Log: Fix a problem with nth-last-child; adjust el:empty a little Modified: lxml/branch/html/src/lxml/html/css.py ============================================================================== --- lxml/branch/html/src/lxml/html/css.py (original) +++ lxml/branch/html/src/lxml/html/css.py Tue Jul 3 01:26:00 2007 @@ -110,7 +110,7 @@ def _xpath_nth_child(self, xpath, expr, last=False, add_name_test=True): a, b = parse_series(expr) - if not a and not b: + if not a and not b and not last: # a=0 means nothing is returned... xpath.add_condition('false() and position() = 0') return xpath @@ -270,7 +270,7 @@ return xpath def _xpath_empty(self, xpath): - xpath.add_condition("count(./child::*) = 0 and normalize-space(.) = ''") + xpath.add_condition("not(*) and not(normalize-space())") return xpath class Attrib(object): From ianb at codespeak.net Tue Jul 3 01:58:14 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Tue, 3 Jul 2007 01:58:14 +0200 (CEST) Subject: [Lxml-checkins] r44683 - in lxml/branch/html/src/lxml/html: . tests Message-ID: <20070702235814.ACBA680BB@code0.codespeak.net> Author: ianb Date: Tue Jul 3 01:58:14 2007 New Revision: 44683 Modified: lxml/branch/html/src/lxml/html/css.py lxml/branch/html/src/lxml/html/tests/test_css.py lxml/branch/html/src/lxml/html/tests/test_css.txt lxml/branch/html/src/lxml/html/tests/test_css_select.txt Log: Fix :only-child; adjust some tests that were acquired, where I don't understand the numbers they used. 
Modified: lxml/branch/html/src/lxml/html/css.py ============================================================================== --- lxml/branch/html/src/lxml/html/css.py (original) +++ lxml/branch/html/src/lxml/html/css.py Tue Jul 3 01:58:14 2007 @@ -259,6 +259,7 @@ def _xpath_only_child(self, xpath): xpath.add_name_test() + xpath.add_star_prefix() xpath.add_condition('last() = 1') return xpath Modified: lxml/branch/html/src/lxml/html/tests/test_css.py ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_css.py (original) +++ lxml/branch/html/src/lxml/html/tests/test_css.py Tue Jul 3 01:58:14 2007 @@ -12,9 +12,14 @@ class CSSTestCase(unittest.TestCase): selectors = [ - ('*', 252), + ## Changed from original; probably because I'm only searching the body + #('*', 252), + ('*', 246), ('div:only-child', 22), # ? - ('div:contains(CELIA)', 243), + ## Changed from original, because the original doesn't make sense. + ## There really aren't that many occurrances of 'celia' + #('div:contains(CELIA)', 243), + ('div:contains(CELIA)', 30), ('div:nth-child(even)', 106), ('div:nth-child(2n)', 106), ('div:nth-child(odd)', 137), Modified: lxml/branch/html/src/lxml/html/tests/test_css.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_css.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_css.txt Tue Jul 3 01:58:14 2007 @@ -87,11 +87,11 @@ >>> xpath('E:last-of-type') */e[position() = last()] >>> xpath('E:only-child') - *[name() = 'e' and (last() = 1)] + */*[name() = 'e' and (last() = 1)] >>> xpath('E:only-of-type') e[last() = 1] >>> xpath('E:empty') - e[count(./child::*) = 0 and normalize-space(.) 
= ''] + e[not(*) and not(normalize-space())] >>> xpath('E:contains("foo")') e[contains(css:lower-case(string(.)), 'foo')] >>> xpath('E.warning') @@ -129,4 +129,4 @@ >>> parse_series('n') (1, 0) >>> parse_series('5') - (0, 5) \ No newline at end of file + (0, 5) Modified: lxml/branch/html/src/lxml/html/tests/test_css_select.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_css_select.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_css_select.txt Tue Jul 3 01:58:14 2007 @@ -148,4 +148,3 @@ nofollow-anchor >>> pcss('ol#first-ol li:last-child', 'ol#first-ol *:last-child') seventh-li - \ No newline at end of file From ianb at codespeak.net Tue Jul 3 02:10:56 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Tue, 3 Jul 2007 02:10:56 +0200 (CEST) Subject: [Lxml-checkins] r44684 - lxml/branch/html/src/lxml/html Message-ID: <20070703001056.A86F380BB@code0.codespeak.net> Author: ianb Date: Tue Jul 3 02:10:56 2007 New Revision: 44684 Modified: lxml/branch/html/src/lxml/html/css.py Log: Add some fast translation for id, class, and plain element name matches Modified: lxml/branch/html/src/lxml/html/css.py ============================================================================== --- lxml/branch/html/src/lxml/html/css.py (original) +++ lxml/branch/html/src/lxml/html/css.py Tue Jul 3 02:10:56 2007 @@ -474,8 +474,23 @@ ############################## ## XPathExpr objects: +_el_re = re.compile(r'^\w+\s*$') +_id_re = re.compile(r'^(\w*)#(\w+)\s*$') +_class_re = re.compile(r'^(\w*)\.(\w+)\s*$') + def css_to_xpath(css_expr, prefix='descendant-or-self::'): if isinstance(css_expr, basestring): + match = _el_re.search(css_expr) + if match is not None: + return '%s%s' % (prefix, match.group(0).strip()) + match = _id_re.search(css_expr) + if match is not None: + return "%s%s[@id = '%s']" % ( + prefix, match.group(1) or '*', match.group(2)) + match = _class_re.search(css_expr) + if 
match is not None: + return "%s%s[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % ( + prefix, match.group(1) or '*', match.group(2)) css_expr = parse(css_expr) expr = css_expr.xpath() assert expr is not None, ( From ianb at codespeak.net Tue Jul 3 03:29:50 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Tue, 3 Jul 2007 03:29:50 +0200 (CEST) Subject: [Lxml-checkins] r44685 - in lxml/branch/html/src/lxml/html: . tests Message-ID: <20070703012950.D3C5180BE@code0.codespeak.net> Author: ianb Date: Tue Jul 3 03:29:49 2007 New Revision: 44685 Modified: lxml/branch/html/src/lxml/html/clean.py lxml/branch/html/src/lxml/html/tests/test_clean.txt Log: Moved to a class-based cleaner instead of a function. Resulting rearrangement Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Tue Jul 3 03:29:49 2007 @@ -64,66 +64,44 @@ clean(doc, **kw) return tostring(doc) -# FIXME: I really have to figure out what a sane set of defaults is -# for these keyword arguments. And is this signature out of control? -# What about if we want things like whitelisting of or other -# controls? Maybe this has to be more than a function. -def clean(doc, - scripts=True, - javascript=True, - comments=True, - style=False, - links=True, - meta=True, - page_structure=True, - processing_instructions=True, - embedded=True, - frames=True, - forms=True, - annoying_tags=True, - remove_tags=None, - allow_tags=None, - strip_tags=True, - remove_unknown_tags=True, - safe_attrs_only=True, - add_nofollow=False, - # callbacks? - ): +class Cleaner(object): """ - Cleans the document of each of the possible offending elements: + Instances cleans the document of each of the possible offending + elements. 
The cleaning is controlled by attributes; you can + override attributes in a subclass, or set them in the constructor. ``scripts``: - Any `` + ... + ... + ... + ... + ... + ... a link + ... another link + ...

a paragraph

+ ...
secret EVIL!
+ ... of EVIL! + ... + ...
+ ... Password: + ...
+ ... annoying EVIL! + ... spam spam SPAM! + ... + ... + ... ''' + +To remove the all suspicious content from this unparsed document, use the +``clean_html`` function.:: + + >>> from lxml.html.clean import clean_html + + >>> print clean_html(html) + + +
+ + a link + another link +

a paragraph

+
secret EVIL!
+ of EVIL! + Password: + annoying EVIL! + spam spam SPAM! + +
+ + + +The ``Cleaner`` class supports several keyword arguments to control exactly +which content is removed:: + + >>> from lxml.html.clean import Cleaner + + >>> cleaner = Cleaner(page_structure=False, links=False) + >>> print cleaner.clean_html(html) + + + + + + + a link + another link +

a paragraph

+
secret EVIL!
+ of EVIL! + Password: + annoying EVIL! + spam spam SPAM! + + + + + >>> cleaner = Cleaner(style=True, links=True, add_nofollow=True, + ... page_structure=False, safe_attrs_only=False) + + >>> print cleaner.clean_html(html) + + + + + a link + another link +

a paragraph

+
secret EVIL!
+ of EVIL! + Password: + annoying EVIL! + spam spam SPAM! + + + Modified: lxml/branch/html/doc/mkhtml.py ============================================================================== --- lxml/branch/html/doc/mkhtml.py (original) +++ lxml/branch/html/doc/mkhtml.py Thu Jul 5 21:50:03 2007 @@ -6,7 +6,8 @@ 'performance.txt', 'build.txt')), ('Developing with lxml', ('tutorial.txt', 'api.txt', 'parsing.txt', 'validation.txt', 'xpathxslt.txt', - 'objectify.txt')), + 'objectify.txt', 'lxmlhtml.txt', + 'cssselect.txt')), ('Extending lxml', ('resolvers.txt', 'extensions.txt', 'element_classes.txt', 'sax.txt', 'capi.txt')), ] From scoder at codespeak.net Thu Jul 5 21:52:14 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 5 Jul 2007 21:52:14 +0200 (CEST) Subject: [Lxml-checkins] r44757 - lxml/trunk/src/lxml Message-ID: <20070705195214.D36FD80E9@code0.codespeak.net> Author: scoder Date: Thu Jul 5 21:52:14 2007 New Revision: 44757 Modified: lxml/trunk/src/lxml/objectify.pyx Log: small fix Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Thu Jul 5 21:52:14 2007 @@ -1054,7 +1054,7 @@ result = python.PyDict_GetItem(_PYTYPE_DICT, name) if result is NULL: return None - return (result)._stringify + return (result)._stringify return result def __contains__(self, type): From scoder at codespeak.net Thu Jul 5 21:52:24 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 5 Jul 2007 21:52:24 +0200 (CEST) Subject: [Lxml-checkins] r44758 - lxml/trunk/doc Message-ID: <20070705195224.B539780E9@code0.codespeak.net> Author: scoder Date: Thu Jul 5 21:52:24 2007 New Revision: 44758 Modified: lxml/trunk/doc/main.txt Log: small doc update Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ 
lxml/trunk/doc/main.txt Thu Jul 5 21:52:24 2007 @@ -35,6 +35,10 @@ .. _FAQ: FAQ.html +This page describes the current in-development version of lxml that will +eventually become lxml 2.0. + + Documentation ------------- From scoder at codespeak.net Thu Jul 5 21:56:34 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 5 Jul 2007 21:56:34 +0200 (CEST) Subject: [Lxml-checkins] r44759 - lxml/branch/html/src/lxml/html/tests Message-ID: <20070705195634.6C80080F3@code0.codespeak.net> Author: scoder Date: Thu Jul 5 21:56:34 2007 New Revision: 44759 Modified: lxml/branch/html/src/lxml/html/tests/test_css.txt Log: small import fix after module move Modified: lxml/branch/html/src/lxml/html/tests/test_css.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_css.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_css.txt Thu Jul 5 21:56:34 2007 @@ -1,6 +1,6 @@ A quick test of tokenizing: - >>> from lxml.html.css import tokenize, parse + >>> from lxml.cssselect import tokenize, parse >>> def ptok(s): ... for item in tokenize(s): ... 
print repr(item) @@ -117,7 +117,7 @@ Then of parse_series: - >>> from lxml.html.css import parse_series + >>> from lxml.cssselect import parse_series >>> parse_series('1n+3') (1, 3) >>> parse_series('n-5') From ianb at codespeak.net Fri Jul 6 02:03:51 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 6 Jul 2007 02:03:51 +0200 (CEST) Subject: [Lxml-checkins] r44764 - lxml/branch/html/doc Message-ID: <20070706000351.85305811D@code0.codespeak.net> Author: ianb Date: Fri Jul 6 02:03:49 2007 New Revision: 44764 Modified: lxml/branch/html/doc/cssselect.txt lxml/branch/html/doc/lxmlhtml.txt Log: added some docs Modified: lxml/branch/html/doc/cssselect.txt ============================================================================== --- lxml/branch/html/doc/cssselect.txt (original) +++ lxml/branch/html/doc/cssselect.txt Fri Jul 6 02:03:49 2007 @@ -15,7 +15,6 @@ .. 1 Finding nodes - The CSSSelector class ===================== @@ -23,4 +22,70 @@ provides the same interface as the XPath_ class, but accepts a CSS selector expression as input:: - >>> \ No newline at end of file + >>> sel = CSSSelector('div.content') + >>> sel + + +The selector actually compiles to XPath, and you can see the +expression by inspecting the object:: + + >>> sel.path + "descendant-or-self::div[contains(concat(' ', normalize-space(@class), ' '), ' content ')]" + +To use the selector, simply call it with a document or element +object:: + + >>> from lxml.etree import HTML + >>> h = HTML('''
+ ...
+ ... text + ...
''') + >>> [e.get('id') for e in sel(h)] + ['inner'] + +CSS Selectors +============= + +This libraries attempts to implement CSS selectors `as described in +the w3c specification +`_. Many of +the pseudo-classes do not apply in this context, including all +`dynamic pseudo-classes +`_. +In particular these will not be available: + +* link state: ``:link``, ``:visited``, ``:target`` +* actions: ``:hover``, ``:active``, ``:focus`` +* UI states: ``:enabled``, ``:disabled``, ``:indeterminate`` + (``:checked`` and ``:unchecked`` *are* available) + +Also, none of the psuedo-elements apply, because the selector only +returns elements and psuedo-elements select portions of text, like +``::first-line``. + +Namespaces +---------- + +In CSS you can use ``namespace-prefix|element``, similar to +``namespace-prefix:element`` in an XPath expression. In fact, it maps +one-to-one, and the same rules are used to map namespace prefixes to +namespace URIs. + +Limitations +=========== + +These applicable pseudoclasses are not yet implemented: + +* ``:lang(language)`` +* ``:root`` +* ``*:first-of-type``, ``*:last-of-type``, ``*:nth-of-type``, + ``*:nth-last-of-type``, ``*:only-of-type``. All of these work when + you specify an element type, but not with ``*`` + +Unlike XPath you cannot provide parameters in your expressions -- all +expressions are completely static. + +XPath has underspecified string quoting rules (there seems to be no +string quoting at all), so if you use expressions that contain +characters that requiring quoting you might have problems with the +translation from CSS to XPath. Modified: lxml/branch/html/doc/lxmlhtml.txt ============================================================================== --- lxml/branch/html/doc/lxmlhtml.txt (original) +++ lxml/branch/html/doc/lxmlhtml.txt Fri Jul 6 02:03:49 2007 @@ -28,16 +28,17 @@ One of the interesting modules in the ``lxml.html`` package deals with doctests. 
It can be hard to compare two HTML pages for equality, as -whitespace sequences need to be ignored and the structural formatting can -differ. This is even more a problem in doctests, where output is tested for -equality and small differences in whitespace or the order of attributes can -let a test fail. And given the verbosity of tag-based languages, it may take -more than a quick look to find the actual differences in the doctest output. - -Luckily, lxml provides the ``lxml.doctestcompare`` module that supports -relaxed comparison of XML and HTML pages and provides a readable diff in the -output when a test fails. It is most easily used by importing the -``usedoctest`` module in a doctest:: +whitespace differences aren't meaningful and the structural formatting +can differ. This is even more a problem in doctests, where output is +tested for equality and small differences in whitespace or the order +of attributes can let a test fail. And given the verbosity of +tag-based languages, it may take more than a quick look to find the +actual differences in the doctest output. + +Luckily, lxml provides the ``lxml.doctestcompare`` module that +supports relaxed comparison of XML and HTML pages and provides a +readable diff in the output when a test fails. The HTML comparison is +most easily used by importing the ``usedoctest`` module in a doctest:: >>> from lxml.html import usedoctest @@ -70,8 +71,9 @@ above. This allows you to concentrate on readability in your doctests, even if the real output is a straight ugly HTML one-liner. -Note that there is also an ``lxml.usedoctest`` module which you can import for -XML comparisons. +Note that there is also an ``lxml.usedoctest`` module which you can +import for XML comparisons. The HTML parser notably ignores +namespaces and some other XMLisms. Parsing HTML @@ -120,10 +122,76 @@ +Note that you should use ``lxml.html.tostring`` and **not** +``lxml.tostring``. 
``lxml.tostring(doc)`` will return the XML +representation of the document, which is not valid HTML. In +particular, things like ```` will be +serialized as `` Modified: lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py (original) +++ lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py Mon Jul 9 21:27:25 2007 @@ -5,7 +5,6 @@ from lxml.tests.common_imports import doctest from lxml.doctestcompare import LHTMLOutputChecker -from lxml.html import HTML, parse_element from lxml.html.clean import clean, Cleaner feed_dirs = [ Modified: lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt Mon Jul 9 21:27:25 2007 @@ -14,7 +14,7 @@ Some basics:: - >>> from lxml.html import usedoctest, parse_element, tostring + >>> from lxml.html import usedoctest, tostring >>> from lxml.html import rewrite_links >>> print rewrite_links( ... 'link', relocate_href) From scoder at codespeak.net Mon Jul 9 22:13:49 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 9 Jul 2007 22:13:49 +0200 (CEST) Subject: [Lxml-checkins] r44883 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20070709201349.792C3822F@code0.codespeak.net> Author: scoder Date: Mon Jul 9 22:13:48 2007 New Revision: 44883 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/tests/test_etree.py Log: raise ValueError tag names containing ':' Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon Jul 9 22:13:48 2007 @@ -34,6 +34,8 @@ Bugs fixed ---------- +* ``Element()`` did not raise an exception on tag names containing ':' + * The XML parser did not report undefined entities as error * The text in exceptions raised by XML parsers, validators and XPath Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Mon Jul 9 22:13:48 2007 @@ -708,6 +708,8 @@ c_ns_end = cstd.strchr(c_tag, c'}') if c_ns_end is NULL: raise ValueError, "Invalid tag name" + if cstd.strchr(c_ns_end, c':') is not NULL: + raise ValueError, "Invalid tag name" nslen = c_ns_end - c_tag taglen = python.PyString_GET_SIZE(tag) - nslen - 2 if taglen == 0: @@ -717,6 +719,8 @@ tag = python.PyString_FromStringAndSize(c_ns_end+1, taglen) elif python.PyString_GET_SIZE(tag) == 0: raise ValueError, "Empty tag name" + elif cstd.strchr(c_tag, c':') is not NULL: + raise ValueError, "Invalid tag name" return ns, tag cdef object _namespacedName(xmlNode* c_node): Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Mon Jul 9 22:13:48 2007 @@ -52,14 +52,27 @@ def test_element_names(self): Element = self.etree.Element - el = Element('name') self.assertEquals(el.tag, 'name') el = Element('{}name') 
self.assertEquals(el.tag, 'name') + + def test_element_name_empty(self): + Element = self.etree.Element + el = Element('name') + self.assertRaises(ValueError, Element, '{}') + self.assertRaises(ValueError, setattr, el, 'tag', '{}') + self.assertRaises(ValueError, Element, '{test}') self.assertRaises(ValueError, setattr, el, 'tag', '{test}') + def test_element_name_colon(self): + Element = self.etree.Element + self.assertRaises(ValueError, Element, 'p:name') + + el = Element('name') + self.assertRaises(ValueError, setattr, el, 'tag', 'p:name') + def test_attribute_set(self): # ElementTree accepts arbitrary attribute values # lxml.etree allows only strings From scoder at codespeak.net Mon Jul 9 22:14:23 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 9 Jul 2007 22:14:23 +0200 (CEST) Subject: [Lxml-checkins] r44884 - lxml/trunk/src/lxml/tests Message-ID: <20070709201423.8EECF822E@code0.codespeak.net> Author: scoder Date: Mon Jul 9 22:14:23 2007 New Revision: 44884 Modified: lxml/trunk/src/lxml/tests/test_etree.py Log: extended test case Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Mon Jul 9 22:14:23 2007 @@ -69,6 +69,7 @@ def test_element_name_colon(self): Element = self.etree.Element self.assertRaises(ValueError, Element, 'p:name') + self.assertRaises(ValueError, Element, '{test}p:name') el = Element('name') self.assertRaises(ValueError, setattr, el, 'tag', 'p:name') From scoder at codespeak.net Mon Jul 9 22:40:32 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 9 Jul 2007 22:40:32 +0200 (CEST) Subject: [Lxml-checkins] r44885 - in lxml/trunk: . 
src/lxml Message-ID: <20070709204032.9F0D48227@code0.codespeak.net> Author: scoder Date: Mon Jul 9 22:40:32 2007 New Revision: 44885 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/etree.pyx Log: give fromstring() its own implementation and docstring, independent of XML() Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon Jul 9 22:40:32 2007 @@ -8,6 +8,9 @@ Features added -------------- +* ``etree.fromstring()`` now supports parsing both HTML and XML, depending on + the parser you pass. + * Support ``base_url`` keyword argument in ``HTML()`` and ``XML()`` * Extended type support for ``objectify.E`` based on registered PyTypes. Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Mon Jul 9 22:40:32 2007 @@ -1935,7 +1935,9 @@ def XML(text, _BaseParser parser=None, base_url=None): """Parses an XML document from a string constant. This function can be used - to embed "XML literals" in Python code. + to embed "XML literals" in Python code, like in + + >>> root = etree.XML("") To override the parser with a different ``XMLParser`` you can pass it to the ``parser`` keyword argument. @@ -1952,7 +1954,21 @@ doc = _parseMemoryDocument(text, base_url, parser) return doc.getroot() -fromstring = XML +def fromstring(text, _BaseParser parser=None, base_url=None): + """Parses an XML document from a string. + + To override the default parser with a different parser you can pass it to + the ``parser`` keyword argument. + + The ``base_url`` keyword argument allows to set the original base URL of + the document to support relative Paths when looking up external entities + (DTD, XInclude, ...). 
+ """ + cdef _Document doc + if parser is None: + parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser() + doc = _parseMemoryDocument(text, base_url, parser) + return doc.getroot() def iselement(element): """Checks if an object appears to be a valid element object. From lxml-checkins at codespeak.net Wed Jul 11 17:15:53 2007 From: lxml-checkins at codespeak.net (lxml-checkins at codespeak.net) Date: Wed, 11 Jul 2007 17:15:53 +0200 (CEST) Subject: [Lxml-checkins] Doctor Janine : 45% off your order Message-ID: <20070711061858.11514.qmail@granato.finbeta.com> An HTML attachment was scrubbed... URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20070711/429a4a81/attachment-0001.htm From scoder at codespeak.net Thu Jul 12 00:03:26 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 12 Jul 2007 00:03:26 +0200 (CEST) Subject: [Lxml-checkins] r44936 - lxml/trunk/doc Message-ID: <20070711220326.3EAA18160@code0.codespeak.net> Author: scoder Date: Thu Jul 12 00:03:25 2007 New Revision: 44936 Modified: lxml/trunk/doc/parsing.txt Log: note on parsing from filenames Modified: lxml/trunk/doc/parsing.txt ============================================================================== --- lxml/trunk/doc/parsing.txt (original) +++ lxml/trunk/doc/parsing.txt Thu Jul 12 00:03:25 2007 @@ -28,10 +28,21 @@ >>> xml = '' - >>> et = etree.parse(StringIO(xml)) - >>> print etree.tostring(et.getroot()) + >>> tree = etree.parse(StringIO(xml)) + >>> print etree.tostring(tree.getroot()) +Note how the ``parse()`` function reads from a file-like object here. If +parsing is done from a real file, it is more common (and also more efficient) +to pass a filename or a URL. HTTP and FTP access is directly supported by +libxml2, as well as gzip-compressed files (.gz). + +If you want to parse from memory and still provide a base URL for the document +(e.g. 
to support relative paths in an XInclude), you can provide the +``base_url`` keyword argument:: + + >>> tree = etree.parse("test.xml") + Parser options -------------- @@ -40,8 +51,8 @@ example is easily extended to clean up namespaces during parsing:: >>> parser = etree.XMLParser(ns_clean=True) - >>> et = etree.parse(StringIO(xml), parser) - >>> print etree.tostring(et.getroot()) + >>> tree = etree.parse(StringIO(xml), parser) + >>> print etree.tostring(tree.getroot()) The keyword arguments in the constructor are mainly based on the libxml2 @@ -81,9 +92,9 @@ >>> broken_html = "test<body><h1>page title</h3>" >>> parser = etree.HTMLParser() - >>> et = etree.parse(StringIO(broken_html), parser) + >>> tree = etree.parse(StringIO(broken_html), parser) - >>> print etree.tostring(et.getroot(), pretty_print=True) + >>> print etree.tostring(tree.getroot(), pretty_print=True) <html> <head> <title>test From scoder at codespeak.net Thu Jul 12 00:13:33 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 12 Jul 2007 00:13:33 +0200 (CEST) Subject: [Lxml-checkins] r44937 - in lxml/branch/lxml-1.3: . src/lxml src/lxml/tests Message-ID: <20070711221333.5535B8160@code0.codespeak.net> Author: scoder Date: Thu Jul 12 00:13:32 2007 New Revision: 44937 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi lxml/branch/lxml-1.3/src/lxml/etree.pyx lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py Log: trunk merge: split of fromstring() and XML(), raise exception on tag names containing ':' Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Thu Jul 12 00:13:32 2007 @@ -8,11 +8,16 @@ Features added -------------- +* ``etree.fromstring()`` now supports parsing both HTML and XML, depending on + the parser you pass. 
+ * Support ``base_url`` keyword argument in ``HTML()`` and ``XML()`` Bugs fixed ---------- +* ``Element()`` did not raise an exception on tag names containing ':' + 1.3.2 (2007-07-03) ================== Modified: lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi Thu Jul 12 00:13:32 2007 @@ -708,6 +708,8 @@ c_ns_end = cstd.strchr(c_tag, c'}') if c_ns_end is NULL: raise ValueError, "Invalid tag name" + if cstd.strchr(c_ns_end, c':') is not NULL: + raise ValueError, "Invalid tag name" nslen = c_ns_end - c_tag taglen = python.PyString_GET_SIZE(tag) - nslen - 2 if taglen == 0: @@ -717,6 +719,8 @@ tag = python.PyString_FromStringAndSize(c_ns_end+1, taglen) elif python.PyString_GET_SIZE(tag) == 0: raise ValueError, "Empty tag name" + elif cstd.strchr(c_tag, c':') is not NULL: + raise ValueError, "Invalid tag name" return ns, tag cdef object _namespacedName(xmlNode* c_node): Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/etree.pyx Thu Jul 12 00:13:32 2007 @@ -1892,7 +1892,9 @@ def XML(text, _BaseParser parser=None, base_url=None): """Parses an XML document from a string constant. This function can be used - to embed "XML literals" in Python code. + to embed "XML literals" in Python code, like in + + >>> root = etree.XML("") To override the parser with a different ``XMLParser`` you can pass it to the ``parser`` keyword argument. @@ -1909,7 +1911,21 @@ doc = _parseMemoryDocument(text, base_url, parser) return doc.getroot() -fromstring = XML +def fromstring(text, _BaseParser parser=None, base_url=None): + """Parses an XML document from a string. 
+ + To override the default parser with a different parser you can pass it to + the ``parser`` keyword argument. + + The ``base_url`` keyword argument allows to set the original base URL of + the document to support relative Paths when looking up external entities + (DTD, XInclude, ...). + """ + cdef _Document doc + if parser is None: + parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser() + doc = _parseMemoryDocument(text, base_url, parser) + return doc.getroot() def iselement(element): """Checks if an object appears to be a valid element object. Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py Thu Jul 12 00:13:32 2007 @@ -52,14 +52,28 @@ def test_element_names(self): Element = self.etree.Element - el = Element('name') self.assertEquals(el.tag, 'name') el = Element('{}name') self.assertEquals(el.tag, 'name') + + def test_element_name_empty(self): + Element = self.etree.Element + el = Element('name') + self.assertRaises(ValueError, Element, '{}') + self.assertRaises(ValueError, setattr, el, 'tag', '{}') + self.assertRaises(ValueError, Element, '{test}') self.assertRaises(ValueError, setattr, el, 'tag', '{test}') + def test_element_name_colon(self): + Element = self.etree.Element + self.assertRaises(ValueError, Element, 'p:name') + self.assertRaises(ValueError, Element, '{test}p:name') + + el = Element('name') + self.assertRaises(ValueError, setattr, el, 'tag', 'p:name') + def test_attribute_set(self): # ElementTree accepts arbitrary attribute values # lxml.etree allows only strings From scoder at codespeak.net Thu Jul 12 00:29:48 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 12 Jul 2007 00:29:48 +0200 (CEST) Subject: [Lxml-checkins] r44938 - lxml/trunk/doc Message-ID: <20070711222948.718F580DB@code0.codespeak.net> Author: scoder Date: 
Thu Jul 12 00:29:47 2007 New Revision: 44938 Modified: lxml/trunk/doc/parsing.txt Log: parser doc cleanup: better intro Modified: lxml/trunk/doc/parsing.txt ============================================================================== --- lxml/trunk/doc/parsing.txt (original) +++ lxml/trunk/doc/parsing.txt Thu Jul 12 00:29:47 2007 @@ -24,24 +24,35 @@ Parsers are represented by parser objects. There is support for parsing both XML and (broken) HTML. Note that XHTML is best parsed as XML, parsing it with the HTML parser can lead to unexpected results. Here is a simple example for -XML parsing:: +parsing XML from an in-memory string:: >>> xml = '' + >>> root = etree.fromstring(xml) + >>> print etree.tostring(root) + + +To read from a file or file-like object, you can use the ``parse()`` function, +which returns an ``ElementTree`` object:: + >>> tree = etree.parse(StringIO(xml)) >>> print etree.tostring(tree.getroot()) Note how the ``parse()`` function reads from a file-like object here. If -parsing is done from a real file, it is more common (and also more efficient) -to pass a filename or a URL. HTTP and FTP access is directly supported by -libxml2, as well as gzip-compressed files (.gz). +parsing is done from a real file, it is more common (and also somewhat more +efficient) to pass a filename:: + + >>> tree = etree.parse("test.xml") + +lxml can parse from a local file, an HTTP URL or an FTP URL. It also +auto-detects and reads gzip-compressed XML files (.gz). If you want to parse from memory and still provide a base URL for the document -(e.g. to support relative paths in an XInclude), you can provide the -``base_url`` keyword argument:: +(e.g. 
to support relative paths in an XInclude), you can pass the ``base_url`` +keyword argument:: - >>> tree = etree.parse("test.xml") + >>> root = etree.fromstring(xml, base_url="http://where.it/is/from.xml") Parser options From scoder at codespeak.net Thu Jul 12 00:32:45 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 12 Jul 2007 00:32:45 +0200 (CEST) Subject: [Lxml-checkins] r44939 - lxml/trunk/doc Message-ID: <20070711223245.332D480DB@code0.codespeak.net> Author: scoder Date: Thu Jul 12 00:32:44 2007 New Revision: 44939 Modified: lxml/trunk/doc/parsing.txt Log: doctest fix Modified: lxml/trunk/doc/parsing.txt ============================================================================== --- lxml/trunk/doc/parsing.txt (original) +++ lxml/trunk/doc/parsing.txt Thu Jul 12 00:32:44 2007 @@ -43,7 +43,7 @@ parsing is done from a real file, it is more common (and also somewhat more efficient) to pass a filename:: - >>> tree = etree.parse("test.xml") + >>> tree = etree.parse("doc/test.xml") lxml can parse from a local file, an HTTP URL or an FTP URL. It also auto-detects and reads gzip-compressed XML files (.gz). From scoder at codespeak.net Thu Jul 12 00:34:18 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 12 Jul 2007 00:34:18 +0200 (CEST) Subject: [Lxml-checkins] r44940 - lxml/branch/lxml-1.3/doc Message-ID: <20070711223418.D34F28115@code0.codespeak.net> Author: scoder Date: Thu Jul 12 00:34:17 2007 New Revision: 44940 Modified: lxml/branch/lxml-1.3/doc/parsing.txt Log: parser doc merge from trunk Modified: lxml/branch/lxml-1.3/doc/parsing.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/parsing.txt (original) +++ lxml/branch/lxml-1.3/doc/parsing.txt Thu Jul 12 00:34:17 2007 @@ -24,14 +24,36 @@ Parsers are represented by parser objects. There is support for parsing both XML and (broken) HTML. 
Note that XHTML is best parsed as XML, parsing it with the HTML parser can lead to unexpected results. Here is a simple example for -XML parsing:: +parsing XML from an in-memory string:: >>> xml = '' - >>> et = etree.parse(StringIO(xml)) - >>> print etree.tostring(et.getroot()) + >>> root = etree.fromstring(xml) + >>> print etree.tostring(root) +To read from a file or file-like object, you can use the ``parse()`` function, +which returns an ``ElementTree`` object:: + + >>> tree = etree.parse(StringIO(xml)) + >>> print etree.tostring(tree.getroot()) + + +Note how the ``parse()`` function reads from a file-like object here. If +parsing is done from a real file, it is more common (and also somewhat more +efficient) to pass a filename:: + + >>> tree = etree.parse("doc/test.xml") + +lxml can parse from a local file, an HTTP URL or an FTP URL. It also +auto-detects and reads gzip-compressed XML files (.gz). + +If you want to parse from memory and still provide a base URL for the document +(e.g. 
to support relative paths in an XInclude), you can pass the ``base_url`` +keyword argument:: + + >>> root = etree.fromstring(xml, base_url="http://where.it/is/from.xml") + Parser options -------------- @@ -40,8 +62,8 @@ example is easily extended to clean up namespaces during parsing:: >>> parser = etree.XMLParser(ns_clean=True) - >>> et = etree.parse(StringIO(xml), parser) - >>> print etree.tostring(et.getroot()) + >>> tree = etree.parse(StringIO(xml), parser) + >>> print etree.tostring(tree.getroot()) The keyword arguments in the constructor are mainly based on the libxml2 @@ -81,9 +103,9 @@ >>> broken_html = "test<body><h1>page title</h3>" >>> parser = etree.HTMLParser() - >>> et = etree.parse(StringIO(broken_html), parser) + >>> tree = etree.parse(StringIO(broken_html), parser) - >>> print etree.tostring(et.getroot(), pretty_print=True) + >>> print etree.tostring(tree.getroot(), pretty_print=True) <html> <head> <title>test From scoder at codespeak.net Thu Jul 12 00:34:37 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 12 Jul 2007 00:34:37 +0200 (CEST) Subject: [Lxml-checkins] r44941 - lxml/branch/lxml-1.3 Message-ID: <20070711223437.814B38115@code0.codespeak.net> Author: scoder Date: Thu Jul 12 00:34:35 2007 New Revision: 44941 Modified: lxml/branch/lxml-1.3/version.txt Log: version changed to 1.3.3 Modified: lxml/branch/lxml-1.3/version.txt ============================================================================== --- lxml/branch/lxml-1.3/version.txt (original) +++ lxml/branch/lxml-1.3/version.txt Thu Jul 12 00:34:35 2007 @@ -1 +1 @@ -1.3.2 +1.3.3 From scoder at codespeak.net Thu Jul 12 01:15:52 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 12 Jul 2007 01:15:52 +0200 (CEST) Subject: [Lxml-checkins] r44942 - lxml/trunk Message-ID: <20070711231552.2D9478164@code0.codespeak.net> Author: scoder Date: Thu Jul 12 01:15:52 2007 New Revision: 44942 Modified: lxml/trunk/Makefile Log: use epydoc (if available) to 
generate API docs Modified: lxml/trunk/Makefile ============================================================================== --- lxml/trunk/Makefile (original) +++ lxml/trunk/Makefile Thu Jul 12 01:15:52 2007 @@ -36,6 +36,9 @@ html: inplace mkdir -p doc/html PYTHONPATH=src $(PYTHON) doc/mkhtml.py doc/html . `cat version.txt` + [ -x "`which epydoc`" ] \ + && (cd src && PYTHONPATH=. epydoc -o ../doc/html/api --name lxml --url http://codespeak.net/lxml/ --top http://codespeak.net/lxml/api lxml/) \ + || (echo "not generating epydoc API documentation") # XXX What should the default be? test: test_inplace From scoder at codespeak.net Thu Jul 12 01:18:40 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 12 Jul 2007 01:18:40 +0200 (CEST) Subject: [Lxml-checkins] r44943 - lxml/branch/lxml-1.3 Message-ID: <20070711231840.236078165@code0.codespeak.net> Author: scoder Date: Thu Jul 12 01:18:39 2007 New Revision: 44943 Modified: lxml/branch/lxml-1.3/Makefile Log: use epydoc (if available) to generate API docs Modified: lxml/branch/lxml-1.3/Makefile ============================================================================== --- lxml/branch/lxml-1.3/Makefile (original) +++ lxml/branch/lxml-1.3/Makefile Thu Jul 12 01:18:39 2007 @@ -36,6 +36,9 @@ html: inplace mkdir -p doc/html PYTHONPATH=src $(PYTHON) doc/mkhtml.py doc/html . `cat version.txt` + [ -x "`which epydoc`" ] \ + && (cd src && PYTHONPATH=. epydoc -o ../doc/html/api --name lxml --url http://codespeak.net/lxml/ --top http://codespeak.net/lxml/api lxml/) \ + || (echo "not generating epydoc API documentation") # XXX What should the default be? 
test: test_inplace From scoder at codespeak.net Thu Jul 12 01:21:55 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 12 Jul 2007 01:21:55 +0200 (CEST) Subject: [Lxml-checkins] r44944 - lxml/trunk Message-ID: <20070711232155.3FEAD8164@code0.codespeak.net> Author: scoder Date: Thu Jul 12 01:21:54 2007 New Revision: 44944 Modified: lxml/trunk/Makefile Log: doc link fix Modified: lxml/trunk/Makefile ============================================================================== --- lxml/trunk/Makefile (original) +++ lxml/trunk/Makefile Thu Jul 12 01:21:54 2007 @@ -37,7 +37,7 @@ mkdir -p doc/html PYTHONPATH=src $(PYTHON) doc/mkhtml.py doc/html . `cat version.txt` [ -x "`which epydoc`" ] \ - && (cd src && PYTHONPATH=. epydoc -o ../doc/html/api --name lxml --url http://codespeak.net/lxml/ --top http://codespeak.net/lxml/api lxml/) \ + && (cd src && PYTHONPATH=. epydoc -o ../doc/html/api --name lxml --url http://codespeak.net/lxml/ lxml/) \ || (echo "not generating epydoc API documentation") # XXX What should the default be? From scoder at codespeak.net Thu Jul 12 01:22:13 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 12 Jul 2007 01:22:13 +0200 (CEST) Subject: [Lxml-checkins] r44945 - lxml/branch/lxml-1.3 Message-ID: <20070711232213.2F01C8164@code0.codespeak.net> Author: scoder Date: Thu Jul 12 01:22:12 2007 New Revision: 44945 Modified: lxml/branch/lxml-1.3/Makefile Log: doc link fix Modified: lxml/branch/lxml-1.3/Makefile ============================================================================== --- lxml/branch/lxml-1.3/Makefile (original) +++ lxml/branch/lxml-1.3/Makefile Thu Jul 12 01:22:12 2007 @@ -37,7 +37,7 @@ mkdir -p doc/html PYTHONPATH=src $(PYTHON) doc/mkhtml.py doc/html . `cat version.txt` [ -x "`which epydoc`" ] \ - && (cd src && PYTHONPATH=. epydoc -o ../doc/html/api --name lxml --url http://codespeak.net/lxml/ --top http://codespeak.net/lxml/api lxml/) \ + && (cd src && PYTHONPATH=. 
epydoc -o ../doc/html/api --name lxml --url http://codespeak.net/lxml/ lxml/) \ || (echo "not generating epydoc API documentation") # XXX What should the default be? From scoder at codespeak.net Thu Jul 12 10:04:13 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 12 Jul 2007 10:04:13 +0200 (CEST) Subject: [Lxml-checkins] r44946 - lxml/tag/lxml-1.3.2 Message-ID: <20070712080413.C8AAA81C5@code0.codespeak.net> Author: scoder Date: Thu Jul 12 10:04:10 2007 New Revision: 44946 Added: lxml/tag/lxml-1.3.2/ - copied from r44711, lxml/branch/lxml-1.3/ Log: 1.3.2 release tag From scoder at codespeak.net Thu Jul 12 23:59:28 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 12 Jul 2007 23:59:28 +0200 (CEST) Subject: [Lxml-checkins] r44986 - in lxml/branch/html/src/lxml: . html html/tests Message-ID: <20070712215928.3EB50814D@code0.codespeak.net> Author: scoder Date: Thu Jul 12 23:59:25 2007 New Revision: 44986 Modified: lxml/branch/html/src/lxml/doctestcompare.py lxml/branch/html/src/lxml/html/__init__.py lxml/branch/html/src/lxml/html/builder.py lxml/branch/html/src/lxml/html/diff.py lxml/branch/html/src/lxml/html/tests/test_basic.py lxml/branch/html/src/lxml/html/tests/test_basic.txt lxml/branch/html/src/lxml/html/tests/test_css.py lxml/branch/html/src/lxml/html/tests/test_css_select.txt lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt Log: renamed HTML() to document_fromstring and HTMLFragmentS() to fragmentS_fromstring() Modified: lxml/branch/html/src/lxml/doctestcompare.py ============================================================================== --- lxml/branch/html/src/lxml/doctestcompare.py (original) +++ lxml/branch/html/src/lxml/doctestcompare.py Thu Jul 12 23:59:25 2007 @@ -26,7 +26,7 @@ """ from lxml import etree -from lxml.html import HTML +from lxml.html import document_fromstring import re import doctest import cgi @@ -85,12 +85,12 @@ def get_parser(self, want, got, optionflags): parser = None if 
PARSE_HTML & optionflags: - parser = HTML + parser = document_fromstring elif PARSE_XML & optionflags: parser = etree.XML elif (want.strip().lower().startswith('>> h = HTMLFragment('
Hello World!
') + >>> h = fragment_fromstring('
Hello World!
') >>> h.find('//b').drop_tag() >>> print tostring(h)
Hello World!
@@ -292,7 +292,7 @@ element=HtmlElement, comment=HtmlComment, pi=HtmlProcessingInstruction, entity=HtmlEntity)) -def HTML(html): +def document_fromstring(html): # FIXME: should this notice a fragment and parse accordingly? value = etree.HTML(html, html_parser) if value is None: @@ -300,7 +300,7 @@ "Document is empty") return value -def HTMLFragments(html, no_leading_text=False): +def fragments_fromstring(html, no_leading_text=False): """ Parses several HTML elements, returning a list of elements. @@ -314,7 +314,7 @@ if not start.startswith('%s' % (create_parent, html, create_parent)) - elements = HTMLFragments(html, no_leading_text=True) + return fragment_fromstring('<%s>%s' % ( + create_parent, html, create_parent)) + elements = fragments_fromstring(html, no_leading_text=True) if not elements: raise etree.ParserError( "No elements found") @@ -368,9 +369,9 @@ start = html[:10].lstrip().lower() if start.startswith('>> from lxml.htmlbuilder import * + >>> from lxml.html.builder import * >>> html = HTML( ... HEAD( TITLE("Hello World") ), ... 
BODY( CLASS("main"), Modified: lxml/branch/html/src/lxml/html/diff.py ============================================================================== --- lxml/branch/html/src/lxml/html/diff.py (original) +++ lxml/branch/html/src/lxml/html/diff.py Thu Jul 12 23:59:25 2007 @@ -1,6 +1,6 @@ import difflib from lxml import etree -from lxml.html import HTMLFragment +from lxml.html import fragment_fromstring import cgi import re @@ -531,7 +531,7 @@ if cleanup: # This removes any extra markup or structure like : html = cleanup_html(html) - return HTMLFragment(html, create_parent=True) + return fragment_fromstring(html, create_parent=True) _body_re = re.compile(r'', re.I|re.S) _end_body_re = re.compile(r'', re.I|re.S) Modified: lxml/branch/html/src/lxml/html/tests/test_basic.py ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_basic.py (original) +++ lxml/branch/html/src/lxml/html/tests/test_basic.py Thu Jul 12 23:59:25 2007 @@ -1,8 +1,6 @@ import unittest from lxml.tests.common_imports import doctest -from lxml.html import HTML - def test_suite(): suite = unittest.TestSuite() suite.addTests([doctest.DocFileSuite('test_basic.txt')]) Modified: lxml/branch/html/src/lxml/html/tests/test_basic.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_basic.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_basic.txt Thu Jul 12 23:59:25 2007 @@ -1,10 +1,10 @@ lxml.html adds a find_class method to elements:: >>> from lxml.etree import Comment - >>> from lxml.html import HTML, HTMLFragment, tostring + >>> from lxml.html import document_fromstring, fragment_fromstring, tostring >>> from lxml.html.clean import clean, clean_html >>> from lxml.html import usedoctest - >>> h = HTML(''' + >>> h = document_fromstring(''' ... ... ... test 1 ... ... 
item 3 @@ -40,7 +40,7 @@ Another method is ``get_element_by_id`` that does what it says:: - >>> print tostring(HTMLFragment(''' + >>> print tostring(fragment_fromstring(''' ...
... stuff ...
''').get_element_by_id('test')) @@ -48,14 +48,14 @@ Or to get the content of an element without the tags, use text_content():: - >>> el = HTMLFragment(''' + >>> el = fragment_fromstring(''' ...
This is a bold link
''') >>> el.text_content() 'This is a bold link' Or drop an element (leaving its content) or the entire tree, like:: - >>> doc = HTML(''' + >>> doc = document_fromstring(''' ... ... ...
Modified: lxml/branch/html/src/lxml/html/tests/test_css.py ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_css.py (original) +++ lxml/branch/html/src/lxml/html/tests/test_css.py Thu Jul 12 23:59:25 2007 @@ -70,7 +70,7 @@ f = open(doc_fn, 'rb') c = f.read() f.close() - doc = html.HTML(c) + doc = html.document_fromstring(c) body = doc.xpath('//body')[0] bad = [] selector, count = self.selectors[self.index] Modified: lxml/branch/html/src/lxml/html/tests/test_css_select.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_css_select.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_css_select.txt Thu Jul 12 23:59:25 2007 @@ -2,8 +2,8 @@ all our selections, and a function make querying simpler: >>> from lxml.cssselect import CSSSelector - >>> from lxml.html import HTML - >>> doc = HTML(''' + >>> from lxml.html import document_fromstring + >>> doc = document_fromstring(''' ... ...
... Modified: lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt Thu Jul 12 23:59:25 2007 @@ -75,7 +75,7 @@ is something embedded). It returns a generator of ``(element, attrib, link)``, which is awkward to test here, so we'll make a printer:: - >>> from lxml.html import iterlinks, HTML, tostring + >>> from lxml.html import iterlinks, document_fromstring, tostring >>> def print_iter(seq): ... for element, attrib, link, pos in seq: ... if pos: From scoder at codespeak.net Fri Jul 13 12:22:58 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 13 Jul 2007 12:22:58 +0200 (CEST) Subject: [Lxml-checkins] r45008 - lxml/trunk/src/lxml Message-ID: <20070713102258.541D98205@code0.codespeak.net> Author: scoder Date: Fri Jul 13 12:22:57 2007 New Revision: 45008 Modified: lxml/trunk/src/lxml/parser.pxi Log: docstring fix Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Fri Jul 13 12:22:57 2007 @@ -144,7 +144,7 @@ cdef void _setupPythonUnicode(): """Sets _UNICODE_ENCODING to the internal encoding name of Python unicode - strings if libxmls supports reading native Python unicode. This depends + strings if libxml2 supports reading native Python unicode. This depends on iconv and the local Python installation, so we simply check if we find a matching encoding handler. 
""" From scoder at codespeak.net Fri Jul 13 15:42:56 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 13 Jul 2007 15:42:56 +0200 (CEST) Subject: [Lxml-checkins] r45025 - lxml/trunk/src/lxml Message-ID: <20070713134256.BC5798282@code0.codespeak.net> Author: scoder Date: Fri Jul 13 15:42:56 2007 New Revision: 45025 Modified: lxml/trunk/src/lxml/parser.pxi Log: work around libxml2 not being able to detect BOM-less UTF-16LE Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Fri Jul 13 15:42:56 2007 @@ -156,6 +156,15 @@ l = python.PyUnicode_GET_DATA_SIZE(utext) buffer = python.PyUnicode_AS_DATA(utext) enc = _findEncodingName(buffer, l) + if enc == NULL: + # apparently, libxml2 can't detect UTF16LE on some systems + if l >= 4 and \ + buffer[0] == c'<' and buffer[1] == c'\0' and \ + buffer[2] == c't' and buffer[3] == c'\0': + enc = "UTF16LE" + else: + # not my fault, it's YOUR broken system :) + return enchandler = tree.xmlFindCharEncodingHandler(enc) if enchandler is not NULL: global _UNICODE_ENCODING @@ -174,6 +183,8 @@ return "UCS-4LE" elif enc == tree.XML_CHAR_ENCODING_UCS4BE: return "UCS-4BE" + elif enc == tree.XML_CHAR_ENCODING_NONE: + return NULL else: return tree.xmlGetCharEncodingName(enc) From scoder at codespeak.net Sun Jul 15 21:39:27 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 15 Jul 2007 21:39:27 +0200 (CEST) Subject: [Lxml-checkins] r45110 - lxml/trunk/src/lxml/tests Message-ID: <20070715193927.70CDF8130@code0.codespeak.net> Author: scoder Date: Sun Jul 15 21:39:26 2007 New Revision: 45110 Modified: lxml/trunk/src/lxml/tests/test_xpathevaluator.py Log: VMS compat fix for test case Modified: lxml/trunk/src/lxml/tests/test_xpathevaluator.py ============================================================================== --- 
lxml/trunk/src/lxml/tests/test_xpathevaluator.py (original) +++ lxml/trunk/src/lxml/tests/test_xpathevaluator.py Sun Jul 15 21:39:26 2007 @@ -23,7 +23,7 @@ tree.xpath('number(/a)')) tree = self.parse('A') actual = str(tree.xpath('number(/a)')) - expected = ['nan', '1.#qnan'] + expected = ['nan', '1.#qnan', 'nanq'] if not actual.lower() in expected: self.fail('Expected a NAN value, got %s' % actual) From scoder at codespeak.net Sun Jul 15 22:31:05 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 15 Jul 2007 22:31:05 +0200 (CEST) Subject: [Lxml-checkins] r45111 - lxml/pyrex/Pyrex/Compiler Message-ID: <20070715203105.E74978148@code0.codespeak.net> Author: scoder Date: Sun Jul 15 22:31:05 2007 New Revision: 45111 Modified: lxml/pyrex/Pyrex/Compiler/ModuleNode.py Log: use type cast to prevent compiler warnings Modified: lxml/pyrex/Pyrex/Compiler/ModuleNode.py ============================================================================== --- lxml/pyrex/Pyrex/Compiler/ModuleNode.py (original) +++ lxml/pyrex/Pyrex/Compiler/ModuleNode.py Sun Jul 15 22:31:05 2007 @@ -80,7 +80,8 @@ "static struct {char *s; void **p;} _%s_API[] = {" % env.module_name) for entry in public_funcs: - h_code.putln('{"%s", &%s},' % (entry.cname, entry.cname)) + h_code.putln('{"%s", (void*)(&%s)},' % ( + entry.cname, entry.cname)) h_code.putln("{0, 0}") h_code.putln("};") self.generate_c_api_import_code(env, h_code) From scoder at codespeak.net Sun Jul 15 23:08:08 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 15 Jul 2007 23:08:08 +0200 (CEST) Subject: [Lxml-checkins] r45112 - lxml/trunk/src/lxml Message-ID: <20070715210808.928E98149@code0.codespeak.net> Author: scoder Date: Sun Jul 15 23:08:07 2007 New Revision: 45112 Modified: lxml/trunk/src/lxml/parser.pxi Log: use UTF16 encoding names that libxml2 understands natively Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- 
lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Sun Jul 15 23:08:07 2007 @@ -157,14 +157,8 @@ buffer = python.PyUnicode_AS_DATA(utext) enc = _findEncodingName(buffer, l) if enc == NULL: - # apparently, libxml2 can't detect UTF16LE on some systems - if l >= 4 and \ - buffer[0] == c'<' and buffer[1] == c'\0' and \ - buffer[2] == c't' and buffer[3] == c'\0': - enc = "UTF16LE" - else: - # not my fault, it's YOUR broken system :) - return + # not my fault, it's YOUR broken system :) + return enchandler = tree.xmlFindCharEncodingHandler(enc) if enchandler is not NULL: global _UNICODE_ENCODING @@ -176,9 +170,9 @@ cdef tree.xmlCharEncoding enc enc = tree.xmlDetectCharEncoding(buffer, size) if enc == tree.XML_CHAR_ENCODING_UTF16LE: - return "UTF16LE" + return "UTF-16LE" elif enc == tree.XML_CHAR_ENCODING_UTF16BE: - return "UTF16BE" + return "UTF-16BE" elif enc == tree.XML_CHAR_ENCODING_UCS4LE: return "UCS-4LE" elif enc == tree.XML_CHAR_ENCODING_UCS4BE: From scoder at codespeak.net Sun Jul 15 23:27:50 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 15 Jul 2007 23:27:50 +0200 (CEST) Subject: [Lxml-checkins] r45113 - in lxml/trunk/src/lxml: . 
tests Message-ID: <20070715212750.E8AC98149@code0.codespeak.net> Author: scoder Date: Sun Jul 15 23:27:50 2007 New Revision: 45113 Modified: lxml/trunk/src/lxml/iterparse.pxi lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/tests/test_etree.py lxml/trunk/src/lxml/xmlparser.pxd Log: new parser kw arg 'remove_pis' to discard PIs Modified: lxml/trunk/src/lxml/iterparse.pxi ============================================================================== --- lxml/trunk/src/lxml/iterparse.pxi (original) +++ lxml/trunk/src/lxml/iterparse.pxi Sun Jul 15 23:27:50 2007 @@ -235,6 +235,7 @@ * no_network - prevent network access * remove_blank_text - discard blank text nodes * remove_comments - discard comments + * remove_pis - discard processing instructions """ cdef object _source cdef object _filename @@ -242,7 +243,7 @@ def __init__(self, source, events=("end",), tag=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=False, remove_blank_text=False, - remove_comments=False): + remove_comments=False, remove_pis=False): cdef _IterparseContext context cdef char* c_filename cdef int parse_options @@ -259,7 +260,8 @@ c_filename = NULL self._source = source - _BaseParser.__init__(self, remove_comments, _IterparseContext) + _BaseParser.__init__(self, remove_comments, remove_pis, + _IterparseContext) parse_options = _XML_DEFAULT_PARSE_OPTIONS if load_dtd: Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Sun Jul 15 23:27:50 2007 @@ -372,7 +372,8 @@ cdef ElementClassLookup _class_lookup cdef python.PyThread_type_lock _parser_lock - def __init__(self, remove_comments, context_class=_ResolverContext): + def __init__(self, remove_comments, remove_pis, + context_class=_ResolverContext): cdef xmlParserCtxt* pctxt if isinstance(self, HTMLParser): self._parser_type = LXML_HTML_PARSER @@ -391,6 +392,8 @@ if 
pctxt.sax != NULL: if remove_comments: pctxt.sax.comment = NULL + if remove_pis: + pctxt.sax.processingInstruction = NULL # hard switch-off for CDATA nodes => makes them plain text pctxt.sax.cdataBlock = NULL @@ -699,6 +702,7 @@ * recover - try hard to parse through broken XML * remove_blank_text - discard blank text nodes * remove_comments - discard comments + * remove_pis - discard processing instructions * compact - safe memory for short text content (default: True) * resolve_entities - replace entities by their text value (default: True) @@ -709,9 +713,10 @@ def __init__(self, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, remove_blank_text=False, compact=True, - resolve_entities=True, remove_comments=False): + resolve_entities=True, remove_comments=False, + remove_pis=False): cdef int parse_options - _BaseParser.__init__(self, remove_comments) + _BaseParser.__init__(self, remove_comments, remove_pis) parse_options = _XML_DEFAULT_PARSE_OPTIONS if load_dtd: @@ -833,15 +838,16 @@ * no_network - prevent network access (default: True) * remove_blank_text - discard empty text nodes * remove_comments - discard comments + * remove_pis - discard processing instructions * compact - safe memory for short text content (default: True) Note that you should avoid sharing parsers between threads for performance reasons. 
""" def __init__(self, recover=True, no_network=True, remove_blank_text=False, - compact=True, remove_comments=False): + compact=True, remove_comments=False, remove_pis=False): cdef int parse_options - _BaseParser.__init__(self, remove_comments) + _BaseParser.__init__(self, remove_comments, remove_pis) parse_options = _HTML_DEFAULT_PARSE_OPTIONS if remove_blank_text: Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Sun Jul 15 23:27:50 2007 @@ -187,6 +187,26 @@ '', tostring(tree)) + def test_parse_remove_pis(self): + parse = self.etree.parse + tostring = self.etree.tostring + XMLParser = self.etree.XMLParser + + xml = '' + + f = StringIO(xml) + tree = parse(f) + self.assertEquals( + xml, + tostring(tree)) + + f = StringIO(xml) + parser = XMLParser(remove_pis=True) + tree = parse(f, parser) + self.assertEquals( + '', + tostring(tree)) + def test_parse_parser_type_error(self): # ET raises IOError only parse = self.etree.parse Modified: lxml/trunk/src/lxml/xmlparser.pxd ============================================================================== --- lxml/trunk/src/lxml/xmlparser.pxd (original) +++ lxml/trunk/src/lxml/xmlparser.pxd Sun Jul 15 23:27:50 2007 @@ -26,6 +26,10 @@ ctypedef void (*commentSAXFunc)(void* ctx, char* value) + ctypedef void (*processingInstructionSAXFunc)(void * ctx, + char* target, + char* data) + cdef extern from "libxml/tree.h": ctypedef struct xmlParserInput ctypedef struct xmlParserInputBuffer: @@ -34,10 +38,11 @@ xmlInputCloseCallback closecallback ctypedef struct xmlSAXHandler: - startElementNsSAX2Func startElementNs - endElementNsSAX2Func endElementNs - cdataBlockSAXFunc cdataBlock - commentSAXFunc comment + startElementNsSAX2Func startElementNs + endElementNsSAX2Func endElementNs + cdataBlockSAXFunc cdataBlock + commentSAXFunc comment + 
processingInstructionSAXFunc processingInstruction cdef extern from "libxml/xmlIO.h": cdef xmlParserInputBuffer* xmlAllocParserInputBuffer(int enc) From scoder at codespeak.net Sun Jul 15 23:28:43 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 15 Jul 2007 23:28:43 +0200 (CEST) Subject: [Lxml-checkins] r45114 - lxml/trunk Message-ID: <20070715212843.E5EEC8063@code0.codespeak.net> Author: scoder Date: Sun Jul 15 23:28:42 2007 New Revision: 45114 Modified: lxml/trunk/CHANGES.txt Log: new parser kw arg 'remove_pis' to discard PIs Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sun Jul 15 23:28:42 2007 @@ -8,6 +8,8 @@ Features added -------------- +* Parsers now support stripping PIs (keyword argument 'remove_pis') + * ``etree.fromstring()`` now supports parsing both HTML and XML, depending on the parser you pass. From scoder at codespeak.net Sun Jul 15 23:45:24 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 15 Jul 2007 23:45:24 +0200 (CEST) Subject: [Lxml-checkins] r45116 - in lxml/trunk: doc src/lxml Message-ID: <20070715214524.D30788071@code0.codespeak.net> Author: scoder Date: Sun Jul 15 23:45:24 2007 New Revision: 45116 Modified: lxml/trunk/doc/compatibility.txt lxml/trunk/src/lxml/parser.pxi Log: new ETCompatXMLParser subclass of XMLParser with an ElementTree compatible default setup Modified: lxml/trunk/doc/compatibility.txt ============================================================================== --- lxml/trunk/doc/compatibility.txt (original) +++ lxml/trunk/doc/compatibility.txt Sun Jul 15 23:45:24 2007 @@ -106,8 +106,14 @@ while etree will read them in and treat them as Comment or ProcessingInstruction elements respectively. This is especially visible where comments are found inside text content, which is then split by the - Comment element. 
You can disable this behaviour by passing the boolean - ``remove_comments`` keyword argument to the parser you use. + Comment element. + + You can disable this behaviour by passing the boolean ``remove_comments`` + and/or ``remove_pis`` keyword arguments to the parser you use. For + convenience and to support portable code, you can also use the + ``etree.ETCompatXMLParser`` instead of the default ``etree.XMLParser``. It + tries to provide a default setup that is as close to the ElementTree parser + as possible. * ElementTree has a bug when serializing an empty Comment (no text argument given) to XML, etree serializes this successfully. Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Sun Jul 15 23:45:24 2007 @@ -742,6 +742,25 @@ self._parse_options = parse_options +cdef class ETCompatXMLParser(XMLParser): + """An XML parser with an ElementTree compatible default setup. See the + XMLParser class for details. + + This parser defaults to removing processing instructions and comments from + the tree. + """ + def __init__(self, attribute_defaults=False, dtd_validation=False, + load_dtd=False, no_network=True, ns_clean=False, + recover=False, remove_blank_text=False, compact=True, + resolve_entities=True, remove_comments=True, + remove_pis=True): + XMLParser.__init__(self, + attribute_defaults, dtd_validation, + load_dtd, no_network, ns_clean, + recover, remove_blank_text, compact, + resolve_entities, remove_comments, + remove_pis) + cdef xmlDoc* _internalParseDoc(char* c_text, int options, _ResolverContext context) except NULL: # internal parser function for XSLT From scoder at codespeak.net Mon Jul 16 00:16:34 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 16 Jul 2007 00:16:34 +0200 (CEST) Subject: [Lxml-checkins] r45119 - in lxml/trunk: . 
doc src/lxml src/lxml/tests Message-ID: <20070715221634.1645E80AE@code0.codespeak.net> Author: scoder Date: Mon Jul 16 00:16:33 2007 New Revision: 45119 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/compatibility.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_elementtree.py lxml/trunk/src/lxml/tests/test_etree.py Log: partial merge from lxml.html branch: support Comment/PI/Element in iter methods Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon Jul 16 00:16:33 2007 @@ -41,6 +41,9 @@ * ``Element()`` did not raise an exception on tag names containing ':' +* ``Element.getiterator(tag)`` did not accept ``Comment`` and + ``ProcessingInstruction`` as tags. It also accepts ``Element`` now. + * The XML parser did not report undefined entities as error * The text in exceptions raised by XML parsers, validators and XPath Modified: lxml/trunk/doc/compatibility.txt ============================================================================== --- lxml/trunk/doc/compatibility.txt (original) +++ lxml/trunk/doc/compatibility.txt Mon Jul 16 00:16:33 2007 @@ -122,6 +122,17 @@ not. This means that a comment text "text" that ElementTree serializes as "" will become "" in lxml. +* When '*' is used as filter in the ``Element.getiterator()`` method, + ElementTree returns all elements in the tree, including comments and + processing instructions. lxml.etree only returns real Elements, i.e. tree + nodes that have a string tag name. Without a filter, both libraries iterate + over all nodes. + + Note that currently only lxml.etree supports passing the ``Element`` factory + function as filter to select only Elements. Both libraries support passing + the ``Comment`` and ``ProcessingInstruction`` factories to select the + respective tree nodes. 
+ * ElementTree merges the target of a processing instruction into ``PI.text``, while lxml.etree puts it into the ``.target`` property and leaves it out of the ``.text`` property. The ``pi.text`` in ElementTree therefore Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Mon Jul 16 00:16:33 2007 @@ -1645,17 +1645,26 @@ cdef public class _ElementTagMatcher [ object LxmlElementTagMatcher, type LxmlElementTagMatcherType ]: cdef object _pystrings + cdef int _node_type cdef char* _href cdef char* _name cdef _initTagMatch(self, tag): + self._href = NULL + self._name = NULL if tag is None: - self._href = NULL - self._name = NULL + self._node_type = 0 + elif tag is Comment: + self._node_type = tree.XML_COMMENT_NODE + elif tag is ProcessingInstruction: + self._node_type = tree.XML_PI_NODE + elif tag is Entity: + self._node_type = tree.XML_ENTITY_REF_NODE + elif tag is Element: + self._node_type = tree.XML_ELEMENT_NODE else: + self._node_type = tree.XML_ELEMENT_NODE self._pystrings = _getNsTag(tag) - if self._pystrings[0] is None: - self._href = NULL - else: + if self._pystrings[0] is not None: self._href = _cstr(self._pystrings[0]) self._name = _cstr(self._pystrings[1]) if self._name[0] == c'*' and self._name[1] == c'\0': @@ -1673,7 +1682,9 @@ cdef xmlNode* c_node c_node = self._next_element(node._c_node) while c_node is not NULL and \ - not _tagMatches(c_node, self._href, self._name): + self._node_type != 0 and \ + (self._node_type != c_node.type or + not _tagMatches(c_node, self._href, self._name)): c_node = self._next_element(c_node) if c_node is NULL: self._node = None @@ -1704,7 +1715,9 @@ self._next_element = _nextElement if tag is not None: while c_node is not NULL and \ - not _tagMatches(c_node, self._href, self._name): + self._node_type != 0 and \ + (self._node_type != c_node.type or + not _tagMatches(c_node, self._href, 
self._name)): c_node = self._next_element(c_node) if c_node is not NULL: # store Python ref: @@ -1750,14 +1763,15 @@ # keep next node to return and a depth counter in the tree cdef _Element _next_node cdef _Element _top_node - cdef int _include_all_types def __init__(self, _Element node not None, tag=None, inclusive=True): self._top_node = node self._next_node = node self._initTagMatch(tag) - if tag is not None and \ - not _tagMatches(node._c_node, self._href, self._name) or \ - not inclusive: + if not inclusive or \ + tag is not None and \ + self._node_type != 0 and \ + (self._node_type != node._c_node.type or + not _tagMatches(node._c_node, self._href, self._name)): # this cannot raise StopIteration, self._next_node != None self.next() @@ -1783,7 +1797,8 @@ cdef xmlNode* _nextNodeAnyTag(self, xmlNode* c_node): tree.BEGIN_FOR_EACH_ELEMENT_FROM(self._top_node._c_node, c_node, 0) - return c_node + if self._node_type == 0 or self._node_type == c_node.type: + return c_node tree.END_FOR_EACH_ELEMENT_FROM(c_node) return NULL Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Mon Jul 16 00:16:33 2007 @@ -1455,6 +1455,70 @@ [a2], list(c.getiterator('a'))) + def test_getiterator_filter_all(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + c = SubElement(a, 'c') + d = SubElement(b, 'd') + e = SubElement(c, 'e') + + self.assertEquals( + [a, b, d, c, e], + list(a.getiterator('*'))) + + def test_getiterator_filter_comment(self): + Element = self.etree.Element + Comment = self.etree.Comment + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + comment_b = Comment("TEST-b") + b.append(comment_b) + + self.assertEquals( + [comment_b], + list(a.getiterator(Comment))) + + comment_a = 
Comment("TEST-a") + a.append(comment_a) + + self.assertEquals( + [comment_b, comment_a], + list(a.getiterator(Comment))) + + self.assertEquals( + [comment_b], + list(b.getiterator(Comment))) + + def test_getiterator_filter_pi(self): + Element = self.etree.Element + PI = self.etree.ProcessingInstruction + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + pi_b = PI("TEST-b") + b.append(pi_b) + + self.assertEquals( + [pi_b], + list(a.getiterator(PI))) + + pi_a = PI("TEST-a") + a.append(pi_a) + + self.assertEquals( + [pi_b, pi_a], + list(a.getiterator(PI))) + + self.assertEquals( + [pi_b], + list(b.getiterator(PI))) + def test_getiterator_with_text(self): Element = self.etree.Element SubElement = self.etree.SubElement Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Mon Jul 16 00:16:33 2007 @@ -1299,6 +1299,64 @@ [d, f], list(a.getiterator('{b}*'))) + def test_getiterator_filter_entities(self): + Element = self.etree.Element + Entity = self.etree.Entity + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + entity_b = Entity("TEST-b") + b.append(entity_b) + + self.assertEquals( + [entity_b], + list(a.getiterator(Entity))) + + entity_a = Entity("TEST-a") + a.append(entity_a) + + self.assertEquals( + [entity_b, entity_a], + list(a.getiterator(Entity))) + + self.assertEquals( + [entity_b], + list(b.getiterator(Entity))) + + def test_getiterator_filter_element(self): + Element = self.etree.Element + Comment = self.etree.Comment + PI = self.etree.PI + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + a.append(Comment("test")) + a.append(PI("pi", "content")) + c = SubElement(a, 'c') + + self.assertEquals( + [a, b, c], + list(a.getiterator(Element))) + + def test_getiterator_filter_all_comment_pi(self): + 
# ElementTree iterates over everything here + Element = self.etree.Element + Comment = self.etree.Comment + PI = self.etree.PI + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + a.append(Comment("test")) + a.append(PI("pi", "content")) + c = SubElement(a, 'c') + + self.assertEquals( + [a, b, c], + list(a.getiterator('*'))) + def test_findall_ns(self): XML = self.etree.XML root = XML('') From scoder at codespeak.net Mon Jul 16 00:17:55 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 16 Jul 2007 00:17:55 +0200 (CEST) Subject: [Lxml-checkins] r45120 - lxml/trunk/doc Message-ID: <20070715221755.7C5B380AE@code0.codespeak.net> Author: scoder Date: Mon Jul 16 00:17:54 2007 New Revision: 45120 Modified: lxml/trunk/doc/compatibility.txt Log: doc cleanup Modified: lxml/trunk/doc/compatibility.txt ============================================================================== --- lxml/trunk/doc/compatibility.txt (original) +++ lxml/trunk/doc/compatibility.txt Mon Jul 16 00:17:54 2007 @@ -122,8 +122,8 @@ not. This means that a comment text "text" that ElementTree serializes as "" will become "" in lxml. -* When '*' is used as filter in the ``Element.getiterator()`` method, - ElementTree returns all elements in the tree, including comments and +* When the string '*' is used as tag filter in the ``Element.getiterator()`` + method, ElementTree returns all elements in the tree, including comments and processing instructions. lxml.etree only returns real Elements, i.e. tree nodes that have a string tag name. Without a filter, both libraries iterate over all nodes. From ianb at codespeak.net Mon Jul 16 08:13:11 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Mon, 16 Jul 2007 08:13:11 +0200 (CEST) Subject: [Lxml-checkins] r45121 - in lxml/branch/html/src/lxml/html: . 
tests Message-ID: <20070716061311.8A7078134@code0.codespeak.net> Author: ianb Date: Mon Jul 16 08:13:09 2007 New Revision: 45121 Added: lxml/branch/html/src/lxml/html/setmixin.py lxml/branch/html/src/lxml/html/tests/test_forms.py lxml/branch/html/src/lxml/html/tests/test_forms.txt Modified: lxml/branch/html/src/lxml/html/__init__.py Log: Add special handling for form and input elements: * New classes for form, input, select, textarea, and label elements * Any element can query for its label, labels know what they point to * input elements know their name and value * form elements have pointers to their input elements Also an accessor for head and body and the page's forms. Also a debugging function, to open a document in a web browser. Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Mon Jul 16 08:13:09 2007 @@ -5,10 +5,11 @@ from lxml import etree from lxml.html import defs from lxml import cssselect +from lxml.html.setmixin import SetMixin __all__ = ['document_fromstring', 'tostring', 'Element', 'defs', 'find_rel_links', 'find_class', 'make_links_absolute', - 'resolve_base_href', 'iterlinks', 'rewrite_links'] + 'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser'] _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]") #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) @@ -17,9 +18,62 @@ _collect_string_content = etree.XPath("string()") _css_url_re = re.compile(r'url\((.*?)\)', re.I) _css_import_re = re.compile(r'@import "(.*?)"') +_label_xpath = etree.XPath("//label[@for=$id]") class HtmlMixin(object): + def forms(self): + """ + Return a list of all the forms + """ + return list(self.getiterator('form')) + forms = property(forms, doc=forms.__doc__) 
+ + def body(self): + """ + Return the <body> element. Can be called from a child element + to get the document's head. + """ + return self.xpath('//body')[0] + body = property(body, doc=body.__doc__) + + def head(self): + """ + Returns the <head> element. Can be called from a child + element to get the document's head. + """ + return self.xpath('//head')[0] + head = property(head, doc=head.__doc__) + + def label__get(self): + """ + Get or set any