From scoder at codespeak.net Sat Aug 8 17:57:41 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 8 Aug 2009 17:57:41 +0200 (CEST) Subject: [Lxml-checkins] r66759 - in lxml/trunk: . src/lxml Message-ID: <20090808155741.D3F9D168011@codespeak.net> Author: scoder Date: Sat Aug 8 17:57:39 2009 New Revision: 66759 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/apihelpers.pxi Log: r5195 at delle: sbehnel | 2009-08-08 16:34:21 +0200 tiny cleanup Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Sat Aug 8 17:57:39 2009 @@ -221,7 +221,7 @@ return 0 nsdefs = list(nsmap.items()) - if None in nsmap and python.PyList_GET_SIZE(nsdefs) > 1: + if None in nsmap and len(nsdefs) > 1: # Move the default namespace to the end. This makes sure libxml2 # prefers a prefix if the ns is defined redundantly on the same # element. That way, users can work around a problem themselves From scoder at codespeak.net Sat Aug 8 17:57:45 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 8 Aug 2009 17:57:45 +0200 (CEST) Subject: [Lxml-checkins] r66760 - lxml/trunk Message-ID: <20090808155745.41985168019@codespeak.net> Author: scoder Date: Sat Aug 8 17:57:43 2009 New Revision: 66760 Modified: lxml/trunk/ (props changed) lxml/trunk/version.txt Log: r5196 at delle: sbehnel | 2009-08-08 17:40:46 +0200 set trunk version to 2.3dev Modified: lxml/trunk/version.txt ============================================================================== --- lxml/trunk/version.txt (original) +++ lxml/trunk/version.txt Sat Aug 8 17:57:43 2009 @@ -1 +1 @@ -2.2.2 +2.3dev From scoder at codespeak.net Sat Aug 8 17:57:50 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 8 Aug 2009 17:57:50 +0200 (CEST) Subject: [Lxml-checkins] r66761 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20090808155750.4BADF168019@codespeak.net> Author: scoder Date: Sat Aug 8 17:57:49 2009 New Revision: 66761 Added: lxml/trunk/src/lxml/uri.pxd Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/tests/test_unicode.py Log: r5197 at delle: sbehnel | 2009-08-08 17:53:33 +0200 check namespace URIs against RFC3986 Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Aug 8 17:57:49 2009 @@ -2,12 +2,15 @@ lxml changelog ============== -Under development -================== +2.3 (under development) +======================= Features added -------------- +* Namespace URIs get validated against RFC 3986 at the API level + (required by the XML namespace specification). + * Target parsers show their target object in the ``.target`` property (compatible with ElementTree). Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Sat Aug 8 17:57:49 2009 @@ -1,5 +1,7 @@ # Private/public helper functions for API functions +cimport uri + cdef void displayNode(xmlNode* c_node, indent): # to help with debugging cdef xmlNode* c_child @@ -217,6 +219,7 @@ cdef list nsdefs if not nsmap: if node_ns_utf is not None: + _uriValidOrRaise(node_ns_utf) doc._setNodeNs(c_node, _cstr(node_ns_utf)) return 0 @@ -234,6 +237,7 @@ for prefix, href in nsdefs: href_utf = _utf8(href) + _uriValidOrRaise(href_utf) c_href = _cstr(href_utf) if prefix is not None: prefix_utf = _utf8(prefix) @@ -279,6 +283,7 @@ if attr_ns_utf is None: tree.xmlNewProp(c_node, _cstr(attr_name_utf), _cstr(value_utf)) else: + _uriValidOrRaise(attr_ns_utf) c_ns = doc._findOrBuildNodeNs(c_node, _cstr(attr_ns_utf), NULL) tree.xmlNewNsProp(c_node, c_ns, 
_cstr(attr_name_utf), _cstr(value_utf)) @@ -1429,6 +1434,14 @@ python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', NULL) return 0 +cdef int _uriValidOrRaise(uri_utf) except -1: + cdef uri.xmlURI* c_uri = uri.xmlParseURI(_cstr(uri_utf)) + if c_uri is NULL: + raise ValueError, u"Invalid namespace URI %r" % \ + python.PyUnicode_FromEncodedObject(uri_utf, 'UTF-8', NULL) + uri.xmlFreeURI(c_uri) + return 0 + cdef inline object _namespacedName(xmlNode* c_node): return _namespacedNameFromNsName(_getNs(c_node), c_node.name) Modified: lxml/trunk/src/lxml/tests/test_unicode.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_unicode.py (original) +++ lxml/trunk/src/lxml/tests/test_unicode.py Sat Aug 8 17:57:49 2009 @@ -43,13 +43,18 @@ self.assertRaises(ValueError, etree.Element, invalid_tag) def test_unicode_nstag(self): - tag = _str("{%s}%s") % (uni, uni) + tag = _str("{http://abc/}%s") % uni el = etree.Element(tag) self.assertEquals(tag, el.tag) + def test_unicode_ns_invalid(self): + # namespace URIs must conform to RFC 3986 + tag = _str("{http://%s/}abc") % uni + self.assertRaises(ValueError, etree.Element, tag) + def test_unicode_nstag_invalid(self): # sadly, Klingon is not well-formed - tag = _str("{%s}%s") % (uni, invalid_tag) + tag = _str("{http://abc/}%s") % invalid_tag self.assertRaises(ValueError, etree.Element, tag) def test_unicode_qname(self): Added: lxml/trunk/src/lxml/uri.pxd ============================================================================== --- (empty file) +++ lxml/trunk/src/lxml/uri.pxd Sat Aug 8 17:57:49 2009 @@ -0,0 +1,5 @@ +cdef extern from "libxml/uri.h": + ctypedef struct xmlURI + + cdef xmlURI* xmlParseURI(char* str) + cdef void xmlFreeURI(xmlURI* uri) From scoder at codespeak.net Sat Aug 8 19:31:27 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 8 Aug 2009 19:31:27 +0200 (CEST) Subject: [Lxml-checkins] r66762 - in lxml/trunk: . 
doc Message-ID: <20090808173127.2ACA6168019@codespeak.net> Author: scoder Date: Sat Aug 8 19:31:25 2009 New Revision: 66762 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/FAQ.txt Log: r5201 at delle: sbehnel | 2009-08-08 19:27:29 +0200 FAQ article on XML bombs Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Sat Aug 8 19:31:25 2009 @@ -50,6 +50,7 @@ 6.3 What is the difference between str(xslt(doc)) and xslt(doc).write() ? 6.4 Why can't I just delete parents or clear the root node in iterparse()? 6.5 How do I output null characters in XML text? + 6.6 Is lxml vulnerable to XML bombs? 7 XPath and Document Traversal 7.1 What are the ``findall()`` and ``xpath()`` methods on Element(Tree)? 7.2 Why doesn't ``findall()`` support full XPath expressions? @@ -881,6 +882,21 @@ uuencode or base64. +Is lxml vulnerable to XML bombs? +-------------------------------- + +This has nothing to do with lxml itself, only with the parser of +libxml2. Since libxml2 version 2.7, the parser imposes hard security +limits on input documents to prevent DoS attacks with forged input +data. Since lxml 2.2.1, you can disable these limits with the +``huge_tree`` parser option if you need to parse *really* large, +trusted documents. All lxml versions will leave these restrictions +enabled by default. + +Note that libxml2 versions of the 2.6 series do not restrict their +parser and are therefore vulnerable to DoS attacks. + + XPath and Document Traversal ============================ From scoder at codespeak.net Thu Aug 13 09:10:27 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 13 Aug 2009 09:10:27 +0200 (CEST) Subject: [Lxml-checkins] r66801 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20090813071027.A9963168011@codespeak.net> Author: scoder Date: Thu Aug 13 09:10:25 2009 New Revision: 66801 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/tests/test_etree.py Log: r5203 at delle: sbehnel | 2009-08-13 09:06:20 +0200 fix recover flag in feed parser Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu Aug 13 09:10:25 2009 @@ -17,6 +17,8 @@ Bugs fixed ---------- +* Feed parser failed to honour the 'recover' option on parse errors. + * Target parser didn't call ``.close()`` on the target object if parsing failed. Now it is guaranteed that ``.close()`` will be called after parsing, regardless of the outcome. Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Thu Aug 13 09:10:25 2009 @@ -1042,6 +1042,7 @@ cdef char* c_encoding cdef int buffer_len cdef int error + cdef bint recover = self._parse_options & xmlparser.XML_PARSE_RECOVER if python.PyString_Check(data): if self._default_encoding is None: c_encoding = NULL @@ -1078,10 +1079,10 @@ xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options) error = xmlparser.xmlCtxtResetPush( pctxt, c_data, buffer_len, NULL, c_encoding) - py_buffer_len = py_buffer_len - buffer_len - c_data = c_data + buffer_len + py_buffer_len -= buffer_len + c_data += buffer_len - while error == 0 and py_buffer_len > 0: + while (recover or error == 0) and py_buffer_len > 0: if py_buffer_len > python.INT_MAX: buffer_len = python.INT_MAX else: @@ -1090,11 +1091,10 @@ error = htmlparser.htmlParseChunk(pctxt, c_data, buffer_len, 0) else: error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0) - py_buffer_len = py_buffer_len - buffer_len - c_data = c_data + 
buffer_len + py_buffer_len -= buffer_len + c_data += buffer_len - if error or (not pctxt.wellFormed and - not self._parse_options & xmlparser.XML_PARSE_RECOVER): + if not recover and (error or not pctxt.wellFormed): self._feed_parser_running = 0 try: context._handleParseResult(self, NULL, None) Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Thu Aug 13 09:10:25 2009 @@ -580,6 +580,27 @@ self.etree.XMLParser(encoding="utf-8") self.etree.XMLParser(encoding="iso-8859-1") + def test_feed_parser_recover(self): + parser = self.etree.XMLParser(recover=True) + + parser.feed('<') + parser.feed('a test="works"') + parser.feed('> not closed! + parser.feed('>') + + root = parser.close() + + self.assertEquals(root.tag, "root") + self.assertEquals(len(root), 1) + self.assertEquals(root[0].tag, "a") + self.assertEquals(root[0].get("test"), "works") + self.assertEquals(len(root[0]), 1) + self.assertEquals(root[0][0].tag, "othertag") + # FIXME: would be nice to get some errors logged ... 
+ #self.assert_(len(parser.error_log) > 0, "error log is empty") + def test_elementtree_parser_target_type_error(self): assertEquals = self.assertEquals assertFalse = self.assertFalse From scoder at codespeak.net Fri Aug 14 22:19:39 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 14 Aug 2009 22:19:39 +0200 (CEST) Subject: [Lxml-checkins] r66839 - lxml/trunk Message-ID: <20090814201939.5603E16802C@codespeak.net> Author: scoder Date: Fri Aug 14 22:19:37 2009 New Revision: 66839 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/buildlibxml.py Log: r5205 at delle: sbehnel | 2009-08-14 22:15:36 +0200 build libiconv on static build Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Aug 14 22:19:37 2009 @@ -28,6 +28,8 @@ Other changes ------------- +* Static builds include libiconv, in addition to libxml2 and libxslt. + 2.2.2 (2009-06-21) ================== Modified: lxml/trunk/buildlibxml.py ============================================================================== --- lxml/trunk/buildlibxml.py (original) +++ lxml/trunk/buildlibxml.py Fri Aug 14 22:19:37 2009 @@ -11,6 +11,7 @@ ## Routines to download and build libxml2/xslt: LIBXML2_LOCATION = 'ftp://xmlsoft.org/libxml2/' +LIBICONV_LOCATION = 'ftp://ftp.gnu.org/pub/gnu/libiconv/' match_libfile_version = re.compile('^[^-]*-([.0-9-]+)[.].*').match def ftp_listdir(url): @@ -42,17 +43,29 @@ return download_library(dest_dir, LIBXML2_LOCATION, 'libxslt', version_re, filename, version=version) +def download_libiconv(dest_dir, version=None): + """Downloads libiconv, returning the filename where the library was downloaded""" + version_re = re.compile(r'^libiconv-([0-9.]+[0-9]).tar.gz$') + filename = 'libiconv-%s.tar.gz' + return download_library(dest_dir, LIBICONV_LOCATION, 'libiconv', + version_re, filename, version=version) + def 
download_library(dest_dir, location, name, version_re, filename, version=None): if version is None: try: fns = ftp_listdir(location) + versions = [] for fn in fns: match = version_re.search(fn) if match: - version = match.group(1) - print('Latest version of %s is %s' % (name, version)) - break + version_string = match.group(1) + versions.append((map(tryint, version_string.split('.')), + version_string)) + if versions: + versions.sort() + version = versions[-1][-1] + print('Latest version of %s is %s' % (name, version)) else: raise Exception( "Could not find the most current version of the %s from the files: %s" @@ -161,14 +174,23 @@ if not os.path.exists(dir): os.makedirs(dir) +def cmmi(configure_cmd, build_dir, **call_setup): + print('Starting build in %s' % build_dir) + call_subprocess(configure_cmd, cwd=build_dir, **call_setup) + call_subprocess( + ['make'], cwd=build_dir, **call_setup) + call_subprocess( + ['make', 'install'], cwd=build_dir, **call_setup) + def build_libxml2xslt(download_dir, build_dir, static_include_dirs, static_library_dirs, static_cflags, static_binaries, - libxml2_version=None, libxslt_version=None): + libxml2_version=None, libxslt_version=None, libiconv_version=None): safe_mkdir(download_dir) safe_mkdir(build_dir) - libxml2_dir = unpack_tarball(download_libxml2(download_dir, libxml2_version), build_dir) - libxslt_dir = unpack_tarball(download_libxslt(download_dir, libxslt_version), build_dir) + libiconv_dir = unpack_tarball(download_libiconv(download_dir, libiconv_version), build_dir) + libxml2_dir = unpack_tarball(download_libxml2(download_dir, libxml2_version), build_dir) + libxslt_dir = unpack_tarball(download_libxslt(download_dir, libxslt_version), build_dir) prefix = os.path.join(os.path.abspath(build_dir), 'libxml2') safe_mkdir(prefix) @@ -186,35 +208,33 @@ }) call_setup['env'] = env - # We may loose the link to iconv, so make sure it's there - static_binaries.append('-liconv') - configure_cmd = ['./configure', - '--without-python', 
'--disable-dependency-tracking', '--disable-shared', '--prefix=%s' % prefix, ] - call_subprocess(configure_cmd, cwd=libxml2_dir, **call_setup) - call_subprocess( - ['make'], cwd=libxml2_dir, **call_setup) - call_subprocess( - ['make', 'install'], cwd=libxml2_dir, **call_setup) + # build libiconv + cmmi(configure_cmd, libiconv_dir, **call_setup) + + # build libxml2 + libxml2_configure_cmd = configure_cmd + [ + '--without-python', + '--with-iconv=%s' % prefix] + cmmi(libxml2_configure_cmd, libxml2_dir, **call_setup) + + # build libxslt libxslt_configure_cmd = configure_cmd + [ + '--without-python', '--with-libxml-prefix=%s' % prefix, ] if sys.platform in ('darwin',): libxslt_configure_cmd += [ '--without-crypto', ] + cmmi(libxslt_configure_cmd, libxslt_dir, **call_setup) - call_subprocess(libxslt_configure_cmd, cwd=libxslt_dir, **call_setup) - call_subprocess( - ['make'], cwd=libxslt_dir, **call_setup) - call_subprocess( - ['make', 'install'], cwd=libxslt_dir, **call_setup) - + # collect build setup for lxml xslt_config = os.path.join(prefix, 'bin', 'xslt-config') xml2_config = os.path.join(prefix, 'bin', 'xml2-config') @@ -227,7 +247,7 @@ static_library_dirs.append(lib_dir) for filename in os.listdir(lib_dir): - if [l for l in ['libxml2', 'libxslt', 'libexslt'] if l in filename]: + if [l for l in ['iconv', 'libxml2', 'libxslt', 'libexslt'] if l in filename]: if [ext for ext in ['.a'] if filename.endswith(ext)]: static_binaries.append(os.path.join(lib_dir,filename)) From scoder at codespeak.net Fri Aug 14 22:31:08 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 14 Aug 2009 22:31:08 +0200 (CEST) Subject: [Lxml-checkins] r66840 - lxml/trunk Message-ID: <20090814203108.9D132168028@codespeak.net> Author: scoder Date: Fri Aug 14 22:31:08 2009 New Revision: 66840 Modified: lxml/trunk/ (props changed) lxml/trunk/setupinfo.py Log: r5207 at delle: sbehnel | 2009-08-14 22:27:08 +0200 make libiconv version configurable Modified: lxml/trunk/setupinfo.py 
============================================================================== --- lxml/trunk/setupinfo.py (original) +++ lxml/trunk/setupinfo.py Fri Aug 14 22:31:08 2009 @@ -46,6 +46,7 @@ 'libs', 'build/tmp', static_include_dirs, static_library_dirs, static_cflags, static_binaries, + libiconv_version=OPTION_LIBICONV_VERSION, libxml2_version=OPTION_LIBXML2_VERSION, libxslt_version=OPTION_LIBXSLT_VERSION) if CYTHON_INSTALLED: @@ -337,3 +338,4 @@ OPTION_STATIC = True OPTION_LIBXML2_VERSION = option_value('libxml2-version') OPTION_LIBXSLT_VERSION = option_value('libxslt-version') +OPTION_LIBICONV_VERSION = option_value('libiconv-version') From scoder at codespeak.net Mon Aug 17 08:16:18 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 17 Aug 2009 08:16:18 +0200 (CEST) Subject: [Lxml-checkins] r66868 - lxml/trunk Message-ID: <20090817061618.3920716802B@codespeak.net> Author: scoder Date: Mon Aug 17 08:16:16 2009 New Revision: 66868 Modified: lxml/trunk/ (props changed) lxml/trunk/setup.py lxml/trunk/setupinfo.py Log: r5209 at delle: sbehnel | 2009-08-17 08:12:14 +0200 comments Modified: lxml/trunk/setup.py ============================================================================== --- lxml/trunk/setup.py (original) +++ lxml/trunk/setup.py Mon Aug 17 08:16:16 2009 @@ -1,5 +1,8 @@ import sys, os +# for command line options and supported environment variables, please +# see the end of 'setupinfo.py' + extra_options = {} try: Modified: lxml/trunk/setupinfo.py ============================================================================== --- lxml/trunk/setupinfo.py (original) +++ lxml/trunk/setupinfo.py Mon Aug 17 08:16:16 2009 @@ -321,7 +321,7 @@ env_val = os.getenv(name.upper().replace('-', '_')) return env_val -# pick up any commandline options +# pick up any commandline options and/or env variables OPTION_WITHOUT_OBJECTIFY = has_option('without-objectify') OPTION_WITHOUT_ASSERT = has_option('without-assert') OPTION_WITHOUT_THREADING = 
has_option('without-threading') From scoder at codespeak.net Mon Aug 17 09:33:02 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 17 Aug 2009 09:33:02 +0200 (CEST) Subject: [Lxml-checkins] r66869 - in lxml/trunk: . doc Message-ID: <20090817073302.50B0116802B@codespeak.net> Author: scoder Date: Mon Aug 17 09:33:00 2009 New Revision: 66869 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/elementsoup.txt lxml/trunk/doc/xpathxslt.txt Log: r5211 at delle: sbehnel | 2009-08-17 09:28:58 +0200 docs Modified: lxml/trunk/doc/elementsoup.txt ============================================================================== --- lxml/trunk/doc/elementsoup.txt (original) +++ lxml/trunk/doc/elementsoup.txt Mon Aug 17 09:33:00 2009 @@ -70,6 +70,25 @@ ``makeelement`` factory function to ``parse()`` and ``fromstring()``. By default, this is based on the HTML parser defined in ``lxml.html``. +For a quick comparison, libxml2 2.6.32 parses the same tag soup as +follows. The main difference is that libxml2 tries harder to adhere +to the structure of an HTML document and moves misplaced tags where +they (likely) belong. Note, however, that the result can vary between +parser versions. + +.. sourcecode:: html + + + + + Hello + + +

Hi all

+

+ + + Entity handling =============== Modified: lxml/trunk/doc/xpathxslt.txt ============================================================================== --- lxml/trunk/doc/xpathxslt.txt (original) +++ lxml/trunk/doc/xpathxslt.txt Mon Aug 17 09:33:00 2009 @@ -2,8 +2,8 @@ XPath and XSLT with lxml ======================== -lxml supports both XPath and XSLT through libxml2 and libxslt in a standards -compliant way. +lxml supports XPath 1.0, XSLT 1.0 and the EXSLT extensions through +libxml2 and libxslt in a standards compliant way. .. contents:: .. From scoder at codespeak.net Thu Aug 20 10:33:19 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 20 Aug 2009 10:33:19 +0200 (CEST) Subject: [Lxml-checkins] r67010 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20090820083319.A0EE2168038@codespeak.net> Author: scoder Date: Thu Aug 20 10:33:19 2009 New Revision: 67010 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/extensions.pxi lxml/trunk/src/lxml/tests/test_xpathevaluator.py Log: r5213 at delle: sbehnel | 2009-08-20 10:29:12 +0200 attrname property on XPath attribute result strings Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu Aug 20 10:33:19 2009 @@ -8,6 +8,8 @@ Features added -------------- +* XPath attribute result strings have an ``attrname`` property. + * Namespace URIs get validated against RFC 3986 at the API level (required by the XML namespace specification). 
Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Thu Aug 20 10:33:19 2009 @@ -500,7 +500,7 @@ stringval = funicode(xpathObj.stringval) if smart_string: stringval = _elementStringResultFactory( - stringval, None, 0, 0) + stringval, None, None, 0) return stringval elif xpathObj.type == xpath.XPATH_POINT: raise NotImplementedError, u"XPATH_POINT" @@ -588,9 +588,10 @@ cdef class _ElementUnicodeResult(python.unicode): cdef _Element _parent - cdef readonly object is_tail - cdef readonly object is_text - cdef readonly object is_attribute + cdef readonly bint is_tail + cdef readonly bint is_text + cdef readonly bint is_attribute + cdef readonly object attrname def getparent(self): return self._parent @@ -602,9 +603,10 @@ return self._parent cdef object _elementStringResultFactory(string_value, _Element parent, - bint is_attribute, bint is_tail): + attrname, bint is_tail): cdef _ElementUnicodeResult uresult cdef bint is_text + cdef bint is_attribute = attrname is not None if parent is None: is_text = 0 else: @@ -616,6 +618,7 @@ result.is_attribute = is_attribute result.is_tail = is_tail result.is_text = is_text + result.attrname = attrname return result else: uresult = _ElementUnicodeResult(string_value) @@ -623,17 +626,19 @@ uresult.is_attribute = is_attribute uresult.is_tail = is_tail uresult.is_text = is_text + uresult.attrname = attrname return uresult cdef object _buildElementStringResult(_Document doc, xmlNode* c_node, bint smart_string): - cdef _Element parent + cdef _Element parent = None + cdef object attrname = None cdef xmlNode* c_element cdef char* s - cdef bint is_attribute, is_text, is_tail + cdef bint is_tail if c_node.type == tree.XML_ATTRIBUTE_NODE: - is_attribute = 1 + attrname = _namespacedName(c_node) is_tail = 0 s = tree.xmlNodeGetContent(c_node) try: @@ -643,7 +648,6 @@ c_element = NULL 
else: #assert c_node.type == tree.XML_TEXT_NODE, "invalid node type" - is_attribute = 0 # may be tail text or normal text value = funicode(c_node.content) c_element = _previousElement(c_node) @@ -662,7 +666,7 @@ parent = _fakeDocElementFactory(doc, c_element) return _elementStringResultFactory( - value, parent, is_attribute, is_tail) + value, parent, attrname, is_tail) ################################################################################ Modified: lxml/trunk/src/lxml/tests/test_xpathevaluator.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xpathevaluator.py (original) +++ lxml/trunk/src/lxml/tests/test_xpathevaluator.py Thu Aug 20 10:33:19 2009 @@ -87,12 +87,18 @@ self.assertEquals([root[0], root[1]], [r.getparent() for r in tree.xpath('/a/b/text()', smart_strings=True)]) + self.assertEquals([None, None], + [r.attrname for r in + tree.xpath('/a/b/text()', smart_strings=True)]) self.assertEquals(['FooBar', 'BarFoo'], tree.xpath('/a/b/text()', smart_strings=False)) self.assertEquals([False, False], [hasattr(r, 'getparent') for r in tree.xpath('/a/b/text()', smart_strings=False)]) + self.assertEquals([None, None], + [r.attrname for r in + tree.xpath('/a/b/text()', smart_strings=True)]) def test_xpath_list_unicode_text_parent(self): xml = _bytes('FooBar\\u0680\\u3120BarFoo\\u0680\\u3120').decode("unicode_escape") @@ -122,12 +128,14 @@ results = tree.xpath('/a/@c', smart_strings=True) self.assertEquals(1, len(results)) self.assertEquals('CqWeRtZuI', results[0]) + self.assertEquals('c', results[0].attrname) self.assertEquals(tree.getroot().tag, results[0].getparent().tag) results = tree.xpath('/a/@c', smart_strings=False) self.assertEquals(1, len(results)) self.assertEquals('CqWeRtZuI', results[0]) self.assertEquals(False, hasattr(results[0], 'getparent')) + self.assertEquals(False, hasattr(results[0], 'attrname')) def test_xpath_list_comment(self): tree = self.parse('') From scoder at 
codespeak.net Sat Aug 22 14:46:42 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 22 Aug 2009 14:46:42 +0200 (CEST) Subject: [Lxml-checkins] r67089 - in lxml/trunk: . src/lxml Message-ID: <20090822124642.4ED11168067@codespeak.net> Author: scoder Date: Sat Aug 22 14:46:39 2009 New Revision: 67089 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/lxml.etree.pyx Log: r5215 at delle: sbehnel | 2009-08-21 17:46:12 +0200 minor cleanup, some comments Modified: lxml/trunk/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.etree.pyx (original) +++ lxml/trunk/src/lxml/lxml.etree.pyx Sat Aug 22 14:46:39 2009 @@ -54,6 +54,7 @@ EMPTY_READ_ONLY_DICT = python.PyDictProxy_New({}) # the rules +# --------- # any libxml C argument/variable is prefixed with c_ # any non-public function/class is prefixed with an underscore # instance creation is always through factories @@ -190,7 +191,8 @@ __version__ = (tree.LXML_VERSION_STRING).decode(u"ASCII") -# class for temporary storage of Python references +# class for temporary storage of Python references, +# used e.g. 
for XPath results cdef class _TempStore: cdef list _storage def __init__(self): @@ -271,6 +273,7 @@ #_deallocDocument(self._c_doc) cdef getroot(self): + # return an element proxy for the document root cdef xmlNode* c_node c_node = tree.xmlDocGetRootElement(self._c_doc) if c_node is NULL: @@ -278,9 +281,11 @@ return _elementFactory(self, c_node) cdef bint hasdoctype(self): + # DOCTYPE gets parsed into internal subset (xmlDTD*) return self._c_doc.intSubset is not NULL cdef getdoctype(self): + # get doctype info: root tag, public/system ID (or None if not known) cdef tree.xmlDtd* c_dtd cdef xmlNode* c_root_node public_id = None @@ -305,6 +310,7 @@ return (root_name, public_id, sys_url) cdef getxmlinfo(self): + # return XML version and encoding (or None if not known) cdef xmlDoc* c_doc c_doc = self._c_doc if c_doc.version is NULL: @@ -318,12 +324,15 @@ return (version, encoding) cdef isstandalone(self): + # returns True for "standalone=true", + # False for "standalone=false", None if not provided if self._c_doc.standalone == -1: return None else: return (self._c_doc.standalone == 1) cdef buildNewPrefix(self): + # get a new unique prefix ("nsX") for this document if self._ns_counter < python.PyTuple_GET_SIZE(_PREFIX_CACHE): ns = python.PyTuple_GET_ITEM(_PREFIX_CACHE, self._ns_counter) python.Py_INCREF(ns) @@ -353,11 +362,12 @@ assert c_node.type == tree.XML_ELEMENT_NODE, \ u"invalid node type %d, expected %d" % ( c_node.type, tree.XML_ELEMENT_NODE) - # look for existing ns + # look for existing ns declaration c_ns = tree.xmlSearchNsByHref(self._c_doc, c_node, c_href) if c_ns is not NULL: return c_ns + # none found => determine a suitable new prefix if c_prefix is NULL: dict_result = python.PyDict_GetItem( _DEFAULT_NAMESPACE_PREFIXES, c_href) @@ -372,6 +382,7 @@ prefix = self.buildNewPrefix() c_prefix = _cstr(prefix) + # declare the namespace and return it c_ns = tree.xmlNewNs(c_node, c_href, c_prefix) if c_ns is NULL: python.PyErr_NoMemory() @@ -883,7 +894,7 @@ prefix = 
None else: prefix = funicode(c_ns.prefix) - if not python.PyDict_GetItem(nsmap, prefix): + if prefix not in nsmap: nsmap[prefix] = funicode(c_ns.href) c_ns = c_ns.next c_node = c_node.parent From scoder at codespeak.net Sat Aug 22 14:46:46 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 22 Aug 2009 14:46:46 +0200 (CEST) Subject: [Lxml-checkins] r67090 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20090822124646.8338B16807A@codespeak.net> Author: scoder Date: Sat Aug 22 14:46:44 2009 New Revision: 67090 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/lxml.objectify.pyx lxml/trunk/src/lxml/tests/test_objectify.py Log: r5216 at delle: sbehnel | 2009-08-22 14:42:32 +0200 fix help(lxml.objectify), prevent objectify.ElementMaker from injecting default namespaces when not annotating, some cleanup/comments, doctest fixes, run API doctests in objectify Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Aug 22 14:46:44 2009 @@ -19,6 +19,11 @@ Bugs fixed ---------- +* Calling ``help(lxml.objectify)`` didn't work at the prompt. + +* The ``ElementMaker`` in lxml.objectify no longer defines the default + namespaces when annotation is disabled. + * Feed parser failed to honour the 'recover' option on parse errors. 
* Target parser didn't call ``.close()`` on the target object if Modified: lxml/trunk/src/lxml/lxml.objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.objectify.pyx (original) +++ lxml/trunk/src/lxml/lxml.objectify.pyx Sat Aug 22 14:46:44 2009 @@ -31,8 +31,8 @@ cdef object re import re -cdef tuple IGNORABLE_ERRORS -IGNORABLE_ERRORS = (ValueError, TypeError) +cdef tuple IGNORABLE_ERRORS = (ValueError, TypeError) +cdef object is_special_method = re.compile(u'__.*__$').match cdef object islice from itertools import islice @@ -127,10 +127,12 @@ u"""Main XML Element class. Element children are accessed as object attributes. Multiple children - with the same name are available through a list index. Example: + with the same name are available through a list index. Example:: - >>> root = etree.XML("01") + >>> root = XML("01") >>> second_c2 = root.c1.c2[1] + >>> print(second_c2.text) + 1 Note that you cannot (and must not) instantiate this class or its subclasses. @@ -218,6 +220,8 @@ u"""Return the (first) child with the given tag name. If no namespace is provided, the child will be looked up in the same one as self. 
""" + if is_special_method(tag): + return object.__getattr__(self, tag) return _lookupChildOrRaise(self, tag) def __setattr__(self, tag, value): @@ -1161,46 +1165,6 @@ ################################################################################ # adapted ElementMaker supports registered PyTypes -cdef class _ObjectifyElementMakerCaller # forward declaration - -cdef extern from "etree_defs.h": - # macro call to 't->tp_new()' for fast instantiation - cdef _ObjectifyElementMakerCaller NEW_ELEMENT_MAKER "PY_NEW" (object t) - -cdef class ElementMaker: - u"""ElementMaker(self, namespace=None, nsmap=None, annotate=True, makeelement=None) - """ - cdef object _makeelement - cdef object _namespace - cdef object _nsmap - cdef bint _annotate - def __init__(self, *, namespace=None, nsmap=None, annotate=True, - makeelement=None): - if nsmap is None: - nsmap = _DEFAULT_NSMAP - self._nsmap = nsmap - if namespace is None: - self._namespace = None - else: - self._namespace = u"{%s}" % namespace - self._annotate = annotate - if makeelement is not None: - assert callable(makeelement) - self._makeelement = makeelement - else: - self._makeelement = None - - def __getattr__(self, tag): - cdef _ObjectifyElementMakerCaller element_maker - if self._namespace is not None and tag[0] != u"{": - tag = self._namespace + tag - element_maker = NEW_ELEMENT_MAKER(_ObjectifyElementMakerCaller) - element_maker._tag = tag - element_maker._nsmap = self._nsmap - element_maker._annotate = self._annotate - element_maker._element_factory = self._makeelement - return element_maker - cdef class _ObjectifyElementMakerCaller: cdef object _tag cdef object _nsmap @@ -1266,6 +1230,8 @@ return element cdef _add_text(_Element elem, text): + # add text to the tree in construction, either as element text or + # tail text, depending on the current tree state cdef tree.xmlNode* c_child c_child = cetree.findChildBackwards(elem._c_node, 0) if c_child is not NULL: @@ -1279,6 +1245,62 @@ text = old + text 
cetree.setNodeText(elem._c_node, text) +cdef extern from "etree_defs.h": + # macro call to 't->tp_new()' for fast instantiation + cdef _ObjectifyElementMakerCaller NEW_ELEMENT_MAKER "PY_NEW" (object t) + +cdef class ElementMaker: + u"""ElementMaker(self, namespace=None, nsmap=None, annotate=True, makeelement=None) + + An ElementMaker that can be used for constructing trees. + + Example:: + + >>> M = ElementMaker(annotate=False) + >>> html = M.html( M.body( M.p('hello', M.br, 'objectify') ) ) + + >>> from lxml.etree import tostring + >>> print(tostring(html, method='html')) +

<html><body><p>hello<br>objectify</p></body></html>

+ + Note that this module has a predefined ElementMaker instance called ``E``. + """ + cdef object _makeelement + cdef object _namespace + cdef object _nsmap + cdef bint _annotate + def __init__(self, *, namespace=None, nsmap=None, annotate=True, + makeelement=None): + if nsmap is None: + if annotate: + nsmap = _DEFAULT_NSMAP + else: + nsmap = {} + self._nsmap = nsmap + if namespace is None: + self._namespace = None + else: + self._namespace = u"{%s}" % namespace + self._annotate = annotate + if makeelement is not None: + assert callable(makeelement) + self._makeelement = makeelement + else: + self._makeelement = None + + def __getattr__(self, tag): + cdef _ObjectifyElementMakerCaller element_maker + if is_special_method(tag): + return object.__getattr__(self, tag) + if self._namespace is not None and tag[0] != u"{": + tag = self._namespace + tag + element_maker = NEW_ELEMENT_MAKER(_ObjectifyElementMakerCaller) + element_maker._tag = tag + element_maker._nsmap = self._nsmap + element_maker._annotate = self._annotate + element_maker._element_factory = self._makeelement + return element_maker + ################################################################################ # Recursive element dumping Modified: lxml/trunk/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_objectify.py (original) +++ lxml/trunk/src/lxml/tests/test_objectify.py Sat Aug 22 14:46:44 2009 @@ -2453,6 +2453,7 @@ def test_suite(): suite = unittest.TestSuite() suite.addTests([unittest.makeSuite(ObjectifyTestCase)]) + suite.addTests(doctest.DocTestSuite(objectify)) if sys.version_info >= (2,4): suite.addTests( [make_doctest('../../../doc/objectify.txt')]) From scoder at codespeak.net Sat Aug 22 15:07:31 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 22 Aug 2009 15:07:31 +0200 (CEST) Subject: [Lxml-checkins] r67091 - in lxml/trunk: . 
src/lxml Message-ID: <20090822130731.A19CA168060@codespeak.net> Author: scoder Date: Sat Aug 22 15:07:31 2009 New Revision: 67091 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/apihelpers.pxi Log: r5219 at delle: sbehnel | 2009-08-22 14:58:54 +0200 simplification Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Sat Aug 22 15:07:31 2009 @@ -802,9 +802,8 @@ nodes. """ while c_node is not NULL: - if c_node.type == tree.XML_TEXT_NODE: - return c_node - if c_node.type == tree.XML_CDATA_SECTION_NODE: + if c_node.type == tree.XML_TEXT_NODE or \ + c_node.type == tree.XML_CDATA_SECTION_NODE: return c_node elif c_node.type == tree.XML_XINCLUDE_START or \ c_node.type == tree.XML_XINCLUDE_END: From scoder at codespeak.net Sat Aug 22 15:07:35 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 22 Aug 2009 15:07:35 +0200 (CEST) Subject: [Lxml-checkins] r67092 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20090822130735.C60E8168064@codespeak.net> Author: scoder Date: Sat Aug 22 15:07:35 2009 New Revision: 67092 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/extensions.pxi lxml/trunk/src/lxml/tests/test_etree.py Log: r5220 at delle: sbehnel | 2009-08-22 15:03:26 +0200 fix CDATA handling in XPath Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Aug 22 15:07:35 2009 @@ -19,6 +19,9 @@ Bugs fixed ---------- +* XPath raised a TypeError when finding CDATA sections. This is now + fully supported. + * Calling ``help(lxml.objectify)`` didn't work at the prompt. 
* The ``ElementMaker`` in lxml.objectify no longer defines the default Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Sat Aug 22 15:07:35 2009 @@ -543,7 +543,8 @@ results.append( _fakeDocElementFactory(doc, c_node)) elif c_node.type == tree.XML_TEXT_NODE or \ - c_node.type == tree.XML_ATTRIBUTE_NODE: + c_node.type == tree.XML_CDATA_SECTION_NODE or \ + c_node.type == tree.XML_ATTRIBUTE_NODE: results.append( _buildElementStringResult(doc, c_node, smart_string)) elif c_node.type == tree.XML_NAMESPACE_DECL: @@ -572,7 +573,7 @@ pass else: raise NotImplementedError, \ - u"Not yet implemented result node type: %d" % unicode(c_node.type) + u"Not yet implemented result node type: %d" % c_node.type cdef void _freeXPathObject(xpath.xmlXPathObject* xpathObj): u"""Free the XPath object, but *never* free the *content* of node sets. @@ -647,7 +648,7 @@ tree.xmlFree(s) c_element = NULL else: - #assert c_node.type == tree.XML_TEXT_NODE, "invalid node type" + #assert c_node.type == tree.XML_TEXT_NODE or c_node.type == tree.XML_CDATA_SECTION_NODE, "invalid node type" # may be tail text or normal text value = funicode(c_node.content) c_element = _previousElement(c_node) Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Sat Aug 22 15:07:35 2009 @@ -1140,6 +1140,15 @@ self.assertEquals(_bytes(''), tostring(root)) + def test_cdata_xpath(self): + tostring = self.etree.tostring + parser = self.etree.XMLParser(strip_cdata=False) + root = self.etree.XML(_bytes(''), parser) + self.assertEquals(_bytes(''), + tostring(root)) + + self.assertEquals(['test'], root.xpath('//text()')) + # TypeError in etree, AssertionError in ElementTree; def 
test_setitem_assert(self): Element = self.etree.Element From scoder at codespeak.net Sat Aug 22 15:32:10 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 22 Aug 2009 15:32:10 +0200 (CEST) Subject: [Lxml-checkins] r67093 - in lxml/trunk: . src/lxml/html src/lxml/html/tests Message-ID: <20090822133210.5661616807B@codespeak.net> Author: scoder Date: Sat Aug 22 15:32:08 2009 New Revision: 67093 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/html/__init__.py lxml/trunk/src/lxml/html/tests/test_forms.txt Log: r5223 at delle: sbehnel | 2009-08-22 15:27:51 +0200 fix bug 399249: handle option fields without explicit 'value' attribute in HTML forms Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Aug 22 15:32:08 2009 @@ -19,6 +19,11 @@ Bugs fixed ---------- +* Select options in HTML forms that had no explicit ``value`` + attribute were not handled correctly. The HTML standard dictates + that their value is defined by their text content. This is now + supported by lxml.html. + * XPath raised a TypeError when finding CDATA sections. This is now fully supported. Modified: lxml/trunk/src/lxml/html/__init__.py ============================================================================== --- lxml/trunk/src/lxml/html/__init__.py (original) +++ lxml/trunk/src/lxml/html/__init__.py Sat Aug 22 15:32:08 2009 @@ -991,9 +991,12 @@ if self.multiple: return MultipleSelectOptions(self) for el in _options_xpath(self): - if 'selected' in el.attrib: + if el.get('selected') is not None: value = el.get('value') - # FIXME: If value is None, what to return?, get_text()? 
+ if value is None: + value = el.text or '' + if value: + value = value.strip() return value return None @@ -1006,9 +1009,14 @@ self.value.update(value) return if value is not None: + value = value.strip() for el in _options_xpath(self): - # FIXME: also if el.get('value') is None? - if el.get('value') == value: + opt_value = el.get('value') + if opt_value is None: + opt_value = el.text or '' + if opt_value: + opt_value = opt_value.strip() + if opt_value == value: checked_option = el break else: @@ -1034,7 +1042,15 @@ All the possible values this select can have (the ``value`` attribute of all the ``