From scoder at codespeak.net Fri Nov 5 22:18:19 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 5 Nov 2010 22:18:19 +0100 (CET) Subject: [Lxml-checkins] r78770 - lxml/trunk/src/lxml Message-ID: <20101105211819.321BB282B9D@codespeak.net> Author: scoder Date: Fri Nov 5 22:18:17 2010 New Revision: 78770 Modified: lxml/trunk/src/lxml/apihelpers.pxi Log: better error message for non-string input Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Fri Nov 5 22:18:17 2010 @@ -1361,7 +1361,7 @@ utf8_string = bytes(s) invalid = check_string_utf8(utf8_string) else: - raise TypeError, u"Argument must be string or unicode." + raise TypeError, (u"Argument must be bytes or unicode, got '%.200s'" % type(s).__name__) if invalid: raise ValueError, \ u"All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters" From scoder at codespeak.net Wed Nov 17 19:07:17 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 Nov 2010 19:07:17 +0100 (CET) Subject: [Lxml-checkins] r79212 - lxml/trunk/src/lxml Message-ID: <20101117180717.454D350827@codespeak.net> Author: scoder Date: Wed Nov 17 19:07:15 2010 New Revision: 79212 Modified: lxml/trunk/src/lxml/etree_defs.h Log: code simplification Modified: lxml/trunk/src/lxml/etree_defs.h ============================================================================== --- lxml/trunk/src/lxml/etree_defs.h (original) +++ lxml/trunk/src/lxml/etree_defs.h Wed Nov 17 19:07:15 2010 @@ -171,10 +171,7 @@ PyUnicode_CheckExact(obj) || \ PyObject_TypeCheck(obj, &PyBaseString_Type)) #else -#define _isString(obj) (PyUnicode_CheckExact(obj) || \ - PyBytes_CheckExact(obj) || \ - PyUnicode_Check(obj) || \ - PyBytes_Check(obj)) +#define _isString(obj) (PyUnicode_Check(obj) || PyBytes_Check(obj)) #endif #define _isElement(c_node) \ From 
scoder at codespeak.net Wed Nov 17 19:07:21 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 Nov 2010 19:07:21 +0100 (CET) Subject: [Lxml-checkins] r79213 - lxml/trunk/src/lxml Message-ID: <20101117180721.8D54250829@codespeak.net> Author: scoder Date: Wed Nov 17 19:07:19 2010 New Revision: 79213 Modified: lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/etreepublic.pxd lxml/trunk/src/lxml/public-api.pxi Log: return value typing in public API Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Wed Nov 17 19:07:19 2010 @@ -587,7 +587,7 @@ tree.xmlRemoveProp(c_attr) return 0 -cdef object _collectAttributes(xmlNode* c_node, int collecttype): +cdef list _collectAttributes(xmlNode* c_node, int collecttype): u"""Collect all attributes of a node in a list. Depending on collecttype, it collects either the name (1), the value (2) or the name-value tuples. 
""" Modified: lxml/trunk/src/lxml/etreepublic.pxd ============================================================================== --- lxml/trunk/src/lxml/etreepublic.pxd (original) +++ lxml/trunk/src/lxml/etreepublic.pxd Wed Nov 17 19:07:19 2010 @@ -110,7 +110,7 @@ cdef object iterattributes(_Element element, int keysvalues) # return the list of all attribute names (1), values (2) or items (3) - cdef object collectAttributes(tree.xmlNode* c_element, int keysvalues) + cdef list collectAttributes(tree.xmlNode* c_element, int keysvalues) # set an attribute value on an element # on failure, sets an exception and returns -1 @@ -179,10 +179,10 @@ cdef object pyunicode(char* s) # convert the string to UTF-8 using the normal lxml.etree semantics - cdef object utf8(object s) + cdef bytes utf8(object s) # split a tag into a (URI, name) tuple - cdef object getNsTag(object tag) + cdef tuple getNsTag(object tag) # get the "{ns}tag" string for a C node cdef object namespacedName(tree.xmlNode* c_node) Modified: lxml/trunk/src/lxml/public-api.pxi ============================================================================== --- lxml/trunk/src/lxml/public-api.pxi (original) +++ lxml/trunk/src/lxml/public-api.pxi Wed Nov 17 19:07:19 2010 @@ -93,7 +93,7 @@ cdef public api object iterattributes(_Element element, int keysvalues): return _attributeIteratorFactory(element, keysvalues) -cdef public api object collectAttributes(xmlNode* c_element, int keysvalues): +cdef public api list collectAttributes(xmlNode* c_element, int keysvalues): return _collectAttributes(c_element, keysvalues) cdef public api int setAttributeValue(_Element element, key, value) except -1: @@ -132,10 +132,10 @@ raise TypeError return funicode(s) -cdef public api object utf8(object s): +cdef public api bytes utf8(object s): return _utf8(s) -cdef public api object getNsTag(object tag): +cdef public api tuple getNsTag(object tag): return _getNsTag(tag) cdef public api object namespacedName(xmlNode* c_node): From 
scoder at codespeak.net Wed Nov 17 19:07:24 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 Nov 2010 19:07:24 +0100 (CET) Subject: [Lxml-checkins] r79214 - lxml/trunk Message-ID: <20101117180724.4F3C850821@codespeak.net> Author: scoder Date: Wed Nov 17 19:07:22 2010 New Revision: 79214 Modified: lxml/trunk/setupinfo.py Log: properly handle dependencies in Cython build Modified: lxml/trunk/setupinfo.py ============================================================================== --- lxml/trunk/setupinfo.py (original) +++ lxml/trunk/setupinfo.py Wed Nov 17 19:07:22 2010 @@ -110,11 +110,11 @@ result = [] for module in modules: main_module_source = PACKAGE_PATH + module + source_extension - dependencies = find_dependencies(module) result.append( Extension( module, - sources = [main_module_source] + dependencies, + sources = [main_module_source], + depends = find_dependencies(module), extra_compile_args = _cflags, extra_objects = static_binaries, define_macros = _define_macros, @@ -128,10 +128,6 @@ def find_dependencies(module): if not CYTHON_INSTALLED: return [] - from Cython.Compiler.Version import version - if split_version(version) < (0,9,6,13): - return [] - package_dir = os.path.join(get_base_dir(), PACKAGE_PATH) files = os.listdir(package_dir) pxd_files = [ os.path.join(PACKAGE_PATH, filename) for filename in files From scoder at codespeak.net Wed Nov 17 19:07:27 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 Nov 2010 19:07:27 +0100 (CET) Subject: [Lxml-checkins] r79215 - lxml/trunk Message-ID: <20101117180727.009E050828@codespeak.net> Author: scoder Date: Wed Nov 17 19:07:26 2010 New Revision: 79215 Modified: lxml/trunk/CHANGES.txt Log: changelog Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed Nov 17 19:07:26 2010 @@ -13,6 +13,12 @@ * ``marque`` tag in HTML cleanup code is 
correctly named ``marquee``. +Other changes +-------------- + +* Some public functions in the Cython-level C-API have more explicit + return types. + 2.3beta1 (2010-09-06) ===================== From scoder at codespeak.net Wed Nov 17 19:07:32 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 Nov 2010 19:07:32 +0100 (CET) Subject: [Lxml-checkins] r79216 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20101117180732.9759450821@codespeak.net> Author: scoder Date: Wed Nov 17 19:07:30 2010 New Revision: 79216 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/etreepublic.pxd lxml/trunk/src/lxml/lxml.objectify.pyx lxml/trunk/src/lxml/public-api.pxi lxml/trunk/src/lxml/tests/test_objectify.py Log: support '{}tag' in lxml.objectify child search as meaning explicitly the empty namespace Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed Nov 17 19:07:30 2010 @@ -5,6 +5,12 @@ Under development ================== +Features added +-------------- + +* When looking for children, ``lxml.objectify`` takes '{}tag' as + meaning an empty namespace, as opposed to the parent namespace. + Bugs fixed ---------- Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Wed Nov 17 19:07:30 2010 @@ -1457,8 +1457,18 @@ cdef tuple _getNsTag(tag): u"""Given a tag, find namespace URI and tag name. - Return None for NS uri if no namespace URI available. + Return None for NS uri if no namespace URI provided. """ + return __getNsTag(tag, 0) + +cdef tuple _getNsTagWithEmptyNs(tag): + u"""Given a tag, find namespace URI and tag name. Return None for NS uri + if no namespace URI provided, or the empty string if namespace + part is '{}'. 
+ """ + return __getNsTag(tag, 1) + +cdef tuple __getNsTag(tag, bint empty_ns): cdef char* c_tag cdef char* c_ns_end cdef Py_ssize_t taglen @@ -1480,6 +1490,8 @@ raise ValueError, u"Empty tag name" if nslen > 0: ns = c_tag[:nslen] + elif empty_ns: + ns = b'' tag = c_ns_end[1:taglen+1] elif python.PyBytes_GET_SIZE(tag) == 0: raise ValueError, u"Empty tag name" Modified: lxml/trunk/src/lxml/etreepublic.pxd ============================================================================== --- lxml/trunk/src/lxml/etreepublic.pxd (original) +++ lxml/trunk/src/lxml/etreepublic.pxd Wed Nov 17 19:07:30 2010 @@ -181,9 +181,12 @@ # convert the string to UTF-8 using the normal lxml.etree semantics cdef bytes utf8(object s) - # split a tag into a (URI, name) tuple + # split a tag into a (URI, name) tuple, return None as URI for '{}tag' cdef tuple getNsTag(object tag) + # split a tag into a (URI, name) tuple, return b'' as URI for '{}tag' + cdef tuple getNsTagWithEmptyNs(object tag) + # get the "{ns}tag" string for a C node cdef object namespacedName(tree.xmlNode* c_node) Modified: lxml/trunk/src/lxml/lxml.objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.objectify.pyx (original) +++ lxml/trunk/src/lxml/lxml.objectify.pyx Wed Nov 17 19:07:30 2010 @@ -468,11 +468,11 @@ cdef char* c_href cdef char* c_tag c_node = parent._c_node - ns, tag = cetree.getNsTag(tag) + ns, tag = cetree.getNsTagWithEmptyNs(tag) c_tag = tree.xmlDictExists( c_node.doc.dict, _cstr(tag), python.PyBytes_GET_SIZE(tag)) if c_tag is NULL: - return None + return None # not in the hash map => not in the tree if ns is None: c_href = tree._getNs(c_node) else: Modified: lxml/trunk/src/lxml/public-api.pxi ============================================================================== --- lxml/trunk/src/lxml/public-api.pxi (original) +++ lxml/trunk/src/lxml/public-api.pxi Wed Nov 17 19:07:30 2010 @@ -138,6 +138,9 @@ cdef public api tuple getNsTag(object 
tag): return _getNsTag(tag) +cdef public api tuple getNsTagWithEmptyNs(object tag): + return _getNsTagWithEmptyNs(tag) + cdef public api object namespacedName(xmlNode* c_node): return _namespacedName(c_node) Modified: lxml/trunk/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_objectify.py (original) +++ lxml/trunk/src/lxml/tests/test_objectify.py Wed Nov 17 19:07:30 2010 @@ -63,7 +63,7 @@ 1 2 3 - 3 + 4 ''' @@ -353,6 +353,11 @@ self.assertRaises(AttributeError, getattr, root.c1, "NOT_THERE") self.assertRaises(AttributeError, getattr, root.c1, "{unknownNS}c2") + def test_child_getattr_empty_ns(self): + root = self.XML(xml_str) + self.assertEquals("4", getattr(root.c1, "{}c2").text) + self.assertEquals("0", getattr(root.c1, "c2").text) + def test_setattr(self): for val in [ 2, 2**32, 1.2, "Won't get fooled again", From lxml-checkins at codespeak.net Wed Nov 17 20:16:57 2010 From: lxml-checkins at codespeak.net (lxml-checkins at codespeak.net) Date: Wed, 17 Nov 2010 20:16:57 +0100 (CET) Subject: [Lxml-checkins] lxml-checkins@codespeak.net Rolex Today -25% Message-ID: <000301cb868c$027cd04e$249aa8c0@.dsl.telesp.net.br> An HTML attachment was scrubbed... URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20101117/5461972a/attachment.htm From lxml-checkins at codespeak.net Wed Nov 17 20:16:57 2010 From: lxml-checkins at codespeak.net (lxml-checkins at codespeak.net) Date: Wed, 17 Nov 2010 20:16:57 +0100 (CET) Subject: [Lxml-checkins] lxml-checkins@codespeak.net Rolex Today -25% Message-ID: <000301cb868c$028d80c4$9ff7a8c0@.dsl.telesp.net.br> An HTML attachment was scrubbed... 
URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20101117/0dec7b44/attachment.htm From scoder at codespeak.net Tue Nov 23 15:37:11 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 23 Nov 2010 15:37:11 +0100 (CET) Subject: [Lxml-checkins] r79398 - in lxml/trunk/src: . lxml/html lxml/html/tests lxml/tests Message-ID: <20101123143711.B4A5A5080F@codespeak.net> Author: scoder Date: Tue Nov 23 15:37:09 2010 New Revision: 79398 Modified: lxml/trunk/src/local_doctest.py lxml/trunk/src/lxml/html/_diffcommand.py lxml/trunk/src/lxml/html/tests/transform_feedparser_data.py lxml/trunk/src/lxml/tests/test_errors.py Log: make some non-public source files Py3 compliant (patch by Arfrever Frehtes Taifersar Arahesis) Modified: lxml/trunk/src/local_doctest.py ============================================================================== --- lxml/trunk/src/local_doctest.py (original) +++ lxml/trunk/src/local_doctest.py Tue Nov 23 15:37:09 2010 @@ -369,9 +369,9 @@ # [XX] Normalize with respect to os.path.pardir? def _module_relative_path(module, path): if not inspect.ismodule(module): - raise TypeError, 'Expected a module: %r' % module + raise TypeError('Expected a module: %r' % module) if path.startswith('/'): - raise ValueError, 'Module-relative files may not have absolute paths' + raise ValueError('Module-relative files may not have absolute paths') # Find the base directory for the path. if hasattr(module, '__file__'): @@ -897,7 +897,7 @@ add them to `tests`. """ if self._verbose: - print 'Finding tests in %s' % name + print('Finding tests in %s' % name) # If we've already processed this object, then ignore it. if id(obj) in seen: @@ -1059,7 +1059,7 @@ >>> tests = DocTestFinder().find(_TestClass) >>> runner = DocTestRunner(verbose=False) >>> for test in tests: - ... print runner.run(test) + ... print(runner.run(test)) (0, 2) (0, 1) (0, 2) @@ -1252,8 +1252,8 @@ # keyboard interrupts.) try: # Don't blink! This is where the user's code gets run. 
- exec compile(example.source, filename, "single", - compileflags, 1) in test.globs + exec(compile(example.source, filename, "single", + compileflags, 1), test.globs) self.debugger.set_continue() # ==== Example Finished ==== exception = None except KeyboardInterrupt: @@ -1427,28 +1427,28 @@ failed.append(x) if verbose: if notests: - print len(notests), "items had no tests:" + print("%s items had no tests:" % len(notests)) notests.sort() for thing in notests: - print " ", thing + print(" %s" % thing) if passed: - print len(passed), "items passed all tests:" + print("%s items passed all tests:" % len(passed)) passed.sort() for thing, count in passed: - print " %3d tests in %s" % (count, thing) + print(" %3d tests in %s" % (count, thing)) if failed: - print self.DIVIDER - print len(failed), "items had failures:" + print(self.DIVIDER) + print("%s items had failures:" % len(failed)) failed.sort() for thing, (f, t) in failed: - print " %3d of %3d in %s" % (f, t, thing) + print(" %3d of %3d in %s" % (f, t, thing)) if verbose: - print totalt, "tests in", len(self._name2ft), "items." - print totalt - totalf, "passed and", totalf, "failed." + print("%s tests in %s items." % (totalt, len(self._name2ft))) + print("%s passed and %s failed." % (totalt - totalf, totalf)) if totalf: - print "***Test Failed***", totalf, "failures." + print("***Test Failed*** %s failures." % totalf) elif verbose: - print "Test passed." + print("Test passed.") return totalf, totalt #///////////////////////////////////////////////////////////////// @@ -1458,8 +1458,8 @@ d = self._name2ft for name, (f, t) in other._name2ft.items(): if name in d: - print "*** DocTestRunner.merge: '" + name + "' in both" \ - " testers; summing outcomes." 
+ print("*** DocTestRunner.merge: '" + name + "' in both" + " testers; summing outcomes.") f2, t2 = d[name] f = f + f2 t = t + t2 @@ -2037,10 +2037,10 @@ def runstring(self, s, name): test = DocTestParser().get_doctest(s, self.globs, name, None, None) if self.verbose: - print "Running string", name + print("Running string %s" % name) (f,t) = self.testrunner.run(test) if self.verbose: - print f, "of", t, "examples failed in string", name + print("%s of %s examples failed in string %s" % (f, t, name)) return (f,t) def rundoc(self, object, name=None, module=None): @@ -2487,7 +2487,7 @@ ... Ho hum ... ''' - >>> print script_from_examples(text) + >>> print(script_from_examples(text)) # Here are examples of simple math. # # Python has super accurate integer addition @@ -2578,7 +2578,7 @@ try: execfile(srcfilename, globs, globs) except: - print sys.exc_info()[1] + print(sys.exc_info()[1]) pdb.post_mortem(sys.exc_info()[2]) else: # Note that %r is vital here. '%s' instead can, e.g., cause @@ -2620,7 +2620,7 @@ """val -> _TestClass object with associated value val. >>> t = _TestClass(123) - >>> print t.get() + >>> print(t.get()) 123 """ @@ -2640,7 +2640,7 @@ """get() -> return TestClass's associated value. >>> x = _TestClass(-42) - >>> print x.get() + >>> print(x.get()) -42 """ @@ -2672,7 +2672,7 @@ "blank lines": r""" Blank lines can be marked with <BLANKLINE>: - >>> print 'foo\n\nbar\n' + >>> print('foo\n\nbar\n') foo <BLANKLINE> bar @@ -2682,14 +2682,14 @@ "ellipsis": r""" If the ellipsis flag is used, then '...' can be used to elide substrings in the desired output: - >>> print range(1000) #doctest: +ELLIPSIS + >>> print(range(1000)) #doctest: +ELLIPSIS [0, 1, 2, ..., 999] """, "whitespace normalization": r""" If the whitespace normalization flag is used, then differences in whitespace are ignored. 
- >>> print range(30) #doctest: +NORMALIZE_WHITESPACE + >>> print(range(30)) #doctest: +NORMALIZE_WHITESPACE [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29] Modified: lxml/trunk/src/lxml/html/_diffcommand.py ============================================================================== --- lxml/trunk/src/lxml/html/_diffcommand.py (original) +++ lxml/trunk/src/lxml/html/_diffcommand.py Tue Nov 23 15:37:09 2010 @@ -82,6 +82,6 @@ return pre, html, post def annotate(options, args): - print "Not yet implemented" + print("Not yet implemented") sys.exit(1) Modified: lxml/trunk/src/lxml/html/tests/transform_feedparser_data.py ============================================================================== --- lxml/trunk/src/lxml/html/tests/transform_feedparser_data.py (original) +++ lxml/trunk/src/lxml/html/tests/transform_feedparser_data.py Tue Nov 23 15:37:09 2010 @@ -88,10 +88,10 @@ try: output = serialize_content(parse_content(c)) except: - print 'Bad data in %s:' % filename - print c + print('Bad data in %s:' % filename) + print(c) traceback.print_exc() - print '-'*60 + print('-'*60) return new = os.path.splitext(filename)[0] + '.data' f = open(new, 'wb') Modified: lxml/trunk/src/lxml/tests/test_errors.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_errors.py (original) +++ lxml/trunk/src/lxml/tests/test_errors.py Tue Nov 23 15:37:09 2010 @@ -1,4 +1,4 @@ -?# -*- coding: utf-8 -*- +# -*- coding: utf-8 -*- import unittest, doctest # These tests check that error handling in the Pyrex code is From scoder at codespeak.net Thu Nov 25 12:48:51 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 25 Nov 2010 12:48:51 +0100 (CET) Subject: [Lxml-checkins] r79502 - lxml/trunk/src/lxml Message-ID: <20101125114851.18A825080E@codespeak.net> Author: scoder Date: Thu Nov 25 12:48:49 2010 New Revision: 79502 Modified: 
lxml/trunk/src/lxml/xmlerror.pxi Log: code cleanup Modified: lxml/trunk/src/lxml/xmlerror.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlerror.pxi (original) +++ lxml/trunk/src/lxml/xmlerror.pxi Thu Nov 25 12:48:49 2010 @@ -154,28 +154,24 @@ code = xmlerror.XML_ERR_INTERNAL_ERROR if self._first_error is None: return exctype(default_message, code, 0, 0) - if self._first_error is None or \ - self._first_error.message is None or \ - not self._first_error.message: + if not self._first_error.message: message = default_message - line = 0 - column = 0 else: message = self._first_error.message code = self._first_error.type - line = self._first_error.line - column = self._first_error.column - if line > 0: - if column > 0: - message = u"%s, line %d, column %d" % (message, line, column) - else: - message = u"%s, line %d" % (message, line) + line = self._first_error.line + column = self._first_error.column + if line > 0: + if column > 0: + message = u"%s, line %d, column %d" % (message, line, column) + else: + message = u"%s, line %d" % (message, line) return exctype(message, code, line, column) cdef _buildExceptionMessage(self, default_message): if self._first_error is None: return default_message - if self._first_error.message is not None and self._first_error.message: + if self._first_error.message: message = self._first_error.message elif default_message is None: return None From scoder at codespeak.net Thu Nov 25 13:07:02 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 25 Nov 2010 13:07:02 +0100 (CET) Subject: [Lxml-checkins] r79504 - lxml/trunk/doc Message-ID: <20101125120702.9CFA5282B9D@codespeak.net> Author: scoder Date: Thu Nov 25 13:07:01 2010 New Revision: 79504 Modified: lxml/trunk/doc/parsing.txt Log: doc comment Modified: lxml/trunk/doc/parsing.txt ============================================================================== --- lxml/trunk/doc/parsing.txt (original) +++ 
lxml/trunk/doc/parsing.txt Thu Nov 25 13:07:01 2010 @@ -259,6 +259,9 @@ XML/XHTML document after parsing, you may have to apply some manual preprocessing first. +Also note that the HTML parser is meant to parse HTML documents. For +XHTML documents, use the XML parser, which is namespace aware. + Doctype information -------------------