From scoder at codespeak.net Thu Oct 16 22:33:39 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 16 Oct 2008 22:33:39 +0200 (CEST) Subject: [Lxml-checkins] r59158 - in lxml/trunk: . src/lxml/tests Message-ID: <20081016203339.8920B16A1BA@codespeak.net> Author: scoder Date: Thu Oct 16 22:33:36 2008 New Revision: 59158 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tests/test_htmlparser.py Log: r4808 at delle: sbehnel | 2008-10-16 21:53:28 +0200 test fixes Modified: lxml/trunk/src/lxml/tests/test_htmlparser.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_htmlparser.py (original) +++ lxml/trunk/src/lxml/tests/test_htmlparser.py Thu Oct 16 22:33:36 2008 @@ -40,12 +40,13 @@ def test_module_HTML(self): element = self.etree.HTML(self.html_str) - self.assertEqual(self.etree.tostring(element), + self.assertEqual(self.etree.tostring(element, method="html"), self.html_str) def test_module_HTML_unicode(self): element = self.etree.HTML(self.uhtml_str) - self.assertEqual(unicode(self.etree.tostring(element, encoding='UTF8'), 'UTF8'), + self.assertEqual(unicode(self.etree.tostring(element, method="html", + encoding='UTF8'), 'UTF8'), unicode(self.uhtml_str.encode('UTF8'), 'UTF8')) def test_module_HTML_pretty_print(self): @@ -194,7 +195,7 @@ def test_module_HTML_broken(self): element = self.etree.HTML(self.broken_html_str) - self.assertEqual(self.etree.tostring(element), + self.assertEqual(self.etree.tostring(element, method="html"), self.html_str) def test_module_HTML_cdata(self): @@ -215,7 +216,8 @@ f = open(filename, 'rb') tree = self.etree.parse(f, parser) f.close() - self.assertEqual(self.etree.tostring(tree.getroot()), self.html_str) + self.assertEqual(self.etree.tostring(tree.getroot(), method="html"), + self.html_str) finally: os.remove(filename) @@ -223,7 +225,8 @@ parser = self.etree.HTMLParser() f = SillyFileLike(self.html_str) tree = self.etree.parse(f, parser) - html = self.etree.tostring(tree.getroot(), encoding='UTF-8') + html = self.etree.tostring(tree.getroot(), + method="html", encoding='UTF-8') self.assertEqual(html, self.html_str) ## def test_module_parse_html_filelike_unicode(self): @@ -247,7 +250,7 @@ self.etree.set_default_parser( self.etree.HTMLParser() ) tree = self.etree.parse(BytesIO(self.broken_html_str)) - self.assertEqual(self.etree.tostring(tree.getroot()), + self.assertEqual(self.etree.tostring(tree.getroot(), method="html"), self.html_str) self.etree.set_default_parser() From scoder at codespeak.net Thu Oct 16 22:33:43 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 16 Oct 2008 22:33:43 +0200 (CEST) Subject: [Lxml-checkins] r59159 - in lxml/trunk: . src/lxml/tests Message-ID: <20081016203343.713F216A1DE@codespeak.net> Author: scoder Date: Thu Oct 16 22:33:42 2008 New Revision: 59159 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tests/test_etree.py Log: r4809 at delle: sbehnel | 2008-10-16 21:55:04 +0200 test for parser encodings Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Thu Oct 16 22:33:42 2008 @@ -430,6 +430,11 @@ self.assertRaises( LookupError, self.etree.XMLParser, encoding="hopefully unknown") + def test_parser_encoding(self): + self.etree.XMLParser(encoding="ascii") + self.etree.XMLParser(encoding="utf-8") + self.etree.XMLParser(encoding="iso-8859-1") + def test_elementtree_parser_target_type_error(self): assertEquals = self.assertEquals assertFalse = self.assertFalse From scoder at codespeak.net Thu Oct 16 22:33:53 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 16 Oct 2008 22:33:53 +0200 (CEST) Subject: [Lxml-checkins] r59160 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20081016203353.22F94168443@codespeak.net> Author: scoder Date: Thu Oct 16 22:33:51 2008 New Revision: 59160 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/docloader.pxi lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/tests/test_etree.py lxml/trunk/src/lxml/xmlparser.pxd lxml/trunk/src/lxml/xslt.pxi Log: r4810 at delle: sbehnel | 2008-10-16 22:32:57 +0200 fix for Resolver.resolve_string() to make it work with non-ASCII byte strings Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu Oct 16 22:33:51 2008 @@ -15,6 +15,8 @@ Bugs fixed ---------- +* Resolver.resolve_string() did not work for non-ASCII byte strings. + * Overriding the parser encoding didn't work for many encodings. * 0-bytes could slip through the API when used inside of Unicode Modified: lxml/trunk/src/lxml/docloader.pxi ============================================================================== --- lxml/trunk/src/lxml/docloader.pxi (original) +++ lxml/trunk/src/lxml/docloader.pxi Thu Oct 16 22:33:51 2008 @@ -8,7 +8,7 @@ cdef class _InputDocument: cdef _InputDocumentDataType _type - cdef object _data_bytes + cdef object _data cdef object _filename cdef object _file @@ -47,9 +47,12 @@ argument. """ cdef _InputDocument doc_ref + if not python.PyString_Check(string) and \ + not python.PyUnicode_Check(string): + raise TypeError, "argument must be a byte string or unicode string" doc_ref = _InputDocument() doc_ref._type = PARSER_DATA_STRING - doc_ref._data_bytes = _utf8(string) + doc_ref._data = string if base_url is not None: doc_ref._filename = _encodeFilename(base_url) return doc_ref Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Thu Oct 16 22:33:51 2008 @@ -428,15 +428,23 @@ pubid = funicode(c_pubid) # always UTF-8 doc_ref = context._resolvers.resolve(url, pubid, context) + + if doc_ref is not None and doc_ref._type == PARSER_DATA_STRING: + if python.PyUnicode_Check(doc_ref._data): + doc_ref._data = python.PyUnicode_AsUTF8String(doc_ref._data) except: context._store_raised() return NULL if doc_ref is not None: if doc_ref._type == PARSER_DATA_STRING: - data = doc_ref._data_bytes - c_input = xmlparser.xmlNewStringInputStream( - c_context, _cstr(data)) + data = doc_ref._data + c_input = xmlparser.xmlNewInputStream(c_context) + if c_input is not NULL: + c_input.base = _cstr(data) + c_input.length = python.PyString_GET_SIZE(data) + c_input.cur = c_input.base + c_input.end = &c_input.base[c_input.length] elif doc_ref._type == PARSER_DATA_FILENAME: c_input = xmlparser.xmlNewInputFromFile( c_context, _cstr(doc_ref._filename)) Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Thu Oct 16 22:33:51 2008 @@ -672,6 +672,27 @@ root = tree.getroot() self.assertEquals(root.text, test_url) + def test_resolve_bytes_dtd(self): + parse = self.etree.parse + parser = self.etree.XMLParser(dtd_validation=True) + assertEqual = self.assertEqual + test_url = _str("__nosuch.dtd") + + class MyResolver(self.etree.Resolver): + def resolve(self, url, id, context): + assertEqual(url, test_url) + return self.resolve_string( + (_str(''' + ''') % url).encode('utf-8'), + context) + + parser.resolvers.add(MyResolver()) + + xml = _str('&myentity;') % test_url + tree = parse(StringIO(xml), parser) + root = tree.getroot() + self.assertEquals(root.text, test_url) + def test_resolve_filelike_dtd(self): parse = self.etree.parse parser = self.etree.XMLParser(dtd_validation=True) Modified: lxml/trunk/src/lxml/xmlparser.pxd ============================================================================== --- lxml/trunk/src/lxml/xmlparser.pxd (original) +++ lxml/trunk/src/lxml/xmlparser.pxd Thu Oct 16 22:33:51 2008 @@ -47,6 +47,10 @@ cdef extern from "libxml/tree.h": ctypedef struct xmlParserInput: int line + int length + char* base + char* cur + char* end ctypedef struct xmlParserInputBuffer: void* context @@ -183,6 +187,7 @@ int enc) nogil cdef extern from "libxml/parserInternals.h": + cdef xmlParserInput* xmlNewInputStream(xmlParserCtxt* ctxt) cdef xmlParserInput* xmlNewStringInputStream(xmlParserCtxt* ctxt, char* buffer) nogil cdef xmlParserInput* xmlNewInputFromFile(xmlParserCtxt* ctxt, Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Thu Oct 16 22:33:51 2008 @@ -93,7 +93,7 @@ if doc_ref is not None: if doc_ref._type == PARSER_DATA_STRING: c_doc = _parseDoc( - doc_ref._data_bytes, doc_ref._filename, context._parser) + doc_ref._data, doc_ref._filename, context._parser) elif doc_ref._type == PARSER_DATA_FILENAME: c_doc = _parseDocFromFile(doc_ref._filename, context._parser) elif doc_ref._type == PARSER_DATA_FILE: From scoder at codespeak.net Thu Oct 16 22:41:51 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 16 Oct 2008 22:41:51 +0200 (CEST) Subject: [Lxml-checkins] r59161 - in lxml/trunk: . src/lxml Message-ID: <20081016204151.EE38516A132@codespeak.net> Author: scoder Date: Thu Oct 16 22:41:51 2008 New Revision: 59161 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/docloader.pxi lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/xslt.pxi Log: r4814 at delle: sbehnel | 2008-10-16 22:41:13 +0200 cleanup and simplification Modified: lxml/trunk/src/lxml/docloader.pxi ============================================================================== --- lxml/trunk/src/lxml/docloader.pxi (original) +++ lxml/trunk/src/lxml/docloader.pxi Thu Oct 16 22:41:51 2008 @@ -8,7 +8,7 @@ cdef class _InputDocument: cdef _InputDocumentDataType _type - cdef object _data + cdef object _data_bytes cdef object _filename cdef object _file @@ -47,12 +47,13 @@ argument. """ cdef _InputDocument doc_ref - if not python.PyString_Check(string) and \ - not python.PyUnicode_Check(string): + if python.PyUnicode_Check(string): + string = python.PyUnicode_AsUTF8String(string) + elif not python.PyString_Check(string): raise TypeError, "argument must be a byte string or unicode string" doc_ref = _InputDocument() doc_ref._type = PARSER_DATA_STRING - doc_ref._data = string + doc_ref._data_bytes = string if base_url is not None: doc_ref._filename = _encodeFilename(base_url) return doc_ref Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Thu Oct 16 22:41:51 2008 @@ -428,17 +428,13 @@ pubid = funicode(c_pubid) # always UTF-8 doc_ref = context._resolvers.resolve(url, pubid, context) - - if doc_ref is not None and doc_ref._type == PARSER_DATA_STRING: - if python.PyUnicode_Check(doc_ref._data): - doc_ref._data = python.PyUnicode_AsUTF8String(doc_ref._data) except: context._store_raised() return NULL if doc_ref is not None: if doc_ref._type == PARSER_DATA_STRING: - data = doc_ref._data + data = doc_ref._data_bytes c_input = xmlparser.xmlNewInputStream(c_context) if c_input is not NULL: c_input.base = _cstr(data) Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Thu Oct 16 22:41:51 2008 @@ -93,7 +93,7 @@ if doc_ref is not None: if doc_ref._type == PARSER_DATA_STRING: c_doc = _parseDoc( - doc_ref._data, doc_ref._filename, context._parser) + doc_ref._data_bytes, doc_ref._filename, context._parser) elif doc_ref._type == PARSER_DATA_FILENAME: c_doc = _parseDocFromFile(doc_ref._filename, context._parser) elif doc_ref._type == PARSER_DATA_FILE: From scoder at codespeak.net Sat Oct 18 21:35:40 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 18 Oct 2008 21:35:40 +0200 (CEST) Subject: [Lxml-checkins] r59219 - in lxml/trunk: . doc Message-ID: <20081018193540.A983716A097@codespeak.net> Author: scoder Date: Sat Oct 18 21:35:37 2008 New Revision: 59219 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/parsing.txt Log: r4816 at delle: sbehnel | 2008-10-18 21:29:48 +0200 test fix Modified: lxml/trunk/doc/parsing.txt ============================================================================== --- lxml/trunk/doc/parsing.txt (original) +++ lxml/trunk/doc/parsing.txt Sat Oct 18 21:35:37 2008 @@ -31,6 +31,8 @@ >>> from lxml import etree .. + >>> from lxml import usedoctest + >>> try: from StringIO import StringIO ... except ImportError: ... from io import BytesIO From scoder at codespeak.net Sat Oct 18 21:35:44 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 18 Oct 2008 21:35:44 +0200 (CEST) Subject: [Lxml-checkins] r59220 - in lxml/trunk: . samples Message-ID: <20081018193544.EB01F16A09B@codespeak.net> Author: scoder Date: Sat Oct 18 21:35:44 2008 New Revision: 59220 Modified: lxml/trunk/ (props changed) lxml/trunk/samples/simple-ns.xml lxml/trunk/selftest.py lxml/trunk/selftest2.py Log: r4817 at delle: sbehnel | 2008-10-18 21:34:58 +0200 test fixes for libxml2 2.7 Modified: lxml/trunk/samples/simple-ns.xml ============================================================================== --- lxml/trunk/samples/simple-ns.xml (original) +++ lxml/trunk/samples/simple-ns.xml Sat Oct 18 21:35:44 2008 @@ -1,4 +1,4 @@ - + text texttail Modified: lxml/trunk/selftest.py ============================================================================== --- lxml/trunk/selftest.py (original) +++ lxml/trunk/selftest.py Sat Oct 18 21:35:44 2008 @@ -315,13 +315,13 @@ >>> tree = ElementTree.parse("samples/simple-ns.xml") >>> normalize_crlf(tree) >>> tree.write(sys.stdout) - + text texttail -## +## ## text ## texttail ## @@ -412,10 +412,10 @@ >>> context = iterparse("samples/simple-ns.xml") >>> for action, elem in context: ... print("%s %s" % (action, elem.tag)) - end {namespace}element - end {namespace}element - end {namespace}empty-element - end {namespace}root + end {http://namespace/}element + end {http://namespace/}element + end {http://namespace/}empty-element + end {http://namespace/}root >>> events = () >>> context = iterparse("samples/simple.xml", events) @@ -447,15 +447,15 @@ ... print("%s %s" % (action, elem.tag)) ... else: ... print("%s %s" % (action, elem)) - start-ns ('', 'namespace') - start {namespace}root - start {namespace}element - end {namespace}element - start {namespace}element - end {namespace}element - start {namespace}empty-element - end {namespace}empty-element - end {namespace}root + start-ns ('', 'http://namespace/') + start {http://namespace/}root + start {http://namespace/}element + end {http://namespace/}element + start {http://namespace/}element + end {http://namespace/}element + start {http://namespace/}empty-element + end {http://namespace/}empty-element + end {http://namespace/}root end-ns None """ Modified: lxml/trunk/selftest2.py ============================================================================== --- lxml/trunk/selftest2.py (original) +++ lxml/trunk/selftest2.py Sat Oct 18 21:35:44 2008 @@ -126,7 +126,7 @@ >>> tree = ElementTree.parse(open("samples/simple-ns.xml", "rb")) >>> tree.write(sys.stdout) - + text texttail From scoder at codespeak.net Sun Oct 19 16:43:49 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 19 Oct 2008 16:43:49 +0200 (CEST) Subject: [Lxml-checkins] r59235 - lxml/branch/lxml-2.1/src/lxml Message-ID: <20081019144349.19C6C16A175@codespeak.net> Author: scoder Date: Sun Oct 19 16:43:48 2008 New Revision: 59235 Modified: lxml/branch/lxml-2.1/src/lxml/apihelpers.pxi Log: Py2.6 compat fix Modified: lxml/branch/lxml-2.1/src/lxml/apihelpers.pxi ============================================================================== --- lxml/branch/lxml-2.1/src/lxml/apihelpers.pxi (original) +++ lxml/branch/lxml-2.1/src/lxml/apihelpers.pxi Sun Oct 19 16:43:48 2008 @@ -1351,19 +1351,16 @@ Returns None if not a file object. """ # file instances have a name attribute - if isinstance(source, file): - return os_path_abspath(source.name) + filename = getattr3(source, u'name', None) + if filename is not None: + return os_path_abspath(filename) # urllib2 provides a geturl() method geturl = getattr3(source, u'geturl', None) if geturl is not None: return geturl() - # gzip file instances have a filename attribute + # gzip file instances have a filename attribute (before Py3k) filename = getattr3(source, u'filename', None) if filename is not None: return os_path_abspath(filename) - # this is mostly for backwards compatibility - filename = getattr3(source, u'name', None) - if filename is not None: - return os_path_abspath(filename) # can't determine filename return None From scoder at codespeak.net Sun Oct 19 16:44:16 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 19 Oct 2008 16:44:16 +0200 (CEST) Subject: [Lxml-checkins] r59236 - in lxml/branch/lxml-2.1: . doc samples src/lxml src/lxml/tests Message-ID: <20081019144416.9E63A16A175@codespeak.net> Author: scoder Date: Sun Oct 19 16:44:15 2008 New Revision: 59236 Modified: lxml/branch/lxml-2.1/CHANGES.txt lxml/branch/lxml-2.1/doc/parsing.txt lxml/branch/lxml-2.1/samples/simple-ns.xml lxml/branch/lxml-2.1/selftest.py lxml/branch/lxml-2.1/selftest2.py lxml/branch/lxml-2.1/src/lxml/docloader.pxi lxml/branch/lxml-2.1/src/lxml/iterparse.pxi lxml/branch/lxml-2.1/src/lxml/parser.pxi lxml/branch/lxml-2.1/src/lxml/tests/test_elementtree.py lxml/branch/lxml-2.1/src/lxml/tests/test_etree.py lxml/branch/lxml-2.1/src/lxml/tests/test_htmlparser.py lxml/branch/lxml-2.1/src/lxml/xmlparser.pxd Log: bug fixes from trunk Modified: lxml/branch/lxml-2.1/CHANGES.txt ============================================================================== --- lxml/branch/lxml-2.1/CHANGES.txt (original) +++ lxml/branch/lxml-2.1/CHANGES.txt Sun Oct 19 16:44:15 2008 @@ -2,6 +2,27 @@ lxml changelog ============== +Under development +================= + +Features added +-------------- + +Bugs fixed +---------- + +* Test suite fixes for libxml2 2.7. + +* Resolver.resolve_string() did not work for non-ASCII byte strings. + +* Resolver.resolve_file() was broken. + +* Overriding the parser encoding didn't work for many encodings. + +Other changes +------------- + + 2.1.2 (2008-09-05) ================== Modified: lxml/branch/lxml-2.1/doc/parsing.txt ============================================================================== --- lxml/branch/lxml-2.1/doc/parsing.txt (original) +++ lxml/branch/lxml-2.1/doc/parsing.txt Sun Oct 19 16:44:15 2008 @@ -31,6 +31,8 @@ >>> from lxml import etree .. + >>> from lxml import usedoctest + >>> try: from StringIO import StringIO ... except ImportError: ... from io import BytesIO @@ -189,7 +191,8 @@ >>> parser = etree.HTMLParser() >>> tree = etree.parse(StringIO(broken_html), parser) - >>> result = etree.tostring(tree.getroot(), pretty_print=True) + >>> result = etree.tostring(tree.getroot(), + ... pretty_print=True, method="html") >>> print(result) @@ -206,7 +209,7 @@ .. sourcecode:: pycon >>> html = etree.HTML(broken_html) - >>> result = etree.tostring(html, pretty_print=True) + >>> result = etree.tostring(html, pretty_print=True, method="html") >>> print(result) @@ -405,7 +408,7 @@ ... ... text ... texttail - ... + ... ... ... ''' @@ -414,7 +417,7 @@ ... print("%s: %s" % (action, elem.tag)) end: element end: element - end: {testns}empty-element + end: {http://testns/}empty-element end: root The resulting tree is available through the ``root`` property of the iterator: @@ -437,8 +440,8 @@ end: element start: element end: element - start: {testns}empty-element - end: {testns}empty-element + start: {http://testns/}empty-element + end: {http://testns/}empty-element end: root The 'start-ns' and 'end-ns' events notify about namespace @@ -455,7 +458,7 @@ text texttail - + >>> events = ("start", "end", "start-ns", "end-ns") @@ -472,9 +475,9 @@ end: element start: element end: element - start-ns: ('', 'testns') - start: {testns}empty-element - end: {testns}empty-element + start-ns: ('', 'http://testns/') + start: {http://testns/}empty-element + end: {http://testns/}empty-element end-ns end: root @@ -496,11 +499,11 @@ >>> events = ("start", "end") >>> context = etree.iterparse( - ... StringIO(xml), events=events, tag="{testns}*") + ... StringIO(xml), events=events, tag="{http://testns/}*") >>> for action, elem in context: ... print("%s: %s" % (action, elem.tag)) - start: {testns}empty-element - end: {testns}empty-element + start: {http://testns/}empty-element + end: {http://testns/}empty-element Comments and PIs @@ -519,7 +522,7 @@ ... text ... ... texttail - ... + ... ... ... ''' @@ -540,8 +543,8 @@ comment: - another comment - start: element end: element - start: {testns}empty-element - end: {testns}empty-element + start: {http://testns/}empty-element + end: {http://testns/}empty-element end: root >>> print(context.root.tag) Modified: lxml/branch/lxml-2.1/samples/simple-ns.xml ============================================================================== --- lxml/branch/lxml-2.1/samples/simple-ns.xml (original) +++ lxml/branch/lxml-2.1/samples/simple-ns.xml Sun Oct 19 16:44:15 2008 @@ -1,4 +1,4 @@ - + text texttail Modified: lxml/branch/lxml-2.1/selftest.py ============================================================================== --- lxml/branch/lxml-2.1/selftest.py (original) +++ lxml/branch/lxml-2.1/selftest.py Sun Oct 19 16:44:15 2008 @@ -315,13 +315,13 @@ >>> tree = ElementTree.parse("samples/simple-ns.xml") >>> normalize_crlf(tree) >>> tree.write(sys.stdout) - + text texttail -## +## ## text ## texttail ## @@ -412,10 +412,10 @@ >>> context = iterparse("samples/simple-ns.xml") >>> for action, elem in context: ... print("%s %s" % (action, elem.tag)) - end {namespace}element - end {namespace}element - end {namespace}empty-element - end {namespace}root + end {http://namespace/}element + end {http://namespace/}element + end {http://namespace/}empty-element + end {http://namespace/}root >>> events = () >>> context = iterparse("samples/simple.xml", events) @@ -447,15 +447,15 @@ ... print("%s %s" % (action, elem.tag)) ... else: ... print("%s %s" % (action, elem)) - start-ns ('', 'namespace') - start {namespace}root - start {namespace}element - end {namespace}element - start {namespace}element - end {namespace}element - start {namespace}empty-element - end {namespace}empty-element - end {namespace}root + start-ns ('', 'http://namespace/') + start {http://namespace/}root + start {http://namespace/}element + end {http://namespace/}element + start {http://namespace/}element + end {http://namespace/}element + start {http://namespace/}empty-element + end {http://namespace/}empty-element + end {http://namespace/}root end-ns None """ Modified: lxml/branch/lxml-2.1/selftest2.py ============================================================================== --- lxml/branch/lxml-2.1/selftest2.py (original) +++ lxml/branch/lxml-2.1/selftest2.py Sun Oct 19 16:44:15 2008 @@ -126,7 +126,7 @@ >>> tree = ElementTree.parse(open("samples/simple-ns.xml", "rb")) >>> tree.write(sys.stdout) - + text texttail Modified: lxml/branch/lxml-2.1/src/lxml/docloader.pxi ============================================================================== --- lxml/branch/lxml-2.1/src/lxml/docloader.pxi (original) +++ lxml/branch/lxml-2.1/src/lxml/docloader.pxi Sun Oct 19 16:44:15 2008 @@ -47,9 +47,13 @@ You can pass the source URL as 'base_url' keyword. """ cdef _InputDocument doc_ref + if python.PyUnicode_Check(string): + string = python.PyUnicode_AsUTF8String(string) + elif not python.PyString_Check(string): + raise TypeError, "argument must be a byte string or unicode string" doc_ref = _InputDocument() doc_ref._type = PARSER_DATA_STRING - doc_ref._data_bytes = _utf8(string) + doc_ref._data_bytes = string if base_url is not None: doc_ref._filename = _encodeFilename(base_url) return doc_ref Modified: lxml/branch/lxml-2.1/src/lxml/iterparse.pxi ============================================================================== --- lxml/branch/lxml-2.1/src/lxml/iterparse.pxi (original) +++ lxml/branch/lxml-2.1/src/lxml/iterparse.pxi Sun Oct 19 16:44:15 2008 @@ -415,6 +415,19 @@ context = <_IterparseContext>self._getPushParserContext() __GLOBAL_PARSER_CONTEXT.initParserDict(context._c_ctxt) + + if self._default_encoding is not None: + if self._for_html: + error = _htmlCtxtResetPush( + context._c_ctxt, NULL, 0, + _cstr(self._default_encoding), self._parse_options) + else: + xmlparser.xmlCtxtUseOptions( + context._c_ctxt, self._parse_options) + error = xmlparser.xmlCtxtResetPush( + context._c_ctxt, NULL, 0, NULL, + _cstr(self._default_encoding)) + context.prepare() # parser will not be unlocked - no other methods supported Modified: lxml/branch/lxml-2.1/src/lxml/parser.pxi ============================================================================== --- lxml/branch/lxml-2.1/src/lxml/parser.pxi (original) +++ lxml/branch/lxml-2.1/src/lxml/parser.pxi Sun Oct 19 16:44:15 2008 @@ -435,13 +435,17 @@ if doc_ref is not None: if doc_ref._type == PARSER_DATA_STRING: data = doc_ref._data_bytes - c_input = xmlparser.xmlNewStringInputStream( - c_context, _cstr(data)) + c_input = xmlparser.xmlNewInputStream(c_context) + if c_input is not NULL: + c_input.base = _cstr(data) + c_input.length = python.PyString_GET_SIZE(data) + c_input.cur = c_input.base + c_input.end = &c_input.base[c_input.length] elif doc_ref._type == PARSER_DATA_FILENAME: c_input = xmlparser.xmlNewInputFromFile( c_context, _cstr(doc_ref._filename)) elif doc_ref._type == PARSER_DATA_FILE: - file_context = _FileReaderContext(doc_ref._file, context, url) + file_context = _FileReaderContext(doc_ref._file, context, url, None) c_input = file_context._createParserInput(c_context) data = file_context else: @@ -678,11 +682,11 @@ cdef object _filename cdef object _target cdef object _default_encoding - cdef int _default_encoding_int def __init__(self, int parse_options, bint for_html, XMLSchema schema, remove_comments, remove_pis, strip_cdata, target, filename, encoding): + cdef tree.xmlCharEncodingHandler* enchandler cdef int c_encoding if not isinstance(self, HTMLParser) and \ not isinstance(self, XMLParser) and \ @@ -702,15 +706,13 @@ if encoding is None: self._default_encoding = None - self._default_encoding_int = tree.XML_CHAR_ENCODING_NONE else: encoding = _utf8(encoding) - c_encoding = tree.xmlParseCharEncoding(_cstr(encoding)) - if c_encoding == tree.XML_CHAR_ENCODING_ERROR or \ - c_encoding == tree.XML_CHAR_ENCODING_NONE: + enchandler = tree.xmlFindCharEncodingHandler(_cstr(encoding)) + if enchandler is NULL: raise LookupError, u"unknown encoding: '%s'" % encoding + tree.xmlCharEncCloseFunc(enchandler) self._default_encoding = encoding - self._default_encoding_int = c_encoding cdef _ParserContext _getParserContext(self): cdef xmlparser.xmlParserCtxt* pctxt @@ -776,7 +778,7 @@ c_filename = NULL if self._for_html: c_ctxt = htmlparser.htmlCreatePushParserCtxt( - NULL, NULL, NULL, 0, c_filename, self._default_encoding_int) + NULL, NULL, NULL, 0, c_filename, tree.XML_CHAR_ENCODING_NONE) if c_ctxt is not NULL: htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options) else: @@ -784,9 +786,6 @@ NULL, NULL, NULL, 0, c_filename) if c_ctxt is not NULL: xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options) - if self._default_encoding_int != tree.XML_CHAR_ENCODING_NONE: - xmlparser.xmlSwitchEncoding( - c_ctxt, self._default_encoding_int) return c_ctxt property error_log: Modified: lxml/branch/lxml-2.1/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/branch/lxml-2.1/src/lxml/tests/test_elementtree.py (original) +++ lxml/branch/lxml-2.1/src/lxml/tests/test_elementtree.py Sun Oct 19 16:44:15 2008 @@ -2650,15 +2650,15 @@ def test_iterparse_attrib_ns(self): iterparse = self.etree.iterparse - f = BytesIO('') + f = BytesIO('') - attr_name = '{testns}bla' + attr_name = '{http://testns/}bla' events = [] iterator = iterparse(f, events=('start','end','start-ns','end-ns')) for event, elem in iterator: events.append(event) if event == 'start': - if elem.tag != '{ns1}a': + if elem.tag != '{http://ns1/}a': elem.set(attr_name, 'value') self.assertEquals( Modified: lxml/branch/lxml-2.1/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/lxml-2.1/src/lxml/tests/test_etree.py (original) +++ lxml/branch/lxml-2.1/src/lxml/tests/test_etree.py Sun Oct 19 16:44:15 2008 @@ -14,7 +14,7 @@ sys.path.insert(0, this_dir) # needed for Py3 from common_imports import etree, StringIO, BytesIO, HelperTestCase, fileInTestDir -from common_imports import LargeFileLikeUnicode, doctest, make_doctest +from common_imports import SillyFileLike, LargeFileLikeUnicode, doctest, make_doctest from common_imports import canonicalize, sorted, _str, _bytes print("") @@ -430,6 +430,11 @@ self.assertRaises( LookupError, self.etree.XMLParser, encoding="hopefully unknown") + def test_parser_encoding(self): + self.etree.XMLParser(encoding="ascii") + self.etree.XMLParser(encoding="utf-8") + self.etree.XMLParser(encoding="iso-8859-1") + def test_elementtree_parser_target_type_error(self): assertEquals = self.assertEquals assertFalse = self.assertFalse @@ -667,6 +672,48 @@ root = tree.getroot() self.assertEquals(root.text, test_url) + def test_resolve_bytes_dtd(self): + parse = self.etree.parse + parser = self.etree.XMLParser(dtd_validation=True) + assertEqual = self.assertEqual + test_url = _str("__nosuch.dtd") + + class MyResolver(self.etree.Resolver): + def resolve(self, url, id, context): + assertEqual(url, test_url) + return self.resolve_string( + (_str(''' + ''') % url).encode('utf-8'), + context) + + parser.resolvers.add(MyResolver()) + + xml = _str('&myentity;') % test_url + tree = parse(StringIO(xml), parser) + root = tree.getroot() + self.assertEquals(root.text, test_url) + + def test_resolve_filelike_dtd(self): + parse = self.etree.parse + parser = self.etree.XMLParser(dtd_validation=True) + assertEqual = self.assertEqual + test_url = _str("__nosuch.dtd") + + class MyResolver(self.etree.Resolver): + def resolve(self, url, id, context): + assertEqual(url, test_url) + return self.resolve_file( + SillyFileLike( + _str(''' + ''') % url), context) + + parser.resolvers.add(MyResolver()) + + xml = _str('&myentity;') % test_url + tree = parse(StringIO(xml), parser) + root = tree.getroot() + self.assertEquals(root.text, test_url) + def test_resolve_filename_dtd(self): parse = self.etree.parse parser = self.etree.XMLParser(attribute_defaults=True) @@ -712,6 +759,28 @@ self.assertEquals( root[0].attrib, {'default': 'valueB'}) + def test_resolve_file_dtd(self): + parse = self.etree.parse + parser = self.etree.XMLParser(attribute_defaults=True) + assertEqual = self.assertEqual + test_url = _str("__nosuch.dtd") + + class MyResolver(self.etree.Resolver): + def resolve(self, url, id, context): + assertEqual(url, test_url) + return self.resolve_file( + open(fileInTestDir('test.dtd'), 'rb'), context) + + parser.resolvers.add(MyResolver()) + + xml = _str('') % test_url + tree = parse(StringIO(xml), parser) + root = tree.getroot() + self.assertEquals( + root.attrib, {'default': 'valueA'}) + self.assertEquals( + root[0].attrib, {'default': 'valueB'}) + def test_resolve_empty(self): parse = self.etree.parse parser = self.etree.XMLParser(load_dtd=True) Modified: lxml/branch/lxml-2.1/src/lxml/tests/test_htmlparser.py ============================================================================== --- lxml/branch/lxml-2.1/src/lxml/tests/test_htmlparser.py (original) +++ lxml/branch/lxml-2.1/src/lxml/tests/test_htmlparser.py Sun Oct 19 16:44:15 2008 @@ -40,12 +40,13 @@ def test_module_HTML(self): element = self.etree.HTML(self.html_str) - self.assertEqual(self.etree.tostring(element), + self.assertEqual(self.etree.tostring(element, method="html"), self.html_str) def test_module_HTML_unicode(self): element = self.etree.HTML(self.uhtml_str) - self.assertEqual(unicode(self.etree.tostring(element, encoding='UTF8'), 'UTF8'), + self.assertEqual(unicode(self.etree.tostring(element, method="html", + encoding='UTF8'), 'UTF8'), unicode(self.uhtml_str.encode('UTF8'), 'UTF8')) def test_module_HTML_pretty_print(self): @@ -194,7 +195,7 @@ def test_module_HTML_broken(self): element = self.etree.HTML(self.broken_html_str) - self.assertEqual(self.etree.tostring(element), + self.assertEqual(self.etree.tostring(element, method="html"), self.html_str) def test_module_HTML_cdata(self): @@ -215,7 +216,8 @@ f = open(filename, 'rb') tree = self.etree.parse(f, parser) f.close() - self.assertEqual(self.etree.tostring(tree.getroot()), self.html_str) + self.assertEqual(self.etree.tostring(tree.getroot(), method="html"), + self.html_str) finally: os.remove(filename) @@ -223,7 +225,8 @@ parser = self.etree.HTMLParser() f = SillyFileLike(self.html_str) tree = self.etree.parse(f, parser) - html = self.etree.tostring(tree.getroot(), encoding='UTF-8') + html = self.etree.tostring(tree.getroot(), + method="html", encoding='UTF-8') self.assertEqual(html, self.html_str) ## def test_module_parse_html_filelike_unicode(self): @@ -247,7 +250,7 @@ self.etree.set_default_parser( self.etree.HTMLParser() ) tree = self.etree.parse(BytesIO(self.broken_html_str)) - self.assertEqual(self.etree.tostring(tree.getroot()), + self.assertEqual(self.etree.tostring(tree.getroot(), method="html"), self.html_str) self.etree.set_default_parser() Modified: lxml/branch/lxml-2.1/src/lxml/xmlparser.pxd ============================================================================== --- lxml/branch/lxml-2.1/src/lxml/xmlparser.pxd (original) +++ lxml/branch/lxml-2.1/src/lxml/xmlparser.pxd Sun Oct 19 16:44:15 2008 @@ -47,6 +47,10 @@ cdef extern from "libxml/tree.h": ctypedef struct xmlParserInput: int line + int length + char* base + char* cur + char* end ctypedef struct xmlParserInputBuffer: void* context @@ -183,6 +187,7 @@ int enc) nogil cdef extern from "libxml/parserInternals.h": + cdef xmlParserInput* xmlNewInputStream(xmlParserCtxt* ctxt) cdef xmlParserInput* xmlNewStringInputStream(xmlParserCtxt* ctxt, char* buffer) nogil cdef xmlParserInput* xmlNewInputFromFile(xmlParserCtxt* ctxt, From lxml-checkins at codespeak.net Fri Oct 24 07:22:57 2008 From: lxml-checkins at codespeak.net (lxml-checkins at codespeak.net) Date: Fri, 24 Oct 2008 07:22:57 +0200 (CEST) Subject: VIAGRA ® Official Site Message-ID: <20081024182228.70828.qmail@cc35x2.sels.ru> An HTML attachment was scrubbed... URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20081024/aabf3e1f/attachment.htm From scoder at codespeak.net Sat Oct 25 00:05:18 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 25 Oct 2008 00:05:18 +0200 (CEST) Subject: [Lxml-checkins] r59387 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20081024220518.E4A6F16A27F@codespeak.net> Author: scoder Date: Sat Oct 25 00:05:18 2008 New Revision: 59387 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/lxml.etree.pyx lxml/trunk/src/lxml/tests/test_etree.py Log: r4820 at delle: sbehnel | 2008-10-25 00:04:37 +0200 provide new properties 'namespace' and 'localname' on QName objects Modified: lxml/trunk/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.etree.pyx (original) +++ lxml/trunk/src/lxml/lxml.etree.pyx Sat Oct 25 00:05:18 2008 @@ -236,27 +236,51 @@ cdef class QName: u"""QName(text_or_uri, tag=None) - QName wrapper. + QName wrapper for qualified XML names. Pass a tag name by itself or a namespace URI and a tag name to - create a qualified name. The ``text`` property holds the - qualified name in ``{namespace}tagname`` notation. + create a qualified name. + + The ``text`` property holds the qualified name in + ``{namespace}tagname`` notation. The ``namespace`` and + ``localname`` properties hold the respective parts of the tag + name. You can pass QName objects wherever a tag name is expected. Also, setting Element text from a QName will resolve the namespace prefix and set a qualified text value. """ cdef readonly object text - def __init__(self, text_or_uri, tag=None): + cdef readonly object localname + cdef readonly object namespace + def __init__(self, text_or_uri_or_element, tag=None): + if not _isString(text_or_uri_or_element): + if isinstance(text_or_uri_or_element, _Element): + text_or_uri_or_element = (<_Element>text_or_uri_or_element).tag + if not _isString(text_or_uri_or_element): + raise ValueError, ("Invalid input tag of type %r" % + type(text_or_uri_or_element)) + elif isinstance(text_or_uri_or_element, QName): + text_or_uri_or_element = (text_or_uri_or_element).text + else: + text_or_uri_or_element = unicode(text_or_uri_or_element) + + ns_utf, tag_utf = _getNsTag(text_or_uri_or_element) if tag is not None: - _tagValidOrRaise(_utf8(tag)) - text_or_uri = u"{%s}%s" % (text_or_uri, tag) - else: - if not _isString(text_or_uri): - text_or_uri = unicode(text_or_uri) - tag = _getNsTag(text_or_uri)[1] - _tagValidOrRaise(tag) - self.text = text_or_uri + # either ('ns', 'tag') or ('{ns}oldtag', 'newtag') + if ns_utf is None: + ns_utf = tag_utf # case 1: namespace ended up as tag name + tag_utf = _utf8(tag) + _tagValidOrRaise(tag_utf) + self.localname = python.PyUnicode_FromEncodedObject( + tag_utf, 'UTF-8', NULL) + if ns_utf is None: + self.namespace = None + self.text = self.localname + else: + self.namespace = python.PyUnicode_FromEncodedObject( + ns_utf, 'UTF-8', NULL) + self.text = u"{%s}%s" % (self.namespace, self.localname) def __str__(self): return self.text def __hash__(self): Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Sat Oct 25 00:05:18 2008 @@ -154,6 +154,25 @@ self.assertRaises(ValueError, QName, 'na me') self.assertRaises(ValueError, QName, 'test', ' name') + def test_qname_namespace_localname(self): + # ET doesn't have namespace/localname properties on QNames + QName = self.etree.QName + namespace, localname = 'http://myns', 'a' + qname = QName(namespace, localname) + self.assertEquals(namespace, qname.namespace) + self.assertEquals(localname, qname.localname) + + def test_qname_element(self): + # ET doesn't have namespace/localname properties on QNames + QName = self.etree.QName + qname1 = QName('http://myns', 'a') + a = self.etree.Element(qname1, nsmap={'p' : 'http://myns'}) + + qname2 = QName(a) + self.assertEquals(a.tag, qname1.text) + self.assertEquals(qname1.text, qname2.text) + self.assertEquals(qname1, qname2) + def test_qname_text_resolve(self): # ET doesn't resove QNames as text values etree = self.etree From scoder at codespeak.net Sat Oct 25 00:05:36 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 25 Oct 2008 00:05:36 +0200 (CEST) Subject: [Lxml-checkins] r59388 - lxml/trunk Message-ID: <20081024220536.CD95716A27F@codespeak.net> Author: scoder Date: Sat Oct 25 00:05:36 2008 New Revision: 59388 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt Log: r4824 at delle: sbehnel | 2008-10-25 00:04:57 +0200 changelog Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Oct 25 00:05:36 2008 @@ -8,6 +8,8 @@ Features added -------------- +* QName objects have new properties ``namespace`` and ``localname``. + * New options for exclusive C14N and C14N without comments. * Instantiating a custom Element classes creates a new Element. @@ -15,6 +17,10 @@ Bugs fixed ---------- +* Fix a pre-Py3k warning when parsing from a gzip file in Py2.6. + +* Test suite fixes for libxml2 2.7. + * Resolver.resolve_string() did not work for non-ASCII byte strings. * Overriding the parser encoding didn't work for many encodings. From scoder at codespeak.net Sat Oct 25 20:26:56 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 25 Oct 2008 20:26:56 +0200 (CEST) Subject: [Lxml-checkins] r59408 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20081025182656.C295E16A296@codespeak.net> Author: scoder Date: Sat Oct 25 20:26:56 2008 New Revision: 59408 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/lxml.etree.pyx lxml/trunk/src/lxml/serializer.pxi lxml/trunk/src/lxml/tests/test_etree.py Log: r4826 at delle: sbehnel | 2008-10-25 20:19:53 +0200 fixed doctype serialisation for internal subsets without public/system ID Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Oct 25 20:26:56 2008 @@ -17,6 +17,10 @@ Bugs fixed ---------- +* Internal DTD subsets that did not specify a system or public ID were + not serialised and did not appear in the docinfo property of + ElementTrees. + * Fix a pre-Py3k warning when parsing from a gzip file in Py2.6. * Test suite fixes for libxml2 2.7. Modified: lxml/trunk/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.etree.pyx (original) +++ lxml/trunk/src/lxml/lxml.etree.pyx Sat Oct 25 20:26:56 2008 @@ -328,6 +328,9 @@ return None return _elementFactory(self, c_node) + cdef bint hasdoctype(self): + return self._c_doc.intSubset is not NULL + cdef getdoctype(self): cdef tree.xmlDtd* c_dtd cdef xmlNode* c_root_node @@ -520,6 +523,8 @@ elif system_url: return u'' % ( root_name, system_url) + elif self._doc.hasdoctype(): + return u'' % root_name else: return u"" Modified: lxml/trunk/src/lxml/serializer.pxi ============================================================================== --- lxml/trunk/src/lxml/serializer.pxi (original) +++ lxml/trunk/src/lxml/serializer.pxi Sat Oct 25 20:26:56 2008 @@ -189,9 +189,7 @@ cdef tree.xmlDtd* c_dtd cdef xmlNode* c_node c_dtd = c_doc.intSubset - if c_dtd == NULL or c_dtd.name == NULL: - return - if c_dtd.ExternalID == NULL and c_dtd.SystemID == NULL: + if c_dtd is NULL or c_dtd.name is NULL: return if cstd.strcmp(c_root_name, c_dtd.name) != 0: return Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Sat Oct 25 20:26:56 2008 @@ -2097,6 +2097,24 @@ self.assertEquals(docinfo.root_name, 'html') self.assertEquals(docinfo.doctype, '') + def test_docinfo_name_only(self): + etree = self.etree + xml = _bytes('') + tree = etree.parse(BytesIO(xml)) + docinfo = tree.docinfo + self.assertEquals(docinfo.encoding, "UTF-8") + self.assertEquals(docinfo.xml_version, "1.0") + self.assertEquals(docinfo.public_id, None) + self.assertEquals(docinfo.system_url, None) + self.assertEquals(docinfo.root_name, 'root') + self.assertEquals(docinfo.doctype, '') + + def test_doctype_name_only_roundtrip(self): + etree = self.etree + xml = _bytes('\n') + tree = etree.parse(BytesIO(xml)) + self.assertEquals(xml, etree.tostring(tree)) + def test_xml_base(self): etree = self.etree root = etree.XML(_bytes(""), base_url="http://no/such/url") From scoder at codespeak.net Sat Oct 25 20:41:50 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 25 Oct 2008 20:41:50 +0200 (CEST) Subject: [Lxml-checkins] r59410 - in lxml/branch/lxml-2.1/src/lxml: . tests Message-ID: <20081025184150.6856A16A296@codespeak.net> Author: scoder Date: Sat Oct 25 20:41:49 2008 New Revision: 59410 Modified: lxml/branch/lxml-2.1/src/lxml/lxml.etree.pyx lxml/branch/lxml-2.1/src/lxml/serializer.pxi lxml/branch/lxml-2.1/src/lxml/tests/test_etree.py Log: trunk merge of dtd serialiser fix Modified: lxml/branch/lxml-2.1/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/branch/lxml-2.1/src/lxml/lxml.etree.pyx (original) +++ lxml/branch/lxml-2.1/src/lxml/lxml.etree.pyx Sat Oct 25 20:41:49 2008 @@ -306,6 +306,9 @@ return None return _elementFactory(self, c_node) + cdef bint hasdoctype(self): + return self._c_doc.intSubset is not NULL + cdef getdoctype(self): cdef tree.xmlDtd* c_dtd cdef xmlNode* c_root_node @@ -498,6 +501,8 @@ elif system_url: return u'' % ( root_name, system_url) + elif self._doc.hasdoctype(): + return u'' % root_name else: return u"" Modified: lxml/branch/lxml-2.1/src/lxml/serializer.pxi ============================================================================== --- lxml/branch/lxml-2.1/src/lxml/serializer.pxi (original) +++ lxml/branch/lxml-2.1/src/lxml/serializer.pxi Sat Oct 25 20:41:49 2008 @@ -186,9 +186,7 @@ cdef tree.xmlDtd* c_dtd cdef xmlNode* c_node c_dtd = c_doc.intSubset - if c_dtd == NULL or c_dtd.name == NULL: - return - if c_dtd.ExternalID == NULL and c_dtd.SystemID == NULL: + if c_dtd is NULL or c_dtd.name is NULL: return if cstd.strcmp(c_root_name, c_dtd.name) != 0: return Modified: lxml/branch/lxml-2.1/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/lxml-2.1/src/lxml/tests/test_etree.py (original) +++ lxml/branch/lxml-2.1/src/lxml/tests/test_etree.py Sat Oct 25 20:41:49 2008 @@ -2078,6 +2078,24 @@ self.assertEquals(docinfo.root_name, 'html') self.assertEquals(docinfo.doctype, '') + def test_docinfo_name_only(self): + etree = self.etree + xml = _bytes('') + tree = etree.parse(BytesIO(xml)) + docinfo = tree.docinfo + self.assertEquals(docinfo.encoding, "UTF-8") + self.assertEquals(docinfo.xml_version, "1.0") + self.assertEquals(docinfo.public_id, None) + self.assertEquals(docinfo.system_url, None) + self.assertEquals(docinfo.root_name, 'root') + self.assertEquals(docinfo.doctype, '') + + def test_doctype_name_only_roundtrip(self): + etree = self.etree + xml = _bytes('\n') + tree = etree.parse(BytesIO(xml)) + self.assertEquals(xml, etree.tostring(tree)) + def test_xml_base(self): etree = self.etree root = etree.XML(_bytes(""), base_url="http://no/such/url") From scoder at codespeak.net Tue Oct 28 21:42:27 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 28 Oct 2008 21:42:27 +0100 (CET) Subject: [Lxml-checkins] r59513 - in lxml/trunk: . doc Message-ID: <20081028204227.8210A16A145@codespeak.net> Author: scoder Date: Tue Oct 28 21:42:24 2008 New Revision: 59513 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/FAQ.txt Log: r4828 at delle: sbehnel | 2008-10-28 21:23:45 +0100 docs Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Tue Oct 28 21:42:24 2008 @@ -605,10 +605,14 @@ create a parser for each thread yourself. lxml also allows concurrency during validation (RelaxNG and XMLSchema) and XSL transformation. You can share RelaxNG, XMLSchema and (with -restrictions) XSLT objects between threads. While you can also share -parsers between threads, this will serialize the access to each of -them, so it is better to ``.copy()`` parsers or to just use the -default parser if you do not need any special configuration. +restrictions) XSLT objects between threads. + +While you can also share parsers between threads, this will serialize +the access to each of them, so it is better to ``.copy()`` parsers or +to just use the default parser if you do not need any special +configuration. The same applies to the XPath evaluators, which use an +internal lock to protect their prepared evaluation contexts. It is +therefore best to use separate evaluator instances in threads. Due to the way libxslt handles threading, applying a stylesheets is most efficient if it was parsed in the same thread that executes it. From ianb at codespeak.net Thu Oct 30 17:59:53 2008 From: ianb at codespeak.net (ianb at codespeak.net) Date: Thu, 30 Oct 2008 17:59:53 +0100 (CET) Subject: [Lxml-checkins] r59578 - in lxml/trunk: . src/lxml/html src/lxml/html/tests Message-ID: <20081030165953.C8D5E16A0BB@codespeak.net> Author: ianb Date: Thu Oct 30 17:59:51 2008 New Revision: 59578 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/html/clean.py lxml/trunk/src/lxml/html/tests/test_autolink.txt Log: Notice balanced parenthesis when autolinking Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu Oct 30 17:59:51 2008 @@ -32,6 +32,10 @@ * 0-bytes could slip through the API when used inside of Unicode strings. +* With ``lxml.html.clean.autolink``, links with balanced parenthesis, + that end in a parenthesis, will be linked in their entirety (typical + with Wikipedia links). + Other changes ------------- Modified: lxml/trunk/src/lxml/html/clean.py ============================================================================== --- lxml/trunk/src/lxml/html/clean.py (original) +++ lxml/trunk/src/lxml/html/clean.py Thu Oct 30 17:59:51 2008 @@ -497,7 +497,7 @@ ############################################################ _link_regexes = [ - re.compile(r'(?Phttps?://(?P[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?)', re.I), + re.compile(r'(?Phttps?://(?P[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I), # This is conservative, but autolinking can be a bit conservative: re.compile(r'mailto:(?P[a-z0-9._-]+@(?P[a-z0-9_._]+[a-z]))', re.I), ] @@ -527,7 +527,7 @@ host that matches one of the regular expressions in avoid_hosts (default localhost and 127.0.0.1). - If you pass in an element, the elements tail will not be + If you pass in an element, the element's tail will not be substituted, only the contents of the element. """ if el.tag in avoid_elements: Modified: lxml/trunk/src/lxml/html/tests/test_autolink.txt ============================================================================== --- lxml/trunk/src/lxml/html/tests/test_autolink.txt (original) +++ lxml/trunk/src/lxml/html/tests/test_autolink.txt Thu Oct 30 17:59:51 2008 @@ -23,6 +23,17 @@ ...
Link: (http://foobar.com)
''')) +Parenthesis are tricky, we'll do our best:: + + >>> print autolink_html(''' + ...
(Link: http://en.wikipedia.org/wiki/PC_Tools_(Central_Point_Software))
+ ... ''') + + >>> print autolink_html(''' + ...
... a link: http://foo.com)
+ ... ''') +
... a link: http://foo.com)
+ Some cases that won't be caught (on purpose):: >>> print(autolink_html('''