+ >>> from lxml.etree import fromstring
+ >>> h = fromstring('''
''')
From lxml-checkins at codespeak.net Sat Sep 22 18:05:49 2007
From: lxml-checkins at codespeak.net (Viagra.com Inc)
Date: Sat, 22 Sep 2007 18:05:49 +0200 (CEST)
Subject: [Lxml-checkins] September 70% OFF
Message-ID: <20070922110726.6865.qmail@cl181-114.petro.tvoe.tv>
An HTML attachment was scrubbed...
URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20070922/961c83e0/attachment.htm
From scoder at codespeak.net Sat Sep 22 18:55:08 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Sat, 22 Sep 2007 18:55:08 +0200 (CEST)
Subject: [Lxml-checkins] r46833 - lxml/trunk/src/lxml/html
Message-ID: <20070922165508.119A480E8@code0.codespeak.net>
Author: scoder
Date: Sat Sep 22 18:55:07 2007
New Revision: 46833
Modified:
lxml/trunk/src/lxml/html/__init__.py
Log:
make lxml.html serialisation code use the 'html' output method, 'pretty' kw is now called 'pretty_print' as in etree
Modified: lxml/trunk/src/lxml/html/__init__.py
==============================================================================
--- lxml/trunk/src/lxml/html/__init__.py (original)
+++ lxml/trunk/src/lxml/html/__init__.py Sat Sep 22 18:55:07 2007
@@ -1220,34 +1220,12 @@
## Serialization
############################################################
-_html_xsl = """\
-
-
-
-
-
-
-"""
-
-_pretty_html_xsl = """\
-
-
-
-
-
-
-"""
-
-_local_transforms = threading.local()
-# FIXME: should we just lazily compile these?
-_local_transforms.html_transform = etree.XSLT(etree.XML(_html_xsl))
-_local_transforms.pretty_html_transform = etree.XSLT(etree.XML(_pretty_html_xsl))
-
-# This isn't a general match, but it's a match for what XSLT specifically creates:
+# This isn't a general match, but it's a match for what libxml2
+# specifically serialises:
__replace_meta_content_type = re.compile(
r'<meta http-equiv="Content-Type".*?>').sub
-def tostring(doc, pretty=False, include_meta_content_type=False):
+def tostring(doc, pretty_print=False, include_meta_content_type=False):
"""
return HTML string representation of the document given
@@ -1255,18 +1233,7 @@
and may replace any that are present
"""
assert doc is not None
- if pretty:
- try:
- pretty_html_transform = _local_transforms.pretty_html_transform
- except AttributeError:
- pretty_html_transform = _local_transforms.pretty_html_transform = etree.XSLT(etree.XML(_pretty_html_xsl))
- html = str(pretty_html_transform(doc))
- else:
- try:
- html_transform = _local_transforms.html_transform
- except AttributeError:
- html_transform = _local_transforms.html_transform = etree.XSLT(etree.XML(_html_xsl))
- html = str(html_transform(doc))
+ html = etree.tostring(doc, method="html", pretty_print=pretty_print)
if not include_meta_content_type:
html = __replace_meta_content_type('', html)
return html
@@ -1278,10 +1245,12 @@
"""
import os
import webbrowser
+ try:
+ write_doc = doc.write
+ except AttributeError:
+ write_doc = etree.ElementTree(element=doc).write
fn = os.tempnam() + '.html'
- f = open(fn, 'wb')
- f.write(tostring(doc, include_meta_content_type=True))
- f.close()
+ write_doc(fn, method="html")
url = 'file://' + fn.replace(os.path.sep, '/')
print url
webbrowser.open(url)
From scoder at codespeak.net Mon Sep 24 12:49:06 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Mon, 24 Sep 2007 12:49:06 +0200 (CEST)
Subject: [Lxml-checkins] r46847 - in lxml/trunk: . src/lxml
Message-ID: <20070924104906.7E35580FC@code0.codespeak.net>
Author: scoder
Date: Mon Sep 24 12:49:05 2007
New Revision: 46847
Modified:
lxml/trunk/CHANGES.txt
lxml/trunk/src/lxml/parser.pxi
Log:
fix: XML feed parser setup problem
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Mon Sep 24 12:49:05 2007
@@ -23,6 +23,8 @@
Bugs fixed
----------
+* XML feed parser setup problem
+
* Type annotation for unicode strings in ``DataElement()``
Other changes
Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi (original)
+++ lxml/trunk/src/lxml/parser.pxi Mon Sep 24 12:49:05 2007
@@ -842,9 +842,9 @@
error = _htmlCtxtResetPush(pctxt, c_data, buffer_len,
c_encoding, self._parse_options)
else:
+ xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options)
error = xmlparser.xmlCtxtResetPush(
pctxt, c_data, buffer_len, NULL, c_encoding)
- xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options)
py_buffer_len = py_buffer_len - buffer_len
c_data = c_data + buffer_len
From scoder at codespeak.net Mon Sep 24 18:51:33 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Mon, 24 Sep 2007 18:51:33 +0200 (CEST)
Subject: [Lxml-checkins] r46849 - in lxml/trunk: . doc
Message-ID: <20070924165133.76C1080E5@code0.codespeak.net>
Author: scoder
Date: Mon Sep 24 18:51:32 2007
New Revision: 46849
Modified:
lxml/trunk/doc/build.txt
lxml/trunk/setupinfo.py
Log:
build setup clarification and error message for missing xslt-config
Modified: lxml/trunk/doc/build.txt
==============================================================================
--- lxml/trunk/doc/build.txt (original)
+++ lxml/trunk/doc/build.txt Mon Sep 24 18:51:32 2007
@@ -77,10 +77,11 @@
make
-If you get errors about missing header files (e.g., ``libxml/xmlversion.h``)
-then you need to make sure the development packages of libxml2 and libxslt are
-properly installed. If this doesn't help, you may have to add the location of
-the header files to the include path like::
+If you get errors about missing header files (e.g. ``libxml/xmlversion.h``)
+then you need to make sure the development packages of both libxml2
+and libxslt are properly installed. If this doesn't help, you may
+have to add the location of the header files to the include path
+like::
python setup.py build_ext -i -I /usr/include/libxml2
Modified: lxml/trunk/setupinfo.py
==============================================================================
--- lxml/trunk/setupinfo.py (original)
+++ lxml/trunk/setupinfo.py Mon Sep 24 18:51:32 2007
@@ -132,6 +132,11 @@
def flags(cmd):
wf, rf, ef = os.popen3(cmd)
+ errors = ef.read()
+ if errors:
+ print "ERROR:", errors
+ print "** make sure the development package of libxml2 and libxslt are installed **"
+ print
return rf.read().split()
def has_option(name):
From scoder at codespeak.net Mon Sep 24 18:52:18 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Mon, 24 Sep 2007 18:52:18 +0200 (CEST)
Subject: [Lxml-checkins] r46850 - in lxml/branch/lxml-1.3: . doc
Message-ID: <20070924165218.B935280E5@code0.codespeak.net>
Author: scoder
Date: Mon Sep 24 18:52:18 2007
New Revision: 46850
Modified:
lxml/branch/lxml-1.3/doc/build.txt
lxml/branch/lxml-1.3/setupinfo.py
Log:
build setup clarification and error message for missing xslt-config
Modified: lxml/branch/lxml-1.3/doc/build.txt
==============================================================================
--- lxml/branch/lxml-1.3/doc/build.txt (original)
+++ lxml/branch/lxml-1.3/doc/build.txt Mon Sep 24 18:52:18 2007
@@ -105,10 +105,11 @@
make
-If you get errors about missing header files (e.g., ``libxml/xmlversion.h``)
-then you need to make sure the development packages of libxml2 and libxslt are
-properly installed. If this doesn't help, you may have to add the location of
-the header files to the include path like::
+If you get errors about missing header files (e.g. ``libxml/xmlversion.h``)
+then you need to make sure the development packages of both libxml2
+and libxslt are properly installed. If this doesn't help, you may
+have to add the location of the header files to the include path
+like::
python setup.py build_ext -i -I /usr/include/libxml2
Modified: lxml/branch/lxml-1.3/setupinfo.py
==============================================================================
--- lxml/branch/lxml-1.3/setupinfo.py (original)
+++ lxml/branch/lxml-1.3/setupinfo.py Mon Sep 24 18:52:18 2007
@@ -135,6 +135,11 @@
def flags(cmd):
wf, rf, ef = os.popen3(cmd)
+ errors = ef.read()
+ if errors:
+ print "ERROR:", errors
+ print "** make sure the development package of libxml2 and libxslt are installed **"
+ print
return rf.read().split()
def has_option(name):
From scoder at codespeak.net Mon Sep 24 18:53:05 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Mon, 24 Sep 2007 18:53:05 +0200 (CEST)
Subject: [Lxml-checkins] r46851 - lxml/trunk
Message-ID: <20070924165305.C1EE980EE@code0.codespeak.net>
Author: scoder
Date: Mon Sep 24 18:53:05 2007
New Revision: 46851
Modified:
lxml/trunk/setupinfo.py
Log:
typo
Modified: lxml/trunk/setupinfo.py
==============================================================================
--- lxml/trunk/setupinfo.py (original)
+++ lxml/trunk/setupinfo.py Mon Sep 24 18:53:05 2007
@@ -135,7 +135,7 @@
errors = ef.read()
if errors:
print "ERROR:", errors
- print "** make sure the development package of libxml2 and libxslt are installed **"
+ print "** make sure the development packages of libxml2 and libxslt are installed **"
print
return rf.read().split()
From scoder at codespeak.net Mon Sep 24 18:53:23 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Mon, 24 Sep 2007 18:53:23 +0200 (CEST)
Subject: [Lxml-checkins] r46852 - lxml/branch/lxml-1.3
Message-ID: <20070924165323.3925A80EE@code0.codespeak.net>
Author: scoder
Date: Mon Sep 24 18:53:22 2007
New Revision: 46852
Modified:
lxml/branch/lxml-1.3/setupinfo.py
Log:
typo
Modified: lxml/branch/lxml-1.3/setupinfo.py
==============================================================================
--- lxml/branch/lxml-1.3/setupinfo.py (original)
+++ lxml/branch/lxml-1.3/setupinfo.py Mon Sep 24 18:53:22 2007
@@ -138,7 +138,7 @@
errors = ef.read()
if errors:
print "ERROR:", errors
- print "** make sure the development package of libxml2 and libxslt are installed **"
+ print "** make sure the development packages of libxml2 and libxslt are installed **"
print
return rf.read().split()
From scoder at codespeak.net Tue Sep 25 10:41:04 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Tue, 25 Sep 2007 10:41:04 +0200 (CEST)
Subject: [Lxml-checkins] r46858 - lxml/trunk/src/lxml
Message-ID: <20070925084104.4F31D812C@code0.codespeak.net>
Author: scoder
Date: Tue Sep 25 10:41:03 2007
New Revision: 46858
Modified:
lxml/trunk/src/lxml/iterparse.pxi
lxml/trunk/src/lxml/parser.pxi
Log:
parser cleanup
Modified: lxml/trunk/src/lxml/iterparse.pxi
==============================================================================
--- lxml/trunk/src/lxml/iterparse.pxi (original)
+++ lxml/trunk/src/lxml/iterparse.pxi Tue Sep 25 10:41:03 2007
@@ -332,7 +332,8 @@
context = <_IterparseContext>self._context
context._setEventFilter(events, tag)
- self._lockParser() # will not be unlocked - no other methods supported
+ self._lockAndPrepare()
+ # parser will not be unlocked - no other methods supported
cdef _ParserContext _createContext(self, target):
return _IterparseContext()
Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi (original)
+++ lxml/trunk/src/lxml/parser.pxi Tue Sep 25 10:41:03 2007
@@ -572,12 +572,21 @@
if self._parser_lock is not NULL:
python.PyThread_free_lock(self._parser_lock)
+ cdef int _lockAndPrepare(self) except -1:
+ self._lockParser()
+ self._context._error_log.connect()
+ return 0
+
cdef void _cleanup(self):
- cdef xmlparser.xmlParserCtxt* pctxt
- pctxt = self._parser_ctxt
- if pctxt is not NULL:
- if pctxt.spaceTab is not NULL: # work around bug in libxml2
- xmlparser.xmlClearParserCtxt(pctxt)
+ if self._parser_ctxt is not NULL:
+ if self._parser_type == LXML_HTML_PARSER:
+ htmlparser.htmlCtxtReset(self._parser_ctxt)
+ elif self._parser_ctxt.spaceTab is not NULL or \
+ _LIBXML_VERSION_INT >= 20629: # work around bug in libxml2
+ xmlparser.xmlClearParserCtxt(self._parser_ctxt)
+ self._context.clear()
+ self._context._error_log.disconnect()
+ self._unlockParser()
cdef int _lockParser(self) except -1:
cdef python.PyThreadState* state
@@ -658,8 +667,7 @@
return self._parseDoc(_cstr(text_utf), py_buffer_len, c_filename)
buffer_len = py_buffer_len
- self._lockParser()
- self._context._error_log.connect()
+ self._lockAndPrepare()
try:
pctxt = self._parser_ctxt
__GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
@@ -679,9 +687,6 @@
return self._context._handleParseResultDoc(self, result, None)
finally:
self._cleanup()
- self._context.clear()
- self._context._error_log.disconnect()
- self._unlockParser()
cdef xmlDoc* _parseDoc(self, char* c_text, Py_ssize_t c_len,
char* c_filename) except NULL:
@@ -694,8 +699,7 @@
cdef char* c_encoding
if c_len > python.INT_MAX:
raise ParserError, "string is too long to parse it with libxml2"
- self._lockParser()
- self._context._error_log.connect()
+ self._lockAndPrepare()
try:
pctxt = self._parser_ctxt
__GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
@@ -719,9 +723,6 @@
return self._context._handleParseResultDoc(self, result, None)
finally:
self._cleanup()
- self._context.clear()
- self._context._error_log.disconnect()
- self._unlockParser()
cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL:
cdef python.PyThreadState* state
@@ -731,8 +732,7 @@
cdef int orig_options
cdef char* c_encoding
result = NULL
- self._lockParser()
- self._context._error_log.connect()
+ self._lockAndPrepare()
try:
pctxt = self._parser_ctxt
__GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
@@ -757,9 +757,6 @@
self, result, c_filename)
finally:
self._cleanup()
- self._context.clear()
- self._context._error_log.disconnect()
- self._unlockParser()
cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename) except NULL:
cdef _FileReaderContext file_context
@@ -769,8 +766,7 @@
cdef int recover
if not filename:
filename = None
- self._lockParser()
- self._context._error_log.connect()
+ self._lockAndPrepare()
try:
pctxt = self._parser_ctxt
__GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
@@ -783,9 +779,6 @@
self, result, filename)
finally:
self._cleanup()
- self._context.clear()
- self._context._error_log.disconnect()
- self._unlockParser()
############################################################
## ET feed parser
@@ -829,9 +822,8 @@
pctxt = self._parser_ctxt
error = 0
if not self._feed_parser_running:
- self._lockParser()
+ self._lockAndPrepare()
self._feed_parser_running = 1
- self._context._error_log.connect()
__GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
if py_buffer_len > python.INT_MAX:
@@ -867,9 +859,6 @@
self, pctxt.myDoc, None)
finally:
self._cleanup()
- self._context.clear()
- self._context._error_log.disconnect()
- self._unlockParser()
def close(self):
"""Terminates feeding data to this parser. This tells the parser to
@@ -896,9 +885,6 @@
self, pctxt.myDoc, None)
finally:
self._cleanup()
- self._context.clear()
- self._context._error_log.disconnect()
- self._unlockParser()
if isinstance(result, _Document):
return (<_Document>result).getroot()
From scoder at codespeak.net Tue Sep 25 10:41:53 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Tue, 25 Sep 2007 10:41:53 +0200 (CEST)
Subject: [Lxml-checkins] r46859 - lxml/trunk/src/lxml/tests
Message-ID: <20070925084153.1C80D812C@code0.codespeak.net>
Author: scoder
Date: Tue Sep 25 10:41:52 2007
New Revision: 46859
Modified:
lxml/trunk/src/lxml/tests/test_elementtree.py
Log:
test cleanup
Modified: lxml/trunk/src/lxml/tests/test_elementtree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_elementtree.py (original)
+++ lxml/trunk/src/lxml/tests/test_elementtree.py Tue Sep 25 10:41:52 2007
@@ -2881,11 +2881,9 @@
def _writeElement(self, element, encoding='us-ascii'):
"""Write out element for comparison.
"""
- ElementTree = self.etree.ElementTree
- f = StringIO()
- tree = ElementTree(element=element)
- tree.write(f, encoding)
- data = unicode(f.getvalue(), encoding)
+ data = self.etree.tostring(element, encoding=encoding)
+ if encoding != 'us-ascii':
+ data = unicode(data, encoding)
return canonicalize(data)
def _writeElementFile(self, element, encoding='us-ascii'):
@@ -2899,11 +2897,13 @@
tree.write(f, encoding)
f.close()
f = open(filename, 'rb')
- data = unicode(f.read(), encoding)
+ data = f.read()
f.close()
finally:
os.close(handle)
os.remove(filename)
+ if encoding != 'us-ascii':
+ data = unicode(data, encoding)
return canonicalize(data)
def assertXML(self, expected, element, encoding='us-ascii'):
From scoder at codespeak.net Wed Sep 26 11:22:30 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Wed, 26 Sep 2007 11:22:30 +0200 (CEST)
Subject: [Lxml-checkins] r46889 - lxml/trunk/src/lxml
Message-ID: <20070926092230.BFA198162@code0.codespeak.net>
Author: scoder
Date: Wed Sep 26 11:22:29 2007
New Revision: 46889
Modified:
lxml/trunk/src/lxml/iterparse.pxi
lxml/trunk/src/lxml/parser.pxi
lxml/trunk/src/lxml/xmlparser.pxd
Log:
parser refactoring moves locking etc. into _ParserContext to separate contexts of feed parser and normal parser
Modified: lxml/trunk/src/lxml/iterparse.pxi
==============================================================================
--- lxml/trunk/src/lxml/iterparse.pxi (original)
+++ lxml/trunk/src/lxml/iterparse.pxi Wed Sep 26 11:22:29 2007
@@ -274,33 +274,27 @@
* encoding - override the document encoding
"""
cdef object _source
- cdef object _filename
cdef readonly object root
- cdef int _html
def __init__(self, source, events=("end",), tag=None,
attribute_defaults=False, dtd_validation=False,
load_dtd=False, no_network=True, remove_blank_text=False,
remove_comments=False, remove_pis=False, encoding=None,
html=False):
cdef _IterparseContext context
- cdef char* c_filename
cdef char* c_encoding
cdef int parse_options
if not hasattr(source, 'read'):
- self._filename = _encodeFilename(source)
- source = open(self._filename, 'rb')
- else:
- self._filename = _getFilenameForFile(source)
- if self._filename is not None:
- self._filename = _encodeFilename(self._filename)
- if self._filename is not None:
- c_filename = self._filename
+ filename = _encodeFilename(source)
+ source = open(filename, 'rb')
else:
- c_filename = NULL
+ filename = _getFilenameForFile(source)
+ if filename is not None:
+ filename = _encodeFilename(filename)
self._source = source
+ html = bool(html)
if html:
- self._html = 1
+ # make sure we're not looking for namespaces
if 'start' in events:
if 'end' in events:
events = ('start', 'end')
@@ -310,8 +304,6 @@
events = ('end',)
else:
events = ()
- else:
- self._html = 0
parse_options = _XML_DEFAULT_PARSE_OPTIONS
if load_dtd:
@@ -327,39 +319,18 @@
if remove_blank_text:
parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS
- _BaseParser.__init__(self, parse_options, remove_comments, remove_pis,
- None, encoding)
-
- context = <_IterparseContext>self._context
+ _BaseParser.__init__(self, parse_options, html,
+ remove_comments, remove_pis,
+ None, filename, encoding)
+
+ context = <_IterparseContext>self._getPushParserContext()
context._setEventFilter(events, tag)
- self._lockAndPrepare()
+ context.prepare()
# parser will not be unlocked - no other methods supported
cdef _ParserContext _createContext(self, target):
return _IterparseContext()
- cdef xmlparser.xmlParserCtxt* _newParserCtxt(self):
- cdef xmlparser.xmlParserCtxt* c_ctxt
- cdef char* c_filename
- if self._filename is not None:
- c_filename = _cstr(self._filename)
- else:
- c_filename = NULL
- if self._html:
- c_ctxt = htmlparser.htmlCreatePushParserCtxt(
- NULL, NULL, NULL, 0, c_filename, self._default_encoding_int)
- if c_ctxt is not NULL:
- htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options)
- else:
- c_ctxt = xmlparser.xmlCreatePushParserCtxt(
- NULL, NULL, NULL, 0, c_filename)
- if c_ctxt is not NULL:
- xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options)
- if self._default_encoding_int != tree.XML_CHAR_ENCODING_NONE:
- xmlparser.xmlSwitchEncoding(
- c_ctxt, self._default_encoding_int)
- return c_ctxt
-
def copy(self):
raise TypeError, "iterparse parsers cannot be copied"
@@ -368,11 +339,13 @@
def __next__(self):
cdef _IterparseContext context
+ cdef xmlparser.xmlParserCtxt* pctxt
cdef int error
cdef char* c_filename
if self._source is None:
raise StopIteration
- context = <_IterparseContext>self._context
+
+ context = <_IterparseContext>self._getPushParserContext()
if python.PyList_GET_SIZE(context._events) > context._event_index:
item = python.PyList_GET_ITEM(context._events, context._event_index)
python.Py_INCREF(item) # 'borrowed reference' from PyList_GET_ITEM
@@ -380,6 +353,7 @@
return item
del context._events[:]
+ pctxt = context._c_ctxt
error = 0
while python.PyList_GET_SIZE(context._events) == 0 and error == 0:
data = self._source.read(__ITERPARSE_CHUNK_SIZE)
@@ -387,27 +361,22 @@
self._source = None
raise TypeError, "reading file objects must return plain strings"
elif data:
- if self._html:
+ if self._for_html:
error = htmlparser.htmlParseChunk(
- self._parser_ctxt, _cstr(data),
- python.PyString_GET_SIZE(data), 0)
+ pctxt, _cstr(data), python.PyString_GET_SIZE(data), 0)
else:
error = xmlparser.xmlParseChunk(
- self._parser_ctxt, _cstr(data),
- python.PyString_GET_SIZE(data), 0)
+ pctxt, _cstr(data), python.PyString_GET_SIZE(data), 0)
else:
- if self._html:
- error = htmlparser.htmlParseChunk(
- self._parser_ctxt, NULL, 0, 1)
+ if self._for_html:
+ error = htmlparser.htmlParseChunk(pctxt, NULL, 0, 1)
else:
- error = xmlparser.xmlParseChunk(
- self._parser_ctxt, NULL, 0, 1)
+ error = xmlparser.xmlParseChunk(pctxt, NULL, 0, 1)
self._source = None
break
if error != 0:
self._source = None
- _raiseParseError(self._parser_ctxt, self._filename,
- self._context._error_log)
+ _raiseParseError(pctxt, self._filename, context._error_log)
if python.PyList_GET_SIZE(context._events) == 0:
self.root = context._root
self._source = None
Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi (original)
+++ lxml/trunk/src/lxml/parser.pxi Wed Sep 26 11:22:29 2007
@@ -20,11 +20,6 @@
"""
pass
-ctypedef enum LxmlParserType:
- LXML_XML_PARSER
- LXML_HTML_PARSER
- LXML_ITERPARSE_PARSER
-
cdef class _ParserDictionaryContext:
# Global parser context to share the string dictionary.
#
@@ -232,8 +227,7 @@
c_buffer.readcallback = _readFilelikeParser
return xmlparser.xmlNewIOInputStream(ctxt, c_buffer, 0)
- cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options,
- LxmlParserType parser_type):
+ cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options):
cdef python.PyThreadState* state
cdef xmlDoc* result
cdef char* c_encoding
@@ -244,12 +238,12 @@
c_encoding = _cstr(self._encoding)
state = python.PyEval_SaveThread()
- if parser_type == LXML_XML_PARSER:
- result = xmlparser.xmlCtxtReadIO(
+ if ctxt.html:
+ result = htmlparser.htmlCtxtReadIO(
ctxt, _readFilelikeParser, NULL,
self,
self._c_url, c_encoding, options)
else:
- result = htmlparser.htmlCtxtReadIO(
+ result = xmlparser.xmlCtxtReadIO(
ctxt, _readFilelikeParser, NULL, self,
self._c_url, c_encoding, options)
python.PyEval_RestoreThread(state)
@@ -387,6 +381,13 @@
cdef class _ParserContext(_ResolverContext):
cdef _ErrorLog _error_log
cdef xmlparser.xmlParserCtxt* _c_ctxt
+ cdef python.PyThread_type_lock _lock
+
+ def __dealloc__(self):
+ if self._lock is not NULL:
+ python.PyThread_free_lock(self._lock)
+ if self._c_ctxt is not NULL:
+ xmlparser.xmlFreeParserCtxt(self._c_ctxt)
cdef _ParserContext _copy(self):
cdef _ParserContext context
@@ -398,6 +399,35 @@
self._c_ctxt = c_ctxt
c_ctxt._private = self
+ cdef void _resetParserContext(self):
+ if self._c_ctxt is not NULL:
+ if self._c_ctxt.html:
+ htmlparser.htmlCtxtReset(self._c_ctxt)
+ elif self._c_ctxt.spaceTab is not NULL or \
+ _LIBXML_VERSION_INT >= 20629: # work around bug in libxml2
+ xmlparser.xmlClearParserCtxt(self._c_ctxt)
+
+ cdef int prepare(self) except -1:
+ cdef python.PyThreadState* state
+ cdef int result
+ if config.ENABLE_THREADING and self._lock is not NULL:
+ state = python.PyEval_SaveThread()
+ result = python.PyThread_acquire_lock(
+ self._lock, python.WAIT_LOCK)
+ python.PyEval_RestoreThread(state)
+ if result == 0:
+ raise ParserError, "parser locking failed"
+ self._error_log.connect()
+ return 0
+
+ cdef int cleanup(self) except -1:
+ self._resetParserContext()
+ self.clear()
+ self._error_log.disconnect()
+ if config.ENABLE_THREADING and self._lock is not NULL:
+ python.PyThread_release_lock(self._lock)
+ return 0
+
cdef object _handleParseResult(self, _BaseParser parser,
xmlDoc* result, filename):
cdef xmlDoc* c_doc
@@ -418,6 +448,10 @@
_ResolverRegistry resolvers,
xmlparser.xmlParserCtxt* c_ctxt):
_initResolverContext(context, resolvers)
+ if not config.ENABLE_THREADING:
+ context._lock = NULL
+ else:
+ context._lock = python.PyThread_allocate_lock()
if c_ctxt is not NULL:
context._initParserContext(c_ctxt)
context._error_log = _ErrorLog()
@@ -495,30 +529,37 @@
cdef class _BaseParser:
- cdef int _parse_options
- cdef _ParserContext _context
- cdef LxmlParserType _parser_type
- cdef xmlparser.xmlParserCtxt* _parser_ctxt
cdef ElementClassLookup _class_lookup
- cdef python.PyThread_type_lock _parser_lock
- cdef int _feed_parser_running
+ cdef _ResolverRegistry _resolvers
+ cdef int _parse_options
+ cdef object _filename
+ cdef _ParserContext _parser_context
+ cdef _ParserContext _push_parser_context
+ cdef object _target
+ cdef bint _for_html
+ cdef bint _remove_comments
+ cdef bint _remove_pis
cdef object _default_encoding
cdef int _default_encoding_int
- def __init__(self, int parse_options, remove_comments, remove_pis,
- target, encoding):
+ def __init__(self, int parse_options, bint for_html,
+ remove_comments, remove_pis,
+ target, filename, encoding):
cdef int c_encoding
cdef xmlparser.xmlParserCtxt* pctxt
- if isinstance(self, HTMLParser):
- self._parser_type = LXML_HTML_PARSER
- elif isinstance(self, XMLParser):
- self._parser_type = LXML_XML_PARSER
- elif isinstance(self, iterparse):
- self._parser_type = LXML_ITERPARSE_PARSER
- else:
+ if not isinstance(self, HTMLParser) and \
+ not isinstance(self, XMLParser) and \
+ not isinstance(self, iterparse):
raise TypeError, "This class cannot be instantiated"
self._parse_options = parse_options
+ self._filename = filename
+ self._target = target
+ self._for_html = for_html
+ self._remove_comments = bool(remove_comments)
+ self._remove_pis = bool(remove_pis)
+
+ self._resolvers = _ResolverRegistry()
if encoding is None:
self._default_encoding = None
@@ -532,25 +573,44 @@
self._default_encoding = encoding
self._default_encoding_int = c_encoding
- pctxt = self._newParserCtxt()
- self._parser_ctxt = pctxt
- if pctxt is NULL:
- python.PyErr_NoMemory()
-
- self._context = self._createContext(target)
- _initParserContext(self._context, None, pctxt)
-
- if remove_comments:
- pctxt.sax.comment = NULL
- if remove_pis:
- pctxt.sax.processingInstruction = NULL
- # hard switch-off for CDATA nodes => makes them plain text
- pctxt.sax.cdataBlock = NULL
+ cdef _ParserContext _getParserContext(self):
+ cdef xmlparser.xmlParserCtxt* pctxt
+ if self._parser_context is None:
+ self._parser_context = self._createContext(self._target)
- if not config.ENABLE_THREADING:
- self._parser_lock = NULL
- else:
- self._parser_lock = python.PyThread_allocate_lock()
+ pctxt = self._newParserCtxt()
+ if pctxt is NULL:
+ python.PyErr_NoMemory()
+
+ if self._remove_comments:
+ pctxt.sax.comment = NULL
+ if self._remove_pis:
+ pctxt.sax.processingInstruction = NULL
+ # hard switch-off for CDATA nodes => makes them plain text
+ pctxt.sax.cdataBlock = NULL
+
+ _initParserContext(self._parser_context, self._resolvers, pctxt)
+ return self._parser_context
+
+ cdef _ParserContext _getPushParserContext(self):
+ cdef xmlparser.xmlParserCtxt* pctxt
+ if self._push_parser_context is None:
+ self._push_parser_context = self._createContext(self._target)
+
+ pctxt = self._newPushParserCtxt()
+ if pctxt is NULL:
+ python.PyErr_NoMemory()
+
+ if self._remove_comments:
+ pctxt.sax.comment = NULL
+ if self._remove_pis:
+ pctxt.sax.processingInstruction = NULL
+ # hard switch-off for CDATA nodes => makes them plain text
+ pctxt.sax.cdataBlock = NULL
+
+ _initParserContext(
+ self._push_parser_context, self._resolvers, pctxt)
+ return self._push_parser_context
cdef _ParserContext _createContext(self, target):
cdef _TargetParserContext context
@@ -561,56 +621,44 @@
return context
cdef xmlparser.xmlParserCtxt* _newParserCtxt(self):
- if self._parser_type == LXML_HTML_PARSER:
+ if self._for_html:
return htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
else:
return xmlparser.xmlNewParserCtxt()
- def __dealloc__(self):
- if self._parser_ctxt is not NULL:
- xmlparser.xmlFreeParserCtxt(self._parser_ctxt)
- if self._parser_lock is not NULL:
- python.PyThread_free_lock(self._parser_lock)
-
- cdef int _lockAndPrepare(self) except -1:
- self._lockParser()
- self._context._error_log.connect()
- return 0
-
- cdef void _cleanup(self):
- if self._parser_ctxt is not NULL:
- if self._parser_type == LXML_HTML_PARSER:
- htmlparser.htmlCtxtReset(self._parser_ctxt)
- elif self._parser_ctxt.spaceTab is not NULL or \
- _LIBXML_VERSION_INT >= 20629: # work around bug in libxml2
- xmlparser.xmlClearParserCtxt(self._parser_ctxt)
- self._context.clear()
- self._context._error_log.disconnect()
- self._unlockParser()
-
- cdef int _lockParser(self) except -1:
- cdef python.PyThreadState* state
- cdef int result
- if config.ENABLE_THREADING and self._parser_lock != NULL:
- state = python.PyEval_SaveThread()
- result = python.PyThread_acquire_lock(
- self._parser_lock, python.WAIT_LOCK)
- python.PyEval_RestoreThread(state)
- if result == 0:
- raise ParserError, "parser locking failed"
- return 0
+ cdef xmlparser.xmlParserCtxt* _newPushParserCtxt(self):
+ cdef xmlparser.xmlParserCtxt* c_ctxt
+ cdef char* c_filename
+ if self._filename is not None:
+ c_filename = _cstr(self._filename)
+ else:
+ c_filename = NULL
+ if self._for_html:
+ c_ctxt = htmlparser.htmlCreatePushParserCtxt(
+ NULL, NULL, NULL, 0, c_filename, self._default_encoding_int)
+ if c_ctxt is not NULL:
+ htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options)
+ else:
+ c_ctxt = xmlparser.xmlCreatePushParserCtxt(
+ NULL, NULL, NULL, 0, c_filename)
+ if c_ctxt is not NULL:
+ xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options)
+ if self._default_encoding_int != tree.XML_CHAR_ENCODING_NONE:
+ xmlparser.xmlSwitchEncoding(
+ c_ctxt, self._default_encoding_int)
+ return c_ctxt
- cdef void _unlockParser(self):
- if config.ENABLE_THREADING and self._parser_lock != NULL:
- python.PyThread_release_lock(self._parser_lock)
+ def __dealloc__(self):
+ pass
property error_log:
def __get__(self):
- return self._context._error_log.copy()
+ # FIXME !!!!!!!
+ return self._parser_context._error_log.copy()
property resolvers:
def __get__(self):
- return self._context._resolvers
+ return self._resolvers
def setElementClassLookup(self, ElementClassLookup lookup = None):
"Deprecated, use ``parser.set_element_class_lookup(lookup)`` instead."
@@ -628,9 +676,12 @@
cdef _BaseParser parser
parser = self.__class__()
parser._parse_options = self._parse_options
+ parser._for_html = self._for_html
+ parser._remove_comments = self._remove_comments
+ parser._remove_pis = self._remove_pis
+ parser._filename = self._filename
+ parser._resolvers = self._resolvers
parser._class_lookup = self._class_lookup
- parser._context = self._context._copy()
- parser._context._initParserContext(parser._parser_ctxt)
return parser
def copy(self):
@@ -653,6 +704,7 @@
cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL:
"""Parse unicode document, share dictionary if possible.
"""
+ cdef _ParserContext context
cdef python.PyThreadState* state
cdef xmlDoc* result
cdef xmlparser.xmlParserCtxt* pctxt
@@ -667,14 +719,15 @@
return self._parseDoc(_cstr(text_utf), py_buffer_len, c_filename)
buffer_len = py_buffer_len
- self._lockAndPrepare()
+ context = self._getParserContext()
+ context.prepare()
try:
- pctxt = self._parser_ctxt
+ pctxt = context._c_ctxt
__GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
c_text = python.PyUnicode_AS_DATA(utext)
state = python.PyEval_SaveThread()
- if self._parser_type == LXML_HTML_PARSER:
+ if self._for_html:
result = htmlparser.htmlCtxtReadMemory(
pctxt, c_text, buffer_len, c_filename, _UNICODE_ENCODING,
self._parse_options)
@@ -684,14 +737,15 @@
self._parse_options)
python.PyEval_RestoreThread(state)
- return self._context._handleParseResultDoc(self, result, None)
+ return context._handleParseResultDoc(self, result, None)
finally:
- self._cleanup()
+ context.cleanup()
cdef xmlDoc* _parseDoc(self, char* c_text, Py_ssize_t c_len,
char* c_filename) except NULL:
"""Parse document, share dictionary if possible.
"""
+ cdef _ParserContext context
cdef python.PyThreadState* state
cdef xmlDoc* result
cdef xmlparser.xmlParserCtxt* pctxt
@@ -699,9 +753,11 @@
cdef char* c_encoding
if c_len > python.INT_MAX:
raise ParserError, "string is too long to parse it with libxml2"
- self._lockAndPrepare()
+
+ context = self._getParserContext()
+ context.prepare()
try:
- pctxt = self._parser_ctxt
+ pctxt = context._c_ctxt
__GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
if self._default_encoding is None:
@@ -710,7 +766,7 @@
c_encoding = _cstr(self._default_encoding)
state = python.PyEval_SaveThread()
- if self._parser_type == LXML_HTML_PARSER:
+ if self._for_html:
result = htmlparser.htmlCtxtReadMemory(
pctxt, c_text, c_len, c_filename,
c_encoding, self._parse_options)
@@ -720,11 +776,12 @@
c_encoding, self._parse_options)
python.PyEval_RestoreThread(state)
- return self._context._handleParseResultDoc(self, result, None)
+ return context._handleParseResultDoc(self, result, None)
finally:
- self._cleanup()
+ context.cleanup()
cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL:
+ cdef _ParserContext context
cdef python.PyThreadState* state
cdef xmlDoc* result
cdef xmlparser.xmlParserCtxt* pctxt
@@ -732,9 +789,11 @@
cdef int orig_options
cdef char* c_encoding
result = NULL
- self._lockAndPrepare()
+
+ context = self._getParserContext()
+ context.prepare()
try:
- pctxt = self._parser_ctxt
+ pctxt = context._c_ctxt
__GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
if self._default_encoding is None:
@@ -744,7 +803,7 @@
orig_options = pctxt.options
state = python.PyEval_SaveThread()
- if self._parser_type == LXML_HTML_PARSER:
+ if self._for_html:
result = htmlparser.htmlCtxtReadFile(
pctxt, c_filename, c_encoding, self._parse_options)
else:
@@ -753,12 +812,12 @@
python.PyEval_RestoreThread(state)
pctxt.options = orig_options # work around libxml2 problem
- return self._context._handleParseResultDoc(
- self, result, c_filename)
+ return context._handleParseResultDoc(self, result, c_filename)
finally:
- self._cleanup()
+ context.cleanup()
cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename) except NULL:
+ cdef _ParserContext context
cdef _FileReaderContext file_context
cdef xmlDoc* result
cdef xmlparser.xmlParserCtxt* pctxt
@@ -766,25 +825,28 @@
cdef int recover
if not filename:
filename = None
- self._lockAndPrepare()
+
+ context = self._getParserContext()
+ context.prepare()
try:
- pctxt = self._parser_ctxt
+ pctxt = context._c_ctxt
__GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
- file_context = _FileReaderContext(filelike, self._context,
- filename, self._default_encoding)
- result = file_context._readDoc(
- pctxt, self._parse_options, self._parser_type)
+ file_context = _FileReaderContext(
+ filelike, context, filename, self._default_encoding)
+ result = file_context._readDoc(pctxt, self._parse_options)
- return self._context._handleParseResultDoc(
+ return context._handleParseResultDoc(
self, result, filename)
finally:
- self._cleanup()
+ context.cleanup()
############################################################
## ET feed parser
############################################################
cdef class _FeedParser(_BaseParser):
+ cdef bint _feed_parser_running
+
def feed(self, data):
"""Feeds data to the parser. The argument should be an 8-bit string
buffer containing encoded data, although Unicode is supported as long
@@ -798,6 +860,7 @@
the ``feed()`` method. The parser can only be reset by calling
``close()``.
"""
+ cdef _ParserContext context
cdef xmlparser.xmlParserCtxt* pctxt
cdef Py_ssize_t py_buffer_len
cdef char* c_data
@@ -819,10 +882,11 @@
else:
raise TypeError, "Parsing requires string data"
- pctxt = self._parser_ctxt
+ context = self._getPushParserContext()
+ pctxt = context._c_ctxt
error = 0
if not self._feed_parser_running:
- self._lockAndPrepare()
+ context.prepare()
self._feed_parser_running = 1
__GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
@@ -830,7 +894,7 @@
buffer_len = python.INT_MAX
else:
buffer_len = py_buffer_len
- if self._parser_type == LXML_HTML_PARSER:
+ if self._for_html:
error = _htmlCtxtResetPush(pctxt, c_data, buffer_len,
c_encoding, self._parse_options)
else:
@@ -845,7 +909,7 @@
buffer_len = python.INT_MAX
else:
buffer_len = py_buffer_len
- if self._parser_type == LXML_HTML_PARSER:
+ if self._for_html:
error = htmlparser.htmlParseChunk(pctxt, c_data, buffer_len, 0)
else:
error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0)
@@ -855,10 +919,9 @@
if error:
self._feed_parser_running = 0
try:
- self._context._handleParseResult(
- self, pctxt.myDoc, None)
+ context._handleParseResult(self, pctxt.myDoc, None)
finally:
- self._cleanup()
+ context._cleanup()
def close(self):
"""Terminates feeding data to this parser. This tells the parser to
@@ -869,22 +932,25 @@
the ``feed()`` method. It should only be called when using the feed
parser interface, all other usage is undefined.
"""
+ cdef _ParserContext context
cdef xmlparser.xmlParserCtxt* pctxt
cdef xmlDoc* c_doc
cdef _Document doc
if not self._feed_parser_running:
raise XMLSyntaxError, "no element found"
- pctxt = self._parser_ctxt
+
+ context = self._getPushParserContext()
+ pctxt = context._c_ctxt
+
self._feed_parser_running = 0
- if self._parser_type == LXML_HTML_PARSER:
+ if self._for_html:
htmlparser.htmlParseChunk(pctxt, NULL, 0, 1)
else:
xmlparser.xmlParseChunk(pctxt, NULL, 0, 1)
try:
- result = self._context._handleParseResult(
- self, pctxt.myDoc, None)
+ result = context._handleParseResult(self, pctxt.myDoc, None)
finally:
- self._cleanup()
+ context.cleanup()
if isinstance(result, _Document):
return (<_Document>result).getroot()
@@ -987,8 +1053,9 @@
if not resolve_entities:
parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT
- _BaseParser.__init__(self, parse_options, remove_comments, remove_pis,
- target, encoding)
+ _BaseParser.__init__(self, parse_options, 0,
+ remove_comments, remove_pis,
+ target, None, encoding)
cdef class ETCompatXMLParser(XMLParser):
"""An XML parser with an ElementTree compatible default setup. See the
@@ -1093,8 +1160,9 @@
if not compact:
parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT
- _BaseParser.__init__(self, parse_options, remove_comments, remove_pis,
- target, encoding)
+ _BaseParser.__init__(self, parse_options, 1,
+ remove_comments, remove_pis,
+ target, None, encoding)
cdef HTMLParser __DEFAULT_HTML_PARSER
__DEFAULT_HTML_PARSER = HTMLParser()
Modified: lxml/trunk/src/lxml/xmlparser.pxd
==============================================================================
--- lxml/trunk/src/lxml/xmlparser.pxd (original)
+++ lxml/trunk/src/lxml/xmlparser.pxd Wed Sep 26 11:22:29 2007
@@ -78,21 +78,21 @@
xmlDoc* myDoc
xmlDict* dict
void* _private
- int wellFormed
- int recovery
+ bint wellFormed
+ bint recovery
int options
- int disableSAX
+ bint disableSAX
int errNo
- int replaceEntities
- int loadsubset
- int validate
+ bint replaceEntities
+ bint loadsubset
+ bint validate
xmlError lastError
xmlNode* node
xmlSAXHandler* sax
int* spaceTab
int spaceMax
- int html
- int progressive
+ bint html
+ bint progressive
int charset
ctypedef enum xmlParserOption:
From scoder at codespeak.net Wed Sep 26 11:24:24 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Wed, 26 Sep 2007 11:24:24 +0200 (CEST)
Subject: [Lxml-checkins] r46890 - in lxml/trunk: . src/lxml src/lxml/tests
Message-ID: <20070926092424.0921D8159@code0.codespeak.net>
Author: scoder
Date: Wed Sep 26 11:24:24 2007
New Revision: 46890
Modified:
lxml/trunk/CHANGES.txt
lxml/trunk/src/lxml/etree.pyx
lxml/trunk/src/lxml/tests/test_elementtree.py
Log:
(from/to)stringlist() functions (ET 1.3)
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Wed Sep 26 11:24:24 2007
@@ -8,6 +8,9 @@
Features added
--------------
+* ``fromstringlist()`` and ``tostringlist()`` functions as in
+ ElementTree 1.3
+
* ``iterparse()`` accepts an ``html`` boolean keyword argument for
parsing with the HTML parser (note that this interface may be
subject to change)
Modified: lxml/trunk/src/lxml/etree.pyx
==============================================================================
--- lxml/trunk/src/lxml/etree.pyx (original)
+++ lxml/trunk/src/lxml/etree.pyx Wed Sep 26 11:24:24 2007
@@ -2139,6 +2139,20 @@
except _TargetParserResult, result_container:
return result_container.result
+def fromstringlist(strings, _BaseParser parser=None):
+ """Parses an XML document from a sequence of strings.
+
+ To override the default parser with a different parser you can pass it to
+ the ``parser`` keyword argument.
+ """
+ cdef _Document doc
+ if parser is None:
+ parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
+ feed = parser.feed
+ for data in strings:
+ feed(data)
+ return parser.close()
+
def iselement(element):
"""Checks if an object appears to be a valid element object.
"""
@@ -2185,6 +2199,15 @@
else:
raise TypeError, "Type '%s' cannot be serialized." % type(element_or_tree)
+def tostringlist(element_or_tree, *args, **kwargs):
+ """Serialize an element to an encoded string representation of its XML
+ tree, stored in a list of partial strings.
+
+ This is purely for ElementTree 1.3 compatibility. The result is a
+ single string wrapped in a list.
+ """
+ return [tostring(element_or_tree, *args, **kwargs)]
+
def tounicode(element_or_tree, method="xml", pretty_print=False):
"""Serialize an element to the Python unicode representation of its XML
tree.
Modified: lxml/trunk/src/lxml/tests/test_elementtree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_elementtree.py (original)
+++ lxml/trunk/src/lxml/tests/test_elementtree.py Wed Sep 26 11:24:24 2007
@@ -530,6 +530,28 @@
self.assertEquals(0, len(root))
self.assertEquals('This is a text.', root.text)
+ def test_fromstringlist(self):
+ fromstringlist = self.etree.fromstringlist
+
+ root = fromstringlist(["<do", "c>T", "hi", "s is",
+ " a text.<", "/doc", ">"])
+ self.assertEquals(0, len(root))
+ self.assertEquals('This is a text.', root.text)
+
+ def test_fromstringlist_characters(self):
+ fromstringlist = self.etree.fromstringlist
+
+ root = fromstringlist(list('<doc>This is a text.</doc>'))
+ self.assertEquals(0, len(root))
+ self.assertEquals('This is a text.', root.text)
+
+ def test_fromstringlist_single(self):
+ fromstringlist = self.etree.fromstringlist
+
+ root = fromstringlist(['<doc>This is a text.</doc>'])
+ self.assertEquals(0, len(root))
+ self.assertEquals('This is a text.', root.text)
+
def test_iselement(self):
iselement = self.etree.iselement
Element = self.etree.Element
From scoder at codespeak.net Wed Sep 26 12:02:45 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Wed, 26 Sep 2007 12:02:45 +0200 (CEST)
Subject: [Lxml-checkins] r46891 - in lxml/trunk: . src/lxml
Message-ID: <20070926100245.AF965815D@code0.codespeak.net>
Author: scoder
Date: Wed Sep 26 12:02:45 2007
New Revision: 46891
Modified:
lxml/trunk/CHANGES.txt
lxml/trunk/src/lxml/iterparse.pxi
lxml/trunk/src/lxml/parser.pxi
Log:
some fixes and cleanups in the error_log property, separate feed_error_log property for feed parser
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Wed Sep 26 12:02:45 2007
@@ -8,6 +8,10 @@
Features added
--------------
+* Separate ``feed_error_log`` property for the feed parser interface.
+ The the normal parser interface and ``iterparse`` still use
+ ``error_log``.
+
* ``fromstringlist()`` and ``tostringlist()`` functions as in
ElementTree 1.3
Modified: lxml/trunk/src/lxml/iterparse.pxi
==============================================================================
--- lxml/trunk/src/lxml/iterparse.pxi (original)
+++ lxml/trunk/src/lxml/iterparse.pxi Wed Sep 26 12:02:45 2007
@@ -328,6 +328,14 @@
context.prepare()
# parser will not be unlocked - no other methods supported
+ property error_log:
+ """The error log of the last (or current) parser run.
+ """
+ def __get__(self):
+ cdef _ParserContext context
+ context = self._getPushParserContext()
+ return context._error_log.copy()
+
cdef _ParserContext _createContext(self, target):
return _IterparseContext()
Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi (original)
+++ lxml/trunk/src/lxml/parser.pxi Wed Sep 26 12:02:45 2007
@@ -652,14 +652,22 @@
pass
property error_log:
+ """The error log of the last parser run.
+ """
def __get__(self):
- # FIXME !!!!!!!
- return self._parser_context._error_log.copy()
+ cdef _ParserContext context
+ context = self._getParserContext()
+ return context._error_log.copy()
property resolvers:
def __get__(self):
return self._resolvers
+ property version:
+ "The version of the underlying XML parser."
+ def __get__(self):
+ return "libxml2 %d.%d.%d" % LIBXML_VERSION
+
def setElementClassLookup(self, ElementClassLookup lookup = None):
"Deprecated, use ``parser.set_element_class_lookup(lookup)`` instead."
self.set_element_class_lookup(lookup)
@@ -694,11 +702,6 @@
return _makeElement(_tag, NULL, None, self, None, None,
attrib, nsmap, _extra)
- property version:
- "The version of the underlying XML parser."
- def __get__(self):
- return "libxml2 %d.%d.%d" % LIBXML_VERSION
-
# internal parser methods
cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL:
@@ -847,6 +850,17 @@
cdef class _FeedParser(_BaseParser):
cdef bint _feed_parser_running
+ property feed_error_log:
+ """The error log of the last (or current) run of the feed parser.
+
+ Note that this is local to the feed parser and thus is
+ different from what the ``error_log`` property returns.
+ """
+ def __get__(self):
+ cdef _ParserContext context
+ context = self._getPushParserContext()
+ return context._error_log.copy()
+
def feed(self, data):
"""Feeds data to the parser. The argument should be an 8-bit string
buffer containing encoded data, although Unicode is supported as long
From scoder at codespeak.net Wed Sep 26 12:17:14 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Wed, 26 Sep 2007 12:17:14 +0200 (CEST)
Subject: [Lxml-checkins] r46892 - lxml/trunk/doc
Message-ID: <20070926101714.B5A418119@code0.codespeak.net>
Author: scoder
Date: Wed Sep 26 12:17:13 2007
New Revision: 46892
Modified:
lxml/trunk/doc/parsing.txt
Log:
doc update on parser changes: error log and target parser
Modified: lxml/trunk/doc/parsing.txt
==============================================================================
--- lxml/trunk/doc/parsing.txt (original)
+++ lxml/trunk/doc/parsing.txt Wed Sep 26 12:17:13 2007
@@ -101,6 +101,31 @@
* compact - use compact storage for short text content (on by default)
+Error log
+---------
+
+Parsers have an ``error_log`` property that lists the errors of the
+last parser run::
+
+ >>> parser = etree.XMLParser()
+ >>> print len(parser.error_log)
+ 0
+
+ >>> tree = etree.XML("<root></b>", parser)
+ Traceback (most recent call last):
+ ...
+ XMLSyntaxError: Opening and ending tag mismatch: root line 1 and b, line 1, column 11
+
+ >>> print len(parser.error_log)
+ 1
+
+ >>> error = parser.error_log[0]
+ >>> print error.message
+ Opening and ending tag mismatch: root line 1 and b
+ >>> print error.line, error.column
+ 1 11
+
+
Parsing HTML
------------
@@ -176,6 +201,42 @@
ascii
+The target parser interface
+===========================
+
+.. _`As in ElementTree`: http://effbot.org/elementtree/elementtree-xmlparser.htm
+
+`As in ElementTree`_, and similar to a SAX event handler, you can pass
+a target object to the parser::
+
+ >>> class EchoTarget:
+ ... def start(self, tag, attrib):
+ ... print "start", tag, attrib
+ ... def end(self, tag):
+ ... print "end", tag
+ ... def data(self, data):
+ ... print "data", repr(data)
+ ... def close(self):
+ ... print "close"
+ ... return "closed!"
+
+ >>> parser = etree.XMLParser(target = EchoTarget())
+
+ >>> result = etree.XML("<element>some text</element>", parser)
+ start element {}
+ data u'some text'
+ end element
+ close
+
+ >>> print result
+ closed!
+
+Note that the parser does *not* build a tree in this case. The result
+of the parser run is what the target object returns from its
+``close()`` method. If you want to return an XML tree here, you have
+to create it programmatically in the target object.
+
+
The feed parser interface
=========================
@@ -215,6 +276,31 @@
parser, a file-like object passively responds to ``read()`` requests of the
parser itself. Depending on the data source, either way may be more natural.
+Note that the feed parser has its own error log called
+``feed_error_log``. Errors in the feed parser do not show up in the
+normal ``error_log`` and vice versa.
+
+You can also combine the feed parser interface with the target parser::
+
+ >>> parser = etree.XMLParser(target = EchoTarget())
+
+ >>> parser.feed("<eleme")
+ >>> parser.feed("nt>some text</elem")
+ start element {}
+ data u'some text'
+ >>> parser.feed("ent>")
+ end element
+
+ >>> result = parser.close()
+ close
+ >>> print result
+ closed!
+
+Again, this prevents the automatic creation of an XML tree and leaves
+all the event handling to the target object. The ``close()`` method
+of the parser forwards the return value of the target's ``close()``
+method.
+
iterparse and iterwalk
======================
From scoder at codespeak.net Wed Sep 26 12:24:54 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Wed, 26 Sep 2007 12:24:54 +0200 (CEST)
Subject: [Lxml-checkins] r46893 - lxml/trunk
Message-ID: <20070926102454.0C0D38166@code0.codespeak.net>
Author: scoder
Date: Wed Sep 26 12:24:54 2007
New Revision: 46893
Modified:
lxml/trunk/CHANGES.txt
Log:
changelog
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Wed Sep 26 12:24:54 2007
@@ -9,9 +9,12 @@
--------------
* Separate ``feed_error_log`` property for the feed parser interface.
- The the normal parser interface and ``iterparse`` still use
+ The normal parser interface and ``iterparse`` continue to use
``error_log``.
+* The normal parsers and the feed parser interface are now separated
+ and can be used concurrently on the same parser instance.
+
* ``fromstringlist()`` and ``tostringlist()`` functions as in
ElementTree 1.3
From scoder at codespeak.net Wed Sep 26 12:34:42 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Wed, 26 Sep 2007 12:34:42 +0200 (CEST)
Subject: [Lxml-checkins] r46894 - in lxml/trunk: . doc
Message-ID: <20070926103442.CCEB08165@code0.codespeak.net>
Author: scoder
Date: Wed Sep 26 12:34:42 2007
New Revision: 46894
Modified:
lxml/trunk/CHANGES.txt
lxml/trunk/doc/main.txt
lxml/trunk/version.txt
Log:
2.0alpha3
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Wed Sep 26 12:34:42 2007
@@ -2,8 +2,8 @@
lxml changelog
==============
-Under development
-=================
+2.0alpha3 (2007-09-26)
+======================
Features added
--------------
Modified: lxml/trunk/doc/main.txt
==============================================================================
--- lxml/trunk/doc/main.txt (original)
+++ lxml/trunk/doc/main.txt Wed Sep 26 12:34:42 2007
@@ -138,8 +138,8 @@
.. _`lxml at the Python Package Index`: http://pypi.python.org/pypi/lxml/
.. _`this key`: pubkey.asc
-The latest version is `lxml 2.0alpha2`_, released 2007-09-15
-(`changes for 2.0alpha2`_). `Older versions`_ are listed below.
+The latest version is `lxml 2.0alpha3`_, released 2007-09-26
+(`changes for 2.0alpha3`_). `Older versions`_ are listed below.
.. _`Older versions`: #old-versions
@@ -199,6 +199,8 @@
Old Versions
------------
+* `lxml 2.0alpha2`_, released 2007-09-15 (`changes for 2.0alpha2`_)
+
* `lxml 2.0alpha1`_, released 2007-09-02 (`changes for 2.0alpha1`_)
* `lxml 1.3.4`_, released 2007-08-30 (`changes for 1.3.4`_)
@@ -247,6 +249,7 @@
* `lxml 0.5`_, released 2005-04-08
+.. _`lxml 2.0alpha3`: lxml-2.0alpha3.tgz
.. _`lxml 2.0alpha2`: lxml-2.0alpha2.tgz
.. _`lxml 2.0alpha1`: lxml-2.0alpha1.tgz
.. _`lxml 1.3.4`: lxml-1.3.4.tgz
@@ -272,6 +275,7 @@
.. _`lxml 0.5.1`: lxml-0.5.1.tgz
.. _`lxml 0.5`: lxml-0.5.tgz
+.. _`changes for 2.0alpha3`: changes-2.0alpha3.html
.. _`changes for 2.0alpha2`: changes-2.0alpha2.html
.. _`changes for 2.0alpha1`: changes-2.0alpha1.html
.. _`changes for 1.3.4`: changes-1.3.4.html
Modified: lxml/trunk/version.txt
==============================================================================
--- lxml/trunk/version.txt (original)
+++ lxml/trunk/version.txt Wed Sep 26 12:34:42 2007
@@ -1 +1 @@
-2.0alpha2
+2.0alpha3
From scoder at codespeak.net Wed Sep 26 12:43:27 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Wed, 26 Sep 2007 12:43:27 +0200 (CEST)
Subject: [Lxml-checkins] r46895 - lxml/trunk
Message-ID: <20070926104327.81B61816D@code0.codespeak.net>
Author: scoder
Date: Wed Sep 26 12:43:27 2007
New Revision: 46895
Modified:
lxml/trunk/CHANGES.txt
Log:
changelog
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Wed Sep 26 12:43:27 2007
@@ -40,6 +40,9 @@
Other changes
-------------
+* lxml.etree now emits a warning if you use XPath with libxml2 2.6.27
+ (which can crash on certain XPath errors)
+
* Type annotation in objectify now preserves the already annotated type be
default to prevent losing type information that is already there.
From scoder at codespeak.net Wed Sep 26 15:19:13 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Wed, 26 Sep 2007 15:19:13 +0200 (CEST)
Subject: [Lxml-checkins] r46898 - lxml/trunk
Message-ID: <20070926131913.043D28150@code0.codespeak.net>
Author: scoder
Date: Wed Sep 26 15:19:12 2007
New Revision: 46898
Modified:
lxml/trunk/CHANGES.txt
Log:
typo
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Wed Sep 26 15:19:12 2007
@@ -43,7 +43,7 @@
* lxml.etree now emits a warning if you use XPath with libxml2 2.6.27
(which can crash on certain XPath errors)
-* Type annotation in objectify now preserves the already annotated type be
+* Type annotation in objectify now preserves the already annotated type by
default to prevent losing type information that is already there.
From scoder at codespeak.net Wed Sep 26 15:43:07 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Wed, 26 Sep 2007 15:43:07 +0200 (CEST)
Subject: [Lxml-checkins] r46899 - lxml/trunk/src/lxml
Message-ID: <20070926134307.169098169@code0.codespeak.net>
Author: scoder
Date: Wed Sep 26 15:43:05 2007
New Revision: 46899
Modified:
lxml/trunk/src/lxml/parser.pxi
Log:
cleanup
Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi (original)
+++ lxml/trunk/src/lxml/parser.pxi Wed Sep 26 15:43:05 2007
@@ -531,14 +531,14 @@
cdef class _BaseParser:
cdef ElementClassLookup _class_lookup
cdef _ResolverRegistry _resolvers
- cdef int _parse_options
- cdef object _filename
cdef _ParserContext _parser_context
cdef _ParserContext _push_parser_context
- cdef object _target
+ cdef int _parse_options
cdef bint _for_html
cdef bint _remove_comments
cdef bint _remove_pis
+ cdef object _filename
+ cdef object _target
cdef object _default_encoding
cdef int _default_encoding_int
From scoder at codespeak.net Wed Sep 26 16:39:09 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Wed, 26 Sep 2007 16:39:09 +0200 (CEST)
Subject: [Lxml-checkins] r46901 - in lxml/trunk: . src/lxml src/lxml/tests
Message-ID: <20070926143909.B9AF7817A@code0.codespeak.net>
Author: scoder
Date: Wed Sep 26 16:39:09 2007
New Revision: 46901
Modified:
lxml/trunk/CHANGES.txt
lxml/trunk/src/lxml/parser.pxi
lxml/trunk/src/lxml/tests/test_elementtree.py
Log:
AttributeError on parse errors in feed parser
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Wed Sep 26 16:39:09 2007
@@ -2,6 +2,21 @@
lxml changelog
==============
+Under development
+=================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* AttributeError in feed parser on parse errors
+
+Other changes
+-------------
+
+
2.0alpha3 (2007-09-26)
======================
Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi (original)
+++ lxml/trunk/src/lxml/parser.pxi Wed Sep 26 16:39:09 2007
@@ -935,7 +935,7 @@
try:
context._handleParseResult(self, pctxt.myDoc, None)
finally:
- context._cleanup()
+ context.cleanup()
def close(self):
"""Terminates feeding data to this parser. This tells the parser to
Modified: lxml/trunk/src/lxml/tests/test_elementtree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_elementtree.py (original)
+++ lxml/trunk/src/lxml/tests/test_elementtree.py Wed Sep 26 16:39:09 2007
@@ -2799,29 +2799,32 @@
self.assertEquals(root[0].get("test"), "works")
def test_feed_parser_error_close_empty(self):
+ ParseError = self.etree.ParseError
parser = self.etree.XMLParser()
- self.assertRaises(Exception, parser.close)
+ self.assertRaises(ParseError, parser.close)
def test_feed_parser_error_close_incomplete(self):
+ ParseError = self.etree.ParseError
parser = self.etree.XMLParser()
parser.feed('<><><><><><')
+ except ParseError:
# can raise, but not required before close()
pass
- self.assertRaises(Exception, parser.close)
+ self.assertRaises(ParseError, parser.close)
# parser target interface
From lxml-checkins at codespeak.net Wed Sep 26 18:43:54 2007
From: lxml-checkins at codespeak.net (Viagra.com Inc)
Date: Wed, 26 Sep 2007 18:43:54 +0200 (CEST)
Subject: [Lxml-checkins] September 70% OFF
Message-ID: <20070926074600.13862.qmail@AOrleans-258-1-116-100.w90-21.abo.wanadoo.fr>
An HTML attachment was scrubbed...
URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20070926/353e4972/attachment-0001.htm
From scoder at codespeak.net Thu Sep 27 09:20:20 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Thu, 27 Sep 2007 09:20:20 +0200 (CEST)
Subject: [Lxml-checkins] r46916 - lxml/trunk/src/lxml
Message-ID: <20070927072020.30D4F809D@code0.codespeak.net>
Author: scoder
Date: Thu Sep 27 09:20:19 2007
New Revision: 46916
Modified:
lxml/trunk/src/lxml/parser.pxi
Log:
docs, pass NULL to parse result handler where we expect an error anyway
Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi (original)
+++ lxml/trunk/src/lxml/parser.pxi Thu Sep 27 09:20:19 2007
@@ -866,13 +866,16 @@
buffer containing encoded data, although Unicode is supported as long
as both string types are not mixed.
- This is the main entry point to the consumer interface of a parser.
- The parser will parse as much of the XML stream as it can on each
- call. To finish parsing, call the ``close()`` method.
-
- It is not possible to use the parser in any other way after calling
- the ``feed()`` method. The parser can only be reset by calling
- ``close()``.
+ This is the main entry point to the consumer interface of a
+ parser. The parser will parse as much of the XML stream as it
+ can on each call. To finish parsing or to reset the parser,
+ call the ``close()`` method. Both methods may raise
+ ParseError if errors occur in the input data. If an error is
+ raised, there is no longer a need to call ``close()``.
+
+ The feed parser interface is independent of the normal parser
+ usage. You can use the same parser as a feed parser and in
+ the ``parse()`` function concurrently.
"""
cdef _ParserContext context
cdef xmlparser.xmlParserCtxt* pctxt
@@ -933,7 +936,7 @@
if error:
self._feed_parser_running = 0
try:
- context._handleParseResult(self, pctxt.myDoc, None)
+ context._handleParseResult(self, NULL, None)
finally:
context.cleanup()
From scoder at codespeak.net Thu Sep 27 09:21:11 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Thu, 27 Sep 2007 09:21:11 +0200 (CEST)
Subject: [Lxml-checkins] r46917 - lxml/trunk/src/lxml/html
Message-ID: <20070927072111.2C23A8170@code0.codespeak.net>
Author: scoder
Date: Thu Sep 27 09:21:11 2007
New Revision: 46917
Modified:
lxml/trunk/src/lxml/html/__init__.py
Log:
some cleanup, duplicate HTMLParser class to support parser options
Modified: lxml/trunk/src/lxml/html/__init__.py
==============================================================================
--- lxml/trunk/src/lxml/html/__init__.py (original)
+++ lxml/trunk/src/lxml/html/__init__.py Thu Sep 27 09:21:11 2007
@@ -404,9 +404,10 @@
return HtmlEntity
# Otherwise normal lookup
return None
-
-html_parser = etree.HTMLParser()
+################################################################################
+# parsing
+################################################################################
def document_fromstring(html, **kw):
value = etree.HTML(html, html_parser, **kw)
@@ -528,14 +529,16 @@
body.tag = 'span'
return body
-def parse(filename, **kw):
+def parse(filename, parser=None, **kw):
"""
Parse a filename, URL, or file-like object into an HTML document.
You may pass the keyword argument ``base_url='http://...'`` to set
the base URL.
"""
- return etree.parse(filename, html_parser, **kw)
+ if parser is None:
+ parser = html_parser
+ return etree.parse(filename, parser, **kw)
def _contains_block_level_tag(el):
# FIXME: I could do this with XPath, but would that just be
@@ -553,9 +556,9 @@
else:
return el.tag
-def Element(*args, **kw):
- v = html_parser.makeelement(*args, **kw)
- return v
+################################################################################
+# form handling
+################################################################################
class FormElement(HtmlElement):
"""
@@ -1257,5 +1260,15 @@
################################################################################
# configure Element class lookup
+################################################################################
+
+class HTMLParser(etree.HTMLParser):
+ def __init__(self, **kwargs):
+ super(HTMLParser, self).__init__(**kwargs)
+ self.setElementClassLookup(HtmlElementClassLookup())
+
+def Element(*args, **kw):
+ v = html_parser.makeelement(*args, **kw)
+ return v
-html_parser.setElementClassLookup(HtmlElementClassLookup())
+html_parser = HTMLParser()
From scoder at codespeak.net Thu Sep 27 09:21:43 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Thu, 27 Sep 2007 09:21:43 +0200 (CEST)
Subject: [Lxml-checkins] r46918 - lxml/trunk/src/lxml
Message-ID: <20070927072143.460BC815E@code0.codespeak.net>
Author: scoder
Date: Thu Sep 27 09:21:42 2007
New Revision: 46918
Modified:
lxml/trunk/src/lxml/xslt.pxi
Log:
release GIL when serialising XSLT result
Modified: lxml/trunk/src/lxml/xslt.pxi
==============================================================================
--- lxml/trunk/src/lxml/xslt.pxi (original)
+++ lxml/trunk/src/lxml/xslt.pxi Thu Sep 27 09:21:42 2007
@@ -508,6 +508,7 @@
cdef XSLT _xslt
cdef _Document _profile
cdef _saveToStringAndSize(self, char** s, int* l):
+ cdef python.PyThreadState* state
cdef _Document doc
cdef int r
if self._context_node is not None:
@@ -517,7 +518,9 @@
if doc is None:
s[0] = NULL
return
+ state = python.PyEval_SaveThread()
r = xslt.xsltSaveResultToString(s, l, doc._c_doc, self._xslt._c_style)
+ python.PyEval_RestoreThread(state)
if r == -1:
raise XSLTSaveError, "Error saving XSLT result to string"
From scoder at codespeak.net Thu Sep 27 20:46:42 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Thu, 27 Sep 2007 20:46:42 +0200 (CEST)
Subject: [Lxml-checkins] r46973 - in lxml/trunk: . src/lxml src/lxml/tests
Message-ID: <20070927184642.BAC9F8171@code0.codespeak.net>
Author: scoder
Date: Thu Sep 27 20:46:41 2007
New Revision: 46973
Modified:
lxml/trunk/CHANGES.txt
lxml/trunk/src/lxml/apihelpers.pxi
lxml/trunk/src/lxml/tests/test_unicode.py
Log:
avoid rejecting valid HTML names when validating XML tag names
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Thu Sep 27 20:46:41 2007
@@ -16,6 +16,10 @@
Other changes
-------------
+* lxml.etree no longer validates unicode characters in tag names, to
+ avoid rejecting valid HTML tags. Only special characters such as ':',
+ '&', '<', '>', '/' and whitespace are rejected.
+
2.0alpha3 (2007-09-26)
======================
Modified: lxml/trunk/src/lxml/apihelpers.pxi
==============================================================================
--- lxml/trunk/src/lxml/apihelpers.pxi (original)
+++ lxml/trunk/src/lxml/apihelpers.pxi Thu Sep 27 20:46:41 2007
@@ -791,7 +791,23 @@
return _xmlNameIsValid(_cstr(name_utf8))
cdef int _xmlNameIsValid(char* c_name):
- return tree.xmlValidateNCName(c_name, 0) == 0
+ #return tree.xmlValidateNCName(c_name, 0) == 0
+ if c_name is NULL or c_name[0] == c'\0':
+ return 0
+ while c_name[0] != c'\0':
+ if c_name[0] == c':' or \
+ c_name[0] == c'&' or \
+ c_name[0] == c'<' or \
+ c_name[0] == c'>' or \
+ c_name[0] == c'/' or \
+ c_name[0] == c'\x09' or \
+ c_name[0] == c'\x0A' or \
+ c_name[0] == c'\x0B' or \
+ c_name[0] == c'\x0C' or \
+ c_name[0] == c'\x20':
+ return 0
+ c_name = c_name + 1
+ return 1
cdef int _tagValidOrRaise(tag_utf) except -1:
if not _pyXmlNameIsValid(tag_utf):
Modified: lxml/trunk/src/lxml/tests/test_unicode.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_unicode.py (original)
+++ lxml/trunk/src/lxml/tests/test_unicode.py Thu Sep 27 20:46:41 2007
@@ -5,7 +5,9 @@
ascii_uni = u'a'
-klingon = u"\uF8D2" # not valid for XML names
+# klingon = u"\uF8D2" # not valid for XML names
+
+invalid_tag = "\u0680:\u3120"
uni = u'?\u0680\u3120' # some non-ASCII characters
@@ -27,7 +29,7 @@
def test_unicode_tag_invalid(self):
# sadly, Klingon is not well-formed
- self.assertRaises(ValueError, etree.Element, klingon)
+ self.assertRaises(ValueError, etree.Element, invalid_tag)
def test_unicode_nstag(self):
tag = u"{%s}%s" % (uni, uni)
@@ -36,8 +38,8 @@
def test_unicode_nstag_invalid(self):
# sadly, Klingon is not well-formed
- tag = u"{%s}%s" % (uni, klingon)
- self.assertRaises(ValueError, etree.Element, klingon)
+ tag = u"{%s}%s" % (uni, invalid_tag)
+ self.assertRaises(ValueError, etree.Element, tag)
def test_unicode_qname(self):
qname = etree.QName(uni, uni)
@@ -46,7 +48,7 @@
self.assertEquals(unicode(qname), tag)
def test_unicode_qname_invalid(self):
- self.assertRaises(ValueError, etree.QName, klingon)
+ self.assertRaises(ValueError, etree.QName, invalid_tag)
def test_unicode_attr(self):
el = etree.Element('foo', {'bar': uni})
From scoder at codespeak.net Thu Sep 27 22:21:41 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Thu, 27 Sep 2007 22:21:41 +0200 (CEST)
Subject: [Lxml-checkins] r46978 - lxml/trunk/src/lxml
Message-ID: <20070927202141.1EBE7819E@code0.codespeak.net>
Author: scoder
Date: Thu Sep 27 22:21:40 2007
New Revision: 46978
Modified:
lxml/trunk/src/lxml/xpath.pxi
Log:
changed 'int' to 'bint' C type where boolean is meant
Modified: lxml/trunk/src/lxml/xpath.pxi
==============================================================================
--- lxml/trunk/src/lxml/xpath.pxi (original)
+++ lxml/trunk/src/lxml/xpath.pxi Thu Sep 27 22:21:40 2007
@@ -91,7 +91,7 @@
cdef void _setupDict(self, xpath.xmlXPathContext* xpathCtxt):
__GLOBAL_PARSER_CONTEXT.initXPathParserDict(xpathCtxt)
-cdef int _XPATH_VERSION_WARNING_REQUIRED
+cdef bint _XPATH_VERSION_WARNING_REQUIRED
if _LIBXML_VERSION_INT == 20627:
_XPATH_VERSION_WARNING_REQUIRED = 1
else:
@@ -137,7 +137,7 @@
"""
return self(_eval_arg, **_variables)
- cdef int _checkAbsolutePath(self, char* path):
+ cdef bint _checkAbsolutePath(self, char* path):
cdef char c
if path is NULL:
return 0
From scoder at codespeak.net Thu Sep 27 22:22:18 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Thu, 27 Sep 2007 22:22:18 +0200 (CEST)
Subject: [Lxml-checkins] r46979 - lxml/trunk/benchmark
Message-ID: <20070927202218.86181819E@code0.codespeak.net>
Author: scoder
Date: Thu Sep 27 22:22:18 2007
New Revision: 46979
Modified:
lxml/trunk/benchmark/bench_etree.py
Log:
iterfind() benchmarks, set_text() fix
Modified: lxml/trunk/benchmark/bench_etree.py
==============================================================================
--- lxml/trunk/benchmark/bench_etree.py (original)
+++ lxml/trunk/benchmark/bench_etree.py Thu Sep 27 22:22:18 2007
@@ -5,6 +5,9 @@
import benchbase
from benchbase import with_attributes, with_text, onlylib, serialized, children
+TEXT = "some ASCII text"
+UTEXT = u"some klingon: \F8D2"
+
############################################################
# Benchmarks
############################################################
@@ -299,5 +302,17 @@
def bench_findall_tag(self, root):
root.findall(".//" + self.SEARCH_TAG)
+ @onlylib('lxe')
+ def bench_iterfind(self, root):
+ list(root.iterfind(".//*"))
+
+ @onlylib('lxe')
+ def bench_iterfind_tag(self, root):
+ list(root.iterfind(".//" + self.SEARCH_TAG))
+
+ @onlylib('lxe')
+ def bench_iterfind_islice(self, root):
+ list(islice(root.iterfind(".//*"), 10, 110))
+
if __name__ == '__main__':
benchbase.main(BenchMark)
From scoder at codespeak.net Thu Sep 27 22:23:19 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Thu, 27 Sep 2007 22:23:19 +0200 (CEST)
Subject: [Lxml-checkins] r46980 - lxml/trunk/doc
Message-ID: <20070927202319.50333819E@code0.codespeak.net>
Author: scoder
Date: Thu Sep 27 22:23:18 2007
New Revision: 46980
Modified:
lxml/trunk/doc/tutorial.txt
Log:
tutorial example for el.index()
Modified: lxml/trunk/doc/tutorial.txt
==============================================================================
--- lxml/trunk/doc/tutorial.txt (original)
+++ lxml/trunk/doc/tutorial.txt Thu Sep 27 22:23:18 2007
@@ -125,6 +125,9 @@
>>> print len(root)
3
+ >>> root.index(root[1]) # lxml.etree only!
+ 1
+
>>> children = list(root)
>>> for child in root:
From scoder at codespeak.net Fri Sep 28 12:43:23 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Fri, 28 Sep 2007 12:43:23 +0200 (CEST)
Subject: [Lxml-checkins] r47002 - lxml/trunk/benchmark
Message-ID: <20070928104323.2D6858176@code0.codespeak.net>
Author: scoder
Date: Fri Sep 28 12:43:22 2007
New Revision: 47002
Modified:
lxml/trunk/benchmark/bench_etree.py
Log:
new benchmark
Modified: lxml/trunk/benchmark/bench_etree.py
==============================================================================
--- lxml/trunk/benchmark/bench_etree.py (original)
+++ lxml/trunk/benchmark/bench_etree.py Fri Sep 28 12:43:22 2007
@@ -299,6 +299,9 @@
def bench_findall(self, root):
root.findall(".//*")
+ def bench_findall_child(self, root):
+ root.findall(".//*/" + self.SEARCH_TAG)
+
def bench_findall_tag(self, root):
root.findall(".//" + self.SEARCH_TAG)
From scoder at codespeak.net Fri Sep 28 22:24:44 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Fri, 28 Sep 2007 22:24:44 +0200 (CEST)
Subject: [Lxml-checkins] r47005 - lxml/trunk/benchmark
Message-ID: <20070928202444.82559815A@code0.codespeak.net>
Author: scoder
Date: Fri Sep 28 22:24:43 2007
New Revision: 47005
Modified:
lxml/trunk/benchmark/bench_etree.py
Log:
findall() benchmark with a longer path
Modified: lxml/trunk/benchmark/bench_etree.py
==============================================================================
--- lxml/trunk/benchmark/bench_etree.py (original)
+++ lxml/trunk/benchmark/bench_etree.py Fri Sep 28 22:24:43 2007
@@ -305,6 +305,9 @@
def bench_findall_tag(self, root):
root.findall(".//" + self.SEARCH_TAG)
+ def bench_findall_path(self, root):
+ root.findall(".//*[%s]/./%s/./*" % (self.SEARCH_TAG, self.SEARCH_TAG))
+
@onlylib('lxe')
def bench_iterfind(self, root):
list(root.iterfind(".//*"))
From scoder at codespeak.net Fri Sep 28 22:25:22 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Fri, 28 Sep 2007 22:25:22 +0200 (CEST)
Subject: [Lxml-checkins] r47006 - lxml/trunk/src/lxml
Message-ID: <20070928202522.3A719815A@code0.codespeak.net>
Author: scoder
Date: Fri Sep 28 22:25:21 2007
New Revision: 47006
Modified:
lxml/trunk/src/lxml/_elementpath.py
Log:
some cleanup and optimisations in _elementpath.py
Modified: lxml/trunk/src/lxml/_elementpath.py
==============================================================================
--- lxml/trunk/src/lxml/_elementpath.py (original)
+++ lxml/trunk/src/lxml/_elementpath.py Fri Sep 28 22:25:21 2007
@@ -65,24 +65,22 @@
def prepare_tag(next, token):
tag = token[1]
- def select(context, result):
+ def select(result):
for elem in result:
- for e in elem:
- if e.tag == tag:
- yield e
+ for e in elem.iterchildren(tag=tag):
+ yield e
return select
def prepare_star(next, token):
- def select(context, result):
+ def select(result):
for elem in result:
for e in elem:
yield e
return select
def prepare_dot(next, token):
- def select(context, result):
- for elem in result:
- yield elem
+ def select(result):
+ return result
return select
def prepare_iter(next, token):
@@ -93,24 +91,18 @@
tag = token[1]
else:
raise SyntaxError
- def select(context, result):
+ def select(result):
for elem in result:
- for e in elem.iter(tag):
- if e is not elem:
- yield e
+ for e in elem.iterdescendants(tag=tag):
+ yield e
return select
def prepare_dot_dot(next, token):
- def select(context, result):
- parent_map = context.parent_map
- if parent_map is None:
- context.parent_map = parent_map = {}
- for p in context.root.iter():
- for e in p:
- parent_map[e] = p
+ def select(result):
for elem in result:
- if elem in parent_map:
- yield parent_map[elem]
+ parent = elem.getparent()
+ if parent is not None:
+ yield parent
return select
def prepare_predicate(next, token):
@@ -124,7 +116,7 @@
key = token[1]
token = next()
if token[0] == "]":
- def select(context, result):
+ def select(result):
for elem in result:
if elem.get(key) is not None:
yield elem
@@ -135,7 +127,7 @@
else:
raise SyntaxError("invalid comparision target")
token = next()
- def select(context, result):
+ def select(result):
for elem in result:
if elem.get(key) == value:
yield elem
@@ -146,10 +138,13 @@
token = next()
if token[0] != "]":
raise SyntaxError("invalid node predicate")
- def select(context, result):
+ def select(result):
for elem in result:
- if elem.find(tag) is not None:
+ try:
+ elem.iterdescendants(tag).next()
yield elem
+ except StopIteration:
+ pass
else:
raise SyntaxError("invalid predicate")
return select
@@ -165,13 +160,46 @@
_cache = {}
-class _SelectorContext:
- parent_map = None
- def __init__(self, root):
- self.root = root
-
# --------------------------------------------------------------------
+def _build_path_iterator(path):
+ # compile selector pattern
+ try:
+ return _cache[path]
+ except KeyError:
+ pass
+ if len(_cache) > 100:
+ _cache.clear()
+
+ if path[:1] == "/":
+ raise SyntaxError("cannot use absolute path on element")
+ stream = iter(xpath_tokenizer(path))
+ next = stream.next; token = next()
+ selector = []
+ while 1:
+ try:
+ selector.append(ops[token[0]](next, token))
+ except StopIteration:
+ raise SyntaxError("invalid path")
+ try:
+ token = next()
+ if token[0] == "/":
+ token = next()
+ except StopIteration:
+ break
+ return selector
+
+##
+# Iterate over the matching nodes
+
+def iterfind(elem, path):
+ # execute selector pattern
+ selector = _build_path_iterator(path)
+ result = iter((elem,))
+ for select in selector:
+ result = select(result)
+ return result
+
##
# Find first matching object.
@@ -187,37 +215,6 @@
def findall(elem, path):
return list(iterfind(elem, path))
-def iterfind(elem, path):
- # compile selector pattern
- try:
- selector = _cache[path]
- except KeyError:
- if len(_cache) > 100:
- _cache.clear()
- if path[:1] == "/":
- raise SyntaxError("cannot use absolute path on element")
- stream = iter(xpath_tokenizer(path))
- next = stream.next; token = next()
- selector = []
- while 1:
- try:
- selector.append(ops[token[0]](next, token))
- except StopIteration:
- raise SyntaxError("invalid path")
- try:
- token = next()
- if token[0] == "/":
- token = next()
- except StopIteration:
- break
- _cache[path] = selector
- # execute selector pattern
- result = [elem]
- context = _SelectorContext(elem)
- for select in selector:
- result = select(context, result)
- return result
-
##
# Find text for first matching object.
From scoder at codespeak.net Fri Sep 28 22:35:59 2007
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Fri, 28 Sep 2007 22:35:59 +0200 (CEST)
Subject: [Lxml-checkins] r47007 - lxml/trunk/src/lxml/html/tests
Message-ID: <20070928203559.8A6138171@code0.codespeak.net>
Author: scoder
Date: Fri Sep 28 22:35:59 2007
New Revision: 47007
Added:
lxml/trunk/src/lxml/html/tests/test_elementsoup.py
Log:
run ElementSoup doctest
Added: lxml/trunk/src/lxml/html/tests/test_elementsoup.py
==============================================================================
--- (empty file)
+++ lxml/trunk/src/lxml/html/tests/test_elementsoup.py Fri Sep 28 22:35:59 2007
@@ -0,0 +1,10 @@
+import unittest
+from lxml.tests.common_imports import doctest
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([doctest.DocFileSuite('../../../../doc/elementsoup.txt')])
+ return suite
+
+if __name__ == '__main__':
+ unittest.main()
From faassen at codespeak.net Sun Sep 30 09:11:44 2007
From: faassen at codespeak.net (faassen at codespeak.net)
Date: Sun, 30 Sep 2007 09:11:44 +0200 (CEST)
Subject: [Lxml-checkins] r47029 - lxml/trunk
Message-ID: <20070930071144.B7D3D80EE@code0.codespeak.net>
Author: faassen
Date: Sun Sep 30 09:11:43 2007
New Revision: 47029
Modified:
lxml/trunk/CREDITS.txt
Log:
Add Ian to credits.
Modified: lxml/trunk/CREDITS.txt
==============================================================================
--- lxml/trunk/CREDITS.txt (original)
+++ lxml/trunk/CREDITS.txt Sun Sep 30 09:11:43 2007
@@ -5,6 +5,8 @@
Martijn Faassen - creator of lxml and initial main developer
+Ian Bicking - lxml.html
+
Marc-Antoine Parent - XPath extension function help and patches
Olivier Grisel - improved (c)ElementTree compatibility patches,