[Lxml-checkins] r44104 - in lxml/branch/html: . doc src/lxml src/lxml/tests
ianb at codespeak.net
ianb at codespeak.net
Thu Jun 7 19:35:38 CEST 2007
Author: ianb
Date: Thu Jun 7 19:35:37 2007
New Revision: 44104
Modified:
lxml/branch/html/TODO.txt
lxml/branch/html/doc/FAQ.txt
lxml/branch/html/doc/build.txt
lxml/branch/html/doc/tutorial.txt
lxml/branch/html/src/lxml/builder.py
lxml/branch/html/src/lxml/parser.pxi
lxml/branch/html/src/lxml/tests/test_etree.py
lxml/branch/html/src/lxml/tests/test_xslt.py
Log:
svn merge -r43854:HEAD http://codespeak.net/svn/lxml/trunk
Modified: lxml/branch/html/TODO.txt
==============================================================================
--- lxml/branch/html/TODO.txt (original)
+++ lxml/branch/html/TODO.txt Thu Jun 7 19:35:37 2007
@@ -58,6 +58,10 @@
* clean up (and remove?) duplicated API for extension functions
+* allow (and prefer) Namespace classes local to a parser
+
+* remove first 'context' argument from extension functions
+
* find a way to integrate Schematron (if it's available)
* always use ns-prefixed type names in objectify's ``xsi:type`` attributes
@@ -68,3 +72,9 @@
* follow PEP 8 in API naming (avoidCamelCase in_favour_of_underscores)
* clean support for entities (maybe an Entity element class?)
+
+
+Changes in 2.0
+--------------
+
+* network access in parsers disabled by default
Modified: lxml/branch/html/doc/FAQ.txt
==============================================================================
--- lxml/branch/html/doc/FAQ.txt (original)
+++ lxml/branch/html/doc/FAQ.txt Thu Jun 7 19:35:37 2007
@@ -258,7 +258,12 @@
b) If you are using threads, please see the following section to check if
you touch on one of the potential pitfalls.
-c) Otherwise, we would really like to hear about it. Please report it to the
+c) Try to reproduce the problem with the latest versions of libxml2 and
+ libxslt. From time to time, bugs and race conditions are found in these
+ libraries, so a more recent version might already contain a fix for your
+ problem.
+
+d) Otherwise, we would really like to hear about it. Please report it to the
`mailing list`_ so that we can fix it. It is very helpful in this case if
you can come up with a short code snippet that demonstrates your problem.
Please also report the version of lxml, libxml2 and libxslt that you are
Modified: lxml/branch/html/doc/build.txt
==============================================================================
--- lxml/branch/html/doc/build.txt (original)
+++ lxml/branch/html/doc/build.txt Thu Jun 7 19:35:37 2007
@@ -44,7 +44,7 @@
http://codespeak.net/svn/lxml/pyrex/
A subversion checkout of lxml will automatically retrieve the latest Pyrex
- as external project source (``svn:externals``). Look out for the ``Pyrex``
+ as external project source (``svn:externals``). Look for the ``Pyrex``
directory in the source tree.
Since version 1.1.2, the lxml source distribution also includes this Pyrex
@@ -182,6 +182,26 @@
lxml maintainer.
+Providing newer library versions on Mac-OS X
+--------------------------------------------
+
+The Unix environment in Mac-OS X makes it relatively easy to install
+Unix/Linux style package management tools and new software. However, it seems
+to be hard to make a newer build of a library take exclusive precedence when
+Mac-OS X already ships an older version of it. The result can be segfaults on
+this platform that are hard to track down.
+
+To make sure the newer libxml2 and libxslt versions are used (e.g. under
+fink), you should add the directory where you installed the libraries to the
+``DYLD_LIBRARY_PATH`` environment variable. This seems to fix a lot of
+problems for users.
+
+Alternatively, you can build lxml statically. A way to do this on MS Windows
+is described in the next section, but it should be easy to adapt it for
+Mac-OS. That way, you can always be sure you use the versions you compiled
+lxml with, regardless of the runtime environment.
+
+
Static linking on Windows
-------------------------
Modified: lxml/branch/html/doc/tutorial.txt
==============================================================================
--- lxml/branch/html/doc/tutorial.txt (original)
+++ lxml/branch/html/doc/tutorial.txt Thu Jun 7 19:35:37 2007
@@ -329,11 +329,11 @@
.. _`further iterators`: api.html#iteration
-
-
The ElementTree class
=====================
+An ``ElementTree`` is mainly a wrapper around a tree with a root node.
+
Parsing files and XML literals
==============================
Modified: lxml/branch/html/src/lxml/builder.py
==============================================================================
--- lxml/branch/html/src/lxml/builder.py (original)
+++ lxml/branch/html/src/lxml/builder.py Thu Jun 7 19:35:37 2007
@@ -16,9 +16,6 @@
return lambda *args, **kwargs: func(tag, *args, **kwargs)
-class _C:
- pass
-
class ElementMaker(object):
"""Element generator factory.
@@ -97,7 +94,12 @@
</html>
"""
- def __init__(self, typemap=None):
+ def __init__(self, typemap=None, parser=None):
+ if parser is not None:
+ self._makeelement = parser.makeelement
+ else:
+ self._makeelement = ET.Element
+
# initialize type map for this element factory
if typemap:
@@ -121,20 +123,12 @@
attrib[k] = typemap[type(v)](None, v)
typemap[dict] = add_dict
- def add_elem(elem, item):
- elem.append(item)
- t = type(ET.Element("tag"))
- if t is not type(_C()):
- typemap[t] = add_elem
-
self._typemap = typemap
- # print typemap
-
def __call__(self, tag, *children, **attrib):
get = self._typemap.get
- elem = ET.Element(tag)
+ elem = self._makeelement(tag)
if attrib:
get(dict)(elem, attrib)
Modified: lxml/branch/html/src/lxml/parser.pxi
==============================================================================
--- lxml/branch/html/src/lxml/parser.pxi (original)
+++ lxml/branch/html/src/lxml/parser.pxi Thu Jun 7 19:35:37 2007
@@ -665,8 +665,9 @@
cdef int _XML_DEFAULT_PARSE_OPTIONS
_XML_DEFAULT_PARSE_OPTIONS = (
- xmlparser.XML_PARSE_NOENT |
+ xmlparser.XML_PARSE_NOENT |
xmlparser.XML_PARSE_NOCDATA |
+ xmlparser.XML_PARSE_NONET |
xmlparser.XML_PARSE_COMPACT
)
@@ -685,19 +686,19 @@
* attribute_defaults - read default attributes from DTD
* dtd_validation - validate (if DTD is available)
* load_dtd - use DTD for parsing
- * no_network - prevent network access
+ * no_network - prevent network access (default: True)
* ns_clean - clean up redundant namespace declarations
* recover - try hard to parse through broken XML
* remove_blank_text - discard blank text nodes
- * compact - safe memory for short text content (default: on)
- * resolve_entities - replace entities by their text value (default: on)
+ * compact - save memory for short text content (default: True)
+ * resolve_entities - replace entities by their text value (default: True)
Note that you should avoid sharing parsers between threads. While this is
not harmful, it is more efficient to use separate parsers. This does not
apply to the default parser.
"""
def __init__(self, attribute_defaults=False, dtd_validation=False,
- load_dtd=False, no_network=False, ns_clean=False,
+ load_dtd=False, no_network=True, ns_clean=False,
recover=False, remove_blank_text=False, compact=True,
resolve_entities=True):
cdef int parse_options
@@ -712,14 +713,14 @@
if attribute_defaults:
parse_options = parse_options | xmlparser.XML_PARSE_DTDATTR | \
xmlparser.XML_PARSE_DTDLOAD
- if no_network:
- parse_options = parse_options | xmlparser.XML_PARSE_NONET
if ns_clean:
parse_options = parse_options | xmlparser.XML_PARSE_NSCLEAN
if recover:
parse_options = parse_options | xmlparser.XML_PARSE_RECOVER
if remove_blank_text:
parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS
+ if not no_network:
+ parse_options = parse_options ^ xmlparser.XML_PARSE_NONET
if not compact:
parse_options = parse_options ^ xmlparser.XML_PARSE_COMPACT
if not resolve_entities:
@@ -777,7 +778,15 @@
__GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER)
-def setDefaultParser(_BaseParser parser=None):
+def setDefaultParser(parser):
+ "Deprecated, please use set_default_parser instead."
+ set_default_parser(parser)
+
+def getDefaultParser():
+ "Deprecated, please use get_default_parser instead."
+ return get_default_parser()
+
+def set_default_parser(_BaseParser parser=None):
"""Set a default parser for the current thread. This parser is used
globally whenever no parser is supplied to the various parse functions of
the lxml API. If this function is called without a parser (or if it is
@@ -791,24 +800,19 @@
parser = __DEFAULT_XML_PARSER
__GLOBAL_PARSER_CONTEXT.setDefaultParser(parser)
-def getDefaultParser():
- return __GLOBAL_PARSER_CONTEXT.getDefaultParser()
-
-def set_default_parser(parser):
- "Deprecated, please use setDefaultParser instead."
- setDefaultParser(parser)
-
def get_default_parser():
- "Deprecated, please use getDefaultParser instead."
- return getDefaultParser()
+ return __GLOBAL_PARSER_CONTEXT.getDefaultParser()
############################################################
## HTML parser
############################################################
cdef int _HTML_DEFAULT_PARSE_OPTIONS
-_HTML_DEFAULT_PARSE_OPTIONS = \
+_HTML_DEFAULT_PARSE_OPTIONS = (
+ htmlparser.HTML_PARSE_RECOVER |
+ htmlparser.HTML_PARSE_NONET |
htmlparser.HTML_PARSE_COMPACT
+ )
cdef class HTMLParser(_BaseParser):
"""The HTML parser. This parser allows reading HTML into a normal XML
@@ -817,25 +821,25 @@
Available boolean keyword arguments:
* recover - try hard to parse through broken HTML (default: True)
- * no_network - prevent network access
+ * no_network - prevent network access (default: True)
* remove_blank_text - discard empty text nodes
- * compact - safe memory for short text content (default: on)
+ * compact - save memory for short text content (default: True)
- Note that you should avoid sharing parsers between threads for parformance
+ Note that you should avoid sharing parsers between threads for performance
reasons.
"""
- def __init__(self, recover=True, no_network=False, remove_blank_text=False,
+ def __init__(self, recover=True, no_network=True, remove_blank_text=False,
compact=True):
cdef int parse_options
_BaseParser.__init__(self)
parse_options = _HTML_DEFAULT_PARSE_OPTIONS
- if recover:
- parse_options = parse_options | htmlparser.HTML_PARSE_RECOVER
- if no_network:
- parse_options = parse_options | htmlparser.HTML_PARSE_NONET
if remove_blank_text:
parse_options = parse_options | htmlparser.HTML_PARSE_NOBLANKS
+ if not recover:
+ parse_options = parse_options ^ htmlparser.HTML_PARSE_RECOVER
+ if not no_network:
+ parse_options = parse_options ^ htmlparser.HTML_PARSE_NONET
if not compact:
parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT
Modified: lxml/branch/html/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/branch/html/src/lxml/tests/test_etree.py (original)
+++ lxml/branch/html/src/lxml/tests/test_etree.py Thu Jun 7 19:35:37 2007
@@ -390,22 +390,23 @@
xml = u'<!DOCTYPE doc SYSTEM "test"><doc>&myentity;</doc>'
self.assertRaises(_LocalException, parse, StringIO(xml), parser)
- def test_entity(self):
- parse = self.etree.parse
- tostring = self.etree.tostring
- parser = self.etree.XMLParser(resolve_entities=False)
- Entity = self.etree.Entity
+ if etree.LIBXML_VERSION > (2,6,20):
+ def test_entity_parse(self):
+ parse = self.etree.parse
+ tostring = self.etree.tostring
+ parser = self.etree.XMLParser(resolve_entities=False)
+ Entity = self.etree.Entity
+
+ xml = '<!DOCTYPE doc SYSTEM "test"><doc>&myentity;</doc>'
+ tree = parse(StringIO(xml), parser)
+ root = tree.getroot()
+ self.assertEquals(root[0].tag, Entity)
+ self.assertFalse(root[0].text)
+ self.assertEquals(root[0].tail, None)
+ self.assertEquals(root[0].name, "myentity")
- xml = '<!DOCTYPE doc SYSTEM "test"><doc>&myentity;</doc>'
- tree = parse(StringIO(xml), parser)
- root = tree.getroot()
- self.assertEquals(root[0].tag, Entity)
- self.assertFalse(root[0].text)
- self.assertEquals(root[0].tail, None)
- self.assertEquals(root[0].name, "myentity")
-
- self.assertEquals('<doc>&myentity;</doc>',
- tostring(root))
+ self.assertEquals('<doc>&myentity;</doc>',
+ tostring(root))
def test_entity_append(self):
Entity = self.etree.Entity
Modified: lxml/branch/html/src/lxml/tests/test_xslt.py
==============================================================================
--- lxml/branch/html/src/lxml/tests/test_xslt.py (original)
+++ lxml/branch/html/src/lxml/tests/test_xslt.py Thu Jun 7 19:35:37 2007
@@ -37,19 +37,18 @@
def test_xslt_input_none(self):
self.assertRaises(TypeError, etree.XSLT, None)
- def test_xslt_invalid_stylesheet(self):
- if etree.LIBXSLT_VERSION < (1,1,15):
- return # no error from libxslt?
-
- style = self.parse('''\
+ if False and etree.LIBXSLT_VERSION >= (1,1,15):
+ # earlier versions generate no error
+ if etree.LIBXSLT_VERSION > (1,1,17):
+ def test_xslt_invalid_stylesheet(self):
+ style = self.parse('''\
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
- <xsl:template match="/">
- <xsl:template />
- </xsl:template>
+ <xsl:stylesheet />
</xsl:stylesheet>''')
- self.assertRaises(etree.XSLTParseError, etree.XSLT, style)
+ self.assertRaises(
+ etree.XSLTParseError, etree.XSLT, style)
def test_xslt_utf8(self):
tree = self.parse(u'<a><b>\uF8D2</b><c>\uF8D2</c></a>')
@@ -242,13 +241,12 @@
''',
st.tostring(res))
- def test_xslt_parameter_missing(self):
- # DISABLED - NOT RELIABLE!
- if etree.LIBXSLT_VERSION >= (1,1,18):
- return # no error from libxslt?
- # apply() without needed parameter will lead to XSLTApplyError
- tree = self.parse('<a><b>B</b><c>C</c></a>')
- style = self.parse('''\
+ if etree.LIBXSLT_VERSION < (1,1,18):
+ # later versions produce no error
+ def test_xslt_parameter_missing(self):
+ # apply() without needed parameter will lead to XSLTApplyError
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ style = self.parse('''\
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:template match="/">
@@ -256,9 +254,9 @@
</xsl:template>
</xsl:stylesheet>''')
- st = etree.XSLT(style)
- self.assertRaises(etree.XSLTApplyError,
- st.apply, tree)
+ st = etree.XSLT(style)
+ self.assertRaises(etree.XSLTApplyError,
+ st.apply, tree)
def test_xslt_multiple_parameters(self):
tree = self.parse('<a><b>B</b><c>C</c></a>')
More information about the lxml-checkins
mailing list