[Lxml-checkins] r46759 - in lxml/trunk: . src/lxml src/lxml/tests
scoder at codespeak.net
scoder at codespeak.net
Thu Sep 20 14:13:50 CEST 2007
Author: scoder
Date: Thu Sep 20 14:13:48 2007
New Revision: 46759
Modified:
lxml/trunk/CHANGES.txt
lxml/trunk/src/lxml/dtd.pxi
lxml/trunk/src/lxml/iterparse.pxi
lxml/trunk/src/lxml/parser.pxi
lxml/trunk/src/lxml/tests/test_elementtree.py
lxml/trunk/src/lxml/tests/test_etree.py
lxml/trunk/src/lxml/tests/test_htmlparser.py
lxml/trunk/src/lxml/tree.pxd
Log:
'encoding' kw argument in parsers to override document encoding
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Thu Sep 20 14:13:48 2007
@@ -8,6 +8,9 @@
Features added
--------------
+* Parsers accept an ``encoding`` keyword argument that overrides the encoding
+ of the parsed documents.
+
* New C-API function ``hasChild()`` to test for children
* ``annotate()`` function in objectify can annotate with Python types and XSI
Modified: lxml/trunk/src/lxml/dtd.pxi
==============================================================================
--- lxml/trunk/src/lxml/dtd.pxi (original)
+++ lxml/trunk/src/lxml/dtd.pxi Thu Sep 20 14:13:48 2007
@@ -91,7 +91,7 @@
cdef _FileReaderContext dtd_parser
cdef tree.xmlDtd* c_dtd
exc_context = _ExceptionContext()
- dtd_parser = _FileReaderContext(file, exc_context)
+ dtd_parser = _FileReaderContext(file, exc_context, None, None)
c_dtd = dtd_parser._readDtd()
Modified: lxml/trunk/src/lxml/iterparse.pxi
==============================================================================
--- lxml/trunk/src/lxml/iterparse.pxi (original)
+++ lxml/trunk/src/lxml/iterparse.pxi Thu Sep 20 14:13:48 2007
@@ -239,6 +239,9 @@
* remove_blank_text - discard blank text nodes
* remove_comments - discard comments
* remove_pis - discard processing instructions
+
+ Other keyword arguments:
+ * encoding - override the document encoding
"""
cdef object _source
cdef object _filename
@@ -246,9 +249,10 @@
def __init__(self, source, events=("end",), tag=None,
attribute_defaults=False, dtd_validation=False,
load_dtd=False, no_network=True, remove_blank_text=False,
- remove_comments=False, remove_pis=False):
+ remove_comments=False, remove_pis=False, encoding=None):
cdef _IterparseContext context
cdef char* c_filename
+ cdef char* c_encoding
cdef int parse_options
if not hasattr(source, 'read'):
self._filename = _encodeFilename(source)
@@ -279,12 +283,18 @@
parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS
_BaseParser.__init__(self, parse_options, remove_comments, remove_pis,
- None)
+ None, encoding)
+
+ if self._default_encoding is None:
+ c_encoding = NULL
+ else:
+ c_encoding = _cstr(self._default_encoding)
context = <_IterparseContext>self._context
context._setEventFilter(events, tag)
xmlparser.xmlCtxtUseOptions(self._parser_ctxt, parse_options)
- xmlparser.xmlCtxtResetPush(self._parser_ctxt, NULL, 0, c_filename, NULL)
+ xmlparser.xmlCtxtResetPush(self._parser_ctxt, NULL, 0,
+ c_filename, c_encoding)
self._lockParser() # will not be unlocked - no other methods supported
cdef _ParserContext _createContext(self, target):
Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi (original)
+++ lxml/trunk/src/lxml/parser.pxi Thu Sep 20 14:13:48 2007
@@ -206,14 +206,16 @@
cdef class _FileReaderContext:
cdef object _filelike
+ cdef object _encoding
cdef object _url
cdef object _bytes
cdef _ExceptionContext _exc_context
cdef cstd.size_t _bytes_read
cdef char* _c_url
- def __init__(self, filelike, exc_context, url=None):
+ def __init__(self, filelike, exc_context, url, encoding):
self._exc_context = exc_context
self._filelike = filelike
+ self._encoding = encoding
self._url = url
if url is None:
self._c_url = NULL
@@ -234,15 +236,22 @@
LxmlParserType parser_type):
cdef python.PyThreadState* state
cdef xmlDoc* result
+ cdef char* c_encoding
+
+ if self._encoding is None:
+ c_encoding = NULL
+ else:
+ c_encoding = _cstr(self._encoding)
+
state = python.PyEval_SaveThread()
if parser_type == LXML_XML_PARSER:
result = xmlparser.xmlCtxtReadIO(
ctxt, _readFilelikeParser, NULL, <python.PyObject*>self,
- self._c_url, NULL, options)
+ self._c_url, c_encoding, options)
else:
result = htmlparser.htmlCtxtReadIO(
ctxt, _readFilelikeParser, NULL, <python.PyObject*>self,
- self._c_url, NULL, options)
+ self._c_url, c_encoding, options)
python.PyEval_RestoreThread(state)
return result
@@ -493,9 +502,11 @@
cdef ElementClassLookup _class_lookup
cdef python.PyThread_type_lock _parser_lock
cdef int _feed_parser_running
+ cdef object _default_encoding
def __init__(self, int parse_options, remove_comments, remove_pis,
- target):
+ target, encoding):
+ cdef int c_encoding
cdef xmlparser.xmlParserCtxt* pctxt
if isinstance(self, HTMLParser):
self._parser_type = LXML_HTML_PARSER
@@ -516,6 +527,16 @@
self._context = self._createContext(target)
_initParserContext(self._context, None, pctxt)
+ if encoding is None:
+ self._default_encoding = None
+ else:
+ encoding = _utf8(encoding)
+ c_encoding = tree.xmlParseCharEncoding(_cstr(encoding))
+ if c_encoding == tree.XML_CHAR_ENCODING_ERROR or \
+ c_encoding == tree.XML_CHAR_ENCODING_NONE:
+ raise LookupError, "unknown encoding: '%s'" % encoding
+ self._default_encoding = encoding
+
if remove_comments:
pctxt.sax.comment = NULL
if remove_pis:
@@ -669,6 +690,7 @@
cdef xmlDoc* result
cdef xmlparser.xmlParserCtxt* pctxt
cdef int recover
+ cdef char* c_encoding
if c_len > python.INT_MAX:
raise ParserError, "string is too long to parse it with libxml2"
self._lockParser()
@@ -677,13 +699,20 @@
pctxt = self._parser_ctxt
__GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
+ if self._default_encoding is None:
+ c_encoding = NULL
+ else:
+ c_encoding = _cstr(self._default_encoding)
+
state = python.PyEval_SaveThread()
if self._parser_type == LXML_HTML_PARSER:
result = htmlparser.htmlCtxtReadMemory(
- pctxt, c_text, c_len, c_filename, NULL, self._parse_options)
+ pctxt, c_text, c_len, c_filename,
+ c_encoding, self._parse_options)
else:
result = xmlparser.xmlCtxtReadMemory(
- pctxt, c_text, c_len, c_filename, NULL, self._parse_options)
+ pctxt, c_text, c_len, c_filename,
+ c_encoding, self._parse_options)
python.PyEval_RestoreThread(state)
return self._context._handleParseResultDoc(self, result, None)
@@ -699,6 +728,7 @@
cdef xmlparser.xmlParserCtxt* pctxt
cdef int recover
cdef int orig_options
+ cdef char* c_encoding
result = NULL
self._lockParser()
self._context._error_log.connect()
@@ -706,14 +736,19 @@
pctxt = self._parser_ctxt
__GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
+ if self._default_encoding is None:
+ c_encoding = NULL
+ else:
+ c_encoding = _cstr(self._default_encoding)
+
orig_options = pctxt.options
state = python.PyEval_SaveThread()
if self._parser_type == LXML_HTML_PARSER:
result = htmlparser.htmlCtxtReadFile(
- pctxt, c_filename, NULL, self._parse_options)
+ pctxt, c_filename, c_encoding, self._parse_options)
else:
result = xmlparser.xmlCtxtReadFile(
- pctxt, c_filename, NULL, self._parse_options)
+ pctxt, c_filename, c_encoding, self._parse_options)
python.PyEval_RestoreThread(state)
pctxt.options = orig_options # work around libxml2 problem
@@ -738,7 +773,8 @@
try:
pctxt = self._parser_ctxt
__GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
- file_context = _FileReaderContext(filelike, self._context, filename)
+ file_context = _FileReaderContext(filelike, self._context,
+ filename, self._default_encoding)
result = file_context._readDoc(
pctxt, self._parse_options, self._parser_type)
@@ -928,7 +964,9 @@
* compact - save memory for short text content (default: True)
* resolve_entities - replace entities by their text value (default: True)
- You can pass a parser target as ``target`` keyword argument.
+ Other keyword arguments:
+ * encoding - override the document encoding
+ * target - a parser target object that will receive the parse events
Note that you should avoid sharing parsers between threads. While this is
not harmful, it is more efficient to use separate parsers. This does not
@@ -938,7 +976,7 @@
load_dtd=False, no_network=True, ns_clean=False,
recover=False, remove_blank_text=False, compact=True,
resolve_entities=True, remove_comments=False,
- remove_pis=False, target=None):
+ remove_pis=False, target=None, encoding=None):
cdef int parse_options
parse_options = _XML_DEFAULT_PARSE_OPTIONS
if load_dtd:
@@ -963,26 +1001,34 @@
parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT
_BaseParser.__init__(self, parse_options, remove_comments, remove_pis,
- target)
+ target, encoding)
cdef class ETCompatXMLParser(XMLParser):
"""An XML parser with an ElementTree compatible default setup. See the
XMLParser class for details.
- This parser defaults to removing processing instructions and comments from
- the tree.
+ This parser has ``remove_comments`` and ``remove_pis`` enabled by default
+ and thus ignores comments and processing instructions.
"""
def __init__(self, attribute_defaults=False, dtd_validation=False,
load_dtd=False, no_network=True, ns_clean=False,
recover=False, remove_blank_text=False, compact=True,
resolve_entities=True, remove_comments=True,
- remove_pis=True, target=None):
+ remove_pis=True, target=None, encoding=None):
XMLParser.__init__(self,
- attribute_defaults, dtd_validation,
- load_dtd, no_network, ns_clean,
- recover, remove_blank_text, compact,
- resolve_entities, remove_comments,
- remove_pis, target)
+ attribute_defaults=attribute_defaults,
+ dtd_validation=dtd_validation,
+ load_dtd=load_dtd,
+ no_network=no_network,
+ ns_clean=ns_clean,
+ recover=recover,
+ remove_blank_text=remove_blank_text,
+ compact=compact,
+ resolve_entities=resolve_entities,
+ remove_comments=remove_comments,
+ remove_pis=remove_pis,
+ target=target,
+ encoding=encoding)
cdef XMLParser __DEFAULT_XML_PARSER
@@ -1039,14 +1085,16 @@
* remove_pis - discard processing instructions
* compact - save memory for short text content (default: True)
- You can pass a parser target as ``target`` keyword argument.
+ Other keyword arguments:
+ * encoding - override the document encoding
+ * target - a parser target object that will receive the parse events
Note that you should avoid sharing parsers between threads for performance
reasons.
"""
def __init__(self, recover=True, no_network=True, remove_blank_text=False,
compact=True, remove_comments=False, remove_pis=False,
- target=None):
+ target=None, encoding=None):
cdef int parse_options
parse_options = _HTML_DEFAULT_PARSE_OPTIONS
if remove_blank_text:
@@ -1059,7 +1107,7 @@
parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT
_BaseParser.__init__(self, parse_options, remove_comments, remove_pis,
- target)
+ target, encoding)
cdef HTMLParser __DEFAULT_HTML_PARSER
__DEFAULT_HTML_PARSER = HTMLParser()
Modified: lxml/trunk/src/lxml/tests/test_elementtree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_elementtree.py (original)
+++ lxml/trunk/src/lxml/tests/test_elementtree.py Thu Sep 20 14:13:48 2007
@@ -2377,10 +2377,43 @@
self.assertEquals(u'<a>Søk på nettet</a>'.encode('iso-8859-1'),
result)
- # raise error on wrong (left-over?) encoding declaration in unicode strings
+ def test_parse_encoding_8bit_explicit(self):
+ XMLParser = self.etree.XMLParser
+
+ text = u'Søk på nettet'
+ xml_latin1 = (u'<a>%s</a>' % text).encode('iso-8859-1')
+
+ self.assertRaises(self.etree.ParseError,
+ self.etree.parse,
+ StringIO(xml_latin1))
+
+ tree = self.etree.parse(StringIO(xml_latin1),
+ XMLParser(encoding="iso-8859-1"))
+ a = tree.getroot()
+ self.assertEquals(a.text, text)
+
+ def test_parse_encoding_8bit_override(self):
+ XMLParser = self.etree.XMLParser
+
+ text = u'Søk på nettet'
+ wrong_declaration = "<?xml version='1.0' encoding='UTF-8'?>"
+ xml_latin1 = (u'%s<a>%s</a>' % (wrong_declaration, text)
+ ).encode('iso-8859-1')
+
+ self.assertRaises(self.etree.ParseError,
+ self.etree.parse,
+ StringIO(xml_latin1))
+
+ tree = self.etree.parse(StringIO(xml_latin1),
+ XMLParser(encoding="iso-8859-1"))
+ a = tree.getroot()
+ self.assertEquals(a.text, text)
+
def _test_wrong_unicode_encoding(self):
+ # raise error on wrong encoding declaration in unicode strings
XML = self.etree.XML
- test_utf = u'<?xml version=\'1.0\' encoding=\'iso-8859-1\'?><a>Søk på nettet</a>'
+ test_utf = (u'<?xml version="1.0" encoding="iso-8859-1"?>' + \
+ u'<a>Søk på nettet</a>')
self.assertRaises(SyntaxError, XML, test_utf)
def test_encoding_write_default_encoding(self):
Modified: lxml/trunk/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_etree.py (original)
+++ lxml/trunk/src/lxml/tests/test_etree.py Thu Sep 20 14:13:48 2007
@@ -368,6 +368,26 @@
8,
len(events))
+ def test_iterparse_encoding_8bit_override(self):
+ text = u'Søk på nettet'
+ wrong_declaration = "<?xml version='1.0' encoding='UTF-8'?>"
+ xml_latin1 = (u'%s<a>%s</a>' % (wrong_declaration, text)
+ ).encode('iso-8859-1')
+
+ self.assertRaises(self.etree.ParseError,
+ list, self.etree.iterparse(StringIO(xml_latin1)))
+
+ iterator = self.etree.iterparse(StringIO(xml_latin1),
+ encoding="iso-8859-1")
+ self.assertEquals(1, len(list(iterator)))
+
+ a = iterator.root
+ self.assertEquals(a.text, text)
+
+ def test_parser_encoding_unknown(self):
+ self.assertRaises(
+ LookupError, self.etree.XMLParser, encoding="hopefully unknown")
+
def test_iterwalk_tag(self):
iterwalk = self.etree.iterwalk
root = self.etree.XML('<a><b><d/></b><c/></a>')
Modified: lxml/trunk/src/lxml/tests/test_htmlparser.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_htmlparser.py (original)
+++ lxml/trunk/src/lxml/tests/test_htmlparser.py Thu Sep 20 14:13:48 2007
@@ -46,6 +46,37 @@
self.assertRaises(self.etree.XMLSyntaxError,
parse, f, parser)
+ def test_parse_encoding_8bit_explicit(self):
+ text = u'Søk på nettet'
+ html_latin1 = (u'<p>%s</p>' % text).encode('iso-8859-1')
+
+ tree = self.etree.parse(
+ StringIO(html_latin1),
+ self.etree.HTMLParser(encoding="iso-8859-1"))
+ p = tree.find("//p")
+ self.assertEquals(p.text, text)
+
+ def test_parse_encoding_8bit_override(self):
+ text = u'Søk på nettet'
+ wrong_head = '''
+ <head>
+ <meta http-equiv="Content-Type"
+ content="text/html; charset=UTF-8" />
+ </head>'''
+ html_latin1 = (u'<html>%s<body><p>%s</p></body></html>' % (wrong_head,
+ text)
+ ).encode('iso-8859-1')
+
+ self.assertRaises(self.etree.ParseError,
+ self.etree.parse,
+ StringIO(html_latin1))
+
+ tree = self.etree.parse(
+ StringIO(html_latin1),
+ self.etree.HTMLParser(encoding="iso-8859-1"))
+ p = tree.find("//p")
+ self.assertEquals(p.text, text)
+
def test_module_HTML_broken(self):
element = self.etree.HTML(self.broken_html_str)
self.assertEqual(self.etree.tostring(element),
Modified: lxml/trunk/src/lxml/tree.pxd
==============================================================================
--- lxml/trunk/src/lxml/tree.pxd (original)
+++ lxml/trunk/src/lxml/tree.pxd Thu Sep 20 14:13:48 2007
@@ -40,6 +40,7 @@
cdef int xmlCharEncCloseFunc(xmlCharEncodingHandler* handler)
cdef xmlCharEncoding xmlDetectCharEncoding(char* text, int len)
cdef char* xmlGetCharEncodingName(xmlCharEncoding enc)
+ cdef xmlCharEncoding xmlParseCharEncoding(char* name)
cdef extern from "libxml/chvalid.h":
cdef int xmlIsChar_ch(char c)
More information about the lxml-checkins
mailing list