[Lxml-checkins] r46759 - in lxml/trunk: . src/lxml src/lxml/tests

scoder at codespeak.net scoder at codespeak.net
Thu Sep 20 14:13:50 CEST 2007


Author: scoder
Date: Thu Sep 20 14:13:48 2007
New Revision: 46759

Modified:
   lxml/trunk/CHANGES.txt
   lxml/trunk/src/lxml/dtd.pxi
   lxml/trunk/src/lxml/iterparse.pxi
   lxml/trunk/src/lxml/parser.pxi
   lxml/trunk/src/lxml/tests/test_elementtree.py
   lxml/trunk/src/lxml/tests/test_etree.py
   lxml/trunk/src/lxml/tests/test_htmlparser.py
   lxml/trunk/src/lxml/tree.pxd
Log:
Add an 'encoding' keyword argument to the parsers that overrides the document encoding

Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt	(original)
+++ lxml/trunk/CHANGES.txt	Thu Sep 20 14:13:48 2007
@@ -8,6 +8,9 @@
 Features added
 --------------
 
+* Parsers accept an ``encoding`` keyword argument that overrides the encoding
+  of the parsed documents.
+
 * New C-API function ``hasChild()`` to test for children
 
 * ``annotate()`` function in objectify can annotate with Python types and XSI

Modified: lxml/trunk/src/lxml/dtd.pxi
==============================================================================
--- lxml/trunk/src/lxml/dtd.pxi	(original)
+++ lxml/trunk/src/lxml/dtd.pxi	Thu Sep 20 14:13:48 2007
@@ -91,7 +91,7 @@
     cdef _FileReaderContext dtd_parser
     cdef tree.xmlDtd* c_dtd
     exc_context = _ExceptionContext()
-    dtd_parser = _FileReaderContext(file, exc_context)
+    dtd_parser = _FileReaderContext(file, exc_context, None, None)
 
     c_dtd = dtd_parser._readDtd()
 

Modified: lxml/trunk/src/lxml/iterparse.pxi
==============================================================================
--- lxml/trunk/src/lxml/iterparse.pxi	(original)
+++ lxml/trunk/src/lxml/iterparse.pxi	Thu Sep 20 14:13:48 2007
@@ -239,6 +239,9 @@
     * remove_blank_text  - discard blank text nodes
     * remove_comments    - discard comments
     * remove_pis         - discard processing instructions
+
+    Other keyword arguments:
+    * encoding           - override the document encoding
     """
     cdef object _source
     cdef object _filename
@@ -246,9 +249,10 @@
     def __init__(self, source, events=("end",), tag=None,
                  attribute_defaults=False, dtd_validation=False,
                  load_dtd=False, no_network=True, remove_blank_text=False,
-                 remove_comments=False, remove_pis=False):
+                 remove_comments=False, remove_pis=False, encoding=None):
         cdef _IterparseContext context
         cdef char* c_filename
+        cdef char* c_encoding
         cdef int parse_options
         if not hasattr(source, 'read'):
             self._filename = _encodeFilename(source)
@@ -279,12 +283,18 @@
             parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS
 
         _BaseParser.__init__(self, parse_options, remove_comments, remove_pis,
-                             None)
+                             None, encoding)
+
+        if self._default_encoding is None:
+            c_encoding = NULL
+        else:
+            c_encoding = _cstr(self._default_encoding)
 
         context = <_IterparseContext>self._context
         context._setEventFilter(events, tag)
         xmlparser.xmlCtxtUseOptions(self._parser_ctxt, parse_options)
-        xmlparser.xmlCtxtResetPush(self._parser_ctxt, NULL, 0, c_filename, NULL)
+        xmlparser.xmlCtxtResetPush(self._parser_ctxt, NULL, 0,
+                                   c_filename, c_encoding)
         self._lockParser() # will not be unlocked - no other methods supported
 
     cdef _ParserContext _createContext(self, target):

Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi	(original)
+++ lxml/trunk/src/lxml/parser.pxi	Thu Sep 20 14:13:48 2007
@@ -206,14 +206,16 @@
 
 cdef class _FileReaderContext:
     cdef object _filelike
+    cdef object _encoding
     cdef object _url
     cdef object _bytes
     cdef _ExceptionContext _exc_context
     cdef cstd.size_t _bytes_read
     cdef char* _c_url
-    def __init__(self, filelike, exc_context, url=None):
+    def __init__(self, filelike, exc_context, url, encoding):
         self._exc_context = exc_context
         self._filelike = filelike
+        self._encoding = encoding
         self._url = url
         if url is None:
             self._c_url = NULL
@@ -234,15 +236,22 @@
                           LxmlParserType parser_type):
         cdef python.PyThreadState* state
         cdef xmlDoc* result
+        cdef char* c_encoding
+
+        if self._encoding is None:
+            c_encoding = NULL
+        else:
+            c_encoding = _cstr(self._encoding)
+
         state = python.PyEval_SaveThread()
         if parser_type == LXML_XML_PARSER:
             result = xmlparser.xmlCtxtReadIO(
                 ctxt, _readFilelikeParser, NULL, <python.PyObject*>self,
-                self._c_url, NULL, options)
+                self._c_url, c_encoding, options)
         else:
             result = htmlparser.htmlCtxtReadIO(
                 ctxt, _readFilelikeParser, NULL, <python.PyObject*>self,
-                self._c_url, NULL, options)
+                self._c_url, c_encoding, options)
         python.PyEval_RestoreThread(state)
         return result
 
@@ -493,9 +502,11 @@
     cdef ElementClassLookup _class_lookup
     cdef python.PyThread_type_lock _parser_lock
     cdef int _feed_parser_running
+    cdef object _default_encoding
 
     def __init__(self, int parse_options, remove_comments, remove_pis,
-                 target):
+                 target, encoding):
+        cdef int c_encoding
         cdef xmlparser.xmlParserCtxt* pctxt
         if isinstance(self, HTMLParser):
             self._parser_type = LXML_HTML_PARSER
@@ -516,6 +527,16 @@
         self._context = self._createContext(target)
         _initParserContext(self._context, None, pctxt)
 
+        if encoding is None:
+            self._default_encoding = None
+        else:
+            encoding = _utf8(encoding)
+            c_encoding = tree.xmlParseCharEncoding(_cstr(encoding))
+            if c_encoding == tree.XML_CHAR_ENCODING_ERROR or \
+                   c_encoding == tree.XML_CHAR_ENCODING_NONE:
+                raise LookupError, "unknown encoding: '%s'" % encoding
+            self._default_encoding = encoding
+
         if remove_comments:
             pctxt.sax.comment = NULL
         if remove_pis:
@@ -669,6 +690,7 @@
         cdef xmlDoc* result
         cdef xmlparser.xmlParserCtxt* pctxt
         cdef int recover
+        cdef char* c_encoding
         if c_len > python.INT_MAX:
             raise ParserError, "string is too long to parse it with libxml2"
         self._lockParser()
@@ -677,13 +699,20 @@
             pctxt = self._parser_ctxt
             __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
 
+            if self._default_encoding is None:
+                c_encoding = NULL
+            else:
+                c_encoding = _cstr(self._default_encoding)
+
             state = python.PyEval_SaveThread()
             if self._parser_type == LXML_HTML_PARSER:
                 result = htmlparser.htmlCtxtReadMemory(
-                    pctxt, c_text, c_len, c_filename, NULL, self._parse_options)
+                    pctxt, c_text, c_len, c_filename,
+                    c_encoding, self._parse_options)
             else:
                 result = xmlparser.xmlCtxtReadMemory(
-                    pctxt, c_text, c_len, c_filename, NULL, self._parse_options)
+                    pctxt, c_text, c_len, c_filename,
+                    c_encoding, self._parse_options)
             python.PyEval_RestoreThread(state)
 
             return self._context._handleParseResultDoc(self, result, None)
@@ -699,6 +728,7 @@
         cdef xmlparser.xmlParserCtxt* pctxt
         cdef int recover
         cdef int orig_options
+        cdef char* c_encoding
         result = NULL
         self._lockParser()
         self._context._error_log.connect()
@@ -706,14 +736,19 @@
             pctxt = self._parser_ctxt
             __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
 
+            if self._default_encoding is None:
+                c_encoding = NULL
+            else:
+                c_encoding = _cstr(self._default_encoding)
+
             orig_options = pctxt.options
             state = python.PyEval_SaveThread()
             if self._parser_type == LXML_HTML_PARSER:
                 result = htmlparser.htmlCtxtReadFile(
-                    pctxt, c_filename, NULL, self._parse_options)
+                    pctxt, c_filename, c_encoding, self._parse_options)
             else:
                 result = xmlparser.xmlCtxtReadFile(
-                    pctxt, c_filename, NULL, self._parse_options)
+                    pctxt, c_filename, c_encoding, self._parse_options)
             python.PyEval_RestoreThread(state)
             pctxt.options = orig_options # work around libxml2 problem
 
@@ -738,7 +773,8 @@
         try:
             pctxt = self._parser_ctxt
             __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
-            file_context = _FileReaderContext(filelike, self._context, filename)
+            file_context = _FileReaderContext(filelike, self._context,
+                                              filename, self._default_encoding)
             result = file_context._readDoc(
                 pctxt, self._parse_options, self._parser_type)
 
@@ -928,7 +964,9 @@
     * compact            - save memory for short text content (default: True)
     * resolve_entities   - replace entities by their text value (default: True)
 
-    You can pass a parser target as ``target`` keyword argument.
+    Other keyword arguments:
+    * encoding - override the document encoding
+    * target   - a parser target object that will receive the parse events
 
     Note that you should avoid sharing parsers between threads.  While this is
     not harmful, it is more efficient to use separate parsers.  This does not
@@ -938,7 +976,7 @@
                  load_dtd=False, no_network=True, ns_clean=False,
                  recover=False, remove_blank_text=False, compact=True,
                  resolve_entities=True, remove_comments=False,
-                 remove_pis=False, target=None):
+                 remove_pis=False, target=None, encoding=None):
         cdef int parse_options
         parse_options = _XML_DEFAULT_PARSE_OPTIONS
         if load_dtd:
@@ -963,26 +1001,34 @@
             parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT
 
         _BaseParser.__init__(self, parse_options, remove_comments, remove_pis,
-                             target)
+                             target, encoding)
 
 cdef class ETCompatXMLParser(XMLParser):
     """An XML parser with an ElementTree compatible default setup.  See the
     XMLParser class for details.
 
-    This parser defaults to removing processing instructions and comments from
-    the tree.
+    This parser has ``remove_comments`` and ``remove_pis`` enabled by default
+    and thus ignores comments and processing instructions.
     """
     def __init__(self, attribute_defaults=False, dtd_validation=False,
                  load_dtd=False, no_network=True, ns_clean=False,
                  recover=False, remove_blank_text=False, compact=True,
                  resolve_entities=True, remove_comments=True,
-                 remove_pis=True, target=None):
+                 remove_pis=True, target=None, encoding=None):
         XMLParser.__init__(self,
-                 attribute_defaults, dtd_validation,
-                 load_dtd, no_network, ns_clean,
-                 recover, remove_blank_text, compact,
-                 resolve_entities, remove_comments,
-                 remove_pis, target)
+                           attribute_defaults=attribute_defaults,
+                           dtd_validation=dtd_validation,
+                           load_dtd=load_dtd,
+                           no_network=no_network,
+                           ns_clean=ns_clean,
+                           recover=recover,
+                           remove_blank_text=remove_blank_text,
+                           compact=compact,
+                           resolve_entities=resolve_entities,
+                           remove_comments=remove_comments,
+                           remove_pis=remove_pis,
+                           target=target,
+                           encoding=encoding)
 
 
 cdef XMLParser __DEFAULT_XML_PARSER
@@ -1039,14 +1085,16 @@
     * remove_pis         - discard processing instructions
     * compact            - save memory for short text content (default: True)
 
-    You can pass a parser target as ``target`` keyword argument.
+    Other keyword arguments:
+    * encoding - override the document encoding
+    * target   - a parser target object that will receive the parse events
 
     Note that you should avoid sharing parsers between threads for performance
     reasons.
     """
     def __init__(self, recover=True, no_network=True, remove_blank_text=False,
                  compact=True, remove_comments=False, remove_pis=False,
-                 target=None):
+                 target=None, encoding=None):
         cdef int parse_options
         parse_options = _HTML_DEFAULT_PARSE_OPTIONS
         if remove_blank_text:
@@ -1059,7 +1107,7 @@
             parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT
 
         _BaseParser.__init__(self, parse_options, remove_comments, remove_pis,
-                             target)
+                             target, encoding)
 
 cdef HTMLParser __DEFAULT_HTML_PARSER
 __DEFAULT_HTML_PARSER = HTMLParser()

Modified: lxml/trunk/src/lxml/tests/test_elementtree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_elementtree.py	(original)
+++ lxml/trunk/src/lxml/tests/test_elementtree.py	Thu Sep 20 14:13:48 2007
@@ -2377,10 +2377,43 @@
         self.assertEquals(u'<a>Søk på nettet</a>'.encode('iso-8859-1'),
                           result)
 
-    # raise error on wrong (left-over?) encoding declaration in unicode strings
+    def test_parse_encoding_8bit_explicit(self):
+        XMLParser = self.etree.XMLParser
+
+        text = u'Søk på nettet'
+        xml_latin1 = (u'<a>%s</a>' % text).encode('iso-8859-1')
+
+        self.assertRaises(self.etree.ParseError,
+                          self.etree.parse,
+                          StringIO(xml_latin1))
+
+        tree = self.etree.parse(StringIO(xml_latin1),
+                                XMLParser(encoding="iso-8859-1"))
+        a = tree.getroot()
+        self.assertEquals(a.text, text)
+
+    def test_parse_encoding_8bit_override(self):
+        XMLParser = self.etree.XMLParser
+
+        text = u'Søk på nettet'
+        wrong_declaration = "<?xml version='1.0' encoding='UTF-8'?>"
+        xml_latin1 = (u'%s<a>%s</a>' % (wrong_declaration, text)
+                      ).encode('iso-8859-1')
+
+        self.assertRaises(self.etree.ParseError,
+                          self.etree.parse,
+                          StringIO(xml_latin1))
+
+        tree = self.etree.parse(StringIO(xml_latin1),
+                                XMLParser(encoding="iso-8859-1"))
+        a = tree.getroot()
+        self.assertEquals(a.text, text)
+
     def _test_wrong_unicode_encoding(self):
+        # raise error on wrong encoding declaration in unicode strings
         XML = self.etree.XML
-        test_utf = u'<?xml version=\'1.0\' encoding=\'iso-8859-1\'?><a>Søk på nettet</a>'
+        test_utf = (u'<?xml version="1.0" encoding="iso-8859-1"?>' + \
+                                        u'<a>Søk på nettet</a>')
         self.assertRaises(SyntaxError, XML, test_utf)
 
     def test_encoding_write_default_encoding(self):

Modified: lxml/trunk/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_etree.py	(original)
+++ lxml/trunk/src/lxml/tests/test_etree.py	Thu Sep 20 14:13:48 2007
@@ -368,6 +368,26 @@
             8,
             len(events))
 
+    def test_iterparse_encoding_8bit_override(self):
+        text = u'Søk på nettet'
+        wrong_declaration = "<?xml version='1.0' encoding='UTF-8'?>"
+        xml_latin1 = (u'%s<a>%s</a>' % (wrong_declaration, text)
+                      ).encode('iso-8859-1')
+
+        self.assertRaises(self.etree.ParseError,
+                          list, self.etree.iterparse(StringIO(xml_latin1)))
+
+        iterator = self.etree.iterparse(StringIO(xml_latin1),
+                                        encoding="iso-8859-1")
+        self.assertEquals(1, len(list(iterator)))
+
+        a = iterator.root
+        self.assertEquals(a.text, text)
+
+    def test_parser_encoding_unknown(self):
+        self.assertRaises(
+            LookupError, self.etree.XMLParser, encoding="hopefully unknown")
+
     def test_iterwalk_tag(self):
         iterwalk = self.etree.iterwalk
         root = self.etree.XML('<a><b><d/></b><c/></a>')

Modified: lxml/trunk/src/lxml/tests/test_htmlparser.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_htmlparser.py	(original)
+++ lxml/trunk/src/lxml/tests/test_htmlparser.py	Thu Sep 20 14:13:48 2007
@@ -46,6 +46,37 @@
         self.assertRaises(self.etree.XMLSyntaxError,
                           parse, f, parser)
 
+    def test_parse_encoding_8bit_explicit(self):
+        text = u'Søk på nettet'
+        html_latin1 = (u'<p>%s</p>' % text).encode('iso-8859-1')
+
+        tree = self.etree.parse(
+            StringIO(html_latin1),
+            self.etree.HTMLParser(encoding="iso-8859-1"))
+        p = tree.find("//p")
+        self.assertEquals(p.text, text)
+
+    def test_parse_encoding_8bit_override(self):
+        text = u'Søk på nettet'
+        wrong_head = '''
+        <head>
+          <meta http-equiv="Content-Type"
+                content="text/html; charset=UTF-8" />
+        </head>'''
+        html_latin1 = (u'<html>%s<body><p>%s</p></body></html>' % (wrong_head,
+                                                                   text)
+                      ).encode('iso-8859-1')
+
+        self.assertRaises(self.etree.ParseError,
+                          self.etree.parse,
+                          StringIO(html_latin1))
+
+        tree = self.etree.parse(
+            StringIO(html_latin1),
+            self.etree.HTMLParser(encoding="iso-8859-1"))
+        p = tree.find("//p")
+        self.assertEquals(p.text, text)
+
     def test_module_HTML_broken(self):
         element = self.etree.HTML(self.broken_html_str)
         self.assertEqual(self.etree.tostring(element),

Modified: lxml/trunk/src/lxml/tree.pxd
==============================================================================
--- lxml/trunk/src/lxml/tree.pxd	(original)
+++ lxml/trunk/src/lxml/tree.pxd	Thu Sep 20 14:13:48 2007
@@ -40,6 +40,7 @@
     cdef int xmlCharEncCloseFunc(xmlCharEncodingHandler* handler)
     cdef xmlCharEncoding xmlDetectCharEncoding(char* text, int len)
     cdef char* xmlGetCharEncodingName(xmlCharEncoding enc)
+    cdef xmlCharEncoding xmlParseCharEncoding(char* name)
 
 cdef extern from "libxml/chvalid.h":
     cdef int xmlIsChar_ch(char c)


More information about the lxml-checkins mailing list