[Lxml-checkins] r32095 - lxml/trunk/src/lxml

scoder at codespeak.net scoder at codespeak.net
Sat Sep 9 07:20:54 CEST 2006


Author: scoder
Date: Sat Sep  9 07:20:49 2006
New Revision: 32095

Modified:
   lxml/trunk/src/lxml/apihelpers.pxi
   lxml/trunk/src/lxml/docloader.pxi
   lxml/trunk/src/lxml/etree.pyx
   lxml/trunk/src/lxml/iterparse.pxi
   lxml/trunk/src/lxml/parser.pxi
   lxml/trunk/src/lxml/python.pxd
   lxml/trunk/src/lxml/relaxng.pxi
   lxml/trunk/src/lxml/serializer.pxi
   lxml/trunk/src/lxml/xmlschema.pxi
   lxml/trunk/src/lxml/xslt.pxi
Log:
fix handling of 8-bit encoded filenames

Modified: lxml/trunk/src/lxml/apihelpers.pxi
==============================================================================
--- lxml/trunk/src/lxml/apihelpers.pxi	(original)
+++ lxml/trunk/src/lxml/apihelpers.pxi	Sat Sep  9 07:20:49 2006
@@ -501,6 +501,46 @@
     else:
         raise TypeError, "Argument must be string or unicode."
 
+cdef object _encodeFilename(object filename):
+    if filename is None:
+        return None
+    elif python.PyString_Check(filename):
+        return filename
+    elif python.PyUnicode_Check(filename):
+        return python.PyUnicode_AsEncodedString(
+            filename, _C_FILENAME_ENCODING, NULL)
+    else:
+        raise TypeError, "Argument must be string or unicode."
+
+cdef object _encodeFilenameUTF8(object filename):
+    """Recode filename as UTF-8. Tries ASCII, local filesystem encoding and
+    UTF-8 as source encoding.
+    """
+    cdef char* c_filename
+    if filename is None:
+        return None
+    elif python.PyString_Check(filename):
+        c_filename = _cstr(filename)
+        if not isutf8(c_filename):
+            # plain ASCII!
+            return filename
+        try:
+            # try to decode with default encoding
+            filename = python.PyUnicode_Decode(
+                c_filename, python.PyString_GET_SIZE(filename),
+                _C_FILENAME_ENCODING, NULL)
+        except UnicodeDecodeError, decode_exc:
+            try:
+                # try if it's UTF-8
+                filename = python.PyUnicode_DecodeUTF8(
+                    c_filename, python.PyString_GET_SIZE(filename), NULL)
+            except UnicodeDecodeError:
+                raise decode_exc # otherwise re-raise original exception
+    if python.PyUnicode_Check(filename):
+        return python.PyUnicode_AsUTF8String(filename)
+    else:
+        raise TypeError, "Argument must be string or unicode."
+
 cdef _getNsTag(tag):
     """Given a tag, find namespace URI and tag name.
     Return None for NS uri if no namespace URI available.

Modified: lxml/trunk/src/lxml/docloader.pxi
==============================================================================
--- lxml/trunk/src/lxml/docloader.pxi	(original)
+++ lxml/trunk/src/lxml/docloader.pxi	Sat Sep  9 07:20:49 2006
@@ -8,7 +8,7 @@
 
 cdef class _InputDocument:
     cdef _InputDocumentDataType _type
-    cdef object _data_utf
+    cdef object _data_bytes
     cdef object _file
 
 cdef class Resolver:
@@ -28,7 +28,7 @@
         cdef _InputDocument doc_ref
         doc_ref = _InputDocument()
         doc_ref._type = PARSER_DATA_STRING
-        doc_ref._data_utf = _utf8(string)
+        doc_ref._data_bytes = _utf8(string)
         return doc_ref
 
     def resolve_filename(self, filename, context):
@@ -36,7 +36,7 @@
         cdef _InputDocument doc_ref
         doc_ref = _ParserInput()
         doc_ref._type = PARSER_DATA_FILENAME
-        doc_ref._data_utf = _utf8(filename)
+        doc_ref._data_bytes = _encodeFilename(filename)
         return doc_ref
 
     def resolve_file(self, f, context):

Modified: lxml/trunk/src/lxml/etree.pyx
==============================================================================
--- lxml/trunk/src/lxml/etree.pyx	(original)
+++ lxml/trunk/src/lxml/etree.pyx	Sat Sep  9 07:20:49 2006
@@ -68,6 +68,16 @@
 # initialize parser (and threading)
 xmlparser.xmlInitParser()
 
+# filename encoding
+cdef object _FILENAME_ENCODING
+_FILENAME_ENCODING = sys.getfilesystemencoding()
+if _FILENAME_ENCODING is None:
+    _FILENAME_ENCODING = sys.getdefaultencoding()
+if _FILENAME_ENCODING is None:
+    _FILENAME_ENCODING = 'ascii'
+cdef char* _C_FILENAME_ENCODING
+_C_FILENAME_ENCODING = _cstr(_FILENAME_ENCODING)
+
 
 # Error superclass for ElementTree compatibility
 class Error(Exception):

Modified: lxml/trunk/src/lxml/iterparse.pxi
==============================================================================
--- lxml/trunk/src/lxml/iterparse.pxi	(original)
+++ lxml/trunk/src/lxml/iterparse.pxi	Sat Sep  9 07:20:49 2006
@@ -234,10 +234,12 @@
         cdef char* c_filename
         cdef int parse_options
         if not hasattr(source, 'read'):
-            self._filename = source
-            source = open(source, 'rb')
+            self._filename = _encodeFilename(source)
+            source = open(self._filename, 'rb')
         else:
             self._filename = _getFilenameForFile(source)
+            if self._filename is not None:
+                self._filename = _encodeFilename(self._filename)
         if self._filename is not None:
             c_filename = self._filename
         else:
@@ -301,11 +303,7 @@
                 break
         if error != 0:
             self._source = None
-            if self._filename is not None:
-                c_filename = self._filename
-            else:
-                c_filename = NULL
-            _raiseParseError(self._parser_ctxt, c_filename)
+            _raiseParseError(self._parser_ctxt, self._filename)
         if python.PyList_GET_SIZE(context._events) == 0:
             self.root = context._root
             raise StopIteration

Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi	(original)
+++ lxml/trunk/src/lxml/parser.pxi	Sat Sep  9 07:20:49 2006
@@ -289,12 +289,12 @@
     c_input = NULL
     data = None
     if doc_ref._type == PARSER_DATA_STRING:
-        data = doc_ref._data_utf
+        data = doc_ref._data_bytes
         c_input = xmlparser.xmlNewStringInputStream(
             c_context, _cstr(data))
     elif doc_ref._type == PARSER_DATA_FILENAME:
         c_input = xmlparser.xmlNewInputFromFile(
-            c_context, _cstr(doc_ref._data_utf))
+            c_context, _cstr(doc_ref._data_bytes))
     elif doc_ref._type == PARSER_DATA_FILE:
         file_context = _FileParserContext(doc_ref._file, context, url)
         c_input = file_context._createParserInput(c_context)
@@ -451,7 +451,7 @@
             python.PyEval_RestoreThread(state)
 
             recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
-            return _handleParseResult(pctxt, result, NULL, recover)
+            return _handleParseResult(pctxt, result, None, recover)
         finally:
             self._error_log.disconnect()
             self._unlockParser()
@@ -482,7 +482,7 @@
             python.PyEval_RestoreThread(state)
 
             recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
-            return _handleParseResult(pctxt, result, NULL, recover)
+            return _handleParseResult(pctxt, result, None, recover)
         finally:
             self._error_log.disconnect()
             self._unlockParser()
@@ -521,9 +521,7 @@
         cdef char* c_filename
         cdef int recover
         if not filename:
-            c_filename = NULL
-        else:
-            c_filename = filename
+            filename = None
         self._lockParser()
         self._error_log.connect()
         try:
@@ -534,23 +532,22 @@
                 pctxt, self._parse_options, self._parser_type)
 
             recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
-            return _handleParseResult(pctxt, result, c_filename, recover)
+            return _handleParseResult(pctxt, result, filename, recover)
         finally:
             self._error_log.disconnect()
             self._unlockParser()
 
-cdef int _raiseParseError(xmlParserCtxt* ctxt, char* c_filename) except 0:
-    if c_filename is not NULL and \
+cdef int _raiseParseError(xmlParserCtxt* ctxt, filename) except 0:
+    if filename is not None and \
            ctxt.lastError.domain == xmlerror.XML_FROM_IO:
         if ctxt.lastError.message is not NULL:
-            message = "Error reading file %s: %s" % (
-                funicode(c_filename),
-                funicode(ctxt.lastError.message).strip())
+            message = "Error reading file '%s': %s" % (
+                filename, (ctxt.lastError.message).strip())
         else:
-            message = "Error reading file %s" % funicode(c_filename)
+            message = "Error reading file '%s'" % filename
         raise IOError, message
     elif ctxt.lastError.message is not NULL:
-        message = funicode(ctxt.lastError.message).strip()
+        message = (ctxt.lastError.message).strip()
         if ctxt.lastError.line >= 0:
             message = "line %d: %s" % (ctxt.lastError.line, message)
         raise XMLSyntaxError, message
@@ -558,7 +555,7 @@
         raise XMLSyntaxError
 
 cdef xmlDoc* _handleParseResult(xmlParserCtxt* ctxt, xmlDoc* result,
-                                char* c_filename, int recover) except NULL:
+                                filename, int recover) except NULL:
     cdef _ResolverContext context
     if ctxt.myDoc is not NULL:
         if ctxt.myDoc != result:
@@ -582,9 +579,9 @@
             context._raise_if_stored()
 
     if result is NULL:
-        _raiseParseError(ctxt, c_filename)
-    elif result.URL is NULL and c_filename is not NULL:
-        result.URL = tree.xmlStrdup(c_filename)
+        _raiseParseError(ctxt, filename)
+    elif result.URL is NULL and filename is not None:
+        result.URL = tree.xmlStrdup(_cstr(filename))
     return result
 
 ############################################################
@@ -669,7 +666,7 @@
         pctxt, c_text, NULL, NULL, options)
     try:
         recover = options & xmlparser.XML_PARSE_RECOVER
-        c_doc = _handleParseResult(pctxt, c_doc, NULL, recover)
+        c_doc = _handleParseResult(pctxt, c_doc, None, recover)
     finally:
         xmlparser.xmlFreeParserCtxt(pctxt)
     return c_doc
@@ -689,7 +686,11 @@
         pctxt, c_filename, NULL, options)
     try:
         recover = options & xmlparser.XML_PARSE_RECOVER
-        c_doc = _handleParseResult(pctxt, c_doc, c_filename, recover)
+        if c_filename is NULL:
+            filename = None
+        else:
+            filename = c_filename
+        c_doc = _handleParseResult(pctxt, c_doc, filename, recover)
     finally:
         xmlparser.xmlFreeParserCtxt(pctxt)
     return c_doc
@@ -782,7 +783,8 @@
     if not filename:
         c_filename = NULL
     else:
-        c_filename = _cstr(filename)
+        filename_utf = _encodeFilenameUTF8(filename)
+        c_filename = _cstr(filename_utf)
     if python.PyUnicode_Check(text):
         return (<_BaseParser>parser)._parseUnicodeDoc(text, c_filename)
     else:
@@ -790,14 +792,13 @@
         c_len  = python.PyString_GET_SIZE(text)
         return (<_BaseParser>parser)._parseDoc(c_text, c_len, c_filename)
 
-cdef xmlDoc* _parseDocFromFile(filename, _BaseParser parser) except NULL:
+cdef xmlDoc* _parseDocFromFile(filename8, _BaseParser parser) except NULL:
     if parser is None:
         parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
-    return (<_BaseParser>parser)._parseDocFromFile(_cstr(filename))
+    return (<_BaseParser>parser)._parseDocFromFile(_cstr(filename8))
 
 cdef xmlDoc* _parseDocFromFilelike(source, filename,
                                    _BaseParser parser) except NULL:
-    cdef char* c_filename
     if parser is None:
         parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
     return (<_BaseParser>parser)._parseDocFromFilelike(source, filename)
@@ -862,17 +863,19 @@
     if hasattr(source, 'getvalue') and hasattr(source, 'tell'):
         # StringIO - reading from start?
         if source.tell() == 0:
-            return _parseMemoryDocument(source.getvalue(), filename, parser)
+            return _parseMemoryDocument(
+                source.getvalue(), _encodeFilenameUTF8(filename), parser)
 
     # Support for file-like objects (urlgrabber.urlopen, ...)
     if hasattr(source, 'read'):
-        return _parseFilelikeDocument(source, filename, parser)
+        return _parseFilelikeDocument(
+            source, _encodeFilenameUTF8(filename), parser)
 
     # Otherwise parse the file directly from the filesystem
     if filename is None:
-        filename = source
+        filename = _encodeFilename(source)
     # open filename
-    c_doc = _parseDocFromFile(_utf8(filename), parser)
+    c_doc = _parseDocFromFile(filename, parser)
     return _documentFactory(c_doc, parser)
 
 cdef _Document _parseMemoryDocument(text, url, _BaseParser parser):
@@ -886,14 +889,14 @@
             text = python.PyUnicode_AsUTF8String(text)
     elif not python.PyString_Check(text):
         raise ValueError, "can only parse strings"
-    if url is not None:
-        url = _utf8(url)
+    if python.PyUnicode_Check(url):
+        url = python.PyUnicode_AsUTF8String(url)
     c_doc = _parseDoc(text, url, parser)
     return _documentFactory(c_doc, parser)
 
 cdef _Document _parseFilelikeDocument(source, url, _BaseParser parser):
     cdef xmlDoc* c_doc
-    if url is not None:
-        url = _utf8(url)
+    if python.PyUnicode_Check(url):
+        url = python.PyUnicode_AsUTF8String(url)
     c_doc = _parseDocFromFilelike(source, url, parser)
     return _documentFactory(c_doc, parser)

Modified: lxml/trunk/src/lxml/python.pxd
==============================================================================
--- lxml/trunk/src/lxml/python.pxd	(original)
+++ lxml/trunk/src/lxml/python.pxd	Sat Sep  9 07:20:49 2006
@@ -19,6 +19,8 @@
 
     cdef object PyUnicode_FromEncodedObject(object s, char* encoding,
                                             char* errors)
+    cdef object PyUnicode_AsEncodedString(object u, char* encoding,
+                                          char* errors)
     cdef object PyUnicode_Decode(char* s, Py_ssize_t size,
                                  char* encoding, char* errors)
     cdef object PyUnicode_DecodeUTF8(char* s, Py_ssize_t size, char* errors)

Modified: lxml/trunk/src/lxml/relaxng.pxi
==============================================================================
--- lxml/trunk/src/lxml/relaxng.pxi	(original)
+++ lxml/trunk/src/lxml/relaxng.pxi	Sat Sep  9 07:20:49 2006
@@ -44,6 +44,8 @@
             if filename is None:
                 # XXX assume a string object
                 filename = file
+            else:
+                filename = _encodeFilename(filename)
             parser_ctxt = relaxng.xmlRelaxNGNewParserCtxt(filename)
         else:
             raise RelaxNGParseError, "No tree or file given"

Modified: lxml/trunk/src/lxml/serializer.pxi
==============================================================================
--- lxml/trunk/src/lxml/serializer.pxi	(original)
+++ lxml/trunk/src/lxml/serializer.pxi	Sat Sep  9 07:20:49 2006
@@ -163,9 +163,9 @@
             "unknown encoding: '%s'", c_enc)
 
     if _isString(f):
-        filename = _utf8(f)
+        filename8 = _encodeFilename(f)
         c_buffer = tree.xmlOutputBufferCreateFilename(
-            _cstr(filename), enchandler, 0)
+            _cstr(filename8), enchandler, 0)
         state = python.PyEval_SaveThread()
     elif hasattr(f, 'write'):
         writer   = _FilelikeWriter(f)
@@ -196,8 +196,8 @@
     c_doc = _fakeRootDoc(c_base_doc, element._c_node)
     try:
         if _isString(f):
-            filename = _utf8(f)
-            c_filename = _cstr(filename)
+            filename8 = _encodeFilename(f)
+            c_filename = _cstr(filename8)
             state = python.PyEval_SaveThread()
             bytes = c14n.xmlC14NDocSave(c_doc, NULL, 0, NULL, 1, c_filename, 0)
             python.PyEval_RestoreThread(state)

Modified: lxml/trunk/src/lxml/xmlschema.pxi
==============================================================================
--- lxml/trunk/src/lxml/xmlschema.pxi	(original)
+++ lxml/trunk/src/lxml/xmlschema.pxi	Sat Sep  9 07:20:49 2006
@@ -50,6 +50,8 @@
             if filename is None:
                 # XXX assume a string object
                 filename = file
+            else:
+                filename = _encodeFilename(filename)
             parser_ctxt = xmlschema.xmlSchemaNewParserCtxt(filename)
             self._c_schema = xmlschema.xmlSchemaParse(parser_ctxt)
             xmlschema.xmlSchemaFreeParserCtxt(parser_ctxt)

Modified: lxml/trunk/src/lxml/xslt.pxi
==============================================================================
--- lxml/trunk/src/lxml/xslt.pxi	(original)
+++ lxml/trunk/src/lxml/xslt.pxi	Sat Sep  9 07:20:49 2006
@@ -77,7 +77,7 @@
                 c_doc = _newDoc()
             if doc_ref._type == PARSER_DATA_STRING:
                 c_doc = _internalParseDoc(
-                    _cstr(doc_ref._data_utf), parse_options,
+                    _cstr(doc_ref._data_bytes), parse_options,
                     resolver_context)
             elif doc_ref._type == PARSER_DATA_FILE:
                 data = doc_ref._file.read()
@@ -86,7 +86,7 @@
                     resolver_context)
             elif doc_ref._type == PARSER_DATA_FILENAME:
                 c_doc = _internalParseDocFromFile(
-                    _cstr(doc_ref._data_utf), parse_options,
+                    _cstr(doc_ref._data_bytes), parse_options,
                     resolver_context)
             if c_doc is not NULL and c_doc.URL is NULL:
                 c_doc.URL = tree.xmlStrdup(c_uri)


More information about the lxml-checkins mailing list