[Lxml-checkins] r32095 - lxml/trunk/src/lxml
scoder at codespeak.net
scoder at codespeak.net
Sat Sep 9 07:20:54 CEST 2006
Author: scoder
Date: Sat Sep 9 07:20:49 2006
New Revision: 32095
Modified:
lxml/trunk/src/lxml/apihelpers.pxi
lxml/trunk/src/lxml/docloader.pxi
lxml/trunk/src/lxml/etree.pyx
lxml/trunk/src/lxml/iterparse.pxi
lxml/trunk/src/lxml/parser.pxi
lxml/trunk/src/lxml/python.pxd
lxml/trunk/src/lxml/relaxng.pxi
lxml/trunk/src/lxml/serializer.pxi
lxml/trunk/src/lxml/xmlschema.pxi
lxml/trunk/src/lxml/xslt.pxi
Log:
fix handling of 8-bit encoded filenames
Modified: lxml/trunk/src/lxml/apihelpers.pxi
==============================================================================
--- lxml/trunk/src/lxml/apihelpers.pxi (original)
+++ lxml/trunk/src/lxml/apihelpers.pxi Sat Sep 9 07:20:49 2006
@@ -501,6 +501,46 @@
else:
raise TypeError, "Argument must be string or unicode."
+cdef object _encodeFilename(object filename):
+ if filename is None:
+ return None
+ elif python.PyString_Check(filename):
+ return filename
+ elif python.PyUnicode_Check(filename):
+ return python.PyUnicode_AsEncodedString(
+ filename, _C_FILENAME_ENCODING, NULL)
+ else:
+ raise TypeError, "Argument must be string or unicode."
+
+cdef object _encodeFilenameUTF8(object filename):
+ """Recode filename as UTF-8. Tries ASCII, local filesystem encoding and
+ UTF-8 as source encoding.
+ """
+ cdef char* c_filename
+ if filename is None:
+ return None
+ elif python.PyString_Check(filename):
+ c_filename = _cstr(filename)
+ if not isutf8(c_filename):
+ # plain ASCII!
+ return filename
+ try:
+ # try to decode with default encoding
+ filename = python.PyUnicode_Decode(
+ c_filename, python.PyString_GET_SIZE(filename),
+ _C_FILENAME_ENCODING, NULL)
+ except UnicodeDecodeError, decode_exc:
+ try:
+ # try if it's UTF-8
+ filename = python.PyUnicode_DecodeUTF8(
+ c_filename, python.PyString_GET_SIZE(filename), NULL)
+ except UnicodeDecodeError:
+ raise decode_exc # otherwise re-raise original exception
+ if python.PyUnicode_Check(filename):
+ return python.PyUnicode_AsUTF8String(filename)
+ else:
+ raise TypeError, "Argument must be string or unicode."
+
cdef _getNsTag(tag):
"""Given a tag, find namespace URI and tag name.
Return None for NS uri if no namespace URI available.
Modified: lxml/trunk/src/lxml/docloader.pxi
==============================================================================
--- lxml/trunk/src/lxml/docloader.pxi (original)
+++ lxml/trunk/src/lxml/docloader.pxi Sat Sep 9 07:20:49 2006
@@ -8,7 +8,7 @@
cdef class _InputDocument:
cdef _InputDocumentDataType _type
- cdef object _data_utf
+ cdef object _data_bytes
cdef object _file
cdef class Resolver:
@@ -28,7 +28,7 @@
cdef _InputDocument doc_ref
doc_ref = _InputDocument()
doc_ref._type = PARSER_DATA_STRING
- doc_ref._data_utf = _utf8(string)
+ doc_ref._data_bytes = _utf8(string)
return doc_ref
def resolve_filename(self, filename, context):
@@ -36,7 +36,7 @@
cdef _InputDocument doc_ref
doc_ref = _ParserInput()
doc_ref._type = PARSER_DATA_FILENAME
- doc_ref._data_utf = _utf8(filename)
+ doc_ref._data_bytes = _encodeFilename(filename)
return doc_ref
def resolve_file(self, f, context):
Modified: lxml/trunk/src/lxml/etree.pyx
==============================================================================
--- lxml/trunk/src/lxml/etree.pyx (original)
+++ lxml/trunk/src/lxml/etree.pyx Sat Sep 9 07:20:49 2006
@@ -68,6 +68,16 @@
# initialize parser (and threading)
xmlparser.xmlInitParser()
+# filename encoding
+cdef object _FILENAME_ENCODING
+_FILENAME_ENCODING = sys.getfilesystemencoding()
+if _FILENAME_ENCODING is None:
+ _FILENAME_ENCODING = sys.getdefaultencoding()
+if _FILENAME_ENCODING is None:
+ _FILENAME_ENCODING = 'ascii'
+cdef char* _C_FILENAME_ENCODING
+_C_FILENAME_ENCODING = _cstr(_FILENAME_ENCODING)
+
# Error superclass for ElementTree compatibility
class Error(Exception):
Modified: lxml/trunk/src/lxml/iterparse.pxi
==============================================================================
--- lxml/trunk/src/lxml/iterparse.pxi (original)
+++ lxml/trunk/src/lxml/iterparse.pxi Sat Sep 9 07:20:49 2006
@@ -234,10 +234,12 @@
cdef char* c_filename
cdef int parse_options
if not hasattr(source, 'read'):
- self._filename = source
- source = open(source, 'rb')
+ self._filename = _encodeFilename(source)
+ source = open(self._filename, 'rb')
else:
self._filename = _getFilenameForFile(source)
+ if self._filename is not None:
+ self._filename = _encodeFilename(self._filename)
if self._filename is not None:
c_filename = self._filename
else:
@@ -301,11 +303,7 @@
break
if error != 0:
self._source = None
- if self._filename is not None:
- c_filename = self._filename
- else:
- c_filename = NULL
- _raiseParseError(self._parser_ctxt, c_filename)
+ _raiseParseError(self._parser_ctxt, self._filename)
if python.PyList_GET_SIZE(context._events) == 0:
self.root = context._root
raise StopIteration
Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi (original)
+++ lxml/trunk/src/lxml/parser.pxi Sat Sep 9 07:20:49 2006
@@ -289,12 +289,12 @@
c_input = NULL
data = None
if doc_ref._type == PARSER_DATA_STRING:
- data = doc_ref._data_utf
+ data = doc_ref._data_bytes
c_input = xmlparser.xmlNewStringInputStream(
c_context, _cstr(data))
elif doc_ref._type == PARSER_DATA_FILENAME:
c_input = xmlparser.xmlNewInputFromFile(
- c_context, _cstr(doc_ref._data_utf))
+ c_context, _cstr(doc_ref._data_bytes))
elif doc_ref._type == PARSER_DATA_FILE:
file_context = _FileParserContext(doc_ref._file, context, url)
c_input = file_context._createParserInput(c_context)
@@ -451,7 +451,7 @@
python.PyEval_RestoreThread(state)
recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
- return _handleParseResult(pctxt, result, NULL, recover)
+ return _handleParseResult(pctxt, result, None, recover)
finally:
self._error_log.disconnect()
self._unlockParser()
@@ -482,7 +482,7 @@
python.PyEval_RestoreThread(state)
recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
- return _handleParseResult(pctxt, result, NULL, recover)
+ return _handleParseResult(pctxt, result, None, recover)
finally:
self._error_log.disconnect()
self._unlockParser()
@@ -521,9 +521,7 @@
cdef char* c_filename
cdef int recover
if not filename:
- c_filename = NULL
- else:
- c_filename = filename
+ filename = None
self._lockParser()
self._error_log.connect()
try:
@@ -534,23 +532,22 @@
pctxt, self._parse_options, self._parser_type)
recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
- return _handleParseResult(pctxt, result, c_filename, recover)
+ return _handleParseResult(pctxt, result, filename, recover)
finally:
self._error_log.disconnect()
self._unlockParser()
-cdef int _raiseParseError(xmlParserCtxt* ctxt, char* c_filename) except 0:
- if c_filename is not NULL and \
+cdef int _raiseParseError(xmlParserCtxt* ctxt, filename) except 0:
+ if filename is not None and \
ctxt.lastError.domain == xmlerror.XML_FROM_IO:
if ctxt.lastError.message is not NULL:
- message = "Error reading file %s: %s" % (
- funicode(c_filename),
- funicode(ctxt.lastError.message).strip())
+ message = "Error reading file '%s': %s" % (
+ filename, (ctxt.lastError.message).strip())
else:
- message = "Error reading file %s" % funicode(c_filename)
+ message = "Error reading file '%s'" % filename
raise IOError, message
elif ctxt.lastError.message is not NULL:
- message = funicode(ctxt.lastError.message).strip()
+ message = (ctxt.lastError.message).strip()
if ctxt.lastError.line >= 0:
message = "line %d: %s" % (ctxt.lastError.line, message)
raise XMLSyntaxError, message
@@ -558,7 +555,7 @@
raise XMLSyntaxError
cdef xmlDoc* _handleParseResult(xmlParserCtxt* ctxt, xmlDoc* result,
- char* c_filename, int recover) except NULL:
+ filename, int recover) except NULL:
cdef _ResolverContext context
if ctxt.myDoc is not NULL:
if ctxt.myDoc != result:
@@ -582,9 +579,9 @@
context._raise_if_stored()
if result is NULL:
- _raiseParseError(ctxt, c_filename)
- elif result.URL is NULL and c_filename is not NULL:
- result.URL = tree.xmlStrdup(c_filename)
+ _raiseParseError(ctxt, filename)
+ elif result.URL is NULL and filename is not None:
+ result.URL = tree.xmlStrdup(_cstr(filename))
return result
############################################################
@@ -669,7 +666,7 @@
pctxt, c_text, NULL, NULL, options)
try:
recover = options & xmlparser.XML_PARSE_RECOVER
- c_doc = _handleParseResult(pctxt, c_doc, NULL, recover)
+ c_doc = _handleParseResult(pctxt, c_doc, None, recover)
finally:
xmlparser.xmlFreeParserCtxt(pctxt)
return c_doc
@@ -689,7 +686,11 @@
pctxt, c_filename, NULL, options)
try:
recover = options & xmlparser.XML_PARSE_RECOVER
- c_doc = _handleParseResult(pctxt, c_doc, c_filename, recover)
+ if c_filename is NULL:
+ filename = None
+ else:
+ filename = c_filename
+ c_doc = _handleParseResult(pctxt, c_doc, filename, recover)
finally:
xmlparser.xmlFreeParserCtxt(pctxt)
return c_doc
@@ -782,7 +783,8 @@
if not filename:
c_filename = NULL
else:
- c_filename = _cstr(filename)
+ filename_utf = _encodeFilenameUTF8(filename)
+ c_filename = _cstr(filename_utf)
if python.PyUnicode_Check(text):
return (<_BaseParser>parser)._parseUnicodeDoc(text, c_filename)
else:
@@ -790,14 +792,13 @@
c_len = python.PyString_GET_SIZE(text)
return (<_BaseParser>parser)._parseDoc(c_text, c_len, c_filename)
-cdef xmlDoc* _parseDocFromFile(filename, _BaseParser parser) except NULL:
+cdef xmlDoc* _parseDocFromFile(filename8, _BaseParser parser) except NULL:
if parser is None:
parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
- return (<_BaseParser>parser)._parseDocFromFile(_cstr(filename))
+ return (<_BaseParser>parser)._parseDocFromFile(_cstr(filename8))
cdef xmlDoc* _parseDocFromFilelike(source, filename,
_BaseParser parser) except NULL:
- cdef char* c_filename
if parser is None:
parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
return (<_BaseParser>parser)._parseDocFromFilelike(source, filename)
@@ -862,17 +863,19 @@
if hasattr(source, 'getvalue') and hasattr(source, 'tell'):
# StringIO - reading from start?
if source.tell() == 0:
- return _parseMemoryDocument(source.getvalue(), filename, parser)
+ return _parseMemoryDocument(
+ source.getvalue(), _encodeFilenameUTF8(filename), parser)
# Support for file-like objects (urlgrabber.urlopen, ...)
if hasattr(source, 'read'):
- return _parseFilelikeDocument(source, filename, parser)
+ return _parseFilelikeDocument(
+ source, _encodeFilenameUTF8(filename), parser)
# Otherwise parse the file directly from the filesystem
if filename is None:
- filename = source
+ filename = _encodeFilename(source)
# open filename
- c_doc = _parseDocFromFile(_utf8(filename), parser)
+ c_doc = _parseDocFromFile(filename, parser)
return _documentFactory(c_doc, parser)
cdef _Document _parseMemoryDocument(text, url, _BaseParser parser):
@@ -886,14 +889,14 @@
text = python.PyUnicode_AsUTF8String(text)
elif not python.PyString_Check(text):
raise ValueError, "can only parse strings"
- if url is not None:
- url = _utf8(url)
+ if python.PyUnicode_Check(url):
+ url = python.PyUnicode_AsUTF8String(url)
c_doc = _parseDoc(text, url, parser)
return _documentFactory(c_doc, parser)
cdef _Document _parseFilelikeDocument(source, url, _BaseParser parser):
cdef xmlDoc* c_doc
- if url is not None:
- url = _utf8(url)
+ if python.PyUnicode_Check(url):
+ url = python.PyUnicode_AsUTF8String(url)
c_doc = _parseDocFromFilelike(source, url, parser)
return _documentFactory(c_doc, parser)
Modified: lxml/trunk/src/lxml/python.pxd
==============================================================================
--- lxml/trunk/src/lxml/python.pxd (original)
+++ lxml/trunk/src/lxml/python.pxd Sat Sep 9 07:20:49 2006
@@ -19,6 +19,8 @@
cdef object PyUnicode_FromEncodedObject(object s, char* encoding,
char* errors)
+ cdef object PyUnicode_AsEncodedString(object u, char* encoding,
+ char* errors)
cdef object PyUnicode_Decode(char* s, Py_ssize_t size,
char* encoding, char* errors)
cdef object PyUnicode_DecodeUTF8(char* s, Py_ssize_t size, char* errors)
Modified: lxml/trunk/src/lxml/relaxng.pxi
==============================================================================
--- lxml/trunk/src/lxml/relaxng.pxi (original)
+++ lxml/trunk/src/lxml/relaxng.pxi Sat Sep 9 07:20:49 2006
@@ -44,6 +44,8 @@
if filename is None:
# XXX assume a string object
filename = file
+ else:
+ filename = _encodeFilename(filename)
parser_ctxt = relaxng.xmlRelaxNGNewParserCtxt(filename)
else:
raise RelaxNGParseError, "No tree or file given"
Modified: lxml/trunk/src/lxml/serializer.pxi
==============================================================================
--- lxml/trunk/src/lxml/serializer.pxi (original)
+++ lxml/trunk/src/lxml/serializer.pxi Sat Sep 9 07:20:49 2006
@@ -163,9 +163,9 @@
"unknown encoding: '%s'", c_enc)
if _isString(f):
- filename = _utf8(f)
+ filename8 = _encodeFilename(f)
c_buffer = tree.xmlOutputBufferCreateFilename(
- _cstr(filename), enchandler, 0)
+ _cstr(filename8), enchandler, 0)
state = python.PyEval_SaveThread()
elif hasattr(f, 'write'):
writer = _FilelikeWriter(f)
@@ -196,8 +196,8 @@
c_doc = _fakeRootDoc(c_base_doc, element._c_node)
try:
if _isString(f):
- filename = _utf8(f)
- c_filename = _cstr(filename)
+ filename8 = _encodeFilename(f)
+ c_filename = _cstr(filename8)
state = python.PyEval_SaveThread()
bytes = c14n.xmlC14NDocSave(c_doc, NULL, 0, NULL, 1, c_filename, 0)
python.PyEval_RestoreThread(state)
Modified: lxml/trunk/src/lxml/xmlschema.pxi
==============================================================================
--- lxml/trunk/src/lxml/xmlschema.pxi (original)
+++ lxml/trunk/src/lxml/xmlschema.pxi Sat Sep 9 07:20:49 2006
@@ -50,6 +50,8 @@
if filename is None:
# XXX assume a string object
filename = file
+ else:
+ filename = _encodeFilename(filename)
parser_ctxt = xmlschema.xmlSchemaNewParserCtxt(filename)
self._c_schema = xmlschema.xmlSchemaParse(parser_ctxt)
xmlschema.xmlSchemaFreeParserCtxt(parser_ctxt)
Modified: lxml/trunk/src/lxml/xslt.pxi
==============================================================================
--- lxml/trunk/src/lxml/xslt.pxi (original)
+++ lxml/trunk/src/lxml/xslt.pxi Sat Sep 9 07:20:49 2006
@@ -77,7 +77,7 @@
c_doc = _newDoc()
if doc_ref._type == PARSER_DATA_STRING:
c_doc = _internalParseDoc(
- _cstr(doc_ref._data_utf), parse_options,
+ _cstr(doc_ref._data_bytes), parse_options,
resolver_context)
elif doc_ref._type == PARSER_DATA_FILE:
data = doc_ref._file.read()
@@ -86,7 +86,7 @@
resolver_context)
elif doc_ref._type == PARSER_DATA_FILENAME:
c_doc = _internalParseDocFromFile(
- _cstr(doc_ref._data_utf), parse_options,
+ _cstr(doc_ref._data_bytes), parse_options,
resolver_context)
if c_doc is not NULL and c_doc.URL is NULL:
c_doc.URL = tree.xmlStrdup(c_uri)
More information about the lxml-checkins
mailing list