[Lxml-checkins] r54974 - in lxml/trunk: . src/lxml src/lxml/html/tests
scoder at codespeak.net
scoder at codespeak.net
Tue May 20 00:01:35 CEST 2008
Author: scoder
Date: Tue May 20 00:01:33 2008
New Revision: 54974
Modified:
lxml/trunk/ (props changed)
lxml/trunk/CHANGES.txt
lxml/trunk/src/lxml/apihelpers.pxi
lxml/trunk/src/lxml/etree_defs.h
lxml/trunk/src/lxml/html/tests/test_forms.txt
lxml/trunk/src/lxml/iterparse.pxi
lxml/trunk/src/lxml/lxml.etree.pyx
lxml/trunk/src/lxml/parser.pxi
lxml/trunk/src/lxml/python.pxd
lxml/trunk/src/lxml/xslt.pxi
Log:
r4232 at delle: sbehnel | 2008-05-19 23:51:31 +0200
unicode filename handling, uses a heuristic to distinguish file paths and network paths, plus some general Py3 fixes
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Tue May 20 00:01:33 2008
@@ -8,6 +8,9 @@
Features added
--------------
+* File name handling now uses a heuristic to convert between byte
+ strings and unicode strings.
+
* Parsing from a plain file object frees the GIL.
* Running ``iterparse()`` on a plain file (or filename) frees the GIL
Modified: lxml/trunk/src/lxml/apihelpers.pxi
==============================================================================
--- lxml/trunk/src/lxml/apihelpers.pxi (original)
+++ lxml/trunk/src/lxml/apihelpers.pxi Tue May 20 00:01:33 2008
@@ -435,7 +435,10 @@
# handle two most common cases first
if c_text is NULL:
if scount > 0:
- return ''
+ if python.IS_PYTHON3:
+ return u''
+ else:
+ return ''
else:
return None
if scount == 1:
@@ -505,7 +508,7 @@
else:
c_ns = element._doc._findOrBuildNodeNs(
element._c_node, _cstr(ns), NULL)
- return '%s:%s' % (c_ns.prefix, tag) # UTF-8
+ return python.PyString_FromFormat('%s:%s', c_ns.prefix, _cstr(tag))
cdef inline bint _hasChild(xmlNode* c_node):
return c_node is not NULL and _findChildForwards(c_node, 0) is not NULL
@@ -1034,6 +1037,27 @@
raise TypeError, "Argument must be string or unicode."
return s
+cdef bint _isFilePath(char* c_path):
+ u"simple heuristic to see if a path is a filename"
+ # test if it looks like an absolute Unix path or a Windows network path
+ if c_path[0] == c'/':
+ return 1
+ # test if it looks like an absolute Windows path
+ if (c_path[0] >= c'a' and c_path[0] <= c'z') or \
+ (c_path[0] >= c'A' and c_path[0] <= c'Z'):
+ if c_path[1] == c':':
+ return 1
+ # test if it looks like a relative path
+ while c_path[0] != c'\0':
+ if c_path[0] == c':':
+ return 0
+ if c_path[0] == c'/':
+ return 1
+ if c_path[0] == c'\\':
+ return 1
+ c_path += 1
+ return 1
+
cdef object _encodeFilename(object filename):
u"""Make sure a filename is 8-bit encoded (or None).
"""
@@ -1042,11 +1066,34 @@
elif python.PyString_Check(filename):
return filename
elif python.PyUnicode_Check(filename):
- return python.PyUnicode_AsEncodedString(
- filename, _C_FILENAME_ENCODING, NULL)
+ filename8 = python.PyUnicode_AsEncodedString(
+ filename, 'UTF-8', NULL)
+ if _isFilePath(filename8):
+ try:
+ return python.PyUnicode_AsEncodedString(
+ filename, _C_FILENAME_ENCODING, NULL)
+ except UnicodeEncodeError:
+ pass
+ return filename8
else:
raise TypeError, u"Argument must be string or unicode."
+cdef object _decodeFilename(char* c_path):
+ u"""Make the filename a unicode string if we are in Py3.
+ """
+ cdef Py_ssize_t c_len = cstd.strlen(c_path)
+ if _isFilePath(c_path):
+ try:
+ return python.PyUnicode_Decode(
+ c_path, c_len, _C_FILENAME_ENCODING, NULL)
+ except UnicodeDecodeError:
+ pass
+ try:
+ return python.PyUnicode_DecodeUTF8(c_path, c_len, NULL)
+ except UnicodeDecodeError:
+ # this is a stupid fallback, but it might still work...
+ return python.PyString_FromStringAndSize(c_path, c_len)
+
cdef object _encodeFilenameUTF8(object filename):
u"""Recode filename as UTF-8. Tries ASCII, local filesystem encoding and
UTF-8 as source encoding.
@@ -1182,6 +1229,8 @@
cdef object _namespacedNameFromNsName(char* href, char* name):
if href is NULL:
return funicode(name)
+ elif python.IS_PYTHON3:
+ return python.PyUnicode_FromFormat("{%s}%s", href, name)
else:
s = python.PyString_FromFormat("{%s}%s", href, name)
if isutf8(href) or isutf8(name):
Modified: lxml/trunk/src/lxml/etree_defs.h
==============================================================================
--- lxml/trunk/src/lxml/etree_defs.h (original)
+++ lxml/trunk/src/lxml/etree_defs.h Tue May 20 00:01:33 2008
@@ -15,6 +15,8 @@
/* Python 3 doesn't have PyFile_*() */
#if PY_VERSION_HEX >= 0x03000000
# define PyFile_AsFile(o) (NULL)
+#else
+# define PyUnicode_FromFormat(s, ...) (NULL)
#endif
#if PY_VERSION_HEX >= 0x03000000
Modified: lxml/trunk/src/lxml/html/tests/test_forms.txt
==============================================================================
--- lxml/trunk/src/lxml/html/tests/test_forms.txt (original)
+++ lxml/trunk/src/lxml/html/tests/test_forms.txt Tue May 20 00:01:33 2008
@@ -33,10 +33,10 @@
... </form>
... </body></html>''', base_url='http://example.org/form.html')
>>> h.base_url
-'http://example.org/form.html'
+u'http://example.org/form.html'
>>> f = h.forms[0]
>>> f.action
-'http://example.org/test'
+u'http://example.org/test'
>>> f.method
'GET'
>>> f.inputs # doctest:+NOPARSE_MARKUP
Modified: lxml/trunk/src/lxml/iterparse.pxi
==============================================================================
--- lxml/trunk/src/lxml/iterparse.pxi (original)
+++ lxml/trunk/src/lxml/iterparse.pxi Tue May 20 00:01:33 2008
@@ -365,7 +365,9 @@
cdef int parse_options
if not hasattr(source, u'read'):
filename = _encodeFilename(source)
- source = open(filename, u'rb')
+ if not python.IS_PYTHON3:
+ source = filename
+ source = open(source, u'rb')
else:
filename = _encodeFilename(_getFilenameForFile(source))
Modified: lxml/trunk/src/lxml/lxml.etree.pyx
==============================================================================
--- lxml/trunk/src/lxml/lxml.etree.pyx (original)
+++ lxml/trunk/src/lxml/lxml.etree.pyx Tue May 20 00:01:33 2008
@@ -82,6 +82,8 @@
_FILENAME_ENCODING = sys.getdefaultencoding()
if _FILENAME_ENCODING is None:
_FILENAME_ENCODING = 'ascii'
+else:
+ _FILENAME_ENCODING = _FILENAME_ENCODING.encode(u"UTF-8")
cdef char* _C_FILENAME_ENCODING
_C_FILENAME_ENCODING = _cstr(_FILENAME_ENCODING)
@@ -245,7 +247,7 @@
text_or_uri = u"{%s}%s" % (text_or_uri, tag)
else:
if not _isString(text_or_uri):
- text_or_uri = str(text_or_uri)
+ text_or_uri = unicode(text_or_uri)
tag = _getNsTag(text_or_uri)[1]
_tagValidOrRaise(tag)
self.text = text_or_uri
@@ -255,9 +257,9 @@
return self.text.__hash__()
def __richcmp__(one, other, int op):
if not _isString(one):
- one = str(one)
+ one = unicode(one)
if not _isString(other):
- other = str(other)
+ other = unicode(other)
return python.PyObject_RichCompare(one, other, op)
@@ -326,11 +328,11 @@
if c_doc.version is NULL:
version = None
else:
- version = c_doc.version
+ version = funicode(c_doc.version)
if c_doc.encoding is NULL:
encoding = None
else:
- encoding = c_doc.encoding
+ encoding = funicode(c_doc.encoding)
return (version, encoding)
cdef buildNewPrefix(self):
@@ -462,7 +464,7 @@
def __get__(self):
if self._doc._c_doc.URL is NULL:
return None
- return self._doc._c_doc.URL
+ return _decodeFilename(self._doc._c_doc.URL)
def __set__(self, url):
cdef char* c_oldurl
url = _encodeFilename(url)
@@ -905,9 +907,8 @@
if c_base is NULL:
if self._doc._c_doc.URL is NULL:
return None
- return self._doc._c_doc.URL
- # FIXME: this might be UTF-8 or any other 8-bit encoding
- base = c_base
+ return _decodeFilename(self._doc._c_doc.URL)
+ base = _decodeFilename(c_base)
tree.xmlFree(c_base)
return base
def __set__(self, url):
@@ -1839,7 +1840,7 @@
def update(self, sequence_or_dict):
if isinstance(sequence_or_dict, dict):
- sequence_or_dict = sequence_or_dict.iteritems()
+ sequence_or_dict = sequence_or_dict.items()
for key, value in sequence_or_dict:
_setAttributeValue(self._element, key, value)
@@ -2192,7 +2193,7 @@
else:
events = (u"start",)
self._start_element = element
- self._nextEvent = iterwalk(element, events=events, tag=tag).next
+ self._nextEvent = iterwalk(element, events=events, tag=tag).__next__
def __iter__(self):
return self
@@ -2454,7 +2455,7 @@
"""
_dumpToFile(sys.stdout, elem._c_node, pretty_print, with_tail)
-def tostring(element_or_tree, *, encoding=None, method="xml",
+def tostring(element_or_tree, *, encoding=None, method=u"xml",
xml_declaration=None, pretty_print=False, with_tail=True):
u"""tostring(element_or_tree, encoding=None, method="xml",
xml_declaration=None, pretty_print=False, with_tail=True)
Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi (original)
+++ lxml/trunk/src/lxml/parser.pxi Tue May 20 00:01:33 2008
@@ -335,12 +335,9 @@
try:
if c_url is NULL:
url = None
- elif c_context.myDoc is NULL or c_context.myDoc.URL is NULL:
- # parsing a main document, so URL was passed verbatimly by user
- url = c_url
else:
- # parsing a related document (DTD etc.) => UTF-8 encoded URL
- url = funicode(c_url)
+ # parsing a related document (DTD etc.) => UTF-8 encoded URL?
+ url = _decodeFilename(c_url)
if c_pubid is NULL:
pubid = None
else:
Modified: lxml/trunk/src/lxml/python.pxd
==============================================================================
--- lxml/trunk/src/lxml/python.pxd (original)
+++ lxml/trunk/src/lxml/python.pxd Tue May 20 00:01:33 2008
@@ -30,6 +30,7 @@
char* errors)
cdef object PyUnicode_AsEncodedString(object u, char* encoding,
char* errors)
+ cdef object PyUnicode_FromFormat(char* format, ...) # Python 3
cdef object PyUnicode_Decode(char* s, Py_ssize_t size,
char* encoding, char* errors)
cdef object PyUnicode_DecodeUTF8(char* s, Py_ssize_t size, char* errors)
Modified: lxml/trunk/src/lxml/xslt.pxi
==============================================================================
--- lxml/trunk/src/lxml/xslt.pxi (original)
+++ lxml/trunk/src/lxml/xslt.pxi Tue May 20 00:01:33 2008
@@ -85,13 +85,13 @@
try:
resolvers = context._resolvers
if cstd.strncmp('string://', c_uri, 9) == 0:
- uri = funicode(c_uri + 9)
+ uri = _decodeFilename(c_uri + 9)
if cstd.strncmp('string://', context._c_style_doc.URL, 9) != 0 and \
cstd.strcmp('<string>', context._c_style_doc.URL) != 0:
# stylesheet URL known => make the target URL absolute
- uri = os_path_join(context._c_style_doc.URL, uri)
+ uri = os_path_join(_decodeFilename(context._c_style_doc.URL), uri)
else:
- uri = funicode(c_uri)
+ uri = _decodeFilename(c_uri)
doc_ref = resolvers.resolve(uri, None, context)
c_doc = NULL
More information about the lxml-checkins mailing list