[Lxml-checkins] r54974 - in lxml/trunk: . src/lxml src/lxml/html/tests

scoder at codespeak.net scoder at codespeak.net
Tue May 20 00:01:35 CEST 2008


Author: scoder
Date: Tue May 20 00:01:33 2008
New Revision: 54974

Modified:
   lxml/trunk/   (props changed)
   lxml/trunk/CHANGES.txt
   lxml/trunk/src/lxml/apihelpers.pxi
   lxml/trunk/src/lxml/etree_defs.h
   lxml/trunk/src/lxml/html/tests/test_forms.txt
   lxml/trunk/src/lxml/iterparse.pxi
   lxml/trunk/src/lxml/lxml.etree.pyx
   lxml/trunk/src/lxml/parser.pxi
   lxml/trunk/src/lxml/python.pxd
   lxml/trunk/src/lxml/xslt.pxi
Log:
 r4232 at delle:  sbehnel | 2008-05-19 23:51:31 +0200
 unicode filename handling, uses a heuristic to distinguish file paths and network paths, plus some general Py3 fixes


Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt	(original)
+++ lxml/trunk/CHANGES.txt	Tue May 20 00:01:33 2008
@@ -8,6 +8,9 @@
 Features added
 --------------
 
+* File name handling now uses a heuristic to convert between byte
+  strings and unicode strings.
+
 * Parsing from a plain file object frees the GIL.
 
 * Running ``iterparse()`` on a plain file (or filename) frees the GIL

Modified: lxml/trunk/src/lxml/apihelpers.pxi
==============================================================================
--- lxml/trunk/src/lxml/apihelpers.pxi	(original)
+++ lxml/trunk/src/lxml/apihelpers.pxi	Tue May 20 00:01:33 2008
@@ -435,7 +435,10 @@
     # handle two most common cases first
     if c_text is NULL:
         if scount > 0:
-            return ''
+            if python.IS_PYTHON3:
+                return u''
+            else:
+                return ''
         else:
             return None
     if scount == 1:
@@ -505,7 +508,7 @@
     else:
         c_ns = element._doc._findOrBuildNodeNs(
             element._c_node, _cstr(ns), NULL)
-        return '%s:%s' % (c_ns.prefix, tag) # UTF-8
+        return python.PyString_FromFormat('%s:%s', c_ns.prefix, _cstr(tag))
 
 cdef inline bint _hasChild(xmlNode* c_node):
     return c_node is not NULL and _findChildForwards(c_node, 0) is not NULL
@@ -1034,6 +1037,27 @@
         raise TypeError, "Argument must be string or unicode."
     return s
 
+cdef bint _isFilePath(char* c_path):
+    u"Heuristic: return 1 if c_path looks like a file path, 0 if a ':' comes first."
+    # a leading '/' is treated as an absolute Unix path or a Windows network path
+    if c_path[0] == c'/':
+        return 1
+    # an ASCII letter followed by ':' is treated as a Windows drive path ("C:...")
+    if (c_path[0] >= c'a' and c_path[0] <= c'z') or \
+            (c_path[0] >= c'A' and c_path[0] <= c'Z'):
+        if c_path[1] == c':':
+            return 1
+    # otherwise scan: ':' before any '/' or '\\' => not a file path (presumably a URL)
+    while c_path[0] != c'\0':
+        if c_path[0] == c':':
+            return 0
+        if c_path[0] == c'/':
+            return 1
+        if c_path[0] == c'\\':
+            return 1
+        c_path += 1
+    return 1
+
 cdef object _encodeFilename(object filename):
     u"""Make sure a filename is 8-bit encoded (or None).
     """
@@ -1042,11 +1066,34 @@
     elif python.PyString_Check(filename):
         return filename
     elif python.PyUnicode_Check(filename):
-        return python.PyUnicode_AsEncodedString(
-            filename, _C_FILENAME_ENCODING, NULL)
+        filename8 = python.PyUnicode_AsEncodedString(
+            filename, 'UTF-8', NULL)
+        if _isFilePath(filename8):
+            try:
+                return python.PyUnicode_AsEncodedString(
+                    filename, _C_FILENAME_ENCODING, NULL)
+            except UnicodeEncodeError:
+                pass
+        return filename8
     else:
         raise TypeError, u"Argument must be string or unicode."
 
+cdef object _decodeFilename(char* c_path):
+    u"""Decode a C path to unicode; returns a byte string if decoding fails.
+    """
+    cdef Py_ssize_t c_len = cstd.strlen(c_path)
+    if _isFilePath(c_path):
+        try:
+            return python.PyUnicode_Decode(
+                c_path, c_len, _C_FILENAME_ENCODING, NULL)
+        except UnicodeDecodeError:
+            pass
+    try:
+        return python.PyUnicode_DecodeUTF8(c_path, c_len, NULL)
+    except UnicodeDecodeError:
+        # last resort: not decodable as UTF-8 either, so return the raw bytes
+        return python.PyString_FromStringAndSize(c_path, c_len)
+
 cdef object _encodeFilenameUTF8(object filename):
     u"""Recode filename as UTF-8. Tries ASCII, local filesystem encoding and
     UTF-8 as source encoding.
@@ -1182,6 +1229,8 @@
 cdef object _namespacedNameFromNsName(char* href, char* name):
     if href is NULL:
         return funicode(name)
+    elif python.IS_PYTHON3:
+        return python.PyUnicode_FromFormat("{%s}%s", href, name)
     else:
         s = python.PyString_FromFormat("{%s}%s", href, name)
         if isutf8(href) or isutf8(name):

Modified: lxml/trunk/src/lxml/etree_defs.h
==============================================================================
--- lxml/trunk/src/lxml/etree_defs.h	(original)
+++ lxml/trunk/src/lxml/etree_defs.h	Tue May 20 00:01:33 2008
@@ -15,6 +15,8 @@
 /* Python 3 doesn't have PyFile_*() */
 #if PY_VERSION_HEX >= 0x03000000
 #  define PyFile_AsFile(o) (NULL)
+#else
+#  define PyUnicode_FromFormat(s, ...) (NULL)
 #endif
 
 #if PY_VERSION_HEX >= 0x03000000

Modified: lxml/trunk/src/lxml/html/tests/test_forms.txt
==============================================================================
--- lxml/trunk/src/lxml/html/tests/test_forms.txt	(original)
+++ lxml/trunk/src/lxml/html/tests/test_forms.txt	Tue May 20 00:01:33 2008
@@ -33,10 +33,10 @@
 ... </form>
 ... </body></html>''', base_url='http://example.org/form.html')
 >>> h.base_url
-'http://example.org/form.html'
+u'http://example.org/form.html'
 >>> f = h.forms[0]
 >>> f.action
-'http://example.org/test'
+u'http://example.org/test'
 >>> f.method
 'GET'
 >>> f.inputs # doctest:+NOPARSE_MARKUP

Modified: lxml/trunk/src/lxml/iterparse.pxi
==============================================================================
--- lxml/trunk/src/lxml/iterparse.pxi	(original)
+++ lxml/trunk/src/lxml/iterparse.pxi	Tue May 20 00:01:33 2008
@@ -365,7 +365,9 @@
         cdef int parse_options
         if not hasattr(source, u'read'):
             filename = _encodeFilename(source)
-            source = open(filename, u'rb')
+            if not python.IS_PYTHON3:
+                source = filename
+            source = open(source, u'rb')
         else:
             filename = _encodeFilename(_getFilenameForFile(source))
 

Modified: lxml/trunk/src/lxml/lxml.etree.pyx
==============================================================================
--- lxml/trunk/src/lxml/lxml.etree.pyx	(original)
+++ lxml/trunk/src/lxml/lxml.etree.pyx	Tue May 20 00:01:33 2008
@@ -82,6 +82,8 @@
     _FILENAME_ENCODING = sys.getdefaultencoding()
 if _FILENAME_ENCODING is None:
     _FILENAME_ENCODING = 'ascii'
+else:
+    _FILENAME_ENCODING = _FILENAME_ENCODING.encode(u"UTF-8")
 cdef char* _C_FILENAME_ENCODING
 _C_FILENAME_ENCODING = _cstr(_FILENAME_ENCODING)
 
@@ -245,7 +247,7 @@
             text_or_uri = u"{%s}%s" % (text_or_uri, tag)
         else:
             if not _isString(text_or_uri):
-                text_or_uri = str(text_or_uri)
+                text_or_uri = unicode(text_or_uri)
             tag = _getNsTag(text_or_uri)[1]
             _tagValidOrRaise(tag)
         self.text = text_or_uri
@@ -255,9 +257,9 @@
         return self.text.__hash__()
     def __richcmp__(one, other, int op):
         if not _isString(one):
-            one = str(one)
+            one = unicode(one)
         if not _isString(other):
-            other = str(other)
+            other = unicode(other)
         return python.PyObject_RichCompare(one, other, op)
 
 
@@ -326,11 +328,11 @@
         if c_doc.version is NULL:
             version = None
         else:
-            version = c_doc.version
+            version = funicode(c_doc.version)
         if c_doc.encoding is NULL:
             encoding = None
         else:
-            encoding = c_doc.encoding
+            encoding = funicode(c_doc.encoding)
         return (version, encoding)
 
     cdef buildNewPrefix(self):
@@ -462,7 +464,7 @@
         def __get__(self):
             if self._doc._c_doc.URL is NULL:
                 return None
-            return self._doc._c_doc.URL
+            return _decodeFilename(self._doc._c_doc.URL)
         def __set__(self, url):
             cdef char* c_oldurl
             url = _encodeFilename(url)
@@ -905,9 +907,8 @@
             if c_base is NULL:
                 if self._doc._c_doc.URL is NULL:
                     return None
-                return self._doc._c_doc.URL
-            # FIXME: this might be UTF-8 or any other 8-bit encoding
-            base = c_base
+                return _decodeFilename(self._doc._c_doc.URL)
+            base = _decodeFilename(c_base)
             tree.xmlFree(c_base)
             return base
         def __set__(self, url):
@@ -1839,7 +1840,7 @@
 
     def update(self, sequence_or_dict):
         if isinstance(sequence_or_dict, dict):
-            sequence_or_dict = sequence_or_dict.iteritems()
+            sequence_or_dict = sequence_or_dict.items()
         for key, value in sequence_or_dict:
             _setAttributeValue(self._element, key, value)
 
@@ -2192,7 +2193,7 @@
         else:
             events = (u"start",)
         self._start_element = element
-        self._nextEvent = iterwalk(element, events=events, tag=tag).next
+        self._nextEvent = iterwalk(element, events=events, tag=tag).__next__
 
     def __iter__(self):
         return self
@@ -2454,7 +2455,7 @@
     """
     _dumpToFile(sys.stdout, elem._c_node, pretty_print, with_tail)
 
-def tostring(element_or_tree, *, encoding=None, method="xml",
+def tostring(element_or_tree, *, encoding=None, method=u"xml",
              xml_declaration=None, pretty_print=False, with_tail=True):
     u"""tostring(element_or_tree, encoding=None, method="xml",
                 xml_declaration=None, pretty_print=False, with_tail=True)

Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi	(original)
+++ lxml/trunk/src/lxml/parser.pxi	Tue May 20 00:01:33 2008
@@ -335,12 +335,9 @@
     try:
         if c_url is NULL:
             url = None
-        elif c_context.myDoc is NULL or c_context.myDoc.URL is NULL:
-            # parsing a main document, so URL was passed verbatimly by user
-            url = c_url
         else:
-            # parsing a related document (DTD etc.) => UTF-8 encoded URL
-            url = funicode(c_url)
+            # parsing a related document (DTD etc.) => UTF-8 encoded URL?
+            url = _decodeFilename(c_url)
         if c_pubid is NULL:
             pubid = None
         else:

Modified: lxml/trunk/src/lxml/python.pxd
==============================================================================
--- lxml/trunk/src/lxml/python.pxd	(original)
+++ lxml/trunk/src/lxml/python.pxd	Tue May 20 00:01:33 2008
@@ -30,6 +30,7 @@
                                             char* errors)
     cdef object PyUnicode_AsEncodedString(object u, char* encoding,
                                           char* errors)
+    cdef object PyUnicode_FromFormat(char* format, ...) # Python 3
     cdef object PyUnicode_Decode(char* s, Py_ssize_t size,
                                  char* encoding, char* errors)
     cdef object PyUnicode_DecodeUTF8(char* s, Py_ssize_t size, char* errors)

Modified: lxml/trunk/src/lxml/xslt.pxi
==============================================================================
--- lxml/trunk/src/lxml/xslt.pxi	(original)
+++ lxml/trunk/src/lxml/xslt.pxi	Tue May 20 00:01:33 2008
@@ -85,13 +85,13 @@
     try:
         resolvers = context._resolvers
         if cstd.strncmp('string://', c_uri, 9) == 0:
-            uri = funicode(c_uri + 9)
+            uri = _decodeFilename(c_uri + 9)
             if cstd.strncmp('string://', context._c_style_doc.URL, 9) != 0 and \
                     cstd.strcmp('<string>', context._c_style_doc.URL) != 0:
                 # stylesheet URL known => make the target URL absolute
-                uri = os_path_join(context._c_style_doc.URL, uri)
+                uri = os_path_join(_decodeFilename(context._c_style_doc.URL), uri)
         else:
-            uri = funicode(c_uri)
+            uri = _decodeFilename(c_uri)
         doc_ref = resolvers.resolve(uri, None, context)
 
         c_doc = NULL


More information about the lxml-checkins mailing list