[Lxml-checkins] r51739 - in lxml/trunk: . src/lxml src/lxml/html src/lxml/tests

scoder at codespeak.net scoder at codespeak.net
Thu Feb 21 17:19:37 CET 2008


Author: scoder
Date: Thu Feb 21 17:19:36 2008
New Revision: 51739

Modified:
   lxml/trunk/   (props changed)
   lxml/trunk/CHANGES.txt
   lxml/trunk/TODO.txt
   lxml/trunk/src/lxml/docloader.pxi
   lxml/trunk/src/lxml/html/__init__.py
   lxml/trunk/src/lxml/lxml.etree.pyx
   lxml/trunk/src/lxml/parser.pxi
   lxml/trunk/src/lxml/relaxng.pxi
   lxml/trunk/src/lxml/tests/test_etree.py
   lxml/trunk/src/lxml/xmlid.pxi
   lxml/trunk/src/lxml/xmlschema.pxi
Log:
 r3549 at delle:  sbehnel | 2008-02-19 12:35:22 +0100
 support overriding base_url in parse()


Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt	(original)
+++ lxml/trunk/CHANGES.txt	Thu Feb 21 17:19:36 2008
@@ -8,6 +8,9 @@
 Features added
 --------------
 
+* Support passing ``base_url`` to file parser functions to override
+  the filename of the file(-like) object.
+
 Bugs fixed
 ----------
 

Modified: lxml/trunk/TODO.txt
==============================================================================
--- lxml/trunk/TODO.txt	(original)
+++ lxml/trunk/TODO.txt	Thu Feb 21 17:19:36 2008
@@ -21,6 +21,9 @@
 * better exception messages for XPath and schemas based on error log,
   e.g. missing namespace mappings in XPath
 
+* more testing on input/output of encoded filenames, including custom
+  resolvers, relative XSLT imports, ...
+
 
 QName
 -----

Modified: lxml/trunk/src/lxml/docloader.pxi
==============================================================================
--- lxml/trunk/src/lxml/docloader.pxi	(original)
+++ lxml/trunk/src/lxml/docloader.pxi	Thu Feb 21 17:19:36 2008
@@ -67,8 +67,8 @@
         doc_ref._filename = _encodeFilename(filename)
         return doc_ref
 
-    def resolve_file(self, f, context):
-        """resolve_file(self, f, context)
+    def resolve_file(self, f, context, *, base_url=None):
+        """resolve_file(self, f, context, base_url=None)
 
         Return an open file-like object as input document.
 
@@ -81,7 +81,10 @@
             raise TypeError("Argument is not a file-like object")
         doc_ref = _InputDocument()
         doc_ref._type = PARSER_DATA_FILE
-        doc_ref._filename = _getFilenameForFile(f)
+        if base_url is not None:
+            doc_ref._filename = _encodeFilename(base_url)
+        else:
+            doc_ref._filename = _getFilenameForFile(f)
         doc_ref._file = f
         return doc_ref
 

Modified: lxml/trunk/src/lxml/html/__init__.py
==============================================================================
--- lxml/trunk/src/lxml/html/__init__.py	(original)
+++ lxml/trunk/src/lxml/html/__init__.py	Thu Feb 21 17:19:36 2008
@@ -569,20 +569,18 @@
         body.tag = 'span'
     return body
 
-def parse(filename_or_url, parser=None, **kw):
+def parse(filename_or_url, parser=None, base_url=None, **kw):
     """
     Parse a filename, URL, or file-like object into an HTML document
     tree.  Note: this returns a tree, not an element.  Use
     ``parse(...).getroot()`` to get the document root.
 
-    You cannot give a base_url, but the filename/url will serve as
-    that URL.  If you pass in a file-like object and that object has a
-    ``.geturl()`` method then that will be used as the base_url
-    (``urllib.urlopen()`` returns file-like objects with this method).
+    You can override the base URL with the ``base_url`` keyword.  This
+    is most useful when parsing from a file-like object.
     """
     if parser is None:
         parser = html_parser
-    return etree.parse(filename_or_url, parser, **kw)
+    return etree.parse(filename_or_url, parser, base_url=base_url, **kw)
 
 def _contains_block_level_tag(el):
     # FIXME: I could do this with XPath, but would that just be

Modified: lxml/trunk/src/lxml/lxml.etree.pyx
==============================================================================
--- lxml/trunk/src/lxml/lxml.etree.pyx	(original)
+++ lxml/trunk/src/lxml/lxml.etree.pyx	Thu Feb 21 17:19:36 2008
@@ -1488,13 +1488,13 @@
         assert self._context_node is not None, \
                "ElementTree not initialized, missing root"
 
-    def parse(self, source, _BaseParser parser=None):
-        """parse(self, source, parser=None)
+    def parse(self, source, _BaseParser parser=None, *, base_url=None):
+        """parse(self, source, parser=None, base_url=None)
 
         Updates self with the content of source and returns its root
         """
         cdef _Document doc
-        doc = _parseDocument(source, parser)
+        doc = _parseDocument(source, parser, base_url)
         self._context_node = doc.getroot()
         if self._context_node is None:
             self._doc = doc
@@ -2300,7 +2300,7 @@
         doc  = element._doc
     elif file is not None:
         try:
-            doc = _parseDocument(file, parser)
+            doc = _parseDocument(file, parser, None)
         except _TargetParserResult, result_container:
             return result_container.result
     else:
@@ -2504,15 +2504,19 @@
         raise TypeError("Type '%s' cannot be serialized." %
                         type(element_or_tree))
 
-def parse(source, _BaseParser parser=None):
-    """parse(source, parser=None)
+def parse(source, _BaseParser parser=None, *, base_url=None):
+    """parse(source, parser=None, base_url=None)
 
     Return an ElementTree object loaded with source elements.  If no parser
     is provided as second argument, the default parser is used.
+
+    The ``base_url`` keyword allows setting a URL for the document
+    when parsing from a file-like object.  This is needed when looking
+    up external entities (DTD, XInclude, ...) with relative paths.
     """
     cdef _Document doc
     try:
-        doc = _parseDocument(source, parser)
+        doc = _parseDocument(source, parser, base_url)
         return _elementTreeFactory(doc, None)
     except _TargetParserResult, result_container:
         return result_container.result

Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi	(original)
+++ lxml/trunk/src/lxml/parser.pxi	Thu Feb 21 17:19:36 2008
@@ -1735,24 +1735,33 @@
 ## (here we convert to UTF-8)
 ############################################################
 
-cdef _Document _parseDocument(source, _BaseParser parser):
-    filename = _getFilenameForFile(source)
+cdef _Document _parseDocument(source, _BaseParser parser, base_url):
+    cdef _Document doc
+    if base_url is not None:
+        url = base_url
+    else:
+        url = _getFilenameForFile(source)
     if hasattr(source, 'getvalue') and hasattr(source, 'tell'):
         # StringIO - reading from start?
         if source.tell() == 0:
             return _parseMemoryDocument(
-                source.getvalue(), _encodeFilenameUTF8(filename), parser)
+                source.getvalue(), _encodeFilenameUTF8(url), parser)
 
     # Support for file-like objects (urlgrabber.urlopen, ...)
     if hasattr(source, 'read'):
         return _parseFilelikeDocument(
-            source, _encodeFilenameUTF8(filename), parser)
+            source, _encodeFilenameUTF8(url), parser)
 
     # Otherwise parse the file directly from the filesystem
-    if filename is None:
-        filename = _encodeFilename(source)
-    # open filename
-    return _parseDocumentFromURL(filename, parser)
+    filename = _encodeFilename(source)
+    doc = _parseDocumentFromURL(filename, parser)
+    # fix base URL if requested
+    if base_url is not None:
+        base_url = _encodeFilenameUTF8(base_url)
+        if doc._c_doc.URL is not NULL:
+            tree.xmlFree(doc._c_doc.URL)
+        doc._c_doc.URL = tree.xmlStrdup(_cstr(base_url))
+    return doc
 
 cdef _Document _parseDocumentFromURL(url, _BaseParser parser):
     cdef xmlDoc* c_doc

Modified: lxml/trunk/src/lxml/relaxng.pxi
==============================================================================
--- lxml/trunk/src/lxml/relaxng.pxi	(original)
+++ lxml/trunk/src/lxml/relaxng.pxi	Thu Feb 21 17:19:36 2008
@@ -57,7 +57,7 @@
                 self._error_log.connect()
                 parser_ctxt = relaxng.xmlRelaxNGNewParserCtxt(_cstr(filename))
             else:
-                doc = _parseDocument(file, None)
+                doc = _parseDocument(file, None, None)
                 self._error_log.connect()
                 parser_ctxt = relaxng.xmlRelaxNGNewDocParserCtxt(doc._c_doc)
         else:

Modified: lxml/trunk/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_etree.py	(original)
+++ lxml/trunk/src/lxml/tests/test_etree.py	Thu Feb 21 17:19:36 2008
@@ -1769,6 +1769,19 @@
         docinfo = root.getroottree().docinfo
         self.assertEquals(docinfo.URL, "http://no/such/url")
 
+    def test_parse_stringio_base_url(self):
+        etree = self.etree
+        tree = etree.parse(StringIO("<root/>"), base_url="http://no/such/url")
+        docinfo = tree.docinfo
+        self.assertEquals(docinfo.URL, "http://no/such/url")
+
+    def test_parse_base_url_docinfo(self):
+        etree = self.etree
+        tree = etree.parse(fileInTestDir('include/test_xinclude.xml'),
+                           base_url="http://no/such/url")
+        docinfo = tree.docinfo
+        self.assertEquals(docinfo.URL, "http://no/such/url")
+
     def test_HTML_base_url_docinfo(self):
         etree = self.etree
         root = etree.HTML("<html/>", base_url="http://no/such/url")

Modified: lxml/trunk/src/lxml/xmlid.pxi
==============================================================================
--- lxml/trunk/src/lxml/xmlid.pxi	(original)
+++ lxml/trunk/src/lxml/xmlid.pxi	Thu Feb 21 17:19:36 2008
@@ -40,7 +40,7 @@
     else:
         return (root, _IDDict(root))
 
-def parseid(source, parser=None):
+def parseid(source, parser=None, *, base_url=None):
     """parseid(source, parser=None)
 
     Parses the source into a tuple containing an ElementTree object and an
@@ -51,7 +51,7 @@
     The results are undefined.
     """
     cdef _Document doc
-    doc = _parseDocument(source, parser)
+    doc = _parseDocument(source, parser, base_url)
     return (_elementTreeFactory(doc, None), _IDDict(doc))
 
 cdef class _IDDict:

Modified: lxml/trunk/src/lxml/xmlschema.pxi
==============================================================================
--- lxml/trunk/src/lxml/xmlschema.pxi	(original)
+++ lxml/trunk/src/lxml/xmlschema.pxi	Thu Feb 21 17:19:36 2008
@@ -58,7 +58,7 @@
                 self._error_log.connect()
                 parser_ctxt = xmlschema.xmlSchemaNewParserCtxt(_cstr(filename))
             else:
-                doc = _parseDocument(file, None)
+                doc = _parseDocument(file, None, None)
                 self._error_log.connect()
                 parser_ctxt = xmlschema.xmlSchemaNewDocParserCtxt(doc._c_doc)
         else:


More information about the lxml-checkins mailing list