[Lxml-checkins] r51739 - in lxml/trunk: . src/lxml src/lxml/html src/lxml/tests
scoder at codespeak.net
scoder at codespeak.net
Thu Feb 21 17:19:37 CET 2008
Author: scoder
Date: Thu Feb 21 17:19:36 2008
New Revision: 51739
Modified:
lxml/trunk/ (props changed)
lxml/trunk/CHANGES.txt
lxml/trunk/TODO.txt
lxml/trunk/src/lxml/docloader.pxi
lxml/trunk/src/lxml/html/__init__.py
lxml/trunk/src/lxml/lxml.etree.pyx
lxml/trunk/src/lxml/parser.pxi
lxml/trunk/src/lxml/relaxng.pxi
lxml/trunk/src/lxml/tests/test_etree.py
lxml/trunk/src/lxml/xmlid.pxi
lxml/trunk/src/lxml/xmlschema.pxi
Log:
r3549 at delle: sbehnel | 2008-02-19 12:35:22 +0100
support overriding base_url in parse()
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Thu Feb 21 17:19:36 2008
@@ -8,6 +8,9 @@
Features added
--------------
+* Support passing ``base_url`` to file parser functions to override
+ the filename of the file(-like) object.
+
Bugs fixed
----------
Modified: lxml/trunk/TODO.txt
==============================================================================
--- lxml/trunk/TODO.txt (original)
+++ lxml/trunk/TODO.txt Thu Feb 21 17:19:36 2008
@@ -21,6 +21,9 @@
* better exception messages for XPath and schemas based on error log,
e.g. missing namespace mappings in XPath
+* more testing on input/output of encoded filenames, including custom
+ resolvers, relative XSLT imports, ...
+
QName
-----
Modified: lxml/trunk/src/lxml/docloader.pxi
==============================================================================
--- lxml/trunk/src/lxml/docloader.pxi (original)
+++ lxml/trunk/src/lxml/docloader.pxi Thu Feb 21 17:19:36 2008
@@ -67,8 +67,8 @@
doc_ref._filename = _encodeFilename(filename)
return doc_ref
- def resolve_file(self, f, context):
- """resolve_file(self, f, context)
+ def resolve_file(self, f, context, *, base_url=None):
+ """resolve_file(self, f, context, base_url=None)
Return an open file-like object as input document.
@@ -81,7 +81,10 @@
raise TypeError("Argument is not a file-like object")
doc_ref = _InputDocument()
doc_ref._type = PARSER_DATA_FILE
- doc_ref._filename = _getFilenameForFile(f)
+ if base_url is not None:
+ doc_ref._filename = _encodeFilename(base_url)
+ else:
+ doc_ref._filename = _getFilenameForFile(f)
doc_ref._file = f
return doc_ref
Modified: lxml/trunk/src/lxml/html/__init__.py
==============================================================================
--- lxml/trunk/src/lxml/html/__init__.py (original)
+++ lxml/trunk/src/lxml/html/__init__.py Thu Feb 21 17:19:36 2008
@@ -569,20 +569,18 @@
body.tag = 'span'
return body
-def parse(filename_or_url, parser=None, **kw):
+def parse(filename_or_url, parser=None, base_url=None, **kw):
"""
Parse a filename, URL, or file-like object into an HTML document
tree. Note: this returns a tree, not an element. Use
``parse(...).getroot()`` to get the document root.
- You cannot give a base_url, but the filename/url will serve as
- that URL. If you pass in a file-like object and that object has a
- ``.geturl()`` method then that will be used as the base_url
- (``urllib.urlopen()`` returns file-like objects with this method).
+ You can override the base URL with the ``base_url`` keyword. This
+ is most useful when parsing from a file-like object.
"""
if parser is None:
parser = html_parser
- return etree.parse(filename_or_url, parser, **kw)
+ return etree.parse(filename_or_url, parser, base_url=base_url, **kw)
def _contains_block_level_tag(el):
# FIXME: I could do this with XPath, but would that just be
Modified: lxml/trunk/src/lxml/lxml.etree.pyx
==============================================================================
--- lxml/trunk/src/lxml/lxml.etree.pyx (original)
+++ lxml/trunk/src/lxml/lxml.etree.pyx Thu Feb 21 17:19:36 2008
@@ -1488,13 +1488,13 @@
assert self._context_node is not None, \
"ElementTree not initialized, missing root"
- def parse(self, source, _BaseParser parser=None):
- """parse(self, source, parser=None)
+ def parse(self, source, _BaseParser parser=None, *, base_url=None):
+ """parse(self, source, parser=None, base_url=None)
Updates self with the content of source and returns its root
"""
cdef _Document doc
- doc = _parseDocument(source, parser)
+ doc = _parseDocument(source, parser, base_url)
self._context_node = doc.getroot()
if self._context_node is None:
self._doc = doc
@@ -2300,7 +2300,7 @@
doc = element._doc
elif file is not None:
try:
- doc = _parseDocument(file, parser)
+ doc = _parseDocument(file, parser, None)
except _TargetParserResult, result_container:
return result_container.result
else:
@@ -2504,15 +2504,19 @@
raise TypeError("Type '%s' cannot be serialized." %
type(element_or_tree))
-def parse(source, _BaseParser parser=None):
- """parse(source, parser=None)
+def parse(source, _BaseParser parser=None, *, base_url=None):
+ """parse(source, parser=None, base_url=None)
Return an ElementTree object loaded with source elements. If no parser
is provided as second argument, the default parser is used.
+
+ The ``base_url`` keyword allows setting a URL for the document
+ when parsing from a file-like object. This is needed when looking
+ up external entities (DTD, XInclude, ...) with relative paths.
"""
cdef _Document doc
try:
- doc = _parseDocument(source, parser)
+ doc = _parseDocument(source, parser, base_url)
return _elementTreeFactory(doc, None)
except _TargetParserResult, result_container:
return result_container.result
Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi (original)
+++ lxml/trunk/src/lxml/parser.pxi Thu Feb 21 17:19:36 2008
@@ -1735,24 +1735,33 @@
## (here we convert to UTF-8)
############################################################
-cdef _Document _parseDocument(source, _BaseParser parser):
- filename = _getFilenameForFile(source)
+cdef _Document _parseDocument(source, _BaseParser parser, base_url):
+ cdef _Document doc
+ if base_url is not None:
+ url = base_url
+ else:
+ url = _getFilenameForFile(source)
if hasattr(source, 'getvalue') and hasattr(source, 'tell'):
# StringIO - reading from start?
if source.tell() == 0:
return _parseMemoryDocument(
- source.getvalue(), _encodeFilenameUTF8(filename), parser)
+ source.getvalue(), _encodeFilenameUTF8(url), parser)
# Support for file-like objects (urlgrabber.urlopen, ...)
if hasattr(source, 'read'):
return _parseFilelikeDocument(
- source, _encodeFilenameUTF8(filename), parser)
+ source, _encodeFilenameUTF8(url), parser)
# Otherwise parse the file directly from the filesystem
- if filename is None:
- filename = _encodeFilename(source)
- # open filename
- return _parseDocumentFromURL(filename, parser)
+ filename = _encodeFilename(source)
+ doc = _parseDocumentFromURL(filename, parser)
+ # fix base URL if requested
+ if base_url is not None:
+ base_url = _encodeFilenameUTF8(base_url)
+ if doc._c_doc.URL is not NULL:
+ tree.xmlFree(doc._c_doc.URL)
+ doc._c_doc.URL = tree.xmlStrdup(_cstr(base_url))
+ return doc
cdef _Document _parseDocumentFromURL(url, _BaseParser parser):
cdef xmlDoc* c_doc
Modified: lxml/trunk/src/lxml/relaxng.pxi
==============================================================================
--- lxml/trunk/src/lxml/relaxng.pxi (original)
+++ lxml/trunk/src/lxml/relaxng.pxi Thu Feb 21 17:19:36 2008
@@ -57,7 +57,7 @@
self._error_log.connect()
parser_ctxt = relaxng.xmlRelaxNGNewParserCtxt(_cstr(filename))
else:
- doc = _parseDocument(file, None)
+ doc = _parseDocument(file, None, None)
self._error_log.connect()
parser_ctxt = relaxng.xmlRelaxNGNewDocParserCtxt(doc._c_doc)
else:
Modified: lxml/trunk/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_etree.py (original)
+++ lxml/trunk/src/lxml/tests/test_etree.py Thu Feb 21 17:19:36 2008
@@ -1769,6 +1769,19 @@
docinfo = root.getroottree().docinfo
self.assertEquals(docinfo.URL, "http://no/such/url")
+ def test_parse_stringio_base_url(self):
+ etree = self.etree
+ tree = etree.parse(StringIO("<root/>"), base_url="http://no/such/url")
+ docinfo = tree.docinfo
+ self.assertEquals(docinfo.URL, "http://no/such/url")
+
+ def test_parse_base_url_docinfo(self):
+ etree = self.etree
+ tree = etree.parse(fileInTestDir('include/test_xinclude.xml'),
+ base_url="http://no/such/url")
+ docinfo = tree.docinfo
+ self.assertEquals(docinfo.URL, "http://no/such/url")
+
def test_HTML_base_url_docinfo(self):
etree = self.etree
root = etree.HTML("<html/>", base_url="http://no/such/url")
Modified: lxml/trunk/src/lxml/xmlid.pxi
==============================================================================
--- lxml/trunk/src/lxml/xmlid.pxi (original)
+++ lxml/trunk/src/lxml/xmlid.pxi Thu Feb 21 17:19:36 2008
@@ -40,7 +40,7 @@
else:
return (root, _IDDict(root))
-def parseid(source, parser=None):
+def parseid(source, parser=None, *, base_url=None):
"""parseid(source, parser=None)
Parses the source into a tuple containing an ElementTree object and an
@@ -51,7 +51,7 @@
The results are undefined.
"""
cdef _Document doc
- doc = _parseDocument(source, parser)
+ doc = _parseDocument(source, parser, base_url)
return (_elementTreeFactory(doc, None), _IDDict(doc))
cdef class _IDDict:
Modified: lxml/trunk/src/lxml/xmlschema.pxi
==============================================================================
--- lxml/trunk/src/lxml/xmlschema.pxi (original)
+++ lxml/trunk/src/lxml/xmlschema.pxi Thu Feb 21 17:19:36 2008
@@ -58,7 +58,7 @@
self._error_log.connect()
parser_ctxt = xmlschema.xmlSchemaNewParserCtxt(_cstr(filename))
else:
- doc = _parseDocument(file, None)
+ doc = _parseDocument(file, None, None)
self._error_log.connect()
parser_ctxt = xmlschema.xmlSchemaNewDocParserCtxt(doc._c_doc)
else:
More information about the lxml-checkins mailing list