HelloHi all'
-all you need to do is pass it to the `parse()` function:
+all you need to do is pass it to the ``fromstring()`` function:
.. sourcecode:: pycon
- >>> from lxml.html.ElementSoup import parse
- >>> from StringIO import StringIO
- >>> root = parse(StringIO(tag_soup))
+ >>> from lxml.html.soupparser import fromstring
+ >>> root = fromstring(tag_soup)
To see what we have here, you can serialise it:
@@ -49,5 +50,10 @@
already, right? BeautifulSoup did its best, and so now it's a tree.
To control which Element implementation is used, you can pass a
-``makeelement`` factory function to ``parse()``. By default, this is based on
-the HTML parser defined in ``lxml.html``.
+``makeelement`` factory function to ``parse()`` and ``fromstring()``.
+By default, this is based on the HTML parser defined in ``lxml.html``.
+
+There is also a legacy module called ``ElementSoup``, which mimics the
+interface provided by ElementTree's own ElementSoup_ module.
+
+.. _ElementSoup: http://effbot.org/zone/element-soup.htm
Deleted: /lxml/trunk/src/lxml/html/ElementSoup.py
==============================================================================
--- /lxml/trunk/src/lxml/html/ElementSoup.py Sun Mar 16 11:50:28 2008
+++ (empty file)
@@ -1,94 +0,0 @@
-__doc__ = """External interface to the BeautifulSoup HTML parser.
-"""
-
-__all__ = ["parse", "convert_tree"]
-
-from lxml import etree, html
-from BeautifulSoup import \
- BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString
-
-
-def parse(file, beautifulsoup=None, makeelement=None):
- if beautifulsoup is None:
- beautifulsoup = BeautifulSoup
- if makeelement is None:
- makeelement = html.html_parser.makeelement
- if not hasattr(file, 'read'):
- file = open(file)
- tree = beautifulsoup(file)
- root = _convert_tree(tree, makeelement)
- # from ET: wrap the document in a html root element, if necessary
- if len(root) == 1 and root[0].tag == "html":
- return root[0]
- root.tag = "html"
- return root
-
-def convert_tree(beautiful_soup_tree, makeelement=None):
- if makeelement is None:
- makeelement = html.html_parser.makeelement
- root = _convert_tree(beautiful_soup_tree, makeelement)
- children = root.getchildren()
- for child in children:
- root.remove(child)
- return children
-
-
-# helpers
-
-def _convert_tree(beautiful_soup_tree, makeelement):
- root = makeelement(beautiful_soup_tree.name,
- attrib=dict(beautiful_soup_tree.attrs))
- _convert_children(root, beautiful_soup_tree, makeelement)
- return root
-
-def _convert_children(parent, beautiful_soup_tree, makeelement):
- SubElement = etree.SubElement
- et_child = None
- for child in beautiful_soup_tree:
- if isinstance(child, Tag):
- et_child = SubElement(parent, child.name, attrib=dict(
- [(k, unescape(v)) for (k,v) in child.attrs]))
- _convert_children(et_child, child, makeelement)
- elif type(child) is NavigableString:
- _append_text(parent, et_child, unescape(unicode(child)))
- else:
- if isinstance(child, Comment):
- parent.append(etree.Comment(child.string))
- elif isinstance(child, ProcessingInstruction):
- parent.append(etree.ProcessingInstruction(
- *child.string.split(' ', 1)))
- else: # CData
- _append_text(parent, et_child, unescape(unicode(child)))
-
-def _append_text(parent, element, text):
- if element is None:
- parent.text = (parent.text or '') + text
- else:
- element.tail = (element.tail or '') + text
-
-
-# copied from ET's ElementSoup
-
-import htmlentitydefs, re
-
-handle_entities = re.compile("&(\w+);").sub
-
-try:
- name2codepoint = htmlentitydefs.name2codepoint
-except AttributeError:
- # Emulate name2codepoint for Python 2.2 and earlier
- name2codepoint = {}
- for name, entity in htmlentitydefs.entitydefs.items():
- if len(entity) == 1:
- name2codepoint[name] = ord(entity)
- else:
- name2codepoint[name] = int(entity[2:-1])
-
-def unescape(string):
- # work around oddities in BeautifulSoup's entity handling
- def unescape_entity(m):
- try:
- return unichr(name2codepoint[m.group(1)])
- except KeyError:
- return m.group(0) # use as is
- return handle_entities(unescape_entity, string)
Copied: lxml/trunk/src/lxml/html/soupparser.py (from r51012, lxml/trunk/src/lxml/html/ElementSoup.py)
==============================================================================
--- lxml/trunk/src/lxml/html/ElementSoup.py (original)
+++ lxml/trunk/src/lxml/html/soupparser.py Sun Mar 16 11:50:28 2008
@@ -1,29 +1,50 @@
__doc__ = """External interface to the BeautifulSoup HTML parser.
"""
-__all__ = ["parse", "convert_tree"]
+__all__ = ["fromstring", "parse", "convert_tree"]
from lxml import etree, html
from BeautifulSoup import \
BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString
+def fromstring(data, beautifulsoup=None, makeelement=None):
+ """Parse a string of HTML data into an Element tree using the
+ BeautifulSoup parser.
+
+ Returns the root ``<html>`` Element of the tree.
+
+ You can pass a different BeautifulSoup parser through the
+ `beautifulsoup` keyword, and a different Element factory function
+ through the `makeelement` keyword. By default, the standard
+ ``BeautifulSoup`` class and the default factory of `lxml.html` are
+ used.
+ """
+ return _parse(data, beautifulsoup, makeelement)
+
def parse(file, beautifulsoup=None, makeelement=None):
- if beautifulsoup is None:
- beautifulsoup = BeautifulSoup
- if makeelement is None:
- makeelement = html.html_parser.makeelement
+ """Parse a file into an ElementTree using the BeautifulSoup parser.
+
+ You can pass a different BeautifulSoup parser through the
+ `beautifulsoup` keyword, and a different Element factory function
+ through the `makeelement` keyword. By default, the standard
+ ``BeautifulSoup`` class and the default factory of `lxml.html` are
+ used.
+ """
if not hasattr(file, 'read'):
file = open(file)
- tree = beautifulsoup(file)
- root = _convert_tree(tree, makeelement)
- # from ET: wrap the document in a html root element, if necessary
- if len(root) == 1 and root[0].tag == "html":
- return root[0]
- root.tag = "html"
- return root
+ root = _parse(file, beautifulsoup, makeelement)
+ return etree.ElementTree(root)
def convert_tree(beautiful_soup_tree, makeelement=None):
+ """Convert a BeautifulSoup tree to a list of Element trees.
+
+ Returns a list instead of a single root Element to support
+ HTML-like soup with more than one root element.
+
+ You can pass a different Element factory through the `makeelement`
+ keyword.
+ """
if makeelement is None:
makeelement = html.html_parser.makeelement
root = _convert_tree(beautiful_soup_tree, makeelement)
@@ -35,6 +56,19 @@
# helpers
+def _parse(source, beautifulsoup, makeelement):
+ if beautifulsoup is None:
+ beautifulsoup = BeautifulSoup
+ if makeelement is None:
+ makeelement = html.html_parser.makeelement
+ tree = beautifulsoup(source)
+ root = _convert_tree(tree, makeelement)
+ # from ET: wrap the document in a html root element, if necessary
+ if len(root) == 1 and root[0].tag == "html":
+ return root[0]
+ root.tag = "html"
+ return root
+
def _convert_tree(beautiful_soup_tree, makeelement):
root = makeelement(beautiful_soup_tree.name,
attrib=dict(beautiful_soup_tree.attrs))
From scoder at codespeak.net Sun Mar 16 12:04:03 2008
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Sun, 16 Mar 2008 12:04:03 +0100 (CET)
Subject: [Lxml-checkins] r52578 - in lxml/branch/lxml-2.0: . doc
src/lxml/html
Message-ID: <20080316110403.0100C16A130@codespeak.net>
Author: scoder
Date: Sun Mar 16 12:04:03 2008
New Revision: 52578
Added:
lxml/branch/lxml-2.0/src/lxml/html/soupparser.py
- copied unchanged from r52576, lxml/trunk/src/lxml/html/soupparser.py
Removed:
lxml/branch/lxml-2.0/src/lxml/html/ElementSoup.py
Modified:
lxml/branch/lxml-2.0/CHANGES.txt
lxml/branch/lxml-2.0/doc/elementsoup.txt
Log:
merge -c 52576
Modified: lxml/branch/lxml-2.0/CHANGES.txt
==============================================================================
--- lxml/branch/lxml-2.0/CHANGES.txt (original)
+++ lxml/branch/lxml-2.0/CHANGES.txt Sun Mar 16 12:04:03 2008
@@ -17,6 +17,10 @@
Other changes
-------------
+* ``lxml.html.ElementSoup`` was replaced by a new module
+ ``lxml.html.soupparser`` with a more consistent API. The old module
+ remains for compatibility with ElementTree's own ElementSoup module.
+
* Setting the XSLT_CONFIG and XML2_CONFIG environment variables at
build time will let setup.py pick up the ``xml2-config`` and
``xslt-config`` scripts from the supplied path name.
Modified: lxml/branch/lxml-2.0/doc/elementsoup.txt
==============================================================================
--- lxml/branch/lxml-2.0/doc/elementsoup.txt (original)
+++ lxml/branch/lxml-2.0/doc/elementsoup.txt Sun Mar 16 12:04:03 2008
@@ -2,9 +2,6 @@
BeautifulSoup Parser
====================
-:Author:
- Stefan Behnel
-
BeautifulSoup_ is a Python package that parses broken HTML. While libxml2
(and thus lxml) can also parse broken HTML, BeautifulSoup is much more
forgiving and has superior `support for encoding detection`_.
@@ -12,22 +9,32 @@
.. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/
.. _`support for encoding detection`: http://www.crummy.com/software/BeautifulSoup/documentation.html#Beautiful%20Soup%20Gives%20You%20Unicode,%20Dammit
-lxml can benefit from the parsing capabilities of BeautifulSoup through the
-`lxml.html.ElementSoup` module. It provides two main functions: `parse()` to
-parse a file using BeautifulSoup, and `convert_tree()` to convert a
+lxml can benefit from the parsing capabilities of BeautifulSoup
+through the ``lxml.html.soupparser`` module. It provides three main
+functions: ``fromstring()`` and ``parse()`` to parse a string or file
+using BeautifulSoup, and `convert_tree()` to convert an existing
BeautifulSoup tree into a list of top-level Elements.
-Here is a document full of tag soup, similar to, but not quite like, HTML::
+The functions ``fromstring()`` and ``parse()`` behave as known from
+ElementTree. The first returns a root Element, the latter returns an
+ElementTree.
+
+Here is a document full of tag soup, similar to, but not quite like, HTML:
+
+.. sourcecode:: pycon
>>> tag_soup = '<meta><head><title>Hello</head<body onload=crash()>Hi all<p>'
-all you need to do is pass it to the `parse()` function::
+all you need to do is pass it to the ``fromstring()`` function:
+
+.. sourcecode:: pycon
- >>> from lxml.html.ElementSoup import parse
- >>> from StringIO import StringIO
- >>> root = parse(StringIO(tag_soup))
+ >>> from lxml.html.soupparser import fromstring
+ >>> root = fromstring(tag_soup)
-To see what we have here, you can serialise it::
+To see what we have here, you can serialise it:
+
+.. sourcecode:: pycon
>>> from lxml.etree import tostring
>>> print tostring(root, pretty_print=True),
@@ -43,5 +50,10 @@
already, right? BeautifulSoup did its best, and so now it's a tree.
To control which Element implementation is used, you can pass a
-``makeelement`` factory function to ``parse()``. By default, this is based on
-the HTML parser defined in ``lxml.html``.
+``makeelement`` factory function to ``parse()`` and ``fromstring()``.
+By default, this is based on the HTML parser defined in ``lxml.html``.
+
+There is also a legacy module called ``ElementSoup``, which mimics the
+interface provided by ElementTree's own ElementSoup_ module.
+
+.. _ElementSoup: http://effbot.org/zone/element-soup.htm
Deleted: /lxml/branch/lxml-2.0/src/lxml/html/ElementSoup.py
==============================================================================
--- /lxml/branch/lxml-2.0/src/lxml/html/ElementSoup.py Sun Mar 16 12:04:03 2008
+++ (empty file)
@@ -1,94 +0,0 @@
-__doc__ = """External interface to the BeautifulSoup HTML parser.
-"""
-
-__all__ = ["parse", "convert_tree"]
-
-from lxml import etree, html
-from BeautifulSoup import \
- BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString
-
-
-def parse(file, beautifulsoup=None, makeelement=None):
- if beautifulsoup is None:
- beautifulsoup = BeautifulSoup
- if makeelement is None:
- makeelement = html.html_parser.makeelement
- if not hasattr(file, 'read'):
- file = open(file)
- tree = beautifulsoup(file)
- root = _convert_tree(tree, makeelement)
- # from ET: wrap the document in a html root element, if necessary
- if len(root) == 1 and root[0].tag == "html":
- return root[0]
- root.tag = "html"
- return root
-
-def convert_tree(beautiful_soup_tree, makeelement=None):
- if makeelement is None:
- makeelement = html.html_parser.makeelement
- root = _convert_tree(beautiful_soup_tree, makeelement)
- children = root.getchildren()
- for child in children:
- root.remove(child)
- return children
-
-
-# helpers
-
-def _convert_tree(beautiful_soup_tree, makeelement):
- root = makeelement(beautiful_soup_tree.name,
- attrib=dict(beautiful_soup_tree.attrs))
- _convert_children(root, beautiful_soup_tree, makeelement)
- return root
-
-def _convert_children(parent, beautiful_soup_tree, makeelement):
- SubElement = etree.SubElement
- et_child = None
- for child in beautiful_soup_tree:
- if isinstance(child, Tag):
- et_child = SubElement(parent, child.name, attrib=dict(
- [(k, unescape(v)) for (k,v) in child.attrs]))
- _convert_children(et_child, child, makeelement)
- elif type(child) is NavigableString:
- _append_text(parent, et_child, unescape(unicode(child)))
- else:
- if isinstance(child, Comment):
- parent.append(etree.Comment(child.string))
- elif isinstance(child, ProcessingInstruction):
- parent.append(etree.ProcessingInstruction(
- *child.string.split(' ', 1)))
- else: # CData
- _append_text(parent, et_child, unescape(unicode(child)))
-
-def _append_text(parent, element, text):
- if element is None:
- parent.text = (parent.text or '') + text
- else:
- element.tail = (element.tail or '') + text
-
-
-# copied from ET's ElementSoup
-
-import htmlentitydefs, re
-
-handle_entities = re.compile("&(\w+);").sub
-
-try:
- name2codepoint = htmlentitydefs.name2codepoint
-except AttributeError:
- # Emulate name2codepoint for Python 2.2 and earlier
- name2codepoint = {}
- for name, entity in htmlentitydefs.entitydefs.items():
- if len(entity) == 1:
- name2codepoint[name] = ord(entity)
- else:
- name2codepoint[name] = int(entity[2:-1])
-
-def unescape(string):
- # work around oddities in BeautifulSoup's entity handling
- def unescape_entity(m):
- try:
- return unichr(name2codepoint[m.group(1)])
- except KeyError:
- return m.group(0) # use as is
- return handle_entities(unescape_entity, string)
From scoder at codespeak.net Sun Mar 16 12:05:13 2008
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Sun, 16 Mar 2008 12:05:13 +0100 (CET)
Subject: [Lxml-checkins] r52579 - in lxml/trunk: . src/lxml/html
Message-ID: <20080316110513.0164416A130@codespeak.net>
Author: scoder
Date: Sun Mar 16 12:05:13 2008
New Revision: 52579
Added:
lxml/trunk/src/lxml/html/ElementSoup.py
Modified:
lxml/trunk/ (props changed)
Log:
r3793 at delle: sbehnel | 2008-03-16 12:04:17 +0100
missing module
Added: lxml/trunk/src/lxml/html/ElementSoup.py
==============================================================================
--- (empty file)
+++ lxml/trunk/src/lxml/html/ElementSoup.py Sun Mar 16 12:05:13 2008
@@ -0,0 +1,10 @@
+__doc__ = """Legacy interface to the BeautifulSoup HTML parser.
+"""
+
+__all__ = ["parse", "convert_tree"]
+
+from soupparser import convert_tree, parse as _parse
+
+def parse(file, beautifulsoup=None, makeelement=None):
+ root = _parse(file, beautifulsoup=beautifulsoup, makeelement=makeelement)
+ return root.getroot()
From scoder at codespeak.net Sun Mar 16 12:06:07 2008
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Sun, 16 Mar 2008 12:06:07 +0100 (CET)
Subject: [Lxml-checkins] r52580 - lxml/branch/lxml-2.0/src/lxml/html
Message-ID: <20080316110607.21F8116A130@codespeak.net>
Author: scoder
Date: Sun Mar 16 12:06:06 2008
New Revision: 52580
Added:
lxml/branch/lxml-2.0/src/lxml/html/ElementSoup.py
- copied unchanged from r52579, lxml/trunk/src/lxml/html/ElementSoup.py
Log:
merge -c 52579
From scoder at codespeak.net Sun Mar 16 12:08:13 2008
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Sun, 16 Mar 2008 12:08:13 +0100 (CET)
Subject: [Lxml-checkins] r52581 - lxml/branch/lxml-2.0
Message-ID: <20080316110813.3D22116A130@codespeak.net>
Author: scoder
Date: Sun Mar 16 12:08:12 2008
New Revision: 52581
Modified:
lxml/branch/lxml-2.0/setupinfo.py
Log:
merge -c 52425
Modified: lxml/branch/lxml-2.0/setupinfo.py
==============================================================================
--- lxml/branch/lxml-2.0/setupinfo.py (original)
+++ lxml/branch/lxml-2.0/setupinfo.py Sun Mar 16 12:08:12 2008
@@ -30,8 +30,12 @@
modules = EXT_MODULES
lib_versions = get_library_versions()
- print("Using build configuration of libxml2 %s and libxslt %s" %
- lib_versions)
+ if lib_versions[0]:
+ print("Using build configuration of libxml2 %s and libxslt %s" %
+ lib_versions)
+ else:
+ print("Using build configuration of libxslt %s" %
+ lib_versions[1])
_include_dirs = include_dirs(static_include_dirs)
_library_dirs = library_dirs(static_library_dirs)
@@ -144,7 +148,11 @@
_ERROR_PRINTED = False
-def run_command(cmd):
+def run_command(cmd, *args):
+ if not cmd:
+ return ''
+ if args:
+ cmd = ' '.join((cmd,) + args)
try:
import subprocess
except ImportError:
@@ -165,17 +173,13 @@
return (output or '').strip()
def get_library_versions():
- cmd = "%s --version" % find_xml2_config()
- xml2_version = run_command(cmd)
- cmd = "%s --version" % find_xslt_config()
- xslt_version = run_command(cmd)
+ xml2_version = run_command(find_xml2_config(), "--version")
+ xslt_version = run_command(find_xslt_config(), "--version")
return xml2_version, xslt_version
def flags(option):
- cmd = "%s --%s" % (find_xml2_config(), option)
- xml2_flags = run_command(cmd)
- cmd = "%s --%s" % (find_xslt_config(), option)
- xslt_flags = run_command(cmd)
+ xml2_flags = run_command(find_xml2_config(), "--%s" % option)
+ xslt_flags = run_command(find_xslt_config(), "--%s" % option)
flag_list = xml2_flags.split()
for flag in xslt_flags.split():
@@ -197,7 +201,8 @@
XML2_CONFIG = arg[len(option):]
return XML2_CONFIG
else:
- XML2_CONFIG = os.getenv('XML2_CONFIG', 'xml2-config')
+ # default: do nothing, rely only on xslt-config
+ XML2_CONFIG = os.getenv('XML2_CONFIG', '')
return XML2_CONFIG
def find_xslt_config():
From scoder at codespeak.net Sun Mar 16 12:11:02 2008
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Sun, 16 Mar 2008 12:11:02 +0100 (CET)
Subject: [Lxml-checkins] r52582 - in lxml/branch/lxml-2.0: . doc
Message-ID: <20080316111102.1583E16A130@codespeak.net>
Author: scoder
Date: Sun Mar 16 12:11:01 2008
New Revision: 52582
Removed:
lxml/branch/lxml-2.0/doc/pyrex.txt
Modified:
lxml/branch/lxml-2.0/MANIFEST.in
Log:
merge -r 52573:52574
Modified: lxml/branch/lxml-2.0/MANIFEST.in
==============================================================================
--- lxml/branch/lxml-2.0/MANIFEST.in (original)
+++ lxml/branch/lxml-2.0/MANIFEST.in Sun Mar 16 12:11:01 2008
@@ -12,4 +12,3 @@
recursive-include doc *.txt *.html *.css *.xml *.mgp pubkey.asc tagpython.png
recursive-include fake_pyrex *.py
include doc/mkhtml.py doc/rest2html.py
-exclude doc/pyrex.txt
Deleted: /lxml/branch/lxml-2.0/doc/pyrex.txt
==============================================================================
--- /lxml/branch/lxml-2.0/doc/pyrex.txt Sun Mar 16 12:11:01 2008
+++ (empty file)
@@ -1,25 +0,0 @@
-Notes on Pyrex
-==============
-
-The lxml wrapper around libxml2 and libxslt is written in Pyrex_. However,
-there are known issues with the current version of Pyrex (0.9.3.1) and version
-4.x of gcc. Most Linux distributions have the necessary patches applied, but
-there is still a certain chance yours hasn't. Also, MacOS-X is known to ship
-with GCC 4, so users may run into problems when compiling Pyrex generated code
-on this system. If the C compiler fails to compile the file src/lxml/etree.c,
-you likely have used an unpatched version of Pyrex to build it.
-
-There are two ways to get around this problem. First of all, if you are using
-a release version of lxml, it should come with the generated C file in the
-source distribution. There is no need to regenerate it using Pyrex.
-
-However, if you want to use more recent SVN versions of lxml or want to work
-on the code, you will need Pyrex to regenerate the C-code. If your version of
-Pyrex is not patched, you may try to apply the patch that ships with lxml and
-is also part of the SVN checkouts. It should fix the remaining problems.
-Apply it to the 0.9.3.1 version of Pyrex, rebuild and install it. If the
-problems persist, please report to the lxml mailing list. Try to provide a
-clear description of what you did to run into the problems and provide the
-compiler output that shows the error.
-
-.. _Pyrex: http://www.cosc.canterbury.ac.nz/~greg/python/Pyrex/
From scoder at codespeak.net Sun Mar 16 13:53:25 2008
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Sun, 16 Mar 2008 13:53:25 +0100 (CET)
Subject: [Lxml-checkins] r52596 - in lxml/trunk: . src/lxml
Message-ID: <20080316125325.766C216A149@codespeak.net>
Author: scoder
Date: Sun Mar 16 13:53:25 2008
New Revision: 52596
Modified:
lxml/trunk/ (props changed)
lxml/trunk/src/lxml/parser.pxi
Log:
r3799 at delle: sbehnel | 2008-03-16 13:52:28 +0100
cleanup, raise a more specific type error on an unparsable source
Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi (original)
+++ lxml/trunk/src/lxml/parser.pxi Sun Mar 16 13:53:25 2008
@@ -1294,10 +1294,22 @@
cdef _Document _parseDocument(source, _BaseParser parser, base_url):
cdef _Document doc
+ if _isString(source):
+ # parse the file directly from the filesystem
+ doc = _parseDocumentFromURL(_encodeFilename(source), parser)
+ # fix base URL if requested
+ if base_url is not None:
+ base_url = _encodeFilenameUTF8(base_url)
+ if doc._c_doc.URL is not NULL:
+ tree.xmlFree(doc._c_doc.URL)
+ doc._c_doc.URL = tree.xmlStrdup(_cstr(base_url))
+ return doc
+
if base_url is not None:
url = base_url
else:
url = _getFilenameForFile(source)
+
if hasattr(source, 'getvalue') and hasattr(source, 'tell'):
# StringIO - reading from start?
if source.tell() == 0:
@@ -1309,16 +1321,7 @@
return _parseFilelikeDocument(
source, _encodeFilenameUTF8(url), parser)
- # Otherwise parse the file directly from the filesystem
- filename = _encodeFilename(source)
- doc = _parseDocumentFromURL(filename, parser)
- # fix base URL if requested
- if base_url is not None:
- base_url = _encodeFilenameUTF8(base_url)
- if doc._c_doc.URL is not NULL:
- tree.xmlFree(doc._c_doc.URL)
- doc._c_doc.URL = tree.xmlStrdup(_cstr(base_url))
- return doc
+ raise TypeError("cannot parse from '%s'" % python._fqtypename(source))
cdef _Document _parseDocumentFromURL(url, _BaseParser parser):
cdef xmlDoc* c_doc
From scoder at codespeak.net Sun Mar 16 13:54:08 2008
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Sun, 16 Mar 2008 13:54:08 +0100 (CET)
Subject: [Lxml-checkins] r52597 - lxml/branch/lxml-2.0/src/lxml
Message-ID: <20080316125408.7843616A149@codespeak.net>
Author: scoder
Date: Sun Mar 16 13:54:06 2008
New Revision: 52597
Modified:
lxml/branch/lxml-2.0/src/lxml/parser.pxi
Log:
merge -c 52596
Modified: lxml/branch/lxml-2.0/src/lxml/parser.pxi
==============================================================================
--- lxml/branch/lxml-2.0/src/lxml/parser.pxi (original)
+++ lxml/branch/lxml-2.0/src/lxml/parser.pxi Sun Mar 16 13:54:06 2008
@@ -1302,10 +1302,22 @@
cdef _Document _parseDocument(source, _BaseParser parser, base_url):
cdef _Document doc
+ if _isString(source):
+ # parse the file directly from the filesystem
+ doc = _parseDocumentFromURL(_encodeFilename(source), parser)
+ # fix base URL if requested
+ if base_url is not None:
+ base_url = _encodeFilenameUTF8(base_url)
+ if doc._c_doc.URL is not NULL:
+ tree.xmlFree(doc._c_doc.URL)
+ doc._c_doc.URL = tree.xmlStrdup(_cstr(base_url))
+ return doc
+
if base_url is not None:
url = base_url
else:
url = _getFilenameForFile(source)
+
if hasattr(source, 'getvalue') and hasattr(source, 'tell'):
# StringIO - reading from start?
if source.tell() == 0:
@@ -1317,16 +1329,7 @@
return _parseFilelikeDocument(
source, _encodeFilenameUTF8(url), parser)
- # Otherwise parse the file directly from the filesystem
- filename = _encodeFilename(source)
- doc = _parseDocumentFromURL(filename, parser)
- # fix base URL if requested
- if base_url is not None:
- base_url = _encodeFilenameUTF8(base_url)
- if doc._c_doc.URL is not NULL:
- tree.xmlFree(doc._c_doc.URL)
- doc._c_doc.URL = tree.xmlStrdup(_cstr(base_url))
- return doc
+ raise TypeError("cannot parse from '%s'" % python._fqtypename(source))
cdef _Document _parseDocumentFromURL(url, _BaseParser parser):
cdef xmlDoc* c_doc
From scoder at codespeak.net Mon Mar 17 13:43:48 2008
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Mon, 17 Mar 2008 13:43:48 +0100 (CET)
Subject: [Lxml-checkins] r52640 - in lxml/trunk: . src/lxml/tests
Message-ID: <20080317124348.2DFF4169E71@codespeak.net>
Author: scoder
Date: Mon Mar 17 13:43:47 2008
New Revision: 52640
Modified:
lxml/trunk/ (props changed)
lxml/trunk/src/lxml/tests/test_elementtree.py
Log:
r3802 at delle: sbehnel | 2008-03-17 13:42:51 +0100
error test for parse(None)
Modified: lxml/trunk/src/lxml/tests/test_elementtree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_elementtree.py (original)
+++ lxml/trunk/src/lxml/tests/test_elementtree.py Mon Mar 17 13:43:47 2008
@@ -2708,6 +2708,10 @@
parse = self.etree.parse
self.assertRaises(IOError, parse, fileInTestDir('notthere.xml'))
+ def test_parse_error_none(self):
+ parse = self.etree.parse
+ self.assertRaises(TypeError, parse, None)
+
def test_parse_error(self):
# ET < 1.3 raises ExpatError
parse = self.etree.parse
From scoder at codespeak.net Mon Mar 17 17:19:20 2008
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Mon, 17 Mar 2008 17:19:20 +0100 (CET)
Subject: [Lxml-checkins] r52646 - lxml/trunk
Message-ID: <20080317161920.CFE67169E19@codespeak.net>
Author: scoder
Date: Mon Mar 17 17:19:19 2008
New Revision: 52646
Modified:
lxml/trunk/ (props changed)
lxml/trunk/setupinfo.py
lxml/trunk/versioninfo.py
Log:
r3804 at delle: sbehnel | 2008-03-17 17:09:07 +0100
setupinfo.py fix for testing Cython version
Modified: lxml/trunk/setupinfo.py
==============================================================================
--- lxml/trunk/setupinfo.py (original)
+++ lxml/trunk/setupinfo.py Mon Mar 17 17:19:19 2008
@@ -1,6 +1,8 @@
import sys, os, os.path
from distutils.core import Extension
+from versioninfo import get_base_dir, split_version
+
try:
from Cython.Distutils import build_ext as build_pyx
import Cython.Compiler.Version
@@ -78,10 +80,11 @@
return result
def find_dependencies(module):
- if CYTHON_INSTALLED:
- from Cython.Compiler.Version import version
- if tuple(version.split('.')) <= (0,9,6,12):
- return []
+ if not CYTHON_INSTALLED:
+ return []
+ from Cython.Compiler.Version import version
+ if split_version(version) <= (0,9,6,12):
+ return []
package_dir = os.path.join(get_base_dir(), PACKAGE_PATH)
files = os.listdir(package_dir)
@@ -255,9 +258,6 @@
except ValueError:
return False
-def get_base_dir():
- return os.path.join(os.getcwd(), os.path.dirname(sys.argv[0]))
-
# pick up any commandline options
OPTION_WITHOUT_OBJECTIFY = has_option('without-objectify')
OPTION_WITHOUT_ASSERT = has_option('without-assert')
Modified: lxml/trunk/versioninfo.py
==============================================================================
--- lxml/trunk/versioninfo.py (original)
+++ lxml/trunk/versioninfo.py Mon Mar 17 17:19:19 2008
@@ -5,7 +5,7 @@
def version():
global __LXML_VERSION
if __LXML_VERSION is None:
- __LXML_VERSION = open(os.path.join(get_src_dir(), 'version.txt')).read().strip()
+ __LXML_VERSION = open(os.path.join(get_base_dir(), 'version.txt')).read().strip()
return __LXML_VERSION
def branch_version():
@@ -17,7 +17,7 @@
def svn_version():
_version = version()
- src_dir = get_src_dir()
+ src_dir = get_base_dir()
revision = 0
base_url = None
@@ -89,7 +89,7 @@
"""Extract part of changelog pertaining to version.
"""
_version = version()
- f = open(os.path.join(get_src_dir(), "CHANGES.txt"), 'r')
+ f = open(os.path.join(get_base_dir(), "CHANGES.txt"), 'r')
lines = []
for line in f:
if line.startswith('====='):
@@ -114,7 +114,7 @@
svn_version += '.0'
version_h = open(
- os.path.join(get_src_dir(), 'src', 'lxml', 'lxml-version.h'),
+ os.path.join(get_base_dir(), 'src', 'lxml', 'lxml-version.h'),
'w')
version_h.write('''\
#ifndef LXML_VERSION_STRING
@@ -123,10 +123,23 @@
''' % svn_version)
version_h.close()
-def get_src_dir():
+def get_base_dir():
return os.path.join(os.getcwd(), os.path.dirname(sys.argv[0]))
def fix_alphabeta(version, alphabeta):
if ('.' + alphabeta) in version:
return version
return version.replace(alphabeta, '.' + alphabeta)
+
+def split_version(version):
+ find_digits = re.compile('([0-9]+)(.*)').match
+ l = []
+ for part in version.split('.'):
+ try:
+ l.append( int(part) )
+ except ValueError:
+ match = find_digits(part)
+ if match:
+ l.append( int(match.group(1)) )
+ l.append( match.group(2) )
+ return tuple(l)
From scoder at codespeak.net Mon Mar 17 17:19:26 2008
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Mon, 17 Mar 2008 17:19:26 +0100 (CET)
Subject: [Lxml-checkins] r52647 - in lxml/trunk: . src/lxml
Message-ID: <20080317161926.5CDB4169E19@codespeak.net>
Author: scoder
Date: Mon Mar 17 17:19:25 2008
New Revision: 52647
Modified:
lxml/trunk/ (props changed)
lxml/trunk/src/lxml/apihelpers.pxi
Log:
r3805 at delle: sbehnel | 2008-03-17 17:18:23 +0100
replaced _getFilenameForFile() by an equivalent implementation that seems to work better with GTK's threading
Modified: lxml/trunk/src/lxml/apihelpers.pxi
==============================================================================
--- lxml/trunk/src/lxml/apihelpers.pxi (original)
+++ lxml/trunk/src/lxml/apihelpers.pxi Mon Mar 17 17:19:25 2008
@@ -1107,20 +1107,16 @@
Returns None if not a file object.
"""
# file instances have a name attribute
- try:
- return source.name
- except AttributeError:
- pass
+ filename = getattr3(source, 'name', None)
+ if filename is not None:
+ return filename
# gzip file instances have a filename attribute
- try:
- return source.filename
- except AttributeError:
- pass
+ filename = getattr3(source, 'filename', None)
+ if filename is not None:
+ return filename
# urllib2 provides a geturl() method
- try:
- geturl = source.geturl
- except AttributeError:
- # can't determine filename
- return None
- else:
+ geturl = getattr3(source, 'geturl', None)
+ if geturl is not None:
return geturl()
+ # can't determine filename
+ return None
From lxml-checkins at codespeak.net Mon Mar 17 19:10:01 2008
From: lxml-checkins at codespeak.net (lxml-checkins at codespeak.net)
Date: Mon, 17 Mar 2008 19:10:01 +0100 (CET)
Subject: [Lxml-checkins] Men's Health id 29161134
Message-ID: <20080318195901.4558.qmail@adsl-pool-222.123.22-143.tttmaxnet.com>
Canadian Doctor Latoya Saldana Best Price On Net March 87% OFF!
http://www.google.ws/pagead/iclk?sa=l&ai=rgqhe&num=71109&adurl=http://qwcf.changereach.com
From scoder at codespeak.net Wed Mar 19 07:44:45 2008
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Wed, 19 Mar 2008 07:44:45 +0100 (CET)
Subject: [Lxml-checkins] r52712 - in lxml/trunk: . doc src/lxml/html
Message-ID: <20080319064445.1C0FB169EA2@codespeak.net>
Author: scoder
Date: Wed Mar 19 07:44:44 2008
New Revision: 52712
Modified:
lxml/trunk/ (props changed)
lxml/trunk/CHANGES.txt
lxml/trunk/doc/elementsoup.txt
lxml/trunk/src/lxml/html/soupparser.py
Log:
r3808 at delle: sbehnel | 2008-03-18 22:17:23 +0100
entity replacement fixes for soupparser/ElementSoup, some cleanup, support for passing keyword arguments on to BS
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Wed Mar 19 07:44:44 2008
@@ -8,6 +8,11 @@
Features added
--------------
+* ElementSoup/soupparser.parse() allows passing keyword arguments on
+ to BeautifulSoup.
+
+* ``fromstring()`` method in ``lxml.html.soupparser``.
+
* ``XSLTAccessControl`` instances have a property ``options`` that
returns a dict of access configuration options.
@@ -24,6 +29,9 @@
Bugs fixed
----------
+* The BeautifulSoup parser did not replace entities, which made them
+ turn up in text content.
+
* Attribute assignment of custom PyTypes in objectify could fail to
correctly serialise the value to a string.
Modified: lxml/trunk/doc/elementsoup.txt
==============================================================================
--- lxml/trunk/doc/elementsoup.txt (original)
+++ lxml/trunk/doc/elementsoup.txt Wed Mar 19 07:44:44 2008
@@ -53,7 +53,46 @@
``makeelement`` factory function to ``parse()`` and ``fromstring()``.
By default, this is based on the HTML parser defined in ``lxml.html``.
-There is also a legacy module called ``ElementSoup``, which mimics the
-interface provided by ElementTree's own ElementSoup_ module.
+By default, the BeautifulSoup parser also replaces the entities it
+finds by their character equivalent.
+
+.. sourcecode:: pycon
+
+ >>> tag_soup = '<body>&copy;&euro;&#45;&#245;&#445;<p>'
+ >>> body = fromstring(tag_soup).find('.//body')
+ >>> body.text
+ u'\xa9\u20ac-\xf5\u01bd'
+
+If you want them back on the way out, you can serialise with the
+'html' method, which will always use escaping for safety reasons:
+
+.. sourcecode:: pycon
+
+ >>> tostring(body, method="html")
+ '
©€-õƽ'
+ >>> body = fromstring(tag_soup).find('.//body')
+ >>> body.text
+ u'\xa9\u20ac-\xf5\u01bd'
+
+If you want them back on the way out, you can serialise with the
+'html' method, which will always use escaping for safety reasons:
+
+.. sourcecode:: pycon
+
+ >>> tostring(body, method="html")
+ '
©€-õƽ