[Lxml-checkins] r54365 - in lxml/trunk: . src/lxml/html src/lxml/html/tests
scoder at codespeak.net
scoder at codespeak.net
Sat May 3 15:42:27 CEST 2008
Author: scoder
Date: Sat May 3 15:42:27 2008
New Revision: 54365
Added:
lxml/trunk/src/lxml/html/tests/test_xhtml.py
lxml/trunk/src/lxml/html/tests/test_xhtml.txt
Modified:
lxml/trunk/ (props changed)
lxml/trunk/CHANGES.txt
lxml/trunk/src/lxml/html/__init__.py
lxml/trunk/src/lxml/html/tests/test_basic.txt
Log:
r4146 at delle: sbehnel | 2008-05-03 15:40:45 +0200
conversion functions HTML<->XHTML in lxml.html
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Sat May 3 15:42:27 2008
@@ -8,6 +8,9 @@
Features added
--------------
+* Conversion functions ``html_to_xhtml()`` and ``xhtml_to_html()`` in
+ lxml.html.
+
* Most features in lxml.html work for XHTML namespaced tag names.
Bugs fixed
Modified: lxml/trunk/src/lxml/html/__init__.py
==============================================================================
--- lxml/trunk/src/lxml/html/__init__.py (original)
+++ lxml/trunk/src/lxml/html/__init__.py Sat May 3 15:42:27 2008
@@ -1301,6 +1301,34 @@
## Serialization
############################################################
+def html_to_xhtml(html):
+ """Convert all tags in an HTML tree to XHTML by moving them to the
+ XHTML namespace.
+ """
+ try:
+ html = html.getroot()
+ except AttributeError:
+ pass
+ prefix = "{%s}" % XHTML_NAMESPACE
+ for el in html.iter():
+ tag = el.tag
+ if isinstance(tag, basestring):
+ if tag[0] != '{':
+ el.tag = prefix + tag
+
+def xhtml_to_html(xhtml):
+ """Convert all tags in an XHTML tree to HTML by removing their
+ XHTML namespace.
+ """
+ try:
+ xhtml = xhtml.getroot()
+ except AttributeError:
+ pass
+ prefix = "{%s}" % XHTML_NAMESPACE
+ prefix_len = len(prefix)
+ for el in xhtml.iter(prefix + "*"):
+ el.tag = el.tag[prefix_len:]
+
# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
__replace_meta_content_type = re.compile(
Modified: lxml/trunk/src/lxml/html/tests/test_basic.txt
==============================================================================
--- lxml/trunk/src/lxml/html/tests/test_basic.txt (original)
+++ lxml/trunk/src/lxml/html/tests/test_basic.txt Sat May 3 15:42:27 2008
@@ -96,16 +96,3 @@
<div>footer</div>
</body>
</html>
-
-lxml.html has two parsers, one for HTML, one for XHTML:
-
- >>> from lxml.html import HTMLParser, XHTMLParser
- >>> html = "<html><body><p>Hi!</p></body></html>"
-
- >>> root = document_fromstring(html, parser=HTMLParser())
- >>> print root.tag
- html
-
- >>> root = document_fromstring(html, parser=XHTMLParser())
- >>> print root.tag
- html
Added: lxml/trunk/src/lxml/html/tests/test_xhtml.py
==============================================================================
--- (empty file)
+++ lxml/trunk/src/lxml/html/tests/test_xhtml.py Sat May 3 15:42:27 2008
@@ -0,0 +1,11 @@
+import unittest, sys
+from lxml.tests.common_imports import doctest
+import lxml.html
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([doctest.DocFileSuite('test_xhtml.txt')])
+ return suite
+
+if __name__ == '__main__':
+ unittest.main()
Added: lxml/trunk/src/lxml/html/tests/test_xhtml.txt
==============================================================================
--- (empty file)
+++ lxml/trunk/src/lxml/html/tests/test_xhtml.txt Sat May 3 15:42:27 2008
@@ -0,0 +1,30 @@
+ >>> from lxml.html import document_fromstring, fragment_fromstring, tostring
+
+lxml.html has two parsers, one for HTML, one for XHTML:
+
+ >>> from lxml.html import HTMLParser, XHTMLParser
+ >>> html = "<html><body><p>Hi!</p></body></html>"
+
+ >>> root = document_fromstring(html, parser=HTMLParser())
+ >>> print root.tag
+ html
+
+ >>> root = document_fromstring(html, parser=XHTMLParser())
+ >>> print root.tag
+ html
+
+There are two functions for converting between HTML and XHTML:
+
+ >>> from lxml.html import xhtml_to_html, html_to_xhtml
+
+ >>> doc = document_fromstring(html, parser=HTMLParser())
+ >>> print tostring(doc)
+ <html><body><p>Hi!</p></body></html>
+
+ >>> html_to_xhtml(doc)
+ >>> print tostring(doc)
+ <html:html xmlns:html="http://www.w3.org/1999/xhtml"><html:body><html:p>Hi!</html:p></html:body></html:html>
+
+ >>> xhtml_to_html(doc)
+ >>> print tostring(doc)
+ <html xmlns:html="http://www.w3.org/1999/xhtml"><body><p>Hi!</p></body></html>
More information about the lxml-checkins mailing list