[Lxml-checkins] r52960 - in lxml/trunk: . src/lxml/html src/lxml/html/tests
ianb at codespeak.net
ianb at codespeak.net
Wed Mar 26 17:54:49 CET 2008
Author: ianb
Date: Wed Mar 26 17:54:46 2008
New Revision: 52960
Modified:
lxml/trunk/CHANGES.txt
lxml/trunk/src/lxml/html/diff.py
lxml/trunk/src/lxml/html/tests/test_diff.txt
Log:
Fix handling of empty tags (e.g., <br>) in lxml.html.diff.
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Wed Mar 26 17:54:46 2008
@@ -29,6 +29,9 @@
* Default encoding for plain text serialisation was different from
that of XML serialisation (UTF-8 instead of ASCII).
+* ``lxml.html.diff`` didn't treat empty tags properly (e.g.,
+ ``<br>``).
+
Other changes
-------------
Modified: lxml/trunk/src/lxml/html/diff.py
==============================================================================
--- lxml/trunk/src/lxml/html/diff.py (original)
+++ lxml/trunk/src/lxml/html/diff.py Wed Mar 26 17:54:46 2008
@@ -139,6 +139,8 @@
############################################################
def htmldiff(old_html, new_html):
+ ## FIXME: this should take parsed documents too, and use their body
+ ## or other content.
""" Do a diff of the old and new document. The documents are HTML
*fragments* (str/UTF8 or unicode), they are not complete documents
(i.e., no <html> tag).
@@ -310,8 +312,6 @@
endtag = chunk[1] == '/'
name = chunk.split()[0].strip('<>/')
if name in empty_tags:
- assert not endtag, (
- "Empty tag %r should have no end tag" % chunk)
balanced.append(chunk)
continue
if endtag:
@@ -669,7 +669,7 @@
yield ('img', el.attrib['src'], start_tag(el))
else:
yield start_tag(el)
- if el.tag in empty_tags and not el.text and not len(el):
+ if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
return
start_words = split_words(el.text)
for word in start_words:
Modified: lxml/trunk/src/lxml/html/tests/test_diff.txt
==============================================================================
--- lxml/trunk/src/lxml/html/tests/test_diff.txt (original)
+++ lxml/trunk/src/lxml/html/tests/test_diff.txt Wed Mar 26 17:54:46 2008
@@ -66,6 +66,11 @@
>>> pdiff('<a href="http://yahoo.com">search</a>', '<a href="http://yahoo.com">search</a>')
<a href="http://yahoo.com">search</a>
+A test of empty elements:
+
+ >>> pdiff('some <br> text', 'some <br> test')
+ some <ins><br> test</ins> <del><br> text</del>
+
The sixteen combinations::
First "insert start" (del start/middle/end/none):
@@ -177,8 +182,8 @@
>>> panno('<p>Hi <img src="/foo"> You</p>',
... '<p>Hi You</p>',
... '<p>Hi You <img src="/bar"></p>')
- <p><span version="0">Hi</span> <span version="1">You</span> <span
- version="2"><img src="/bar"></span></p>
+ <p><span version="0">Hi You</span> <span version="2"><img
+ src="/bar"></span></p>
>>> panno('<p><a href="/foo">Hey</a></p>',
... '<p><a href="/bar">Hey</a></p>')
<p><a href="/bar"><span version="0">Hey</span></a></p>
More information about the lxml-checkins mailing list