[wwwsearch-commits] r33200 - wwwsearch/ClientForm/trunk

jjlee at codespeak.net jjlee at codespeak.net
Thu Oct 12 00:23:36 CEST 2006


Author: jjlee
Date: Thu Oct 12 00:23:34 2006
New Revision: 33200

Modified:
   wwwsearch/ClientForm/trunk/ClientForm.py
   wwwsearch/ClientForm/trunk/test.py
Log:
Handle line endings in element content the same way browsers do. Convert TEXTAREA content to the DOS line-ending convention, again following the major browsers (they possibly also do this line-ending normalization in other cases; I haven't checked).

Modified: wwwsearch/ClientForm/trunk/ClientForm.py
==============================================================================
--- wwwsearch/ClientForm/trunk/ClientForm.py	(original)
+++ wwwsearch/ClientForm/trunk/ClientForm.py	Thu Oct 12 00:23:34 2006
@@ -126,6 +126,10 @@
 _compress_re = re.compile(r"\s+")
 def compress_text(text): return _compress_re.sub(" ", text.strip())
 
+def normalize_line_endings(text):
+    return re.sub(r"(?:(?<!\r)\n)|(?:\r(?!\n))", "\r\n", text)
+
+
 # This version of urlencode is from my Python 1.5.2 back-port of the
 # Python 2.1 CVS maintenance branch of urllib.  It will accept a sequence
 # of pairs instead of a mapping -- the 2.0 version only accepts a mapping.
@@ -639,6 +643,16 @@
 
     def handle_data(self, data):
         debug("%s", data)
+
+        # According to http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.1,
+        # a line break immediately after a start tag or immediately before an
+        # end tag must be ignored, but real browsers only ignore a line break
+        # after a start tag, so we do the same.
+        if data[0:2] == "\r\n":
+            data = data[2:]
+        if data[0:1] in ["\n", "\r"]:
+            data = data[1:]
+
         if self._option is not None:
             # self._option is a dictionary of the OPTION element's HTML
             # attributes, but it has two special keys, one of which is the
@@ -649,6 +663,7 @@
         elif self._textarea is not None:
             map = self._textarea
             key = "value"
+            data = normalize_line_endings(data)
         # not if within option or textarea
         elif self._current_label is not None:
             map = self._current_label

Modified: wwwsearch/ClientForm/trunk/test.py
==============================================================================
--- wwwsearch/ClientForm/trunk/test.py	(original)
+++ wwwsearch/ClientForm/trunk/test.py	Thu Oct 12 00:23:34 2006
@@ -288,7 +288,7 @@
         self.assertEqual(form.action, "http://localhost/abc&amp;"+u"\u2014".encode('utf8')+"d")
         control = form.find_control(type="textarea", nr=0)
         self.assert_(control.name is None)
-        self.assert_(control.value == "blah, blah,\nRhubarb.\n\n")
+        self.assert_(control.value == "blah, blah,\r\nRhubarb.\r\n\r\n")
 
         empty_control = form.find_control(type="textarea", nr=1)
         self.assert_(str(empty_control) == "<TextareaControl(<None>=)>")
@@ -621,6 +621,17 @@
         single_control = form.find_control(type="select", nr=1)
         self.assert_(single_control.value == ["1"])
 
+    def test_close_base_tag(self):
+        # Benji York: a single newline immediately after a start tag is
+        # stripped by browsers, but not one immediately before an end tag.
+        # TEXTAREA content is converted to the DOS newline convention.
+        forms = ClientForm.ParseFile(
+            StringIO("<form><textarea>\n\nblah\n</textarea></form>"),
+            "http://example.com/",
+            )
+        ctl = forms[0].find_control(type="textarea")
+        self.assertEqual(ctl.value, "\r\nblah\r\n")
+
 
 class DisabledTests(TestCase):
     def testOptgroup(self):
@@ -3054,6 +3065,32 @@
             self.assertEqual(req.ah, not auh)
 
 
+class FunctionTests(TestCase):
+
+    def test_normalize_line_endings(self):
+        def check(text, expected):
+            got = ClientForm.normalize_line_endings(text)
+            self.assertEqual(got, expected)
+
+        # unix
+        check("foo\nbar", "foo\r\nbar")
+        check("foo\nbar\n", "foo\r\nbar\r\n")
+        # mac
+        check("foo\rbar", "foo\r\nbar")
+        check("foo\rbar\r", "foo\r\nbar\r\n")
+        # dos
+        check("foo\r\nbar", "foo\r\nbar")
+        check("foo\r\nbar\r\n", "foo\r\nbar\r\n")
+
+        # inconsistent -- we just blithely convert anything that looks like a
+        # line ending to the DOS convention, following Firefox's behaviour when
+        # normalizing textarea content
+        check("foo\r\nbar\nbaz\rblah\r\n", "foo\r\nbar\r\nbaz\r\nblah\r\n")
+
+        # pathological ;-O
+        check("\r\n\n\r\r\r\n", "\r\n"*5)
+
+
 def startswith(string, initial):
     if len(initial) > len(string): return False
     return string[:len(initial)] == initial


More information about the wwwsearch-commits mailing list