[Lxml-checkins] r43982 - in lxml/branch/html/src/lxml/html: . tests

ianb at codespeak.net ianb at codespeak.net
Fri Jun 1 22:40:07 CEST 2007


Author: ianb
Date: Fri Jun  1 22:40:06 2007
New Revision: 43982

Modified:
   lxml/branch/html/src/lxml/html/clean.py
   lxml/branch/html/src/lxml/html/tests/test_autolink.txt
Log:
Added long word breaking

Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py	(original)
+++ lxml/branch/html/src/lxml/html/clean.py	Fri Jun  1 22:40:06 2007
@@ -3,14 +3,14 @@
 from lxml.html import defs
 from lxml.html import parse, tostring
 
-__all__ = ['clean_html', 'clean', 'autolink', 'autolink_html']
+__all__ = ['clean_html', 'clean', 'autolink', 'autolink_html',
+           'word_break', 'word_break_html']
 
 # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
+#   Particularly the CSS cleaning; most of the tag cleaning is integrated now
 # I have multiple kinds of schemes searched; but should schemes be
 #   whitelisted instead?
-# max width for words (but not in pre or textarea)
 # max height?
-# autolink? (don't autolink in textarea, pre, code)
 # remove images?  Also in CSS?  background attribute?
 # Some way to whitelist object, iframe, etc (e.g., if you want to
 #   allow *just* embedded YouTube movies)
@@ -376,5 +376,82 @@
     autolink(doc, *args, **kw)
     return tostring(doc)
 
-            
-        
+_avoid_word_break_elements = ['pre', 'textarea', 'code']
+_avoid_word_break_classes = ['nobreak']
+
+def word_break(el, max_width=40,
+               avoid_elements=_avoid_word_break_elements,
+               avoid_classes=_avoid_word_break_classes,
+               break_character=u'\u200b'):
+    """
+    Breaks any long words found in the body of the text (not attributes).
+
+    Doesn't affect any of the tags in avoid_elements, by default
+    pre, textarea and code.
+
+    Breaks words by inserting &#8203;, which is the Unicode
+    Zero Width Space character.  This generally takes up no space
+    in rendering, but does copy as a space, and in monospace contexts
+    usually takes up space.
+
+    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
+    """
+    # Character suggestion of &#8203; comes from:
+    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
+    if el.tag in _avoid_word_break_elements:
+        return
+    class_name = el.attrib.get('class')
+    if class_name:
+        dont_break = False
+        class_name = class_name.split()
+        for avoid in avoid_classes:
+            if avoid in class_name:
+                dont_break = True
+                break
+        if dont_break:
+            return
+    if el.text:
+        el.text = _break_text(el.text, max_width, break_character)
+    for child in el:
+        word_break(child, max_width=max_width,
+                   avoid_elements=avoid_elements,
+                   avoid_classes=avoid_classes,
+                   break_character=break_character)
+        if child.tail:
+            child.tail = _break_text(child.tail, max_width, break_character)
+
+def word_break_html(html, *args, **kw):
+    doc = parse(html)
+    word_break(doc, *args, **kw)
+    return tostring(doc)
+
+def _break_text(text, max_width, break_character):
+    words = text.split()
+    for word in words:
+        if len(word) > max_width:
+            replacement = _insert_break(word, max_width, break_character)
+            text = text.replace(word, replacement)
+    return text
+
+_break_prefer_re = re.compile(r'[^a-z]', re.I)
+
+def _insert_break(word, width, break_character):
+    orig_word = word
+    result = ''
+    while len(word) > width:
+        start = word[:width]
+        breaks = list(_break_prefer_re.finditer(start))
+        if breaks:
+            last_break = breaks[-1]
+            # Only walk back up to 10 characters to find a nice break:
+            if last_break.end() > width-10:
+                # FIXME: should the break character be at the end of the
+                # chunk, or the beginning of the next chunk?
+                start = word[:last_break.end()]
+        result += start + break_character
+        word = word[len(start):]
+    result += word
+    return result
+                
+    
+    

Modified: lxml/branch/html/src/lxml/html/tests/test_autolink.txt
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_autolink.txt	(original)
+++ lxml/branch/html/src/lxml/html/tests/test_autolink.txt	Fri Jun  1 22:40:06 2007
@@ -35,3 +35,28 @@
     ... <span class="nolink">http://bar.com</span></div>''')
     <div>A link in <code>http://foo.com</code> or
     <span class="nolink">http://bar.com</span></div>
+
+There's also a word-breaking function, which should probably be run
+after autolink::
+
+    >>> from lxml.html.clean import word_break_html
+    >>> def pascii(s):
+    ...     print s.decode('utf8').encode('ascii', 'xmlcharrefreplace')
+    >>> pascii(word_break_html('''
+    ... <div>Hey you
+    ... 12345678901234567890123456789012345678901234567890</div>'''))
+    <div>Hey you
+    1234567890123456789012345678901234567890&#8203;1234567890</div>
+
+Text inside avoided elements (such as code) and attribute values is left unbroken:
+
+    >>> pascii(word_break_html('''
+    ... <div>Hey you
+    ... <code>12345678901234567890123456789012345678901234567890</code></div>'''))
+    <div>Hey you
+    <code>12345678901234567890123456789012345678901234567890</code></div>
+    >>> pascii(word_break_html('''
+    ... <a href="12345678901234567890123456789012345678901234567890">text</a>'''))
+    <a href="12345678901234567890123456789012345678901234567890">text</a>
+
+    
\ No newline at end of file


More information about the lxml-checkins mailing list