[Lxml-checkins] r43982 - in lxml/branch/html/src/lxml/html: . tests
ianb at codespeak.net
ianb at codespeak.net
Fri Jun 1 22:40:07 CEST 2007
Author: ianb
Date: Fri Jun 1 22:40:06 2007
New Revision: 43982
Modified:
lxml/branch/html/src/lxml/html/clean.py
lxml/branch/html/src/lxml/html/tests/test_autolink.txt
Log:
Added long word breaking
Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py (original)
+++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 22:40:06 2007
@@ -3,14 +3,14 @@
from lxml.html import defs
from lxml.html import parse, tostring
-__all__ = ['clean_html', 'clean', 'autolink', 'autolink_html']
+__all__ = ['clean_html', 'clean', 'autolink', 'autolink_html',
+ 'word_break', 'word_break_html']
# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
+# Particularly the CSS cleaning; most of the tag cleaning is integrated now
# I have multiple kinds of schemes searched; but should schemes be
# whitelisted instead?
-# max width for words (but not in pre or textarea)
# max height?
-# autolink? (don't autolink in textarea, pre, code)
# remove images? Also in CSS? background attribute?
# Some way to whitelist object, iframe, etc (e.g., if you want to
# allow *just* embedded YouTube movies)
@@ -376,5 +376,82 @@
autolink(doc, *args, **kw)
return tostring(doc)
-
-
+_avoid_word_break_elements = ['pre', 'textarea', 'code']
+_avoid_word_break_classes = ['nobreak']
+
+def word_break(el, max_width=40,
+ avoid_elements=_avoid_word_break_elements,
+ avoid_classes=_avoid_word_break_classes,
+ break_character=u'\u200b'):
+ """
+ Breaks any long words found in the body of the text (not attributes).
+
+ Doesn't affect any of the tags in avoid_elements, by default
+ pre, textarea and code
+
+ Breaks words by inserting &#8203;, which is a unicode character
+ for Zero Width Space character. This generally takes up no space
+ in rendering, but does copy as a space, and in monospace contexts
+ usually takes up space.
+
+ See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
+ """
+ # Character suggestion of &#8203; comes from:
+ # http://www.cs.tut.fi/~jkorpela/html/nobr.html
+ if el.tag in _avoid_word_break_elements:
+ return
+ class_name = el.attrib.get('class')
+ if class_name:
+ dont_break = False
+ class_name = class_name.split()
+ for avoid in avoid_classes:
+ if avoid in class_name:
+ dont_break = True
+ break
+ if dont_break:
+ return
+ if el.text:
+ el.text = _break_text(el.text, max_width, break_character)
+ for child in el:
+ word_break(child, max_width=max_width,
+ avoid_elements=avoid_elements,
+ avoid_classes=avoid_classes,
+ break_character=break_character)
+ if child.tail:
+ child.tail = _break_text(child.tail, max_width, break_character)
+
+def word_break_html(html, *args, **kw):
+ doc = parse(html)
+ word_break(doc, *args, **kw)
+ return tostring(doc)
+
+def _break_text(text, max_width, break_character):
+ words = text.split()
+ for word in words:
+ if len(word) > max_width:
+ replacement = _insert_break(word, max_width, break_character)
+ text = text.replace(word, replacement)
+ return text
+
+_break_prefer_re = re.compile(r'[^a-z]', re.I)
+
+def _insert_break(word, width, break_character):
+ orig_word = word
+ result = ''
+ while len(word) > width:
+ start = word[:width]
+ breaks = list(_break_prefer_re.finditer(start))
+ if breaks:
+ last_break = breaks[-1]
+ # Only walk back up to 10 characters to find a nice break:
+ if last_break.end() > width-10:
+ # FIXME: should the break character be at the end of the
+ # chunk, or the beginning of the next chunk?
+ start = word[:last_break.end()]
+ result += start + break_character
+ word = word[len(start):]
+ result += word
+ return result
+
+
+
Modified: lxml/branch/html/src/lxml/html/tests/test_autolink.txt
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_autolink.txt (original)
+++ lxml/branch/html/src/lxml/html/tests/test_autolink.txt Fri Jun 1 22:40:06 2007
@@ -35,3 +35,28 @@
... <span class="nolink">http://bar.com</span></div>''')
<div>A link in <code>http://foo.com</code> or
<span class="nolink">http://bar.com</span></div>
+
+There's also a word wrapping function, that should probably be run
+after autolink::
+
+ >>> from lxml.html.clean import word_break_html
+ >>> def pascii(s):
+ ... print s.decode('utf8').encode('ascii', 'xmlcharrefreplace')
+ >>> pascii(word_break_html('''
+ ... <div>Hey you
+ ... 12345678901234567890123456789012345678901234567890</div>'''))
+ <div>Hey you
+ 1234567890123456789012345678901234567890&#8203;1234567890</div>
+
+Not everything is broken:
+
+ >>> pascii(word_break_html('''
+ ... <div>Hey you
+ ... <code>12345678901234567890123456789012345678901234567890</code></div>'''))
+ <div>Hey you
+ <code>12345678901234567890123456789012345678901234567890</code></div>
+ >>> pascii(word_break_html('''
+ ... <a href="12345678901234567890123456789012345678901234567890">text</a>'''))
+ <a href="12345678901234567890123456789012345678901234567890">text</a>
+
+
\ No newline at end of file
More information about the lxml-checkins
mailing list