comparison env/lib/python3.9/site-packages/lxml/html/clean.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 # cython: language_level=3str
2
3 """A cleanup tool for HTML.
4
5 Removes unwanted tags and content. See the `Cleaner` class for
6 details.
7 """
8
9 from __future__ import absolute_import
10
11 import copy
12 import re
13 import sys
14 try:
15 from urlparse import urlsplit
16 from urllib import unquote_plus
17 except ImportError:
18 # Python 3
19 from urllib.parse import urlsplit, unquote_plus
20 from lxml import etree
21 from lxml.html import defs
22 from lxml.html import fromstring, XHTML_NAMESPACE
23 from lxml.html import xhtml_to_html, _transform_result
24
# Python 2 spellings of a few builtins.  On Python 3 these names no longer
# exist, so alias them to their modern equivalents; on Python 2 the real
# builtins are used untouched.
if sys.version_info[0] >= 3:
    unichr = chr
    unicode = str
    basestring = (str, bytes)
39
40
# Public API of this module.
__all__ = [
    'clean_html',
    'clean',
    'Cleaner',
    'autolink',
    'autolink_html',
    'word_break',
    'word_break_html',
]
43
44 # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
45 # Particularly the CSS cleaning; most of the tag cleaning is integrated now
46 # I have multiple kinds of schemes searched; but should schemes be
47 # whitelisted instead?
48 # max height?
49 # remove images? Also in CSS? background attribute?
50 # Some way to whitelist object, iframe, etc (e.g., if you want to
51 # allow *just* embedded YouTube movies)
52 # Log what was deleted and why?
53 # style="behavior: ..." might be bad in IE?
54 # Should we have something for just <meta http-equiv>? That's the worst of the
55 # metas.
56 # UTF-7 detections? Example:
57 # <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
58 # you don't always have to have the charset set, if the page has no charset
59 # and there's UTF7-like code in it.
60 # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php
61
62
63 # This is an IE-specific construct you can have in a stylesheet to
64 # run some Javascript:
65 _replace_css_javascript = re.compile(
66 r'expression\s*\(.*?\)', re.S|re.I).sub
67
68 # Do I have to worry about @\nimport?
69 _replace_css_import = re.compile(
70 r'@\s*import', re.I).sub
71
72 _looks_like_tag_content = re.compile(
73 r'</?[a-zA-Z]+|\son[a-zA-Z]+\s*=',
74 *((re.ASCII,) if sys.version_info[0] >= 3 else ())).search
75
76 # All kinds of schemes besides just javascript: that can cause
77 # execution:
78 _is_image_dataurl = re.compile(
79 r'^data:image/.+;base64', re.I).search
80 _is_possibly_malicious_scheme = re.compile(
81 r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
82 re.I).search
83 def _is_javascript_scheme(s):
84 if _is_image_dataurl(s):
85 return None
86 return _is_possibly_malicious_scheme(s)
87
88 _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
89 # FIXME: should data: be blocked?
90
91 # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
92 _conditional_comment_re = re.compile(
93 r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
94
# All elements (including the context node) that carry a style attribute.
_find_styled_elements = etree.XPath("descendant-or-self::*[@style]")

# All <a> elements, plain or in the XHTML namespace, whose href is
# non-empty and does not start with '#' (i.e. is not a page-local anchor).
_find_external_links = etree.XPath(
    "descendant-or-self::a {0} |descendant-or-self::x:a{0}".format(
        "[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x': XHTML_NAMESPACE})
102
103
class Cleaner(object):
    """
    Instances clean the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
        as they could contain Javascript.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags.

    ``inline_style``
        Removes any style attributes.  Defaults to the value of the ``style`` option.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``

    ``remove_tags``:
        A list of tags to remove.  Only the tags will be removed,
        their content will get pulled up into the parent tag.

    ``kill_tags``:
        A list of tags to kill.  Killing also removes the tag's content,
        i.e. the whole subtree, not just the tag itself.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from the feedparser HTML sanitisation web site).

    ``safe_attrs``:
        A set of attribute names to override the default list of attributes
        considered 'safe' (when safe_attrs_only=True).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

        Note that you may also need to set ``whitelist_tags``.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control.  Set to None to
        include all tags.

    This modifies the document *in place*.
    """

    scripts = True
    javascript = True
    comments = True
    style = False
    inline_style = None
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    kill_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    safe_attrs = defs.safe_attrs
    add_nofollow = False
    host_whitelist = ()
    whitelist_tags = {'iframe', 'embed'}

    def __init__(self, **kw):
        """
        Override any of the class-level options by keyword.

        :raises TypeError: if a keyword does not name an existing option.
        :raises ValueError: if both ``allow_tags`` and
            ``remove_unknown_tags`` are passed with true values.
        """
        not_an_attribute = object()
        for name, value in kw.items():
            default = getattr(self, name, not_an_attribute)
            # Options are recognised by the type of their class-level
            # default: None, a boolean, or a collection.  Anything else
            # (missing attribute, method, dict, ...) is not an option.
            if (default is not None and default is not True and default is not False
                    and not isinstance(default, (frozenset, set, tuple, list))):
                raise TypeError(
                    "Unknown parameter: %s=%r" % (name, value))
            setattr(self, name, value)
        if self.inline_style is None and 'inline_style' not in kw:
            self.inline_style = self.style

        if kw.get("allow_tags"):
            if kw.get("remove_unknown_tags"):
                raise ValueError("It does not make sense to pass in both "
                                 "allow_tags and remove_unknown_tags")
            self.remove_unknown_tags = False

    # Used to lookup the primary URL for a given tag that is up for
    # removal:
    _tag_link_attrs = dict(
        script='src',
        link='href',
        # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
        # From what I can tell, both attributes can contain a link:
        applet=['code', 'object'],
        iframe='src',
        embed='src',
        layer='src',
        # FIXME: there doesn't really seem like a general way to figure out what
        # links an <object> tag uses; links often go in <param> tags with values
        # that we don't really know.  You'd have to have knowledge about specific
        # kinds of plugins (probably keyed off classid), and match against those.
        ##object=?,
        # FIXME: not looking at the action currently, because it is more complex
        # than that -- if you keep the form, you should keep the form controls.
        ##form='action',
        a='href',
        )

    def __call__(self, doc):
        """
        Cleans the document in place, according to the configured options.

        :param doc: an element, or an object with a ``getroot()`` method
            (such as an ElementTree), whose root element is then cleaned.
        :raises ValueError: if both ``allow_tags`` and
            ``remove_unknown_tags`` are in effect.
        """
        try:
            getroot = doc.getroot
        except AttributeError:
            pass  # Element instance
        else:
            doc = getroot()  # ElementTree instance, instead of an element
        # convert XHTML to HTML
        xhtml_to_html(doc)
        # Normalize a case that IE treats <image> like <img>, and that
        # can confuse either this step or later steps.
        for el in doc.iter('image'):
            el.tag = 'img'
        if not self.comments:
            # Of course, if we were going to kill comments anyway, we don't
            # need to worry about this
            self.kill_conditional_comments(doc)

        kill_tags = set(self.kill_tags or ())
        remove_tags = set(self.remove_tags or ())
        allow_tags = set(self.allow_tags or ())

        if self.scripts:
            kill_tags.add('script')
        if self.safe_attrs_only:
            safe_attrs = set(self.safe_attrs)
            for el in doc.iter(etree.Element):
                attrib = el.attrib
                # Snapshot the names: entries are deleted while looping.
                for aname in list(attrib.keys()):
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not (self.safe_attrs_only and
                    self.safe_attrs == defs.safe_attrs):
                # safe_attrs handles events attributes itself
                for el in doc.iter(etree.Element):
                    attrib = el.attrib
                    for aname in list(attrib.keys()):
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            if not self.inline_style:
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _replace_css_javascript('', old)
                    new = _replace_css_import('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
            if not self.style:
                for el in list(doc.iter('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _replace_css_javascript('', old)
                    # The imported CSS can do anything; we just can't allow:
                    new = _replace_css_import('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments:
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
        if self.inline_style:
            etree.strip_attributes(doc, 'style')
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # Note this kills alternate stylesheets as well
                    if not self.allow_element(el):
                        el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            # We should get rid of any <param> tags not inside <applet>;
            # These are not really valid anyway.
            for el in list(doc.iter('param')):
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet', 'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet',))
            # The alternate contents that are in an iframe are a good fallback:
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
        if self.frames:
            # FIXME: ideally we should look at the frame links, but
            # generally frames don't mix properly with an HTML
            # fragment anyway.
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        # Collect first, modify afterwards, so that the tree is not
        # changed while it is being iterated.
        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # We have to drop the parent-most tag, which we can't
            # do.  Instead we'll rewrite it:
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # We have to drop the parent-most element, which we can't
            # do.  Instead we'll clear it:
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        _kill.reverse()  # start with innermost tags
        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    "It does not make sense to pass in both allow_tags and remove_unknown_tags")
            allow_tags = set(defs.tags)
        if allow_tags:
            # make sure we do not remove comments/PIs if users want them (which is rare enough)
            if not self.comments:
                allow_tags.add(etree.Comment)
            if not self.processing_instructions:
                allow_tags.add(etree.ProcessingInstruction)

            bad = []
            for el in doc.iter():
                if el.tag not in allow_tags:
                    bad.append(el)
            if bad:
                if bad[0] is doc:
                    # The root cannot be dropped; turn it into a bare <div>.
                    el = bad.pop(0)
                    el.tag = 'div'
                    el.attrib.clear()
                for el in bad:
                    el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    rel = el.get('rel')
                    if rel:
                        if ('nofollow' in rel
                                and ' nofollow ' in (' %s ' % rel)):
                            continue
                        rel = '%s nofollow' % rel
                    else:
                        rel = 'nofollow'
                    el.set('rel', rel)

    def allow_follow(self, anchor):
        """
        Override to suppress rel="nofollow" on some anchors.

        :param anchor: the ``<a>`` element under consideration.
        :return: true to leave the anchor without ``rel="nofollow"``.
            This default implementation always returns False.
        """
        return False

    def allow_element(self, el):
        """
        Decide whether an element is configured to be accepted or rejected.

        An element is accepted only if its tag has a known link attribute
        (see ``_tag_link_attrs``), every such attribute is present and
        non-empty, and ``allow_embedded_url()`` accepts each of the URLs.

        :param el: an element.
        :return: true to accept the element or false to reject/discard it.
        """
        if el.tag not in self._tag_link_attrs:
            return False
        attr = self._tag_link_attrs[el.tag]
        if isinstance(attr, (list, tuple)):
            for one_attr in attr:
                url = el.get(one_attr)
                if not url:
                    return False
                if not self.allow_embedded_url(el, url):
                    return False
            return True
        else:
            url = el.get(attr)
            if not url:
                return False
            return self.allow_embedded_url(el, url)

    def allow_embedded_url(self, el, url):
        """
        Decide whether a URL that was found in an element's attributes or text
        is configured to be accepted or rejected.

        The URL is accepted when the element's tag is in ``whitelist_tags``
        (or ``whitelist_tags`` is None), the scheme is http or https, and
        the host is listed in ``host_whitelist``.

        :param el: an element.
        :param url: a URL found on the element.
        :return: true to accept the URL and false to reject it.
        """
        if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
            return False
        try:
            parts = urlsplit(url)
        except ValueError:
            # urlsplit() refuses some malformed URLs (e.g. unbalanced IPv6
            # brackets); such a URL can never be a whitelisted one.
            return False
        if parts.scheme not in ('http', 'https'):
            return False
        # Use the parsed host name rather than slicing the netloc by hand:
        # a netloc such as "allowed.host:pw@evil.example" carries user
        # information in front of the real host and must not match
        # "allowed.host".  ``hostname`` is already lower-cased and has the
        # user info and port removed.
        host = parts.hostname
        if not host:
            return False
        return host in self.host_whitelist

    def kill_conditional_comments(self, doc):
        """
        IE conditional comments basically embed HTML that the parser
        doesn't normally see.  We can't allow anything like that, so
        we'll kill any comments that could be conditional.
        """
        has_conditional_comment = _conditional_comment_re.search
        self._kill_elements(
            doc, lambda el: has_conditional_comment(el.text),
            etree.Comment)

    def _kill_elements(self, doc, condition, iterate=None):
        """
        Drop (with their subtrees) all nodes for which ``condition`` is true.

        :param doc: the root to search below.
        :param condition: callable taking a node, returning a truth value.
        :param iterate: passed on to ``doc.iter()`` to select the nodes
            that are examined.
        """
        # Collect first so the tree is not modified during iteration.
        bad = []
        for el in doc.iter(iterate):
            if condition(el):
                bad.append(el)
        for el in bad:
            el.drop_tree()

    def _remove_javascript_link(self, link):
        """
        Link rewriting callback: blank out links with a scripting scheme.

        :param link: the link as found in the document.
        :return: ``''`` for a dangerous link, otherwise ``link`` unchanged.
        """
        # links like "j a v a s c r i p t:" might be interpreted in IE
        new = _substitute_whitespace('', unquote_plus(link))
        if _is_javascript_scheme(new):
            # FIXME: should this be None to delete?
            return ''
        return link

    _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub

    def _has_sneaky_javascript(self, style):
        """
        Depending on the browser, stuff like ``e x p r e s s i o n(...)``
        can get interpreted, or ``expre/* stuff */ssion(...)``.  This
        checks for attempt to do stuff like this.

        Typically the response will be to kill the entire style; if you
        have just a bit of Javascript in the style another rule will catch
        that and remove only the Javascript from the style; this catches
        more sneaky attempts.

        :param style: CSS text (a style attribute value or the text of a
            ``<style>`` element).
        :return: True if the style looks like it hides active content.
        """
        # Normalise away everything that can be used to disguise keywords:
        # CSS comments, backslash escapes, whitespace and letter case.
        style = self._substitute_comments('', style)
        style = style.replace('\\', '')
        style = _substitute_whitespace('', style)
        style = style.lower()
        if 'javascript:' in style:
            return True
        if 'expression(' in style:
            return True
        if '@import' in style:
            # An @import that only shows up after normalisation was
            # obfuscated, e.g. '@/* x */import' or '@im\port'.
            return True
        if '</noscript' in style:
            # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
            return True
        if _looks_like_tag_content(style):
            # e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
            return True
        return False

    def clean_html(self, html):
        """
        Clean ``html`` and return the result, leaving the input untouched.

        :param html: markup as a string, or an already parsed document or
            element (which is deep-copied before cleaning).
        :return: the cleaned document, converted back with
            ``_transform_result()`` according to the type of ``html``.
        """
        result_type = type(html)
        if isinstance(html, basestring):
            doc = fromstring(html)
        else:
            doc = copy.deepcopy(html)
        self(doc)
        return _transform_result(result_type, doc)


# Module-level cleaner with the default options, and its convenience function.
clean = Cleaner()
clean_html = clean.clean_html
563
564 ############################################################
565 ## Autolinking
566 ############################################################
567
568 _link_regexes = [
569 re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
570 # This is conservative, but autolinking can be a bit conservative:
571 re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
572 ]
573
574 _avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
575
576 _avoid_hosts = [
577 re.compile(r'^localhost', re.I),
578 re.compile(r'\bexample\.(?:com|org|net)$', re.I),
579 re.compile(r'^127\.0\.0\.1$'),
580 ]
581
582 _avoid_classes = ['nolink']
583
def autolink(el, link_regexes=_link_regexes,
             avoid_elements=_avoid_elements,
             avoid_hosts=_avoid_hosts,
             avoid_classes=_avoid_classes):
    """
    Turn any URLs into links.

    It will search for links identified by the given regular
    expressions (by default mailto and http(s) links).

    It won't link text in an element in avoid_elements, or an element
    with a class in avoid_classes.  It won't link to anything with a
    host that matches one of the regular expressions in avoid_hosts
    (default localhost and 127.0.0.1).

    If you pass in an element, the element's tail will not be
    substituted, only the contents of the element.
    """
    if el.tag in avoid_elements:
        return
    class_attr = el.get('class')
    if class_attr:
        classes = class_attr.split()
        for skipped in avoid_classes:
            if skipped in classes:
                return

    # Work on a snapshot of the children: new <a> siblings get inserted
    # into ``el`` while we go along.
    for child in list(el):
        autolink(child, link_regexes=link_regexes,
                 avoid_elements=avoid_elements,
                 avoid_hosts=avoid_hosts,
                 avoid_classes=avoid_classes)
        if not child.tail:
            continue
        remaining, anchors = _link_text(
            child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
        if anchors:
            child.tail = remaining
            # The new links go right behind the child whose tail they
            # were found in.
            insert_at = el.index(child) + 1
            el[insert_at:insert_at] = anchors

    if el.text:
        remaining, anchors = _link_text(
            el.text, link_regexes, avoid_hosts, factory=el.makeelement)
        if anchors:
            el.text = remaining
            el[:0] = anchors
628
629 def _link_text(text, link_regexes, avoid_hosts, factory):
630 leading_text = ''
631 links = []
632 last_pos = 0
633 while 1:
634 best_match, best_pos = None, None
635 for regex in link_regexes:
636 regex_pos = last_pos
637 while 1:
638 match = regex.search(text, pos=regex_pos)
639 if match is None:
640 break
641 host = match.group('host')
642 for host_regex in avoid_hosts:
643 if host_regex.search(host):
644 regex_pos = match.end()
645 break
646 else:
647 break
648 if match is None:
649 continue
650 if best_pos is None or match.start() < best_pos:
651 best_match = match
652 best_pos = match.start()
653 if best_match is None:
654 # No more matches
655 if links:
656 assert not links[-1].tail
657 links[-1].tail = text
658 else:
659 assert not leading_text
660 leading_text = text
661 break
662 link = best_match.group(0)
663 end = best_match.end()
664 if link.endswith('.') or link.endswith(','):
665 # These punctuation marks shouldn't end a link
666 end -= 1
667 link = link[:-1]
668 prev_text = text[:best_match.start()]
669 if links:
670 assert not links[-1].tail
671 links[-1].tail = prev_text
672 else:
673 assert not leading_text
674 leading_text = prev_text
675 anchor = factory('a')
676 anchor.set('href', link)
677 body = best_match.group('body')
678 if not body:
679 body = link
680 if body.endswith('.') or body.endswith(','):
681 body = body[:-1]
682 anchor.text = body
683 links.append(anchor)
684 text = text[end:]
685 return leading_text, links
686
def autolink_html(html, *args, **kw):
    # String input is parsed; anything else is taken to be a parsed tree
    # and copied, so that the caller's object is not modified.
    result_type = type(html)
    doc = fromstring(html) if isinstance(html, basestring) else copy.deepcopy(html)
    autolink(doc, *args, **kw)
    return _transform_result(result_type, doc)

# Same options as autolink(), so share its documentation.
autolink_html.__doc__ = autolink.__doc__
697
698 ############################################################
699 ## Word wrapping
700 ############################################################
701
# Defaults for word_break(): elements and classes that are left alone.
_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']

def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in avoid_elements, by default
    ``<pre>``, ``<textarea>`` and ``<code>``, nor elements that have one
    of the classes in avoid_classes (by default ``nobreak``).

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    The element is modified in place; its own tail is not touched.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203 comes from:
    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
    # Honour the caller's list here, not the module default, so that the
    # top-level element is treated the same way as its descendants.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        class_names = class_name.split()
        if any(avoid in class_names for avoid in avoid_classes):
            return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        # A child's tail is text of *this* element, so it is broken even
        # if the child itself was skipped.
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
745
def word_break_html(html, *args, **kw):
    """
    Apply ``word_break()`` to ``html`` and return the result.

    Like ``autolink_html()``, this accepts either markup as a string or an
    already parsed document/element; a parsed input is deep-copied first
    so that the caller's tree is left unchanged.  Extra arguments are
    passed on to ``word_break()``.
    """
    result_type = type(html)
    if isinstance(html, basestring):
        doc = fromstring(html)
    else:
        doc = copy.deepcopy(html)
    word_break(doc, *args, **kw)
    return _transform_result(result_type, doc)
751
752 def _break_text(text, max_width, break_character):
753 words = text.split()
754 for word in words:
755 if len(word) > max_width:
756 replacement = _insert_break(word, max_width, break_character)
757 text = text.replace(word, replacement)
758 return text
759
760 _break_prefer_re = re.compile(r'[^a-z]', re.I)
761
762 def _insert_break(word, width, break_character):
763 orig_word = word
764 result = ''
765 while len(word) > width:
766 start = word[:width]
767 breaks = list(_break_prefer_re.finditer(start))
768 if breaks:
769 last_break = breaks[-1]
770 # Only walk back up to 10 characters to find a nice break:
771 if last_break.end() > width-10:
772 # FIXME: should the break character be at the end of the
773 # chunk, or the beginning of the next chunk?
774 start = word[:last_break.end()]
775 result += start + break_character
776 word = word[len(start):]
777 result += word
778 return result
779