Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/lxml/html/clean.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4f3585e2f14b |
---|---|
1 # cython: language_level=3str | |
2 | |
3 """A cleanup tool for HTML. | |
4 | |
5 Removes unwanted tags and content. See the `Cleaner` class for | |
6 details. | |
7 """ | |
8 | |
9 from __future__ import absolute_import | |
10 | |
11 import copy | |
12 import re | |
13 import sys | |
14 try: | |
15 from urlparse import urlsplit | |
16 from urllib import unquote_plus | |
17 except ImportError: | |
18 # Python 3 | |
19 from urllib.parse import urlsplit, unquote_plus | |
20 from lxml import etree | |
21 from lxml.html import defs | |
22 from lxml.html import fromstring, XHTML_NAMESPACE | |
23 from lxml.html import xhtml_to_html, _transform_result | |
24 | |
# Python 2/3 compatibility shims: each ``try`` probes for a builtin that only
# exists on Python 2 and, when the lookup raises NameError, binds the
# Python 3 equivalent under the same name so the rest of the module can use
# a single spelling.
try:
    unichr
except NameError:
    # Python 3
    unichr = chr
try:
    unicode
except NameError:
    # Python 3
    unicode = str
try:
    basestring
except NameError:
    # Python 3: used with isinstance() to mean "any text or bytes string".
    basestring = (str, bytes)


# Public API of this module (what ``from ... import *`` exports).
__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
           'word_break', 'word_break_html']
43 | |
44 # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl | |
45 # Particularly the CSS cleaning; most of the tag cleaning is integrated now | |
46 # I have multiple kinds of schemes searched; but should schemes be | |
47 # whitelisted instead? | |
48 # max height? | |
49 # remove images? Also in CSS? background attribute? | |
50 # Some way to whitelist object, iframe, etc (e.g., if you want to | |
51 # allow *just* embedded YouTube movies) | |
52 # Log what was deleted and why? | |
53 # style="behavior: ..." might be bad in IE? | |
54 # Should we have something for just <meta http-equiv>? That's the worst of the | |
55 # metas. | |
56 # UTF-7 detections? Example: | |
57 # <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- | |
58 # you don't always have to have the charset set, if the page has no charset | |
59 # and there's UTF7-like code in it. | |
60 # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php | |
61 | |
62 | |
# This is an IE-specific construct you can have in a stylesheet to
# run some Javascript:
# (bound ``.sub`` method: call as ``_replace_css_javascript(repl, css)``)
_replace_css_javascript = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I).sub

# Do I have to worry about @\nimport?
# (bound ``.sub`` method; matches "@import" with optional whitespace after
# the "@", case-insensitively)
_replace_css_import = re.compile(
    r'@\s*import', re.I).sub

# Bound ``.search`` method that finds either the start of an opening/closing
# tag ("<name" / "</name") or a whitespace-preceded "on...=" event-handler
# attribute.  re.ASCII is only passed on Python 3 (the flag tuple is empty on
# Python 2).
_looks_like_tag_content = re.compile(
    r'</?[a-zA-Z]+|\son[a-zA-Z]+\s*=',
    *((re.ASCII,) if sys.version_info[0] >= 3 else ())).search
75 | |
# All kinds of schemes besides just javascript: that can cause
# execution:
# Matches a base64 ``data:image/<type>`` URL at the start of the string and
# captures the image media type (plus any parameters before ";base64").
_is_image_dataurl = re.compile(
    r'^data:image/(.+?);base64', re.I).search
_is_possibly_malicious_scheme = re.compile(
    r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
    re.I).search
# SVG (and other XML-based) images can embed script content, so they must
# not get the "harmless image data URL" exemption.
_is_unsafe_image_type = re.compile(r'xml|svg', re.I).search


def _is_javascript_scheme(s):
    """
    Tell whether the URL-ish string ``s`` uses a scheme that can run code.

    :param s: the string to inspect (callers pass it with whitespace
        already stripped).
    :return: a truthy match object when a dangerous scheme is present,
        ``None`` otherwise.

    A base64 ``data:image/...`` URL is considered harmless and yields
    ``None`` -- unless its image type mentions ``svg`` or ``xml``, since
    such images can carry script and are therefore treated like any other
    ``data:`` URL.
    """
    image = _is_image_dataurl(s)
    if image is not None and not _is_unsafe_image_type(image.group(1)):
        return None
    return _is_possibly_malicious_scheme(s)
87 | |
# Bound ``.sub`` method that replaces runs of whitespace and low control
# characters.
# NOTE(review): the range ends at \x19; the C0 control block ends at \x1F,
# so \x1A-\x1F are not covered -- confirm whether that is intentional.
_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
# FIXME: should data: be blocked?

# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
# Matches the "[if ...]>" opener of an IE conditional comment.
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

# Selects the context element and every descendant carrying a style attribute.
_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

# Selects <a> elements (plain or in the XHTML namespace) whose href is
# non-empty and does not start with '#', i.e. is not a same-page fragment.
_find_external_links = etree.XPath(
    ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x':XHTML_NAMESPACE})
102 | |
103 | |
class Cleaner(object):
    """
    Instances clean the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
        as they could contain Javascript.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags.

    ``inline_style``
        Removes any style attributes.  Defaults to the value of the ``style`` option.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``

    ``remove_tags``:
        A list of tags to remove.  Only the tags will be removed,
        their content will get pulled up into the parent tag.

    ``kill_tags``:
        A list of tags to kill.  Killing also removes the tag's content,
        i.e. the whole subtree, not just the tag itself.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from the feedparser HTML sanitisation web site).

    ``safe_attrs``:
        A set of attribute names to override the default list of attributes
        considered 'safe' (when safe_attrs_only=True).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
        Entries are compared against the lower-cased host name of the URL
        (without port or user information).
        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

        Note that you may also need to set ``whitelist_tags``.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control.  Set to None to
        include all tags.

    This modifies the document *in place*.
    """

    # Default option values; every one can be overridden per instance through
    # a constructor keyword argument, or per subclass.
    scripts = True
    javascript = True
    comments = True
    style = False
    inline_style = None
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    kill_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    safe_attrs = defs.safe_attrs
    add_nofollow = False
    host_whitelist = ()
    whitelist_tags = {'iframe', 'embed'}

    def __init__(self, **kw):
        """
        Override option attributes from keyword arguments.

        :raises TypeError: if a keyword does not name an existing option
            (an attribute whose current value is None, a bool, or a
            set/frozenset/tuple/list).
        :raises ValueError: if both ``allow_tags`` and
            ``remove_unknown_tags`` are passed with true values.
        """
        not_an_attribute = object()
        for name, value in kw.items():
            default = getattr(self, name, not_an_attribute)
            # Only attributes that look like options may be overridden; the
            # sentinel (missing attribute) and methods fall through to the
            # error.
            if (default is not None and default is not True and default is not False
                    and not isinstance(default, (frozenset, set, tuple, list))):
                raise TypeError(
                    "Unknown parameter: %s=%r" % (name, value))
            setattr(self, name, value)
        if self.inline_style is None and 'inline_style' not in kw:
            # Unless explicitly configured, inline styles follow ``style``.
            self.inline_style = self.style

        if kw.get("allow_tags"):
            if kw.get("remove_unknown_tags"):
                raise ValueError("It does not make sense to pass in both "
                                 "allow_tags and remove_unknown_tags")
            # An explicit allow list replaces the "known HTML tags" list.
            self.remove_unknown_tags = False

    # Used to lookup the primary URL for a given tag that is up for
    # removal:
    _tag_link_attrs = dict(
        script='src',
        link='href',
        # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
        # From what I can tell, both attributes can contain a link:
        applet=['code', 'object'],
        iframe='src',
        embed='src',
        layer='src',
        # FIXME: there doesn't really seem like a general way to figure out what
        # links an <object> tag uses; links often go in <param> tags with values
        # that we don't really know.  You'd have to have knowledge about specific
        # kinds of plugins (probably keyed off classid), and match against those.
        ##object=?,
        # FIXME: not looking at the action currently, because it is more complex
        # than that -- if you keep the form, you should keep the form controls.
        ##form='action',
        a='href',
        )

    def __call__(self, doc):
        """
        Cleans the document in place, according to the option attributes.

        :param doc: an element, or an ElementTree (anything with a
            ``getroot()`` method), whose root is then cleaned.
        :raises ValueError: if both ``allow_tags`` and
            ``remove_unknown_tags`` are set.
        """
        try:
            getroot = doc.getroot
        except AttributeError:
            pass  # Element instance
        else:
            doc = getroot()  # ElementTree instance, instead of an element
        # convert XHTML to HTML
        xhtml_to_html(doc)
        # Normalize a case that IE treats <image> like <img>, and that
        # can confuse either this step or later steps.
        for el in doc.iter('image'):
            el.tag = 'img'
        if not self.comments:
            # Of course, if we were going to kill comments anyway, we don't
            # need to worry about this
            self.kill_conditional_comments(doc)

        # Work on copies so the configured options are never mutated.
        kill_tags = set(self.kill_tags or ())
        remove_tags = set(self.remove_tags or ())
        allow_tags = set(self.allow_tags or ())

        if self.scripts:
            kill_tags.add('script')
        if self.safe_attrs_only:
            safe_attrs = set(self.safe_attrs)
            for el in doc.iter(etree.Element):
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not (self.safe_attrs_only and
                    self.safe_attrs == defs.safe_attrs):
                # safe_attrs handles events attributes itself
                for el in doc.iter(etree.Element):
                    attrib = el.attrib
                    for aname in attrib.keys():
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            if not self.inline_style:
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _replace_css_javascript('', old)
                    new = _replace_css_import('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
            if not self.style:
                for el in list(doc.iter('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _replace_css_javascript('', old)
                    # The imported CSS can do anything; we just can't allow:
                    new = _replace_css_import('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments:
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
        if self.inline_style:
            etree.strip_attributes(doc, 'style')
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # Note this kills alternate stylesheets as well
                    if not self.allow_element(el):
                        el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            # We should get rid of any <param> tags not inside <applet>;
            # These are not really valid anyway.
            for el in list(doc.iter('param')):
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet', 'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet',))
            # The alternate contents that are in an iframe are a good fallback:
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
        if self.frames:
            # FIXME: ideally we should look at the frame links, but
            # generally frames don't mix properly with an HTML
            # fragment anyway.
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        # Collect first, modify afterwards, so the tree is not changed
        # while it is being iterated.
        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # We have to drop the parent-most tag, which we can't
            # do.  Instead we'll rewrite it:
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # We have to drop the parent-most element, which we can't
            # do.  Instead we'll clear it:
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        _kill.reverse()  # start with innermost tags
        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    "It does not make sense to pass in both allow_tags and remove_unknown_tags")
            allow_tags = set(defs.tags)
        if allow_tags:
            # make sure we do not remove comments/PIs if users want them (which is rare enough)
            if not self.comments:
                allow_tags.add(etree.Comment)
            if not self.processing_instructions:
                allow_tags.add(etree.ProcessingInstruction)

            bad = []
            for el in doc.iter():
                if el.tag not in allow_tags:
                    bad.append(el)
            if bad:
                if bad[0] is doc:
                    # The root cannot be dropped; turn it into a bare <div>.
                    el = bad.pop(0)
                    el.tag = 'div'
                    el.attrib.clear()
                for el in bad:
                    el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    rel = el.get('rel')
                    if rel:
                        # Cheap substring test first, then an exact
                        # whitespace-delimited token test.
                        if ('nofollow' in rel
                                and ' nofollow ' in (' %s ' % rel)):
                            continue
                        rel = '%s nofollow' % rel
                    else:
                        rel = 'nofollow'
                    el.set('rel', rel)

    def allow_follow(self, anchor):
        """
        Override to suppress rel="nofollow" on some anchors.

        :param anchor: the ``<a>`` element under consideration.
        :return: true to leave the anchor's ``rel`` untouched; this default
            implementation always returns False.
        """
        return False

    def allow_element(self, el):
        """
        Decide whether an element is configured to be accepted or rejected.

        The element is accepted only if its tag has a known link attribute
        (see ``_tag_link_attrs``), every such attribute holds a non-empty
        URL, and ``allow_embedded_url`` accepts each of those URLs.

        :param el: an element.
        :return: true to accept the element or false to reject/discard it.
        """
        if el.tag not in self._tag_link_attrs:
            return False
        attr = self._tag_link_attrs[el.tag]
        if isinstance(attr, (list, tuple)):
            # Several attributes may carry a link: all of them must pass.
            for one_attr in attr:
                url = el.get(one_attr)
                if not url:
                    return False
                if not self.allow_embedded_url(el, url):
                    return False
            return True
        else:
            url = el.get(attr)
            if not url:
                return False
            return self.allow_embedded_url(el, url)

    def allow_embedded_url(self, el, url):
        """
        Decide whether a URL that was found in an element's attributes or text
        is configured to be accepted or rejected.

        The URL is accepted only when the element's tag is permitted by
        ``whitelist_tags``, the scheme is http or https, and the URL's host
        name is listed in ``host_whitelist``.

        :param el: an element.
        :param url: a URL found on the element.
        :return: true to accept the URL and false to reject it.
        """
        if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
            return False
        try:
            parts = urlsplit(url)
        except ValueError:
            # Malformed URL (e.g. unbalanced IPv6 brackets): never trusted.
            return False
        if parts.scheme not in ('http', 'https'):
            return False
        # Use the parsed host name rather than slicing the raw netloc:
        # "http://good.example:pw@evil.example/" has netloc
        # "good.example:pw@evil.example", whose text before the first ':'
        # is *not* the host that will actually be contacted.
        return parts.hostname in self.host_whitelist

    def kill_conditional_comments(self, doc):
        """
        IE conditional comments basically embed HTML that the parser
        doesn't normally see.  We can't allow anything like that, so
        we'll kill any comments that could be conditional.
        """
        has_conditional_comment = _conditional_comment_re.search
        self._kill_elements(
            doc, lambda el: has_conditional_comment(el.text),
            etree.Comment)

    def _kill_elements(self, doc, condition, iterate=None):
        """
        Drop (with their subtrees) all nodes of ``doc`` for which
        ``condition(node)`` is true; ``iterate`` is passed to ``doc.iter()``
        to restrict which nodes are examined.
        """
        # Collect before dropping so the tree is not modified mid-iteration.
        bad = []
        for el in doc.iter(iterate):
            if condition(el):
                bad.append(el)
        for el in bad:
            el.drop_tree()

    def _remove_javascript_link(self, link):
        """
        Link-rewriting callback: return '' for links with a dangerous
        scheme, and the link unchanged otherwise.
        """
        # links like "j a v a s c r i p t:" might be interpreted in IE
        new = _substitute_whitespace('', unquote_plus(link))
        if _is_javascript_scheme(new):
            # FIXME: should this be None to delete?
            return ''
        return link

    # Bound ``.sub`` method that replaces CSS /* ... */ comments.
    _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub

    def _has_sneaky_javascript(self, style):
        """
        Depending on the browser, stuff like ``e x p r e s s i o n(...)``
        can get interpreted, or ``expre/* stuff */ssion(...)``.  This
        checks for attempt to do stuff like this.

        Typically the response will be to kill the entire style; if you
        have just a bit of Javascript in the style another rule will catch
        that and remove only the Javascript from the style; this catches
        more sneaky attempts.

        :param style: CSS text (a style attribute value or the text of a
            ``<style>`` element).
        :return: True if the normalised text still looks dangerous.
        """
        # Normalise: drop comments, backslashes and whitespace, lower-case.
        style = self._substitute_comments('', style)
        style = style.replace('\\', '')
        style = _substitute_whitespace('', style)
        style = style.lower()
        if 'javascript:' in style:
            return True
        if 'expression(' in style:
            return True
        if '@import' in style:
            # An import that only becomes visible after normalisation,
            # e.g. '@im/**/port', slipped past the plain @import removal.
            return True
        if '</noscript' in style:
            # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
            return True
        if _looks_like_tag_content(style):
            # e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
            return True
        return False

    def clean_html(self, html):
        """
        Clean a copy of ``html`` and return it in the same form it came in.

        :param html: a string (parsed with ``fromstring``) or an
            element/tree (deep-copied, so the caller's object is untouched).
        :return: the cleaned document converted back to ``type(html)``.
        """
        result_type = type(html)
        if isinstance(html, basestring):
            doc = fromstring(html)
        else:
            doc = copy.deepcopy(html)
        self(doc)
        return _transform_result(result_type, doc)
560 | |
# Module-level convenience objects: a Cleaner built with no keyword
# overrides (i.e. the class-level option defaults), and its bound
# ``clean_html`` method exposed as a plain function.
clean = Cleaner()
clean_html = clean.clean_html
563 | |
564 ############################################################ | |
565 ## Autolinking | |
566 ############################################################ | |
567 | |
# Default patterns used by autolink().  Each pattern defines the named groups
# ``host`` (tested against the avoid-hosts patterns) and ``body`` (used as
# the visible link text).
_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    # This is conservative, but autolinking can be a bit conservative:
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
    ]

# Tags whose text is never auto-linked.
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

# Host patterns that are never auto-linked.
_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

# CSS class names that opt an element out of auto-linking.
_avoid_classes = ['nolink']
583 | |
def autolink(el, link_regexes=_link_regexes,
             avoid_elements=_avoid_elements,
             avoid_hosts=_avoid_hosts,
             avoid_classes=_avoid_classes):
    """
    Turn any URLs into links.

    It will search for links identified by the given regular
    expressions (by default mailto and http(s) links).

    It won't link text in an element in avoid_elements, or an element
    with a class in avoid_classes.  It won't link to anything with a
    host that matches one of the regular expressions in avoid_hosts
    (default localhost and 127.0.0.1).

    If you pass in an element, the element's tail will not be
    substituted, only the contents of the element.
    """
    if el.tag in avoid_elements:
        return
    class_attr = el.get('class')
    if class_attr:
        present_classes = class_attr.split()
        if any(blocked in present_classes for blocked in avoid_classes):
            return

    make_element = el.makeelement

    # Iterate over a snapshot: anchors inserted below must not be revisited.
    for child in list(el):
        autolink(child, link_regexes=link_regexes,
                 avoid_elements=avoid_elements,
                 avoid_hosts=avoid_hosts,
                 avoid_classes=avoid_classes)
        tail = child.tail
        if not tail:
            continue
        remaining, new_anchors = _link_text(
            tail, link_regexes, avoid_hosts, factory=make_element)
        if new_anchors:
            child.tail = remaining
            insert_at = el.index(child) + 1
            el[insert_at:insert_at] = new_anchors

    # Finally the element's own leading text; its anchors go in front of
    # all existing children.
    own_text = el.text
    if own_text:
        remaining, new_anchors = _link_text(
            own_text, link_regexes, avoid_hosts, factory=make_element)
        if new_anchors:
            el.text = remaining
            el[:0] = new_anchors
628 | |
def _link_text(text, link_regexes, avoid_hosts, factory):
    """
    Split ``text`` into plain text and ``<a>`` elements for every link found.

    :param text: the text to scan.
    :param link_regexes: compiled patterns with ``host`` and ``body`` groups.
    :param avoid_hosts: compiled patterns; a match whose host satisfies any
        of them is skipped.
    :param factory: callable creating a new element from a tag name.
    :return: ``(leading_text, anchors)`` -- the text before the first link
        and the list of new anchors; the text following each link is stored
        in that anchor's ``tail``.
    """
    def first_allowed_match(pattern, haystack):
        # Earliest match of ``pattern`` whose host is not on the avoid list.
        offset = 0
        found = pattern.search(haystack, pos=offset)
        while found is not None:
            hostname = found.group('host')
            if not any(blocked.search(hostname) for blocked in avoid_hosts):
                return found
            offset = found.end()
            found = pattern.search(haystack, pos=offset)
        return None

    leading_text = ''
    anchors = []
    while True:
        # Of all patterns, take the match that starts first (ties go to the
        # pattern listed first).
        winner = None
        for pattern in link_regexes:
            candidate = first_allowed_match(pattern, text)
            if candidate is None:
                continue
            if winner is None or candidate.start() < winner.start():
                winner = candidate

        finished = winner is None
        plain = text if finished else text[:winner.start()]

        # Plain text goes after the previous anchor, or in front of them all.
        if anchors:
            assert not anchors[-1].tail
            anchors[-1].tail = plain
        else:
            assert not leading_text
            leading_text = plain
        if finished:
            break

        href = winner.group(0)
        cut = winner.end()
        if href.endswith(('.', ',')):
            # These punctuation marks shouldn't end a link
            cut -= 1
            href = href[:-1]

        anchor = factory('a')
        anchor.set('href', href)
        label = winner.group('body') or href
        if label.endswith(('.', ',')):
            label = label[:-1]
        anchor.text = label
        anchors.append(anchor)
        text = text[cut:]
    return leading_text, anchors
686 | |
def autolink_html(html, *args, **kw):
    # Strings are parsed; elements/trees are deep-copied so that the
    # caller's object is left untouched.
    result_type = type(html)
    doc = fromstring(html) if isinstance(html, basestring) else copy.deepcopy(html)
    autolink(doc, *args, **kw)
    # Hand the result back in the same form the input came in.
    return _transform_result(result_type, doc)

# Same options as autolink(), so share its documentation.
autolink_html.__doc__ = autolink.__doc__
697 | |
698 ############################################################ | |
699 ## Word wrapping | |
700 ############################################################ | |
701 | |
# Defaults for word_break(): tags and CSS class names whose content is left
# unbroken.
_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']
704 | |
def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in avoid_elements, by default
    ``<textarea>``, ``<pre>`` and ``<code>``, nor elements that carry one
    of the classes in avoid_classes.

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion

    The element is modified in place; its own tail is left untouched.
    """
    # Character suggestion of &#8203; comes from:
    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
    # Honour the caller-supplied list (not the module default) so that a
    # custom ``avoid_elements`` actually takes effect at every level.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        class_names = class_name.split()
        if any(avoid in class_names for avoid in avoid_classes):
            return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        # A child's tail is text of *this* element, so it is broken even
        # when the child itself was skipped.
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
745 | |
def word_break_html(html, *args, **kw):
    """
    Apply word_break() to a copy of ``html`` and return the result in the
    same form the input came in.

    :param html: a string (parsed with ``fromstring``) or an element/tree
        (deep-copied, so the caller's object is left untouched).
    :param args, kw: passed through to word_break().
    """
    result_type = type(html)
    # Mirror clean_html()/autolink_html(): only strings are parsed.
    if isinstance(html, basestring):
        doc = fromstring(html)
    else:
        doc = copy.deepcopy(html)
    word_break(doc, *args, **kw)
    return _transform_result(result_type, doc)
751 | |
def _break_text(text, max_width, break_character):
    """
    Return ``text`` with ``break_character`` inserted into every
    whitespace-delimited word longer than ``max_width``.

    Whitespace is preserved exactly.  Each word is rewritten at its own
    position, so a long word that also occurs as part of another word
    cannot corrupt that other word.
    """
    def break_long_word(match):
        word = match.group(0)
        if len(word) > max_width:
            return _insert_break(word, max_width, break_character)
        return word

    return re.sub(r'\S+', break_long_word, text)
759 | |
# Any non-letter is a preferred place to break a word.
_break_prefer_re = re.compile(r'[^a-z]', re.I)


def _insert_break(word, width, break_character):
    """
    Split ``word`` into chunks of at most ``width`` characters, joined by
    ``break_character``.

    A chunk preferably ends right after a non-letter character, provided
    one is found within the last 10 characters of the chunk.

    :return: the word with break characters inserted; the word unchanged if
        ``width`` is not positive (no meaningful chunk size exists).
    """
    if width < 1:
        # A zero or negative width could never make progress below.
        return word
    pieces = []
    while len(word) > width:
        start = word[:width]
        breaks = list(_break_prefer_re.finditer(start))
        if breaks:
            last_break = breaks[-1]
            # Only walk back up to 10 characters to find a nice break:
            if last_break.end() > width-10:
                # FIXME: should the break character be at the end of the
                # chunk, or the beginning of the next chunk?
                start = word[:last_break.end()]
        pieces.append(start)
        pieces.append(break_character)
        word = word[len(start):]
    pieces.append(word)
    return ''.join(pieces)
779 |