Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/bleach/sanitizer.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4f3585e2f14b |
---|---|
1 from __future__ import unicode_literals | |
2 | |
3 from itertools import chain | |
4 import re | |
5 import warnings | |
6 | |
7 import six | |
8 from six.moves.urllib.parse import urlparse | |
9 from xml.sax.saxutils import unescape | |
10 | |
11 from bleach import html5lib_shim | |
12 from bleach.utils import alphabetize_attributes, force_unicode | |
13 | |
14 | |
#: Tags that survive sanitization by default
ALLOWED_TAGS = [
    "a",
    "abbr",
    "acronym",
    "b",
    "blockquote",
    "code",
    "em",
    "i",
    "li",
    "ol",
    "strong",
    "ul",
]


#: Per-tag map of attributes that survive sanitization by default
ALLOWED_ATTRIBUTES = {
    "a": ["href", "title"],
    "abbr": ["title"],
    "acronym": ["title"],
}

#: CSS properties allowed in style attributes by default (none)
ALLOWED_STYLES = []

#: URI schemes allowed in link attributes by default
ALLOWED_PROTOCOLS = ["http", "https", "mailto"]

#: Invisible characters: codepoints 0-31 inclusive, except 9 (tab),
#: 10 (lf), and 13 (cr)
INVISIBLE_CHARACTERS = "".join(
    chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))
)

#: Regexp matching any single invisible character
INVISIBLE_CHARACTERS_RE = re.compile("[" + INVISIBLE_CHARACTERS + "]", re.UNICODE)

#: Replacement for invisible characters. This can be a character, a
#: string, or even a function that takes a Python re matchobj
INVISIBLE_REPLACEMENT_CHAR = "?"
56 | |
57 | |
class Cleaner(object):
    """Security-focused cleaner that scrubs HTML fragments of malicious content

    The sole purpose of this class is to remove malicious content from a
    string so that the result can be displayed as content in a web page.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    .. Note::

       This cleaner is not designed to transform content for use in
       non-web-page contexts.

    .. Warning::

       This cleaner is not thread-safe--the html parser has internal state.
       Create a separate cleaner per thread!


    """

    def __init__(
        self,
        tags=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        styles=ALLOWED_STYLES,
        protocols=ALLOWED_PROTOCOLS,
        strip=False,
        strip_comments=True,
        filters=None,
    ):
        """Initializes a Cleaner

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed content through

        .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

        .. Warning::

           Using filters changes the output of ``bleach.Cleaner.clean``.
           Make sure the way the filters change the output are secure.

        """
        self.tags = tags
        self.attributes = attributes
        self.styles = styles
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        # Normalize filters=None to an empty list so clean() can always iterate
        self.filters = filters if filters else []

        # The parser, walker, and serializer hold internal state, which is why
        # Cleaner instances must not be shared across threads.
        self.parser = html5lib_shim.BleachHTMLParser(
            tags=self.tags,
            strip=self.strip,
            consume_entities=False,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker("etree")
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values="always",
            omit_optional_tags=False,
            escape_lt_in_attrs=True,
            # We want to leave entities as they are without escaping or
            # resolving or expanding
            resolve_entities=False,
            # Bleach has its own sanitizer, so don't use the html5lib one
            sanitize=False,
            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
            alphabetical_attributes=False,
        )

    def clean(self, text):
        """Cleans text and returns the sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            raise TypeError(
                "argument cannot be of '{name}' type, must be of text type".format(
                    name=text.__class__.__name__
                )
            )

        if not text:
            return ""

        text = force_unicode(text)

        fragment = self.parser.parseFragment(text)
        stream = BleachSanitizerFilter(
            source=self.walker(fragment),
            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,
            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Chain any user-supplied filters after the BleachSanitizerFilter
        for filter_cls in self.filters:
            stream = filter_cls(source=stream)

        return self.serializer.render(stream)
195 | |
196 | |
def attribute_filter_factory(attributes):
    """Generates attribute filter function for the given attributes value

    The attributes value can take one of several shapes--callable, dict, or
    list. This returns a filter function appropriate to that shape, which
    keeps if/then shenanigans out of the ``allow_token`` method.

    :arg attributes: a callable, dict, or list describing allowed attributes

    :returns: a function of ``(tag, attr, value)`` returning a truthy value
        when the attribute should be kept

    :raises ValueError: if ``attributes`` is not a callable, a list or a dict

    """
    if callable(attributes):
        # Already a filter function--use it directly
        return attributes

    if isinstance(attributes, dict):

        def _attr_filter(tag, attr, value):
            # Tag-specific rules are consulted first, then the "*" wildcard.
            # A callable entry decides outright; a list entry that misses
            # falls through to the next key.
            for key in (tag, "*"):
                if key in attributes:
                    allowed = attributes[key]
                    if callable(allowed):
                        return allowed(tag, attr, value)

                    if attr in allowed:
                        return True

            return False

        return _attr_filter

    if isinstance(attributes, list):

        def _attr_filter(tag, attr, value):
            # One flat allowlist applies to every tag
            return attr in attributes

        return _attr_filter

    raise ValueError("attributes needs to be a callable, a list or a dict")
238 | |
239 | |
class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
    """html5lib Filter that sanitizes text

    This filter can be used anywhere html5lib filters can be used.

    Tokens flow through ``sanitize_stream`` (which drops, rewrites, or
    expands each token) and then ``merge_characters`` (which coalesces
    adjacent Characters tokens) before reaching the serializer.

    """

    def __init__(
        self,
        source,
        attributes=ALLOWED_ATTRIBUTES,
        strip_disallowed_elements=False,
        strip_html_comments=True,
        **kwargs
    ):
        """Creates a BleachSanitizerFilter instance

        :arg Treewalker source: stream

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg bool strip_disallowed_elements: whether or not to strip disallowed
            elements

        :arg bool strip_html_comments: whether or not to strip HTML comments

        Remaining keyword arguments (``allowed_elements``,
        ``allowed_css_properties``, ``allowed_protocols``,
        ``allowed_svg_properties``, ...) are passed through to the underlying
        html5lib ``SanitizerFilter``.

        """
        # Normalize whatever shape `attributes` has (callable/list/dict) into
        # a single callable used by allow_token
        self.attr_filter = attribute_filter_factory(attributes)
        self.strip_disallowed_elements = strip_disallowed_elements
        self.strip_html_comments = strip_html_comments

        # filter out html5lib deprecation warnings to use bleach from BleachSanitizerFilter init
        warnings.filterwarnings(
            "ignore",
            message="html5lib's sanitizer is deprecated",
            category=DeprecationWarning,
            module="bleach._vendor.html5lib",
        )
        # NOTE(review): returning super().__init__'s result returns None from
        # __init__, which Python permits (and ignores)
        return super(BleachSanitizerFilter, self).__init__(source, **kwargs)

    def sanitize_stream(self, token_iterator):
        """Yield sanitized tokens; sanitize_token may drop a token (None),
        replace it, or expand it into a list of tokens."""
        for token in token_iterator:
            ret = self.sanitize_token(token)

            if not ret:
                continue

            if isinstance(ret, list):
                for subtoken in ret:
                    yield subtoken
            else:
                yield ret

    def merge_characters(self, token_iterator):
        """Merge consecutive Characters tokens in a stream"""
        characters_buffer = []

        for token in token_iterator:
            if characters_buffer:
                if token["type"] == "Characters":
                    characters_buffer.append(token)
                    continue
                else:
                    # Merge all the characters tokens together into one and then
                    # operate on it.
                    new_token = {
                        "data": "".join(
                            [char_token["data"] for char_token in characters_buffer]
                        ),
                        "type": "Characters",
                    }
                    characters_buffer = []
                    yield new_token

            elif token["type"] == "Characters":
                characters_buffer.append(token)
                continue

            yield token

        # Flush whatever is buffered at end of stream.
        # NOTE(review): if the buffer is empty this still yields a Characters
        # token with data "" -- presumably harmless to the serializer; confirm.
        new_token = {
            "data": "".join([char_token["data"] for char_token in characters_buffer]),
            "type": "Characters",
        }
        yield new_token

    def __iter__(self):
        # Pipeline: base html5lib Filter iteration -> sanitize -> merge
        return self.merge_characters(
            self.sanitize_stream(html5lib_shim.Filter.__iter__(self))
        )

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
        ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function with two arguments of attribute name and
        value. It should return true of false.

        Also gives the option to strip tags instead of encoding.

        :arg dict token: token to sanitize

        :returns: token, list of tokens, or None (drop the token)

        """
        token_type = token["type"]
        if token_type in ["StartTag", "EndTag", "EmptyTag"]:
            if token["name"] in self.allowed_elements:
                return self.allow_token(token)

            elif self.strip_disallowed_elements:
                # Strip mode: drop the tag token entirely
                return None

            else:
                # Escape mode: render the tag as visible text
                if "data" in token:
                    # Alphabetize the attributes before calling .disallowed_token()
                    # so that the resulting string is stable
                    token["data"] = alphabetize_attributes(token["data"])
                return self.disallowed_token(token)

        elif token_type == "Comment":
            if not self.strip_html_comments:
                # Escape &, <, and > in addition to " and ' so comment
                # contents can't smuggle markup
                token["data"] = html5lib_shim.escape(
                    token["data"], entities={'"': "&quot;", "'": "&#x27;"}
                )
                return token
            else:
                return None

        elif token_type == "Characters":
            return self.sanitize_characters(token)

        else:
            # Pass through everything else (e.g. SpaceCharacters, Doctype)
            return token

    def sanitize_characters(self, token):
        """Handles Characters tokens

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: a list of tokens

        """
        data = token.get("data", "")

        if not data:
            return token

        # Replace control/invisible characters before anything else
        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
        token["data"] = data

        # If there isn't a & in the data, we can return now
        if "&" not in data:
            return token

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith("&"):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == "amp":
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with &amp; since that shows up in
                        # querystrings all the time. This special-cases &amp;
                        # and converts it to a & and sticks it in as a
                        # Characters token. It'll get merged with surrounding
                        # tokens in the BleachSanitizerfilter.__iter__ and
                        # escaped in the serializer.
                        new_tokens.append({"type": "Characters", "data": "&"})
                    else:
                        new_tokens.append({"type": "Entity", "name": entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_tokens.append({"type": "Characters", "data": remainder})
                    continue

            # Not an entity--keep the text as-is
            new_tokens.append({"type": "Characters", "data": part})

        return new_tokens

    def sanitize_uri_value(self, value, allowed_protocols):
        """Checks a uri value to see if it's allowed

        :arg value: the uri value to sanitize
        :arg allowed_protocols: list of allowed protocols

        :returns: allowed value or None

        """
        # NOTE(willkg): This transforms the value into one that's easier to
        # match and verify, but shouldn't get returned since it's vastly
        # different than the original value.

        # Convert all character entities in the value so e.g.
        # "java&#x09;script:" can't hide a scheme
        new_value = html5lib_shim.convert_entities(value)

        # Nix backtick, space characters, and control characters
        new_value = re.sub(r"[`\000-\040\177-\240\s]+", "", new_value)

        # Remove REPLACEMENT characters
        new_value = new_value.replace("\ufffd", "")

        # Lowercase it--this breaks the value, but makes it easier to match
        # against
        new_value = new_value.lower()

        try:
            # Drop attributes with uri values that have protocols that aren't
            # allowed
            parsed = urlparse(new_value)
        except ValueError:
            # URI is impossible to parse, therefore it's not allowed
            return None

        if parsed.scheme:
            # If urlparse found a scheme, check that
            if parsed.scheme in allowed_protocols:
                # Return the ORIGINAL value, not the normalized one
                return value

        else:
            # Allow uris that are just an anchor
            if new_value.startswith("#"):
                return value

            # Handle protocols that urlparse doesn't recognize like "myprotocol"
            if ":" in new_value and new_value.split(":")[0] in allowed_protocols:
                return value

            # If there's no protocol/scheme specified, then assume it's "http"
            # and see if that's allowed
            if "http" in allowed_protocols:
                return value

        return None

    def allow_token(self, token):
        """Handles the case where we're allowing the tag: filters and fixes
        its attributes, returning the (mutated) token."""
        if "data" in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token["data"].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token["name"], name, val):
                    continue

                # Drop attributes with uri values that use a disallowed protocol
                # Sanitize attributes with uri values
                if namespaced_name in self.attr_val_is_uri:
                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
                    if new_value is None:
                        continue
                    val = new_value

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r"url\s*\(\s*[^#\s][^)]+?\)", " ", unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    else:
                        # Replace the val with the unescaped version because
                        # it's a iri
                        val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token["name"]) in self.svg_allow_local_href:
                    if namespaced_name in [
                        (None, "href"),
                        (html5lib_shim.namespaces["xlink"], "href"),
                    ]:
                        # Leading non-"#" content means a non-local reference
                        if re.search(r"^\s*[^#\s]", val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, "style"):
                    val = self.sanitize_css(val)

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token["data"] = alphabetize_attributes(attrs)

        return token

    def disallowed_token(self, token):
        """Renders a disallowed tag token as escaped literal text by turning
        it into a Characters token (e.g. ``<script>`` shows up as text)."""
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]

        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # If we end up with a namespace, but no name, switch them so we
                # have a valid name to use.
                if ns and not name:
                    ns, name = name, ns

                # Figure out namespaced name if the namespace is appropriate
                # and exists; if the ns isn't in prefixes, then drop it.
                if ns is None or ns not in html5lib_shim.prefixes:
                    namespaced_name = name
                else:
                    namespaced_name = "%s:%s" % (html5lib_shim.prefixes[ns], name)

                attrs.append(
                    ' %s="%s"'
                    % (
                        namespaced_name,
                        # NOTE(willkg): HTMLSerializer escapes attribute values
                        # already, so if we do it here (like HTMLSerializer does),
                        # then we end up double-escaping.
                        v,
                    )
                )
            token["data"] = "<%s%s>" % (token["name"], "".join(attrs))

        else:
            token["data"] = "<%s>" % token["name"]

        if token.get("selfClosing"):
            # Rewrite the trailing ">" as "/>" for self-closing tags
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token

    def sanitize_css(self, style):
        """Sanitizes css in style tags; returns "" if the style fails
        validation, else only the allowed property declarations."""
        # Convert entities in the style so that it can be parsed as CSS
        style = html5lib_shim.convert_entities(style)

        # Drop any url values before we do anything else
        style = re.compile(r"url\s*\(\s*[^\s)]+?\s*\)\s*").sub(" ", style)

        # The gauntlet of sanitization

        # Validate the css in the style tag and if it's not valid, then drop
        # the whole thing.
        parts = style.split(";")
        gauntlet = re.compile(
            r"""^( # consider a style attribute value as composed of:
            [/:,#%!.\s\w] # a non-newline character
            |\w-\w # 3 characters in the form \w-\w
            |'[\s\w]+'\s* # a single quoted string of [\s\w]+ with trailing space
            |"[\s\w]+" # a double quoted string of [\s\w]+
            |\([\d,%\.\s]+\) # a parenthesized string of one or more digits, commas, periods, ...
            )*$""",  # ... percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)'
            flags=re.U | re.VERBOSE,
        )

        for part in parts:
            if not gauntlet.match(part):
                return ""

        # Second pass: the whole style must look like "prop: value;" pairs
        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ""

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue

            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ": " + value + ";")

            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ": " + value + ";")

        return " ".join(clean)