Mercurial > repos > guerler > springsuite

comparison: planemo/lib/python3.7/site-packages/bleach/sanitizer.py @ 0:d30785e31577 (draft)

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"

| author | guerler |
|---|---|
| date | Fri, 31 Jul 2020 00:18:57 -0400 |
| parents | |
| children | |
This changeset adds the file; the comparison is against the null revision (-1:000000000000 → 0:d30785e31577), so the entire file is new.
```python
from __future__ import unicode_literals

from itertools import chain
import re

import six
from six.moves.urllib.parse import urlparse
from xml.sax.saxutils import unescape

from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes, force_unicode


#: List of allowed tags
ALLOWED_TAGS = [
    'a',
    'abbr',
    'acronym',
    'b',
    'blockquote',
    'code',
    'em',
    'i',
    'li',
    'ol',
    'strong',
    'ul',
]


#: Map of allowed attributes by tag
ALLOWED_ATTRIBUTES = {
    'a': ['href', 'title'],
    'abbr': ['title'],
    'acronym': ['title'],
}

#: List of allowed styles
ALLOWED_STYLES = []

#: List of allowed protocols
ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']

#: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
INVISIBLE_CHARACTERS = ''.join([chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))])

#: Regexp for characters that are invisible
INVISIBLE_CHARACTERS_RE = re.compile(
    '[' + INVISIBLE_CHARACTERS + ']',
    re.UNICODE
)

#: String to replace invisible characters with. This can be a character, a
#: string, or even a function that takes a Python re matchobj
INVISIBLE_REPLACEMENT_CHAR = '?'

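# For example (an illustration, not part of the original module):
#
#   INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, 'a\x00b\x1fc')
#
# returns 'a?b?c' -- NUL (0) and unit separator (31) are replaced, while
# tab (9), lf (10), and cr (13) are deliberately left alone.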

class Cleaner(object):
    """Cleaner for cleaning HTML fragments of malicious content

    This cleaner is a security-focused function whose sole purpose is to remove
    malicious content from a string such that it can be displayed as content in
    a web page.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    .. Note::

       This cleaner is not designed to transform content for use in
       non-web-page contexts.

    .. Warning::

       This cleaner is not thread-safe--the html parser has internal state.
       Create a separate cleaner per thread!


    """

    def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
                 styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
                 strip_comments=True, filters=None):
        """Initializes a Cleaner

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed content through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Using filters changes the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output is secure.

        """
        self.tags = tags
        self.attributes = attributes
        self.styles = styles
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        self.filters = filters or []

        self.parser = html5lib_shim.BleachHTMLParser(
            tags=self.tags,
            strip=self.strip,
            consume_entities=False,
            namespaceHTMLElements=False
        )
        self.walker = html5lib_shim.getTreeWalker('etree')
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,
            escape_lt_in_attrs=True,

            # We want to leave entities as they are without escaping or
            # resolving or expanding
            resolve_entities=False,

            # Bleach has its own sanitizer, so don't use the html5lib one
            sanitize=False,

            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
            alphabetical_attributes=False,
        )

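    # A configuration sketch (an illustration; these argument values are
    # examples, not library defaults): allow only <b> and <i>, and strip
    # disallowed tags instead of escaping them:
    #
    #   cleaner = Cleaner(tags=['b', 'i'], strip=True)
    #   cleaner.clean('<b>bold</b> and <span>span</span>')
    #   # -> '<b>bold</b> and span'
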
    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            message = "argument cannot be of '{name}' type, must be of text type".format(
                name=text.__class__.__name__)
            raise TypeError(message)

        if not text:
            return ''

        text = force_unicode(text)

        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),

            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,

            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)


def attribute_filter_factory(attributes):
    """Generates attribute filter function for the given attributes value

    The attributes value can take one of several shapes. This returns a filter
    function appropriate to the attributes value. One nice thing about this is
    that there's less if/then shenanigans in the ``allow_token`` method.

    """
    if callable(attributes):
        return attributes

    if isinstance(attributes, dict):
        def _attr_filter(tag, attr, value):
            if tag in attributes:
                attr_val = attributes[tag]
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                if attr in attr_val:
                    return True

            if '*' in attributes:
                attr_val = attributes['*']
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                return attr in attr_val

            return False

        return _attr_filter

    if isinstance(attributes, list):
        def _attr_filter(tag, attr, value):
            return attr in attributes

        return _attr_filter

    raise ValueError('attributes needs to be a callable, a list or a dict')

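# For illustration (not part of the original module), a dict-shaped
# attributes value mixing per-tag lists with the '*' wildcard:
#
#   attr_filter = attribute_filter_factory({'a': ['href'], '*': ['class']})
#   attr_filter('a', 'href', '/page')   # -> True  (allowed for <a>)
#   attr_filter('p', 'class', 'note')   # -> True  (allowed via '*')
#   attr_filter('a', 'onclick', 'x()')  # -> False (in neither list)
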

class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
    """html5lib Filter that sanitizes text

    This filter can be used anywhere html5lib filters can be used.

    """
    def __init__(self, source, attributes=ALLOWED_ATTRIBUTES,
                 strip_disallowed_elements=False, strip_html_comments=True,
                 **kwargs):
        """Creates a BleachSanitizerFilter instance

        :arg Treewalker source: stream

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip_disallowed_elements: whether or not to strip disallowed
            elements

        :arg bool strip_html_comments: whether or not to strip HTML comments

        """
        self.attr_filter = attribute_filter_factory(attributes)
        self.strip_disallowed_elements = strip_disallowed_elements
        self.strip_html_comments = strip_html_comments

        return super(BleachSanitizerFilter, self).__init__(source, **kwargs)

    def sanitize_stream(self, token_iterator):
        for token in token_iterator:
            ret = self.sanitize_token(token)

            if not ret:
                continue

            if isinstance(ret, list):
                for subtoken in ret:
                    yield subtoken
            else:
                yield ret

    def merge_characters(self, token_iterator):
        """Merge consecutive Characters tokens in a stream"""
        characters_buffer = []

        for token in token_iterator:
            if characters_buffer:
                if token['type'] == 'Characters':
                    characters_buffer.append(token)
                    continue
                else:
                    # Merge all the characters tokens together into one and then
                    # operate on it.
                    new_token = {
                        'data': ''.join([char_token['data'] for char_token in characters_buffer]),
                        'type': 'Characters'
                    }
                    characters_buffer = []
                    yield new_token

            elif token['type'] == 'Characters':
                characters_buffer.append(token)
                continue

            yield token

        new_token = {
            'data': ''.join([char_token['data'] for char_token in characters_buffer]),
            'type': 'Characters'
        }
        yield new_token

    def __iter__(self):
        return self.merge_characters(self.sanitize_stream(html5lib_shim.Filter.__iter__(self)))

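    # For example (an illustration): if sanitize_stream yields consecutive
    # Characters tokens 'a', '&', 'b' (the middle one comes from the
    # special-casing of '&amp;' in sanitize_characters below), they are
    # collapsed into one Characters token 'a&b' before the serializer sees
    # them, and the serializer re-escapes the '&' on output.
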
    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
        ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function that takes the tag name, attribute name,
        and attribute value, and returns True or False.

        Also gives the option to strip tags instead of encoding.

        :arg dict token: token to sanitize

        :returns: token or list of tokens

        """
        token_type = token['type']
        if token_type in ['StartTag', 'EndTag', 'EmptyTag']:
            if token['name'] in self.allowed_elements:
                return self.allow_token(token)

            elif self.strip_disallowed_elements:
                return None

            else:
                if 'data' in token:
                    # Alphabetize the attributes before calling .disallowed_token()
                    # so that the resulting string is stable
                    token['data'] = alphabetize_attributes(token['data'])
                return self.disallowed_token(token)

        elif token_type == 'Comment':
            if not self.strip_html_comments:
                return token
            else:
                return None

        elif token_type == 'Characters':
            return self.sanitize_characters(token)

        else:
            return token

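    # For example (an illustration): with the default allowed_elements, a
    # StartTag token for <script> is not allowed; with
    # strip_disallowed_elements=False it is routed to disallowed_token() and
    # comes back as an escaped Characters token, while with
    # strip_disallowed_elements=True it is dropped (None) instead.
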
    def sanitize_characters(self, token):
        """Handles Characters tokens

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: a list of tokens

        """
        data = token.get('data', '')

        if not data:
            return token

        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
        token['data'] = data

        # If there isn't a & in the data, we can return now
        if '&' not in data:
            return token

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith('&'):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == 'amp':
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with &amp; since that shows up in
                        # querystrings all the time. This special-cases &amp;
                        # and converts it to a & and sticks it in as a
                        # Characters token. It'll get merged with surrounding
                        # tokens in the BleachSanitizerFilter.__iter__ and
                        # escaped in the serializer.
                        new_tokens.append({'type': 'Characters', 'data': '&'})
                    else:
                        new_tokens.append({'type': 'Entity', 'name': entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2:]
                    if remainder:
                        new_tokens.append({'type': 'Characters', 'data': remainder})
                    continue

            new_tokens.append({'type': 'Characters', 'data': part})

        return new_tokens

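    # For example (an illustration): a Characters token whose data is
    # 'x &lt; y &amp; z' comes back as the token list
    #
    #   [{'type': 'Characters', 'data': 'x '},
    #    {'type': 'Entity', 'name': 'lt'},
    #    {'type': 'Characters', 'data': ' y '},
    #    {'type': 'Characters', 'data': '&'},
    #    {'type': 'Characters', 'data': ' z'}]
    #
    # The bare '&' is merged with its neighbors in __iter__ and re-escaped
    # by the serializer on output.
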
    def sanitize_uri_value(self, value, allowed_protocols):
        """Checks a uri value to see if it's allowed

        :arg value: the uri value to sanitize
        :arg allowed_protocols: list of allowed protocols

        :returns: allowed value or None

        """
        # NOTE(willkg): This transforms the value into one that's easier to
        # match and verify, but shouldn't get returned since it's vastly
        # different than the original value.

        # Convert all character entities in the value
        new_value = html5lib_shim.convert_entities(value)

        # Nix backtick, space characters, and control characters
        new_value = re.sub(
            r"[`\000-\040\177-\240\s]+",
            '',
            new_value
        )

        # Remove REPLACEMENT characters
        new_value = new_value.replace('\ufffd', '')

        # Lowercase it--this breaks the value, but makes it easier to match
        # against
        new_value = new_value.lower()

        try:
            # Drop attributes with uri values that have protocols that aren't
            # allowed
            parsed = urlparse(new_value)
        except ValueError:
            # URI is impossible to parse, therefore it's not allowed
            return None

        if parsed.scheme:
            # If urlparse found a scheme, check that
            if parsed.scheme in allowed_protocols:
                return value

        else:
            # Allow uris that are just an anchor
            if new_value.startswith('#'):
                return value

            # Handle protocols that urlparse doesn't recognize like "myprotocol"
            if ':' in new_value and new_value.split(':')[0] in allowed_protocols:
                return value

            # If there's no protocol/scheme specified, then assume it's "http"
            # and see if that's allowed
            if 'http' in allowed_protocols:
                return value

        return None

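    # For example (an illustration), with the default
    # allowed_protocols=['http', 'https', 'mailto']:
    #
    #   'https://example.com/'       -> returned as-is (scheme allowed)
    #   'javascript:alert(1)'        -> None (scheme not allowed)
    #   'jav&#x09;ascript:alert(1)'  -> None (the entity decodes to a tab,
    #                                   which is stripped before matching)
    #   '#section-2'                 -> returned as-is (bare fragment)
    #   '/relative/path'             -> returned as-is (no scheme is treated
    #                                   as "http", which is allowed)
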
    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if 'data' in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token['data'].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token['name'], name, val):
                    continue

                # Drop attributes with uri values that use a disallowed protocol
                # Sanitize attributes with uri values
                if namespaced_name in self.attr_val_is_uri:
                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
                    if new_value is None:
                        continue
                    val = new_value

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    else:
                        # Replace the val with the unescaped version because
                        # it's an IRI
                        val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token['name']) in self.svg_allow_local_href:
                    if namespaced_name in [
                        (None, 'href'), (html5lib_shim.namespaces['xlink'], 'href')
                    ]:
                        if re.search(r'^\s*[^#\s]', val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, 'style'):
                    val = self.sanitize_css(val)

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token['data'] = alphabetize_attributes(attrs)

        return token

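    # For example (an illustration): a StartTag token for
    # <a href="javascript:alert(1)" title="x"> under the default rules keeps
    # 'title' but drops 'href' -- attr_filter allows both names for <a>, but
    # sanitize_uri_value rejects the javascript: protocol.
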
    def disallowed_token(self, token):
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]

        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # If we end up with a namespace, but no name, switch them so we
                # have a valid name to use.
                if ns and not name:
                    ns, name = name, ns

                # Figure out namespaced name if the namespace is appropriate
                # and exists; if the ns isn't in prefixes, then drop it.
                if ns is None or ns not in html5lib_shim.prefixes:
                    namespaced_name = name
                else:
                    namespaced_name = '%s:%s' % (html5lib_shim.prefixes[ns], name)

                attrs.append(' %s="%s"' % (
                    namespaced_name,
                    # NOTE(willkg): HTMLSerializer escapes attribute values
                    # already, so if we do it here (like HTMLSerializer does),
                    # then we end up double-escaping.
                    v)
                )
            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))

        else:
            token["data"] = "<%s>" % token["name"]

        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token

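    # For example (an illustration): with <span> disallowed and strip=False,
    # the StartTag token for '<span class="x">' becomes a Characters token
    # whose data is the literal string '<span class="x">'; the serializer
    # then escapes it to '&lt;span class="x"&gt;' in the final output.
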
    def sanitize_css(self, style):
        """Sanitizes css in style tags"""
        # Convert entities in the style so that it can be parsed as CSS
        style = html5lib_shim.convert_entities(style)

        # Drop any url values before we do anything else
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # The gauntlet of sanitization

        # Validate the css in the style tag and if it's not valid, then drop
        # the whole thing.
        parts = style.split(';')
        gauntlet = re.compile(
            r"""^(  # consider a style attribute value as composed of:
[/:,#%!.\s\w]       # a non-newline character
|\w-\w              # 3 characters in the form \w-\w
|'[\s\w]+'\s*       # a single quoted string of [\s\w]+ with trailing space
|"[\s\w]+"          # a double quoted string of [\s\w]+
|\([\d,%\.\s]+\)    # a parenthesized string of one or more digits, commas, periods, ...
)*$""",             # ... percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)'
            flags=re.U | re.VERBOSE
        )

        for part in parts:
            if not gauntlet.match(part):
                return ''

        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
            if not value:
                continue

            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')

            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
```
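
A minimal end-to-end sketch of how the pieces above fit together (not part of the file; the output follows from the defaults defined in this module, where `script` is not an allowed tag and is therefore escaped rather than stripped):

```python
from bleach.sanitizer import Cleaner

cleaner = Cleaner()
print(cleaner.clean('<b>hi</b> <script>alert(1)</script>'))
# -> <b>hi</b> &lt;script&gt;alert(1)&lt;/script&gt;
```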
