Mercurial > repos > guerler > springsuite
annotate planemo/lib/python3.7/site-packages/bleach/sanitizer.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
| author | guerler | 
|---|---|
| date | Fri, 31 Jul 2020 00:32:28 -0400 | 
| parents | d30785e31577 | 
| children | 
| rev | line source | 
|---|---|
| 0 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 1 from __future__ import unicode_literals | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 2 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 3 from itertools import chain | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 4 import re | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 5 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 6 import six | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 7 from six.moves.urllib.parse import urlparse | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 8 from xml.sax.saxutils import unescape | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 9 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 10 from bleach import html5lib_shim | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 11 from bleach.utils import alphabetize_attributes, force_unicode | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 12 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 13 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 14 #: List of allowed tags | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 15 ALLOWED_TAGS = [ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 16 'a', | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 17 'abbr', | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 18 'acronym', | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 19 'b', | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 20 'blockquote', | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 21 'code', | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 22 'em', | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 23 'i', | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 24 'li', | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 25 'ol', | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 26 'strong', | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 27 'ul', | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 28 ] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 29 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 30 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 31 #: Map of allowed attributes by tag | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 32 ALLOWED_ATTRIBUTES = { | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 33 'a': ['href', 'title'], | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 34 'abbr': ['title'], | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 35 'acronym': ['title'], | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 36 } | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 37 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 38 #: List of allowed styles | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 39 ALLOWED_STYLES = [] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 40 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 41 #: List of allowed protocols | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 42 ALLOWED_PROTOCOLS = ['http', 'https', 'mailto'] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 43 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 44 #: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 45 INVISIBLE_CHARACTERS = ''.join([chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))]) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 46 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 47 #: Regexp for characters that are invisible | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 48 INVISIBLE_CHARACTERS_RE = re.compile( | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 49 '[' + INVISIBLE_CHARACTERS + ']', | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 50 re.UNICODE | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 51 ) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 52 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 53 #: String to replace invisible characters with. This can be a character, a | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 54 #: string, or even a function that takes a Python re matchobj | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 55 INVISIBLE_REPLACEMENT_CHAR = '?' | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 56 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 57 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 58 class Cleaner(object): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 59 """Cleaner for cleaning HTML fragments of malicious content | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 60 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 61 This cleaner is a security-focused function whose sole purpose is to remove | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 62 malicious content from a string such that it can be displayed as content in | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 63 a web page. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 64 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 65 To use:: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 66 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 67 from bleach.sanitizer import Cleaner | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 68 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 69 cleaner = Cleaner() | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 70 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 71 for text in all_the_yucky_things: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 72 sanitized = cleaner.clean(text) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 73 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 74 .. Note:: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 75 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 76 This cleaner is not designed to use to transform content to be used in | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 77 non-web-page contexts. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 78 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 79 .. Warning:: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 80 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 81 This cleaner is not thread-safe--the html parser has internal state. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 82 Create a separate cleaner per thread! | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 83 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 84 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 85 """ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 86 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 87 def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 88 styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 89 strip_comments=True, filters=None): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 90 """Initializes a Cleaner | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 91 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 92 :arg list tags: allowed list of tags; defaults to | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 93 ``bleach.sanitizer.ALLOWED_TAGS`` | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 94 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 95 :arg dict attributes: allowed attributes; can be a callable, list or dict; | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 96 defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES`` | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 97 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 98 :arg list styles: allowed list of css styles; defaults to | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 99 ``bleach.sanitizer.ALLOWED_STYLES`` | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 100 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 101 :arg list protocols: allowed list of protocols for links; defaults | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 102 to ``bleach.sanitizer.ALLOWED_PROTOCOLS`` | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 103 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 104 :arg bool strip: whether or not to strip disallowed elements | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 105 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 106 :arg bool strip_comments: whether or not to strip HTML comments | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 107 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 108 :arg list filters: list of html5lib Filter classes to pass streamed content through | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 109 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 110 .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 111 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 112 .. Warning:: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 113 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 114 Using filters changes the output of ``bleach.Cleaner.clean``. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 115 Make sure the way the filters change the output are secure. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 116 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 117 """ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 118 self.tags = tags | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 119 self.attributes = attributes | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 120 self.styles = styles | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 121 self.protocols = protocols | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 122 self.strip = strip | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 123 self.strip_comments = strip_comments | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 124 self.filters = filters or [] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 125 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 126 self.parser = html5lib_shim.BleachHTMLParser( | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 127 tags=self.tags, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 128 strip=self.strip, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 129 consume_entities=False, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 130 namespaceHTMLElements=False | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 131 ) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 132 self.walker = html5lib_shim.getTreeWalker('etree') | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 133 self.serializer = html5lib_shim.BleachHTMLSerializer( | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 134 quote_attr_values='always', | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 135 omit_optional_tags=False, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 136 escape_lt_in_attrs=True, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 137 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 138 # We want to leave entities as they are without escaping or | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 139 # resolving or expanding | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 140 resolve_entities=False, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 141 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 142 # Bleach has its own sanitizer, so don't use the html5lib one | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 143 sanitize=False, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 144 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 145 # Bleach sanitizer alphabetizes already, so don't use the html5lib one | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 146 alphabetical_attributes=False, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 147 ) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 148 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 149 def clean(self, text): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 150 """Cleans text and returns sanitized result as unicode | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 151 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 152 :arg str text: text to be cleaned | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 153 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 154 :returns: sanitized text as unicode | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 155 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 156 :raises TypeError: if ``text`` is not a text type | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 157 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 158 """ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 159 if not isinstance(text, six.string_types): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 160 message = "argument cannot be of '{name}' type, must be of text type".format( | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 161 name=text.__class__.__name__) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 162 raise TypeError(message) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 163 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 164 if not text: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 165 return '' | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 166 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 167 text = force_unicode(text) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 168 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 169 dom = self.parser.parseFragment(text) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 170 filtered = BleachSanitizerFilter( | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 171 source=self.walker(dom), | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 172 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 173 # Bleach-sanitizer-specific things | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 174 attributes=self.attributes, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 175 strip_disallowed_elements=self.strip, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 176 strip_html_comments=self.strip_comments, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 177 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 178 # html5lib-sanitizer things | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 179 allowed_elements=self.tags, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 180 allowed_css_properties=self.styles, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 181 allowed_protocols=self.protocols, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 182 allowed_svg_properties=[], | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 183 ) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 184 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 185 # Apply any filters after the BleachSanitizerFilter | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 186 for filter_class in self.filters: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 187 filtered = filter_class(source=filtered) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 188 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 189 return self.serializer.render(filtered) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 190 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 191 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 192 def attribute_filter_factory(attributes): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 193 """Generates attribute filter function for the given attributes value | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 194 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 195 The attributes value can take one of several shapes. This returns a filter | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 196 function appropriate to the attributes value. One nice thing about this is | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 197 that there's less if/then shenanigans in the ``allow_token`` method. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 198 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 199 """ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 200 if callable(attributes): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 201 return attributes | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 202 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 203 if isinstance(attributes, dict): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 204 def _attr_filter(tag, attr, value): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 205 if tag in attributes: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 206 attr_val = attributes[tag] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 207 if callable(attr_val): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 208 return attr_val(tag, attr, value) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 209 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 210 if attr in attr_val: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 211 return True | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 212 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 213 if '*' in attributes: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 214 attr_val = attributes['*'] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 215 if callable(attr_val): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 216 return attr_val(tag, attr, value) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 217 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 218 return attr in attr_val | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 219 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 220 return False | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 221 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 222 return _attr_filter | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 223 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 224 if isinstance(attributes, list): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 225 def _attr_filter(tag, attr, value): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 226 return attr in attributes | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 227 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 228 return _attr_filter | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 229 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 230 raise ValueError('attributes needs to be a callable, a list or a dict') | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 231 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 232 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 233 class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 234 """html5lib Filter that sanitizes text | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 235 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 236 This filter can be used anywhere html5lib filters can be used. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 237 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 238 """ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 239 def __init__(self, source, attributes=ALLOWED_ATTRIBUTES, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 240 strip_disallowed_elements=False, strip_html_comments=True, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 241 **kwargs): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 242 """Creates a BleachSanitizerFilter instance | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 243 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 244 :arg Treewalker source: stream | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 245 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 246 :arg list tags: allowed list of tags; defaults to | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 247 ``bleach.sanitizer.ALLOWED_TAGS`` | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 248 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 249 :arg dict attributes: allowed attributes; can be a callable, list or dict; | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 250 defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES`` | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 251 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 252 :arg list styles: allowed list of css styles; defaults to | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 253 ``bleach.sanitizer.ALLOWED_STYLES`` | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 254 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 255 :arg list protocols: allowed list of protocols for links; defaults | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 256 to ``bleach.sanitizer.ALLOWED_PROTOCOLS`` | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 257 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 258 :arg bool strip_disallowed_elements: whether or not to strip disallowed | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 259 elements | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 260 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 261 :arg bool strip_html_comments: whether or not to strip HTML comments | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 262 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 263 """ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 264 self.attr_filter = attribute_filter_factory(attributes) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 265 self.strip_disallowed_elements = strip_disallowed_elements | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 266 self.strip_html_comments = strip_html_comments | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 267 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 268 return super(BleachSanitizerFilter, self).__init__(source, **kwargs) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 269 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 270 def sanitize_stream(self, token_iterator): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 271 for token in token_iterator: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 272 ret = self.sanitize_token(token) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 273 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 274 if not ret: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 275 continue | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 276 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 277 if isinstance(ret, list): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 278 for subtoken in ret: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 279 yield subtoken | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 280 else: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 281 yield ret | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 282 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 283 def merge_characters(self, token_iterator): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 284 """Merge consecutive Characters tokens in a stream""" | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 285 characters_buffer = [] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 286 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 287 for token in token_iterator: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 288 if characters_buffer: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 289 if token['type'] == 'Characters': | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 290 characters_buffer.append(token) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 291 continue | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 292 else: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 293 # Merge all the characters tokens together into one and then | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 294 # operate on it. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 295 new_token = { | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 296 'data': ''.join([char_token['data'] for char_token in characters_buffer]), | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 297 'type': 'Characters' | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 298 } | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 299 characters_buffer = [] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 300 yield new_token | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 301 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 302 elif token['type'] == 'Characters': | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 303 characters_buffer.append(token) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 304 continue | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 305 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 306 yield token | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 307 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 308 new_token = { | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 309 'data': ''.join([char_token['data'] for char_token in characters_buffer]), | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 310 'type': 'Characters' | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 311 } | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 312 yield new_token | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 313 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 314 def __iter__(self): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 315 return self.merge_characters(self.sanitize_stream(html5lib_shim.Filter.__iter__(self))) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 316 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 317 def sanitize_token(self, token): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 318 """Sanitize a token either by HTML-encoding or dropping. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 319 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 320 Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag': | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 321 ['attribute', 'pairs'], 'tag': callable}. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 322 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 323 Here callable is a function with two arguments of attribute name and | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 324 value. It should return true of false. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 325 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 326 Also gives the option to strip tags instead of encoding. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 327 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 328 :arg dict token: token to sanitize | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 329 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 330 :returns: token or list of tokens | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 331 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 332 """ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 333 token_type = token['type'] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 334 if token_type in ['StartTag', 'EndTag', 'EmptyTag']: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 335 if token['name'] in self.allowed_elements: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 336 return self.allow_token(token) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 337 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 338 elif self.strip_disallowed_elements: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 339 return None | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 340 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 341 else: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 342 if 'data' in token: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 343 # Alphabetize the attributes before calling .disallowed_token() | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 344 # so that the resulting string is stable | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 345 token['data'] = alphabetize_attributes(token['data']) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 346 return self.disallowed_token(token) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 347 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 348 elif token_type == 'Comment': | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 349 if not self.strip_html_comments: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 350 return token | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 351 else: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 352 return None | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 353 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 354 elif token_type == 'Characters': | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 355 return self.sanitize_characters(token) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 356 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 357 else: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 358 return token | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 359 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 360 def sanitize_characters(self, token): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 361 """Handles Characters tokens | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 362 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 363 Our overridden tokenizer doesn't do anything with entities. However, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 364 that means that the serializer will convert all ``&`` in Characters | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 365 tokens to ``&``. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 366 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 367 Since we don't want that, we extract entities here and convert them to | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 368 Entity tokens so the serializer will let them be. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 369 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 370 :arg token: the Characters token to work on | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 371 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 372 :returns: a list of tokens | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 373 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 374 """ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 375 data = token.get('data', '') | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 376 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 377 if not data: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 378 return token | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 379 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 380 data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 381 token['data'] = data | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 382 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 383 # If there isn't a & in the data, we can return now | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 384 if '&' not in data: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 385 return token | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 386 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 387 new_tokens = [] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 388 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 389 # For each possible entity that starts with a "&", we try to extract an | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 390 # actual entity and re-tokenize accordingly | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 391 for part in html5lib_shim.next_possible_entity(data): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 392 if not part: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 393 continue | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 394 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 395 if part.startswith('&'): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 396 entity = html5lib_shim.match_entity(part) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 397 if entity is not None: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 398 if entity == 'amp': | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 399 # LinkifyFilter can't match urls across token boundaries | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 400 # which is problematic with & since that shows up in | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 401 # querystrings all the time. This special-cases & | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 402 # and converts it to a & and sticks it in as a | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 403 # Characters token. It'll get merged with surrounding | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 404 # tokens in the BleachSanitizerfilter.__iter__ and | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 405 # escaped in the serializer. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 406 new_tokens.append({'type': 'Characters', 'data': '&'}) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 407 else: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 408 new_tokens.append({'type': 'Entity', 'name': entity}) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 409 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 410 # Length of the entity plus 2--one for & at the beginning | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 411 # and one for ; at the end | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 412 remainder = part[len(entity) + 2:] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 413 if remainder: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 414 new_tokens.append({'type': 'Characters', 'data': remainder}) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 415 continue | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 416 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 417 new_tokens.append({'type': 'Characters', 'data': part}) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 418 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 419 return new_tokens | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 420 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 421 def sanitize_uri_value(self, value, allowed_protocols): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 422 """Checks a uri value to see if it's allowed | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 423 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 424 :arg value: the uri value to sanitize | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 425 :arg allowed_protocols: list of allowed protocols | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 426 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 427 :returns: allowed value or None | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 428 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 429 """ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 430 # NOTE(willkg): This transforms the value into one that's easier to | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 431 # match and verify, but shouldn't get returned since it's vastly | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 432 # different than the original value. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 433 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 434 # Convert all character entities in the value | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 435 new_value = html5lib_shim.convert_entities(value) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 436 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 437 # Nix backtick, space characters, and control characters | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 438 new_value = re.sub( | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 439 r"[`\000-\040\177-\240\s]+", | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 440 '', | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 441 new_value | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 442 ) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 443 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 444 # Remove REPLACEMENT characters | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 445 new_value = new_value.replace('\ufffd', '') | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 446 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 447 # Lowercase it--this breaks the value, but makes it easier to match | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 448 # against | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 449 new_value = new_value.lower() | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 450 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 451 try: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 452 # Drop attributes with uri values that have protocols that aren't | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 453 # allowed | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 454 parsed = urlparse(new_value) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 455 except ValueError: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 456 # URI is impossible to parse, therefore it's not allowed | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 457 return None | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 458 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 459 if parsed.scheme: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 460 # If urlparse found a scheme, check that | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 461 if parsed.scheme in allowed_protocols: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 462 return value | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 463 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 464 else: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 465 # Allow uris that are just an anchor | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 466 if new_value.startswith('#'): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 467 return value | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 468 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 469 # Handle protocols that urlparse doesn't recognize like "myprotocol" | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 470 if ':' in new_value and new_value.split(':')[0] in allowed_protocols: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 471 return value | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 472 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 473 # If there's no protocol/scheme specified, then assume it's "http" | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 474 # and see if that's allowed | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 475 if 'http' in allowed_protocols: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 476 return value | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 477 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 478 return None | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 479 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 480 def allow_token(self, token): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 481 """Handles the case where we're allowing the tag""" | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 482 if 'data' in token: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 483 # Loop through all the attributes and drop the ones that are not | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 484 # allowed, are unsafe or break other rules. Additionally, fix | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 485 # attribute values that need fixing. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 486 # | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 487 # At the end of this loop, we have the final set of attributes | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 488 # we're keeping. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 489 attrs = {} | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 490 for namespaced_name, val in token['data'].items(): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 491 namespace, name = namespaced_name | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 492 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 493 # Drop attributes that are not explicitly allowed | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 494 # | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 495 # NOTE(willkg): We pass in the attribute name--not a namespaced | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 496 # name. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 497 if not self.attr_filter(token['name'], name, val): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 498 continue | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 499 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 500 # Drop attributes with uri values that use a disallowed protocol | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 501 # Sanitize attributes with uri values | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 502 if namespaced_name in self.attr_val_is_uri: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 503 new_value = self.sanitize_uri_value(val, self.allowed_protocols) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 504 if new_value is None: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 505 continue | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 506 val = new_value | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 507 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 508 # Drop values in svg attrs with non-local IRIs | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 509 if namespaced_name in self.svg_attr_val_allows_ref: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 510 new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 511 ' ', | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 512 unescape(val)) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 513 new_val = new_val.strip() | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 514 if not new_val: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 515 continue | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 516 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 517 else: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 518 # Replace the val with the unescaped version because | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 519 # it's a iri | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 520 val = new_val | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 521 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 522 # Drop href and xlink:href attr for svg elements with non-local IRIs | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 523 if (None, token['name']) in self.svg_allow_local_href: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 524 if namespaced_name in [ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 525 (None, 'href'), (html5lib_shim.namespaces['xlink'], 'href') | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 526 ]: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 527 if re.search(r'^\s*[^#\s]', val): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 528 continue | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 529 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 530 # If it's a style attribute, sanitize it | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 531 if namespaced_name == (None, 'style'): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 532 val = self.sanitize_css(val) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 533 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 534 # At this point, we want to keep the attribute, so add it in | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 535 attrs[namespaced_name] = val | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 536 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 537 token['data'] = alphabetize_attributes(attrs) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 538 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 539 return token | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 540 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 541 def disallowed_token(self, token): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 542 token_type = token["type"] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 543 if token_type == "EndTag": | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 544 token["data"] = "</%s>" % token["name"] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 545 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 546 elif token["data"]: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 547 assert token_type in ("StartTag", "EmptyTag") | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 548 attrs = [] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 549 for (ns, name), v in token["data"].items(): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 550 # If we end up with a namespace, but no name, switch them so we | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 551 # have a valid name to use. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 552 if ns and not name: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 553 ns, name = name, ns | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 554 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 555 # Figure out namespaced name if the namespace is appropriate | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 556 # and exists; if the ns isn't in prefixes, then drop it. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 557 if ns is None or ns not in html5lib_shim.prefixes: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 558 namespaced_name = name | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 559 else: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 560 namespaced_name = '%s:%s' % (html5lib_shim.prefixes[ns], name) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 561 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 562 attrs.append(' %s="%s"' % ( | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 563 namespaced_name, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 564 # NOTE(willkg): HTMLSerializer escapes attribute values | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 565 # already, so if we do it here (like HTMLSerializer does), | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 566 # then we end up double-escaping. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 567 v) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 568 ) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 569 token["data"] = "<%s%s>" % (token["name"], ''.join(attrs)) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 570 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 571 else: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 572 token["data"] = "<%s>" % token["name"] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 573 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 574 if token.get("selfClosing"): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 575 token["data"] = token["data"][:-1] + "/>" | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 576 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 577 token["type"] = "Characters" | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 578 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 579 del token["name"] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 580 return token | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 581 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 582 def sanitize_css(self, style): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 583 """Sanitizes css in style tags""" | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 584 # Convert entities in the style so that it can be parsed as CSS | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 585 style = html5lib_shim.convert_entities(style) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 586 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 587 # Drop any url values before we do anything else | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 588 style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 589 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 590 # The gauntlet of sanitization | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 591 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 592 # Validate the css in the style tag and if it's not valid, then drop | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 593 # the whole thing. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 594 parts = style.split(';') | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 595 gauntlet = re.compile( | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 596 r"""^( # consider a style attribute value as composed of: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 597 [/:,#%!.\s\w] # a non-newline character | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 598 |\w-\w # 3 characters in the form \w-\w | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 599 |'[\s\w]+'\s* # a single quoted string of [\s\w]+ with trailing space | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 600 |"[\s\w]+" # a double quoted string of [\s\w]+ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 601 |\([\d,%\.\s]+\) # a parenthesized string of one or more digits, commas, periods, ... | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 602 )*$""", # ... percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)' | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 603 flags=re.U | re.VERBOSE | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 604 ) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 605 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 606 for part in parts: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 607 if not gauntlet.match(part): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 608 return '' | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 609 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 610 if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 611 return '' | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 612 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 613 clean = [] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 614 for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 615 if not value: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 616 continue | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 617 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 618 if prop.lower() in self.allowed_css_properties: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 619 clean.append(prop + ': ' + value + ';') | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 620 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 621 elif prop.lower() in self.allowed_svg_properties: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 622 clean.append(prop + ': ' + value + ';') | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 623 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 624 return ' '.join(clean) | 
