Mercurial > repos > guerler > springsuite
annotate planemo/lib/python3.7/site-packages/bleach/linkifier.py @ 0:d30785e31577 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
| author | guerler | 
|---|---|
| date | Fri, 31 Jul 2020 00:18:57 -0400 | 
| parents | |
| children | 
| rev | line source | 
|---|---|
| 0 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 1 from __future__ import unicode_literals | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 2 import re | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 3 import six | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 4 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 5 from bleach import callbacks as linkify_callbacks | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 6 from bleach import html5lib_shim | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 7 from bleach.utils import alphabetize_attributes, force_unicode | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 8 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 9 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 10 #: List of default callbacks | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 11 DEFAULT_CALLBACKS = [linkify_callbacks.nofollow] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 12 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 13 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 14 TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 15 ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 16 cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 17 dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 18 gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 19 im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 20 kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 21 ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 22 net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 23 pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 24 sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 25 tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 26 xn xxx ye yt yu za zm zw""".split() | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 27 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 28 # Make sure that .com doesn't get matched by .co first | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 29 TLDS.reverse() | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 30 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 31 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 32 def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 33 """Builds the url regex used by linkifier | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 34 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 35 If you want a different set of tlds or allowed protocols, pass those in | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 36 and stomp on the existing ``url_re``:: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 37 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 38 from bleach import linkifier | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 39 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 40 my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 41 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 42 linker = LinkifyFilter(url_re=my_url_re) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 43 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 44 """ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 45 return re.compile( | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 46 r"""\(* # Match any opening parentheses. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 47 \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)? # http:// | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 48 ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b # xx.yy.tld(:##)? | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 49 (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)? | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 50 # /path/zz (excluding "unsafe" chars from RFC 1738, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 51 # except for # and ~, which happen in practice) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 52 """.format('|'.join(sorted(protocols)), '|'.join(sorted(tlds))), | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 53 re.IGNORECASE | re.VERBOSE | re.UNICODE) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 54 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 55 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 56 URL_RE = build_url_re() | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 57 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 58 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 59 PROTO_RE = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 60 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 61 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 62 def build_email_re(tlds=TLDS): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 63 """Builds the email regex used by linkifier | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 64 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 65 If you want a different set of tlds, pass those in and stomp on the existing ``email_re``:: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 66 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 67 from bleach import linkifier | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 68 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 69 my_email_re = linkifier.build_email_re(my_tlds_list) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 70 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 71 linker = LinkifyFilter(email_re=my_email_re) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 72 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 73 """ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 74 # open and closing braces doubled below for format string | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 75 return re.compile( | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 76 r"""(?<!//) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 77 (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 78 (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)* # dot-atom | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 79 |^"([\001-\010\013\014\016-\037!#-\[\]-\177] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 80 |\\[\001-\011\013\014\016-\177])*" # quoted-string | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 81 )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0})) # domain | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 82 """.format('|'.join(tlds)), | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 83 re.IGNORECASE | re.MULTILINE | re.VERBOSE) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 84 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 85 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 86 EMAIL_RE = build_email_re() | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 87 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 88 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 89 class Linker(object): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 90 """Convert URL-like strings in an HTML fragment to links | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 91 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 92 This class converts strings that look like URLs, domain names and email | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 93 addresses in text that may be an HTML fragment to links, while preserving: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 94 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 95 1. links already in the string | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 96 2. urls found in attributes | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 97 3. email addresses | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 98 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 99 linkify does a best-effort approach and tries to recover from bad | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 100 situations due to crazy text. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 101 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 102 """ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 103 def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 104 url_re=URL_RE, email_re=EMAIL_RE, recognized_tags=html5lib_shim.HTML_TAGS): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 105 """Creates a Linker instance | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 106 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 107 :arg list callbacks: list of callbacks to run when adjusting tag attributes; | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 108 defaults to ``bleach.linkifier.DEFAULT_CALLBACKS`` | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 109 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 110 :arg list skip_tags: list of tags that you don't want to linkify the | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 111 contents of; for example, you could set this to ``['pre']`` to skip | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 112 linkifying contents of ``pre`` tags | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 113 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 114 :arg bool parse_email: whether or not to linkify email addresses | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 115 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 116 :arg re url_re: url matching regex | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 117 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 118 :arg re email_re: email matching regex | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 119 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 120 :arg list-of-strings recognized_tags: the list of tags that linkify knows about; | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 121 everything else gets escaped | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 122 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 123 :returns: None; call ``linkify()`` on the instance to get linkified text as unicode | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 124 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 125 """ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 126 self.callbacks = callbacks | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 127 self.skip_tags = skip_tags | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 128 self.parse_email = parse_email | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 129 self.url_re = url_re | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 130 self.email_re = email_re | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 131 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 132 # Create a parser/tokenizer that allows all HTML tags and escapes | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 133 # anything not in that list. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 134 self.parser = html5lib_shim.BleachHTMLParser( | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 135 tags=recognized_tags, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 136 strip=False, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 137 consume_entities=True, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 138 namespaceHTMLElements=False, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 139 ) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 140 self.walker = html5lib_shim.getTreeWalker('etree') | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 141 self.serializer = html5lib_shim.BleachHTMLSerializer( | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 142 quote_attr_values='always', | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 143 omit_optional_tags=False, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 144 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 145 # linkify does not sanitize | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 146 sanitize=False, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 147 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 148 # linkify alphabetizes | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 149 alphabetical_attributes=False, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 150 ) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 151 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 152 def linkify(self, text): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 153 """Linkify specified text | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 154 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 155 :arg str text: the text to add links to | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 156 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 157 :returns: linkified text as unicode | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 158 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 159 :raises TypeError: if ``text`` is not a text type | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 160 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 161 """ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 162 if not isinstance(text, six.string_types): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 163 raise TypeError('argument must be of text type') | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 164 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 165 text = force_unicode(text) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 166 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 167 if not text: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 168 return '' | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 169 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 170 dom = self.parser.parseFragment(text) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 171 filtered = LinkifyFilter( | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 172 source=self.walker(dom), | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 173 callbacks=self.callbacks, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 174 skip_tags=self.skip_tags, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 175 parse_email=self.parse_email, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 176 url_re=self.url_re, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 177 email_re=self.email_re, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 178 ) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 179 return self.serializer.render(filtered) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 180 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 181 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 182 class LinkifyFilter(html5lib_shim.Filter): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 183 """html5lib filter that linkifies text | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 184 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 185 This will do the following: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 186 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 187 * convert email addresses into links | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 188 * convert urls into links | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 189 * edit existing links by running them through callbacks--the default is to | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 190 add a ``rel="nofollow"`` | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 191 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 192 This filter can be used anywhere html5lib filters can be used. | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 193 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 194 """ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 195 def __init__(self, source, callbacks=None, skip_tags=None, parse_email=False, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 196 url_re=URL_RE, email_re=EMAIL_RE): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 197 """Creates a LinkifyFilter instance | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 198 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 199 :arg TreeWalker source: stream | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 200 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 201 :arg list callbacks: list of callbacks to run when adjusting tag attributes; | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 202 defaults to no callbacks (``None`` is treated as an empty list) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 203 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 204 :arg list skip_tags: list of tags that you don't want to linkify the | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 205 contents of; for example, you could set this to ``['pre']`` to skip | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 206 linkifying contents of ``pre`` tags | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 207 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 208 :arg bool parse_email: whether or not to linkify email addresses | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 209 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 210 :arg re url_re: url matching regex | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 211 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 212 :arg re email_re: email matching regex | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 213 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 214 """ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 215 super(LinkifyFilter, self).__init__(source) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 216 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 217 self.callbacks = callbacks or [] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 218 self.skip_tags = skip_tags or [] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 219 self.parse_email = parse_email | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 220 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 221 self.url_re = url_re | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 222 self.email_re = email_re | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 223 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
def apply_callbacks(self, attrs, is_new):
    """Given an attrs dict and an is_new bool, runs through callbacks

    Every callable in ``self.callbacks`` is invoked in order as
    ``cb(attrs, is_new)`` and its return value is handed to the next
    callback.

    Callbacks can return an adjusted attrs dict or ``None``. In the case of
    ``None``, we stop going through callbacks and return that and the link
    gets dropped.

    :arg dict attrs: map of ``(namespace, name)`` -> ``value``

    :arg bool is_new: whether or not this link was added by linkify

    :returns: adjusted attrs dict or ``None``

    """
    for cb in self.callbacks:
        attrs = cb(attrs, is_new)
        if attrs is None:
            # A callback vetoed the link, so the remaining callbacks are
            # never consulted.
            return None
    return attrs
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 243 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
def extract_character_data(self, token_list):
    """Extracts and squashes character sequences in a token stream

    :arg list token_list: iterable of token dicts; every token must have a
        ``'type'`` key, and ``Characters`` / ``SpaceCharacters`` tokens must
        also have a ``'data'`` key

    :returns: the ``'data'`` of all ``Characters`` and ``SpaceCharacters``
        tokens concatenated in stream order; every other token is ignored

    """
    # FIXME(willkg): This is a terrible idea. What it does is drop all the
    # tags from the token list and merge the Characters and SpaceCharacters
    # tokens into a single text.
    #
    # So something like this::
    #
    #     "<span>" "<b>" "some text" "</b>" "</span>"
    #
    # gets converted to "some text".
    #
    # This gets used to figure out the ``_text`` fauxttribute value for
    # linkify callables.
    #
    # I'm not really sure how else to support that ``_text`` fauxttribute and
    # maintain some modicum of backwards compatibility with previous versions
    # of Bleach.
    return ''.join(
        token['data']
        for token in token_list
        if token['type'] in ('Characters', 'SpaceCharacters')
    )
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 270 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
def handle_email_addresses(self, src_iter):
    """Handle email addresses in character tokens

    Generator. Every ``Characters`` token is scanned with ``self.email_re``.
    For each match, the attrs for a would-be ``mailto:`` link are passed to
    ``self.apply_callbacks(attrs, True)``:

    * if the callbacks return ``None``, the matched text is emitted as a
      plain ``Characters`` token;
    * otherwise a ``StartTag`` / ``Characters`` / ``EndTag`` triple for an
      ``a`` element is emitted, using the ``_text`` entry of the returned
      attrs as the link text.

    Text between and around matches is emitted as ``Characters`` tokens.
    Tokens of any other type, and ``Characters`` tokens with no match, are
    yielded unchanged.

    :arg src_iter: iterable of token dicts

    :returns: generator of token dicts

    """
    # Imported locally so the module-level import block stays untouched.
    try:
        from urllib.parse import quote
    except ImportError:  # Python 2
        from urllib import quote

    for token in src_iter:
        if token['type'] == 'Characters':
            text = token['data']
            new_tokens = []
            end = 0

            # For each email address we find in the text
            for match in self.email_re.finditer(text):
                if match.start() > end:
                    new_tokens.append(
                        {'type': 'Characters', 'data': text[end:match.start()]}
                    )

                address = match.group(0)

                # RFC 6068 requires characters such as '"', '%', '/' and
                # '?' in the local part to be percent-encoded inside a
                # mailto URI. Split on the LAST "@" so a quoted local part
                # containing "@" is encoded as a whole; the domain is left
                # as matched.
                local_part, at_sign, domain = address.rpartition('@')
                if at_sign:
                    href_address = quote(local_part, safe='') + at_sign + domain
                else:
                    # A custom email_re matched something without "@";
                    # nothing sensible to encode, keep it verbatim.
                    href_address = address

                # Run attributes through the callbacks to see what we
                # should do with this match
                attrs = {
                    (None, 'href'): 'mailto:%s' % href_address,
                    '_text': address,
                }
                attrs = self.apply_callbacks(attrs, True)

                if attrs is None:
                    # Just add the text--but not as a link
                    new_tokens.append(
                        {'type': 'Characters', 'data': address}
                    )

                else:
                    # Add an "a" tag for the new link
                    _text = attrs.pop('_text', '')
                    attrs = alphabetize_attributes(attrs)
                    new_tokens.extend([
                        {'type': 'StartTag', 'name': 'a', 'data': attrs},
                        {'type': 'Characters', 'data': force_unicode(_text)},
                        {'type': 'EndTag', 'name': 'a'}
                    ])
                end = match.end()

            if new_tokens:
                # Yield the adjusted set of tokens and then continue
                # through the loop
                if end < len(text):
                    new_tokens.append({'type': 'Characters', 'data': text[end:]})

                for new_token in new_tokens:
                    yield new_token

                continue

        yield token
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 323 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
def strip_non_url_bits(self, fragment):
    """Strips non-url bits from the url

    This accounts for over-eager matching by the regex.

    Leading ``(`` characters and trailing ``)``, ``,`` and ``.`` characters
    are peeled off one at a time until none of the rules below applies or
    the fragment is empty.

    :arg str fragment: the text matched by the url regex

    :returns: ``(fragment, prefix, suffix)`` tuple where ``prefix`` and
        ``suffix`` are the stripped leading and trailing text, so that
        ``prefix + fragment + suffix`` equals the original fragment

    """
    prefix = suffix = ''

    while fragment:
        # Try removing ( from the beginning and, if it's balanced, from the
        # end, too
        if fragment.startswith('('):
            prefix += '('
            fragment = fragment[1:]

            if fragment.endswith(')'):
                suffix = ')' + suffix
                fragment = fragment[:-1]
            continue

        # Now try extraneous things from the end. For example, sometimes we
        # pick up ) at the end of a url, but the url is in a parenthesized
        # phrase like:
        #
        #     "i looked at the site (at http://example.com)"
        #
        # A ")" is kept when the fragment still contains a "(", since it is
        # then taken to close a parenthesis inside the url itself.
        if fragment.endswith(')') and '(' not in fragment:
            fragment = fragment[:-1]
            suffix = ')' + suffix
            continue

        # Handle trailing commas and periods
        if fragment.endswith((',', '.')):
            suffix = fragment[-1] + suffix
            fragment = fragment[:-1]
            continue

        # Nothing matched, so we're done
        break

    return fragment, prefix, suffix
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 371 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 372 def handle_links(self, src_iter): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 373 """Handle links in character tokens""" | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 374 in_a = False # happens, if parse_email=True and if a mail was found | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 375 for token in src_iter: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 376 if in_a: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 377 if token['type'] == 'EndTag' and token['name'] == 'a': | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 378 in_a = False | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 379 yield token | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 380 continue | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 381 elif token['type'] == 'StartTag' and token['name'] == 'a': | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 382 in_a = True | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 383 yield token | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 384 continue | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 385 if token['type'] == 'Characters': | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 386 text = token['data'] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 387 new_tokens = [] | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 388 end = 0 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 389 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 390 for match in self.url_re.finditer(text): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 391 if match.start() > end: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 392 new_tokens.append( | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 393 {'type': 'Characters', 'data': text[end:match.start()]} | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 394 ) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 395 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 396 url = match.group(0) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 397 prefix = suffix = '' | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 398 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 399 # Sometimes we pick up too much in the url match, so look for | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 400 # bits we should drop and remove them from the match | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 401 url, prefix, suffix = self.strip_non_url_bits(url) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 402 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 403 # If there's no protocol, add one | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 404 if PROTO_RE.search(url): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 405 href = url | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 406 else: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 407 href = 'http://%s' % url | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 408 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 409 attrs = { | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 410 (None, 'href'): href, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 411 '_text': url | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 412 } | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 413 attrs = self.apply_callbacks(attrs, True) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 414 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 415 if attrs is None: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 416 # Just add the text | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 417 new_tokens.append( | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 418 {'type': 'Characters', 'data': prefix + url + suffix} | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 419 ) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 420 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 421 else: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 422 # Add the "a" tag! | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 423 if prefix: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 424 new_tokens.append( | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 425 {'type': 'Characters', 'data': prefix} | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 426 ) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 427 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 428 _text = attrs.pop('_text', '') | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 429 attrs = alphabetize_attributes(attrs) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 430 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 431 new_tokens.extend([ | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 432 {'type': 'StartTag', 'name': 'a', 'data': attrs}, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 433 {'type': 'Characters', 'data': force_unicode(_text)}, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 434 {'type': 'EndTag', 'name': 'a'}, | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 435 ]) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 436 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 437 if suffix: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 438 new_tokens.append( | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 439 {'type': 'Characters', 'data': suffix} | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 440 ) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 441 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 442 end = match.end() | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 443 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 444 if new_tokens: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 445 # Yield the adjusted set of tokens and then continue | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 446 # through the loop | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 447 if end < len(text): | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 448 new_tokens.append({'type': 'Characters', 'data': text[end:]}) | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 449 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 450 for new_token in new_tokens: | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 451 yield new_token | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 452 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 453 continue | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 454 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 455 yield token | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 456 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
def handle_a_tag(self, token_buffer):
    """Process one buffered "a" element and yield its replacement tokens.

    The first entry of ``token_buffer`` is treated as the opening "a" token
    and the last entry as its closing token. The opening token's attributes,
    plus the element's character data under the ``_text`` key, are handed to
    the callbacks, whose result decides what gets emitted:

    * ``None`` -- the link is dropped; a single Characters token holding the
      original text is yielded instead.
    * attributes with an unchanged ``_text`` -- the opening token (with its
      attributes replaced by the alphabetized result) is yielded, followed
      by every other buffered token untouched.
    * attributes with a different ``_text`` -- the opening token, one
      Characters token holding the new text, and the final buffered token
      are yielded; everything in between is discarded.

    :arg token_buffer: list of tokens from the start "a" tag through the
        end "a" tag

    :returns: generator of tokens

    """
    start_tag = token_buffer[0]

    # Fall back to a fresh dict when the tag carries no attributes.
    link_attrs = start_tag['data'] or {}
    text = self.extract_character_data(token_buffer)
    link_attrs['_text'] = text

    link_attrs = self.apply_callbacks(link_attrs, False)
    if link_attrs is None:
        # The callbacks rejected the link: replace the whole element with
        # its character data.
        yield {'type': 'Characters', 'data': text}
        return

    replacement_text = link_attrs.pop('_text', '')
    start_tag['data'] = alphabetize_attributes(link_attrs)
    text_unchanged = text == replacement_text

    # Both remaining outcomes begin with the (updated) opening tag.
    yield start_tag

    if text_unchanged:
        # Keep whatever was inside the link, including the end tag.
        for buffered in token_buffer[1:]:
            yield buffered
    else:
        # The callbacks rewrote the text, so the original inner tokens are
        # replaced by the new text before closing the tag.
        yield {'type': 'Characters', 'data': force_unicode(replacement_text)}
        yield token_buffer[-1]
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
changeset | 499 | 
| 
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
 guerler parents: diff
def __iter__(self):
    """Yield the parent filter's tokens with links and "a" tags processed.

    Three modes are tracked while walking the stream:

    * inside an "a" element, tokens are buffered until the end "a" tag and
      then passed as a unit to ``handle_a_tag``;
    * inside a tag listed in ``self.skip_tags``, tokens pass through
      untouched until the matching end tag is seen;
    * otherwise, Characters tokens are run through
      ``handle_email_addresses`` (only when ``self.parse_email`` is set)
      and then ``handle_links``.

    Every other token is yielded unchanged.

    """
    inside_anchor = False
    skip_tag_name = None
    anchor_tokens = []

    for token in super(LinkifyFilter, self).__iter__():
        token_type = token['type']

        if inside_anchor:
            # Buffer everything inside the "a" element; the end tag goes
            # into the buffer too before the whole run is handled.
            anchor_tokens.append(token)
            if token_type == 'EndTag' and token['name'] == 'a':
                for emitted in self.handle_a_tag(anchor_tokens):
                    yield emitted

                # Reset the "a" state; everything for this element has
                # already been yielded.
                inside_anchor = False
                anchor_tokens = []
            continue

        if token_type in ('StartTag', 'EmptyTag'):
            tag_name = token['name']
            if tag_name in self.skip_tags:
                # Nothing is linkified until this tag's end tag shows up.
                skip_tag_name = tag_name

            elif tag_name == 'a':
                # Start slurping; the start tag is held back in the buffer
                # rather than yielded now.
                inside_anchor = True
                anchor_tokens.append(token)
                continue

        elif skip_tag_name and self.skip_tags:
            # Checked after the "a" handling above, so entering and leaving
            # an "a" element takes precedence over skip-tag tracking.
            if token_type == 'EndTag' and token['name'] == skip_tag_name:
                skip_tag_name = None

        elif not (inside_anchor or skip_tag_name) and token_type == 'Characters':
            stream = iter([token])
            if self.parse_email:
                stream = self.handle_email_addresses(stream)
            stream = self.handle_links(stream)

            for emitted in stream:
                yield emitted

            # The original token was consumed by the handlers above.
            continue

        yield token
