comparison Markdown/markdown2.py @ 4:a4813532bbc6 draft

Added Markdown support
author saskia-hiltemann
date Tue, 07 Oct 2014 08:41:30 -0400
comparing 3:4a6ebda2a3ae with 4:a4813532bbc6
1 #!/usr/bin/env python
2 # Copyright (c) 2012 Trent Mick.
3 # Copyright (c) 2007-2008 ActiveState Corp.
4 # License: MIT (http://www.opensource.org/licenses/mit-license.php)
5
6 from __future__ import generators
7
8 r"""A fast and complete Python implementation of Markdown.
9
10 [from http://daringfireball.net/projects/markdown/]
11 > Markdown is a text-to-HTML filter; it translates an easy-to-read /
12 > easy-to-write structured text format into HTML. Markdown's text
13 > format is most similar to that of plain text email, and supports
14 > features such as headers, *emphasis*, code blocks, blockquotes, and
15 > links.
16 >
17 > Markdown's syntax is designed not as a generic markup language, but
18 > specifically to serve as a front-end to (X)HTML. You can use span-level
19 > HTML tags anywhere in a Markdown document, and you can use block level
20 > HTML tags (like <div> and <table> as well).
21
22 Module usage:
23
24 >>> import markdown2
25 >>> markdown2.markdown("*boo!*") # or use `html = markdown_path(PATH)`
26 u'<p><em>boo!</em></p>\n'
27
28 >>> markdowner = Markdown()
29 >>> markdowner.convert("*boo!*")
30 u'<p><em>boo!</em></p>\n'
31 >>> markdowner.convert("**boom!**")
32 u'<p><strong>boom!</strong></p>\n'
33
34 This implementation of Markdown implements the full "core" syntax plus a
35 number of extras (e.g., code syntax coloring, footnotes) as described on
36 <https://github.com/trentm/python-markdown2/wiki/Extras>.
37 """
38
39 cmdln_desc = """A fast and complete Python implementation of Markdown, a
40 text-to-HTML conversion tool for web writers.
41
42 Supported extra syntax options (see -x|--extras option below and
43 see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):
44
45 * code-friendly: Disable _ and __ for em and strong.
46 * cuddled-lists: Allow lists to be cuddled to the preceding paragraph.
47 * fenced-code-blocks: Allows a code block to be fenced with '```' lines
48 before and after, instead of having to be indented. Based on
49 <http://github.github.com/github-flavored-markdown/> with support for
50 syntax highlighting.
51 * footnotes: Support footnotes as in use on daringfireball.net and
52 implemented in other Markdown processors (though not in Markdown.pl v1.0.1).
53 * header-ids: Adds "id" attributes to headers. The id value is a slug of
54 the header text.
55 * html-classes: Takes a dict mapping html tag names (lowercase) to a
56 string to use for a "class" attribute on the tag. Currently only supports
57 "pre" and "code" tags. Add an issue if you require this for other tags.
58 * markdown-in-html: Allow the use of `markdown="1"` in a block HTML tag to
59 have markdown processing be done on its contents. Similar to
60 <http://michelf.com/projects/php-markdown/extra/#markdown-attr> but with
61 some limitations.
62 * metadata: Extract metadata from a leading '---'-fenced block.
63 See <https://github.com/trentm/python-markdown2/issues/77> for details.
64 * nofollow: Add `rel="nofollow"` to all `<a>` tags with an href. See
65 <http://en.wikipedia.org/wiki/Nofollow>.
66 * pyshell: Treats unindented Python interactive shell sessions as <code>
67 blocks.
68 * link-patterns: Auto-link given regex patterns in text (e.g. bug number
69 references, revision number references).
70 * smarty-pants: Replaces ' and " with curly quotation marks or curly
71 apostrophes. Replaces --, ---, ..., and . . . with en dashes, em dashes,
72 and ellipses.
73 * toc: The returned HTML string gets a new "toc_html" attribute which is
74 a Table of Contents for the document. (experimental)
75 * xml: Passes one-liner processing instructions and namespaced XML tags through unchanged.
76 * tables: Tables using the same format as GFM
77 <https://help.github.com/articles/github-flavored-markdown#tables> and
78 PHP-Markdown Extra <https://michelf.ca/projects/php-markdown/extra/#table>.
79 * wiki-tables: Google Code Wiki-style tables. See
80 <http://code.google.com/p/support/wiki/WikiSyntax#Tables>.
81 """
82
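# Illustrative sketch (not part of the upstream docs): enabling a few of
# the extras listed above from Python code. The input text is made up for
# the example; "toc_html" is the attribute named in the "toc" note above.
#
#   >>> import markdown2
#   >>> html = markdown2.markdown("# Title\n\nSome *text*.",
#   ...                           extras=["header-ids", "toc"])
#   >>> html.toc_html  # Table of Contents HTML from the "toc" extra
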
83 # Dev Notes:
84 # - Python's regex syntax doesn't have '\z', so I'm using '\Z'. I'm
85 # not yet sure if there are implications with this. Compare 'pydoc sre'
86 # and 'perldoc perlre'.
87
88 __version_info__ = (2, 3, 1)
89 __version__ = '.'.join(map(str, __version_info__))
90 __author__ = "Trent Mick"
91
92 import os
93 import sys
94 from pprint import pprint, pformat
95 import re
96 import logging
97 try:
98 from hashlib import md5
99 except ImportError:
100 from md5 import md5
101 import optparse
102 from random import random, randint
103 import codecs
104
105
106 #---- Python version compat
107
108 try:
109 from urllib.parse import quote # python3
110 except ImportError:
111 from urllib import quote # python2
112
113 if sys.version_info[:2] < (2,4):
114 from sets import Set as set
115 def reversed(sequence):
116 for i in sequence[::-1]:
117 yield i
118
119 # Use `bytes` for byte strings and `unicode` for unicode strings (str in Py3).
120 if sys.version_info[0] <= 2:
121 py3 = False
122 try:
123 bytes
124 except NameError:
125 bytes = str
126 base_string_type = basestring
127 elif sys.version_info[0] >= 3:
128 py3 = True
129 unicode = str
130 base_string_type = str
131
132
133
134 #---- globals
135
136 DEBUG = False
137 log = logging.getLogger("markdown")
138
139 DEFAULT_TAB_WIDTH = 4
140
141
142 SECRET_SALT = bytes(randint(0, 1000000))
143 def _hash_text(s):
144 return 'md5-' + md5(SECRET_SALT + s.encode("utf-8")).hexdigest()
145
146 # Table of hash values for escaped characters:
147 g_escape_table = dict([(ch, _hash_text(ch))
148 for ch in '\\`*_{}[]()>#+-.!'])
149
150
151
152 #---- exceptions
153
154 class MarkdownError(Exception):
155 pass
156
157
158
159 #---- public api
160
161 def markdown_path(path, encoding="utf-8",
162 html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
163 safe_mode=None, extras=None, link_patterns=None,
164 use_file_vars=False):
165 fp = codecs.open(path, 'r', encoding)
166 text = fp.read()
167 fp.close()
168 return Markdown(html4tags=html4tags, tab_width=tab_width,
169 safe_mode=safe_mode, extras=extras,
170 link_patterns=link_patterns,
171 use_file_vars=use_file_vars).convert(text)
172
173 def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
174 safe_mode=None, extras=None, link_patterns=None,
175 use_file_vars=False):
176 return Markdown(html4tags=html4tags, tab_width=tab_width,
177 safe_mode=safe_mode, extras=extras,
178 link_patterns=link_patterns,
179 use_file_vars=use_file_vars).convert(text)
180
181 class Markdown(object):
182 # The dict of "extras" to enable in processing -- a mapping of
183 # extra name to argument for the extra. Most extras do not have an
184 # argument, in which case the value is None.
185 #
186 # This can be set via (a) subclassing and (b) the constructor
187 # "extras" argument.
188 extras = None
189
190 urls = None
191 titles = None
192 html_blocks = None
193 html_spans = None
194 html_removed_text = "[HTML_REMOVED]" # for compat with markdown.py
195
196 # Used to track when we're inside an ordered or unordered list
197 # (see _ProcessListItems() for details):
198 list_level = 0
199
200 _ws_only_line_re = re.compile(r"^[ \t]+$", re.M)
201
202 def __init__(self, html4tags=False, tab_width=4, safe_mode=None,
203 extras=None, link_patterns=None, use_file_vars=False):
204 if html4tags:
205 self.empty_element_suffix = ">"
206 else:
207 self.empty_element_suffix = " />"
208 self.tab_width = tab_width
209
210 # For compatibility with earlier markdown2.py and with
211 # markdown.py's safe_mode being a boolean,
212 # safe_mode == True -> "replace"
213 if safe_mode is True:
214 self.safe_mode = "replace"
215 else:
216 self.safe_mode = safe_mode
217
218 # Massaging and building the "extras" info.
219 if self.extras is None:
220 self.extras = {}
221 elif not isinstance(self.extras, dict):
222 self.extras = dict([(e, None) for e in self.extras])
223 if extras:
224 if not isinstance(extras, dict):
225 extras = dict([(e, None) for e in extras])
226 self.extras.update(extras)
227 assert isinstance(self.extras, dict)
228 if "toc" in self.extras and not "header-ids" in self.extras:
229 self.extras["header-ids"] = None # "toc" implies "header-ids"
230 self._instance_extras = self.extras.copy()
231
232 self.link_patterns = link_patterns
233 self.use_file_vars = use_file_vars
234 self._outdent_re = re.compile(r'^(\t|[ ]{1,%d})' % tab_width, re.M)
235
236 self._escape_table = g_escape_table.copy()
237 if "smarty-pants" in self.extras:
238 self._escape_table['"'] = _hash_text('"')
239 self._escape_table["'"] = _hash_text("'")
240
241 def reset(self):
242 self.urls = {}
243 self.titles = {}
244 self.html_blocks = {}
245 self.html_spans = {}
246 self.list_level = 0
247 self.extras = self._instance_extras.copy()
248 if "footnotes" in self.extras:
249 self.footnotes = {}
250 self.footnote_ids = []
251 if "header-ids" in self.extras:
252 self._count_from_header_id = {} # no `defaultdict` in Python 2.4
253 if "metadata" in self.extras:
254 self.metadata = {}
255
256 # Per <https://developer.mozilla.org/en-US/docs/HTML/Element/a> "rel"
257 # should only be used in <a> tags with an "href" attribute.
258 _a_nofollow = re.compile(r"<(a)([^>]*href=)", re.IGNORECASE)
259
260 def convert(self, text):
261 """Convert the given text."""
262 # Main function. The order in which other subs are called here is
263 # essential. Link and image substitutions need to happen before
264 # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
265 # and <img> tags get encoded.
266
267 # Clear the global hashes. If we don't clear these, you get conflicts
268 # from other articles when generating a page which contains more than
269 # one article (e.g. an index page that shows the N most recent
270 # articles):
271 self.reset()
272
273 if not isinstance(text, unicode):
274 #TODO: perhaps shouldn't presume UTF-8 for string input?
275 text = unicode(text, 'utf-8')
276
277 if self.use_file_vars:
278 # Look for emacs-style file variable hints.
279 emacs_vars = self._get_emacs_vars(text)
280 if "markdown-extras" in emacs_vars:
281 splitter = re.compile("[ ,]+")
282 for e in splitter.split(emacs_vars["markdown-extras"]):
283 if '=' in e:
284 ename, earg = e.split('=', 1)
285 try:
286 earg = int(earg)
287 except ValueError:
288 pass
289 else:
290 ename, earg = e, None
291 self.extras[ename] = earg
292
293 # Standardize line endings:
294 text = re.sub("\r\n|\r", "\n", text)
295
296 # Make sure $text ends with a couple of newlines:
297 text += "\n\n"
298
299 # Convert all tabs to spaces.
300 text = self._detab(text)
301
302 # Strip any lines consisting only of spaces and tabs.
303 # This makes subsequent regexen easier to write, because we can
304 # match consecutive blank lines with /\n+/ instead of something
305 # contorted like /[ \t]*\n+/ .
306 text = self._ws_only_line_re.sub("", text)
307
308 # strip metadata from head and extract
309 if "metadata" in self.extras:
310 text = self._extract_metadata(text)
311
312 text = self.preprocess(text)
313
314 if "fenced-code-blocks" in self.extras and not self.safe_mode:
315 text = self._do_fenced_code_blocks(text)
316
317 if self.safe_mode:
318 text = self._hash_html_spans(text)
319
320 # Turn block-level HTML blocks into hash entries
321 text = self._hash_html_blocks(text, raw=True)
322
323 if "fenced-code-blocks" in self.extras and self.safe_mode:
324 text = self._do_fenced_code_blocks(text)
325
326 # Strip link definitions, store in hashes.
327 if "footnotes" in self.extras:
328 # Must do footnotes first because an unlucky footnote defn
329 # looks like a link defn:
330 # [^4]: this "looks like a link defn"
331 text = self._strip_footnote_definitions(text)
332 text = self._strip_link_definitions(text)
333
334 text = self._run_block_gamut(text)
335
336 if "footnotes" in self.extras:
337 text = self._add_footnotes(text)
338
339 text = self.postprocess(text)
340
341 text = self._unescape_special_chars(text)
342
343 if self.safe_mode:
344 text = self._unhash_html_spans(text)
345
346 if "nofollow" in self.extras:
347 text = self._a_nofollow.sub(r'<\1 rel="nofollow"\2', text)
348
349 text += "\n"
350
351 rv = UnicodeWithAttrs(text)
352 if "toc" in self.extras:
353 rv._toc = self._toc
354 if "metadata" in self.extras:
355 rv.metadata = self.metadata
356 return rv
357
358 def postprocess(self, text):
359 """A hook for subclasses to do some postprocessing of the html, if
360 desired. This is called before unescaping of special chars and
361 unhashing of raw HTML spans.
362 """
363 return text
364
365 def preprocess(self, text):
366 """A hook for subclasses to do some preprocessing of the Markdown, if
367 desired. This is called after basic formatting of the text, but prior
368 to any extras, safe mode, etc. processing.
369 """
370 return text
371
372 # Is metadata if the content starts with '---'-fenced `key: value`
373 # pairs. E.g. (indented for presentation):
374 # ---
375 # foo: bar
376 # another-var: blah blah
377 # ---
378 _metadata_pat = re.compile("""^---[ \t]*\n((?:[ \t]*[^ \t:]+[ \t]*:[^\n]*\n)+)---[ \t]*\n""")
379
380 def _extract_metadata(self, text):
381 # fast test
382 if not text.startswith("---"):
383 return text
384 match = self._metadata_pat.match(text)
385 if not match:
386 return text
387
388 tail = text[len(match.group(0)):]
389 metadata_str = match.group(1).strip()
390 for line in metadata_str.split('\n'):
391 key, value = line.split(':', 1)
392 self.metadata[key.strip()] = value.strip()
393
394 return tail
395
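# A quick sketch of the metadata extra handled above (illustrative values):
#
#   >>> md = Markdown(extras=["metadata"])
#   >>> html = md.convert("---\ntitle: Test\n---\nBody text.\n")
#   >>> md.metadata
#   {'title': 'Test'}
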
396
397 _emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
398 # This regular expression is intended to match blocks like this:
399 # PREFIX Local Variables: SUFFIX
400 # PREFIX mode: Tcl SUFFIX
401 # PREFIX End: SUFFIX
402 # Some notes:
403 # - "[ \t]" is used instead of "\s" to specifically exclude newlines
404 # - "(\r\n|\n|\r)" is used instead of "$" because the sre engine does
405 # not like anything other than Unix-style line terminators.
406 _emacs_local_vars_pat = re.compile(r"""^
407 (?P<prefix>(?:[^\r\n|\n|\r])*?)
408 [\ \t]*Local\ Variables:[\ \t]*
409 (?P<suffix>.*?)(?:\r\n|\n|\r)
410 (?P<content>.*?\1End:)
411 """, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)
412
413 def _get_emacs_vars(self, text):
414 """Return a dictionary of emacs-style local variables.
415
416 Parsing is done loosely according to this spec (and according to
417 some in-practice deviations from this):
418 http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables
419 """
420 emacs_vars = {}
421 SIZE = pow(2, 13) # 8kB
422
423 # Search near the start for a '-*-'-style one-liner of variables.
424 head = text[:SIZE]
425 if "-*-" in head:
426 match = self._emacs_oneliner_vars_pat.search(head)
427 if match:
428 emacs_vars_str = match.group(1)
429 assert '\n' not in emacs_vars_str
430 emacs_var_strs = [s.strip() for s in emacs_vars_str.split(';')
431 if s.strip()]
432 if len(emacs_var_strs) == 1 and ':' not in emacs_var_strs[0]:
433 # While not in the spec, this form is allowed by emacs:
434 # -*- Tcl -*-
435 # where the implied "variable" is "mode". This form
436 # is only allowed if there are no other variables.
437 emacs_vars["mode"] = emacs_var_strs[0].strip()
438 else:
439 for emacs_var_str in emacs_var_strs:
440 try:
441 variable, value = emacs_var_str.strip().split(':', 1)
442 except ValueError:
443 log.debug("emacs variables error: malformed -*- "
444 "line: %r", emacs_var_str)
445 continue
446 # Lowercase the variable name because Emacs allows "Mode"
447 # or "mode" or "MoDe", etc.
448 emacs_vars[variable.lower()] = value.strip()
449
450 tail = text[-SIZE:]
451 if "Local Variables" in tail:
452 match = self._emacs_local_vars_pat.search(tail)
453 if match:
454 prefix = match.group("prefix")
455 suffix = match.group("suffix")
456 lines = match.group("content").splitlines(0)
457 #print "prefix=%r, suffix=%r, content=%r, lines: %s"\
458 # % (prefix, suffix, match.group("content"), lines)
459
460 # Validate the Local Variables block: proper prefix and suffix
461 # usage.
462 for i, line in enumerate(lines):
463 if not line.startswith(prefix):
464 log.debug("emacs variables error: line '%s' "
465 "does not use proper prefix '%s'"
466 % (line, prefix))
467 return {}
468 # Don't validate suffix on last line. Emacs doesn't care,
469 # neither should we.
470 if i != len(lines)-1 and not line.endswith(suffix):
471 log.debug("emacs variables error: line '%s' "
472 "does not use proper suffix '%s'"
473 % (line, suffix))
474 return {}
475
476 # Parse out one emacs var per line.
477 continued_for = None
478 for line in lines[:-1]: # no var on the last line ("PREFIX End:")
479 if prefix: line = line[len(prefix):] # strip prefix
480 if suffix: line = line[:-len(suffix)] # strip suffix
481 line = line.strip()
482 if continued_for:
483 variable = continued_for
484 if line.endswith('\\'):
485 line = line[:-1].rstrip()
486 else:
487 continued_for = None
488 emacs_vars[variable] += ' ' + line
489 else:
490 try:
491 variable, value = line.split(':', 1)
492 except ValueError:
493 log.debug("local variables error: missing colon "
494 "in local variables entry: '%s'" % line)
495 continue
496 # Do NOT lowercase the variable name, because Emacs only
497 # allows "mode" (and not "Mode", "MoDe", etc.) in this block.
498 value = value.strip()
499 if value.endswith('\\'):
500 value = value[:-1].rstrip()
501 continued_for = variable
502 else:
503 continued_for = None
504 emacs_vars[variable] = value
505
506 # Unquote values.
507 for var, val in list(emacs_vars.items()):
508 if len(val) > 1 and (val.startswith('"') and val.endswith('"')
509 or val.startswith("'") and val.endswith("'")):
510 emacs_vars[var] = val[1:-1]
511
512 return emacs_vars
513
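# Sketch: with use_file_vars=True, a document can switch on extras itself
# via an emacs-style file-variables line (the input below is an invented
# example):
#
#   >>> text = "<!-- -*- markdown-extras: wiki-tables -*- -->\nHello\n"
#   >>> html = Markdown(use_file_vars=True).convert(text)  # enables "wiki-tables"
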
514 # Cribbed from a post by Bart Lateur:
515 # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
516 _detab_re = re.compile(r'(.*?)\t', re.M)
517 def _detab_sub(self, match):
518 g1 = match.group(1)
519 return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width))
520 def _detab(self, text):
521 r"""Remove (leading?) tabs from a file.
522
523 >>> m = Markdown()
524 >>> m._detab("\tfoo")
525 ' foo'
526 >>> m._detab(" \tfoo")
527 ' foo'
528 >>> m._detab("\t foo")
529 ' foo'
530 >>> m._detab(" foo")
531 ' foo'
532 >>> m._detab(" foo\n\tbar\tblam")
533 ' foo\n bar blam'
534 """
535 if '\t' not in text:
536 return text
537 return self._detab_re.subn(self._detab_sub, text)[0]
538
539 # I broke out the html5 tags here and added them to _block_tags_a and
540 # _block_tags_b. This way html5 tags are easy to keep track of.
541 _html5tags = '|article|aside|header|hgroup|footer|nav|section|figure|figcaption'
542
543 _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del'
544 _block_tags_a += _html5tags
545
546 _strict_tag_block_re = re.compile(r"""
547 ( # save in \1
548 ^ # start of line (with re.M)
549 <(%s) # start tag = \2
550 \b # word break
551 (.*\n)*? # any number of lines, minimally matching
552 </\2> # the matching end tag
553 [ \t]* # trailing spaces/tabs
554 (?=\n+|\Z) # followed by a newline or end of document
555 )
556 """ % _block_tags_a,
557 re.X | re.M)
558
559 _block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math'
560 _block_tags_b += _html5tags
561
562 _liberal_tag_block_re = re.compile(r"""
563 ( # save in \1
564 ^ # start of line (with re.M)
565 <(%s) # start tag = \2
566 \b # word break
567 (.*\n)*? # any number of lines, minimally matching
568 .*</\2> # the matching end tag
569 [ \t]* # trailing spaces/tabs
570 (?=\n+|\Z) # followed by a newline or end of document
571 )
572 """ % _block_tags_b,
573 re.X | re.M)
574
575 _html_markdown_attr_re = re.compile(
576 r'''\s+markdown=("1"|'1')''')
577 def _hash_html_block_sub(self, match, raw=False):
578 html = match.group(1)
579 if raw and self.safe_mode:
580 html = self._sanitize_html(html)
581 elif 'markdown-in-html' in self.extras and 'markdown=' in html:
582 first_line = html.split('\n', 1)[0]
583 m = self._html_markdown_attr_re.search(first_line)
584 if m:
585 lines = html.split('\n')
586 middle = '\n'.join(lines[1:-1])
587 last_line = lines[-1]
588 first_line = first_line[:m.start()] + first_line[m.end():]
589 f_key = _hash_text(first_line)
590 self.html_blocks[f_key] = first_line
591 l_key = _hash_text(last_line)
592 self.html_blocks[l_key] = last_line
593 return ''.join(["\n\n", f_key,
594 "\n\n", middle, "\n\n",
595 l_key, "\n\n"])
596 key = _hash_text(html)
597 self.html_blocks[key] = html
598 return "\n\n" + key + "\n\n"
599
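# Illustrative input for the markdown-in-html branch above (a sketch):
#
#   >>> markdown2.markdown('<div markdown="1">\n**bold** in a block tag\n</div>',
#   ...                    extras=["markdown-in-html"])
#
# The first and last lines of the <div> are hashed away as raw HTML while
# the middle line stays visible to normal Markdown processing.
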
600 def _hash_html_blocks(self, text, raw=False):
601 """Hashify HTML blocks
602
603 We only want to do this for block-level HTML tags, such as headers,
604 lists, and tables. That's because we still want to wrap <p>s around
605 "paragraphs" that are wrapped in non-block-level tags, such as anchors,
606 phrase emphasis, and spans. The list of tags we're looking for is
607 hard-coded.
608
609 @param raw {boolean} indicates if these are raw HTML blocks in
610 the original source. It makes a difference in "safe" mode.
611 """
612 if '<' not in text:
613 return text
614
615 # Pass `raw` value into our calls to self._hash_html_block_sub.
616 hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw)
617
618 # First, look for nested blocks, e.g.:
619 # <div>
620 # <div>
621 # tags for inner block must be indented.
622 # </div>
623 # </div>
624 #
625 # The outermost tags must start at the left margin for this to match, and
626 # the inner nested divs must be indented.
627 # We need to do this before the next, more liberal match, because the next
628 # match will start at the first `<div>` and stop at the first `</div>`.
629 text = self._strict_tag_block_re.sub(hash_html_block_sub, text)
630
631 # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
632 text = self._liberal_tag_block_re.sub(hash_html_block_sub, text)
633
634 # Special case just for <hr />. It was easier to make a special
635 # case than to make the other regex more complicated.
636 if "<hr" in text:
637 _hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width)
638 text = _hr_tag_re.sub(hash_html_block_sub, text)
639
640 # Special case for standalone HTML comments:
641 if "<!--" in text:
642 start = 0
643 while True:
644 # Delimiters for next comment block.
645 try:
646 start_idx = text.index("<!--", start)
647 except ValueError:
648 break
649 try:
650 end_idx = text.index("-->", start_idx) + 3
651 except ValueError:
652 break
653
654 # Start position for next comment block search.
655 start = end_idx
656
657 # Validate whitespace before comment.
658 if start_idx:
659 # - Up to `tab_width - 1` spaces before start_idx.
660 for i in range(self.tab_width - 1):
661 if text[start_idx - 1] != ' ':
662 break
663 start_idx -= 1
664 if start_idx == 0:
665 break
666 # - Must be preceded by 2 newlines or hit the start of
667 # the document.
668 if start_idx == 0:
669 pass
670 elif start_idx == 1 and text[0] == '\n':
671 start_idx = 0 # to match minute detail of Markdown.pl regex
672 elif text[start_idx-2:start_idx] == '\n\n':
673 pass
674 else:
675 break
676
677 # Validate whitespace after comment.
678 # - Any number of spaces and tabs.
679 while end_idx < len(text):
680 if text[end_idx] not in ' \t':
681 break
682 end_idx += 1
683 # - Must be followed by 2 newlines or hit end of text.
684 if text[end_idx:end_idx+2] not in ('', '\n', '\n\n'):
685 continue
686
687 # Escape and hash (must match `_hash_html_block_sub`).
688 html = text[start_idx:end_idx]
689 if raw and self.safe_mode:
690 html = self._sanitize_html(html)
691 key = _hash_text(html)
692 self.html_blocks[key] = html
693 text = text[:start_idx] + "\n\n" + key + "\n\n" + text[end_idx:]
694
695 if "xml" in self.extras:
696 # Treat XML processing instructions and namespaced one-liner
697 # tags as if they were block HTML tags. E.g., if standalone
698 # (i.e. are their own paragraph), the following do not get
699 # wrapped in a <p> tag:
700 # <?foo bar?>
701 #
702 # <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="chapter_1.md"/>
703 _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width)
704 text = _xml_oneliner_re.sub(hash_html_block_sub, text)
705
706 return text
707
708 def _strip_link_definitions(self, text):
709 # Strips link definitions from text, stores the URLs and titles in
710 # hash references.
711 less_than_tab = self.tab_width - 1
712
713 # Link defs are in the form:
714 # [id]: url "optional title"
715 _link_def_re = re.compile(r"""
716 ^[ ]{0,%d}\[(.+)\]: # id = \1
717 [ \t]*
718 \n? # maybe *one* newline
719 [ \t]*
720 <?(.+?)>? # url = \2
721 [ \t]*
722 (?:
723 \n? # maybe one newline
724 [ \t]*
725 (?<=\s) # lookbehind for whitespace
726 ['"(]
727 ([^\n]*) # title = \3
728 ['")]
729 [ \t]*
730 )? # title is optional
731 (?:\n+|\Z)
732 """ % less_than_tab, re.X | re.M | re.U)
733 return _link_def_re.sub(self._extract_link_def_sub, text)
734
735 def _extract_link_def_sub(self, match):
736 id, url, title = match.groups()
737 key = id.lower() # Link IDs are case-insensitive
738 self.urls[key] = self._encode_amps_and_angles(url)
739 if title:
740 self.titles[key] = title
741 return ""
742
743 def _extract_footnote_def_sub(self, match):
744 id, text = match.groups()
745 text = _dedent(text, skip_first_line=not text.startswith('\n')).strip()
746 normed_id = re.sub(r'\W', '-', id)
747 # Ensure footnote text ends with a couple newlines (for some
748 # block gamut matches).
749 self.footnotes[normed_id] = text + "\n\n"
750 return ""
751
752 def _strip_footnote_definitions(self, text):
753 """A footnote definition looks like this:
754
755 [^note-id]: Text of the note.
756
757 May include one or more indented paragraphs.
758
759 Where,
760 - The 'note-id' can be pretty much anything, though typically it
761 is the number of the footnote.
762 - The first paragraph may start on the next line, like so:
763
764 [^note-id]:
765 Text of the note.
766 """
767 less_than_tab = self.tab_width - 1
768 footnote_def_re = re.compile(r'''
769 ^[ ]{0,%d}\[\^(.+)\]: # id = \1
770 [ \t]*
771 ( # footnote text = \2
772 # First line need not start with the spaces.
773 (?:\s*.*\n+)
774 (?:
775 (?:[ ]{%d} | \t) # Subsequent lines must be indented.
776 .*\n+
777 )*
778 )
779 # Lookahead for non-space at line-start, or end of doc.
780 (?:(?=^[ ]{0,%d}\S)|\Z)
781 ''' % (less_than_tab, self.tab_width, self.tab_width),
782 re.X | re.M)
783 return footnote_def_re.sub(self._extract_footnote_def_sub, text)
784
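# Footnote syntax stripped above, sketched with made-up content:
#
#   >>> markdown2.markdown("Fact.[^1]\n\n[^1]: Supporting note.\n",
#   ...                    extras=["footnotes"])
#
# The reference becomes a <sup class="footnote-ref"> link (see _do_links)
# and the stored definitions are rendered later by _add_footnotes().
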
785 _hr_re = re.compile(r'^[ ]{0,3}([-_*][ ]{0,2}){3,}$', re.M)
786
787 def _run_block_gamut(self, text):
788 # These are all the transformations that form block-level
789 # tags like paragraphs, headers, and list items.
790
791 if "fenced-code-blocks" in self.extras:
792 text = self._do_fenced_code_blocks(text)
793
794 text = self._do_headers(text)
795
796 # Do Horizontal Rules:
797 # On the number of spaces in horizontal rules: The spec is fuzzy: "If
798 # you wish, you may use spaces between the hyphens or asterisks."
799 # Markdown.pl 1.0.1's hr regexes limit the number of spaces between the
800 # hr chars to one or two. We'll reproduce that limit here.
801 hr = "\n<hr"+self.empty_element_suffix+"\n"
802 text = re.sub(self._hr_re, hr, text)
803
804 text = self._do_lists(text)
805
806 if "pyshell" in self.extras:
807 text = self._prepare_pyshell_blocks(text)
808 if "wiki-tables" in self.extras:
809 text = self._do_wiki_tables(text)
810 if "tables" in self.extras:
811 text = self._do_tables(text)
812
813 text = self._do_code_blocks(text)
814
815 text = self._do_block_quotes(text)
816
817 # We already ran _HashHTMLBlocks() before, in Markdown(), but that
818 # was to escape raw HTML in the original Markdown source. This time,
819 # we're escaping the markup we've just created, so that we don't wrap
820 # <p> tags around block-level tags.
821 text = self._hash_html_blocks(text)
822
823 text = self._form_paragraphs(text)
824
825 return text
826
827 def _pyshell_block_sub(self, match):
828 lines = match.group(0).splitlines(0)
829 _dedentlines(lines)
830 indent = ' ' * self.tab_width
831 s = ('\n' # separate from possible cuddled paragraph
832 + indent + ('\n'+indent).join(lines)
833 + '\n\n')
834 return s
835
836 def _prepare_pyshell_blocks(self, text):
837 """Ensure that Python interactive shell sessions are put in
838 code blocks -- even if not properly indented.
839 """
840 if ">>>" not in text:
841 return text
842
843 less_than_tab = self.tab_width - 1
844 _pyshell_block_re = re.compile(r"""
845 ^([ ]{0,%d})>>>[ ].*\n # first line
846 ^(\1.*\S+.*\n)* # any number of subsequent lines
847 ^\n # ends with a blank line
848 """ % less_than_tab, re.M | re.X)
849
850 return _pyshell_block_re.sub(self._pyshell_block_sub, text)
851
852 def _table_sub(self, match):
853 head, underline, body = match.groups()
854
855 # Determine aligns for columns.
856 cols = [cell.strip() for cell in underline.strip('| \t\n').split('|')]
857 align_from_col_idx = {}
858 for col_idx, col in enumerate(cols):
859 if col[0] == ':' and col[-1] == ':':
860 align_from_col_idx[col_idx] = ' align="center"'
861 elif col[0] == ':':
862 align_from_col_idx[col_idx] = ' align="left"'
863 elif col[-1] == ':':
864 align_from_col_idx[col_idx] = ' align="right"'
865
866 # thead
867 hlines = ['<table>', '<thead>', '<tr>']
868 cols = [cell.strip() for cell in head.strip('| \t\n').split('|')]
869 for col_idx, col in enumerate(cols):
870 hlines.append(' <th%s>%s</th>' % (
871 align_from_col_idx.get(col_idx, ''),
872 self._run_span_gamut(col)
873 ))
874 hlines.append('</tr>')
875 hlines.append('</thead>')
876
877 # tbody
878 hlines.append('<tbody>')
879 for line in body.strip('\n').split('\n'):
880 hlines.append('<tr>')
881 cols = [cell.strip() for cell in line.strip('| \t\n').split('|')]
882 for col_idx, col in enumerate(cols):
883 hlines.append(' <td%s>%s</td>' % (
884 align_from_col_idx.get(col_idx, ''),
885 self._run_span_gamut(col)
886 ))
887 hlines.append('</tr>')
888 hlines.append('</tbody>')
889 hlines.append('</table>')
890
891 return '\n'.join(hlines) + '\n'
892
893 def _do_tables(self, text):
894 """Copying PHP-Markdown and GFM table syntax. Some regex borrowed from
895 https://github.com/michelf/php-markdown/blob/lib/Michelf/Markdown.php#L2538
896 """
897 less_than_tab = self.tab_width - 1
898 table_re = re.compile(r'''
899 (?:(?<=\n\n)|\A\n?) # leading blank line
900
901 ^[ ]{0,%d} # allowed whitespace
902 (.*[|].*) \n # $1: header row (at least one pipe)
903
904 ^[ ]{0,%d} # allowed whitespace
905 ( # $2: underline row
906 # underline row with leading bar
907 (?: \|\ *:?-+:?\ * )+ \|? \n
908 |
909 # or, underline row without leading bar
910 (?: \ *:?-+:?\ *\| )+ (?: \ *:?-+:?\ * )? \n
911 )
912
913 ( # $3: data rows
914 (?:
915 ^[ ]{0,%d}(?!\ ) # ensure line begins with 0 to less_than_tab spaces
916 .*\|.* \n
917 )+
918 )
919 ''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X)
920 return table_re.sub(self._table_sub, text)
921
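# Shape of the GFM/PHP-Markdown table targeted by the regex above (made-up
# data; the colons in the underline row drive the align attributes set in
# _table_sub):
#
#   >>> table = "| Name | Value |\n|:-----|------:|\n| a | 1 |\n"
#   >>> markdown2.markdown(table, extras=["tables"])
#
# yields <th align="left">/<th align="right"> and matching <td> cells.
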
922 def _wiki_table_sub(self, match):
923 ttext = match.group(0).strip()
924 #print 'wiki table: %r' % match.group(0)
925 rows = []
926 for line in ttext.splitlines(0):
927 line = line.strip()[2:-2].strip()
928 row = [c.strip() for c in re.split(r'(?<!\\)\|\|', line)]
929 rows.append(row)
930 #pprint(rows)
931 hlines = ['<table>', '<tbody>']
932 for row in rows:
933 hrow = ['<tr>']
934 for cell in row:
935 hrow.append('<td>')
936 hrow.append(self._run_span_gamut(cell))
937 hrow.append('</td>')
938 hrow.append('</tr>')
939 hlines.append(''.join(hrow))
940 hlines += ['</tbody>', '</table>']
941 return '\n'.join(hlines) + '\n'
942
943 def _do_wiki_tables(self, text):
944 # Optimization.
945 if "||" not in text:
946 return text
947
948 less_than_tab = self.tab_width - 1
949 wiki_table_re = re.compile(r'''
950 (?:(?<=\n\n)|\A\n?) # leading blank line
951 ^([ ]{0,%d})\|\|.+?\|\|[ ]*\n # first line
952 (^\1\|\|.+?\|\|\n)* # any number of subsequent lines
953 ''' % less_than_tab, re.M | re.X)
954 return wiki_table_re.sub(self._wiki_table_sub, text)
955
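# Google Code wiki-table shape for the regex above (illustrative):
#
#   >>> markdown2.markdown("||cell A||cell B||\n||1||2||\n",
#   ...                    extras=["wiki-tables"])
#
# Each ||-delimited cell becomes a <td>; note there is no header row.
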
956 def _run_span_gamut(self, text):
957 # These are all the transformations that occur *within* block-level
958 # tags like paragraphs, headers, and list items.
959
960 text = self._do_code_spans(text)
961
962 text = self._escape_special_chars(text)
963
964 # Process anchor and image tags.
965 text = self._do_links(text)
966
967 # Make links out of things like `<http://example.com/>`
968 # Must come after _do_links(), because you can use < and >
969 # delimiters in inline links like [this](<url>).
970 text = self._do_auto_links(text)
971
972 if "link-patterns" in self.extras:
973 text = self._do_link_patterns(text)
974
975 text = self._encode_amps_and_angles(text)
976
977 text = self._do_italics_and_bold(text)
978
979 if "smarty-pants" in self.extras:
980 text = self._do_smart_punctuation(text)
981
982 # Do hard breaks:
983 if "break-on-newline" in self.extras:
984 text = re.sub(r" *\n", "<br%s\n" % self.empty_element_suffix, text)
985 else:
986 text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)
987
988 return text
989
990 # "Sorta" because auto-links are identified as "tag" tokens.
991 _sorta_html_tokenize_re = re.compile(r"""
992 (
993 # tag
994 </?
995 (?:\w+) # tag name
996 (?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))* # attributes
997 \s*/?>
998 |
999 # auto-link (e.g., <http://www.activestate.com/>)
1000 <\w+[^>]*>
1001 |
1002 <!--.*?--> # comment
1003 |
1004 <\?.*?\?> # processing instruction
1005 )
1006 """, re.X)
1007
1008 def _escape_special_chars(self, text):
1009 # Python markdown note: the HTML tokenization here differs from
1010 # that in Markdown.pl, hence the behaviour for subtle cases can
1011 # differ (I believe the tokenizer here does a better job because
1012 # it isn't susceptible to unmatched '<' and '>' in HTML tags).
1013 # Note, however, that '>' is not allowed in an auto-link URL
1014 # here.
1015 escaped = []
1016 is_html_markup = False
1017 for token in self._sorta_html_tokenize_re.split(text):
1018 if is_html_markup:
1019 # Within tags/HTML-comments/auto-links, encode * and _
1020 # so they don't conflict with their use in Markdown for
1021 # italics and strong. We're replacing each such
1022 # character with its corresponding MD5 checksum value;
1023 # this is likely overkill, but it should prevent us from
1024 # colliding with the escape values by accident.
1025 escaped.append(token.replace('*', self._escape_table['*'])
1026 .replace('_', self._escape_table['_']))
1027 else:
1028 escaped.append(self._encode_backslash_escapes(token))
1029 is_html_markup = not is_html_markup
1030 return ''.join(escaped)
1031
1032 def _hash_html_spans(self, text):
1033 # Used for safe_mode.
1034
1035 def _is_auto_link(s):
1036 if ':' in s and self._auto_link_re.match(s):
1037 return True
1038 elif '@' in s and self._auto_email_link_re.match(s):
1039 return True
1040 return False
1041
1042 tokens = []
1043 is_html_markup = False
1044 for token in self._sorta_html_tokenize_re.split(text):
1045 if is_html_markup and not _is_auto_link(token):
1046 sanitized = self._sanitize_html(token)
1047 key = _hash_text(sanitized)
1048 self.html_spans[key] = sanitized
1049 tokens.append(key)
1050 else:
1051 tokens.append(token)
1052 is_html_markup = not is_html_markup
1053 return ''.join(tokens)
1054
1055 def _unhash_html_spans(self, text):
1056 for key, sanitized in list(self.html_spans.items()):
1057 text = text.replace(key, sanitized)
1058 return text
1059
1060 def _sanitize_html(self, s):
1061 if self.safe_mode == "replace":
1062 return self.html_removed_text
1063 elif self.safe_mode == "escape":
1064 replacements = [
1065 ('&', '&amp;'),
1066 ('<', '&lt;'),
1067 ('>', '&gt;'),
1068 ]
1069 for before, after in replacements:
1070 s = s.replace(before, after)
1071 return s
1072 else:
1073 raise MarkdownError("invalid value for 'safe_mode': %r (must be "
1074 "'escape' or 'replace')" % self.safe_mode)
1075
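# Sketch of the two safe_mode behaviours implemented above:
#
#   >>> markdown2.markdown("<script>evil()</script>", safe_mode="escape")
#   # raw HTML is entity-escaped: &lt;script&gt;...
#   >>> markdown2.markdown("<script>evil()</script>", safe_mode="replace")
#   # raw HTML is replaced with "[HTML_REMOVED]"
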
1076 _inline_link_title = re.compile(r'''
1077 ( # \1
1078 [ \t]+
1079 (['"]) # quote char = \2
1080 (?P<title>.*?)
1081 \2
1082 )? # title is optional
1083 \)$
1084 ''', re.X | re.S)
1085 _tail_of_reference_link_re = re.compile(r'''
1086 # Match tail of: [text][id]
1087 [ ]? # one optional space
1088 (?:\n[ ]*)? # one optional newline followed by spaces
1089 \[
1090 (?P<id>.*?)
1091 \]
1092 ''', re.X | re.S)
1093
1094 _whitespace = re.compile(r'\s*')
1095
1096 _strip_anglebrackets = re.compile(r'<(.*)>.*')
1097
1098 def _find_non_whitespace(self, text, start):
1099 """Returns the index of the first non-whitespace character in text
1100 after (and including) start
1101 """
1102 match = self._whitespace.match(text, start)
1103 return match.end()
1104
1105 def _find_balanced(self, text, start, open_c, close_c):
1106 """Returns the index where the open_c and close_c characters balance
1107 out - the same number of open_c and close_c are encountered - or the
1108 end of string if it's reached before the balance point is found.
1109 """
1110 i = start
1111 l = len(text)
1112 count = 1
1113 while count > 0 and i < l:
1114 if text[i] == open_c:
1115 count += 1
1116 elif text[i] == close_c:
1117 count -= 1
1118 i += 1
1119 return i
1120
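# For example (a sketch): in text = "(a (b) c) tail", calling
# _find_balanced(text, 1, "(", ")") returns 9, the index just past the
# ')' that balances the '(' at index 0.
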
1121 def _extract_url_and_title(self, text, start):
1122 """Extracts the url and (optional) title from the tail of a link"""
1123 # text[start] equals the opening parenthesis
1124 idx = self._find_non_whitespace(text, start+1)
1125 if idx == len(text):
1126 return None, None, None
1127 end_idx = idx
1128 has_anglebrackets = text[idx] == "<"
1129 if has_anglebrackets:
1130 end_idx = self._find_balanced(text, end_idx+1, "<", ">")
1131 end_idx = self._find_balanced(text, end_idx, "(", ")")
1132 match = self._inline_link_title.search(text, idx, end_idx)
1133 if not match:
1134 return None, None, None
1135 url, title = text[idx:match.start()], match.group("title")
1136 if has_anglebrackets:
1137 url = self._strip_anglebrackets.sub(r'\1', url)
1138 return url, title, end_idx
1139
1140 def _do_links(self, text):
1141 """Turn Markdown link shortcuts into XHTML <a> and <img> tags.
1142
1143 This is a combination of Markdown.pl's _DoAnchors() and
1144 _DoImages(). They are done together because that simplified the
1145 approach. It was necessary to use a different approach than
1146 Markdown.pl because of the lack of atomic matching support in
1147 Python's regex engine, which Markdown.pl's $g_nested_brackets relies on.
1148 """
1149 MAX_LINK_TEXT_SENTINEL = 3000 # markdown2 issue 24
1150
1151 # `anchor_allowed_pos` is used to support img links inside
1152 # anchors, but not anchors inside anchors. An anchor's start
1153 # pos must be `>= anchor_allowed_pos`.
1154 anchor_allowed_pos = 0
1155
1156 curr_pos = 0
1157 while True: # Handle the next link.
1158 # The next '[' is the start of:
1159 # - an inline anchor: [text](url "title")
1160 # - a reference anchor: [text][id]
1161 # - an inline img: ![text](url "title")
1162 # - a reference img: ![text][id]
1163 # - a footnote ref: [^id]
1164 # (Only if 'footnotes' extra enabled)
1165 # - a footnote defn: [^id]: ...
1166 # (Only if 'footnotes' extra enabled) These have already
1167 # been stripped in _strip_footnote_definitions() so no
1168 # need to watch for them.
1169 # - a link definition: [id]: url "title"
1170 # These have already been stripped in
1171 # _strip_link_definitions() so no need to watch for them.
1172 # - not markup: [...anything else...
1173 try:
1174 start_idx = text.index('[', curr_pos)
1175 except ValueError:
1176 break
1177 text_length = len(text)
1178
1179 # Find the matching closing ']'.
1180 # Markdown.pl allows *matching* brackets in link text so we
1181 # will here too. Markdown.pl *doesn't* currently allow
1182 # matching brackets in img alt text -- we'll differ in that
1183 # regard.
1184 bracket_depth = 0
1185 for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
1186 text_length)):
1187 ch = text[p]
1188 if ch == ']':
1189 bracket_depth -= 1
1190 if bracket_depth < 0:
1191 break
1192 elif ch == '[':
1193 bracket_depth += 1
1194 else:
1195 # Closing bracket not found within sentinel length.
1196 # This isn't markup.
1197 curr_pos = start_idx + 1
1198 continue
1199 link_text = text[start_idx+1:p]
1200
1201 # Possibly a footnote ref?
1202 if "footnotes" in self.extras and link_text.startswith("^"):
1203 normed_id = re.sub(r'\W', '-', link_text[1:])
1204 if normed_id in self.footnotes:
1205 self.footnote_ids.append(normed_id)
1206 result = '<sup class="footnote-ref" id="fnref-%s">' \
1207 '<a href="#fn-%s">%s</a></sup>' \
1208 % (normed_id, normed_id, len(self.footnote_ids))
1209 text = text[:start_idx] + result + text[p+1:]
1210 else:
1211 # This id isn't defined, leave the markup alone.
1212 curr_pos = p+1
1213 continue
1214
1215 # Now determine what this is by the remainder.
1216 p += 1
1217 if p == text_length:
1218 return text
1219
1220 # Inline anchor or img?
1221 if text[p] == '(': # attempt at perf improvement
1222 url, title, url_end_idx = self._extract_url_and_title(text, p)
1223 if url is not None:
1224 # Handle an inline anchor or img.
1225 is_img = start_idx > 0 and text[start_idx-1] == "!"
1226 if is_img:
1227 start_idx -= 1
1228
1229 # We've got to encode these to avoid conflicting
1230 # with italics/bold.
1231 url = url.replace('*', self._escape_table['*']) \
1232 .replace('_', self._escape_table['_'])
1233 if title:
1234 title_str = ' title="%s"' % (
1235 _xml_escape_attr(title)
1236 .replace('*', self._escape_table['*'])
1237 .replace('_', self._escape_table['_']))
1238 else:
1239 title_str = ''
1240 if is_img:
1241 img_class_str = self._html_class_str_from_tag("img")
1242 result = '<img src="%s" alt="%s"%s%s%s' \
1243 % (url.replace('"', '&quot;'),
1244 _xml_escape_attr(link_text),
1245 title_str, img_class_str, self.empty_element_suffix)
1246 if "smarty-pants" in self.extras:
1247 result = result.replace('"', self._escape_table['"'])
1248 curr_pos = start_idx + len(result)
1249 text = text[:start_idx] + result + text[url_end_idx:]
1250 elif start_idx >= anchor_allowed_pos:
1251 result_head = '<a href="%s"%s>' % (url, title_str)
1252 result = '%s%s</a>' % (result_head, link_text)
1253 if "smarty-pants" in self.extras:
1254 result = result.replace('"', self._escape_table['"'])
1255 # <img> allowed from curr_pos on, <a> from
1256 # anchor_allowed_pos on.
1257 curr_pos = start_idx + len(result_head)
1258 anchor_allowed_pos = start_idx + len(result)
1259 text = text[:start_idx] + result + text[url_end_idx:]
1260 else:
1261 # Anchor not allowed here.
1262 curr_pos = start_idx + 1
1263 continue
1264
1265 # Reference anchor or img?
1266 else:
1267 match = self._tail_of_reference_link_re.match(text, p)
1268 if match:
1269 # Handle a reference-style anchor or img.
1270 is_img = start_idx > 0 and text[start_idx-1] == "!"
1271 if is_img:
1272 start_idx -= 1
1273 link_id = match.group("id").lower()
1274 if not link_id:
1275 link_id = link_text.lower() # for links like [this][]
1276 if link_id in self.urls:
1277 url = self.urls[link_id]
1278 # We've got to encode these to avoid conflicting
1279 # with italics/bold.
1280 url = url.replace('*', self._escape_table['*']) \
1281 .replace('_', self._escape_table['_'])
1282 title = self.titles.get(link_id)
1283 if title:
1285 title = _xml_escape_attr(title) \
1286 .replace('*', self._escape_table['*']) \
1287 .replace('_', self._escape_table['_'])
1288 title_str = ' title="%s"' % title
1289 else:
1290 title_str = ''
1291 if is_img:
1292 img_class_str = self._html_class_str_from_tag("img")
1293 result = '<img src="%s" alt="%s"%s%s%s' \
1294 % (url.replace('"', '&quot;'),
1295 link_text.replace('"', '&quot;'),
1296 title_str, img_class_str, self.empty_element_suffix)
1297 if "smarty-pants" in self.extras:
1298 result = result.replace('"', self._escape_table['"'])
1299 curr_pos = start_idx + len(result)
1300 text = text[:start_idx] + result + text[match.end():]
1301 elif start_idx >= anchor_allowed_pos:
1304 result_head = '<a href="%s"%s>' % (url, title_str)
1305 result = '%s%s</a>' % (result_head, link_text)
1306 if "smarty-pants" in self.extras:
1307 result = result.replace('"', self._escape_table['"'])
1308 # <img> allowed from curr_pos on, <a> from
1309 # anchor_allowed_pos on.
1310 curr_pos = start_idx + len(result_head)
1311 anchor_allowed_pos = start_idx + len(result)
1312 text = text[:start_idx] + result + text[match.end():]
1313 else:
1314 # Anchor not allowed here.
1315 curr_pos = start_idx + 1
1316 else:
1317 # This id isn't defined, leave the markup alone.
1318 curr_pos = match.end()
1319 continue
1320
1321 # Otherwise, it isn't markup.
1322 curr_pos = start_idx + 1
1323
1324 return text
1325
1326 def header_id_from_text(self, text, prefix, n):
1327 """Generate a header id attribute value from the given header
1328 HTML content.
1329
1330 This is only called if the "header-ids" extra is enabled.
1331 Subclasses may override this for different header ids.
1332
1333 @param text {str} The text of the header tag
1334 @param prefix {str} The requested prefix for header ids. This is the
1335 value of the "header-ids" extra key, if any. Otherwise, None.
1336 @param n {int} The <hN> tag number, i.e. `1` for an <h1> tag.
1337 @returns {str} The value for the header tag's "id" attribute. Return
1338 None to not have an id attribute and to exclude this header from
1339 the TOC (if the "toc" extra is specified).
1340 """
1341 header_id = _slugify(text)
1342 if prefix and isinstance(prefix, base_string_type):
1343 header_id = prefix + '-' + header_id
1344 if header_id in self._count_from_header_id:
1345 self._count_from_header_id[header_id] += 1
1346 header_id += '-%s' % self._count_from_header_id[header_id]
1347 else:
1348 self._count_from_header_id[header_id] = 1
1349 return header_id
1350
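# Sketch of the ids produced for duplicate headers (assuming _slugify
# turns "My Header" into "my-header"):
#
#   >>> markdown2.markdown("# My Header\n\n# My Header\n",
#   ...                    extras=["header-ids"])
#
# gives <h1 id="my-header"> followed by <h1 id="my-header-2">.
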
1351 _toc = None
1352 def _toc_add_entry(self, level, id, name):
1353 if self._toc is None:
1354 self._toc = []
1355 self._toc.append((level, id, self._unescape_special_chars(name)))
1356
1357 _h_re_base = r'''
1358 (^(.+)[ \t]*\n(=+|-+)[ \t]*\n+)
1359 |
1360 (^(\#{1,6}) # \5 = string of #'s
1361 [ \t]%s
1362 (.+?) # \6 = Header text
1363 [ \t]*
1364 (?<!\\) # ensure not an escaped trailing '#'
1365 \#* # optional closing #'s (not counted)
1366 \n+
1367 )
1368 '''
1369
1370 _h_re = re.compile(_h_re_base % '*', re.X | re.M)
1371 _h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M)
1372
1373 def _h_sub(self, match):
1374 if match.group(1) is not None:
1375 # Setext header
1376 n = {"=": 1, "-": 2}[match.group(3)[0]]
1377 header_group = match.group(2)
1378 else:
1379 # atx header
1380 n = len(match.group(5))
1381 header_group = match.group(6)
1382
1383 demote_headers = self.extras.get("demote-headers")
1384 if demote_headers:
1385 n = min(n + demote_headers, 6)
1386 header_id_attr = ""
1387 if "header-ids" in self.extras:
1388 header_id = self.header_id_from_text(header_group,
1389 self.extras["header-ids"], n)
1390 if header_id:
1391 header_id_attr = ' id="%s"' % header_id
1392 html = self._run_span_gamut(header_group)
1393 if "toc" in self.extras and header_id:
1394 self._toc_add_entry(n, header_id, html)
1395 return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
1396
1397 def _do_headers(self, text):
1398 # Setext-style headers:
1399 # Header 1
1400 # ========
1401 #
1402 # Header 2
1403 # --------
1404
1405 # atx-style headers:
1406 # # Header 1
1407 # ## Header 2
1408 # ## Header 2 with closing hashes ##
1409 # ...
1410 # ###### Header 6
1411
1412 if 'tag-friendly' in self.extras:
1413 return self._h_re_tag_friendly.sub(self._h_sub, text)
1414 return self._h_re.sub(self._h_sub, text)
1415
1416 _marker_ul_chars = '*+-'
1417 _marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars
1418 _marker_ul = '(?:[%s])' % _marker_ul_chars
1419 _marker_ol = r'(?:\d+\.)'
1420
1421 def _list_sub(self, match):
1422 lst = match.group(1)
1423 lst_type = "ul" if match.group(3) in self._marker_ul_chars else "ol"
1424 result = self._process_list_items(lst)
1425 if self.list_level:
1426 return "<%s>\n%s</%s>\n" % (lst_type, result, lst_type)
1427 else:
1428 return "<%s>\n%s</%s>\n\n" % (lst_type, result, lst_type)
1429
1430 def _do_lists(self, text):
1431 # Form HTML ordered (numbered) and unordered (bulleted) lists.
1432
1433 # Iterate over each *non-overlapping* list match.
1434 pos = 0
1435 while True:
1436 # Find the *first* hit for either list style (ul or ol). We
1437 # match ul and ol separately to avoid adjacent lists of different
1438 # types running into each other (see issue #16).
1439 hits = []
1440 for marker_pat in (self._marker_ul, self._marker_ol):
1441 less_than_tab = self.tab_width - 1
1442 whole_list = r'''
1443 ( # \1 = whole list
1444 ( # \2
1445 [ ]{0,%d}
1446 (%s) # \3 = first list item marker
1447 [ \t]+
1448 (?!\ *\3\ ) # '- - - ...' isn't a list. See 'not_quite_a_list' test case.
1449 )
1450 (?:.+?)
1451 ( # \4
1452 \Z
1453 |
1454 \n{2,}
1455 (?=\S)
1456 (?! # Negative lookahead for another list item marker
1457 [ \t]*
1458 %s[ \t]+
1459 )
1460 )
1461 )
1462 ''' % (less_than_tab, marker_pat, marker_pat)
1463 if self.list_level: # sub-list
1464 list_re = re.compile("^"+whole_list, re.X | re.M | re.S)
1465 else:
1466 list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list,
1467 re.X | re.M | re.S)
1468 match = list_re.search(text, pos)
1469 if match:
1470 hits.append((match.start(), match))
1471 if not hits:
1472 break
1473 hits.sort()
1474 match = hits[0][1]
1475 start, end = match.span()
1476 middle = self._list_sub(match)
1477 text = text[:start] + middle + text[end:]
1478 pos = start + len(middle) # start pos for next attempted match
1479
1480 return text
1481
1482 _list_item_re = re.compile(r'''
1483 (\n)? # leading line = \1
1484 (^[ \t]*) # leading whitespace = \2
1485 (?P<marker>%s) [ \t]+ # list marker = \3
1486 ((?:.+?) # list item text = \4
1487 (\n{1,2})) # eols = \5
1488 (?= \n* (\Z | \2 (?P<next_marker>%s) [ \t]+))
1489 ''' % (_marker_any, _marker_any),
1490 re.M | re.X | re.S)
1491
1492 _last_li_endswith_two_eols = False
1493 def _list_item_sub(self, match):
1494 item = match.group(4)
1495 leading_line = match.group(1)
1496 leading_space = match.group(2)
1497 if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
1498 item = self._run_block_gamut(self._outdent(item))
1499 else:
1500 # Recursion for sub-lists:
1501 item = self._do_lists(self._outdent(item))
1502 if item.endswith('\n'):
1503 item = item[:-1]
1504 item = self._run_span_gamut(item)
1505 self._last_li_endswith_two_eols = (len(match.group(5)) == 2)
1506 return "<li>%s</li>\n" % item
1507
1508 def _process_list_items(self, list_str):
1509 # Process the contents of a single ordered or unordered list,
1510 # splitting it into individual list items.
1511
1512 # The $g_list_level global keeps track of when we're inside a list.
1513 # Each time we enter a list, we increment it; when we leave a list,
1514 # we decrement. If it's zero, we're not in a list anymore.
1515 #
1516 # We do this because when we're not inside a list, we want to treat
1517 # something like this:
1518 #
1519 # I recommend upgrading to version
1520 # 8. Oops, now this line is treated
1521 # as a sub-list.
1522 #
1523 # As a single paragraph, despite the fact that the second line starts
1524 # with a digit-period-space sequence.
1525 #
1526 # Whereas when we're inside a list (or sub-list), that line will be
1527 # treated as the start of a sub-list. What a kludge, huh? This is
1528 # an aspect of Markdown's syntax that's hard to parse perfectly
1529 # without resorting to mind-reading. Perhaps the solution is to
1530 # change the syntax rules such that sub-lists must start with a
1531 # starting cardinal number; e.g. "1." or "a.".
1532 self.list_level += 1
1533 self._last_li_endswith_two_eols = False
1534 list_str = list_str.rstrip('\n') + '\n'
1535 list_str = self._list_item_re.sub(self._list_item_sub, list_str)
1536 self.list_level -= 1
1537 return list_str
1538
1539 def _get_pygments_lexer(self, lexer_name):
1540 try:
1541 from pygments import lexers, util
1542 except ImportError:
1543 return None
1544 try:
1545 return lexers.get_lexer_by_name(lexer_name)
1546 except util.ClassNotFound:
1547 return None
1548
1549 def _color_with_pygments(self, codeblock, lexer, **formatter_opts):
1550 import pygments
1551 import pygments.formatters
1552
1553 class HtmlCodeFormatter(pygments.formatters.HtmlFormatter):
1554 def _wrap_code(self, inner):
1555 """A function for use in a Pygments Formatter which
1556 wraps in <code> tags.
1557 """
1558 yield 0, "<code>"
1559 for tup in inner:
1560 yield tup
1561 yield 0, "</code>"
1562
1563 def wrap(self, source, outfile):
1564 """Return the source with a code, pre, and div."""
1565 return self._wrap_div(self._wrap_pre(self._wrap_code(source)))
1566
1567 formatter_opts.setdefault("cssclass", "codehilite")
1568 formatter = HtmlCodeFormatter(**formatter_opts)
1569 return pygments.highlight(codeblock, lexer, formatter)
1570
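# Sketch: formatter options flow from the extras dict into the
# HtmlCodeFormatter above. Requires Pygments; "noclasses" is a standard
# Pygments HtmlFormatter option (inline styles instead of CSS classes).
#
#   >>> markdown2.markdown("```python\nprint('hi')\n```",
#   ...     extras={"fenced-code-blocks": {"noclasses": True}})
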
1571 def _code_block_sub(self, match, is_fenced_code_block=False):
1572 lexer_name = None
1573 if is_fenced_code_block:
1574 lexer_name = match.group(1)
1575 if lexer_name:
1576 formatter_opts = self.extras['fenced-code-blocks'] or {}
1577 codeblock = match.group(2)
1578 codeblock = codeblock[:-1] # drop one trailing newline
1579 else:
1580 codeblock = match.group(1)
1581 codeblock = self._outdent(codeblock)
1582 codeblock = self._detab(codeblock)
1583 codeblock = codeblock.lstrip('\n') # trim leading newlines
1584 codeblock = codeblock.rstrip() # trim trailing whitespace
1585
1586 # Note: "code-color" extra is DEPRECATED.
1587 if "code-color" in self.extras and codeblock.startswith(":::"):
1588 lexer_name, rest = codeblock.split('\n', 1)
1589 lexer_name = lexer_name[3:].strip()
1590 codeblock = rest.lstrip("\n") # Remove lexer declaration line.
1591 formatter_opts = self.extras['code-color'] or {}
1592
1593 if lexer_name:
1594 def unhash_code(codeblock):
1595 for key, sanitized in list(self.html_spans.items()):
1596 codeblock = codeblock.replace(key, sanitized)
1597 replacements = [
1598 ("&amp;", "&"),
1599 ("&lt;", "<"),
1600 ("&gt;", ">")
1601 ]
1602 for old, new in replacements:
1603 codeblock = codeblock.replace(old, new)
1604 return codeblock
1605 lexer = self._get_pygments_lexer(lexer_name)
1606 if lexer:
1607 codeblock = unhash_code(codeblock)
1608 colored = self._color_with_pygments(codeblock, lexer,
1609 **formatter_opts)
1610 return "\n\n%s\n\n" % colored
1611
1612 codeblock = self._encode_code(codeblock)
1613 pre_class_str = self._html_class_str_from_tag("pre")
1614 code_class_str = self._html_class_str_from_tag("code")
1615 return "\n\n<pre%s><code%s>%s\n</code></pre>\n\n" % (
1616 pre_class_str, code_class_str, codeblock)
1617
1618 def _html_class_str_from_tag(self, tag):
1619 """Get the appropriate ' class="..."' string (note the leading
1620 space), if any, for the given tag.
1621 """
1622 if "html-classes" not in self.extras:
1623 return ""
1624 try:
1625 html_classes_from_tag = self.extras["html-classes"]
1626 except TypeError:
1627 return ""
1628 else:
1629 if tag in html_classes_from_tag:
1630 return ' class="%s"' % html_classes_from_tag[tag]
1631 return ""
1632
1633 def _do_code_blocks(self, text):
1634 """Process Markdown `<pre><code>` blocks."""
1635 code_block_re = re.compile(r'''
1636 (?:\n\n|\A\n?)
1637 ( # $1 = the code block -- one or more lines, starting with a space/tab
1638 (?:
1639 (?:[ ]{%d} | \t) # Lines must start with a tab or a tab-width of spaces
1640 .*\n+
1641 )+
1642 )
1643 ((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
1644 # Lookahead to make sure this block isn't already in a code block.
1645 # Needed when syntax highlighting is being used.
1646 (?![^<]*\</code\>)
1647 ''' % (self.tab_width, self.tab_width),
1648 re.M | re.X)
1649 return code_block_re.sub(self._code_block_sub, text)
1650
1651 _fenced_code_block_re = re.compile(r'''
1652 (?:\n\n|\A\n?)
1653 ^```([\w+-]+)?[ \t]*\n # opening fence, $1 = optional lang
1654 (.*?) # $2 = code block content
1655 ^```[ \t]*\n # closing fence
1656 ''', re.M | re.X | re.S)
1657
1658 def _fenced_code_block_sub(self, match):
1659 return self._code_block_sub(match, is_fenced_code_block=True)
1660
1661 def _do_fenced_code_blocks(self, text):
1662 """Process ```-fenced unindented code blocks ('fenced-code-blocks' extra)."""
1663 return self._fenced_code_block_re.sub(self._fenced_code_block_sub, text)
1664
1665 # Rules for a code span:
1666 # - backslash escapes are not interpreted in a code span
1667 # - to include one backtick or a run of backticks, the delimiters must
1668 # be a longer run of backticks
1669 # - cannot start or end a code span with a backtick; pad with a
1670 # space and that space will be removed in the emitted HTML
1671 # See `test/tm-cases/escapes.text` for a number of edge-case
1672 # examples.
1673 _code_span_re = re.compile(r'''
1674 (?<!\\)
1675 (`+) # \1 = Opening run of `
1676 (?!`) # See Note A in test/tm-cases/escapes.text
1677 (.+?) # \2 = The code block
1678 (?<!`)
1679 \1 # Matching closer
1680 (?!`)
1681 ''', re.X | re.S)
1682
1683 def _code_span_sub(self, match):
1684 c = match.group(2).strip(" \t")
1685 c = self._encode_code(c)
1686 return "<code>%s</code>" % c
1687
1688 def _do_code_spans(self, text):
1689 # * Backtick quotes are used for <code></code> spans.
1690 #
1691 # * You can use multiple backticks as the delimiters if you want to
1692 # include literal backticks in the code span. So, this input:
1693 #
1694 # Just type ``foo `bar` baz`` at the prompt.
1695 #
1696 # Will translate to:
1697 #
1698 # <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
1699 #
1700 # There's no arbitrary limit to the number of backticks you
1701 # can use as delimiters. If you need three consecutive backticks
1702 # in your code, use four for delimiters, etc.
1703 #
1704 # * You can use spaces to get literal backticks at the edges:
1705 #
1706 # ... type `` `bar` `` ...
1707 #
1708 # Turns to:
1709 #
1710 # ... type <code>`bar`</code> ...
1711 return self._code_span_re.sub(self._code_span_sub, text)
1712
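# Illustrative doctest-style sketch (not from the original source;
# exact whitespace is indicative):
#
#   >>> markdowner = Markdown()
#   >>> markdowner.convert("Use `` `backticks` `` here.")
#   u'<p>Use <code>`backticks`</code> here.</p>\n'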
1713 def _encode_code(self, text):
1714 """Encode/escape certain characters inside Markdown code runs.
1715 The point is that in code, these characters are literals,
1716 and lose their special Markdown meanings.
1717 """
1718 replacements = [
1719 # Encode all ampersands; HTML entities are not
1720 # entities within a Markdown code span.
1721 ('&', '&amp;'),
1722 # Do the angle bracket song and dance:
1723 ('<', '&lt;'),
1724 ('>', '&gt;'),
1725 ]
1726 for before, after in replacements:
1727 text = text.replace(before, after)
1728 hashed = _hash_text(text)
1729 self._escape_table[text] = hashed
1730 return hashed
1731
1732 _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
1733 _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
1734 _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
1735 _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
1736 def _do_italics_and_bold(self, text):
1737 # <strong> must go first:
1738 if "code-friendly" in self.extras:
1739 text = self._code_friendly_strong_re.sub(r"<strong>\1</strong>", text)
1740 text = self._code_friendly_em_re.sub(r"<em>\1</em>", text)
1741 else:
1742 text = self._strong_re.sub(r"<strong>\2</strong>", text)
1743 text = self._em_re.sub(r"<em>\2</em>", text)
1744 return text
1745
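# Illustrative doctest-style sketch (not from the original source):
#
#   >>> Markdown().convert("*em*, **strong**, and __also strong__")
#   u'<p><em>em</em>, <strong>strong</strong>, and <strong>also strong</strong></p>\n'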
1746 # "smarty-pants" extra: Very liberal in interpreting a single prime as an
1747 # apostrophe; e.g. ignores the fact that "round", "bout", "twer", and
1748 # "twixt" can be written without an initial apostrophe. This is fine because
1749 # using scare quotes (single quotation marks) is rare.
1750 _apostrophe_year_re = re.compile(r"'(\d\d)(?=(\s|,|;|\.|\?|!|$))")
1751 _contractions = ["tis", "twas", "twer", "neath", "o", "n",
1752 "round", "bout", "twixt", "nuff", "fraid", "sup"]
1753 def _do_smart_contractions(self, text):
1754 text = self._apostrophe_year_re.sub(r"&#8217;\1", text)
1755 for c in self._contractions:
1756 text = text.replace("'%s" % c, "&#8217;%s" % c)
1757 text = text.replace("'%s" % c.capitalize(),
1758 "&#8217;%s" % c.capitalize())
1759 return text
1760
1761 # Substitute double-quotes before single-quotes.
1762 _opening_single_quote_re = re.compile(r"(?<!\S)'(?=\S)")
1763 _opening_double_quote_re = re.compile(r'(?<!\S)"(?=\S)')
1764 _closing_single_quote_re = re.compile(r"(?<=\S)'")
1765 _closing_double_quote_re = re.compile(r'(?<=\S)"(?=(\s|,|;|\.|\?|!|$))')
1766 def _do_smart_punctuation(self, text):
1767 """Fancifies 'single quotes', "double quotes", and apostrophes.
1768 Converts --, ---, and ... into en dashes, em dashes, and ellipses.
1769
1770 Inspiration is: <http://daringfireball.net/projects/smartypants/>
1771 See "test/tm-cases/smarty_pants.text" for a full discussion of the
1772 support here and
1773 <http://code.google.com/p/python-markdown2/issues/detail?id=42> for a
1774 discussion of some diversion from the original SmartyPants.
1775 """
1776 if "'" in text: # guard for perf
1777 text = self._do_smart_contractions(text)
1778 text = self._opening_single_quote_re.sub("&#8216;", text)
1779 text = self._closing_single_quote_re.sub("&#8217;", text)
1780
1781 if '"' in text: # guard for perf
1782 text = self._opening_double_quote_re.sub("&#8220;", text)
1783 text = self._closing_double_quote_re.sub("&#8221;", text)
1784
1785 text = text.replace("---", "&#8212;")
1786 text = text.replace("--", "&#8211;")
1787 text = text.replace("...", "&#8230;")
1788 text = text.replace(" . . . ", "&#8230;")
1789 text = text.replace(". . .", "&#8230;")
1790 return text
1791
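# Illustrative sketch (not from the original source): with the
# "smarty-pants" extra, straight quotes and dash/ellipsis runs become
# numeric entities, roughly:
#
#   "Hello" -- world...   ->   &#8220;Hello&#8221; &#8211; world&#8230;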
1792 _block_quote_re = re.compile(r'''
1793 ( # Wrap whole match in \1
1794 (
1795 ^[ \t]*>[ \t]? # '>' at the start of a line
1796 .+\n # rest of the first line
1797 (.+\n)* # subsequent consecutive lines
1798 \n* # blanks
1799 )+
1800 )
1801 ''', re.M | re.X)
1802 _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M)
1803
1804 _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
1805 def _dedent_two_spaces_sub(self, match):
1806 return re.sub(r'(?m)^  ', '', match.group(1))  # strip the two-space indent
1807
1808 def _block_quote_sub(self, match):
1809 bq = match.group(1)
1810 bq = self._bq_one_level_re.sub('', bq) # trim one level of quoting
1811 bq = self._ws_only_line_re.sub('', bq) # trim whitespace-only lines
1812 bq = self._run_block_gamut(bq) # recurse
1813
1814 bq = re.sub('(?m)^', '  ', bq)  # indent two spaces, matching _dedent_two_spaces_sub
1815 # These leading spaces screw with <pre> content, so we need to fix that:
1816 bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)
1817
1818 return "<blockquote>\n%s\n</blockquote>\n\n" % bq
1819
1820 def _do_block_quotes(self, text):
1821 if '>' not in text:
1822 return text
1823 return self._block_quote_re.sub(self._block_quote_sub, text)
1824
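# Illustrative doctest-style sketch (not from the original source;
# exact trailing whitespace is indicative):
#
#   >>> Markdown().convert("> quoted text\n")
#   u'<blockquote>\n  <p>quoted text</p>\n</blockquote>\n'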
1825 def _form_paragraphs(self, text):
1826 # Strip leading and trailing lines:
1827 text = text.strip('\n')
1828
1829 # Wrap <p> tags.
1830 grafs = []
1831 for i, graf in enumerate(re.split(r"\n{2,}", text)):
1832 if graf in self.html_blocks:
1833 # Unhashify HTML blocks
1834 grafs.append(self.html_blocks[graf])
1835 else:
1836 cuddled_list = None
1837 if "cuddled-lists" in self.extras:
1838 # Need to put back trailing '\n' for `_list_item_re`
1839 # match at the end of the paragraph.
1840 li = self._list_item_re.search(graf + '\n')
1841 # Two of the same list marker in this paragraph: a likely
1842 # candidate for a list cuddled to preceding paragraph
1843 # text (issue 33). Note the `[-1]` is a quick way to
1844 # consider numeric bullets (e.g. "1." and "2.") to be
1845 # equal.
1846 if (li and len(li.group(2)) <= 3 and li.group("next_marker")
1847 and li.group("marker")[-1] == li.group("next_marker")[-1]):
1848 start = li.start()
1849 cuddled_list = self._do_lists(graf[start:]).rstrip("\n")
1850 assert cuddled_list.startswith("<ul>") or cuddled_list.startswith("<ol>")
1851 graf = graf[:start]
1852
1853 # Wrap <p> tags.
1854 graf = self._run_span_gamut(graf)
1855 grafs.append("<p>" + graf.lstrip(" \t") + "</p>")
1856
1857 if cuddled_list:
1858 grafs.append(cuddled_list)
1859
1860 return "\n\n".join(grafs)
1861
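# Illustrative sketch (not from the original source): with the
# "cuddled-lists" extra, a list stuck directly to a paragraph, e.g.
#
#   I like:
#   - apples
#   - oranges
#
# is split so that <p>I like:</p> is followed by a <ul> with two
# <li> items, instead of being rendered as one paragraph.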
1862 def _add_footnotes(self, text):
1863 if self.footnotes:
1864 footer = [
1865 '<div class="footnotes">',
1866 '<hr' + self.empty_element_suffix,
1867 '<ol>',
1868 ]
1869 for i, id in enumerate(self.footnote_ids):
1870 if i != 0:
1871 footer.append('')
1872 footer.append('<li id="fn-%s">' % id)
1873 footer.append(self._run_block_gamut(self.footnotes[id]))
1874 backlink = ('<a href="#fnref-%s" '
1875 'class="footnoteBackLink" '
1876 'title="Jump back to footnote %d in the text.">'
1877 '&#8617;</a>' % (id, i+1))
1878 if footer[-1].endswith("</p>"):
1879 footer[-1] = footer[-1][:-len("</p>")] \
1880 + '&#160;' + backlink + "</p>"
1881 else:
1882 footer.append("\n<p>%s</p>" % backlink)
1883 footer.append('</li>')
1884 footer.append('</ol>')
1885 footer.append('</div>')
1886 return text + '\n\n' + '\n'.join(footer)
1887 else:
1888 return text
1889
1890 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1891 # http://bumppo.net/projects/amputator/
1892 _ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')
1893 _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I)
1894 _naked_gt_re = re.compile(r'''(?<![a-z0-9?!/'"-])>''', re.I)
1895
1896 def _encode_amps_and_angles(self, text):
1897 # Smart processing for ampersands and angle brackets that need
1898 # to be encoded.
1899 text = self._ampersand_re.sub('&amp;', text)
1900
1901 # Encode naked <'s
1902 text = self._naked_lt_re.sub('&lt;', text)
1903
1904 # Encode naked >'s
1905 # Note: Other markdown implementations (e.g. Markdown.pl, PHP
1906 # Markdown) don't do this.
1907 text = self._naked_gt_re.sub('&gt;', text)
1908 return text
1909
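# Illustrative doctest-style sketch (not from the original source):
# existing entities are left alone, naked '&' and '<' are encoded:
#
#   >>> Markdown()._encode_amps_and_angles("AT&T, a < b, but &amp; stays")
#   'AT&amp;T, a &lt; b, but &amp; stays'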
1910 def _encode_backslash_escapes(self, text):
1911 for ch, escape in list(self._escape_table.items()):
1912 text = text.replace("\\"+ch, escape)
1913 return text
1914
1915 _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I)
1916 def _auto_link_sub(self, match):
1917 g1 = match.group(1)
1918 return '<a href="%s">%s</a>' % (g1, g1)
1919
1920 _auto_email_link_re = re.compile(r"""
1921 <
1922 (?:mailto:)?
1923 (
1924 [-.\w]+
1925 \@
1926 [-\w]+(\.[-\w]+)*\.[a-z]+
1927 )
1928 >
1929 """, re.I | re.X | re.U)
1930 def _auto_email_link_sub(self, match):
1931 return self._encode_email_address(
1932 self._unescape_special_chars(match.group(1)))
1933
1934 def _do_auto_links(self, text):
1935 text = self._auto_link_re.sub(self._auto_link_sub, text)
1936 text = self._auto_email_link_re.sub(self._auto_email_link_sub, text)
1937 return text
1938
1939 def _encode_email_address(self, addr):
1940 # Input: an email address, e.g. "foo@example.com"
1941 #
1942 # Output: the email address as a mailto link, with each character
1943 # of the address encoded as either a decimal or hex entity, in
1944 # the hopes of foiling most address harvesting spam bots. E.g.:
1945 #
1946 # <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
1947 # x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
1948 # &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
1949 #
1950 # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
1951 # mailing list: <http://tinyurl.com/yu7ue>
1952 chars = [_xml_encode_email_char_at_random(ch)
1953 for ch in "mailto:" + addr]
1954 # Strip the mailto: from the visible part.
1955 addr = '<a href="%s">%s</a>' \
1956 % (''.join(chars), ''.join(chars[7:]))
1957 return addr
1958
1959 def _do_link_patterns(self, text):
1960 """Caveat emptor: there isn't much guarding against link
1961 patterns being formed inside other standard Markdown links, e.g.
1962 inside a [link def][like this].
1963
1964 Dev Notes: *Could* consider prefixing regexes with a negative
1965 lookbehind assertion to attempt to guard against this.
1966 """
1967 link_from_hash = {}
1968 for regex, repl in self.link_patterns:
1969 replacements = []
1970 for match in regex.finditer(text):
1971 if hasattr(repl, "__call__"):
1972 href = repl(match)
1973 else:
1974 href = match.expand(repl)
1975 replacements.append((match.span(), href))
1976 for (start, end), href in reversed(replacements):
1977 escaped_href = (
1978 href.replace('"', '&quot;') # b/c of attr quote
1979 # To avoid markdown <em> and <strong>:
1980 .replace('*', self._escape_table['*'])
1981 .replace('_', self._escape_table['_']))
1982 link = '<a href="%s">%s</a>' % (escaped_href, text[start:end])
1983 hash = _hash_text(link)
1984 link_from_hash[hash] = link
1985 text = text[:start] + hash + text[end:]
1986 for hash, link in list(link_from_hash.items()):
1987 text = text.replace(hash, link)
1988 return text
1989
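# Illustrative sketch (hypothetical pattern and URL, not from the
# original source): link_patterns entries are (compiled regex,
# replacement) pairs:
#
#   link_patterns = [
#       (re.compile(r"\bissue #?(\d+)\b", re.I),
#        r"https://example.com/issues/\1"),
#   ]
#   html = markdown("fixed in issue 42",
#                   extras=["link-patterns"], link_patterns=link_patterns)
#   # -> ... <a href="https://example.com/issues/42">issue 42</a> ...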
1990 def _unescape_special_chars(self, text):
1991 # Swap back in all the special characters we've hidden.
1992 for ch, hash in list(self._escape_table.items()):
1993 text = text.replace(hash, ch)
1994 return text
1995
1996 def _outdent(self, text):
1997 # Remove one level of line-leading tabs or spaces
1998 return self._outdent_re.sub('', text)
1999
2000
2001 class MarkdownWithExtras(Markdown):
2002 """A markdowner class that enables most extras:
2003
2004 - footnotes
2005 - code-color (only has effect if 'pygments' Python module on path)
2006
2007 These are not included:
2008 - pyshell (specific to Python-related documenting)
2009 - code-friendly (because it *disables* part of the syntax)
2010 - link-patterns (because you need to specify some actual
2011 link-patterns anyway)
2012 """
2013 extras = ["footnotes", "code-color"]
2014
2015
2016 #---- internal support functions
2017
2018 class UnicodeWithAttrs(unicode):
2019 """A subclass of unicode used for the return value of conversion to
2020 possibly attach some attributes. E.g. the "toc_html" attribute when
2021 the "toc" extra is used.
2022 """
2023 metadata = None
2024 _toc = None
2025 def toc_html(self):
2026 """Return the HTML for the current TOC.
2027
2028 This expects the `_toc` attribute to have been set on this instance.
2029 """
2030 if self._toc is None:
2031 return None
2032
2033 def indent():
2034 return ' ' * (len(h_stack) - 1)
2035 lines = []
2036 h_stack = [0] # stack of header-level numbers
2037 for level, id, name in self._toc:
2038 if level > h_stack[-1]:
2039 lines.append("%s<ul>" % indent())
2040 h_stack.append(level)
2041 elif level == h_stack[-1]:
2042 lines[-1] += "</li>"
2043 else:
2044 while level < h_stack[-1]:
2045 h_stack.pop()
2046 if not lines[-1].endswith("</li>"):
2047 lines[-1] += "</li>"
2048 lines.append("%s</ul></li>" % indent())
2049 lines.append('%s<li><a href="#%s">%s</a>' % (
2050 indent(), id, name))
2051 while len(h_stack) > 1:
2052 h_stack.pop()
2053 if not lines[-1].endswith("</li>"):
2054 lines[-1] += "</li>"
2055 lines.append("%s</ul>" % indent())
2056 return '\n'.join(lines) + '\n'
2057 toc_html = property(toc_html)
2058
2059 ## {{{ http://code.activestate.com/recipes/577257/ (r1)
2060 _slugify_strip_re = re.compile(r'[^\w\s-]')
2061 _slugify_hyphenate_re = re.compile(r'[-\s]+')
2062 def _slugify(value):
2063 """
2064 Normalizes string, converts to lowercase, removes non-alpha characters,
2065 and converts spaces to hyphens.
2066
2067 From Django's "django/template/defaultfilters.py".
2068 """
2069 import unicodedata
2070 value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
2071 value = _slugify_strip_re.sub('', value).strip().lower()
2072 return _slugify_hyphenate_re.sub('-', value)
2073 ## end of http://code.activestate.com/recipes/577257/ }}}
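# Illustrative doctest-style sketch (not from the original recipe):
#
#   >>> _slugify(u"Hello, World!")
#   u'hello-world'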
2074
2075
2076 # From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549
2077 def _curry(*args, **kwargs):
2078 function, args = args[0], args[1:]
2079 def result(*rest, **kwrest):
2080 combined = kwargs.copy()
2081 combined.update(kwrest)
2082 return function(*args + rest, **combined)
2083 return result
2084
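# Illustrative usage sketch (not from the original recipe): _curry
# pre-binds leading positional arguments and default keyword arguments:
#
#   add5 = _curry(lambda a, b: a + b, 5)
#   add5(3)   # -> 8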
2085 # Recipe: regex_from_encoded_pattern (1.0)
2086 def _regex_from_encoded_pattern(s):
2087 """'foo' -> re.compile(re.escape('foo'))
2088 '/foo/' -> re.compile('foo')
2089 '/foo/i' -> re.compile('foo', re.I)
2090 """
2091 if s.startswith('/') and s.rfind('/') != 0:
2092 # Parse it: /PATTERN/FLAGS
2093 idx = s.rfind('/')
2094 pattern, flags_str = s[1:idx], s[idx+1:]
2095 flag_from_char = {
2096 "i": re.IGNORECASE,
2097 "l": re.LOCALE,
2098 "s": re.DOTALL,
2099 "m": re.MULTILINE,
2100 "u": re.UNICODE,
2101 }
2102 flags = 0
2103 for char in flags_str:
2104 try:
2105 flags |= flag_from_char[char]
2106 except KeyError:
2107 raise ValueError("unsupported regex flag: '%s' in '%s' "
2108 "(must be one of '%s')"
2109 % (char, s, ''.join(list(flag_from_char.keys()))))
2110 return re.compile(pattern, flags)
2111 else: # not an encoded regex
2112 return re.compile(re.escape(s))
2113
2114 # Recipe: dedent (0.1.2)
2115 def _dedentlines(lines, tabsize=8, skip_first_line=False):
2116 """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines
2117
2118 "lines" is a list of lines to dedent.
2119 "tabsize" is the tab width to use for indent width calculations.
2120 "skip_first_line" is a boolean indicating if the first line should
2121 be skipped for calculating the indent width and for dedenting.
2122 This is sometimes useful for docstrings and similar.
2123
2124 Same as dedent() except operates on a sequence of lines. Note: the
2125 lines list is modified **in-place**.
2126 """
2127 DEBUG = False
2128 if DEBUG:
2129 print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
2130 % (tabsize, skip_first_line))
2131 indents = []
2132 margin = None
2133 for i, line in enumerate(lines):
2134 if i == 0 and skip_first_line: continue
2135 indent = 0
2136 for ch in line:
2137 if ch == ' ':
2138 indent += 1
2139 elif ch == '\t':
2140 indent += tabsize - (indent % tabsize)
2141 elif ch in '\r\n':
2142 continue # skip all-whitespace lines
2143 else:
2144 break
2145 else:
2146 continue # skip all-whitespace lines
2147 if DEBUG: print("dedent: indent=%d: %r" % (indent, line))
2148 if margin is None:
2149 margin = indent
2150 else:
2151 margin = min(margin, indent)
2152 if DEBUG: print("dedent: margin=%r" % margin)
2153
2154 if margin is not None and margin > 0:
2155 for i, line in enumerate(lines):
2156 if i == 0 and skip_first_line: continue
2157 removed = 0
2158 for j, ch in enumerate(line):
2159 if ch == ' ':
2160 removed += 1
2161 elif ch == '\t':
2162 removed += tabsize - (removed % tabsize)
2163 elif ch in '\r\n':
2164 if DEBUG: print("dedent: %r: EOL -> strip up to EOL" % line)
2165 lines[i] = lines[i][j:]
2166 break
2167 else:
2168 raise ValueError("unexpected non-whitespace char %r in "
2169 "line %r while removing %d-space margin"
2170 % (ch, line, margin))
2171 if DEBUG:
2172 print("dedent: %r: %r -> removed %d/%d"\
2173 % (line, ch, removed, margin))
2174 if removed == margin:
2175 lines[i] = lines[i][j+1:]
2176 break
2177 elif removed > margin:
2178 lines[i] = ' '*(removed-margin) + lines[i][j+1:]
2179 break
2180 else:
2181 if removed:
2182 lines[i] = lines[i][removed:]
2183 return lines
2184
2185 def _dedent(text, tabsize=8, skip_first_line=False):
2186 """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text
2187
2188 "text" is the text to dedent.
2189 "tabsize" is the tab width to use for indent width calculations.
2190 "skip_first_line" is a boolean indicating if the first line should
2191 be skipped for calculating the indent width and for dedenting.
2192 This is sometimes useful for docstrings and similar.
2193
2194 textwrap.dedent(s), but don't expand tabs to spaces
2195 """
2196 lines = text.splitlines(True)  # keep line endings
2197 _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line)
2198 return ''.join(lines)
2199
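# Illustrative doctest-style sketch (not from the original recipe):
# the common four-space margin is removed, deeper indents are kept:
#
#   >>> _dedent("    foo\n      bar\n")
#   'foo\n  bar\n'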
2200
2201 class _memoized(object):
2202 """Decorator that caches a function's return value each time it is called.
2203 If called later with the same arguments, the cached value is returned, and
2204 not re-evaluated.
2205
2206 http://wiki.python.org/moin/PythonDecoratorLibrary
2207 """
2208 def __init__(self, func):
2209 self.func = func
2210 self.cache = {}
2211 def __call__(self, *args):
2212 try:
2213 return self.cache[args]
2214 except KeyError:
2215 self.cache[args] = value = self.func(*args)
2216 return value
2217 except TypeError:
2218 # uncachable -- for instance, passing a list as an argument.
2219 # Better to not cache than to blow up entirely.
2220 return self.func(*args)
2221 def __repr__(self):
2222 """Return the function's docstring."""
2223 return self.func.__doc__
2224
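# Illustrative usage sketch (not from the original source):
#
#   @_memoized
#   def fib(n):
#       return n if n < 2 else fib(n - 1) + fib(n - 2)
#
# Repeated calls with the same hashable arguments are served from
# self.cache; unhashable arguments (TypeError) fall through to a
# plain, uncached call.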
2225
2226 def _xml_oneliner_re_from_tab_width(tab_width):
2227 """Standalone XML processing instruction regex."""
2228 return re.compile(r"""
2229 (?:
2230 (?<=\n\n) # Starting after a blank line
2231 | # or
2232 \A\n? # the beginning of the doc
2233 )
2234 ( # save in $1
2235 [ ]{0,%d}
2236 (?:
2237 <\?\w+\b\s+.*?\?> # XML processing instruction
2238 |
2239 <\w+:\w+\b\s+.*?/> # namespaced single tag
2240 )
2241 [ \t]*
2242 (?=\n{2,}|\Z) # followed by a blank line or end of document
2243 )
2244 """ % (tab_width - 1), re.X)
2245 _xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
2246
2247 def _hr_tag_re_from_tab_width(tab_width):
2248 return re.compile(r"""
2249 (?:
2250 (?<=\n\n) # Starting after a blank line
2251 | # or
2252 \A\n? # the beginning of the doc
2253 )
2254 ( # save in \1
2255 [ ]{0,%d}
2256 <(hr) # start tag = \2
2257 \b # word break
2258 ([^<>])*? # attributes, if any
2259 /?> # the matching end tag
2260 [ \t]*
2261 (?=\n{2,}|\Z) # followed by a blank line or end of document
2262 )
2263 """ % (tab_width - 1), re.X)
2264 _hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)
2265
2266
2267 def _xml_escape_attr(attr, skip_single_quote=True):
2268 """Escape the given string for use in an HTML/XML tag attribute.
2269
2270 By default this doesn't bother with escaping `'` to `&#39;`, presuming that
2271 the tag attribute is surrounded by double quotes.
2272 """
2273 escaped = (attr
2274 .replace('&', '&amp;')
2275 .replace('"', '&quot;')
2276 .replace('<', '&lt;')
2277 .replace('>', '&gt;'))
2278 if not skip_single_quote:
2279 escaped = escaped.replace("'", "&#39;")
2280 return escaped
2281
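# Illustrative doctest-style sketch (not from the original source):
# '&' is escaped first so existing characters aren't double-encoded:
#
#   >>> _xml_escape_attr('say "no" & <go>')
#   'say &quot;no&quot; &amp; &lt;go&gt;'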
2282
2283 def _xml_encode_email_char_at_random(ch):
2284 r = random()
2285 # Roughly 10% raw, 45% hex, 45% dec.
2286 # '@' *must* be encoded. I [John Gruber] insist.
2287 # Issue 26: '_' must be encoded.
2288 if r > 0.9 and ch not in "@_":
2289 return ch
2290 elif r < 0.45:
2291 # The [1:] is to drop leading '0': 0x63 -> x63
2292 return '&#%s;' % hex(ord(ch))[1:]
2293 else:
2294 return '&#%s;' % ord(ch)
2295
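# Illustrative note (not from the original source): '@' is always
# encoded; _xml_encode_email_char_at_random('@') returns either
# '&#x40;' (hex) or '&#64;' (decimal), never the raw character.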
2296
2297
2298 #---- mainline
2299
2300 class _NoReflowFormatter(optparse.IndentedHelpFormatter):
2301 """An optparse formatter that does NOT reflow the description."""
2302 def format_description(self, description):
2303 return description or ""
2304
2305 def _test():
2306 import doctest
2307 doctest.testmod()
2308
2309 def main(argv=None):
2310 if argv is None:
2311 argv = sys.argv
2312 if not logging.root.handlers:
2313 logging.basicConfig()
2314
2315 usage = "usage: %prog [PATHS...]"
2316 version = "%prog "+__version__
2317 parser = optparse.OptionParser(prog="markdown2", usage=usage,
2318 version=version, description=cmdln_desc,
2319 formatter=_NoReflowFormatter())
2320 parser.add_option("-v", "--verbose", dest="log_level",
2321 action="store_const", const=logging.DEBUG,
2322 help="more verbose output")
2323 parser.add_option("--encoding",
2324 help="specify encoding of text content")
2325 parser.add_option("--html4tags", action="store_true", default=False,
2326 help="use HTML 4 style for empty element tags")
2327 parser.add_option("-s", "--safe", metavar="MODE", dest="safe_mode",
2328 help="sanitize literal HTML: 'escape' escapes "
2329 "HTML meta chars, 'replace' replaces with an "
2330 "[HTML_REMOVED] note")
2331 parser.add_option("-x", "--extras", action="append",
2332 help="Turn on specific extra features (not part of "
2333 "the core Markdown spec). See above.")
2334 parser.add_option("--use-file-vars",
2335 help="Look for and use Emacs-style 'markdown-extras' "
2336 "file var to turn on extras. See "
2337 "<https://github.com/trentm/python-markdown2/wiki/Extras>")
2338 parser.add_option("--link-patterns-file",
2339 help="path to a link pattern file")
2340 parser.add_option("--self-test", action="store_true",
2341 help="run internal self-tests (some doctests)")
2342 parser.add_option("--compare", action="store_true",
2343 help="run against Markdown.pl as well (for testing)")
2344 parser.set_defaults(log_level=logging.INFO, compare=False,
2345 encoding="utf-8", safe_mode=None, use_file_vars=False)
2346 opts, paths = parser.parse_args()
2347 log.setLevel(opts.log_level)
2348
2349 if opts.self_test:
2350 return _test()
2351
2352 if opts.extras:
2353 extras = {}
2354 for s in opts.extras:
2355 splitter = re.compile("[,;: ]+")
2356 for e in splitter.split(s):
2357 if '=' in e:
2358 ename, earg = e.split('=', 1)
2359 try:
2360 earg = int(earg)
2361 except ValueError:
2362 pass
2363 else:
2364 ename, earg = e, None
2365 extras[ename] = earg
2366 else:
2367 extras = None
2368
2369 if opts.link_patterns_file:
2370 link_patterns = []
2371 f = open(opts.link_patterns_file)
2372 try:
2373 for i, line in enumerate(f.readlines()):
2374 if not line.strip(): continue
2375 if line.lstrip().startswith("#"): continue
2376 try:
2377 pat, href = line.rstrip().rsplit(None, 1)
2378 except ValueError:
2379 raise MarkdownError("%s:%d: invalid link pattern line: %r"
2380 % (opts.link_patterns_file, i+1, line))
2381 link_patterns.append(
2382 (_regex_from_encoded_pattern(pat), href))
2383 finally:
2384 f.close()
2385 else:
2386 link_patterns = None
2387
2388 from os.path import join, dirname, abspath, exists
2389 markdown_pl = join(dirname(dirname(abspath(__file__))), "test",
2390 "Markdown.pl")
2391 if not paths:
2392 paths = ['-']
2393 for path in paths:
2394 if path == '-':
2395 text = sys.stdin.read()
2396 else:
2397 fp = codecs.open(path, 'r', opts.encoding)
2398 text = fp.read()
2399 fp.close()
2400 if opts.compare:
2401 from subprocess import Popen, PIPE
2402 print("==== Markdown.pl ====")
2403 p = Popen('perl %s' % markdown_pl, shell=True, stdin=PIPE, stdout=PIPE, close_fds=True)
2404 p.stdin.write(text.encode('utf-8'))
2405 p.stdin.close()
2406 perl_html = p.stdout.read().decode('utf-8')
2407 if py3:
2408 sys.stdout.write(perl_html)
2409 else:
2410 sys.stdout.write(perl_html.encode(
2411 sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
2412 print("==== markdown2.py ====")
2413 html = markdown(text,
2414 html4tags=opts.html4tags,
2415 safe_mode=opts.safe_mode,
2416 extras=extras, link_patterns=link_patterns,
2417 use_file_vars=opts.use_file_vars)
2418 if py3:
2419 sys.stdout.write(html)
2420 else:
2421 sys.stdout.write(html.encode(
2422 sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
2423 if extras and "toc" in extras:
2424 log.debug("toc_html: " +
2425 html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
2426 if opts.compare:
2427 test_dir = join(dirname(dirname(abspath(__file__))), "test")
2428 if exists(join(test_dir, "test_markdown2.py")):
2429 sys.path.insert(0, test_dir)
2430 from test_markdown2 import norm_html_from_html
2431 norm_html = norm_html_from_html(html)
2432 norm_perl_html = norm_html_from_html(perl_html)
2433 else:
2434 norm_html = html
2435 norm_perl_html = perl_html
2436 print("==== match? %r ====" % (norm_perl_html == norm_html))
2437
2438
2439 if __name__ == "__main__":
2440 sys.exit( main(sys.argv) )