Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/boltons/strutils.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
| author | shellac |
|---|---|
| date | Sat, 02 May 2020 07:14:21 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:26e78fe6e8c4 |
|---|---|
| 1 # -*- coding: utf-8 -*- | |
| 2 """So much practical programming involves string manipulation, which | |
| 3 Python readily accommodates. Still, there are dozens of basic and | |
| 4 common capabilities missing from the standard library, several of them | |
| 5 provided by ``strutils``. | |
| 6 """ | |
| 7 | |
| 8 from __future__ import print_function | |
| 9 | |
| 10 import re | |
| 11 import sys | |
| 12 import uuid | |
| 13 import zlib | |
| 14 import string | |
| 15 import unicodedata | |
| 16 import collections | |
| 17 from gzip import GzipFile | |
| 18 | |
| 19 try: | |
| 20 from cStringIO import cStringIO as StringIO | |
| 21 except ImportError: | |
| 22 from io import BytesIO as StringIO | |
| 23 | |
| 24 try: | |
| 25 from collections.abc import Mapping | |
| 26 except ImportError: | |
| 27 from collections import Mapping | |
| 28 | |
| 29 try: | |
| 30 unicode, str, bytes, basestring = unicode, str, str, basestring | |
| 31 from HTMLParser import HTMLParser | |
| 32 import htmlentitydefs | |
| 33 except NameError: # basestring not defined in Python 3 | |
| 34 unicode, str, bytes, basestring = str, bytes, bytes, (str, bytes) | |
| 35 unichr = chr | |
| 36 from html.parser import HTMLParser | |
| 37 from html import entities as htmlentitydefs | |
| 38 | |
| 39 | |
# Explicit public API for star-imports of this module.
__all__ = ['camel2under', 'under2camel', 'slugify', 'split_punct_ws',
           'unit_len', 'ordinalize', 'cardinalize', 'pluralize', 'singularize',
           'asciify', 'is_ascii', 'is_uuid', 'html2text', 'strip_ansi',
           'bytes2human', 'find_hashtags', 'a10n', 'gzip_bytes', 'gunzip_bytes',
           'iter_splitlines', 'indent', 'escape_shell_args',
           'args2cmd', 'args2sh', 'parse_int_list', 'format_int_list', 'unwrap_text']
| 46 | |
| 47 | |
# Precompiled helpers shared below: _punct_re splits on runs of ASCII
# punctuation and/or whitespace (used by split_punct_ws/slugify);
# _camel2under_re finds camelCase word boundaries (an uppercase letter
# preceded by a lowercase/digit, or a non-leading uppercase followed by
# a lowercase letter).
_punct_ws_str = string.punctuation + string.whitespace
_punct_re = re.compile('[' + _punct_ws_str + ']+')
_camel2under_re = re.compile('((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))')
| 51 | |
| 52 | |
def camel2under(camel_string):
    """Converts a camelcased string to underscores. Useful for turning a
    class name into a function name.

    >>> camel2under('BasicParseTest')
    'basic_parse_test'
    """
    underscored = _camel2under_re.sub(r'_\1', camel_string)
    return underscored.lower()
| 61 | |
| 62 | |
def under2camel(under_string):
    """Converts an underscored string to camelcased. Useful for turning a
    function name into a class name.

    >>> under2camel('complex_tokenizer')
    'ComplexTokenizer'
    """
    words = []
    for word in under_string.split('_'):
        # an empty segment (from a doubled underscore) stays visible as '_'
        words.append(word.capitalize() if word else '_')
    return ''.join(words)
| 71 | |
| 72 | |
def slugify(text, delim='_', lower=True, ascii=False):
    """
    A basic function that turns text full of scary characters
    (i.e., punctuation and whitespace), into a relatively safe
    lowercased string separated only by the delimiter specified
    by *delim*, which defaults to ``_``.

    The *ascii* convenience flag will :func:`asciify` the slug if
    you require ascii-only slugs.

    >>> slugify('First post! Hi!!!!~1    ')
    'first_post_hi_1'

    >>> slugify("Kurt Gödel's pretty cool.", ascii=True) == \
        b'kurt_goedel_s_pretty_cool'
    True

    """
    if text:
        # a non-empty input that is all separators still yields *delim*
        ret = delim.join(split_punct_ws(text)) or delim
    else:
        ret = ''
    if ascii:
        ret = asciify(ret)
    if lower:
        ret = ret.lower()
    return ret
| 97 | |
| 98 | |
def split_punct_ws(text):
    """While :meth:`str.split` will split on whitespace,
    :func:`split_punct_ws` will split on punctuation and
    whitespace. This used internally by :func:`slugify`, above.

    >>> split_punct_ws('First post! Hi!!!!~1    ')
    ['First', 'post', 'Hi', '1']
    """
    chunks = _punct_re.split(text)
    return [chunk for chunk in chunks if chunk]
| 108 | |
| 109 | |
def unit_len(sized_iterable, unit_noun='item'):  # TODO: len_units()/unitize()?
    """Returns a plain-English description of an iterable's
    :func:`len()`, conditionally pluralized with :func:`cardinalize`,
    detailed below.

    >>> print(unit_len(range(10), 'number'))
    10 numbers
    >>> print(unit_len('aeiou', 'vowel'))
    5 vowels
    >>> print(unit_len([], 'worry'))
    No worries
    """
    count = len(sized_iterable)
    units = cardinalize(unit_noun, count)
    # a zero count reads as "No <plural>" rather than "0 <plural>"
    prefix = count if count else u'No'
    return u'%s %s' % (prefix, units)
| 127 | |
| 128 | |
# Ordinal suffix keyed by a number's final digit; used by ordinalize().
_ORDINAL_MAP = {'1': 'st',
                '2': 'nd',
                '3': 'rd'}  # 'th' is the default
| 132 | |
| 133 | |
def ordinalize(number, ext_only=False):
    """Turns *number* into its cardinal form, i.e., 1st, 2nd,
    3rd, 4th, etc. If the last character isn't a digit, it returns the
    string value unchanged.

    Args:
        number (int or str): Number to be cardinalized.
        ext_only (bool): Whether to return only the suffix. Default ``False``.

    >>> print(ordinalize(1))
    1st
    >>> print(ordinalize(3694839230))
    3694839230th
    >>> print(ordinalize('hi'))
    hi
    >>> print(ordinalize(1515))
    1515th
    """
    numstr = unicode(number)
    ext = ''
    if numstr and numstr[-1] in string.digits:
        if len(numstr) > 1 and numstr[-2] == '1':
            # the teens (11, 12, 13, ...) all take 'th'
            ext = 'th'
        else:
            # single digits and everything else go by the last digit
            ext = _ORDINAL_MAP.get(numstr[-1], 'th')
    return ext if ext_only else numstr + ext
| 168 | |
| 169 | |
def cardinalize(unit_noun, count):
    """Conditionally pluralizes a singular word *unit_noun* if
    *count* is not one, preserving case when possible.

    >>> vowels = 'aeiou'
    >>> print(len(vowels), cardinalize('vowel', len(vowels)))
    5 vowels
    >>> print(3, cardinalize('Wish', 3))
    3 Wishes
    """
    return unit_noun if count == 1 else pluralize(unit_noun)
| 183 | |
| 184 | |
def singularize(word):
    """Semi-intelligently converts an English plural *word* to its
    singular form, preserving case pattern.

    >>> singularize('chances')
    'chance'
    >>> singularize('Activities')
    'Activity'
    >>> singularize('Glasses')
    'Glass'
    >>> singularize('FEET')
    'FOOT'

    """
    orig_word, word = word, word.strip().lower()
    if not word or word in _IRR_S2P:
        # empty input, or already a singular with a known irregular plural
        return orig_word

    irr_singular = _IRR_P2S.get(word)
    if irr_singular:
        singular = irr_singular
    elif not word.endswith('s'):
        # doesn't look like a regular plural; leave untouched
        return orig_word
    elif len(word) == 2:
        singular = word[:-1]  # or just return word?
    elif word.endswith('ies') and word[-4:-3] not in 'aeiou':
        # consonant + 'ies' -> 'y', e.g. 'activities' -> 'activity'
        singular = word[:-3] + 'y'
    elif word.endswith('es') and word[-3] == 's':
        # 'sses'/'ses' endings drop 'es', e.g. 'glasses' -> 'glass'
        singular = word[:-2]
    else:
        singular = word[:-1]
    return _match_case(orig_word, singular)
| 217 | |
| 218 | |
def pluralize(word):
    """Semi-intelligently converts an English *word* from singular form to
    plural, preserving case pattern.

    >>> pluralize('friend')
    'friends'
    >>> pluralize('enemy')
    'enemies'
    >>> pluralize('Sheep')
    'Sheep'
    """
    orig_word, word = word, word.strip().lower()
    if not word or word in _IRR_P2S:
        # empty input, or already a known irregular plural
        return orig_word
    irr_plural = _IRR_S2P.get(word)
    if irr_plural:
        plural = irr_plural
    elif word.endswith('y') and word[-2:-1] not in 'aeiou':
        # consonant + 'y' -> 'ies', e.g. 'enemy' -> 'enemies'
        plural = word[:-1] + 'ies'
    elif word[-1] == 's' or word.endswith('ch') or word.endswith('sh'):
        # sibilant endings take 'es' (unless the word already ends in 'es')
        plural = word if word.endswith('es') else word + 'es'
    else:
        plural = word + 's'
    return _match_case(orig_word, plural)
| 243 | |
| 244 | |
| 245 def _match_case(master, disciple): | |
| 246 if not master.strip(): | |
| 247 return disciple | |
| 248 if master.lower() == master: | |
| 249 return disciple.lower() | |
| 250 elif master.upper() == master: | |
| 251 return disciple.upper() | |
| 252 elif master.title() == master: | |
| 253 return disciple.title() | |
| 254 return disciple | |
| 255 | |
| 256 | |
| 257 # Singular to plural map of irregular pluralizations | |
| 258 _IRR_S2P = {'addendum': 'addenda', 'alga': 'algae', 'alumna': 'alumnae', | |
| 259 'alumnus': 'alumni', 'analysis': 'analyses', 'antenna': 'antennae', | |
| 260 'appendix': 'appendices', 'axis': 'axes', 'bacillus': 'bacilli', | |
| 261 'bacterium': 'bacteria', 'basis': 'bases', 'beau': 'beaux', | |
| 262 'bison': 'bison', 'bureau': 'bureaus', 'cactus': 'cacti', | |
| 263 'calf': 'calves', 'child': 'children', 'corps': 'corps', | |
| 264 'corpus': 'corpora', 'crisis': 'crises', 'criterion': 'criteria', | |
| 265 'curriculum': 'curricula', 'datum': 'data', 'deer': 'deer', | |
| 266 'diagnosis': 'diagnoses', 'die': 'dice', 'dwarf': 'dwarves', | |
| 267 'echo': 'echoes', 'elf': 'elves', 'ellipsis': 'ellipses', | |
| 268 'embargo': 'embargoes', 'emphasis': 'emphases', 'erratum': 'errata', | |
| 269 'fireman': 'firemen', 'fish': 'fish', 'focus': 'foci', | |
| 270 'foot': 'feet', 'formula': 'formulae', 'formula': 'formulas', | |
| 271 'fungus': 'fungi', 'genus': 'genera', 'goose': 'geese', | |
| 272 'half': 'halves', 'hero': 'heroes', 'hippopotamus': 'hippopotami', | |
| 273 'hoof': 'hooves', 'hypothesis': 'hypotheses', 'index': 'indices', | |
| 274 'knife': 'knives', 'leaf': 'leaves', 'life': 'lives', | |
| 275 'loaf': 'loaves', 'louse': 'lice', 'man': 'men', | |
| 276 'matrix': 'matrices', 'means': 'means', 'medium': 'media', | |
| 277 'memorandum': 'memoranda', 'millennium': 'milennia', 'moose': 'moose', | |
| 278 'mosquito': 'mosquitoes', 'mouse': 'mice', 'nebula': 'nebulae', | |
| 279 'neurosis': 'neuroses', 'nucleus': 'nuclei', 'oasis': 'oases', | |
| 280 'octopus': 'octopi', 'offspring': 'offspring', 'ovum': 'ova', | |
| 281 'ox': 'oxen', 'paralysis': 'paralyses', 'parenthesis': 'parentheses', | |
| 282 'person': 'people', 'phenomenon': 'phenomena', 'potato': 'potatoes', | |
| 283 'radius': 'radii', 'scarf': 'scarves', 'scissors': 'scissors', | |
| 284 'self': 'selves', 'sense': 'senses', 'series': 'series', 'sheep': | |
| 285 'sheep', 'shelf': 'shelves', 'species': 'species', 'stimulus': | |
| 286 'stimuli', 'stratum': 'strata', 'syllabus': 'syllabi', 'symposium': | |
| 287 'symposia', 'synopsis': 'synopses', 'synthesis': 'syntheses', | |
| 288 'tableau': 'tableaux', 'that': 'those', 'thesis': 'theses', | |
| 289 'thief': 'thieves', 'this': 'these', 'tomato': 'tomatoes', 'tooth': | |
| 290 'teeth', 'torpedo': 'torpedoes', 'vertebra': 'vertebrae', 'veto': | |
| 291 'vetoes', 'vita': 'vitae', 'watch': 'watches', 'wife': 'wives', | |
| 292 'wolf': 'wolves', 'woman': 'women'} | |
| 293 | |
| 294 | |
# Reverse index of the above (plural -> singular), used by singularize().
_IRR_P2S = dict([(v, k) for k, v in _IRR_S2P.items()])

# Matches a hashmark at start-of-string or after whitespace, capturing the
# following word characters.  The character class is intended to hold both
# the ASCII '#' and the full-width hashmark (U+FF03) -- confirm the second
# character survived any re-encoding of this file.  `{1}` is redundant.
HASHTAG_RE = re.compile(r"(?:^|\s)[##]{1}(\w+)", re.UNICODE)
| 299 | |
| 300 | |
def find_hashtags(string):
    """Finds and returns all hashtags in a string, with the hashmark
    removed. Supports full-width hashmarks for Asian languages and
    does not false-positive on URL anchors.

    >>> find_hashtags('#atag http://asite/#ananchor')
    ['atag']

    ``find_hashtags`` also works with unicode hashtags.
    """

    # the following works, doctest just struggles with it
    # >>> find_hashtags(u"can't get enough of that dignity chicken #肯德基 woo")
    # [u'\u80af\u5fb7\u57fa']
    # NOTE(review): the parameter name shadows the stdlib ``string`` module
    # imported above; harmless here, but renaming would break keyword callers.
    return HASHTAG_RE.findall(string)
| 316 | |
| 317 | |
def a10n(string):
    """That thing where "internationalization" becomes "i18n", what's it
    called? Abbreviation? Oh wait, no: ``a10n``. (It's actually a form
    of `numeronym`_.)

    >>> a10n('abbreviation')
    'a10n'
    >>> a10n('internationalization')
    'i18n'
    >>> a10n('')
    ''

    .. _numeronym: http://en.wikipedia.org/wiki/Numeronym
    """
    if len(string) >= 3:
        # first char + count of interior chars + last char
        return '%s%s%s' % (string[0], len(string) - 2, string[-1])
    return string
| 335 | |
| 336 | |
# Prefix of an ANSI escape sequence (ESC + '['), and the final letters
# that terminate the sequences strip_ansi() knows how to skip.
ANSI_ESCAPE_BEGIN = '\x1b['
ANSI_TERMINATORS = ('H', 'f', 'A', 'B', 'C', 'D', 'R', 's', 'u', 'J',
                    'K', 'h', 'l', 'p', 'm')
| 340 | |
| 341 | |
def strip_ansi(text):
    """Strips ANSI escape codes from *text*. Useful for the occasional
    time when a log or redirected output accidentally captures console
    color codes and the like.

    >>> strip_ansi('\x1b[0m\x1b[1;36mart\x1b[46;34m\xdc')
    'art'

    The test above is an excerpt from ANSI art on
    `sixteencolors.net`_. This function does not interpret or render
    ANSI art, but you can do so with `ansi2img`_ or `escapes.js`_.

    .. _sixteencolors.net: http://sixteencolors.net
    .. _ansi2img: http://www.bedroomlan.org/projects/ansi2img
    .. _escapes.js: https://github.com/atdt/escapes.js
    """
    # TODO: move to cliutils.py
    # Scan the text as a tiny state machine: *keep* is True while inside
    # printable text and False while skipping over an escape sequence.
    nansi, keep, i, text_len = [], True, 0, len(text)
    while i < text_len:
        if not keep and text[i] in ANSI_TERMINATORS:
            # terminator letter reached: escape sequence is over
            keep = True
        elif keep:
            # emit everything up to the start of the next escape sequence
            keep_end_i = text.find(ANSI_ESCAPE_BEGIN, i)
            if keep_end_i < 0:
                # no further escapes; note the remaining tail is only
                # returned via the early `return text` below when nothing
                # was stripped
                break
            else:
                nansi.append(text[i:keep_end_i])
                i, keep = keep_end_i, False
        i += 1
    if not nansi:
        # no escape sequences found at all; return the input unchanged
        return text
    return type(text)().join(nansi)  # attempted unicode + str support
| 374 | |
| 375 | |
def asciify(text, ignore=False):
    """Converts a unicode or bytestring, *text*, into a bytestring with
    just ascii characters. Performs basic deaccenting for all you
    Europhiles out there.

    Also, a gentle reminder that this is a **utility**, primarily meant
    for slugification. Whenever possible, make your application work
    **with** unicode, not against it.

    Args:
        text (str or unicode): The string to be asciified.
        ignore (bool): Configures final encoding to ignore remaining
            unasciified unicode instead of replacing it.

    >>> asciify('Beyoncé') == b'Beyonce'
    True
    """
    try:
        try:
            return text.encode('ascii')
        except UnicodeDecodeError:
            # this usually means a non-unicode (byte) string was passed
            # in; decode it first, then retry the plain-ascii path
            text = text.decode('utf-8')
            return text.encode('ascii')
    except UnicodeEncodeError:
        # non-ascii characters remain: deaccent, decompose, then encode
        mode = 'ignore' if ignore else 'replace'
        deaccented = text.translate(DEACCENT_MAP)
        normalized = unicodedata.normalize('NFKD', deaccented)
        return normalized.encode('ascii', mode)
| 407 | |
| 408 | |
def is_ascii(text):
    """Check if a unicode or bytestring, *text*, is composed of ascii
    characters only. Raises :exc:`ValueError` if argument is not text.

    Args:
        text (str or unicode): The string to be checked.

    >>> is_ascii('Beyoncé')
    False
    >>> is_ascii('Beyonce')
    True
    """
    if isinstance(text, unicode):
        try:
            text.encode('ascii')
        except UnicodeEncodeError:
            return False
        return True
    if isinstance(text, bytes):
        try:
            text.decode('ascii')
        except UnicodeDecodeError:
            return False
        return True
    raise ValueError('expected text or bytes, not %r' % type(text))
| 434 | |
| 435 | |
class DeaccenterDict(dict):
    "A small caching dictionary for deaccenting."
    def __missing__(self, key):
        # *key* is an int codepoint, as supplied by unicode.translate()
        ch = self.get(key)
        if ch is not None:
            return ch
        try:
            # a canonical decomposition looks like '0041 0300' (base
            # codepoint followed by combining mark codepoints)
            de = unicodedata.decomposition(unichr(key))
            p1, _, p2 = de.rpartition(' ')
            if int(p2, 16) == 0x308:
                # combining diaeresis: handled by the explicit Ae/Oe/Ue
                # entries instead of plain base-letter stripping
                ch = self.get(key)
            else:
                ch = int(p1, 16)
        except (IndexError, ValueError):
            # no decomposition available; map the codepoint to itself
            ch = self.get(key, key)
        self[key] = ch  # cache for subsequent lookups
        return ch

    try:
        from collections import defaultdict
    except ImportError:
        # no defaultdict means that __missing__ isn't supported in
        # this version of python, so we define __getitem__
        def __getitem__(self, key):
            try:
                return super(DeaccenterDict, self).__getitem__(key)
            except KeyError:
                return self.__missing__(key)
    else:
        del defaultdict
| 466 | |
| 467 | |
# http://chmullig.com/2009/12/python-unicode-ascii-ifier/
# For something more complete, investigate the unidecode
# or isounidecode packages, which are capable of performing
# crude transliteration.
# Maps int codepoints (the form unicode.translate() requires) to ASCII
# replacement strings; extended lazily at runtime by DeaccenterDict.
_BASE_DEACCENT_MAP = {
    0xc6: u"AE", # Æ LATIN CAPITAL LETTER AE
    0xd0: u"D", # Ð LATIN CAPITAL LETTER ETH
    0xd8: u"OE", # Ø LATIN CAPITAL LETTER O WITH STROKE
    0xde: u"Th", # Þ LATIN CAPITAL LETTER THORN
    0xc4: u'Ae', # Ä LATIN CAPITAL LETTER A WITH DIAERESIS
    0xd6: u'Oe', # Ö LATIN CAPITAL LETTER O WITH DIAERESIS
    0xdc: u'Ue', # Ü LATIN CAPITAL LETTER U WITH DIAERESIS
    0xc0: u"A", # À LATIN CAPITAL LETTER A WITH GRAVE
    0xc1: u"A", # Á LATIN CAPITAL LETTER A WITH ACUTE
    0xc3: u"A", # Ã LATIN CAPITAL LETTER A WITH TILDE
    0xc7: u"C", # Ç LATIN CAPITAL LETTER C WITH CEDILLA
    0xc8: u"E", # È LATIN CAPITAL LETTER E WITH GRAVE
    0xc9: u"E", # É LATIN CAPITAL LETTER E WITH ACUTE
    0xca: u"E", # Ê LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    0xcc: u"I", # Ì LATIN CAPITAL LETTER I WITH GRAVE
    0xcd: u"I", # Í LATIN CAPITAL LETTER I WITH ACUTE
    0xd2: u"O", # Ò LATIN CAPITAL LETTER O WITH GRAVE
    0xd3: u"O", # Ó LATIN CAPITAL LETTER O WITH ACUTE
    0xd5: u"O", # Õ LATIN CAPITAL LETTER O WITH TILDE
    0xd9: u"U", # Ù LATIN CAPITAL LETTER U WITH GRAVE
    0xda: u"U", # Ú LATIN CAPITAL LETTER U WITH ACUTE
    0xdf: u"ss", # ß LATIN SMALL LETTER SHARP S
    0xe6: u"ae", # æ LATIN SMALL LETTER AE
    0xf0: u"d", # ð LATIN SMALL LETTER ETH
    0xf8: u"oe", # ø LATIN SMALL LETTER O WITH STROKE
    0xfe: u"th", # þ LATIN SMALL LETTER THORN,
    0xe4: u'ae', # ä LATIN SMALL LETTER A WITH DIAERESIS
    0xf6: u'oe', # ö LATIN SMALL LETTER O WITH DIAERESIS
    0xfc: u'ue', # ü LATIN SMALL LETTER U WITH DIAERESIS
    0xe0: u"a", # à LATIN SMALL LETTER A WITH GRAVE
    0xe1: u"a", # á LATIN SMALL LETTER A WITH ACUTE
    0xe3: u"a", # ã LATIN SMALL LETTER A WITH TILDE
    0xe7: u"c", # ç LATIN SMALL LETTER C WITH CEDILLA
    0xe8: u"e", # è LATIN SMALL LETTER E WITH GRAVE
    0xe9: u"e", # é LATIN SMALL LETTER E WITH ACUTE
    0xea: u"e", # ê LATIN SMALL LETTER E WITH CIRCUMFLEX
    0xec: u"i", # ì LATIN SMALL LETTER I WITH GRAVE
    0xed: u"i", # í LATIN SMALL LETTER I WITH ACUTE
    0xf2: u"o", # ò LATIN SMALL LETTER O WITH GRAVE
    0xf3: u"o", # ó LATIN SMALL LETTER O WITH ACUTE
    0xf5: u"o", # õ LATIN SMALL LETTER O WITH TILDE
    0xf9: u"u", # ù LATIN SMALL LETTER U WITH GRAVE
    0xfa: u"u", # ú LATIN SMALL LETTER U WITH ACUTE
    0x2018: u"'", # ‘ LEFT SINGLE QUOTATION MARK
    0x2019: u"'", # ’ RIGHT SINGLE QUOTATION MARK
    0x201c: u'"', # “ LEFT DOUBLE QUOTATION MARK
    0x201d: u'"', # ” RIGHT DOUBLE QUOTATION MARK
    }
| 521 | |
| 522 | |
# Shared deaccenting table used by asciify(); unmapped codepoints are
# resolved (and cached) lazily via DeaccenterDict.__missing__.
DEACCENT_MAP = DeaccenterDict(_BASE_DEACCENT_MAP)


# 1024-based (size, symbol) table used by bytes2human().
_SIZE_SYMBOLS = ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
_SIZE_BOUNDS = [(1024 ** i, sym) for i, sym in enumerate(_SIZE_SYMBOLS)]
_SIZE_RANGES = list(zip(_SIZE_BOUNDS, _SIZE_BOUNDS[1:]))
| 529 | |
| 530 | |
def bytes2human(nbytes, ndigits=0):
    """Turns an integer value of *nbytes* into a human readable format. Set
    *ndigits* to control how many digits after the decimal point
    should be shown (default ``0``).

    >>> bytes2human(128991)
    '126K'
    >>> bytes2human(100001221)
    '95M'
    >>> bytes2human(0, 2)
    '0.00B'
    """
    abs_bytes = abs(nbytes)
    # walk the consecutive (bound, symbol) pairs until the value fits
    # below the next bound; if the loop never breaks, the variables keep
    # the last pair's lower bound (the largest unit in _SIZE_RANGES)
    for (size, symbol), (next_size, next_symbol) in _SIZE_RANGES:
        if abs_bytes <= next_size:
            break
    hnbytes = float(nbytes) / size
    return '{hnbytes:.{ndigits}f}{symbol}'.format(hnbytes=hnbytes,
                                                  ndigits=ndigits,
                                                  symbol=symbol)
| 551 | |
| 552 | |
class HTMLTextExtractor(HTMLParser):
    """Internal :class:`HTMLParser` subclass that accumulates only the
    text content (including decoded character/entity references) of the
    HTML fed to it; used by :func:`html2text`."""
    def __init__(self):
        # NOTE(review): HTMLParser.__init__ is deliberately not called;
        # reset() initializes the parser state, and strict/convert_charrefs
        # are set directly to span multiple Python/HTMLParser versions --
        # confirm before changing.
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.result = []

    def handle_data(self, d):
        # plain text between tags
        self.result.append(d)

    def handle_charref(self, number):
        # numeric character reference, decimal (&#916;) or hex (&#x394;)
        if number[0] == u'x' or number[0] == u'X':
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        self.result.append(unichr(codepoint))

    def handle_entityref(self, name):
        # named entity (&amp; etc.); unknown names are passed through as-is
        try:
            codepoint = htmlentitydefs.name2codepoint[name]
        except KeyError:
            self.result.append(u'&' + name + u';')
        else:
            self.result.append(unichr(codepoint))

    def get_text(self):
        # all accumulated fragments as a single string
        return u''.join(self.result)
| 580 | |
| 581 | |
def html2text(html):
    """Strips tags from HTML text, returning markup-free text. Also, does
    a best effort replacement of entities like "&nbsp;"

    >>> r = html2text(u'<a href="#">Test &amp;<em>(\u0394ημώ)</em></a>')
    >>> r == u'Test &(\u0394\u03b7\u03bc\u03ce)'
    True
    """
    # based on answers to http://stackoverflow.com/questions/753052/
    extractor = HTMLTextExtractor()
    extractor.feed(html)
    return extractor.get_text()
| 594 | |
| 595 | |
# Canned gzip payloads used as fixtures by the gunzip_bytes() doctests.
_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x089\xf3\xb9U\x00\x03empty\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00'
_NON_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x08\xbc\xf7\xb9U\x00\x03not_empty\x00K\xaa,I-N\xcc\xc8\xafT\xe4\x02\x00\xf3nb\xbf\x0b\x00\x00\x00'
| 598 | |
| 599 | |
def gunzip_bytes(bytestring):
    """Decompress a gzip-framed *bytestring* and return the raw bytes.

    A one-liner alternative to wrapping the data in a file-like object
    for the :mod:`gzip` module.

    >>> gunzip_bytes(_EMPTY_GZIP_BYTES) == b''
    True
    >>> gunzip_bytes(_NON_EMPTY_GZIP_BYTES).rstrip() == b'bytesahoy!'
    True
    """
    # wbits = MAX_WBITS | 16 tells zlib to expect gzip header/trailer
    return zlib.decompress(bytestring, zlib.MAX_WBITS | 16)
| 613 | |
| 614 | |
def gzip_bytes(bytestring, level=6):
    """Turn some bytes into some compressed bytes.

    Args:
        bytestring (bytes): Bytes to be compressed
        level (int): An integer, 1-9, controlling the
            speed/compression. 1 is fastest, least compressed, 9 is
            slowest, but most compressed.

    >>> len(gzip_bytes(b'a' * 10000))
    46
    """
    buf = StringIO()
    with GzipFile(fileobj=buf, mode='wb', compresslevel=level) as gz:
        gz.write(bytestring)
    return buf.getvalue()
| 635 | |
| 636 | |
| 637 | |
| 638 _line_ending_re = re.compile(r'(\r\n|\n|\x0b|\f|\r|\x85|\x2028|\x2029)', | |
| 639 re.UNICODE) | |
| 640 | |
| 641 | |
def iter_splitlines(text):
    r"""Like :meth:`str.splitlines`, but returns an iterator of lines
    instead of a list. Also similar to :meth:`file.next`, as that also
    lazily reads and yields lines from a file.

    This function works with a variety of line endings, but as always,
    be careful when mixing line endings within a file.

    >>> list(iter_splitlines('\nhi\nbye\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines('\r\nhi\rbye\r\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines(''))
    []
    """
    prev_end, len_text = 0, len(text)
    for match in _line_ending_re.finditer(text):
        start, end = match.start(1), match.end(1)
        if prev_end <= start:
            # the text between the previous line ending and this one
            # (may be '', e.g. for consecutive endings)
            yield text[prev_end:start]
        if end == len_text:
            # text ends with a line ending: emit the trailing empty line
            yield ''
        prev_end = end
    # anything after the final line ending (or the whole text, if no
    # endings were found and text is non-empty)
    tail = text[prev_end:]
    if tail:
        yield tail
    return
| 672 | |
| 673 | |
def indent(text, margin, newline='\n', key=bool):
    """The missing counterpart to the built-in :func:`textwrap.dedent`.

    Args:
        text (str): The text to indent.
        margin (str): The string to prepend to each line.
        newline (str): The newline used to rejoin the lines (default: ``\\n``)
        key (callable): Called on each line to determine whether to
          indent it. Default: :class:`bool`, to ensure that empty lines do
          not get whitespace added.
    """
    out = []
    for line in iter_splitlines(text):
        out.append(margin + line if key(line) else line)
    return newline.join(out)
| 688 | |
| 689 | |
def is_uuid(obj, version=4):
    """Check the argument is either a valid UUID object or string.

    Args:
        obj (object): The test target. Strings and UUID objects supported.
        version (int): The target UUID version, set to 0 to skip version check.

    >>> is_uuid('e682ccca-5a4c-4ef2-9711-73f9ad1e15ea')
    True
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9')
    False
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9', version=1)
    True
    """
    candidate = obj
    if not isinstance(candidate, uuid.UUID):
        try:
            candidate = uuid.UUID(candidate)
        except (TypeError, ValueError, AttributeError):
            # not parseable as a UUID at all
            return False
    # a falsy *version* (e.g. 0) skips the version check entirely
    return not version or candidate.version == int(version)
| 712 | |
| 713 | |
def escape_shell_args(args, sep=' ', style=None):
    """Returns an escaped version of each string in *args*, according to
    *style*.

    Args:
        args (list): A list of arguments to escape and join together
        sep (str): The separator used to join the escaped arguments.
        style (str): The style of escaping to use. Can be one of
          ``cmd`` or ``sh``, geared toward Windows and Linux/BSD/etc.,
          respectively. If *style* is ``None``, then it is picked
          according to the system platform.

    See :func:`args2cmd` and :func:`args2sh` for details and example
    output for each style.
    """
    if not style:
        # default to the convention of the current platform
        style = 'cmd' if sys.platform == 'win32' else 'sh'
    try:
        escaper = {'sh': args2sh, 'cmd': args2cmd}[style]
    except KeyError:
        raise ValueError("style expected one of 'cmd' or 'sh', not %r" % style)
    return escaper(args, sep=sep)
| 738 | |
| 739 | |
# Matches any character NOT in the sh-safe set; a hit means the argument
# must be single-quoted by args2sh().
_find_sh_unsafe = re.compile(r'[^a-zA-Z0-9_@%+=:,./-]').search
| 741 | |
| 742 | |
def args2sh(args, sep=' '):
    """Return a shell-escaped string version of *args*, joined by *sep*
    (default ``' '``), based on the rules of sh, bash, and other shells
    in the Linux/BSD/MacOS ecosystem.

    Args:
        args (list): strings to escape and join.
        sep (str): separator placed between the escaped arguments.

    >>> print(args2sh(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa '[bb]' 'cc'"'"'cc' 'dd"dd'

    As you can see, arguments with no special characters are not
    escaped, arguments with special characters are quoted with single
    quotes, and single quotes themselves are quoted with double
    quotes. Double quotes are handled like any other special
    character.

    Based on code from the :mod:`pipes`/:mod:`shlex` modules. Also
    note that :mod:`shlex` and :mod:`argparse` have functions to split
    and parse strings escaped in this manner.
    """
    ret_list = []

    for arg in args:
        if not arg:
            # an empty string must still survive as a (quoted) argument
            ret_list.append("''")
            continue
        # any character outside the safe set forces quoting
        # (re caches compiled patterns, so this is cheap per-call)
        if re.search(r'[^a-zA-Z0-9_@%+=:,./-]', arg) is None:
            ret_list.append(arg)
            continue
        # use single quotes, and put single quotes into double quotes
        # the string $'b is then quoted as '$'"'"'b'
        ret_list.append("'" + arg.replace("'", "'\"'\"'") + "'")

    # BUG FIX: previously joined with a hard-coded ' ', silently
    # ignoring the *sep* parameter.
    return sep.join(ret_list)
| 775 | |
| 776 | |
def args2cmd(args, sep=' '):
    r"""Return a shell-escaped string version of *args*, joined by *sep*
    (default ``' '``), using the same rules as the Microsoft C runtime.

    Args:
        args (list): strings to escape and join.
        sep (str): separator placed between the escaped arguments.

    >>> print(args2cmd(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa [bb] cc'cc dd\"dd

    As you can see, escaping is through backslashing and not quoting,
    and double quotes are the only special character. See the comment
    in the code for more details. Based on internal code from the
    :mod:`subprocess` module.

    """
    # technique description from subprocess below
    """
    1) Arguments are delimited by white space, which is either a
       space or a tab.

    2) A string surrounded by double quotation marks is
       interpreted as a single argument, regardless of white space
       contained within.  A quoted string can be embedded in an
       argument.

    3) A double quotation mark preceded by a backslash is
       interpreted as a literal double quotation mark.

    4) Backslashes are interpreted literally, unless they
       immediately precede a double quotation mark.

    5) If backslashes immediately precede a double quotation mark,
       every pair of backslashes is interpreted as a literal
       backslash.  If the number of backslashes is odd, the last
       backslash escapes the next double quotation mark as
       described in rule 3.

    See http://msdn.microsoft.com/en-us/library/17w5ykft.aspx
    or search http://msdn.microsoft.com for
    "Parsing C++ Command-Line Arguments"
    """
    result = []
    needquote = False
    for arg in args:
        # pending run of backslashes; resolved once we see what follows
        bs_buf = []

        # Add the separator before every argument but the first
        if result:
            # BUG FIX: previously appended a hard-coded ' ', silently
            # ignoring the *sep* parameter.
            result.append(sep)

        # only whitespace (or an empty argument) forces double-quoting
        needquote = (" " in arg) or ("\t" in arg) or not arg
        if needquote:
            result.append('"')

        for c in arg:
            if c == '\\':
                # Don't know if we need to double yet (rules 4/5 above).
                bs_buf.append(c)
            elif c == '"':
                # Double up the buffered backslashes, then escape the quote.
                result.append('\\' * len(bs_buf)*2)
                bs_buf = []
                result.append('\\"')
            else:
                # Normal char: buffered backslashes were literal.
                if bs_buf:
                    result.extend(bs_buf)
                    bs_buf = []
                result.append(c)

        # Add remaining backslashes, if any.
        if bs_buf:
            result.extend(bs_buf)

        if needquote:
            # bs_buf is deliberately NOT cleared above: trailing
            # backslashes must be doubled when followed by the closing
            # quote (rule 5), so they are appended a second time here.
            result.extend(bs_buf)
            result.append('"')

    return ''.join(result)
| 854 | |
| 855 | |
def parse_int_list(range_string, delim=',', range_delim='-'):
    """Returns a sorted list of positive integers based on
    *range_string*. Reverse of :func:`format_int_list`.

    Args:
        range_string (str): String of comma separated positive
            integers or ranges (e.g. '1,2,4-6,8'). Typical of a custom
            page range string used in printer dialogs.
        delim (char): Defaults to ','. Separates integers and
            contiguous ranges of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous
            range of integers.

    >>> parse_int_list('1,3,5-8,10-11,15')
    [1, 3, 5, 6, 7, 8, 10, 11, 15]

    """
    parsed = []

    for token in range_string.strip().split(delim):
        # Skip empty tokens, e.g. from '1,,2' or an all-blank string.
        if not token:
            continue
        if range_delim in token:
            # A contiguous range; the bounds may appear in either order.
            bounds = [int(part) for part in token.split(range_delim)]
            parsed.extend(range(min(bounds), max(bounds) + 1))
        else:
            # A lone integer.
            parsed.append(int(token))

    return sorted(parsed)
| 891 | |
| 892 | |
def format_int_list(int_list, delim=',', range_delim='-', delim_space=False):
    """Returns a sorted range string from a list of positive integers
    (*int_list*). Contiguous ranges of integers are collapsed to min
    and max values. Reverse of :func:`parse_int_list`.

    Args:
        int_list (list): List of positive integers to be converted
            into a range string (e.g. [1,2,4,5,6,8]).
        delim (char): Defaults to ','. Separates integers and
            contiguous ranges of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous
            range of integers.
        delim_space (bool): Defaults to ``False``. If ``True``, adds a
            space after all *delim* characters.

    >>> format_int_list([1,3,5,6,7,8,10,11,15])
    '1,3,5-8,10-11,15'

    """
    # Deduplicate and sort, then sweep once, either extending the
    # current contiguous run or starting a new one.
    runs = []  # list of [start, end] pairs, inclusive
    for val in sorted(set(int_list)):
        if runs and val == runs[-1][1] + 1:
            runs[-1][1] = val  # extends the current run
        else:
            runs.append([val, val])  # starts a new run

    # Render each run: single values stand alone, longer runs collapse
    # to 'start<range_delim>end'.
    parts = []
    for start, end in runs:
        if start == end:
            parts.append('{0:d}'.format(start))
        else:
            parts.append('{0:d}{1}{2:d}'.format(start, range_delim, end))

    joiner = delim + ' ' if delim_space else delim
    return joiner.join(parts)
| 981 | |
| 982 | |
class MultiReplace(object):
    """
    MultiReplace is a tool for doing multiple find/replace actions in one pass.

    Given a mapping of values to be replaced it allows for all of the matching
    values to be replaced in a single pass which can save a lot of performance
    on very large strings. In addition to simple replace, it also allows for
    replacing based on regular expressions.

    Keyword Arguments:

    :type regex: bool
    :param regex: Treat search keys as regular expressions [Default: False]
    :type flags: int
    :param flags: flags to pass to the regex engine during compile

    Dictionary Usage::

        from lrmslib import stringutils
        s = stringutils.MultiReplace({
            'foo': 'zoo',
            'cat': 'hat',
            'bat': 'kraken'
        })
        new = s.sub('The foo bar cat ate a bat')
        new == 'The zoo bar hat ate a kraken'

    Iterable Usage::

        from lrmslib import stringutils
        s = stringutils.MultiReplace([
            ('foo', 'zoo'),
            ('cat', 'hat'),
            ('bat', 'kraken')
        ])
        new = s.sub('The foo bar cat ate a bat')
        new == 'The zoo bar hat ate a kraken'


    The constructor can be passed a dictionary or other mapping as well as
    an iterable of tuples. If given an iterable, the substitution will be run
    in the order the replacement values are specified in the iterable. This is
    also true if it is given an OrderedDict. If given a dictionary then the
    order will be non-deterministic::

        >>> 'foo bar baz'.replace('foo', 'baz').replace('baz', 'bar')
        'bar bar bar'
        >>> m = MultiReplace({'foo': 'baz', 'baz': 'bar'})
        >>> m.sub('foo bar baz')
        'baz bar bar'

    This is because the order of replacement can matter if you're inserting
    something that might be replaced by a later substitution. Pay attention and
    if you need to rely on order then consider using a list of tuples instead
    of a dictionary.
    """

    def __init__(self, sub_map, **kwargs):
        """Compile any regular expressions that have been passed."""
        options = {
            'regex': False,
            'flags': 0,
        }
        options.update(kwargs)
        self.group_map = {}
        regex_values = []

        if isinstance(sub_map, Mapping):
            sub_map = sub_map.items()

        for idx, vals in enumerate(sub_map):
            group_name = 'group{0}'.format(idx)
            # Compiled patterns expose .pattern; plain strings do not.
            # Duck-typing here avoids relying on the py2/py3 basestring
            # compatibility shim while accepting the same inputs.
            pattern_attr = getattr(vals[0], 'pattern', None)
            if pattern_attr is not None:
                exp = pattern_attr
            elif options['regex']:
                exp = vals[0]
            else:
                # not treating input strings like a regex, so escape it
                exp = re.escape(vals[0])

            # Each search key becomes a named group in one big alternation.
            regex_values.append('(?P<{0}>{1})'.format(group_name, exp))
            self.group_map[group_name] = vals[1]

        self.combined_pattern = re.compile(
            '|'.join(regex_values),
            flags=options['flags']
        )

    def _get_value(self, match):
        """Given a match object find replacement value."""
        group_dict = match.groupdict()
        # bugfix: compare against None rather than testing truthiness --
        # a group that legitimately matched the empty string is falsy,
        # and the old truthiness check raised IndexError in that case.
        key = [x for x in group_dict if group_dict[x] is not None][0]
        return self.group_map[key]

    def sub(self, text):
        """
        Run substitutions on the input text.

        Given an input string, run all substitutions given in the
        constructor.
        """
        return self.combined_pattern.sub(self._get_value, text)
| 1089 | |
| 1090 | |
def multi_replace(text, sub_map, **kwargs):
    """Convenience shortcut: build a :class:`MultiReplace` from *sub_map*
    and apply it to *text* in a single call."""
    return MultiReplace(sub_map, **kwargs).sub(text)
| 1095 | |
| 1096 | |
def unwrap_text(text, ending='\n\n'):
    r"""
    Unwrap text, the natural complement to :func:`textwrap.wrap`.

    >>> text = "Short \n lines \nwrapped\nsmall.\n\nAnother\nparagraph."
    >>> unwrap_text(text)
    'Short lines wrapped small.\n\nAnother paragraph.'

    Args:
        text: A string to unwrap.
        ending (str): The string to join all unwrapped paragraphs
            by. Pass ``None`` to get the list. Defaults to '\n\n' for
            compatibility with Markdown and RST.

    """
    paragraphs = []
    pieces = []  # stripped lines of the paragraph being accumulated
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            # A blank line closes the current paragraph (which may itself
            # be empty, preserving consecutive blank lines in the output).
            paragraphs.append(' '.join(pieces))
            pieces = []
        else:
            pieces.append(stripped)
    if pieces:
        # Flush a trailing paragraph not followed by a blank line.
        paragraphs.append(' '.join(pieces))
    if ending is None:
        return paragraphs
    return ending.join(paragraphs)
