comparison env/lib/python3.9/site-packages/boltons/strutils.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 # -*- coding: utf-8 -*-
2 """So much practical programming involves string manipulation, which
3 Python readily accommodates. Still, there are dozens of basic and
4 common capabilities missing from the standard library, several of them
5 provided by ``strutils``.
6 """
7
8 from __future__ import print_function
9
10 import re
11 import sys
12 import uuid
13 import zlib
14 import string
15 import unicodedata
16 import collections
17 from gzip import GzipFile
18
19 try:
20 from cStringIO import cStringIO as StringIO
21 except ImportError:
22 from io import BytesIO as StringIO
23
24 try:
25 from collections.abc import Mapping
26 except ImportError:
27 from collections import Mapping
28
29 try:
30 unicode, str, bytes, basestring = unicode, str, str, basestring
31 from HTMLParser import HTMLParser
32 import htmlentitydefs
33 except NameError: # basestring not defined in Python 3
34 unicode, str, bytes, basestring = str, bytes, bytes, (str, bytes)
35 unichr = chr
36 from html.parser import HTMLParser
37 from html import entities as htmlentitydefs
38
39 try:
40 import __builtin__ as builtins
41 except ImportError:
42 import builtins
43
# Public names exported by this module.
# NOTE(review): 'int_list_complement', 'int_list_to_int_tuples' and
# 'unwrap_text' are listed here but this chunk defines
# 'complement_int_list' (and no visible 'int_list_to_int_tuples' /
# 'unwrap_text') -- confirm matching aliases exist later in the file.
__all__ = ['camel2under', 'under2camel', 'slugify', 'split_punct_ws',
           'unit_len', 'ordinalize', 'cardinalize', 'pluralize', 'singularize',
           'asciify', 'is_ascii', 'is_uuid', 'html2text', 'strip_ansi',
           'bytes2human', 'find_hashtags', 'a10n', 'gzip_bytes', 'gunzip_bytes',
           'iter_splitlines', 'indent', 'escape_shell_args',
           'args2cmd', 'args2sh', 'parse_int_list', 'format_int_list',
           'int_list_complement', 'int_list_to_int_tuples', 'unwrap_text']
51
52
# All characters treated as separators by split_punct_ws()/slugify().
_punct_ws_str = string.punctuation + string.whitespace
# Matches one or more consecutive punctuation/whitespace characters.
_punct_re = re.compile('[' + _punct_ws_str + ']+')
# Matches camelCase word boundaries: an uppercase letter preceded by a
# lowercase letter or digit, or a non-initial uppercase letter followed
# by a lowercase letter.
_camel2under_re = re.compile('((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))')
56
57
def camel2under(camel_string):
    """Convert a camelcased string to underscore-separated form.
    Useful for turning a class name into a function name.

    >>> camel2under('BasicParseTest')
    'basic_parse_test'
    """
    underscored = _camel2under_re.sub(r'_\1', camel_string)
    return underscored.lower()
66
67
def under2camel(under_string):
    """Convert an underscore-separated string to CamelCase. Useful
    for turning a function name into a class name.

    >>> under2camel('complex_tokenizer')
    'ComplexTokenizer'
    """
    pieces = under_string.split('_')
    # an empty piece (from consecutive underscores) keeps a literal '_'
    return ''.join(piece.capitalize() or '_' for piece in pieces)
76
77
def slugify(text, delim='_', lower=True, ascii=False):
    """Turn text full of scary characters (i.e., punctuation and
    whitespace) into a relatively safe string separated only by the
    delimiter specified by *delim*, which defaults to ``_``.

    The *ascii* convenience flag will :func:`asciify` the slug if
    you require ascii-only slugs.

    >>> slugify('First post! Hi!!!!~1    ')
    'first_post_hi_1'

    >>> slugify("Kurt Gödel's pretty cool.", ascii=True) == \
        b'kurt_goedel_s_pretty_cool'
    True

    """
    if text:
        slug = delim.join(split_punct_ws(text)) or delim
    else:
        slug = ''
    if ascii:
        slug = asciify(slug)
    if lower:
        slug = slug.lower()
    return slug
102
103
def split_punct_ws(text):
    """Split *text* on punctuation **and** whitespace, unlike
    :meth:`str.split` which only splits on whitespace. Used internally
    by :func:`slugify`, above.

    >>> split_punct_ws('First post! Hi!!!!~1    ')
    ['First', 'post', 'Hi', '1']
    """
    # filter(None, ...) drops the empty strings between separators
    return list(filter(None, _punct_re.split(text)))
113
114
def unit_len(sized_iterable, unit_noun='item'):  # TODO: len_units()/unitize()?
    """Return a plain-English description of an iterable's
    :func:`len()`, conditionally pluralized with :func:`cardinalize`,
    detailed below.

    >>> print(unit_len(range(10), 'number'))
    10 numbers
    >>> print(unit_len('aeiou', 'vowel'))
    5 vowels
    >>> print(unit_len([], 'worry'))
    No worries
    """
    count = len(sized_iterable)
    noun = cardinalize(unit_noun, count)
    if not count:
        return u'No %s' % (noun,)
    return u'%s %s' % (count, noun)
132
133
# Maps a number's final digit to its ordinal suffix. Digits not listed
# (0, 4-9) and the teens (handled separately in ordinalize) take 'th'.
_ORDINAL_MAP = {'1': 'st',
                '2': 'nd',
                '3': 'rd'}  # 'th' is the default
137
138
def ordinalize(number, ext_only=False):
    """Turn *number* into its ordinal form, i.e., 1st, 2nd, 3rd, 4th,
    etc. If the last character isn't a digit, the string value is
    returned unchanged.

    Args:
        number (int or str): Number to be ordinalized.
        ext_only (bool): Whether to return only the suffix. Default ``False``.

    >>> print(ordinalize(1))
    1st
    >>> print(ordinalize(3694839230))
    3694839230th
    >>> print(ordinalize('hi'))
    hi
    >>> print(ordinalize(1515))
    1515th
    """
    numstr, suffix = unicode(number), ''
    if numstr and numstr[-1] in string.digits:
        # the teens (11th, 12th, 13th, ...) always take 'th'
        if len(numstr) > 1 and numstr[-2] == '1':
            suffix = 'th'
        else:
            suffix = _ORDINAL_MAP.get(numstr[-1], 'th')
    if ext_only:
        return suffix
    return numstr + suffix
173
174
def cardinalize(unit_noun, count):
    """Conditionally pluralize the singular word *unit_noun* if
    *count* is not one, preserving case when possible.

    >>> vowels = 'aeiou'
    >>> print(len(vowels), cardinalize('vowel', len(vowels)))
    5 vowels
    >>> print(3, cardinalize('Wish', 3))
    3 Wishes
    """
    return unit_noun if count == 1 else pluralize(unit_noun)
188
189
def singularize(word):
    """Semi-intelligently convert an English plural *word* to its
    singular form, preserving case pattern.

    >>> singularize('chances')
    'chance'
    >>> singularize('Activities')
    'Activity'
    >>> singularize('Glasses')
    'Glass'
    >>> singularize('FEET')
    'FOOT'

    """
    original, normalized = word, word.strip().lower()
    # already-singular irregulars (and empty input) pass through
    if not normalized or normalized in _IRR_S2P:
        return original

    irregular = _IRR_P2S.get(normalized)
    if irregular:
        sing = irregular
    elif not normalized.endswith('s'):
        return original
    elif len(normalized) == 2:
        sing = normalized[:-1]  # or just return word?
    elif normalized.endswith('ies') and normalized[-4:-3] not in 'aeiou':
        sing = normalized[:-3] + 'y'
    elif normalized.endswith('es') and normalized[-3] == 's':
        sing = normalized[:-2]
    else:
        sing = normalized[:-1]
    return _match_case(original, sing)
222
223
def pluralize(word):
    """Semi-intelligently convert an English *word* from singular form
    to plural, preserving case pattern.

    >>> pluralize('friend')
    'friends'
    >>> pluralize('enemy')
    'enemies'
    >>> pluralize('Sheep')
    'Sheep'
    """
    original, normalized = word, word.strip().lower()
    # already-plural irregulars (and empty input) pass through
    if not normalized or normalized in _IRR_P2S:
        return original
    irregular = _IRR_S2P.get(normalized)
    if irregular:
        plural_form = irregular
    elif normalized.endswith('y') and normalized[-2:-1] not in 'aeiou':
        plural_form = normalized[:-1] + 'ies'
    elif normalized[-1] == 's' or normalized.endswith(('ch', 'sh')):
        plural_form = normalized if normalized.endswith('es') else normalized + 'es'
    else:
        plural_form = normalized + 's'
    return _match_case(original, plural_form)
248
249
250 def _match_case(master, disciple):
251 if not master.strip():
252 return disciple
253 if master.lower() == master:
254 return disciple.lower()
255 elif master.upper() == master:
256 return disciple.upper()
257 elif master.title() == master:
258 return disciple.title()
259 return disciple
260
261
262 # Singular to plural map of irregular pluralizations
263 _IRR_S2P = {'addendum': 'addenda', 'alga': 'algae', 'alumna': 'alumnae',
264 'alumnus': 'alumni', 'analysis': 'analyses', 'antenna': 'antennae',
265 'appendix': 'appendices', 'axis': 'axes', 'bacillus': 'bacilli',
266 'bacterium': 'bacteria', 'basis': 'bases', 'beau': 'beaux',
267 'bison': 'bison', 'bureau': 'bureaus', 'cactus': 'cacti',
268 'calf': 'calves', 'child': 'children', 'corps': 'corps',
269 'corpus': 'corpora', 'crisis': 'crises', 'criterion': 'criteria',
270 'curriculum': 'curricula', 'datum': 'data', 'deer': 'deer',
271 'diagnosis': 'diagnoses', 'die': 'dice', 'dwarf': 'dwarves',
272 'echo': 'echoes', 'elf': 'elves', 'ellipsis': 'ellipses',
273 'embargo': 'embargoes', 'emphasis': 'emphases', 'erratum': 'errata',
274 'fireman': 'firemen', 'fish': 'fish', 'focus': 'foci',
275 'foot': 'feet', 'formula': 'formulae', 'formula': 'formulas',
276 'fungus': 'fungi', 'genus': 'genera', 'goose': 'geese',
277 'half': 'halves', 'hero': 'heroes', 'hippopotamus': 'hippopotami',
278 'hoof': 'hooves', 'hypothesis': 'hypotheses', 'index': 'indices',
279 'knife': 'knives', 'leaf': 'leaves', 'life': 'lives',
280 'loaf': 'loaves', 'louse': 'lice', 'man': 'men',
281 'matrix': 'matrices', 'means': 'means', 'medium': 'media',
282 'memorandum': 'memoranda', 'millennium': 'milennia', 'moose': 'moose',
283 'mosquito': 'mosquitoes', 'mouse': 'mice', 'nebula': 'nebulae',
284 'neurosis': 'neuroses', 'nucleus': 'nuclei', 'oasis': 'oases',
285 'octopus': 'octopi', 'offspring': 'offspring', 'ovum': 'ova',
286 'ox': 'oxen', 'paralysis': 'paralyses', 'parenthesis': 'parentheses',
287 'person': 'people', 'phenomenon': 'phenomena', 'potato': 'potatoes',
288 'radius': 'radii', 'scarf': 'scarves', 'scissors': 'scissors',
289 'self': 'selves', 'sense': 'senses', 'series': 'series', 'sheep':
290 'sheep', 'shelf': 'shelves', 'species': 'species', 'stimulus':
291 'stimuli', 'stratum': 'strata', 'syllabus': 'syllabi', 'symposium':
292 'symposia', 'synopsis': 'synopses', 'synthesis': 'syntheses',
293 'tableau': 'tableaux', 'that': 'those', 'thesis': 'theses',
294 'thief': 'thieves', 'this': 'these', 'tomato': 'tomatoes', 'tooth':
295 'teeth', 'torpedo': 'torpedoes', 'vertebra': 'vertebrae', 'veto':
296 'vetoes', 'vita': 'vitae', 'watch': 'watches', 'wife': 'wives',
297 'wolf': 'wolves', 'woman': 'women'}
298
299
# Plural-to-singular map, derived by inverting _IRR_S2P.
_IRR_P2S = dict([(v, k) for k, v in _IRR_S2P.items()])

# Matches a hashtag at the start of the string or after whitespace
# (so URL anchors like 'http://x/#frag' don't match); accepts the
# ASCII '#' and full-width '＃' hashmarks and captures the tag body.
HASHTAG_RE = re.compile(r"(?:^|\s)[＃#]{1}(\w+)", re.UNICODE)
304
305
def find_hashtags(string):
    """Find and return all hashtags in a string, with the hashmark
    removed. Supports full-width hashmarks for Asian languages and
    does not false-positive on URL anchors.

    >>> find_hashtags('#atag http://asite/#ananchor')
    ['atag']

    ``find_hashtags`` also works with unicode hashtags.
    """
    # the following works, doctest just struggles with it
    # >>> find_hashtags(u"can't get enough of that dignity chicken #肯德基 woo")
    # [u'\u80af\u5fb7\u57fa']
    return HASHTAG_RE.findall(string)
321
322
def a10n(string):
    """That thing where "internationalization" becomes "i18n", what's it
    called? Abbreviation? Oh wait, no: ``a10n``. (It's actually a form
    of `numeronym`_.)

    >>> a10n('abbreviation')
    'a10n'
    >>> a10n('internationalization')
    'i18n'
    >>> a10n('')
    ''

    .. _numeronym: http://en.wikipedia.org/wiki/Numeronym
    """
    # strings shorter than 3 chars have no interior to abbreviate
    if len(string) < 3:
        return string
    head, interior, tail = string[0], string[1:-1], string[-1]
    return '%s%s%s' % (head, len(interior), tail)
340
341
# Based on https://en.wikipedia.org/wiki/ANSI_escape_code#Escape_sequences
# Matches a complete ANSI escape sequence: either a two-character ESC
# sequence or a full CSI sequence (ESC [ params intermediates final).
ANSI_SEQUENCES = re.compile(r'''
    \x1B            # Sequence starts with ESC, i.e. hex 0x1B
    (?:
        [@-Z\\-_]   # Second byte:
                    #   all 0x40–0x5F range but CSI char, i.e ASCII @A–Z\]^_
    |               # Or
        \[          # CSI sequences, starting with [
        [0-?]*      # Parameter bytes:
                    #   range 0x30–0x3F, ASCII 0–9:;<=>?
        [ -/]*      # Intermediate bytes:
                    #   range 0x20–0x2F, ASCII space and !"#$%&'()*+,-./
        [@-~]       # Final byte
                    #   range 0x40–0x7E, ASCII @A–Z[\]^_`a–z{|}~
    )
''', re.VERBOSE)
358
359
def strip_ansi(text):
    """Strip ANSI escape codes from *text*. Useful for the occasional
    time when a log or redirected output accidentally captures console
    color codes and the like.

    >>> strip_ansi('\x1b[0m\x1b[1;36mart\x1b[46;34m')
    'art'

    Supports unicode, str, bytes and bytearray content as input. Returns the
    same type as the input.

    There's a lot of ANSI art available for testing on `sixteencolors.net`_.
    This function does not interpret or render ANSI art, but you can do so with
    `ansi2img`_ or `escapes.js`_.

    .. _sixteencolors.net: http://sixteencolors.net
    .. _ansi2img: http://www.bedroomlan.org/projects/ansi2img
    .. _escapes.js: https://github.com/atdt/escapes.js
    """
    # TODO: move to cliutils.py

    # Remember the input type so bytes-ish input can be restored below.
    target_type = None
    # The module-level `unicode` alias equals builtins.str only on
    # Python 3; there, decode bytes so the regex can match.
    is_py3 = (unicode == builtins.str)
    if is_py3 and isinstance(text, (bytes, bytearray)):
        target_type = type(text)
        text = text.decode('utf-8')

    stripped = ANSI_SEQUENCES.sub('', text)

    # Re-encode back to the caller's bytes/bytearray type if needed.
    if target_type and target_type != type(stripped):
        stripped = target_type(stripped, 'utf-8')

    return stripped
397
398
def asciify(text, ignore=False):
    """Convert a unicode or bytestring, *text*, into a bytestring with
    just ascii characters. Performs basic deaccenting for all you
    Europhiles out there.

    Also, a gentle reminder that this is a **utility**, primarily meant
    for slugification. Whenever possible, make your application work
    **with** unicode, not against it.

    Args:
        text (str or unicode): The string to be asciified.
        ignore (bool): Configures final encoding to ignore remaining
            unasciified unicode instead of replacing it.

    >>> asciify('Beyoncé') == b'Beyonce'
    True
    """
    try:
        try:
            return text.encode('ascii')
        except UnicodeDecodeError:
            # this usually means you passed in a non-unicode string
            text = text.decode('utf-8')
            return text.encode('ascii')
    except UnicodeEncodeError:
        # deaccent via the translate map, then decompose and strip
        # remaining combining marks through the ascii codec
        mode = 'ignore' if ignore else 'replace'
        decomposed = unicodedata.normalize('NFKD', text.translate(DEACCENT_MAP))
        return decomposed.encode('ascii', mode)
430
431
def is_ascii(text):
    """Check if a unicode or bytestring, *text*, is composed of ascii
    characters only. Raises :exc:`ValueError` if argument is not text.

    Args:
        text (str or unicode): The string to be checked.

    >>> is_ascii('Beyoncé')
    False
    >>> is_ascii('Beyonce')
    True
    """
    if isinstance(text, unicode):
        try:
            text.encode('ascii')
        except UnicodeEncodeError:
            return False
        return True
    if isinstance(text, bytes):
        try:
            text.decode('ascii')
        except UnicodeDecodeError:
            return False
        return True
    raise ValueError('expected text or bytes, not %r' % type(text))
457
458
class DeaccenterDict(dict):
    "A small caching dictionary for deaccenting."
    def __missing__(self, key):
        # *key* is an integer codepoint, per the str.translate mapping
        # protocol; the computed replacement is cached in self.
        ch = self.get(key)
        if ch is not None:
            return ch
        try:
            # canonical decomposition, e.g. u'é' -> '0065 0301'
            de = unicodedata.decomposition(unichr(key))
            p1, _, p2 = de.rpartition(' ')
            if int(p2, 16) == 0x308:
                # COMBINING DIAERESIS: don't strip to the base letter;
                # umlaut transliterations are handled by explicit
                # entries (e.g. 0xfc -> 'ue') in the seed map.
                ch = self.get(key)
            else:
                ch = int(p1, 16)
        except (IndexError, ValueError):
            # no usable decomposition; map the codepoint to itself
            ch = self.get(key, key)
        self[key] = ch
        return ch

    try:
        from collections import defaultdict
    except ImportError:
        # no defaultdict means that __missing__ isn't supported in
        # this version of python, so we define __getitem__
        def __getitem__(self, key):
            try:
                return super(DeaccenterDict, self).__getitem__(key)
            except KeyError:
                return self.__missing__(key)
    else:
        del defaultdict
489
490
# http://chmullig.com/2009/12/python-unicode-ascii-ifier/
# For something more complete, investigate the unidecode
# or isounidecode packages, which are capable of performing
# crude transliteration.
# Seed entries for DEACCENT_MAP below: codepoint -> ascii replacement,
# used via str.translate in asciify().
_BASE_DEACCENT_MAP = {
    0xc6: u"AE",  # Æ LATIN CAPITAL LETTER AE
    0xd0: u"D",   # Ð LATIN CAPITAL LETTER ETH
    0xd8: u"OE",  # Ø LATIN CAPITAL LETTER O WITH STROKE
    0xde: u"Th",  # Þ LATIN CAPITAL LETTER THORN
    0xc4: u'Ae',  # Ä LATIN CAPITAL LETTER A WITH DIAERESIS
    0xd6: u'Oe',  # Ö LATIN CAPITAL LETTER O WITH DIAERESIS
    0xdc: u'Ue',  # Ü LATIN CAPITAL LETTER U WITH DIAERESIS
    0xc0: u"A",   # À LATIN CAPITAL LETTER A WITH GRAVE
    0xc1: u"A",   # Á LATIN CAPITAL LETTER A WITH ACUTE
    0xc3: u"A",   # Ã LATIN CAPITAL LETTER A WITH TILDE
    0xc7: u"C",   # Ç LATIN CAPITAL LETTER C WITH CEDILLA
    0xc8: u"E",   # È LATIN CAPITAL LETTER E WITH GRAVE
    0xc9: u"E",   # É LATIN CAPITAL LETTER E WITH ACUTE
    0xca: u"E",   # Ê LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    0xcc: u"I",   # Ì LATIN CAPITAL LETTER I WITH GRAVE
    0xcd: u"I",   # Í LATIN CAPITAL LETTER I WITH ACUTE
    0xd2: u"O",   # Ò LATIN CAPITAL LETTER O WITH GRAVE
    0xd3: u"O",   # Ó LATIN CAPITAL LETTER O WITH ACUTE
    0xd5: u"O",   # Õ LATIN CAPITAL LETTER O WITH TILDE
    0xd9: u"U",   # Ù LATIN CAPITAL LETTER U WITH GRAVE
    0xda: u"U",   # Ú LATIN CAPITAL LETTER U WITH ACUTE
    0xdf: u"ss",  # ß LATIN SMALL LETTER SHARP S
    0xe6: u"ae",  # æ LATIN SMALL LETTER AE
    0xf0: u"d",   # ð LATIN SMALL LETTER ETH
    0xf8: u"oe",  # ø LATIN SMALL LETTER O WITH STROKE
    0xfe: u"th",  # þ LATIN SMALL LETTER THORN,
    0xe4: u'ae',  # ä LATIN SMALL LETTER A WITH DIAERESIS
    0xf6: u'oe',  # ö LATIN SMALL LETTER O WITH DIAERESIS
    0xfc: u'ue',  # ü LATIN SMALL LETTER U WITH DIAERESIS
    0xe0: u"a",   # à LATIN SMALL LETTER A WITH GRAVE
    0xe1: u"a",   # á LATIN SMALL LETTER A WITH ACUTE
    0xe3: u"a",   # ã LATIN SMALL LETTER A WITH TILDE
    0xe7: u"c",   # ç LATIN SMALL LETTER C WITH CEDILLA
    0xe8: u"e",   # è LATIN SMALL LETTER E WITH GRAVE
    0xe9: u"e",   # é LATIN SMALL LETTER E WITH ACUTE
    0xea: u"e",   # ê LATIN SMALL LETTER E WITH CIRCUMFLEX
    0xec: u"i",   # ì LATIN SMALL LETTER I WITH GRAVE
    0xed: u"i",   # í LATIN SMALL LETTER I WITH ACUTE
    0xf2: u"o",   # ò LATIN SMALL LETTER O WITH GRAVE
    0xf3: u"o",   # ó LATIN SMALL LETTER O WITH ACUTE
    0xf5: u"o",   # õ LATIN SMALL LETTER O WITH TILDE
    0xf9: u"u",   # ù LATIN SMALL LETTER U WITH GRAVE
    0xfa: u"u",   # ú LATIN SMALL LETTER U WITH ACUTE
    0x2018: u"'",  # ‘ LEFT SINGLE QUOTATION MARK
    0x2019: u"'",  # ’ RIGHT SINGLE QUOTATION MARK
    0x201c: u'"',  # “ LEFT DOUBLE QUOTATION MARK
    0x201d: u'"',  # ” RIGHT DOUBLE QUOTATION MARK
    }


# Caching translate map seeded with the explicit entries above; other
# codepoints are resolved lazily by DeaccenterDict.__missing__.
DEACCENT_MAP = DeaccenterDict(_BASE_DEACCENT_MAP)
547
548
# Unit symbols for bytes2human(): bytes, then 1024-based K/M/G/... up to Y.
_SIZE_SYMBOLS = ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
# (boundary, symbol) pairs: (1, 'B'), (1024, 'K'), (1024**2, 'M'), ...
_SIZE_BOUNDS = [(1024 ** i, sym) for i, sym in enumerate(_SIZE_SYMBOLS)]
# Adjacent boundary pairs, e.g. ((1, 'B'), (1024, 'K')), used by
# bytes2human() to pick the unit whose upper bound contains the value.
_SIZE_RANGES = list(zip(_SIZE_BOUNDS, _SIZE_BOUNDS[1:]))
552
553
def bytes2human(nbytes, ndigits=0):
    """Turn an integer value of *nbytes* into a human readable format.
    Set *ndigits* to control how many digits after the decimal point
    should be shown (default ``0``).

    >>> bytes2human(128991)
    '126K'
    >>> bytes2human(100001221)
    '95M'
    >>> bytes2human(0, 2)
    '0.00B'
    """
    magnitude = abs(nbytes)
    # pick the first unit whose upper boundary contains the value;
    # values beyond the table fall through with the largest unit
    for (size, symbol), (next_size, _next_symbol) in _SIZE_RANGES:
        if magnitude <= next_size:
            break
    scaled = float(nbytes) / size
    return '{0:.{1}f}{2}'.format(scaled, ndigits, symbol)
574
575
class HTMLTextExtractor(HTMLParser):
    """An :class:`HTMLParser` that accumulates only the text content of
    the markup fed to it: character data plus decoded numeric and named
    character references."""

    def __init__(self):
        # Deliberately skips HTMLParser.__init__; reset() performs the
        # needed initialization on both Python 2 and 3.
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.result = []

    def handle_data(self, d):
        # plain character data between tags
        self.result.append(d)

    def handle_charref(self, number):
        # &#NNN; is decimal; &#xNN; / &#XNN; is hexadecimal
        if number[0] in (u'x', u'X'):
            point = int(number[1:], 16)
        else:
            point = int(number, 10)
        self.result.append(unichr(point))

    def handle_entityref(self, name):
        # named references like &amp;; unknown names pass through verbatim
        try:
            point = htmlentitydefs.name2codepoint[name]
        except KeyError:
            self.result.append(u'&' + name + u';')
        else:
            self.result.append(unichr(point))

    def get_text(self):
        return u''.join(self.result)
603
604
def html2text(html):
    """Strip tags from HTML text, returning markup-free text. Also does
    a best effort replacement of entities like "&nbsp;"

    >>> r = html2text(u'<a href="#">Test &amp;<em>(\u0394&#x03b7;&#956;&#x03CE;)</em></a>')
    >>> r == u'Test &(\u0394\u03b7\u03bc\u03ce)'
    True
    """
    # based on answers to http://stackoverflow.com/questions/753052/
    extractor = HTMLTextExtractor()
    extractor.feed(html)
    return extractor.get_text()
617
618
# Canned gzip streams used by the gunzip_bytes() doctests: one with an
# empty payload, one whose payload rstrips to b'bytesahoy!'.
_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x089\xf3\xb9U\x00\x03empty\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00'
_NON_EMPTY_GZIP_BYTES = b'\x1f\x8b\x08\x08\xbc\xf7\xb9U\x00\x03not_empty\x00K\xaa,I-N\xcc\xc8\xafT\xe4\x02\x00\xf3nb\xbf\x0b\x00\x00\x00'
621
622
def gunzip_bytes(bytestring):
    """The :mod:`gzip` module is great if you have a file or file-like
    object, but what if you just have bytes. StringIO is one
    possibility, but it's often faster, easier, and simpler to just
    use this one-liner. Use this tried-and-true utility function to
    decompress gzip from bytes.

    >>> gunzip_bytes(_EMPTY_GZIP_BYTES) == b''
    True
    >>> gunzip_bytes(_NON_EMPTY_GZIP_BYTES).rstrip() == b'bytesahoy!'
    True
    """
    # wbits of 16 + MAX_WBITS tells zlib to expect a gzip header/trailer
    gzip_wbits = 16 + zlib.MAX_WBITS
    return zlib.decompress(bytestring, gzip_wbits)
636
637
def gzip_bytes(bytestring, level=6):
    """Turn some bytes into some compressed bytes.

    >>> len(gzip_bytes(b'a' * 10000))
    46

    Args:
        bytestring (bytes): Bytes to be compressed
        level (int): An integer, 1-9, controlling the
            speed/compression. 1 is fastest, least compressed, 9 is
            slowest, but most compressed.

    Note that all levels of gzip are pretty fast these days, though
    it's not really a competitor in compression, at any level.
    """
    buf = StringIO()
    with GzipFile(fileobj=buf, mode='wb', compresslevel=level) as gz:
        gz.write(bytestring)
    return buf.getvalue()
658
659
660
661 _line_ending_re = re.compile(r'(\r\n|\n|\x0b|\f|\r|\x85|\x2028|\x2029)',
662 re.UNICODE)
663
664
def iter_splitlines(text):
    r"""Like :meth:`str.splitlines`, but returns an iterator of lines
    instead of a list. Also similar to :meth:`file.next`, as that also
    lazily reads and yields lines from a file.

    This function works with a variety of line endings, but as always,
    be careful when mixing line endings within a file.

    >>> list(iter_splitlines('\nhi\nbye\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines('\r\nhi\rbye\r\n'))
    ['', 'hi', 'bye', '']
    >>> list(iter_splitlines(''))
    []
    """
    prev_end = 0
    total_len = len(text)
    for hit in _line_ending_re.finditer(text):
        seg_start, seg_end = hit.start(1), hit.end(1)
        if prev_end <= seg_start:
            yield text[prev_end:seg_start]
        if seg_end == total_len:
            # text ends with a line break -> trailing empty line
            yield ''
        prev_end = seg_end
    remainder = text[prev_end:]
    if remainder:
        yield remainder
695
696
def indent(text, margin, newline='\n', key=bool):
    """The missing counterpart to the built-in :func:`textwrap.dedent`.

    Args:
        text (str): The text to indent.
        margin (str): The string to prepend to each line.
        newline (str): The newline used to rejoin the lines (default: ``\\n``)
        key (callable): Called on each line to determine whether to
            indent it. Default: :class:`bool`, to ensure that empty lines do
            not get whitespace added.
    """
    out = []
    for line in iter_splitlines(text):
        out.append(margin + line if key(line) else line)
    return newline.join(out)
711
712
def is_uuid(obj, version=4):
    """Check the argument is either a valid UUID object or string.

    Args:
        obj (object): The test target. Strings and UUID objects supported.
        version (int): The target UUID version, set to 0 to skip version check.

    >>> is_uuid('e682ccca-5a4c-4ef2-9711-73f9ad1e15ea')
    True
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9')
    False
    >>> is_uuid('0221f0d9-d4b9-11e5-a478-10ddb1c2feb9', version=1)
    True
    """
    if not isinstance(obj, uuid.UUID):
        try:
            obj = uuid.UUID(obj)
        except (TypeError, ValueError, AttributeError):
            return False
    # version of 0 (or any falsy value) skips the version check
    return not version or obj.version == int(version)
735
736
def escape_shell_args(args, sep=' ', style=None):
    """Return an escaped version of each string in *args*, according to
    *style*.

    Args:
        args (list): A list of arguments to escape and join together
        sep (str): The separator used to join the escaped arguments.
        style (str): The style of escaping to use. Can be one of
          ``cmd`` or ``sh``, geared toward Windows and Linux/BSD/etc.,
          respectively. If *style* is ``None``, then it is picked
          according to the system platform.

    See :func:`args2cmd` and :func:`args2sh` for details and example
    output for each style.
    """
    chosen = style or ('cmd' if sys.platform == 'win32' else 'sh')

    if chosen == 'sh':
        return args2sh(args, sep=sep)
    if chosen == 'cmd':
        return args2cmd(args, sep=sep)

    raise ValueError("style expected one of 'cmd' or 'sh', not %r" % chosen)
761
762
# Bound search method: returns a match when the string contains any
# character outside the sh-safe set [a-zA-Z0-9_@%+=:,./-], i.e. when
# the argument needs quoting.
_find_sh_unsafe = re.compile(r'[^a-zA-Z0-9_@%+=:,./-]').search
764
765
def args2sh(args, sep=' '):
    """Return a shell-escaped string version of *args*, separated by
    *sep*, based on the rules of sh, bash, and other shells in the
    Linux/BSD/MacOS ecosystem.

    >>> print(args2sh(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa '[bb]' 'cc'"'"'cc' 'dd"dd'

    As you can see, arguments with no special characters are not
    escaped, arguments with special characters are quoted with single
    quotes, and single quotes themselves are quoted with double
    quotes. Double quotes are handled like any other special
    character.

    Based on code from the :mod:`pipes`/:mod:`shlex` modules. Also
    note that :mod:`shlex` and :mod:`argparse` have functions to split
    and parse strings escaped in this manner.

    Args:
        args (list): strings to escape.
        sep (str): separator placed between the escaped arguments
            (default: a single space).
    """
    ret_list = []

    for arg in args:
        if not arg:
            # an empty argument must still occupy an argv slot
            ret_list.append("''")
            continue
        if _find_sh_unsafe(arg) is None:
            ret_list.append(arg)
            continue
        # use single quotes, and put single quotes into double quotes
        # the string $'b is then quoted as '$'"'"'b'
        ret_list.append("'" + arg.replace("'", "'\"'\"'") + "'")

    # bugfix: previously joined with a hardcoded ' ', silently ignoring
    # the documented *sep* parameter
    return sep.join(ret_list)
798
799
def args2cmd(args, sep=' '):
    r"""Return a shell-escaped string version of *args*, separated by
    *sep*, using the same rules as the Microsoft C runtime.

    >>> print(args2cmd(['aa', '[bb]', "cc'cc", 'dd"dd']))
    aa [bb] cc'cc dd\"dd

    As you can see, escaping is through backslashing and not quoting,
    and double quotes are the only special character. See the comment
    in the code for more details. Based on internal code from the
    :mod:`subprocess` module.

    Args:
        args (list): strings to escape.
        sep (str): separator placed between the escaped arguments
            (default: a single space).
    """
    # technique description from subprocess below
    """
    1) Arguments are delimited by white space, which is either a
       space or a tab.

    2) A string surrounded by double quotation marks is
       interpreted as a single argument, regardless of white space
       contained within.  A quoted string can be embedded in an
       argument.

    3) A double quotation mark preceded by a backslash is
       interpreted as a literal double quotation mark.

    4) Backslashes are interpreted literally, unless they
       immediately precede a double quotation mark.

    5) If backslashes immediately precede a double quotation mark,
       every pair of backslashes is interpreted as a literal
       backslash.  If the number of backslashes is odd, the last
       backslash escapes the next double quotation mark as
       described in rule 3.

    See http://msdn.microsoft.com/en-us/library/17w5ykft.aspx
    or search http://msdn.microsoft.com for
    "Parsing C++ Command-Line Arguments"
    """
    result = []
    needquote = False
    for arg in args:
        # pending backslashes; doubled only if a '"' follows (rule 5)
        bs_buf = []

        # Add the separator between this argument and the previous one.
        # bugfix: previously appended a hardcoded ' ', silently ignoring
        # the documented *sep* parameter.
        if result:
            result.append(sep)

        # quote arguments containing whitespace, and empty arguments
        needquote = (" " in arg) or ("\t" in arg) or not arg
        if needquote:
            result.append('"')

        for c in arg:
            if c == '\\':
                # Don't know if we need to double yet.
                bs_buf.append(c)
            elif c == '"':
                # Double backslashes.
                result.append('\\' * len(bs_buf)*2)
                bs_buf = []
                result.append('\\"')
            else:
                # Normal char
                if bs_buf:
                    result.extend(bs_buf)
                    bs_buf = []
                result.append(c)

        # Add remaining backslashes, if any.
        if bs_buf:
            result.extend(bs_buf)

        if needquote:
            # double trailing backslashes before the closing quote
            result.extend(bs_buf)
            result.append('"')

    return ''.join(result)
877
878
def parse_int_list(range_string, delim=',', range_delim='-'):
    """Return a sorted list of positive integers based on
    *range_string*. Reverse of :func:`format_int_list`.

    Args:
        range_string (str): String of comma separated positive
            integers or ranges (e.g. '1,2,4-6,8'). Typical of a custom
            page range string used in printer dialogs.
        delim (char): Defaults to ','. Separates integers and
            contiguous ranges of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous
            range of integers.

    >>> parse_int_list('1,3,5-8,10-11,15')
    [1, 3, 5, 6, 7, 8, 10, 11, 15]

    """
    values = []

    for token in range_string.strip().split(delim):
        # skip empty tokens from leading/trailing/double delimiters
        if not token:
            continue
        if range_delim in token:
            # a contiguous range; endpoints may appear in either order
            endpoints = [int(part) for part in token.split(range_delim)]
            lo, hi = min(endpoints), max(endpoints)
            values.extend(range(lo, hi + 1))
        else:
            values.append(int(token))

    return sorted(values)
914
915
def format_int_list(int_list, delim=',', range_delim='-', delim_space=False):
    """Return a sorted range string from a list of positive integers
    (*int_list*). Contiguous ranges of integers are collapsed to min
    and max values. Reverse of :func:`parse_int_list`.

    Args:
        int_list (list): List of positive integers to be converted
           into a range string (e.g. [1,2,4,5,6,8]).
        delim (char): Defaults to ','. Separates integers and
           contiguous ranges of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous
           range of integers.
        delim_space (bool): Defaults to ``False``. If ``True``, adds a
           space after all *delim* characters.

    >>> format_int_list([1,3,5,6,7,8,10,11,15])
    '1,3,5-8,10-11,15'

    """
    # Collapse the (deduplicated, sorted) values into inclusive
    # [lo, hi] runs of consecutive integers.
    runs = []
    for val in sorted(set(int_list)):
        if runs and val == runs[-1][1] + 1:
            runs[-1][1] = val
        else:
            runs.append([val, val])

    # Render each run: singletons alone, longer runs as 'lo<delim>hi'.
    chunks = []
    for lo, hi in runs:
        if lo == hi:
            chunks.append('{0:d}'.format(lo))
        else:
            chunks.append('{0:d}{1}{2:d}'.format(lo, range_delim, hi))

    joiner = delim + ' ' if delim_space else delim
    return joiner.join(chunks)
1004
1005
def complement_int_list(
        range_string, range_start=0, range_end=None,
        delim=',', range_delim='-'):
    """ Returns range string that is the complement of the one provided as
    *range_string* parameter.

    These range strings are of the kind produce by :func:`format_int_list`, and
    parseable by :func:`parse_int_list`.

    Args:
        range_string (str): String of comma separated positive integers or
           ranges (e.g. '1,2,4-6,8'). Typical of a custom page range string
           used in printer dialogs.
        range_start (int): A positive integer from which to start the resulting
           range. Value is inclusive. Defaults to ``0``.
        range_end (int): A positive integer from which the produced range is
           stopped. Value is exclusive. Defaults to the maximum value found in
           the provided ``range_string``.
        delim (char): Defaults to ','. Separates integers and contiguous ranges
           of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous range of
           integers.

    >>> complement_int_list('1,3,5-8,10-11,15')
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=0)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=1)
    '2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=2)
    '2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=3)
    '4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=15)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=14)
    '0,2,4,9,12-13'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=13)
    '0,2,4,9,12'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=20)
    '0,2,4,9,12-14,16-19'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=0)
    ''

    >>> complement_int_list('1,3,5-8,10-11,15', range_start=-1)
    '0,2,4,9,12-14'

    >>> complement_int_list('1,3,5-8,10-11,15', range_end=-1)
    ''

    >>> complement_int_list('1,3,5-8', range_start=1, range_end=1)
    ''

    >>> complement_int_list('1,3,5-8', range_start=2, range_end=2)
    ''

    >>> complement_int_list('1,3,5-8', range_start=2, range_end=3)
    '2'

    >>> complement_int_list('1,3,5-8', range_start=-10, range_end=-5)
    ''

    >>> complement_int_list('1,3,5-8', range_start=20, range_end=10)
    ''

    >>> complement_int_list('')
    ''
    """
    excluded = set(parse_int_list(range_string, delim, range_delim))
    if range_end is None:
        # Default stop is one past the largest listed value; an empty
        # input collapses the range to nothing.
        range_end = max(excluded) + 1 if excluded else range_start
    # Keep every integer in [range_start, range_end) not named in the
    # input. A negative range_start behaves like 0, since range() below
    # only ever yields non-negative values.
    complement = [i for i in range(range_end)
                  if i >= range_start and i not in excluded]
    return format_int_list(complement, delim, range_delim)
1092
1093
def int_ranges_from_int_list(range_string, delim=',', range_delim='-'):
    """ Transform a string of ranges (*range_string*) into a tuple of tuples.

    Args:
        range_string (str): String of comma separated positive integers or
           ranges (e.g. '1,2,4-6,8'). Typical of a custom page range string
           used in printer dialogs.
        delim (char): Defaults to ','. Separates integers and contiguous ranges
           of integers.
        range_delim (char): Defaults to '-'. Indicates a contiguous range of
           integers.

    >>> int_ranges_from_int_list('1,3,5-8,10-11,15')
    ((1, 1), (3, 3), (5, 8), (10, 11), (15, 15))

    >>> int_ranges_from_int_list('1')
    ((1, 1),)

    >>> int_ranges_from_int_list('')
    ()
    """
    # Round-trip through parse/format to canonicalize: afterwards every
    # comma-separated chunk is either 'N' or 'N-M'.
    normalized = format_int_list(
        parse_int_list(range_string, delim, range_delim))
    pairs = []
    if normalized:
        for chunk in normalized.split(','):
            lo, sep, hi = chunk.partition('-')
            if not sep:
                # A lone integer is a degenerate (N, N) range.
                hi = lo
            pairs.append((int(lo), int(hi)))
    return tuple(pairs)
1127
1128
class MultiReplace(object):
    """Perform many find/replace substitutions over a string in one pass.

    *sub_map* maps search patterns to replacement text. It may be a
    mapping or an iterable of ``(pattern, replacement)`` pairs. Pairs
    from an iterable (or an ``OrderedDict``) are combined in the order
    given; a plain dict's order is non-deterministic. Each pattern may
    be a plain string or a precompiled regex object.

    Doing all replacements in a single scan can save a lot of time on
    very large strings compared to chained :meth:`str.replace` calls.

    Keyword Arguments:

    :type regex: bool
    :param regex: Treat string search keys as regular expressions
        [Default: False]
    :type flags: int
    :param flags: flags to pass to the regex engine during compile

    Order matters when one replacement's output could itself be matched
    by a later pattern::

        >>> 'foo bar baz'.replace('foo', 'baz').replace('baz', 'bar')
        'bar bar bar'
        >>> m = MultiReplace({'foo': 'baz', 'baz': 'bar'})
        >>> m.sub('foo bar baz')
        'baz bar bar'

    If you need to rely on replacement order, pass a list of tuples
    (or an ``OrderedDict``) rather than a plain dictionary.
    """

    def __init__(self, sub_map, **kwargs):
        """Build one combined regex from *sub_map*, compiling as needed."""
        opts = {'regex': False, 'flags': 0}
        opts.update(kwargs)
        self.group_map = {}
        clauses = []

        pairs = sub_map.items() if isinstance(sub_map, Mapping) else sub_map

        for i, pair in enumerate(pairs):
            name = 'group{0}'.format(i)
            target = pair[0]
            if isinstance(target, basestring):
                # Literal search strings are escaped unless the caller
                # asked for regex semantics.
                exp = target if opts['regex'] else re.escape(target)
            else:
                # Precompiled pattern objects contribute their source.
                exp = target.pattern

            # Each search key becomes its own named alternative so the
            # match can be traced back to its replacement.
            clauses.append('(?P<{0}>{1})'.format(name, exp))
            self.group_map[name] = pair[1]

        self.combined_pattern = re.compile('|'.join(clauses),
                                           flags=opts['flags'])

    def _get_value(self, match):
        """Return the replacement for whichever named group matched."""
        group_dict = match.groupdict()
        matched = [name for name in group_dict if group_dict[name]]
        return self.group_map[matched[0]]

    def sub(self, text):
        """Apply every substitution given in the constructor to *text*,
        returning the resulting string."""
        return self.combined_pattern.sub(self._get_value, text)
1235
1236
def multi_replace(text, sub_map, **kwargs):
    """One-shot helper: build a :class:`MultiReplace` from *sub_map*
    (and any keyword options) and immediately apply it to *text*."""
    return MultiReplace(sub_map, **kwargs).sub(text)
1241
1242
def unwrap_text(text, ending='\n\n'):
    r"""
    Undo hard line-wrapping, the natural complement to :func:`textwrap.wrap`.

    >>> text = "Short \n lines \nwrapped\nsmall.\n\nAnother\nparagraph."
    >>> unwrap_text(text)
    'Short lines wrapped small.\n\nAnother paragraph.'

    Args:
        text: A string to unwrap.
        ending (str): The string to join all unwrapped paragraphs
           by. Pass ``None`` to get the list. Defaults to '\n\n' for
           compatibility with Markdown and RST.

    """
    paragraphs = []
    current = []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if stripped:
            current.append(stripped)
        else:
            # A blank line closes the paragraph in progress; runs of
            # blank lines yield empty paragraphs, mirroring the input.
            paragraphs.append(' '.join(current))
            current = []
    if current:
        paragraphs.append(' '.join(current))
    return paragraphs if ending is None else ending.join(paragraphs)