Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/humanfriendly/text.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
| author | shellac |
|---|---|
| date | Mon, 22 Mar 2021 18:12:50 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4f3585e2f14b |
|---|---|
| 1 # Human friendly input/output in Python. | |
| 2 # | |
| 3 # Author: Peter Odding <peter@peterodding.com> | |
| 4 # Last Change: December 1, 2020 | |
| 5 # URL: https://humanfriendly.readthedocs.io | |
| 6 | |
| 7 """ | |
| 8 Simple text manipulation functions. | |
| 9 | |
| 10 The :mod:`~humanfriendly.text` module contains simple functions to manipulate text: | |
| 11 | |
| 12 - The :func:`concatenate()` and :func:`pluralize()` functions make it easy to | |
| 13 generate human friendly output. | |
| 14 | |
| 15 - The :func:`format()`, :func:`compact()` and :func:`dedent()` functions | |
| 16 provide a clean and simple to use syntax for composing large text fragments | |
| 17 with interpolated variables. | |
| 18 | |
| 19 - The :func:`tokenize()` function parses simple user input. | |
| 20 """ | |
| 21 | |
| 22 # Standard library modules. | |
| 23 import numbers | |
| 24 import random | |
| 25 import re | |
| 26 import string | |
| 27 import textwrap | |
| 28 | |
# Public identifiers that require documentation.
# This tuple is also what ``from humanfriendly.text import *`` exports; every
# name listed here is a function defined further down in this module.
__all__ = (
    'compact',
    'compact_empty_lines',
    'concatenate',
    'dedent',
    'format',
    'generate_slug',
    'is_empty_line',
    'join_lines',
    'pluralize',
    'pluralize_raw',
    'random_string',
    'split',
    'split_paragraphs',
    'tokenize',
    'trim_empty_lines',
)
| 47 | |
| 48 | |
def compact(text, *args, **kw):
    '''
    Collapse the whitespace in a string and interpolate any arguments.

    Leading and trailing whitespace is dropped, every run of whitespace
    characters (including line breaks) becomes a single space and the result
    is passed through :func:`format()` together with the given arguments.

    :param text: The text to compact (a string).
    :param args: Positional arguments, interpolated by :func:`format()`.
    :param kw: Keyword arguments, interpolated by :func:`format()`.
    :returns: The compacted text (a string).

    This combines nicely with multi line string literals, because the
    message can be wrapped and indented freely in the source code::

      raise PortDiscoveryError(compact("""
          Failed to discover port(s) that Apache is listening on!
          Maybe I'm parsing the wrong configuration file? ({filename})
      """, filename=self.ports_config))
    '''
    # str.split() without arguments splits on any whitespace run and never
    # yields empty tokens, so joining on a single space does all the work.
    return format(' '.join(text.split()), *args, **kw)
| 78 | |
| 79 | |
def compact_empty_lines(text):
    """
    Replace repeating empty lines with a single empty line (similar to ``cat -s``).

    :param text: The text in which to compact empty lines (a string).
    :returns: The text with empty lines compacted (a string).

    Within each run of consecutive empty lines (as judged by
    :func:`is_empty_line()`) only the first line is kept. Line endings of the
    lines that are kept are preserved.
    """
    compacted = []
    previous_was_empty = False
    # Build the result in a single pass instead of popping from the middle of
    # the list, which is quadratic when the text has long runs of empty lines.
    for line in text.splitlines(True):
        current_is_empty = is_empty_line(line)
        if not (previous_was_empty and current_is_empty):
            compacted.append(line)
        previous_was_empty = current_is_empty
    return ''.join(compacted)
| 95 | |
| 96 | |
def concatenate(items, conjunction='and', serial_comma=False):
    """
    Concatenate a list of items in a human friendly way.

    :param items:

        A sequence of strings.

    :param conjunction:

        The word to use before the last item (a string, defaults to "and").

    :param serial_comma:

        :data:`True` to use a `serial comma`_, :data:`False` otherwise
        (defaults to :data:`False`). The serial comma is only emitted when
        there are three or more items.

    :returns:

        A single string (empty when no items are given).

    >>> from humanfriendly.text import concatenate
    >>> concatenate(["eggs", "milk", "bread"])
    'eggs, milk and bread'
    >>> concatenate(["eggs", "milk", "bread"], serial_comma=True)
    'eggs, milk, and bread'
    >>> concatenate(["eggs", "milk"], serial_comma=True)
    'eggs and milk'

    .. _serial comma: https://en.wikipedia.org/wiki/Serial_comma
    """
    items = list(items)
    if not items:
        return ''
    if len(items) == 1:
        return items[0]
    leading_items, final_item = items[:-1], items[-1]
    formatted = ', '.join(leading_items)
    # A serial comma separates the last two items of a list of three or more;
    # with only two items it would produce the ungrammatical "a, and b".
    if serial_comma and len(leading_items) > 1:
        formatted += ','
    return ' '.join([formatted, conjunction, final_item])
| 135 | |
| 136 | |
def dedent(text, *args, **kw):
    """
    Strip common indentation and surrounding empty lines from a string.

    The text is processed in three steps: :func:`textwrap.dedent()` removes
    the leading whitespace shared by all lines, :func:`trim_empty_lines()`
    removes leading and trailing empty lines and :func:`format()`
    interpolates any arguments.

    :param text: The text to dedent (a string).
    :param args: Positional arguments, interpolated by :func:`format()`.
    :param kw: Keyword arguments, interpolated by :func:`format()`.
    :returns: The dedented text (a string).

    Where :func:`compact()` is meant for text presented to the user (in which
    whitespace is not so significant), :func:`dedent()` is meant for data
    file and code generation tasks (in which newlines and indentation are
    very significant).
    """
    return format(trim_empty_lines(textwrap.dedent(text)), *args, **kw)
| 161 | |
| 162 | |
def format(text, *args, **kw):
    """
    Interpolate values into a string using ``%`` and/or :meth:`str.format()`.

    :param text: The text to format (a string).
    :param args: Positional arguments are interpolated using the string
                 formatting operator (``%``). Without positional arguments
                 this step is skipped entirely.
    :param kw: Keyword arguments are interpolated using :meth:`str.format()`.
               Without keyword arguments this step is skipped entirely.
    :returns: The text with any positional and/or keyword arguments
              interpolated (a string).

    When both kinds of arguments are given the ``%`` interpolation is applied
    first and :meth:`str.format()` is applied to its result.

    **Why format() instead of the string formatting operator?**

    The string formatting operator is great for simple cases but it has two
    quirks that this function avoids:

    - Going from one interpolated value to several requires wrapping the
      values in a tuple. Because :func:`format()` takes a `variable number of
      arguments`_ it always receives a tuple:

      >>> from humanfriendly.text import format
      >>> # The string formatting operator.
      >>> print('the magic number is %s' % 42)
      the magic number is 42
      >>> print('the magic numbers are %s and %s' % (12, 42))
      the magic numbers are 12 and 42
      >>> # The format() function.
      >>> print(format('the magic number is %s', 42))
      the magic number is 42
      >>> print(format('the magic numbers are %s and %s', 12, 42))
      the magic numbers are 12 and 42

    - Interpolating a single value that happens to be a tuple raises a
      :exc:`TypeError`. With :func:`format()` the value is always wrapped in
      the arguments tuple, so this cannot happen:

      >>> # How expecting to interpolate a single value can fail.
      >>> value = (12, 42)
      >>> print('the magic value is %s' % value)
      Traceback (most recent call last):
        File "<stdin>", line 1, in <module>
      TypeError: not all arguments converted during string formatting
      >>> # The following line works as intended, no surprises here!
      >>> print(format('the magic value is %s', value))
      the magic value is (12, 42)

    **Why format() instead of the str.format() method?**

    The :meth:`str.format()` method binds more tightly than ``+``, so when a
    template is built from several pieces it needs extra parentheses. A
    function call doesn't have that problem:

    >>> "{adjective} example" + " " + "(can't think of anything less {adjective})".format(adjective='silly')
    "{adjective} example (can't think of anything less silly)"
    >>> ("{adjective} example" + " " + "(can't think of anything less {adjective})").format(adjective='silly')
    "silly example (can't think of anything less silly)"
    >>> format("{adjective} example" + " " + "(can't think of anything less {adjective})", adjective='silly')
    "silly example (can't think of anything less silly)"

    The :func:`compact()` and :func:`dedent()` functions combine
    :func:`format()` with whitespace manipulation.

    .. _variable number of arguments: https://docs.python.org/3/tutorial/controlflow.html#arbitrary-argument-lists
    """
    # Each step is skipped when it has nothing to interpolate, so literal
    # percent signs and braces survive when no arguments of that kind are given.
    interpolated = text % args if args else text
    return interpolated.format(**kw) if kw else interpolated
| 245 | |
| 246 | |
def generate_slug(text, delimiter="-"):
    """
    Convert text to a normalized "slug" without whitespace.

    :param text: The original text, for example ``Some Random Text!``.
    :param delimiter: The delimiter used to separate words
                      (defaults to the ``-`` character).
    :returns: The slug text, for example ``some-random-text``.
    :raises: :exc:`ValueError` when the provided
             text is nonempty but results in an empty slug.

    The text is lowercased and split into words on every run of characters
    other than ``a-z`` and ``0-9``, after which the words are joined with the
    delimiter. The delimiter is therefore only ever placed *between* words
    and is always used literally.
    """
    # Splitting and joining (instead of substituting and then stripping the
    # delimiter) avoids str.strip() treating the delimiter as a character
    # set, which would eat slug characters for delimiters such as "ab".
    words = [word for word in re.split("[^a-z0-9]+", text.lower()) if word]
    slug = delimiter.join(words)
    if text and not slug:
        raise ValueError("The provided text %r results in an empty slug!" % (text,))
    return slug
| 266 | |
| 267 | |
def is_empty_line(text):
    """
    Tell whether a text has no visible content.

    :param text: The text to check for "emptiness" (a string).
    :returns: :data:`True` if the text is empty or contains only whitespace,
              :data:`False` otherwise.
    """
    # str.isspace() is False for the empty string, so that case is handled
    # separately.
    if len(text) == 0:
        return True
    return text.isspace()
| 277 | |
| 278 | |
def join_lines(text):
    """
    Remove "hard wrapping" from the paragraphs in a string.

    :param text: The text to reformat (a string).
    :returns: The text without hard wrapping (a string).

    This function works by replacing a line break with a space when the last
    character before the line break and the first character after the line
    break are both non-whitespace characters. This means that common leading
    indentation will break :func:`join_lines()` (in that case you can use
    :func:`dedent()` before calling :func:`join_lines()`).
    """
    # Lookbehind/lookahead assertions don't consume the neighbouring
    # characters, so a one-character line can be joined on both sides
    # ("a\nb\nc" -> "a b c"). A pattern that captures both neighbours would
    # consume "b" in the first match and miss the second line break.
    return re.sub(r'(?<=\S)\n(?=\S)', ' ', text)
| 293 | |
| 294 | |
def pluralize(count, singular, plural=None):
    """
    Render a count followed by the matching form of a word.

    :param count: The count (a number).
    :param singular: The singular form of the word (a string).
    :param plural: The plural form of the word (a string or :data:`None`).
    :returns: The count and singular or plural word concatenated (a string).

    The choice between the singular and plural form is delegated to
    :func:`pluralize_raw()`.
    """
    word = pluralize_raw(count, singular, plural)
    return '%s %s' % (count, word)
| 307 | |
| 308 | |
def pluralize_raw(count, singular, plural=None):
    """
    Pick the singular or plural form of a word to go with a count.

    :param count: The count (a number).
    :param singular: The singular form of the word (a string).
    :param plural: The plural form of the word (a string or :data:`None`).
    :returns: The singular or plural form of the word (a string).

    The singular form is selected only when the count, converted with
    :func:`float()`, equals 1.0; every other count selects the plural form.

    When no plural form is given (or it is empty) the plural is formed by
    appending the letter "s" to the singular form. That is obviously not
    correct for every word, which is why both forms can be specified.
    """
    plural_form = plural if plural else singular + 's'
    is_exactly_one = float(count) == 1.0
    if is_exactly_one:
        return singular
    return plural_form
| 329 | |
| 330 | |
def random_string(length=(25, 100), characters=string.ascii_letters):
    """random_string(length=(25, 100), characters=string.ascii_letters)
    Build a string of randomly chosen characters.

    :param length: The length of the string to be generated (a number or a
                   tuple with two numbers). Given a tuple, the length is
                   picked with :func:`random.randint()`, so both numbers are
                   inclusive bounds.
    :param characters: The characters to pick from (a string, defaults
                       to :data:`string.ascii_letters`).
    :returns: A random string.

    This is mostly useful in test suites. The :mod:`random` module is used,
    so the result is not suitable for security sensitive purposes.
    """
    if isinstance(length, numbers.Number):
        count = length
    else:
        # Anything that isn't a number is treated as a (minimum, maximum) pair.
        count = random.randint(length[0], length[1])
    picked = [random.choice(characters) for _ in range(count)]
    return ''.join(picked)
| 349 | |
| 350 | |
def split(text, delimiter=','):
    """
    Split a delimited string into a list of stripped, nonempty tokens.

    :param text: The text to split (a string).
    :param delimiter: The delimiter to split on (a string, defaults to a comma).
    :returns: A list of zero or more nonempty strings.

    Compare the behavior of Python's built in :meth:`str.split()` method:

    >>> 'foo,bar, baz,'.split(',')
    ['foo', 'bar', ' baz', '']

    With the behavior of the :func:`split()` function, which strips
    surrounding whitespace from each token and drops empty tokens:

    >>> from humanfriendly.text import split
    >>> split('foo,bar, baz,')
    ['foo', 'bar', 'baz']

    Calls can be nested to parse a simple data structure from a string, for
    example a mapping of logging level names to one or more styles per level
    that is passed in an environment variable:

    >>> from pprint import pprint
    >>> encoded_data = 'debug=green;warning=yellow;error=red;critical=red,bold'
    >>> parsed_data = dict((k, split(v, ',')) for k, v in (split(kv, '=') for kv in split(encoded_data, ';')))
    >>> pprint(parsed_data)
    {'critical': ['red', 'bold'],
     'debug': ['green'],
     'error': ['red'],
     'warning': ['yellow']}
    """
    # A token is empty or whitespace-only exactly when stripping it leaves
    # nothing, so strip first and filter on the stripped result.
    stripped_tokens = (token.strip() for token in text.split(delimiter))
    return [token for token in stripped_tokens if token]
| 385 | |
| 386 | |
def split_paragraphs(text):
    """
    Break a string up into paragraphs.

    :param text: The text to split into paragraphs (a string).
    :returns: A list of strings.

    The text is split on two consecutive newline characters, each resulting
    chunk is passed through :func:`trim_empty_lines()` and chunks that end
    up empty or that contain only whitespace are discarded.
    """
    trimmed_chunks = (trim_empty_lines(chunk) for chunk in text.split('\n\n'))
    return [chunk for chunk in trimmed_chunks if chunk and not chunk.isspace()]
| 400 | |
| 401 | |
def tokenize(text):
    """
    Tokenize a text into numbers and strings.

    :param text: The text to tokenize (a string).
    :returns: A list of strings and/or numbers.

    This function is used to implement robust tokenization of user input in
    functions like :func:`.parse_size()` and :func:`.parse_timespan()`. It
    automatically coerces integer and floating point numbers, ignores
    whitespace and knows how to separate numbers from strings even without
    whitespace. Some examples to make this more concrete:

    >>> from humanfriendly.text import tokenize
    >>> tokenize('42')
    [42]
    >>> tokenize('42MB')
    [42, 'MB']
    >>> tokenize('42.5MB')
    [42.5, 'MB']
    >>> tokenize('42.5 MB')
    [42.5, 'MB']

    Characters that look like digits but are not decimal digits (for example
    the superscript in ``m²``) are left alone as part of a string token.
    """
    tokenized_input = []
    # Because the pattern contains a single capturing group, re.split()
    # returns alternating elements: text at even indexes and the captured
    # numbers at odd indexes. Relying on the position (instead of testing
    # str.isdigit() on every token) avoids calling int() on characters such
    # as superscript digits, which str.isdigit() accepts but int() rejects.
    for index, token in enumerate(re.split(r'(\d+(?:\.\d+)?)', text)):
        if index % 2:
            tokenized_input.append(float(token) if '.' in token else int(token))
        else:
            token = token.strip()
            if token:
                tokenized_input.append(token)
    return tokenized_input
| 435 | |
| 436 | |
def trim_empty_lines(text):
    """
    Trim leading and trailing empty lines from the given text.

    :param text: The text to trim (a string).
    :returns: The trimmed text (a string).

    Lines are judged by :func:`is_empty_line()`, so lines containing only
    whitespace count as empty. Empty lines between non-empty lines are kept,
    as are the line endings of all remaining lines.
    """
    lines = text.splitlines(True)
    start, end = 0, len(lines)
    # Locate the first and last non-empty line and slice once, instead of
    # repeatedly popping from the front of the list (which shifts all of the
    # remaining elements on every call).
    while start < end and is_empty_line(lines[start]):
        start += 1
    while end > start and is_empty_line(lines[end - 1]):
        end -= 1
    return ''.join(lines[start:end])
