Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/humanfriendly/text.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
| author | shellac |
|---|---|
| date | Sat, 02 May 2020 07:14:21 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:26e78fe6e8c4 |
|---|---|
| 1 # Human friendly input/output in Python. | |
| 2 # | |
| 3 # Author: Peter Odding <peter@peterodding.com> | |
| 4 # Last Change: March 1, 2020 | |
| 5 # URL: https://humanfriendly.readthedocs.io | |
| 6 | |
| 7 """ | |
| 8 Simple text manipulation functions. | |
| 9 | |
| 10 The :mod:`~humanfriendly.text` module contains simple functions to manipulate text: | |
| 11 | |
| 12 - The :func:`concatenate()` and :func:`pluralize()` functions make it easy to | |
| 13 generate human friendly output. | |
| 14 | |
| 15 - The :func:`format()`, :func:`compact()` and :func:`dedent()` functions | |
| 16 provide a clean and simple to use syntax for composing large text fragments | |
| 17 with interpolated variables. | |
| 18 | |
| 19 - The :func:`tokenize()` function parses simple user input. | |
| 20 """ | |
| 21 | |
| 22 # Standard library modules. | |
| 23 import math | |
| 24 import numbers | |
| 25 import random | |
| 26 import re | |
| 27 import string | |
| 28 import textwrap | |
| 29 | |
# Public identifiers that require documentation.
# Each name below is a function defined in this module; the names are kept in
# alphabetical order. As with any ``__all__``, this tuple is also what
# ``from humanfriendly.text import *`` exports.
__all__ = (
    'compact',
    'compact_empty_lines',
    'concatenate',
    'dedent',
    'format',
    'generate_slug',
    'is_empty_line',
    'join_lines',
    'pluralize',
    'random_string',
    'split',
    'split_paragraphs',
    'tokenize',
    'trim_empty_lines',
)
| 47 | |
| 48 | |
def compact(text, *args, **kw):
    '''
    Collapse the whitespace in a string and interpolate arguments into it.

    Leading and trailing whitespace is dropped, every run of whitespace
    characters (including newlines) becomes a single space and the result is
    passed through :func:`format()` together with any remaining arguments.

    :param text: The text to compact (a string).
    :param args: Positional arguments to interpolate using :func:`format()`.
    :param kw: Keyword arguments to interpolate using :func:`format()`.
    :returns: The compacted text (a string).

    This combines nicely with multi line string literals: the message can be
    laid out over several indented source lines without the indentation or
    line breaks ending up in the result::

      raise PortDiscoveryError(compact("""
          Failed to discover port(s) that Apache is listening on!
          Maybe I'm parsing the wrong configuration file? ({filename})
      """, filename=self.ports_config))
    '''
    # str.split() without arguments ignores leading/trailing whitespace and
    # treats each run of whitespace as one separator.
    single_spaced = ' '.join(text.split())
    return format(single_spaced, *args, **kw)
| 78 | |
| 79 | |
def compact_empty_lines(text):
    """
    Replace repeating empty lines with a single empty line (similar to ``cat -s``).

    A line is considered empty when :func:`is_empty_line()` says so. Of every
    run of consecutive empty lines only the first one is kept, unmodified.

    :param text: The text in which to compact empty lines (a string).
    :returns: The text with empty lines compacted (a string).
    """
    kept_lines = []
    previous_was_empty = False
    # Line endings are preserved so that joining reproduces the kept lines
    # exactly as they appeared in the input.
    for line in text.splitlines(True):
        current_is_empty = is_empty_line(line)
        # Collect the surviving lines in a new list (one linear pass) instead
        # of popping from the middle of the list being scanned, which shifts
        # the tail on every removal.
        if not (current_is_empty and previous_was_empty):
            kept_lines.append(line)
        previous_was_empty = current_is_empty
    return ''.join(kept_lines)
| 95 | |
| 96 | |
def concatenate(items):
    """
    Join a sequence of strings into a human friendly enumeration.

    All items except the last are separated by commas; the last item is
    attached with the word "and". A single item is returned as is and an
    empty sequence results in an empty string.

    :param items: A sequence of strings.
    :returns: A single string.

    >>> from humanfriendly.text import concatenate
    >>> concatenate(["eggs", "milk", "bread"])
    'eggs, milk and bread'
    """
    # Materialize the input so that generators and other iterables work too.
    items = list(items)
    if not items:
        return ''
    if len(items) == 1:
        return items[0]
    leading, final = items[:-1], items[-1]
    return ', '.join(leading) + ' and ' + final
| 115 | |
| 116 | |
def dedent(text, *args, **kw):
    """
    Strip common indentation and surrounding empty lines from a string.

    The text is processed in three steps:

    1. :func:`textwrap.dedent()` removes the leading whitespace that all
       lines have in common.
    2. :func:`trim_empty_lines()` removes leading and trailing empty lines.
    3. :func:`format()` interpolates any remaining arguments.

    :param text: The text to dedent (a string).
    :param args: Positional arguments to interpolate using :func:`format()`.
    :param kw: Keyword arguments to interpolate using :func:`format()`.
    :returns: The dedented text (a string).

    Where :func:`compact()` is meant for text that will be presented to the
    user (whitespace is not so significant there), :func:`dedent()` is meant
    for data file and code generation tasks in which newlines and indentation
    are very significant.
    """
    without_outer_blank_lines = trim_empty_lines(textwrap.dedent(text))
    return format(without_outer_blank_lines, *args, **kw)
| 141 | |
| 142 | |
def format(text, *args, **kw):
    """
    Interpolate values into a string using ``%`` and/or :meth:`str.format()`.

    :param text: The text to format (a string).
    :param args: Positional arguments are interpolated using the string
                 formatting operator (``%``). Without positional arguments
                 this step is skipped entirely.
    :param kw: Keyword arguments are interpolated using :meth:`str.format()`.
               Without keyword arguments this step is skipped entirely.
    :returns: The text with any positional and/or keyword arguments
              interpolated (a string).

    The implementation is trivial, so here is why the function exists at all.

    **Compared to the string formatting operator**

    - The operator needs tuple syntax as soon as more than one value is
      interpolated. Because :func:`format()` takes a
      `variable number of arguments`_ the call looks the same either way:

      >>> from humanfriendly.text import format
      >>> print('the magic number is %s' % 42)
      the magic number is 42
      >>> print('the magic numbers are %s and %s' % (12, 42))
      the magic numbers are 12 and 42
      >>> print(format('the magic number is %s', 42))
      the magic number is 42
      >>> print(format('the magic numbers are %s and %s', 12, 42))
      the magic numbers are 12 and 42

    - The operator raises :exc:`~exceptions.TypeError` when a single value
      that is expected to be interpolated happens to be a tuple. The
      positional arguments of :func:`format()` always arrive wrapped in a
      tuple of their own, so this cannot happen:

      >>> value = (12, 42)
      >>> print('the magic value is %s' % value)
      Traceback (most recent call last):
        File "<stdin>", line 1, in <module>
      TypeError: not all arguments converted during string formatting
      >>> print(format('the magic value is %s', value))
      the magic value is (12, 42)

    **Compared to the str.format() method**

    A method call binds more tightly than string concatenation, so an
    expression built with ``+`` needs extra parentheses before
    :meth:`str.format()` applies to all of it. Passing the expression as an
    argument to :func:`format()` avoids that:

    >>> "{adjective} example" + " " + "(can't think of anything less {adjective})".format(adjective='silly')
    "{adjective} example (can't think of anything less silly)"
    >>> ("{adjective} example" + " " + "(can't think of anything less {adjective})").format(adjective='silly')
    "silly example (can't think of anything less silly)"
    >>> format("{adjective} example" + " " + "(can't think of anything less {adjective})", adjective='silly')
    "silly example (can't think of anything less silly)"

    The :func:`compact()` and :func:`dedent()` functions combine
    :func:`format()` with whitespace manipulation.

    .. _variable number of arguments: https://docs.python.org/3/tutorial/controlflow.html#arbitrary-argument-lists
    """
    result = text
    # Each step only runs when it has something to interpolate, so text
    # containing literal "%" or "{}" passes through untouched otherwise.
    if args:
        result = result % args
    if kw:
        result = result.format(**kw)
    return result
| 225 | |
| 226 | |
def generate_slug(text, delimiter="-"):
    """
    Turn text into a normalized "slug" that contains no whitespace.

    The text is lowercased, every run of characters other than ``a-z`` and
    ``0-9`` is replaced by the delimiter and delimiter characters are
    stripped from both ends of the result.

    :param text: The original text, for example ``Some Random Text!``.
    :param delimiter: The delimiter used to separate words
                      (defaults to the ``-`` character).
    :returns: The slug text, for example ``some-random-text``.
    :raises: :exc:`~exceptions.ValueError` when the provided
             text is nonempty but results in an empty slug.
    """
    lowered = text.lower()
    # A callable replacement is inserted verbatim, so backslashes in the
    # delimiter need no escaping to keep them from being read as escapes.
    joined = re.sub("[^a-z0-9]+", lambda match: delimiter, lowered)
    slug = joined.strip(delimiter)
    if text and not slug:
        raise ValueError("The provided text %r results in an empty slug!" % (text,))
    return slug
| 246 | |
| 247 | |
def is_empty_line(text):
    """
    Tell whether a text has no content other than whitespace.

    :param text: The text to check for "emptiness" (a string).
    :returns: :data:`True` if the text is empty or contains only whitespace,
              :data:`False` otherwise.
    """
    # str.isspace() is False for the empty string, so that case is handled
    # separately.
    if len(text) == 0:
        return True
    return text.isspace()
| 257 | |
| 258 | |
def join_lines(text):
    """
    Remove "hard wrapping" from the paragraphs in a string.

    :param text: The text to reformat (a string).
    :returns: The text without hard wrapping (a string).

    This function works by replacing a line break with a space when the last
    character before the line break and the first character after the line
    break are both non-whitespace characters. This means that common leading
    indentation will break :func:`join_lines()` (in that case you can use
    :func:`dedent()` before calling :func:`join_lines()`).

    >>> from humanfriendly.text import join_lines
    >>> join_lines('a\\nb\\nc')
    'a b c'
    """
    # Lookbehind/lookahead inspect the neighbouring characters without
    # consuming them. A pattern that consumes them cannot match two line
    # breaks that share a neighbour (a line of a single character), which
    # would leave the second break in place.
    return re.sub(r'(?<=\S)\n(?=\S)', ' ', text)
| 273 | |
| 274 | |
def pluralize(count, singular, plural=None):
    """
    Combine a count with the singular or plural form of a word.

    If the plural form of the word is not provided it is obtained by
    concatenating the singular form of the word with the letter "s". Of course
    this will not always be correct, which is why you have the option to
    specify both forms.

    The singular form is used only when the count is exactly one; every other
    count (zero, fractions such as 1.5, negative numbers) gets the plural form.

    :param count: The count (a number).
    :param singular: The singular form of the word (a string).
    :param plural: The plural form of the word (a string or :data:`None`).
    :returns: The count and singular/plural word concatenated (a string).

    >>> from humanfriendly.text import pluralize
    >>> pluralize(1, 'word')
    '1 word'
    >>> pluralize(1.5, 'word')
    '1.5 words'
    >>> pluralize(2, 'box', 'boxes')
    '2 boxes'
    """
    if not plural:
        plural = singular + 's'
    # Compare the value itself with one: rounding it down first would select
    # the singular form for fractional counts like 1.5.
    return '%s %s' % (count, singular if float(count) == 1 else plural)
| 292 | |
| 293 | |
def random_string(length=(25, 100), characters=string.ascii_letters):
    """random_string(length=(25, 100), characters=string.ascii_letters)
    Build a string of randomly chosen characters.

    :param length: The length of the string to be generated (a number or a
                   tuple with two numbers). When a tuple is given the length
                   is picked at random between the two numbers (both
                   included).
    :param characters: The characters to choose from (a string, defaults
                       to :data:`string.ascii_letters`).
    :returns: A random string.

    This function is mostly useful in test suites. It is based on the
    :mod:`random` module, so the result is not suitable for security
    sensitive purposes.
    """
    if isinstance(length, numbers.Number):
        size = length
    else:
        # Anything that isn't a number is treated as a (minimum, maximum) pair.
        lower, upper = length[0], length[1]
        size = random.randint(lower, upper)
    return ''.join(random.choice(characters) for _ in range(size))
| 312 | |
| 313 | |
def split(text, delimiter=','):
    """
    Split a delimited list of strings, discarding whitespace and empty items.

    :param text: The text to split (a string).
    :param delimiter: The delimiter to split on (a string, defaults to a comma).
    :returns: A list of zero or more nonempty strings.

    Python's built in :meth:`str.split()` keeps surrounding whitespace and
    empty items:

    >>> 'foo,bar, baz,'.split(',')
    ['foo', 'bar', ' baz', '']

    The :func:`split()` function strips each item and skips the items that
    are empty or consist of whitespace only:

    >>> from humanfriendly.text import split
    >>> split('foo,bar, baz,')
    ['foo', 'bar', 'baz']

    Calls can be nested to parse simple structured values, for example a
    mapping of names to lists that is encoded in an environment variable:

    >>> encoded_data = 'debug=green;critical=red,bold'
    >>> [(k, split(v, ',')) for k, v in (split(kv, '=') for kv in split(encoded_data, ';'))]
    [('debug', ['green']), ('critical', ['red', 'bold'])]
    """
    tokens = []
    for piece in text.split(delimiter):
        # Skip items without any content.
        if not piece or piece.isspace():
            continue
        tokens.append(piece.strip())
    return tokens
| 348 | |
| 349 | |
def split_paragraphs(text):
    """
    Break a string up into paragraphs.

    The text is split on two consecutive newline characters; leading and
    trailing empty lines are removed from each part using
    :func:`trim_empty_lines()` and parts that end up empty or contain only
    whitespace are left out of the result.

    :param text: The text to split into paragraphs (a string).
    :returns: A list of strings.
    """
    trimmed_chunks = (trim_empty_lines(chunk) for chunk in text.split('\n\n'))
    return [chunk for chunk in trimmed_chunks if chunk and not chunk.isspace()]
| 363 | |
| 364 | |
def tokenize(text):
    """
    Tokenize a text into numbers and strings.

    :param text: The text to tokenize (a string).
    :returns: A list of strings and/or numbers.

    This function is used to implement robust tokenization of user input in
    functions like :func:`.parse_size()` and :func:`.parse_timespan()`. It
    automatically coerces integer and floating point numbers, ignores
    whitespace and knows how to separate numbers from strings even without
    whitespace. Some examples to make this more concrete:

    >>> from humanfriendly.text import tokenize
    >>> tokenize('42')
    [42]
    >>> tokenize('42MB')
    [42, 'MB']
    >>> tokenize('42.5MB')
    [42.5, 'MB']
    >>> tokenize('42.5 MB')
    [42.5, 'MB']

    Only text matched by the number pattern is converted to a number; other
    text is returned as a string even when it consists of digit-like
    characters (for example a superscript two).
    """
    tokenized_input = []
    # Splitting on a pattern with one capturing group yields a list in which
    # the odd positions hold the text matched by the group (the numbers) and
    # the even positions hold the text in between.
    for index, token in enumerate(re.split(r'(\d+(?:\.\d+)?)', text)):
        if index % 2:
            # Decide by position rather than by str.isdigit(), which also
            # accepts characters that int() cannot convert.
            tokenized_input.append(float(token) if '.' in token else int(token))
        else:
            token = token.strip()
            if token:
                tokenized_input.append(token)
    return tokenized_input
| 398 | |
| 399 | |
def trim_empty_lines(text):
    """
    Trim leading and trailing empty lines from the given text.

    A line is considered empty when :func:`is_empty_line()` says so. Empty
    lines between non-empty lines are left alone.

    :param text: The text to trim (a string).
    :returns: The trimmed text (a string).
    """
    # Line endings are preserved so the remaining lines are reproduced exactly.
    lines = text.splitlines(True)
    start = 0
    end = len(lines)
    # Move the two boundaries inwards instead of popping lines one at a time:
    # removing the first element of a list shifts all remaining elements.
    while start < end and is_empty_line(lines[start]):
        start += 1
    while end > start and is_empty_line(lines[end - 1]):
        end -= 1
    return ''.join(lines[start:end])
