comparison env/lib/python3.9/site-packages/humanfriendly/text.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 # Human friendly input/output in Python.
2 #
3 # Author: Peter Odding <peter@peterodding.com>
4 # Last Change: December 1, 2020
5 # URL: https://humanfriendly.readthedocs.io
6
7 """
8 Simple text manipulation functions.
9
10 The :mod:`~humanfriendly.text` module contains simple functions to manipulate text:
11
12 - The :func:`concatenate()` and :func:`pluralize()` functions make it easy to
13 generate human friendly output.
14
15 - The :func:`format()`, :func:`compact()` and :func:`dedent()` functions
16 provide a clean and simple to use syntax for composing large text fragments
17 with interpolated variables.
18
19 - The :func:`tokenize()` function parses simple user input.
20 """
21
22 # Standard library modules.
23 import numbers
24 import random
25 import re
26 import string
27 import textwrap
28
# Public identifiers that require documentation. This tuple also controls
# which names ``from humanfriendly.text import *`` makes available.
__all__ = (
    'compact',
    'compact_empty_lines',
    'concatenate',
    'dedent',
    'format',
    'generate_slug',
    'is_empty_line',
    'join_lines',
    'pluralize',
    'pluralize_raw',
    'random_string',
    'split',
    'split_paragraphs',
    'tokenize',
    'trim_empty_lines',
)
47
48
def compact(text, *args, **kw):
    '''
    Collapse all whitespace in a string and interpolate arguments into it.

    Leading and trailing whitespace is dropped and every run of whitespace
    characters (including line breaks) is replaced with a single space. The
    result is then handed to :func:`format()` together with any arguments.

    :param text: The text to compact (a string).
    :param args: Any positional arguments are interpolated using :func:`format()`.
    :param kw: Any keyword arguments are interpolated using :func:`format()`.
    :returns: The compacted text (a string).

    This combines nicely with Python's multi line strings: a long message can
    be laid out readably in the source code without the line breaks and
    indentation ending up in the output. For example::

     raise PortDiscoveryError(compact("""
         Failed to discover port(s) that Apache is listening on!
         Maybe I'm parsing the wrong configuration file? ({filename})
     """, filename=self.ports_config))
    '''
    # Without arguments str.split() splits on runs of whitespace and never
    # produces empty tokens, so joining the tokens is all that's left to do.
    return format(' '.join(text.split()), *args, **kw)
78
79
def compact_empty_lines(text):
    """
    Replace repeating empty lines with a single empty line (similar to ``cat -s``).

    :param text: The text in which to compact empty lines (a string).
    :returns: The text with empty lines compacted (a string).

    Lines are classified using :func:`is_empty_line()`, which means that
    whitespace-only lines count as empty. The first line of every run of
    empty lines is preserved as is (including its line ending), the
    remaining lines of the run are dropped.
    """
    kept_lines = []
    previous_was_empty = False
    # The result is built in a single pass, because removing lines from the
    # middle of a list while scanning it takes quadratic time on large inputs.
    for line in text.splitlines(True):
        current_is_empty = is_empty_line(line)
        if not (previous_was_empty and current_is_empty):
            kept_lines.append(line)
        previous_was_empty = current_is_empty
    return ''.join(kept_lines)
95
96
def concatenate(items, conjunction='and', serial_comma=False):
    """
    Concatenate a list of items in a human friendly way.

    :param items:

        A sequence (or other iterable) of strings.

    :param conjunction:

        The word to use before the last item (a string, defaults to "and").

    :param serial_comma:

        :data:`True` to use a `serial comma`_, :data:`False` otherwise
        (defaults to :data:`False`). A serial comma is only emitted when
        there are three or more items, because a pair of items is never
        separated by a comma.

    :returns:

        A single string (an empty string when there are no items).

    >>> from humanfriendly.text import concatenate
    >>> concatenate(["eggs", "milk", "bread"])
    'eggs, milk and bread'
    >>> concatenate(["eggs", "milk", "bread"], serial_comma=True)
    'eggs, milk, and bread'
    >>> concatenate(["eggs", "milk"], serial_comma=True)
    'eggs and milk'

    .. _serial comma: https://en.wikipedia.org/wiki/Serial_comma
    """
    # Materialize the items so that arbitrary iterables are supported.
    items = list(items)
    if not items:
        return ''
    if len(items) == 1:
        return items[0]
    leading_items, final_item = items[:-1], items[-1]
    formatted = ', '.join(leading_items)
    # With just two items there is no series, so "a, and b" would be wrong.
    if serial_comma and len(leading_items) > 1:
        formatted += ','
    return ' '.join([formatted, conjunction, final_item])
135
136
def dedent(text, *args, **kw):
    """
    Strip common indentation from a string and interpolate arguments into it.

    Three steps are applied in order: :func:`textwrap.dedent()` removes the
    leading whitespace shared by all lines, :func:`trim_empty_lines()` drops
    leading and trailing empty lines and :func:`format()` interpolates any
    arguments that were given.

    :param text: The text to dedent (a string).
    :param args: Any positional arguments are interpolated using :func:`format()`.
    :param kw: Any keyword arguments are interpolated using :func:`format()`.
    :returns: The dedented text (a string).

    Where :func:`compact()` is meant for text that is presented to the user
    (whitespace is not significant there), :func:`dedent()` is meant for
    generating data files and code, where newlines and indentation are very
    significant. See :func:`compact()` for a usage example.
    """
    return format(trim_empty_lines(textwrap.dedent(text)), *args, **kw)
161
162
def format(text, *args, **kw):
    """
    Interpolate values into a string using ``%`` and/or :meth:`str.format()`.

    :param text: The text to format (a string).
    :param args: Any positional arguments are interpolated into the text using
                 the string formatting operator (``%``). When no positional
                 arguments are given this step is skipped.
    :param kw: Any keyword arguments are interpolated into the text using the
               :meth:`str.format()` method. When no keyword arguments are
               given this step is skipped.
    :returns: The text with any positional and/or keyword arguments
              interpolated (a string).

    When both kinds of arguments are given the ``%`` interpolation is
    performed first and :meth:`str.format()` is applied to its result.

    The implementation is trivial, the function exists because it avoids a
    couple of quirks in Python's built in string formatting.

    **Compared to the string formatting operator**

    The positional arguments always arrive as a tuple (see `variable number of
    arguments`_). This means there's no need to switch to tuple syntax when
    going from one value to several values, and a single value that happens
    to be a tuple can't cause a :exc:`~exceptions.TypeError`:

    >>> from humanfriendly.text import format
    >>> print(format('the magic number is %s', 42))
    the magic number is 42
    >>> print(format('the magic numbers are %s and %s', 12, 42))
    the magic numbers are 12 and 42
    >>> value = (12, 42)
    >>> print('the magic value is %s' % value)
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
    TypeError: not all arguments converted during string formatting
    >>> print(format('the magic value is %s', value))
    the magic value is (12, 42)

    **Compared to the str.format() method**

    A method call binds more tightly than string concatenation, so with
    :meth:`str.format()` parentheses are needed to format a concatenated
    string as a whole. Passing the text as a function argument avoids this:

    >>> "{adjective} example" + " " + "(can't think of anything less {adjective})".format(adjective='silly')
    "{adjective} example (can't think of anything less silly)"
    >>> ("{adjective} example" + " " + "(can't think of anything less {adjective})").format(adjective='silly')
    "silly example (can't think of anything less silly)"
    >>> format("{adjective} example" + " " + "(can't think of anything less {adjective})", adjective='silly')
    "silly example (can't think of anything less silly)"

    The :func:`compact()` and :func:`dedent()` functions combine
    :func:`format()` with whitespace manipulation.

    .. _variable number of arguments: https://docs.python.org/2/tutorial/controlflow.html#arbitrary-argument-lists
    """
    # Each step is skipped without arguments, so that text containing literal
    # percent signs or curly braces passes through untouched.
    interpolated = text % args if args else text
    return interpolated.format(**kw) if kw else interpolated
245
246
def generate_slug(text, delimiter="-"):
    """
    Convert text to a normalized "slug" without whitespace.

    :param text: The original text, for example ``Some Random Text!``.
    :param delimiter: The delimiter used to separate words
                      (defaults to the ``-`` character).
    :returns: The slug text, for example ``some-random-text``.
    :raises: :exc:`~exceptions.ValueError` when the provided
             text is nonempty but results in an empty slug.

    The text is lowercased and every run of the characters ``a-z`` and
    ``0-9`` is treated as a word, everything in between words is replaced
    by the delimiter. The delimiter is used literally and may contain any
    characters, including letters, digits and backslashes.
    """
    # Extracting the words and joining them (instead of substituting the
    # separators and stripping the result) means the delimiter never needs
    # to be escaped and can't cause characters to be stripped from the words.
    words = re.findall("[a-z0-9]+", text.lower())
    slug = delimiter.join(words)
    if text and not slug:
        msg = "The provided text %r results in an empty slug!"
        raise ValueError(msg % (text,))
    return slug
266
267
def is_empty_line(text):
    """
    Tell whether a text is blank (empty or made up of whitespace only).

    :param text: The text to check for "emptiness" (a string).
    :returns: :data:`True` if the text is empty or contains only whitespace,
              :data:`False` otherwise.
    """
    if not text:
        # str.isspace() is False for an empty string, hence the special case.
        return True
    return text.isspace()
277
278
def join_lines(text):
    """
    Remove "hard wrapping" from the paragraphs in a string.

    :param text: The text to reformat (a string).
    :returns: The text without hard wrapping (a string).

    This function works by replacing a line break with a space when the last
    character before the line break and the first character after the line
    break are both non-whitespace characters. This means that common leading
    indentation will break :func:`join_lines()` (in that case you can use
    :func:`dedent()` before calling :func:`join_lines()`).

    >>> from humanfriendly.text import join_lines
    >>> join_lines("a\\nb\\nc")
    'a b c'
    """
    # The neighbouring characters are checked with lookaround assertions so
    # they are not consumed by the match. If they were, the character after
    # one line break could not also serve as the character before the next
    # line break, and lines consisting of a single character would be skipped.
    return re.sub(r'(?<=\S)\n(?=\S)', ' ', text)
293
294
def pluralize(count, singular, plural=None):
    """
    Format a count followed by the matching singular or plural word.

    :param count: The count (a number).
    :param singular: The singular form of the word (a string).
    :param plural: The plural form of the word (a string or :data:`None`).
    :returns: The count and singular or plural word concatenated (a string).

    The word is selected by :func:`pluralize_raw()`, refer to that function
    for details about the selection logic.
    """
    word = pluralize_raw(count, singular, plural)
    return '%s %s' % (count, word)
307
308
def pluralize_raw(count, singular, plural=None):
    """
    Pick the singular or plural form of a word to go with a count.

    :param count: The count (a number).
    :param singular: The singular form of the word (a string).
    :param plural: The plural form of the word (a string or :data:`None`).
    :returns: The singular or plural form of the word (a string).

    The singular form is selected only when the count is exactly 1.0, any
    other count selects the plural form.

    When no plural form is provided (or it is empty) the letter "s" is
    appended to the singular form. That's not correct for every word, which
    is why both forms can be specified.
    """
    plural_form = plural or singular + 's'
    is_exactly_one = float(count) == 1.0
    return singular if is_exactly_one else plural_form
329
330
def random_string(length=(25, 100), characters=string.ascii_letters):
    """random_string(length=(25, 100), characters=string.ascii_letters)
    Build a string of randomly selected characters.

    :param length: The length of the string to be generated (a number or a
                   tuple with two numbers). When a tuple is given the length
                   is a random number between the two numbers in the tuple
                   (both numbers are possible outcomes).
    :param characters: The characters to be used (a string, defaults
                       to :data:`string.ascii_letters`).
    :returns: A random string.

    This function is mostly useful in test suites. It's based on the
    :mod:`random` module, so the result is not suitable for security
    sensitive purposes.
    """
    if isinstance(length, numbers.Number):
        size = length
    else:
        minimum, maximum = length[0], length[1]
        size = random.randint(minimum, maximum)
    return ''.join(random.choice(characters) for _ in range(size))
349
350
def split(text, delimiter=','):
    """
    Split a delimited list of strings, discarding whitespace and empty items.

    :param text: The text to split (a string).
    :param delimiter: The delimiter to split on (a string, defaults to a comma).
    :returns: A list of zero or more nonempty strings.

    Python's built in :meth:`str.split()` method preserves surrounding
    whitespace and empty strings:

    >>> 'foo,bar, baz,'.split(',')
    ['foo', 'bar', ' baz', '']

    The :func:`split()` function strips each item and omits the items that
    turn out to be empty:

    >>> from humanfriendly.text import split
    >>> split('foo,bar, baz,')
    ['foo', 'bar', 'baz']

    Calls can be nested to parse simple data structures, like this mapping of
    logging level names to one or more styles per level (encoded in a string
    so that it can be set as an environment variable):

    >>> from pprint import pprint
    >>> encoded_data = 'debug=green;warning=yellow;error=red;critical=red,bold'
    >>> parsed_data = dict((k, split(v, ',')) for k, v in (split(kv, '=') for kv in split(encoded_data, ';')))
    >>> pprint(parsed_data)
    {'debug': ['green'],
     'warning': ['yellow'],
     'error': ['red'],
     'critical': ['red', 'bold']}
    """
    stripped_tokens = (token.strip() for token in text.split(delimiter))
    # An empty or whitespace-only token is an empty string once stripped.
    return [token for token in stripped_tokens if token]
385
386
def split_paragraphs(text):
    """
    Break a string up into paragraphs (one or more lines delimited by an empty line).

    :param text: The text to split into paragraphs (a string).
    :returns: A list of strings.

    The text is split on two consecutive newline characters, leading and
    trailing empty lines are removed from each chunk using
    :func:`trim_empty_lines()` and chunks without any content are omitted.
    """
    trimmed_chunks = (trim_empty_lines(chunk) for chunk in text.split('\n\n'))
    return [chunk for chunk in trimmed_chunks if chunk and not chunk.isspace()]
400
401
def tokenize(text):
    """
    Tokenize a text into numbers and strings.

    :param text: The text to tokenize (a string).
    :returns: A list of strings and/or numbers.

    This function is used to implement robust tokenization of user input in
    functions like :func:`.parse_size()` and :func:`.parse_timespan()`. It
    automatically coerces integer and floating point numbers, ignores
    whitespace and knows how to separate numbers from strings even without
    whitespace. Some examples to make this more concrete:

    >>> from humanfriendly.text import tokenize
    >>> tokenize('42')
    [42]
    >>> tokenize('42MB')
    [42, 'MB']
    >>> tokenize('42.5MB')
    [42.5, 'MB']
    >>> tokenize('42.5 MB')
    [42.5, 'MB']

    Only text matched by the number pattern is converted to a number. Other
    characters are always returned as strings, even when Unicode classifies
    them as digits (the superscript two in ``5²`` is an example).
    """
    tokenized_input = []
    # Because the pattern contains a capturing group, re.split() returns the
    # text between numbers at the even positions and the captured numbers at
    # the odd positions. Relying on the position (instead of str.isdigit())
    # guarantees that int() and float() only see text they can convert.
    for position, token in enumerate(re.split(r'(\d+(?:\.\d+)?)', text)):
        if position % 2:
            tokenized_input.append(float(token) if '.' in token else int(token))
        else:
            token = token.strip()
            if token:
                tokenized_input.append(token)
    return tokenized_input
435
436
def trim_empty_lines(text):
    """
    Trim leading and trailing empty lines from the given text.

    :param text: The text to trim (a string).
    :returns: The trimmed text (a string).

    Lines are classified using :func:`is_empty_line()`, which means that
    whitespace-only lines count as empty. Empty lines in between other
    lines are preserved, as are the line endings of the remaining lines.
    """
    lines = text.splitlines(True)
    first, last = 0, len(lines)
    # The boundaries are located by moving two indexes and the list is
    # sliced once, because repeatedly popping the first item of a list
    # takes quadratic time when there are many leading empty lines.
    while first < last and is_empty_line(lines[first]):
        first += 1
    while last > first and is_empty_line(lines[last - 1]):
        last -= 1
    return ''.join(lines[first:last])