Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/humanfriendly/text.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4f3585e2f14b |
---|---|
1 # Human friendly input/output in Python. | |
2 # | |
3 # Author: Peter Odding <peter@peterodding.com> | |
4 # Last Change: December 1, 2020 | |
5 # URL: https://humanfriendly.readthedocs.io | |
6 | |
7 """ | |
8 Simple text manipulation functions. | |
9 | |
10 The :mod:`~humanfriendly.text` module contains simple functions to manipulate text: | |
11 | |
12 - The :func:`concatenate()` and :func:`pluralize()` functions make it easy to | |
13 generate human friendly output. | |
14 | |
15 - The :func:`format()`, :func:`compact()` and :func:`dedent()` functions | |
16 provide a clean and simple to use syntax for composing large text fragments | |
17 with interpolated variables. | |
18 | |
19 - The :func:`tokenize()` function parses simple user input. | |
20 """ | |
21 | |
22 # Standard library modules. | |
23 import numbers | |
24 import random | |
25 import re | |
26 import string | |
27 import textwrap | |
28 | |
29 # Public identifiers that require documentation. | |
30 __all__ = ( | |
31 'compact', | |
32 'compact_empty_lines', | |
33 'concatenate', | |
34 'dedent', | |
35 'format', | |
36 'generate_slug', | |
37 'is_empty_line', | |
38 'join_lines', | |
39 'pluralize', | |
40 'pluralize_raw', | |
41 'random_string', | |
42 'split', | |
43 'split_paragraphs', | |
44 'tokenize', | |
45 'trim_empty_lines', | |
46 ) | |
47 | |
48 | |
def compact(text, *args, **kw):
    '''
    Collapse all whitespace in a string and interpolate arguments.

    The text is split on runs of whitespace (which also discards leading and
    trailing whitespace), the resulting words are joined with single spaces
    and the result is passed through :func:`format()` together with any
    positional and keyword arguments.

    :param text: The text to compact (a string).
    :param args: Positional arguments, handed to :func:`format()`.
    :param kw: Keyword arguments, handed to :func:`format()`.
    :returns: The compacted and interpolated text (a string).

    This combines nicely with multi line string literals, because the
    indentation and line breaks in the source code don't end up in the
    message that is shown to the user::

      raise PortDiscoveryError(compact("""
          Failed to discover port(s) that Apache is listening on!
          Maybe I'm parsing the wrong configuration file? ({filename})
      """, filename=self.ports_config))
    '''
    # str.split() without arguments splits on whitespace runs and never
    # produces empty strings, so joining gives exactly one space per gap.
    return format(' '.join(text.split()), *args, **kw)
78 | |
79 | |
def compact_empty_lines(text):
    """
    Replace repeating empty lines with a single empty line (similar to ``cat -s``).

    :param text: The text in which to compact empty lines (a string).
    :returns: The text with empty lines compacted (a string).

    A line counts as empty when it contains nothing but whitespace (the same
    rule that :func:`is_empty_line()` applies). The first line of every run
    of empty lines is kept as is, the remaining lines of the run are dropped.

    >>> compact_empty_lines('a\\n\\n\\n\\nb\\n')
    'a\\n\\nb\\n'
    """
    kept_lines = []
    previous_was_empty = False
    # A single pass that builds a new list; removing items from the middle
    # of the list while scanning it would make this quadratic.
    for line in text.splitlines(True):
        current_is_empty = len(line) == 0 or line.isspace()
        if current_is_empty and previous_was_empty:
            continue
        kept_lines.append(line)
        previous_was_empty = current_is_empty
    return ''.join(kept_lines)
95 | |
96 | |
def concatenate(items, conjunction='and', serial_comma=False):
    """
    Join a sequence of strings into a human friendly enumeration.

    :param items:

      A sequence of strings (any iterable is accepted, it is converted to a
      list first).

    :param conjunction:

      The word placed in front of the last item (a string, defaults to
      "and").

    :param serial_comma:

      :data:`True` to add a `serial comma`_ in front of the conjunction,
      :data:`False` otherwise (defaults to :data:`False`).

    :returns:

      A single string: empty when there are no items, the item itself when
      there is exactly one item.

    >>> from humanfriendly.text import concatenate
    >>> concatenate(["eggs", "milk", "bread"])
    'eggs, milk and bread'
    >>> concatenate(["eggs", "milk", "bread"], serial_comma=True)
    'eggs, milk, and bread'

    .. _serial comma: https://en.wikipedia.org/wiki/Serial_comma
    """
    sequence = list(items)
    if not sequence:
        return ''
    if len(sequence) == 1:
        return sequence[0]
    # Everything except the last item is comma separated; the last item is
    # attached using the conjunction.
    leading_items = ', '.join(sequence[:-1])
    if serial_comma:
        leading_items += ','
    return ' '.join((leading_items, conjunction, sequence[-1]))
135 | |
136 | |
def dedent(text, *args, **kw):
    """
    Remove common indentation and surrounding empty lines from a string.

    Three steps are applied in order:

    1. :func:`textwrap.dedent()` removes the leading whitespace that all
       lines have in common.
    2. :func:`trim_empty_lines()` removes leading and trailing empty lines.
    3. :func:`format()` interpolates any positional and keyword arguments.

    :param text: The text to dedent (a string).
    :param args: Positional arguments, handed to :func:`format()`.
    :param kw: Keyword arguments, handed to :func:`format()`.
    :returns: The dedented text (a string).

    Where :func:`compact()` is meant for text presented to the user
    (whitespace is not so significant there), :func:`dedent()` is meant for
    generating data files and code (where line breaks and indentation are
    very significant).
    """
    return format(trim_empty_lines(textwrap.dedent(text)), *args, **kw)
161 | |
162 | |
def format(text, *args, **kw):
    """
    Interpolate values into a string using ``%`` and/or :meth:`str.format()`.

    :param text: The text to format (a string).
    :param args: Positional arguments are interpolated using the string
                 formatting operator (``%``). Without positional arguments
                 the operator is not applied at all, so a literal ``%`` in
                 the text is left alone.
    :param kw: Keyword arguments are interpolated using
               :meth:`str.format()`. Without keyword arguments the method is
               not called at all, so literal braces in the text are left
               alone.
    :returns: The text with any positional and/or keyword arguments
              interpolated (a string).

    When both kinds of arguments are given the ``%`` operator is applied
    first and :meth:`str.format()` is applied to its result.

    **Why format() instead of the string formatting operator?**

    - The operator needs tuple syntax as soon as you go from one value to
      multiple values. Because :func:`format()` takes a `variable number of
      arguments`_ the call looks the same in both cases:

      >>> from humanfriendly.text import format
      >>> print('the magic number is %s' % 42)
      the magic number is 42
      >>> print('the magic numbers are %s and %s' % (12, 42))
      the magic numbers are 12 and 42
      >>> print(format('the magic number is %s', 42))
      the magic number is 42
      >>> print(format('the magic numbers are %s and %s', 12, 42))
      the magic numbers are 12 and 42

    - Interpolating a single value that happens to be a tuple raises
      :exc:`TypeError` when the operator is used directly. The positional
      arguments of :func:`format()` are always wrapped in a tuple, so the
      value is interpolated as a whole:

      >>> value = (12, 42)
      >>> print('the magic value is %s' % value)
      Traceback (most recent call last):
        File "<stdin>", line 1, in <module>
      TypeError: not all arguments converted during string formatting
      >>> print(format('the magic value is %s', value))
      the magic value is (12, 42)

    **Why format() instead of the str.format() method?**

    A method call binds tighter than string concatenation, so when the text
    is composed from several parts parentheses are needed to format the
    complete text. Passing the text as a function argument avoids this:

    >>> "{adjective} example" + " " + "(can't think of anything less {adjective})".format(adjective='silly')
    "{adjective} example (can't think of anything less silly)"
    >>> ("{adjective} example" + " " + "(can't think of anything less {adjective})").format(adjective='silly')
    "silly example (can't think of anything less silly)"
    >>> format("{adjective} example" + " " + "(can't think of anything less {adjective})", adjective='silly')
    "silly example (can't think of anything less silly)"

    The :func:`compact()` and :func:`dedent()` functions combine
    :func:`format()` with whitespace manipulation.

    .. _variable number of arguments: https://docs.python.org/3/tutorial/controlflow.html#arbitrary-argument-lists
    """
    # Each interpolation style is only applied when arguments of the
    # matching kind were actually given.
    interpolated = text % args if args else text
    return interpolated.format(**kw) if kw else interpolated
245 | |
246 | |
def generate_slug(text, delimiter="-"):
    """
    Convert text to a normalized "slug" without whitespace.

    :param text: The original text, for example ``Some Random Text!``.
    :param delimiter: The delimiter used to separate words
                      (defaults to the ``-`` character).
    :returns: The slug text, for example ``some-random-text``.
    :raises: :exc:`ValueError` when the provided text is nonempty but
             results in an empty slug.

    The text is lowercased and every run of characters other than ``a-z``
    and ``0-9`` is treated as a word boundary. The words are joined with
    the delimiter, so the delimiter never appears at the start or end of
    the slug and is always inserted literally.

    >>> generate_slug('Some Random Text!')
    'some-random-text'
    """
    # Splitting and joining (instead of substituting and then stripping the
    # delimiter) keeps leading and trailing word characters intact even when
    # the delimiter itself contains letters or digits.
    words = [word for word in re.split("[^a-z0-9]+", text.lower()) if word]
    slug = delimiter.join(words)
    if text and not slug:
        raise ValueError("The provided text %r results in an empty slug!" % (text,))
    return slug
266 | |
267 | |
def is_empty_line(text):
    """
    Tell whether a text is "empty" (zero length or nothing but whitespace).

    :param text: The text to check for "emptiness" (a string).
    :returns: :data:`True` if the text is empty or contains only whitespace,
              :data:`False` otherwise.
    """
    # str.isspace() returns False for the empty string, which is why the
    # zero length case is checked separately.
    if len(text) == 0:
        return True
    return text.isspace()
277 | |
278 | |
def join_lines(text):
    """
    Remove "hard wrapping" from the paragraphs in a string.

    :param text: The text to reformat (a string).
    :returns: The text without hard wrapping (a string).

    A line break is replaced with a space when the character right before it
    and the character right after it are both non-whitespace characters.
    This means that common leading indentation will break
    :func:`join_lines()` (in that case you can use :func:`dedent()` before
    calling :func:`join_lines()`).

    >>> join_lines('foo\\nbar\\n\\nbaz')
    'foo bar\\n\\nbaz'
    >>> join_lines('a\\nb\\nc')
    'a b c'
    """
    # The neighbouring characters are checked with zero-width lookarounds so
    # that they are not consumed by the match: a line of a single character
    # is the right neighbour of one line break and the left neighbour of the
    # next one, and both line breaks need to be replaced.
    return re.sub(r'(?<=\S)\n(?=\S)', ' ', text)
293 | |
294 | |
def pluralize(count, singular, plural=None):
    """
    Format a count followed by the matching form of a word.

    :param count: The count (a number).
    :param singular: The singular form of the word (a string).
    :param plural: The plural form of the word (a string or :data:`None`).
    :returns: The count and the singular or plural word, separated by a
              space (a string).

    The choice between the singular and plural form is delegated to
    :func:`pluralize_raw()`.
    """
    selected_word = pluralize_raw(count, singular, plural)
    return '%s %s' % (count, selected_word)
307 | |
308 | |
def pluralize_raw(count, singular, plural=None):
    """
    Pick the singular or plural form of a word that matches a count.

    :param count: The count (a number).
    :param singular: The singular form of the word (a string).
    :param plural: The plural form of the word (a string or :data:`None`).
    :returns: The singular or plural form of the word (a string).

    The singular form is selected only when the count, converted to a
    floating point number, is exactly 1.0. In all other cases (including
    zero, negative numbers and fractions) the plural form is selected.

    When no plural form is given (or an empty one) it is derived by
    appending the letter "s" to the singular form. This is obviously not
    correct for every word, which is why both forms can be specified.
    """
    plural_form = plural if plural else singular + 's'
    is_exactly_one = float(count) == 1.0
    return singular if is_exactly_one else plural_form
329 | |
330 | |
def random_string(length=(25, 100), characters=string.ascii_letters):
    """random_string(length=(25, 100), characters=string.ascii_letters)
    Generate a string of randomly selected characters.

    :param length: The length of the string to be generated (a number or a
                   tuple with two numbers). When this is not a number its
                   first two items are used as the inclusive bounds from
                   which a random length is drawn.
    :param characters: The characters to pick from (a string, defaults
                       to :data:`string.ascii_letters`).
    :returns: A random string.

    This function is mostly useful in test suites. It is based on the
    :mod:`random` module, so the result is not suitable for security
    sensitive purposes.
    """
    if isinstance(length, numbers.Number):
        size = length
    else:
        # random.randint() includes both bounds.
        size = random.randint(length[0], length[1])
    picked = [random.choice(characters) for _ in range(size)]
    return ''.join(picked)
349 | |
350 | |
def split(text, delimiter=','):
    """
    Split a delimited list of strings, ignoring whitespace and empty items.

    :param text: The text to split (a string).
    :param delimiter: The delimiter to split on (a string, defaults to a
                      comma).
    :returns: A list of zero or more nonempty strings.

    Python's built in :meth:`str.split()` method keeps the whitespace that
    surrounds items and produces empty strings for empty items:

    >>> 'foo,bar, baz,'.split(',')
    ['foo', 'bar', ' baz', '']

    The :func:`split()` function strips each item and leaves out the items
    that are empty or consist of nothing but whitespace:

    >>> from humanfriendly.text import split
    >>> split('foo,bar, baz,')
    ['foo', 'bar', 'baz']

    Calls with different delimiters can be nested to parse simple data
    structures that are encoded in a string (for example in an environment
    variable):

    >>> encoded_data = 'debug=green;warning=yellow;error=red;critical=red,bold'
    >>> parsed_data = dict((k, split(v, ',')) for k, v in (split(kv, '=') for kv in split(encoded_data, ';')))
    >>> parsed_data['critical']
    ['red', 'bold']
    >>> parsed_data['debug']
    ['green']
    """
    tokens = []
    for raw_token in text.split(delimiter):
        stripped_token = raw_token.strip()
        # Items that are empty after stripping were either empty to begin
        # with or contained only whitespace.
        if stripped_token:
            tokens.append(stripped_token)
    return tokens
385 | |
386 | |
def split_paragraphs(text):
    """
    Split a string into paragraphs.

    :param text: The text to split into paragraphs (a string).
    :returns: A list of strings (one string per paragraph).

    The text is split on two consecutive newline characters, leading and
    trailing empty lines are removed from each chunk using
    :func:`trim_empty_lines()` and chunks that end up empty (or contain only
    whitespace) are left out of the result.
    """
    trimmed_chunks = (trim_empty_lines(chunk) for chunk in text.split('\n\n'))
    return [chunk for chunk in trimmed_chunks if chunk and not chunk.isspace()]
400 | |
401 | |
def tokenize(text):
    """
    Tokenize a text into numbers and strings.

    :param text: The text to tokenize (a string).
    :returns: A list of strings and/or numbers.

    This function is used to implement robust tokenization of user input in
    functions like :func:`.parse_size()` and :func:`.parse_timespan()`. It
    automatically coerces integer and floating point numbers, ignores
    whitespace and knows how to separate numbers from strings even without
    whitespace. Some examples to make this more concrete:

    >>> from humanfriendly.text import tokenize
    >>> tokenize('42')
    [42]
    >>> tokenize('42MB')
    [42, 'MB']
    >>> tokenize('42.5MB')
    [42.5, 'MB']
    >>> tokenize('42.5 MB')
    [42.5, 'MB']

    Only the text matched by the number pattern is converted to a number.
    Characters that merely look like digits (for example the superscript in
    ``5²``) are not matched by the pattern and are returned as strings.
    """
    tokenized_input = []
    # Because the pattern contains one capture group, re.split() returns the
    # text between numbers at the even positions and the matched numbers at
    # the odd positions. Using the position to tell the two apart means that
    # int() and float() are only called on text the pattern matched.
    for position, piece in enumerate(re.split(r'(\d+(?:\.\d+)?)', text)):
        if position % 2:
            tokenized_input.append(float(piece) if '.' in piece else int(piece))
        else:
            piece = piece.strip()
            if piece:
                tokenized_input.append(piece)
    return tokenized_input
435 | |
436 | |
def trim_empty_lines(text):
    """
    Strip empty lines from the start and end of the given text.

    :param text: The text to trim (a string).
    :returns: The trimmed text (a string).

    A line counts as empty when it contains nothing but whitespace (the same
    rule that :func:`is_empty_line()` applies). Empty lines in between
    nonempty lines are preserved, as are the line endings of the lines that
    remain.
    """
    lines = text.splitlines(True)
    first, last = 0, len(lines)
    # Move the start of the selection past the leading empty lines.
    while first < last and (len(lines[first]) == 0 or lines[first].isspace()):
        first += 1
    # Move the end of the selection in front of the trailing empty lines.
    while last > first and (len(lines[last - 1]) == 0 or lines[last - 1].isspace()):
        last -= 1
    return ''.join(lines[first:last])