comparison column_maker.xml @ 9:6595517c2dd8 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/column_maker commit fe76077775aaca531f6a563fdfcbd73fbf1528e7
author iuc
date Thu, 28 Jul 2022 15:28:30 +0000
parents 02026300aa45
children
comparison
equal deleted inserted replaced
8:02026300aa45 9:6595517c2dd8
1 <tool id="Add_a_column1" name="Compute" version="1.6"> 1 <tool id="Add_a_column1" name="Compute" version="2.0">
2 <description>an expression on every row</description> 2 <description>on rows</description>
3 <macros>
4 <xml name="compute_repeat">
5 <repeat name="expressions" title="Expressions" min="1" default="1">
6 <param name="cond" type="text" value="c3-c2" label="Add expression">
7 <sanitizer>
8 <valid initial="default">
9 <add value="&lt;" />
10 <add value="&gt;" />
11 <add value="&quot;" />
12 <add value="&apos;" />
13 </valid>
14 </sanitizer>
15 </param>
16 <conditional name="add_column">
17 <param name="mode" type="select" label="Mode of the operation">
18 <option value="">Append</option>
19 <option value="I">Insert</option>
20 <option value="R">Replace</option>
21 </param>
22 <when value="">
23 <param name="pos" type="hidden" value="" />
24 </when>
25 <when value="I">
26 <param name="pos" type="integer" min="1" value="1" label="Insert new column before existing column number" />
27 </when>
28 <when value="R">
29 <param name="pos" type="integer" min="1" value="1" label="Use new column to replace column number" />
30 </when>
31 </conditional>
32 <yield />
33 </repeat>
34 </xml>
35 </macros>
3 <requirements> 36 <requirements>
4 <requirement type="package" version="3.8">python</requirement> 37 <requirement type="package" version="3.8">python</requirement>
5 <requirement type="package" version="1.19.1">numpy</requirement> 38 <requirement type="package" version="1.23.1">numpy</requirement>
6 </requirements> 39 </requirements>
7 <command detect_errors="aggressive"><![CDATA[ 40 <command detect_errors="aggressive"><![CDATA[
8 ## inject colums and column_types metadata into inputs json 41 python '$__tool_directory__/column_maker.py'
9 #import json 42 #if str($error_handling.auto_col_types) == 'on':
10 #set inputs_dict = json.load(open($inputs)) 43 #set $col_types = $input.metadata.column_types
11 #set inputs_dict['columns'] = $input.metadata.columns 44 #else:
12 #set inputs_dict['column_types'] = $input.metadata.column_types 45 #set $col_types = ','.join(['str' for t in $input.metadata.column_types.split(',')])
13 ## flatten conditional
14 #if $header_lines_conditional.header_lines_select == "yes":
15 #set inputs_dict['header_new_column_name'] = str($header_lines_conditional.header_new_column_name)
16 #end if 46 #end if
17 #set x = json.dump($inputs_dict, open($inputs, 'w')) 47 --column-types $col_types
18 48 $avoid_scientific_notation
19 python '$__tool_directory__/column_maker.py' 49 #if str($ops.header_lines_select) == 'yes':
50 --header
51 #end if
52 --file '$expressions_file'
53 $error_handling.fail_on_non_existent_columns
54 $error_handling.non_computable.action
55 #if str($error_handling.non_computable.action) == '--non-computable-default':
56 '$error_handling.non_computable.default_value'
57 #end if
20 '$input' 58 '$input'
21 '$out_file1' 59 '$out_file1'
22 --load_json '$inputs'
23 ]]></command> 60 ]]></command>
24 <configfiles> 61 <configfiles>
25 <inputs name="inputs"/> 62 <configfile name="expressions_file"><![CDATA[
63 #if str($ops.header_lines_select) == 'yes':
64 #for $expr in $ops.expressions:
65 ${expr.cond};${expr.add_column.pos}${expr.add_column.mode};${expr.new_column_name}
66 #end for
67 #else:
68 #for $expr in $ops.expressions:
69 ${expr.cond};${expr.add_column.pos}${expr.add_column.mode};
70 #end for
71 #end if
72 ]]></configfile>
26 </configfiles> 73 </configfiles>
27 <inputs> 74 <inputs>
28 <param name="cond" type="text" value="c3-c2" label="Add expression"> 75 <param name="input" type="data" format="tabular" label="Input file" help="Dataset missing? See TIP below" />
29 <sanitizer> 76 <conditional name="ops">
30 <valid initial="default">
31 <add value="&lt;" />
32 <add value="&gt;" />
33 <add value="&quot;" />
34 <add value="&apos;" />
35 </valid>
36 </sanitizer>
37 </param>
38 <param name="input" type="data" format="tabular" label="as a new column to" help="Dataset missing? See TIP below"/>
39 <param name="round" type="boolean" truevalue="yes" falsevalue="no" label="Round result?" />
40 <param name="avoid_scientific_notation" type="boolean" truevalue="yes" falsevalue="no"
41 label="Avoid scientific notation"
42 help="If yes, use fully expanded decimal representation when writing new columns (use only if expression produces decimal numbers)." />
43 <conditional name="header_lines_conditional">
44 <param name="header_lines_select" type="select" 77 <param name="header_lines_select" type="select"
45 label="Input has a header line with column names?" 78 label="Input has a header line with column names?"
46 help="Select Yes to be able to specify a name for the new column and have it added to the header line. If you select No, the first line will be treated as a regular line: If it is empty or starts with a # character it will be skipped, otherwise the tool will attempt to compute the specified expression on it." > 79 help="Select Yes to be able to specify names for new columns and have them added to the header line. If you select No, the first line will be treated as a regular line: If it is empty or starts with a # character it will be skipped, otherwise the tool will attempt to compute the specified expression on it." >
47 <option value="no">No</option> 80 <option value="no">No</option>
48 <option value="yes">Yes</option> 81 <option value="yes">Yes</option>
49 </param> 82 </param>
50 <when value="no" /> 83 <when value="no">
84 <expand macro="compute_repeat" />
85 </when>
51 <when value="yes"> 86 <when value="yes">
52 <param name="header_new_column_name" type="text" value="New Column" label="The new column name" /> 87 <expand macro="compute_repeat">
88 <param name="new_column_name" type="text" value="New Column" label="The new column name" />
89 </expand>
53 </when> 90 </when>
54 </conditional> 91 </conditional>
92 <param name="avoid_scientific_notation" type="boolean" truevalue="--avoid-scientific-notation" falsevalue=""
93 label="Avoid scientific notation in any newly computed columns"
94 help="If yes, use fully expanded decimal representation when writing new columns with floating point values. To prevent scientific notation in just specific new columns, you can use numpy's format_float_positional function in the corresponding expression." />
95 <section name="error_handling" title="Error handling">
96 <param name="auto_col_types" type="boolean" truevalue="on" falsevalue="off" checked="true" label="Autodetect column types"
97 help="By default, try to use the column types that Galaxy has recorded for the input. This simplifies expressions, but can occasionally cause problems on its own. If disabled all column values are assumed to be strings and you will have to handle conversions to different types explicitly in the expression." />
98 <param argument="--fail-on-non-existent-columns" type="boolean" truevalue="--fail-on-non-existent-columns" falsevalue="" checked="true" label="Fail on references to non-existent columns"
99 help="If any expression references a column number that does not exist when that expression gets computed, the tool run will fail. Uncheck to have such a situation handled as a case of a non-computable expression as configured below." />
100 <conditional name="non_computable">
101 <param name="action" type="select" label="If an expression cannot be computed for a row">
102 <option value="--fail-on-non-computable">Fail the entire tool run</option>
103 <option value="--skip-non-computable">Skip the row</option>
104 <option value="--keep-non-computable">Keep the row unchanged</option>
105 <option value="--non-computable-blank">Produce an empty column value for the row</option>
106 <option value="--non-computable-default">Fill in a replacement value</option>
107 </param>
108 <when value="--fail-on-non-computable" />
109 <when value="--skip-non-computable" />
110 <when value="--keep-non-computable" />
111 <when value="--non-computable-blank" />
112 <when value="--non-computable-default">
113 <param name="default_value" type="text" label="Replacement value" help="Pick from suggestions or enter your own.">
114 <option value="nan">nan (not a number)</option>
115 <option value="inf">inf (infinity)</option>
116 <option value="-inf">-inf (negative infinity)</option>
117 <option value="NA">NA (not available)</option>
118 <option value=".">.</option>
119 </param>
120 </when>
121 </conditional>
122 </section>
55 </inputs> 123 </inputs>
56 <outputs> 124 <outputs>
57 <data name="out_file1" format_source="input" metadata_source="input"/> 125 <data name="out_file1" format_source="input" metadata_source="input"/>
58 </outputs> 126 </outputs>
59 <tests> 127 <tests>
60 <test> 128 <test>
61 <param name="cond" value="c3-c2"/> 129 <param name="cond" value="float(c3-c2)"/>
62 <param name="input" value="1.bed"/> 130 <param name="input" value="1.bed" ftype="bed" />
63 <param name="round" value="false"/>
64 <output name="out_file1" file="column_maker_out1.interval"/> 131 <output name="out_file1" file="column_maker_out1.interval"/>
65 </test> 132 </test>
66 <test> 133 <test>
67 <param name="cond" value="c4*1"/> 134 <param name="cond" value="c4*1."/>
135 <param name="input" value="1.interval" ftype="interval" />
136 <output name="out_file1" file="column_maker_out2.interval"/>
137 </test>
138 <test>
139 <param name="cond" value="c4*1."/>
140 <param name="input" value="1.header.tsv" ftype="tabular" />
141 <param name="header_lines_select" value="yes" />
142 <param name="new_column_name" value="value1_again" />
143 <output name="out_file1" file="column_maker_out2.header.tsv"/>
144 </test>
145 <test>
146 <param name="cond" value="round(c4*1)"/>
68 <param name="input" value="1.interval"/> 147 <param name="input" value="1.interval"/>
69 <param name="round" value="false"/>
70 <output name="out_file1" file="column_maker_out2.interval"/>
71 </test>
72 <test>
73 <param name="cond" value="c4*1"/>
74 <param name="input" value="1.header.tsv"/>
75 <param name="round" value="false"/>
76 <conditional name="header_lines_conditional">
77 <param name="header_lines_select" value="yes" />
78 <param name="header_new_column_name" value="value1_again" />
79 </conditional>
80 <output name="out_file1" file="column_maker_out2.header.tsv"/>
81 </test>
82 <test>
83 <param name="cond" value="c4*1"/>
84 <param name="input" value="1.interval"/>
85 <param name="round" value="true"/>
86 <output name="out_file1" file="column_maker_out3.interval"/> 148 <output name="out_file1" file="column_maker_out3.interval"/>
87 </test> 149 </test>
88 <test> 150 <test>
89 <!-- test that single column input works --> 151 <!-- test that single column input works -->
90 <param name="cond" value="c1/10"/> 152 <param name="cond" value="c1/10"/>
91 <param name="input" value="1.tab" ftype="tabular"/> 153 <param name="input" value="1.tab" ftype="tabular" />
92 <param name="round" value="no"/>
93 <output name="out_file1" file="column_maker_out4.tab"/> 154 <output name="out_file1" file="column_maker_out4.tab"/>
94 </test> 155 </test>
95 <test> 156 <test>
96 <param name="cond" value="float(.0000000000001)"/> 157 <param name="cond" value="float(.0000000000001)"/>
97 <param name="input" value="1.bed"/> 158 <param name="input" value="1.bed"/>
98 <param name="round" value="false"/>
99 <output name="out_file1"> 159 <output name="out_file1">
100 <assert_contents> 160 <assert_contents>
101 <has_text text="CCDS10397" /> 161 <has_text text="CCDS10397" />
102 <has_text text="1e-13" /> 162 <has_text text="1e-13" />
103 </assert_contents> 163 </assert_contents>
104 </output> 164 </output>
105 </test> 165 </test>
106 <test> 166 <test>
107 <param name="cond" value="float(.0000000000001)"/> 167 <param name="cond" value="float(.0000000000001)"/>
108 <param name="input" value="1.bed"/> 168 <param name="input" value="1.bed" ftype="bed" />
109 <param name="round" value="false"/>
110 <param name="avoid_scientific_notation" value="true"/> 169 <param name="avoid_scientific_notation" value="true"/>
111 <output name="out_file1"> 170 <output name="out_file1">
112 <assert_contents> 171 <assert_contents>
113 <has_text text="CCDS10397" /> 172 <has_text text="CCDS10397" />
114 <has_text text=".0000000000001" /> 173 <has_text text=".0000000000001" />
115 <not_has_text text="1e-13" /> 174 <not_has_text text="1e-13" />
116 </assert_contents> 175 </assert_contents>
117 </output> 176 </output>
118 </test> 177 </test>
178 <test>
179 <param name="input" value="1.tab" ftype="tabular" />
180 <repeat name="expressions">
181 <param name="cond" value="c1/10" />
182 <conditional name="add_column">
183 <param name="mode" value="R" />
184 <param name="pos" value="1" />
185 </conditional>
186 </repeat>
187 <repeat name="expressions">
188 <param name="cond" value="round(c1*10)" />
189 <conditional name="add_column">
190 <param name="mode" value="I" />
191 <param name="pos" value="1" />
192 </conditional>
193 </repeat>
194 <output name="out_file1" file="column_maker_out4.tab" />
195 </test>
196 <!-- Test list column type in input -->
197 <test>
198 <param name="input" value="bed12.bed" ftype="bed12" />
199 <!-- get largest blocksize from column 11 of bed12 and use it as
200 new score value -->
201 <param name="cond" value="max(map(int, c11))" />
202 <conditional name="add_column">
203 <param name="mode" value="R" />
204 <param name="pos" value="5" />
205 </conditional>
206 <output name="out_file1" file="bed12_modified.bed" />
207 </test>
208 <!-- Test error handling example from help section -->
209 <test>
210 <param name="input" value="short_line_test.tab" ftype="tabular" />
211 <param name="cond" value="c6" />
212 <conditional name="add_column">
213 <param name="mode" value="R" />
214 <param name="pos" value="6" />
215 </conditional>
216 <param name="fail_on_non_existent_columns" value="false" />
217 <param name="action" value="--non-computable-default" />
218 <param name="default_value" value="." />
219 <output name="out_file1" file="short_line_test_out.tab" />
220 </test>
221 <!-- Test athletes BMI calculation in presence of NA values as in
222 https://training.galaxyproject.org/training-material/topics/introduction/tutorials/data-manipulation-olympics/tutorial.html#exercises-4
223 -->
224 <test>
225 <param name="input" value="olympics.tsv" ftype="tabular" />
226 <param name="header_lines_select" value="yes" />
227 <param name="new_column_name" value="BMI" />
228 <param name="cond" value="int(c8) / (int(c7) * int(c7)) * 10000" />
229 <param name="auto_col_types" value="false" />
230 <param name="action" value="--non-computable-default" />
231 <param name="default_value" value="NA" />
232 <output name="out_file1" file="olympics_bmi_out.tab" />
233 </test>
234 <!-- Test operation used by iwc SARS-CoV-2 consensus building WF that
235 turns a 3-column CHROM POS REF tabular dataset into a 3-column BED
236 dataset. -->
237 <test>
238 <param name="input" value="chrom_pos_ref.tab" ftype="tabular" />
239 <repeat name="expressions">
240 <param name="cond" value="int(c2) - (len(c3) == 1)" />
241 <conditional name="add_column">
242 <param name="mode" value="R" />
243 <param name="pos" value="2" />
244 </conditional>
245 </repeat>
246 <repeat name="expressions">
247 <param name="cond" value="int(c2) + ((len(c3) - 1) or 1)" />
248 <conditional name="add_column">
249 <param name="mode" value="R" />
250 <param name="pos" value="3" />
251 </conditional>
252 </repeat>
253 <output name="out_file1" file="bed_from_chrom_pos_ref.bed" />
254 </test>
255 <!-- Test failure on expression syntax errors -->
256 <test expect_failure="true">
257 <param name="cond" value="c3- = c2"/>
258 <param name="input" value="1.bed" ftype="bed" />
259 <assert_stderr>
260 <has_text text="syntax error during parsing." />
261 </assert_stderr>
262 </test>
263 <!-- Test failure on expression NameErrors -->
264 <test expect_failure="true">
265 <param name="cond" value="floatfloat(c3-c2)"/>
266 <param name="input" value="1.bed" ftype="bed" />
267 <assert_stderr>
268 <has_text text="name 'floatfloat' is not defined" />
269 </assert_stderr>
270 </test>
271 <!-- Test failure on non-existent column ref -->
272 <test expect_failure="true">
273 <param name="cond" value="c7 - c2"/>
274 <param name="input" value="1.bed" ftype="bed" />
275 <assert_stderr>
276 <has_text text="name 'c7' is not defined" />
277 </assert_stderr>
278 </test>
279 <!-- Test failure on non-computable expression -->
280 <test expect_failure="true">
281 <param name="cond" value="c3 / 0"/>
282 <param name="input" value="1.bed" ftype="bed" />
283 <assert_stderr>
284 <has_text text="division by zero" />
285 </assert_stderr>
286 </test>
287 <!-- Test keep-non-computable prevents failure -->
288 <test>
289 <param name="cond" value="c3 / 0"/>
290 <param name="input" value="1.bed" ftype="bed" />
291 <param name="action" value="--keep-non-computable" />
292 <output name="out_file1" file="1.bed" />
293 </test>
119 </tests> 294 </tests>
120 <help><![CDATA[ 295 <help><![CDATA[
121 .. class:: infomark 296 .. class:: infomark
122 297
123 **TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* 298 **TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert*
124 299
125 ----- 300 -----
126 301
127 **What it does** 302 **What it does**
128 303
129 This tool computes an expression for every row of a dataset and appends the result as a new column (field). 304 This tool computes an expression on every row of a dataset and appends or inserts the result as a new column (field).
305
306 Several expressions can be specified and will be applied sequentially to each row.
307
308 **Expression rules**
130 309
131 - Columns are referenced with **c** and a **number**. For example, **c1** refers to the first column of a tab-delimited file 310 - Columns are referenced with **c** and a **number**. For example, **c1** refers to the first column of a tab-delimited file
132 311
133 - **c3-c2** will add a length column to the dataset if **c2** and **c3** are start and end position 312 - The following built-in Python functions are available for use in expressions::
313
314 abs | all | any | ascii | bin | bool | chr | ceil | complex | divmod
315
316 exp | float | floor | format | hex | int | len | list | log | log10
317
317 map | max | min | oct | ord | pow | range | reversed
319
320 round | set | sorted | sqrt | str | sum | type
321
322 - In addition the numpy function ``format_float_positional`` is available to
323 control the formatting of floating point numbers.
324
325 - Expressions can be chained, and the tool will keep track of newly added
326 columns while working through the chain. This means you can reference a column
327 that was created as the result of a previous expression in later ones.
134 328
135 ----- 329 -----
136 330
137 **Example** 331 **Simple examples**
138 332
139 If this is your input:: 333 If this is your input::
140 334
141 chr1 151077881 151077918 2 200 - 335 chr1 151077881 151077918 2 200 -
142 chr1 151081985 151082078 3 500 + 336 chr1 151081985 151082078 3 500 +
143 337
144 computing "c4*c5" will produce:: 338 computing "c4 * c5" will produce::
145
146 chr1 151077881 151077918 2 200 - 400.0
147 chr1 151081985 151082078 3 500 + 1500.0
148
149 if, at the same time, "Round result?" is set to **YES** results will look like this::
150 339
151 chr1 151077881 151077918 2 200 - 400 340 chr1 151077881 151077918 2 200 - 400
152 chr1 151081985 151082078 3 500 + 1500 341 chr1 151081985 151082078 3 500 + 1500
153 342
154 You can also use this tool to evaluate expressions. For example, computing "c3>=c2" for Input will result in the following:: 343 You can also use this tool to evaluate expressions.
344 For example, computing "c3 >= c2" for the input above will result in the following::
155 345
156 chr1 151077881 151077918 2 200 - True 346 chr1 151077881 151077918 2 200 - True
157 chr1 151081985 151082078 3 500 + True 347 chr1 151081985 151082078 3 500 + True
158 348
159 or computing "type(c2)==type('') for Input will return:: 349 Similarly, computing "type(c2) == type(c3)" will return::
160 350
161 chr1 151077881 151077918 2 200 - False 351 chr1 151077881 151077918 2 200 - True
162 chr1 151081985 151082078 3 500 + False 352 chr1 151081985 151082078 3 500 + True
163 353
164 354 -----
165 The following built-in functions are available:: 355
166 356 **Error handling**
167 abs | all | any | bin | bool | chr | ceil | cmp | complex 357
168 358 The tool will always fail on syntax errors and other unrecoverable parsing
169 divmod | exp | float | log | log10 | floor | hex | int | len | long 359 errors in any of your expressions. For other problems, however, it offers
170 360 control over how they should be handled:
171 max | min | oct | ord | pow | range | reversed 361
172 362 1. The default for "Autodetect column types" is "Yes", which means the tool
173 round | sorted | sqrt | str | sum | type | unichr | unicode | 363 will evaluate each column value as the type that Galaxy assumes for the
174 364 column. This default behavior will allow you to write simpler expressions.
365 The arithmetic expression "c4 * c5" from the first simple example,
366 for instance, works only because Galaxy realizes that c4 and c5 are integer
367 columns. Occasionally, this autodetection can cause issues. A common
172 368 such situation is missing values in columns that Galaxy thinks are of
369 numeric type. If you're getting errors like "Failed to convert some of the
370 columns in line #X ...", a solution might be to turn off column type
371 autodetection. The price you will have to pay for doing so is that now you
372 will have to handle type conversions yourself. In the first example you would
373 now have to use the expression: "int(c4) * int(c5)".
374
375 2. By default, if any expression references columns that do not exist when
376 that expression gets computed, the tool will fail, but you can uncheck the
377 "Fail on references to non-existent columns" option. If you do so, the result
378 will depend on your choice for "If an expression cannot be computed for a row"
379 (see 3.)
380
381 3. The default for rows for which an expression fails to compute is, again, to
382 fail the tool run, but you can also choose to:
383
384 - skip the row on output
385
386 This is a simple way to only keep lines conforming to an expected standard.
387 It is also easy to mask problems with your expressions with this option so
388 take a look at the results and try to understand what gets skipped and for
389 what reasons (the stdout of the tool will contain information about both).
390
391 - keep the row unchanged
392
393 This can be a good solution if your input contains special separator lines
394 that don't follow the general tabular format of other lines and you would
395 like to keep those lines.
396
397 - produce an empty column value for the row
398
399 This will use the empty string as a substitute for non-computable items.
400 Unlike the "keep the row unchanged" option, the problematic line will
401 have a column added or changed. This option is a good choice for inputs
402 in which all rows have the same tabular layout where you want to make sure
403 that the same is true for the output, i.e. that all output lines still have
404 the same number of columns.
405
406 - fill in a replacement value
407
408 This option is very similar to the previous one, but lets you control the
409 replacement value.
410
411 **Example**
412
413 In the following input::
414
415 chr1 151077881 151077918 2 200 -
416 chr1 151081985 151082078 3 500 +
417 chr1 151090031 151090938 4 700
418
419 the last line does not have a strand column. This violates the bed file format
420 specification, which says that unknown strand is to be encoded as ``.`` in the
421 strand column.
422
423 You can fix the file with the following tool run:
424
425 **Add expression**: `c6`
426
427 **Mode of the operation**: `Replace`
428
429 **Use new column to replace column number**: `6`
430
431 **Fail on references to non-existent columns**: `No`
432
433 **If an expression cannot be computed for a row**: `Fill in a replacement value`
434
435 **Replacement value**: `.`
175 ]]></help> 436 ]]></help>
176 <citations /> 437 <citations />
177 </tool> 438 </tool>