comparison column_join.xml @ 0:6bb6c0a30c67 draft default tip

Uploaded
author jjohnson
date Tue, 01 Apr 2014 09:30:45 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6bb6c0a30c67
1 <tool id="column_join" name="Column Join" version="1.1.0">
2 <description></description>
<!-- Command line: runs column_join.py with the python interpreter, passing the
     output path, the first two inputs, the hinge column and the selected
     columns as named options. The fill_options_file option is added only when
     the "Fill empty columns" switch is set to fill_empty. Every dataset added
     through the file_chooser repeat is appended as a trailing positional
     argument. -->
3 <command interpreter="python">
4 column_join.py
5 --output=$output
6 --input1=$input1
7 --input2=$input2
8 --hinge=$hinge
9 --columns=$columns
10 #if $fill_empty_columns.fill_empty_columns_switch == "fill_empty":
11 --fill_options_file=$fill_options_file
12 #end if
13 #for $f in $file_chooser:
14 ${f.input}
15 #end for
16 </command>
<!-- Tool inputs: the first file (which also drives the column selectors), the
     hinge and included columns, an optional fill policy for empty columns
     (either one value for every column or a value per chosen column), the
     second file, and any number of additional files. -->
17 <inputs>
18 <param name="input1" type="data" format="tabular" label="Choose the first file for the join" />
19 <param name="hinge" type="data_column" data_ref="input1" multiple="false" numerical="false" label="Use this column and the columns to its left as the 'hinge' (matching data for each join)" help="All columns to the left of the selected column (plus the selected column itself) will be used. Select 2 for pileup" />
20 <param name="columns" type="data_column" data_ref="input1" multiple="true" numerical="false" label="Include these columns" help="Multi-select list - hold the appropriate key while clicking to select multiple columns" />
21 <conditional name="fill_empty_columns">
22 <param name="fill_empty_columns_switch" type="select" label="Fill empty columns">
23 <option value="no_fill" selected="True">No</option>
24 <option value="fill_empty">Yes</option>
25 </param>
26 <when value="no_fill" />
27 <when value="fill_empty">
28 <conditional name="do_fill_empty_columns">
29 <param name="column_fill_type" type="select" label="Fill Columns by">
30 <option value="single_fill_value" selected="True">Single fill value</option>
31 <option value="fill_value_by_column">Values by column</option>
32 </param>
33 <when value="single_fill_value">
34 <param type="text" name="fill_value" label="Fill value" value="." />
35 </when>
36 <when value="fill_value_by_column">
37 <repeat name="column_fill" title="Fill Column">
38 <param name="column_number" label="Column" type="data_column" data_ref="input1" />
39 <param type="text" name="fill_value" label="Fill value" value="." />
40 </repeat>
41 </when>
42 </conditional>
43 </when>
44 </conditional>
45 <param name="input2" type="data" format="tabular" label="Choose the second file for the join" />
46 <repeat name="file_chooser" title="Additional Input">
47 <param name="input" label="Additional input file" type="data" format="tabular" />
48 </repeat>
49 </inputs>
<!-- fill_options_file: a generated file holding the fill options as JSON.
     The options start as an empty dict. When filling is enabled,
     'file1_columns' is set to a list with one entry per column of input1
     (taken from its metadata). Each entry is the single fill value, or an
     empty string in by-column mode; in by-column mode every "Fill Column"
     repeat entry then overwrites the entry for its 1-based column number.
     The resulting dict is written out with json.dumps. -->
50 <configfiles>
51 <configfile name="fill_options_file">&lt;%
52 import json
53 %&gt;
54 #set $__fill_options = {}
55 #if $fill_empty_columns['fill_empty_columns_switch'] == 'fill_empty':
56 #if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'single_fill_value':
57 #set $__start_fill = $fill_empty_columns['do_fill_empty_columns']['fill_value'].value
58 #else:
59 #set $__start_fill = ""
60 #end if
61 #set $__fill_options['file1_columns'] = [ __start_fill for i in range( int( $input1.metadata.columns ) ) ]
62 #if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'fill_value_by_column':
63 #for column_fill in $fill_empty_columns['do_fill_empty_columns']['column_fill']:
64 #set $__fill_options['file1_columns'][ int( column_fill['column_number'].value ) - 1 ] = column_fill['fill_value'].value
65 #end for
66 #end if
67 #end if
68 ${json.dumps( __fill_options )}
69 </configfile>
70 </configfiles>
<!-- A single tabular dataset; its path is passed to the script as $output. -->
71 <outputs>
72 <data name="output" format="tabular" />
73 </outputs>
<!-- Functional tests. Four are active: a pileup join with a single fill value,
     two pileup joins without filling, and a tabular join exercising the
     missing-column handling with a fill value of 0. A fifth test covering
     fill values by column is commented out below because it fails when run
     through the test framework. -->
74 <tests>
75 <test>
76 <param name="input1" value="column_join_in1.pileup" ftype="pileup" />
77 <param name="hinge" value="2" />
78 <param name="columns" value="1,2,3,4,5,7" />
79 <param name="fill_empty_columns_switch" value="fill_empty" />
80 <param name="column_fill_type" value="single_fill_value" />
81 <param name="fill_value" value="?" />
82 <param name="input2" value="column_join_in2.pileup" ftype="pileup" />
83 <param name="input" value="column_join_in3.pileup" ftype="pileup" />
84 <output name="output" file="column_join_out1.pileup" ftype="tabular" />
85 </test>
86 <test>
87 <param name="input1" value="column_join_in4.pileup" ftype="pileup" />
88 <param name="hinge" value="2" />
89 <param name="columns" value="1,2,3,4" />
90 <param name="fill_empty_columns_switch" value="no_fill" />
91 <param name="input2" value="column_join_in5.pileup" ftype="pileup" />
92 <param name="input" value="column_join_in6.pileup" ftype="pileup" />
93 <output name="output" file="column_join_out2.pileup" ftype="tabular" />
94 </test>
95 <!-- This test is failing for an unclear reason (the column values do not get
96 passed into the script), but passes in the browser
97 <test>
98 <param name="input1" value="column_join_in7.pileup" ftype="tabular" />
99 <param name="hinge" value="2" />
100 <param name="columns" value="3,4,5" />
101 <param name="fill_empty_columns_switch" value="fill_empty" />
102 <param name="column_fill_type" value="fill_value_by_column" />
103 <param name="column_number" value="5" />
104 <param name="fill_value" value="X" />
105 <param name="input2" value="column_join_in8.pileup" ftype="tabular" />
106 <param name="input" value="column_join_in9.pileup" ftype="tabular" />
107 <output name="output" file="column_join_out3.pileup" ftype="tabular" />
108 </test>
109 -->
110 <test>
111 <param name="input1" value="column_join_in10.pileup" ftype="pileup" />
112 <param name="hinge" value="1" />
113 <param name="columns" value="2,7" />
114 <param name="fill_empty_columns_switch" value="no_fill" />
115 <param name="input2" value="column_join_in11.pileup" ftype="pileup" />
116 <param name="input" value="column_join_in12.pileup" ftype="pileup" />
117 <output name="output" file="column_join_out4.pileup" ftype="tabular" />
118 </test>
119 <test>
120 <!-- Test for handling missing column -->
121 <param name="input1" value="column_join_in13.tabular" ftype="tabular" />
122 <param name="hinge" value="1" />
123 <param name="columns" value="5" />
124 <param name="fill_empty_columns_switch" value="fill_empty" />
125 <param name="column_fill_type" value="single_fill_value" />
126 <param name="fill_value" value="0" />
127 <param name="input2" value="column_join_in14.tabular" ftype="tabular" />
128 <param name="input" value="column_join_in15.tabular" ftype="tabular" />
129 <output name="output" file="column_join_out5.tabular" ftype="tabular" />
130 </test>
131 </tests>
132 <help>
133 **What it does**
134
135 This tool allows you to join several files with the same column structure into one file, removing certain columns if necessary. The user needs to select a 'hinge', which is the number of left-most columns to match on. They also need to select the columns to include in the join, which should include the hinge columns, too.
136
137 Note that the files are expected to have the same number of columns. If for some reason the join column is missing (this only applies to the last column(s)), the tool attempts to handle this situation by inserting an empty item (or the appropriate filler) for that column on that row. This could lead to the situation where a row has a hinge but entirely empty or filled columns, if the hinge exists in at least one file but every file that has it is missing the join column. Also, note that the tool does not distinguish between a file missing the hinge altogether and a file having the hinge but missing the column (in both cases the column would be empty or filled). There is an example of this below.
138
139 -----
140
141 **General Example**
142
143 Given the following files::
144
145 FILE 1
146 chr2 1 T 6 .C..., I$$III
147 chr2 2 G 6 ..N.., III@II
148 chr2 3 C 7 ..C..., I$IIIII
149 chr2 4 G 7 .G...., I#IIIII
150 chr2 5 G 7 ...N.., IIII#BI
151 chr2 6 A 7 ..T..., I$IDIII
152 chr1 1 C 1 ^:. I
153 chr1 2 G 2 .^:. $I
154 chr1 3 A 2 .. I%
155 chr1 4 C 2 .. I$
156 chr1 5 T 3 ..^:. I#I
157 chr1 6 G 3 ..^:, I#I
158
159 FILE 2
160 chr1 3 T 1 ^:. I
161 chr1 4 G 2 .^:. $I
162 chr1 5 T 2 .. I%
163 chr1 6 C 3 ..^:. III
164 chr1 7 G 3 ..^:. I#I
165 chr1 8 T 4 ...^:, I#II
166 chr2 77 C 6 .G..., I$$III
167 chr2 78 G 6 ..N.., III@II
168 chr2 79 T 7 ..N..., I$IIIII
169 chr2 80 C 7 .G...., I#IIIII
170 chr2 81 G 7 ...A.., IIII#BI
171 chr2 82 A 8 ...G..., I$IDIIII
172 chr2 83 T 8 .A.....N IIIIIIII
173 chr2 84 A 9 ......T. I$IIIIIII
174
175 FILE 3
176 chr1 1 A 1 . I
177 chr1 2 T 2 G. I$
178 chr1 3 C 2 ., I@
179 chr1 4 C 3 ..N III
180 chr1 42 C 5 ...N^:. III@I
181 chr1 43 C 5 .N..^:. IIIII
182 chr1 44 T 5 .A.., IA@II
183 chr1 45 A 6 .N...^:. IIIII$
184 chr1 46 G 6 .GN..^:. I@IIII
185 chr1 47 A 7 ....^:.., IIIII$I
186 chr2 73 T 5 .N.., II$II
187 chr2 74 A 5 ...., IIIII
188 chr2 75 T 5 ...., IIIII
189 chr2 76 T 5 ...., IIIII
190 chr2 77 C 5 ...., IIIBI
191 chr2 78 T 5 ...., IDIII
192
193 To join columns 3 and 4 of each file, matching rows on columns 1 and 2, select columns 1-4 for the 'Include these columns' option and column 2 for the 'hinge'. With these settings, the following would be output::
194
195 chr1 1 C 1 A 1
196 chr1 2 G 2 T 2
197 chr1 3 A 2 T 1 C 2
198 chr1 4 C 2 G 2 C 3
199 chr1 5 T 3 T 2
200 chr1 6 G 3 C 3
201 chr1 7 G 3
202 chr1 8 T 4
203 chr1 42 C 5
204 chr1 43 C 5
205 chr1 44 T 5
206 chr1 45 A 6
207 chr1 46 G 6
208 chr1 47 A 7
209 chr2 1 T 6
210 chr2 2 G 6
211 chr2 3 C 7
212 chr2 4 G 7
213 chr2 5 G 7
214 chr2 6 A 7
215 chr2 73 T 5
216 chr2 74 A 5
217 chr2 75 T 5
218 chr2 76 T 5
219 chr2 77 C 6 C 5
220 chr2 78 G 6 T 5
221 chr2 79 T 7
222 chr2 80 C 7
223 chr2 81 G 7
224 chr2 82 A 8
225 chr2 83 T 8
226 chr2 84 A 9
227
228 **Example with missing columns**
229
230 Given the following input files::
231
232 FILE 1
233 1 A
234 2 B b
235 4 C c
236 5 D
237 6 E e
238
239 FILE 2
240 1 M m
241 2 N
242 3 O o
243 4 P p
244 5 Q
245 7 R r
246
247 If we join only column 3, using column 1 as the hinge and a fill value of '0', this is what will be output::
248
249 1 0 m
250 2 b 0
251 3 0 o
252 4 c p
253 5 0 0
254 6 e 0
255 7 0 r
256
257 Row 5 appears in both files, but is missing the join column in each, so it has nothing but fill values in the output file.
258
259 </help>
260 </tool>