comparison imputation.xml @ 1:2e7d47c0b027 draft

"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
author malex
date Mon, 08 Mar 2021 22:04:06 +0000
parents
children caba07f41453
comparison
equal deleted inserted replaced
0:b54326490b4d 1:2e7d47c0b027
1 <tool id="secimtools_imputation" name="Imputation (Mean, Median, K-Nearest Neighbours, Stochastic)" version="@WRAPPER_VERSION@">
2 <description>of missing values using selected algorithm.</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="requirements" />
7 <stdio> <!-- Exit-code rules: exit statuses of 1 and above (range "1:") are reported at warning level. NOTE(review): both rules cover the same range and level and differ only in their description; confirm whether distinct ranges were intended. -->
8 <exit_code level="warning" range="1:" description="UserWarning" />
9 <exit_code level="warning" range="1:" description="VisibleDeprecationWarning" />
10 </stdio>
11 <command detect_errors="exit_code"><![CDATA[
12 imputation.py
13 --input '$input'
14 --design '$design'
15 --ID '$uniqID'
16 --group '$group'
17 ## Destination dataset for the imputed table.
18 --output '$imputed'
19 ## Strategy settings. Every substituted value is single-quoted so that spaces or shell metacharacters in user-entered text cannot split or alter the command line.
20 --knn '$k'
21 --strategy '$imputation'
22 --row_cutoff '$rowCutoff'
23 --col_cutoff '$colCutoff'
24 --distribution '$distribution'
25 ## The optional flags below are emitted only when the corresponding input is set.
26 #if $noZero
27 --no_zero
28 #end if
29
30 #if $noNeg
31 --no_negative
32 #end if
33
34 #if $exclude
35 --exclude '$exclude'
36 #end if
37 ]]></command>
38 <inputs> <!-- User-facing parameters; each name is referenced as $name in the command template. Each select marks exactly one option (the first) as selected, because a single-choice select can have only one default. NOTE(review): the original marked every option selected; confirm that K-Nearest Neighbors and Poisson are the intended defaults. -->
39 <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/>
40 <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/>
41 <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/>
42 <param name="group" type="text" size="30" label="Group/Treatment" help="Name of the column in your design file that contains group classifications."/>
43 <param name="imputation" type="select" label="Imputation Strategy" help="Choose an imputation strategy.">
44 <option value="knn" selected="true">K-Nearest Neighbors</option>
45 <option value="bayesian">Stochastic</option>
46 <option value="mean">Mean</option>
47 <option value="median">Median</option>
48 </param>
49 <param name="noZero" type="boolean" label="Count Zeroes as missing" help="Zeroes can be counted as missing or left as data."/>
50 <param name="noNeg" type="boolean" label="Count Negative as missing" help="Negatives can be counted as missing or left as data."/>
51 <param name="exclude" type="text" size="30" value="" label="Additional values to treat as missing [Optional]" help="Separate additional values to treat as missing data with commas."/>
52 <param name="rowCutoff" type="text" size="30" value=".5" label="Row Percent Cutoff Value" help="Proportion of missing values allowed per group per row. If the proportion of missing values for each feature is greater than the specified value, then the sample mean is imputed instead of values from the K-Nearest Neighbors algorithm. Default: 0.5 (50%)."/>
53 <param name="k" type="text" size="30" value="5" label="K value" help="Only for K-Nearest Neighbors Imputation, ignore for other imputation methods. K value is the number of neighbors to search. Default: 5. If fewer than 5 neighbours are available, all are used."/>
54 <param name="colCutoff" type="text" size="30" value=".8" label="Column Percent Cutoff Value" help="Only for K-Nearest Neighbors Imputation, ignore for other imputation methods. If the proportion of missing values is greater than the specified value, the imputation stops and the tool returns an error. Default: 0.8 (80%)."/>
55 <param name="distribution" type="select" label="Bayesian Distribution" help="Only for Stochastic Imputation, ignore for other imputation methods. Choose between normal and Poisson distributions.">
56 <option value="Poisson" selected="true">Poisson</option>
57 <option value="Normal">Normal</option>
58 </param>
59 </inputs>
60 <outputs> <!-- Single tabular result; the command template passes it to the script as the output argument via $imputed. -->
61 <data name="imputed" format="tabular" label="${tool.name} on ${on_string}" />
62 </outputs>
63 <tests> <!-- One regression test: the knn strategy on the ST000006 example files, compared against a stored expected output file. -->
64 <test>
65 <param name="input" value="ST000006_data.tsv" />
66 <param name="design" value="ST000006_design.tsv" />
67 <param name="uniqID" value="Retention_Index" />
68 <param name="group" value="White_wine_type_and_source" />
69 <param name="imputation" value="knn" />
70 <output name="imputed" file="ST000006_imputation_output_KNN.tsv" />
71 </test>
72 </tests>
73 <help><![CDATA[
74
75 @TIP_AND_WARNING@
76
77 **Tool Description**
78
79 The tool performs an imputation procedure for missing data based on three conceptually different methods:
80
81 (1) naive imputation (mean, median),
82 (2) K-nearest neighbor imputation (KNN) and
83 (3) stochastic imputation (based on normal and Poisson distributions)
84
85 Imputations are performed separately for each sample group since treatment groups are expected to be different.
86 If only a single sample (column) is available for a given group, nothing is imputed and the sample is kept intact.
87 An option to select which values should be treated as missing is included.
88 By default, an empty cell in the dataset is treated as missing; zeroes, negative values and user-defined value(s) can optionally be treated as missing as well and are then imputed.
89
90 (1) Naive imputation:
91
92 Computes the mean (or median) of the features within the samples for a given group and uses that value to impute values for that feature among the missing samples.
93 Feature values for all missing samples in the group get the same value equal to the mean (median) of the available samples, provided the allowed missing threshold is met.
94
95 (2) K-Nearest Neighbors (KNN) imputation:
96
97 Based on the procedure where nearest neighbor samples (K value default = 5) for the given sample within each group are considered.
98 The neighboring samples are used to generate the missing value for the current samples.
99 If fewer neighbors than the specified K value are available for the current sample in the current group, then the maximum available number of neighbors is used.
100 If the proportion of missing values for each row (feature) is greater than the specified Row Percent Cutoff value (default 0.5), then the column (sample) mean is imputed instead of values from the KNN algorithm.
101 The proportion of missing values for each column (sample) can be specified (Column Percent Cutoff default = 0.8) and determines whether a sample should be imputed or not.
102 If the proportion of missing values for each sample is greater than the specified value, then the missing values are not imputed and the imputation process is interrupted.
103 The algorithm is deterministic and always imputes the same missing values for the same settings.
104 More details on the algorithm are available via the reference and link below:
105
106 Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing value estimation methods for DNA microarrays BIOINFORMATICS Vol. 17 no. 6. 2001 Pages 520-525.
107
108 https://bioconductor.org/packages/release/bioc/html/impute.html
109
110 (3) Stochastic imputation:
111
112 Based on the assumption that each feature within a given group follows some underlying distribution.
113 As a result, all missing values are generated from the underlying distribution.
114 The parameter(s) of the underlying distribution is (are) estimated from the observed features.
115 Two distribution options are available:
116
117 Normal (recommended for logged and negative data) and Poisson (recommended for nonnegative counts).
118 The normal distribution parameters are estimated by the mean and standard deviation of the observed samples for a given feature.
119 If all observed values for a feature are the same, then the standard deviation is assumed to be 1/3 the absolute value of the mean.
120 The Poisson distribution parameter is estimated by the mean of the observed values for a given feature and is expected to be positive for the imputation procedure to work correctly.
121
122 --------------------------------------------------------------------------------
123
124 **Note**
125
126 - This tool currently treats all variables as continuous numeric
127 variables. Running the tool on categorical variables might result in
128 incorrect results.
129 - Rows containing non-numeric (or missing) data in any
130 of the chosen columns will be skipped from the analysis.
131
132 --------------------------------------------------------------------------------
133
134 **Input**
135
136 - Two input datasets are required.
137
138 @WIDE@
139
140 **NOTE:** The sample IDs must match the sample IDs in the Design File (below).
141 Extra columns will automatically be ignored.
142
143 @METADATA@
144
145 @UNIQID@
146
147 @GROUP@
148
149 **Imputation Strategy.**
150
151 - Select an imputation strategy.
152
153 **Count Zeroes as missing.**
154
155 - Zeroes can be treated as missing or left as data.
156
157 **Count Negative as missing.**
158
159 - Negatives can be treated as missing or left as data.
160
161 **Additional values to treat as missing [Optional].**
162
163 - Additional values to treat as missing data, separate with commas.
164
165 **Row Percent Cutoff Value.**
166
167 - Proportion of missing values allowed per group per row. If the proportion of missing samples in the row is greater than the cutoff value specified, nothing will be imputed for that row. Default: 0.5 (50%).
168
169 **K value.**
170
171 - If you are not using the KNN Imputation, then ignore. K value is the number of neighbors to search. Default: 5. If fewer than 5 neighbours are available, all are used.
172
173 **Column Percent Cutoff Value.**
174
175 - If you are not using the KNN Imputation, then ignore. The maximum proportion of missing data allowed in any data column (sample). Default: 0.8 (80%). The imputation will fail if the proportion in the data exceeds this cutoff!
176
177 **Bayesian Distribution.**
178
179 - Choose between Normal and Poisson distributions for stochastic imputation.
180
181 --------------------------------------------------------------------------------
182
183 **Output**
184
185 TSV file containing the same column names as the original Wide Dataset where the values in each cell correspond to either the original values or to values obtained during the imputation procedure.
186
187
188 ]]></help>
189 <expand macro="citations"/>
190 </tool>