comparison mqppep_anova.xml @ 1:08678c931f5d draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 43e7a43b545c24b2dc33d039198551c032aa79be
author galaxyp
date Fri, 28 Oct 2022 18:27:21 +0000
parents dbff53e6f75f
children dda27b9273a8
comparison
equal deleted inserted replaced
0:dbff53e6f75f 1:08678c931f5d
5 profile="21.05" 5 profile="21.05"
6 > 6 >
7 <description>Runs ANOVA and KSEA for phosphopeptides.</description> 7 <description>Runs ANOVA and KSEA for phosphopeptides.</description>
8 <macros> 8 <macros>
9 <import>macros.xml</import> 9 <import>macros.xml</import>
10 <xml name="group_matching_parm">
11 <param name="group_filter_mode" type="select"
12 help="Regular expression matching mode 'fixed', 'perl', or 'grep' with option for case insensitivity. See https://rdrr.io/r/base/grep.html"
13 label="Sample-group matching mode"
14 >
15 <option value="r">ERE ("extended regular expressions")</option>
16 <option value="ri"> - ERE, case insensitive</option>
17 <option value="p" selected="true">PCRE ("PERL-compatible regular expressions")</option>
18 <option value="pi"> - PCRE, case insensitive</option>
19 <option value="f">fixed strings ("no regular expressions")</option>
20 <option value="fi"> - fixed strings, case insensitive</option>
21 </param>
22 <param name="group_filter_patterns" type="text" value=".+"
23 help="Comma-separated list of regular expressions matching group-names"
24 label="Sample-group matching pattern">
25 <sanitizer>
26 <valid initial="string.printable">
27 <remove value="&apos;"/>
28 </valid>
29 </sanitizer>
30 </param>
31 </xml>
10 </macros> 32 </macros>
11 <edam_topics> 33 <edam_topics>
12 <edam_topic>topic_0121</edam_topic><!-- proteomics --> 34 <edam_topic>topic_0121</edam_topic><!-- proteomics -->
13 <edam_topic>topic_3520</edam_topic><!-- proteomics experiment--> 35 <edam_topic>topic_3520</edam_topic><!-- proteomics experiment-->
14 </edam_topics> 36 </edam_topics>
25 <!-- 47 <!--
26 The weird invocation used here is because knitr and install_tinytex 48 The weird invocation used here is because knitr and install_tinytex
27 both need access to a writeable directory, but most directories in a 49 both need access to a writeable directory, but most directories in a
28 biocontainer are read-only, so this builds a pseudo-home under /tmp 50 biocontainer are read-only, so this builds a pseudo-home under /tmp
29 --> 51 -->
52 <required_files>
53 <include path="KSEA_impl_flowchart.pdf" />
54 <include path="kinase_name_uniprot_lut.tabular.bz2" />
55 <include path="kinase_uniprot_description_lut.tabular.bz2" />
56 <include path="kinase_uniprot_description_lut.tabular.bz2" />
57 <include path="mqppep_anova.R" />
58 <include path="mqppep_anova_preamble.tex" />
59 <include path="mqppep_anova_script.Rmd" />
60 <include path="perpage.tex" />
61 </required_files>
30 <command detect_errors="exit_code"><![CDATA[ 62 <command detect_errors="exit_code"><![CDATA[
63 (printenv | sort) &&
31 cp '$__tool_directory__/mqppep_anova_script.Rmd' . && 64 cp '$__tool_directory__/mqppep_anova_script.Rmd' . &&
32 cp '$__tool_directory__/mqppep_anova.R' . && 65 cp '$__tool_directory__/mqppep_anova.R' . &&
66 cp '$__tool_directory__/kinase_name_uniprot_lut.tabular.bz2' . &&
67 cp '$__tool_directory__/kinase_uniprot_description_lut.tabular.bz2' . &&
68 cp '$__tool_directory__/mqppep_anova_preamble.tex' . &&
69 cp '$__tool_directory__/perpage.tex' . &&
70 cp '$__tool_directory__/KSEA_impl_flowchart.pdf' . &&
33 Rscript mqppep_anova.R 71 Rscript mqppep_anova.R
34 --inputFile '$input_file' 72 --inputFile '$input_file'
35 --alphaFile '$alpha_file' 73 --alphaFile '$alpha_file'
36 --preproc_sqlite '$preproc_sqlite' 74 --preproc_sqlite '$preproc_sqlite'
37 --firstDataColumn $intensity_column_regex_f 75 --firstDataColumn '$intensity_column_regex_f'
38 --imputationMethod $imputation.imputation_method 76 --imputationMethod $imputation.imputation_method
39 #if $imputation.imputation_method == "random" 77 #if $imputation.imputation_method == "random"
40 --meanPercentile '$imputation.meanPercentile' 78 --meanPercentile '$imputation.meanPercentile'
41 --sdPercentile '$imputation.sdPercentile' 79 --sdPercentile '$imputation.sdPercentile'
42 #end if 80 #end if
43 --regexSampleNames $sample_names_regex_f 81 --regexSampleNames '$sample_names_regex_f'
44 --regexSampleGrouping $sample_grouping_regex_f 82 --regexSampleGrouping '$sample_grouping_regex_f'
45 --imputedDataFile $imputed_data_file 83 #if $group_filter.group_filter_method == "none"
84 --sampleGroupFilter 'none'
85 #else
86 --sampleGroupFilter '$group_filter.group_filter_method'
87 --sampleGroupFilterPatterns '$group_filter_patterns_f'
88 --sampleGroupFilterMode '$group_filter.group_filter_mode'
89 #end if
90 --intensityMinValuesPerClass '$intnsty_min_vals_per_smpl_grp'
91 --imputedDataFile '$imputed_data_file'
46 --imputedQNLTDataFile '$imp_qn_lt_file' 92 --imputedQNLTDataFile '$imp_qn_lt_file'
47 --ksea_sqlite '$ksea_sqlite' 93 --ksea_sqlite '$ksea_sqlite'
94 --kseaMinSubstrateCount '$ksea_min_substrate_count'
48 --ksea_cutoff_threshold '$ksea_cutoff_threshold' 95 --ksea_cutoff_threshold '$ksea_cutoff_threshold'
49 --ksea_cutoff_statistic 'FDR' 96 --ksea_cutoff_statistic 'FDR'
97 --kseaUseAbsoluteLog2FC '$ksea_use_absolute_log2_fc'
98 --minQuality '$ksea_min_quality'
99 --anova_ksea_metadata '$anova_ksea_metadata'
50 --reportFile '$report_file' 100 --reportFile '$report_file'
51 --anova_ksea_metadata '$anova_ksea_metadata'
52 ]]></command> 101 ]]></command>
102 <!--
103 -->
53 <configfiles> 104 <configfiles>
54 <configfile name="sample_names_regex_f"> 105 <configfile name="sample_names_regex_f">
55 $sample_names_regex 106 $sample_names_regex
56 </configfile> 107 </configfile>
57 <configfile name="sample_grouping_regex_f"> 108 <configfile name="sample_grouping_regex_f">
58 $sample_grouping_regex 109 $sample_grouping_regex
59 </configfile> 110 </configfile>
111 <configfile name="group_filter_patterns_f">
112 #if $group_filter.group_filter_method != "none"
113 $group_filter.group_filter_patterns
114 #end if
115 </configfile>
60 <configfile name="intensity_column_regex_f"> 116 <configfile name="intensity_column_regex_f">
61 $intensity_column_regex 117 $intensity_column_regex
62 </configfile> 118 </configfile>
63 </configfiles> 119 </configfiles>
64 <inputs> 120 <inputs>
65 <param name="input_file" type="data" format="tabular" label="Filtered Phosphopeptide Intensities" 121 <!--
66 help="Phosphopeptide intensities filtered for minimal quality. First column label 'Phosphopeptide'; sample-intensities must begin in column 10 and must have column labels to match argument [sample_names_regex]" 122 needed inputs:
67 /> 123 - # should filters be used to identify sample-groups to be included or excluded
68 <param name="alpha_file" type="data" format="tabular" label="ANOVA alpha cutoff level" 124 sampleGroupFilter: !r c("none", "exclude", "include")[3]
125 - # what patterns should be used to match sample-groups
126 # (extracted by regexSampleGrouping) when determining sample-groups
127 # that should be included or excluded
128 sampleGroupFilterPatterns: ".*CR,N.*"
129 - # minimum number of observed values per class
130 intensityMinPerClass: 0
131 - # what should be the primary criterion to eliminate excessive heatmap rows
132 intensityHeatmapCriteria: !r c("quality", "na_count", "p_value")[1]
133 suggested or advanced inputs:
134 - kinaseNameUprtLutBz2: "./kinase_name_uniprot_lut.tabular.bz2"
135 - kinaseUprtDescLutBz2: "./kinase_uniprot_description_lut.tabular.bz2"
136 -->
137 <param name="input_file" type="data" format="tabular" label="Filtered phosphopeptide intensities (tabular)"
138 help="'preproc_tab' dataset produced by 'MaxQuant Phosphopeptide Preprocessing' tool"
139 />
140 <param name="alpha_file" type="data" format="tabular" label="ANOVA alpha cutoff level (tabular)"
69 help="ANOVA alpha cutoff values for significance testing: tabular data having one column and no header" 141 help="ANOVA alpha cutoff values for significance testing: tabular data having one column and no header"
70 /> 142 />
71 <param name="preproc_sqlite" type="data" format="sqlite" label="preproc_sqlite dataset from mqppep_preproc" 143 <param name="preproc_sqlite" type="data" format="sqlite" label="Database from mqppep_preproc (sqlite)"
72 help="'preproc_sqlite' dataset produced by 'MaxQuant Phosphopeptide Preprocessing' tool" 144 help="'preproc_sqlite' dataset produced by 'MaxQuant Phosphopeptide Preprocessing' tool"
73 /> 145 />
74 <param name="intensity_column_regex" type="text" value="^Intensity[^_]" 146 <param name="intensity_column_regex" type="text" value="^Intensity[^_]"
75 label="Intensity-column pattern" 147 label="Intensity-column pattern"
76 help="Pattern matching columns that have peptide intensity data (PERL-compatible regular expression matching column label)" 148 help="Pattern matching columns that have peptide intensity data (PERL-compatible regular expression matching column label)"
77 /> 149 />
78 <!-- imputation_method <- c("group-median","median","mean","random")[1] --> 150 <!-- imputation_method <- c("group-median","median","mean","random")[1] -->
79 <conditional name="imputation"> 151 <conditional name="imputation">
80 <param name="imputation_method" type="select" label="Imputation method" 152 <param name="imputation_method" type="select" label="Imputation method"
81 help="Impute missing values by (1) using median for each sample-group; (2) using median across all samples; (3) using mean across all samples; or (4) using randomly generated values having same std. dev. as across all samples (with mean specified by [meanPercentile])" 153 help="Impute missing values by (1) using median for each sample-group; (2) using median across all samples; (3) using mean across all samples; or (4) using randomly generated values having same SD as across all samples (with mean specified by 'Mean percentile for random values')"
82 > 154 >
83 <option value="random" selected="true">random</option> 155 <option value="random" selected="true">random</option>
84 <option value="group-median">group-median</option> 156 <option value="group-median">group-median</option>
85 <option value="median">median</option> 157 <option value="median">median</option>
86 <option value="mean">mean</option> 158 <option value="mean">mean</option>
91 <when value="random"> 163 <when value="random">
92 <param name="meanPercentile" type="integer" value="1" min="1" max="99" 164 <param name="meanPercentile" type="integer" value="1" min="1" max="99"
93 label="Mean percentile for random values" 165 label="Mean percentile for random values"
94 help="Percentile center of random values; range [1,99]" 166 help="Percentile center of random values; range [1,99]"
95 /> 167 />
96 <param name="sdPercentile" type="float" value="1.0" 168 <param name="sdPercentile" type="float" value="1"
97 label="Percentile std. dev. for random values" 169 label="Percentile SD for random values"
98 help="Standard deviation adjustment-factor for random values; real number. (1.0 means SD equal to the SD for the entire data set.)" 170 help="Standard deviation adjustment-factor for random values; real number. (1.0 means SD of random values equal to the SD for the entire data set.)"
99 /> 171 />
100 </when> 172 </when>
101 </conditional> 173 </conditional>
102 <param name="sample_names_regex" type="text" value="\.\d+[A-Z]$" 174 <param name="sample_names_regex" type="text" value="\.\d+[A-Z]$"
103 help="Pattern extracting sample-names from names of columns that have peptide intensity data (PERL-compatible regular expression)" 175 help="Pattern extracting sample-names from names of columns of 'Filtered phosphopeptide intensities' that have peptide intensity data (PERL-compatible regular expression)"
104 label="Sample-extraction pattern"> 176 label="Sample-name extraction pattern">
105 <sanitizer> 177 <sanitizer>
106 <valid initial="string.printable"> 178 <valid initial="string.printable">
107 <remove value="&apos;"/> 179 <remove value="&apos;"/>
108 </valid> 180 </valid>
109 </sanitizer> 181 </sanitizer>
110 </param> 182 </param>
111 <param name="sample_grouping_regex" type="text" value="\d+" 183 <param name="sample_grouping_regex" type="text" value="\d+"
112 help="Pattern extracting sample-group from the sample-names that are extracted by 'Sample-extraction pattern' (PERL-compatible regular expression)" 184 help="Pattern extracting sample-group from the extracted sample-names (PERL-compatible regular expression)"
113 label="Group-extraction pattern"> 185 label="Sample-group extraction pattern">
114 <sanitizer> 186 <sanitizer>
115 <valid initial="string.printable"> 187 <valid initial="string.printable">
116 <remove value="&apos;"/> 188 <remove value="&apos;"/>
117 </valid> 189 </valid>
118 </sanitizer> 190 </sanitizer>
119 </param> 191 </param>
192 <param name="intnsty_min_vals_per_smpl_grp" type="integer" value="1" min="0"
193 label="Minimum number of values per sample-group"
194 help="Only consider as comparable those intensities having at least this number of values in each sample-group (range [0,&#8734;])"
195 />
196 <conditional name="group_filter">
197 <param name="group_filter_method" type="select" label="Filter sample-groups"
198 help="What filter should be applied to sample-group names? (1) 'none', no filter; (2) 'include', match is required; (3) 'exclude', match is forbidden."
199 >
200 <option value="none" selected="true">none</option>
201 <option value="include">include</option>
202 <option value="exclude">exclude</option>
203 </param>
204 <when value="none" />
205 <when value="include">
206 <expand macro="group_matching_parm"/>
207 </when>
208 <when value="exclude">
209 <expand macro="group_matching_parm"/>
210 </when>
211 </conditional>
212 <param name="ksea_min_substrate_count" type="integer" value="1" min="1"
213 label="Minimum number of kinase-substrates for KSEA"
214 help="Minimum number of substrates to consider any kinase for KSEA (range [1,&#8734;])"
215 />
120 <param name="ksea_cutoff_threshold" type="float" value="0.05" 216 <param name="ksea_cutoff_threshold" type="float" value="0.05"
121 label="KSEA threshold level" 217 label="KSEA threshold level"
122 help="Maximum FDR to be used to score a kinase enrichment as significant" 218 help="Maximum FDR to be used to score a kinase enrichment as significant; see warning against setting this too low in help text below."
219 />
220 <param name="ksea_use_absolute_log2_fc"
221 type="boolean"
222 label="Use abs(log2(fold-change)) for KSEA"
223 help="Should abs(log2(fold-change)) be used for KSEA? (Checking this may alter (possibly reduce) the number of hits.)"
224 checked="false"
225 truevalue="TRUE"
226 falsevalue="FALSE"
227 />
228 <param name="ksea_min_quality" type="integer" value="0" min="0"
229 label="Minimum quality of substrates for KSEA"
230 help="Minimum 'quality' of substrates to be considered for KSEA (range [0,&#8734;]); higher numbers reduce the number of substrates considered - see help text below."
123 /> 231 />
124 </inputs> 232 </inputs>
125 <outputs> 233 <outputs>
126 <data name="imputed_data_file" format="tabular" label="${input_file.name}.${imputation.imputation_method}-imputed_intensities" ></data> 234 <!-- earlier outputs will appear lower in the history list; therefore, put report at the top -->
127 <data name="imp_qn_lt_file" format="tabular" label="${input_file.name}.${imputation.imputation_method}-imputed_QN_LT_intensities" ></data> 235 <data name="ksea_sqlite" format="sqlite" label="${input_file.name}..${imputation.imputation_method}-imputed_ksea_sqlite" />
128 <data name="anova_ksea_metadata" format="tabular" label="${input_file.name}.${imputation.imputation_method}-anova_ksea_metadata" ></data> 236 <data name="anova_ksea_metadata" format="tabular" label="${input_file.name}.${imputation.imputation_method}-anova_ksea_metadata" />
129 <!-- 237 <data name="imputed_data_file" format="tabular" label="${input_file.name}.${imputation.imputation_method}-imputed_intensities" />
130 <data name="report_file" format="html" label="${input_file.name}.${imputation.imputation_method}-imputed_report (download/unzip to view)" ></data> 238 <data name="imp_qn_lt_file" format="tabular" label="${input_file.name}.${imputation.imputation_method}-imputed_QN_LT_intensities" />
131 --> 239 <data name="report_file" format="pdf" label="${input_file.name}.${imputation.imputation_method}-imputed_report" />
132 <data name="report_file" format="pdf" label="${input_file.name}.${imputation.imputation_method}-imputed_report" ></data>
133 <data name="ksea_sqlite" format="sqlite" label="${input_file.name}..${imputation.imputation_method}-imputed_ksea_sqlite">
134 </data>
135 </outputs> 240 </outputs>
136 <tests> 241 <tests>
137 <test> 242 <test><!-- test #1 -->
138 <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/> 243 <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/>
139 <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/> 244 <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/>
140 <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/> 245 <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/>
141 <param name="intensity_column_regex" value="^Intensity[^_]"/> 246 <param name="intensity_column_regex" value="^Intensity[^_]"/>
142 <param name="imputation_method" value="median"/> 247 <param name="imputation_method" value="median"/>
154 <output name="imp_qn_lt_file"> 259 <output name="imp_qn_lt_file">
155 <assert_contents> 260 <assert_contents>
156 <has_text text="Phosphopeptide" /> 261 <has_text text="Phosphopeptide" />
157 <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> 262 <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
158 <!-- missing missing observed missing observed observed --> 263 <!-- missing missing observed missing observed observed -->
159 <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.962256.*6.908828.*6.814580.*6.865411.*6.908828.*7.088909" /> 264 <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.962256.*6.908828.*6.814580.*6.865411.*6.908828.*7.093748" />
160 265
161 <has_text text="pSQKQEEENPAEETGEEK" /> 266 <has_text text="pSQKQEEENPAEETGEEK" />
162 </assert_contents> 267 </assert_contents>
163 </output> 268 </output>
164 </test> 269 </test>
165 <test> 270 <test><!-- test #2 -->
166 <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/> 271 <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/>
167 <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/> 272 <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/>
168 <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/> 273 <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/>
169 <param name="intensity_column_regex" value="^Intensity[^_]"/> 274 <param name="intensity_column_regex" value="^Intensity[^_]"/>
170 <param name="imputation_method" value="mean"/> 275 <param name="imputation_method" value="mean"/>
276 <!--
277 <param name="meanPercentile" value="1"/>
278 <param name="sdPercentile" value="1"/>
279 -->
171 <param name="sample_names_regex" value="\.\d+[A-Z]$"/> 280 <param name="sample_names_regex" value="\.\d+[A-Z]$"/>
172 <param name="sample_grouping_regex" value="\d+"/> 281 <param name="sample_grouping_regex" value="\d+"/>
282 <param name="intnsty_min_vals_per_smpl_grp" value="1"/>
283 <param name="group_filter_method" value="none"/>
284 <!--
285 <param name="group_filter_mode" value="r"/>
286 <param name="group_filter_patterns" value="\.+"/>
287 -->
288 <param name="ksea_min_substrate_count" value="1"/>
289 <param name="ksea_cutoff_threshold" value="0.5"/>
173 <output name="imputed_data_file"> 290 <output name="imputed_data_file">
174 <assert_contents> 291 <assert_contents>
175 <has_text text="Phosphopeptide" /> 292 <has_text text="Phosphopeptide" />
176 <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> 293 <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
177 <!-- missing missing observd missing observd observd --> 294 <!-- missing missing observd missing observd observd -->
182 <output name="imp_qn_lt_file"> 299 <output name="imp_qn_lt_file">
183 <assert_contents> 300 <assert_contents>
184 <has_text text="Phosphopeptide" /> 301 <has_text text="Phosphopeptide" />
185 <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> 302 <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
186 <!-- missing missing observed missing observed observed --> 303 <!-- missing missing observed missing observed observed -->
187 <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.839850.*6.797424.*6.797424.*6.797424.*6.896609.*7.092451" /> 304 <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.839850.*6.797424.*6.797424.*6.797424.*6.896609.*7.097251" />
188 </assert_contents> 305 </assert_contents>
189 </output> 306 </output>
190 </test> 307 </test>
191 <test> 308 <test><!-- test #3 -->
192 <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/> 309 <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/>
193 <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/> 310 <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/>
194 <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/> 311 <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/>
195 <param name="intensity_column_regex" value="^Intensity[^_]"/> 312 <param name="intensity_column_regex" value="^Intensity[^_]"/>
196 <param name="imputation_method" value="group-median"/> 313 <param name="imputation_method" value="group-median"/>
314 <!--
315 <param name="meanPercentile" value="1"/>
316 <param name="sdPercentile" value="1"/>
317 -->
197 <param name="sample_names_regex" value="\.\d+[A-Z]$"/> 318 <param name="sample_names_regex" value="\.\d+[A-Z]$"/>
198 <param name="sample_grouping_regex" value="\d+"/> 319 <param name="sample_grouping_regex" value="\d+"/>
320 <param name="intnsty_min_vals_per_smpl_grp" value="1"/>
321 <param name="group_filter_method" value="none"/>
322 <!--
323 <param name="group_filter_mode" value="r"/>
324 <param name="group_filter_patterns" value="\.+"/>
325 -->
326 <param name="ksea_min_substrate_count" value="1"/>
327 <param name="ksea_cutoff_threshold" value="0.5"/>
199 <output name="imputed_data_file"> 328 <output name="imputed_data_file">
200 <assert_contents> 329 <assert_contents>
201 <has_text text="Phosphopeptide" /> 330 <has_text text="Phosphopeptide" />
202 <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> 331 <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
203 <!-- missing missing observd missing observd observd --> 332 <!-- missing missing observd missing observd observd -->
212 <!-- missing missing observed missing observed observed --> 341 <!-- missing missing observed missing observed observed -->
213 <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.946112.*6.888985.*6.792137.*6.792137.*6.888985.*7.089555" /> 342 <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.946112.*6.888985.*6.792137.*6.792137.*6.888985.*7.089555" />
214 </assert_contents> 343 </assert_contents>
215 </output> 344 </output>
216 </test> 345 </test>
217 <test> 346 <test><!-- test #4 -->
218 <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/> 347 <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/>
219 <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/> 348 <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/>
220 <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/> 349 <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/>
221 <param name="intensity_column_regex" value="^Intensity[^_]"/> 350 <param name="intensity_column_regex" value="^Intensity[^_]"/>
222 <param name="imputation_method" value="random"/> 351 <param name="imputation_method" value="random"/>
235 </output> 364 </output>
236 <output name="imp_qn_lt_file"> 365 <output name="imp_qn_lt_file">
237 <assert_contents> 366 <assert_contents>
238 <has_text text="Phosphopeptide" /> 367 <has_text text="Phosphopeptide" />
239 <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> 368 <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
240 <has_text text="5.409549" /> <!-- log-transformed value for pTYVDPFTpYEDPNQAVR .1B --> 369 <has_text text="5.522821" /> <!-- log-transformed value for pTYVDPFTpYEDPNQAVR .1B -->
241 <has_text text="6.464714" /> <!-- log-transformed value for pSQKQEEENPAEETGEEK .2A --> 370 <has_text text="6.638251" /> <!-- log-transformed value for pSQKQEEENPAEETGEEK .2A -->
242 </assert_contents> 371 </assert_contents>
243 </output> 372 </output>
244 </test> 373 </test>
245 </tests> 374 </tests>
246 <help><![CDATA[ 375 <help><![CDATA[
247 ==================================================== 376 ====================================================
248 Phopsphoproteomic Enrichment Pipeline ANOVA and KSEA 377 Phopsphoproteomic Enrichment Pipeline ANOVA and KSEA
249 ==================================================== 378 ====================================================
250 379
251 **Input files** 380 **Overview**
252 381 ============
253 ``Filtered Phosphopeptide Intensities`` 382
383 Perform statistical analysis of preprocessed MaxQuant output data collected as described in `[Cheng, 2018] <https://doi.org/10.3791/57996>`_.
384
385 - Extracts sample-group IDs from sample names.
386 - Imputes missing values.
387 - Performs ANOVA analysis for each phosphopeptide.
388 - Performs Kinase-Substrate Enrichment Analysis (KSEA) using the method described by `Casado et al. (2013) <doi:10.1126/scisignal.2003573>`_; see *"Algorithms"* section below.
389
390 **Workflow position**
391 =====================
392
393 Upstream tool
394 The "MaxQuant Phosphopeptide Preprocessing" tool (``mqppep_preproc``) that transforms MaxQuant output for phosphoproteome-enriched samples into a form suitable for statistical analysis.
395
396 **Input datasets**
397 ==================
398
399 ``Filtered phosphopeptide intensities`` (tabular)
254 Phosphopeptides annotated with SwissProt and phosphosite metadata (in tabular format). 400 Phosphopeptides annotated with SwissProt and phosphosite metadata (in tabular format).
255 This is the output from the "Phopsphoproteomic Enrichment Pipeline Merge and Filter" 401 This is the output from the "MaxQuant Phosphopeptide Preprocessing"
256 (``mqppep_mrgflt``) tool. 402 (``mqppep_preproc``) tool.
257 403
258 ``ANOVA alpha cutoff level`` 404 - First column label 'Phosphopeptide'.
405 - Sample-intensities must begin in first column matching 'Intensity-column pattern' and must have column labels to match argument 'Sample-name extraction pattern'.
406
407 ``ANOVA alpha cutoff level`` (tabular)
259 List of alpha cutoff values for significance testing; text file having one column and no header. For example: 408 List of alpha cutoff values for significance testing; text file having one column and no header. For example:
260 409
261 :: 410 ::
262 411
263 0.2 412 0.2
264 0.1 413 0.1
265 0.05 414 0.05
266 415
416 ``Database from mqppep_preproc`` (sqlite)
417 SQLite database produced by the "MaxQuant Phosphopeptide Preprocessing"
418 (``mqppep_preproc``) tool.
419
267 **Input parameters** 420 **Input parameters**
421 ====================
268 422
269 ``Intensity-column pattern`` 423 ``Intensity-column pattern``
270 First column of ``input_file`` having intensity values (integer or PERL-compatible regular expression matching column label). Default: **Intensity** 424 First column of ``Filtered phosphopeptide intensities`` having intensity values (integer or PERL-compatible regular expression matching column label). Default::
425
426 ^Intensity[^_]
271 427
272 ``Imputation method`` 428 ``Imputation method``
273 Impute missing values by: 429 Impute missing values by:
274 430
275 1. ``group-median`` - use median for each sample-group; 431 1. ``group-median`` - use median for each sample-group;
276 2. ``mean`` - use mean across all samples; or 432 2. ``mean`` - use mean across all samples; or
277 3. ``median`` - use median across all samples; 433 3. ``median`` - use median across all samples;
278 4. ``random`` - use randomly generated values where: 434 4. ``random`` - use randomly generated values where:
279 435
280 - ``Mean percentile for random values`` specifies the percentile among non-missing values to be used as mean of random values, and 436 (i) ``Mean percentile for random values`` specifies the percentile among non-missing values to be used as mean of random values, and
281 - ``Percentile std. dev. for random values`` specifies the factor to be multiplied by the standard deviation among the non-missing values (across all samples) to determine the standard deviation of random values. 437 (ii) ``Percentile SD for random values`` specifies the factor to be multiplied by the standard deviation among the non-missing values (across all samples) to determine the standard deviation of random values.
282 438
283 ``Sample-extraction pattern`` 439 ``Sample-name extraction pattern``
284 PERL-compatible regular expression extracting the sample-name from the the name of a column of instensities (from ``input_file``) for one sample. 440 PERL-compatible regular expression extracting the sample-name from the name of a column of intensities (from ``Filtered phosphopeptide intensities``) for one sample.
285 441
286 - For example, ``"\.\d+[A-Z]$"`` applied to ``Intensity.splunge.10A`` would produce ``.10A`` 442 - For example, ``"\.\d+[A-Z]$"`` applied to "``Intensity.splunge.10A``" would produce "``.10A``".
287 - Note that *this is case sensitive* by default. 443 - Note that *this is case sensitive* by default.
288 444
289 ``Group-extraction pattern`` 445 ``Sample-group extraction pattern``
290 PERL-compatible regular expression extracting the sample-grouping from the sample-name that was extracted with ``sample_names_regex`` from a column of intensites (from ``input_file``). 446 PERL-compatible regular expression extracting the sample-grouping from the sample-name (that was in turn extracted with ``Sample-name extraction pattern`` from a column of intensities from ``Filtered phosphopeptide intensities``).
291 447
292 - For example, ``"\d+$"`` applied to ``.10A`` would produce ``10`` 448 - For example, ``"\d+$"`` applied to "``.10A``" would produce "``10``".
293 - Note that *this is case sensitive* by default. 449 - Note that *this is case sensitive* by default.
294 450
451 ``Minimum number of values per sample-group``
452 Sometimes you may wish to filter out the intensities that are poorly represented among some sample groups because they complicate the comparison process. You can use this parameter to specify the minimum number of values in any sample-group (range [0,]]>&#8734;<![CDATA[])
453
454 ``Filter sample-groups``
455 Sometimes you may have spectra that are for treatments that you are not considering for your comparison. You can specify a filter (or not) for sample-group names; if you do, you can specify whether groups that match your criteria should be excluded from the analysis ("forbidden") or included in the analysis ("required").
456
457 ``Sample-group matching mode``
459 The R `base::grep` function that is used here for pattern matching is exhaustively documented at https://rdrr.io/r/base/grep.html. There are two choices you make here. The first is whether to differentiate lowercase and uppercase characters. The second is whether to require exact matches ("fixed" pattern-matching mode) or to use "PERL-compatible regular expressions" ("perl") or "extended regular expressions" ("grep"). See https://rdrr.io/r/base/grep.html for further info.
459
460 ``Sample-group matching pattern``
461 This is a comma-separated list of patterns to match to group-names, according to the ``Sample-group matching mode`` that you have chosen.
462
463 ``Minimum number of kinase-substrates for KSEA``
464 For KSEA, you may decide that you wish to ignore kinases having fewer substrates than some minimum; specify that minimum here (range [1,]]>&#8734;<![CDATA[])
465
295 ``KSEA threshold level`` 466 ``KSEA threshold level``
296 Specifies minimum FDR at which a kinase will be considered to be enriched; the default choice of 0.05 is arbitrary. 467 Specifies minimum FDR at which a kinase will be considered to be enriched; the default choice of ``0.05`` is arbitrary and may exclude kinases that are interesting. The KSEA FDR perhaps should not be treated as conservatively as would be appropriate for hypothesis testing. For example, at an FDR of ``0.05``, for every ``20`` kinases that one discards, ``19`` are likely truly enriched.
468
469 ``Use abs(log2(fold-change)) for KSEA``
470 When TRUE, consider only the magnitude of the differences across the contrast for all of the substrates when aggregating them to assess the enrichment of a given kinase's substrates. When FALSE, also consider the direction. Surprisingly, setting this to TRUE may decrease the number of enriched kinases.
471
472 ``Minimum quality of substrates for KSEA``
473 An arbitrary "quality score" is assigned to each substrate, as described in the PDF report produced by the tool. This score takes into account both FDR-adjusted p-value and the number of missing values for each substrate. Setting the minimum to zero retains all substrates, which may be a large number.
297 474
298 **Outputs** 475 **Outputs**
299 476 ===========
300 ``imputed_intensities (input_file.imputation_method-imputed_intensities)`` 477
301 Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, in tabular format. 478 Report dataset
302 479 *[input file].[imputation method]*-``imputed_report``
303 ``imputed_QN_LT_intensities (input_file.imputation_method-imputed_QN_LT_intensities)`` 480
304 Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, quantile-normalized (**QN**), and log10-transformed (**LT**), in tabular format. 481 Summary report for normalization, imputation, and **ANOVA**, in PDF format.
305 482
306 ``report_file (input_file.imputation_method-imputed_report)`` 483 Imputed intensities
307 Summary report for normalization, imputation, and **ANOVA**, in PDF format. 484 *[input file].[imputation method]*-``imputed_intensities``
308 485
309 ``anova_ksea_metadata (input_file.imputation_method-imputed_anova_ksea_metadata)`` 486 Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, in tabular format.
310 Phosphopeptide metadata including ANOVA significance and KSEA enrichments. 487
311 489 Imputed quantile-normalized log-transformed intensities
312 ``ksea_sqlite (input_file.imputation_method-imputed_ksea_sqlite)`` 489 *[input file].[imputation method]*-``imputed_QN_LT_intensities``
313 SQLite database for ad-hoc report creation. 490
491 Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, quantile-normalized (**QN**), and log10-transformed (**LT**), in tabular format.
492
493 ANOVA KSEA metadata
494 *[input file].[imputation method]*-``imputed_anova_ksea_metadata``
495 Phosphopeptide metadata including ANOVA significance and KSEA enrichments.
496
497 KSEA SQLite database
498 *[input file].[imputation method]*-``imputed_ksea_sqlite``
499 An SQLite database that is usable for *ad hoc* report creation.
314 500
315 **Algorithm** 501 **Algorithm**
316 502 =============
317 The KSEA algorithm used here is as in the KSEAapp package as reported in [Wiredja 2017]. 503
318 The code is adapted from "Danica D. Wiredja (2017). KSEAapp: Kinase-Substrate Enrichment Analysis. R package version 0.99.0." to work with output from the "MaxQuant Phosphopeptide Preprocessing" Galaxy tool. 504 The KSEA algorithm used here is as in the KSEAapp package as reported in `[Wiredja 2017] <https://doi.org/10.1093/bioinformatics/btx415>`_.
505 The code is adapted from `"Danica D. Wiredja (2017). KSEAapp: Kinase-Substrate Enrichment Analysis. R package version 0.99.0." <https://cran.r-project.org/package=KSEAapp>`_ to work with output from the "MaxQuant Phosphopeptide Preprocessing" Galaxy tool and the multiple kinase-substrate databases that the latter tool searches.
319 506
320 **Authors** 507 **Authors**
508 ===========
321 509
322 ``Larry C. Cheng`` 510 ``Larry C. Cheng``
323 (`ORCiD 0000-0002-6922-6433 <https://orcid.org/0000-0002-6922-6433>`_) wrote the original script. 511 (`ORCiD 0000-0002-6922-6433 <https://orcid.org/0000-0002-6922-6433>`_) wrote the original script.
324 512
325 ``Arthur C. Eschenlauer`` 513 ``Arthur C. Eschenlauer``
335 <citations> 523 <citations>
336 <!-- Cheng_2018 "Phosphopeptide Enrichment ..." PMID: 30124664 --> 524 <!-- Cheng_2018 "Phosphopeptide Enrichment ..." PMID: 30124664 -->
337 <citation type="doi">10.3791/57996</citation> 525 <citation type="doi">10.3791/57996</citation>
338 <!-- Wiredja_2017 "The KSEA App ..." PMID: 28655153 --> 526 <!-- Wiredja_2017 "The KSEA App ..." PMID: 28655153 -->
339 <citation type="doi">10.1093/bioinformatics/btx415</citation> 527 <citation type="doi">10.1093/bioinformatics/btx415</citation>
528 <citation type="bibtex">@Manual{,
529 title = {KSEAapp: Kinase-Substrate Enrichment Analysis},
530 author = {Danica D. Wiredja},
531 year = {2017},
532 note = {R package version 0.99.0},
533 }</citation>
340 </citations> 534 </citations>
341 </tool> 535 </tool>