comparison msstats.xml @ 0:80b40b9ab835 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/msstats commit d2341b64b690d975bc6e29c81f7d13b66c0c5b7f"
author galaxyp
date Sat, 25 Jul 2020 13:21:47 -0400
parents
children 3e2606fa85bf
comparison
equal deleted inserted replaced
-1:000000000000 0:80b40b9ab835
1 <tool id="msstats" name="MSstats" version="@VERSION@.0" python_template_version="3.5">
2 <description>statistical relative protein significance analysis in DDA, SRM and DIA Mass Spectrometry</description>
3 <macros>
4 <token name="@VERSION@">3.20.1</token>
5 <xml name="useUniquePeptide">
6 <param name="useUniquePeptide" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" label="remove peptides that are assigned for more than one proteins" help="We assume to use unique peptide for each protein"/>
7 </xml>
8 <xml name="summaryforMultipleRows">
9 <param name="summaryforMultipleRows" type="select" label="Summary for MultipleRows" help="summaryforMultipleRows - when there are multiple measurements for certain feature and certain run, use highest or sum of all">
10 <option value="max" selected="true">max</option>
11 <option value="sum">sum</option>
12 </param>
13 </xml>
14 <xml name="fewMeasurements">
15 <param name="fewMeasurements" type="select" label="Remove the features that have 1 or 2 measurements across runs" help="(fewMeasurements)">
16 <option value="remove" selected="true">remove</option>
17 <option value="keep">keep</option>
18 </param>
19 </xml>
20 <xml name="removeProtein_with1Peptide">
21 <param name="removeProtein_with1Peptide" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false" label="Remove the proteins which have only 1 peptide and charge"/>
22 </xml>
23
24 </macros>
25 <requirements>
26 <requirement type="package" version="@VERSION@">bioconductor-msstats</requirement>
27 </requirements>
28 <command detect_errors="exit_code"><![CDATA[
29 cat '$msstats_script' > '$r_script' &&
30 cat '$msstats_script' &&
31 Rscript '$msstats_script'
32 && cat msstats*.log > '$log'
33 ]]></command> <!-- copies the generated R script to the r_script output, echoes it to stdout, runs it with Rscript, then concatenates msstats*.log into the log output; every dataset path is single-quoted -->
34 <configfiles>
35 <configfile name="msstats_script"><![CDATA[
36 library('MSstats', warn.conflicts = F, quietly = T, verbose = F)
37
38 #if $input.input_src == 'MSstats'
39
40 #if $input.msstats_input.is_of_type('csv')
41 raw <- read.csv("$input.msstats_input")
42 #else
43 raw <- read.table("$input.msstats_input", sep="\t", header=TRUE)
44 #end if
45
46 #elif $input.input_src == 'MaxQuant'
47 \# Read in MaxQuant files
48 mq_evidence <- read.table("$input.evidence", sep="\t", header=TRUE)
49
50 mq_proteinGroups <- read.table("$input.proteinGroups", sep="\t", header=TRUE)
51
52 \# Read in annotation including condition and biological replicates per run.
53 \# Users should make this annotation file. It is not the output from MaxQuant.
54 #if $input.annotation.is_of_type('csv')
55 annot <- read.csv("$input.annotation", header=TRUE)
56 #else
57 annot <- read.table("$input.annotation", sep="\t", header=TRUE)
58 #end if
59
60 raw <- MaxQtoMSstatsFormat(evidence=mq_evidence,
61 proteinGroups=mq_proteinGroups,
62 annotation=annot,
63 proteinID="$input.proteinID",
64 useUniquePeptide=$input.input_options.useUniquePeptide,
65 summaryforMultipleRows=$input.input_options.summaryforMultipleRows,
66 fewMeasurements="$input.input_options.fewMeasurements",
67 removeMpeptides=$input.input_options.removeMpeptides,
68 removeOxidationMpeptides=$input.input_options.removeOxidationMpeptides,
69 removeProtein_with1Peptide=$input.input_options.removeProtein_with1Peptide)
70
71 #elif $input.input_src == 'OpenMS'
72
73 #if $input.evidence.is_of_type('csv')
74 input <- read.csv("$input.evidence", header=TRUE)
75 #else
76 input <- read.table("$input.evidence", sep="\t", header=TRUE)
77 #end if
78 #if $input.annotation.is_of_type('csv')
79 annot <- read.csv("$input.annotation", header=TRUE)
80 #else
81 annot <- read.table("$input.annotation", sep="\t", header=TRUE)
82 #end if
83
84 raw <- OpenMStoMSstatsFormat(input,
85 annotation=annot,
86 useUniquePeptide=$input.input_options.useUniquePeptide,
87 summaryforMultipleRows=$input.input_options.summaryforMultipleRows,
88 fewMeasurements="$input.input_options.fewMeasurements",
89 removeProtein_with1Peptide=$input.input_options.removeProtein_with1Peptide)
90
91 #elif $input.input_src == 'OpenSWATH'
92
93 #if $input.evidence.is_of_type('csv')
94 input <- read.csv("$input.evidence", header=TRUE)
95 #else
96 input <- read.table("$input.evidence", sep="\t", header=TRUE)
97 #end if
98 #if $input.annotation.is_of_type('csv')
99 annot <- read.csv("$input.annotation", header=TRUE)
100 #else
101 annot <- read.table("$input.annotation", sep="\t", header=TRUE)
102 #end if
103
104 raw <- OpenSWATHtoMSstatsFormat(input,
105 annotation=annot,
106 filter_with_mscore=$input.input_options.filter_with_mscore,
107 mscore_cutoff=$input.input_options.mscore_cutoff,
108 useUniquePeptide=$input.input_options.useUniquePeptide,
109 fewMeasurements="$input.input_options.fewMeasurements",
110 removeProtein_with1Feature=$input.input_options.removeProtein_with1Feature,
111 summaryforMultipleRows=$input.input_options.summaryforMultipleRows)
112
113 #end if
114
115 processed_data <- dataProcess(raw,
116 logTrans=$dp_options.logTrans,
117 normalization="$dp_options.norm.normalization",
118 #if $dp_options.norm.normalization == 'globalStandards'
119 nameStandards=c($dp_options.norm.nameStandards),
120 #end if
121 ## address=$dp_options.address,
122 fillIncompleteRows=$dp_options.fillIncompleteRows,
123 featureSubset="$dp_options.features.featureSubset",
124 #if $dp_options.features.featureSubset == 'topN'
125 n_top_feature=$dp_options.features.n_top_feature,
126 #end if
127 #if $dp_options.features.featureSubset == 'highQuality'
128 remove_uninformative_feature_outlier=$dp_options.features.remove_uninformative_feature_outlier,
129 #end if
130 summaryMethod="$dp_options.summarize.summaryMethod",
131 #if $dp_options.summarize.summaryMethod == 'TMP'
132 MBimpute=$dp_options.summarize.MBimpute,
133 remove50missing=$dp_options.summarize.remove50missing,
134 #end if
135 #if $dp_options.summarize.summaryMethod == 'linear'
136 equalFeatureVar=$dp_options.summarize.equalFeatureVar,
137 #end if
138 #if $dp_options.censoredInt == 'NULL'
139 censoredInt=NULL,
140 #else
141 censoredInt="$dp_options.censoredInt",
142 #end if
143 cutoffCensored="$dp_options.cutoffCensored",
144 maxQuantileforCensored=$dp_options.maxQuantileforCensored,
145 clusters=NULL)
146
147 #if 'processed_data' in $selected_outputs
148 write.table(processed_data\$ProcessedData, "ProcessedData.tsv", sep = "\t", quote = F, row.names = F, dec = ".")
149 #end if
150 #if 'runlevel_data' in $selected_outputs
151 write.table(processed_data\$RunlevelData, "RunlevelData.tsv", sep = "\t", quote = F, row.names = F, dec = ".")
152 #end if
153
154 #if 'qcplot' in $selected_outputs
155 dataProcessPlots(data = processed_data, type="QCplot", ylimUp=35,
156 width=5, height=5, address="MSStats_only_")
157 #end if
158
159 #if 'profile_plot' in $selected_outputs or 'profile_wsum_plot' in $selected_outputs
160 dataProcessPlots(data = processed_data, type="ProfilePlot", ylimUp=35, featureName="NA", width=5, height=5, address="MSStats_only_") \# presumably this single call also writes the wSummarization PDF collected by the profile_wsum_plot output (TODO confirm)
161 #end if
162
163 #if 'condition_plot' in $selected_outputs
164 dataProcessPlots(data = processed_data, type="ConditionPlot", width=5, height=5, address="MSStats_only_")
165 #end if
166
167 ## Quantification
168 #if 'quant_sample_matrix' in $selected_outputs
169 sampleQuantMatrix <- quantification(processed_data, type="Sample")
170 write.table(sampleQuantMatrix, "SampleQuantificationMatrix.tsv", sep = "\t", quote = F, row.names = F, dec = ".")
171 #end if
172
173 #if 'quant_sample_long' in $selected_outputs
174 sampleQuantLong <- quantification(processed_data, type="Sample", format="long")
175 write.table(sampleQuantLong, "SampleQuantificationLong.tsv", sep = "\t", quote = F, row.names = F, dec = ".")
176 #end if
177
178 #if 'quant_group_matrix' in $selected_outputs
179 groupQuantMatrix <- quantification(processed_data, type="Group")
180 write.table(groupQuantMatrix, "GroupQuantificationMatrix.tsv", sep = "\t", quote = F, row.names = F, dec = ".")
181 #end if
182
183 #if 'quant_group_long' in $selected_outputs
184 groupQuantLong <- quantification(processed_data, type="Group", format="long")
185 write.table(groupQuantLong, "GroupQuantificationLong.tsv", sep = "\t", quote = F, row.names = F, dec = ".")
186 #end if
187
188 ## Group Comparison
189 #if $group.group_comparison == 'yes'
190 \# Group Comparison
191 #if $group.comparison_matrix.is_of_type('csv')
192 comp_matrix <- read.csv("$group.comparison_matrix", header=TRUE)
193 #else
194 comp_matrix <- read.table("$group.comparison_matrix", sep="\t", header=TRUE)
195 #end if
196
197 ## first columns contains comparison names, use as row name
198 comparison <- comp_matrix[,-1]
199 row.names(comparison) <- as.character(comp_matrix[,1])
200 ## order of conditions has to be the same as they appear in the levels function
201 comparison <- as.matrix(comparison[levels(processed_data\$ProcessedData\$GROUP_ORIGINAL)])
202
203 ## perform group comparison
204 comparisons <- groupComparison(contrast.matrix = comparison, data = processed_data)
205
206 print(comparisons\$fittedmodel)
207 #if 'fittedmodel' in $group.select_outputs
208 capture.output(print(comparisons\$fittedmodel), file="ComparisonFittedModel.txt")
209 #end if
210
211
212 #if 'comparison_result' in $group.select_outputs
213 write.table(comparisons\$ComparisonResult, "ComparisonResult.tsv", sep = "\t", quote = F, row.names = F, dec = ".")
214 #end if
215
216 #if 'model_qc' in $group.select_outputs
217 write.table(comparisons\$ModelQC, "ModelQC.tsv", sep = "\t", quote = F, row.names = F, dec = ".")
218 #end if
219
220 ## TODO: transform fittedmodel to table
221 ##class(DDA2009.comparisons$fittedmodel) # list, probably good to output this somehow
222
223 ## Visualizations:
224
225 #if 'qqplot' in $group.select_outputs
226 \# normal quantile-quantile plots
227 modelBasedQCPlots(data=comparisons, type="QQPlots",
228 width=5, height=5, address="MSStats_group_")
229 #end if
230
231 #if 'residualplot' in $group.select_outputs
232 \# residual plots
233 modelBasedQCPlots(data=comparisons, type="ResidualPlots",
234 width=5, height=5, address="MSStats_group_")
235 #end if
236
237 #if 'volcanoplot' in $group.select_outputs
238 \# volcano plot
239 groupComparisonPlots(data = comparisons\$ComparisonResult, type = 'VolcanoPlot',
240 width=5, height=5, address="MSStats_group_")
241 #end if
242
243 #if 'heatmap' in $group.select_outputs
244 \# heatmap - works only for more than 1 comparison
245 if (nrow(comparison)>1)
246 {
247 groupComparisonPlots(data = comparisons\$ComparisonResult, type = 'Heatmap', address="MSStats_group_")
248 }
249 #end if
250
251 #if 'comparisonplot' in $group.select_outputs
252 \#comparison
253 groupComparisonPlots(data=comparisons\$ComparisonResult, type="ComparisonPlot",
254 width=5, height=5, address="MSStats_group_")
255 #end if
256
257 #end if
258 ]]></configfile>
259 </configfiles>
260 <inputs>
261 <conditional name="input">
262 <param name="input_src" type="select" label="input source">
263 <option value="MSstats">MSstats 10 column format</option>
264 <option value="MaxQuant">MaxQuant</option>
265 <option value="OpenMS">OpenMS</option>
266 <option value="OpenSWATH">OpenSWATH</option>
267 </param> <!-- the option values are the strings the R script template compares against input.input_src to choose the import branch -->
268 <when value="MSstats">
269 <param name="msstats_input" type="data" format="tabular,csv" label="MSstats 10-column input"/>
270 </when>
271 <when value="MaxQuant">
272 <param name="evidence" type="data" format="tabular,csv" label="evidence.txt - feature-level data"/>
273 <param name="annotation" type="data" format="tabular,csv" label="annotation.txt data which includes Raw.file, Condition, BioReplicate, Run, IsotopeLabelType information"/>
274 <param name="proteinGroups" type="data" format="tabular,csv" label="proteinGroups.txt" help="It needs to matching protein group ID. If proteinGroups=NULL, use 'Proteins' column in 'evidence.txt'"/>
275 <param name="proteinID" type="select" label="Select Protein ID in evidence.txt">
276 <option value="Proteins">Protein column</option>
277 <option value="Leading.razor.protein">Leading razor protein column</option>
278 </param>
279 <section name="input_options" title="MaxQtoMSstatsFormat Options" expanded="false">
280 <expand macro="useUniquePeptide"/>
281 <expand macro="summaryforMultipleRows"/>
282 <expand macro="fewMeasurements"/>
283 <param name="removeMpeptides" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" label="Remove the peptides including 'M' sequence"/>
284 <param name="removeOxidationMpeptides" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" label="Remove the peptides including Oxidized 'M' sequence"/>
285 <expand macro="removeProtein_with1Peptide"/>
286 </section>
287 </when>
288 <when value="OpenMS"> <!-- inputs and options passed to OpenMStoMSstatsFormat in the R script template -->
289 <param name="evidence" type="data" format="tabular,csv" label="OpenMS_input"/>
290 <param name="annotation" type="data" format="tabular,csv" label="OpenMS_annotation"/>
291 <section name="input_options" title="OpenMStoMSstatsFormat Options" expanded="false">
292 <expand macro="useUniquePeptide"/>
293 <expand macro="summaryforMultipleRows"/>
294 <expand macro="fewMeasurements"/>
295 <expand macro="removeProtein_with1Peptide"/>
296 </section>
297 </when>
298 <when value="OpenSWATH"> <!-- inputs and options passed to OpenSWATHtoMSstatsFormat in the R script template -->
299 <param name="evidence" type="data" format="tabular,csv" label="OpenSWATH_input"/>
300 <param name="annotation" type="data" format="tabular,csv" label="OpenSWATH_annotation"/>
301 <section name="input_options" title="OpenSWATHtoMSstatsFormat Options" expanded="false">
302 <param name="filter_with_mscore" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" label="Filter out features with an m_score greater than mscore_cutoff"/>
303 <param name="mscore_cutoff" type="float" value="0.01" min="0" max="1.0" label="mscore_cutoff"/>
304 <expand macro="useUniquePeptide"/>
305 <expand macro="fewMeasurements"/>
306 <expand macro="summaryforMultipleRows"/>
307 <param name="removeProtein_with1Feature" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false" label="Remove the proteins which have only 1 peptide and charge"/>
308 </section>
309 </when>
310 </conditional>
311 <section name="dp_options" title="dataProcess Options" expanded="false">
312 <param name="logTrans" type="select" label="Log-transform Variable ABUNDANCE with base:" help="(logTrans)">
313 <option value="2" selected="true">2</option>
314 <option value="10">10</option>
315 </param>
316 <conditional name="norm">
317 <param name="normalization" type="select" label="Normalization to remove systematic bias between MS runs">
318 <option value="equalizeMedians" selected="true">equalizeMedians - represents constant normalization</option>
319 <option value="quantile">quantile - quantile normalization</option>
320 <option value="globalStandards">globalStandards - normalization with global standards proteins</option>
321 <option value="FALSE">no normalization is performed</option>
322 </param>
323 <when value="equalizeMedians"/>
324 <when value="quantile"/>
325 <when value="globalStandards">
326 <param name="nameStandards" type="text" value="" label="global standard peptide names">
327 <help>peptide names should be double-quoted and separated by commas</help>
328 <validator type="empty_field" />
329 <validator type="regex" message="double-quoted names separated by commas"><![CDATA[^"[^"\\]+"(\s*,\s*"[^"\\]+")*$]]></validator>
330 </param> <!-- the value is interpolated verbatim into c(...) in the R script template, so every name must be a closed double-quoted string without embedded quotes or backslashes -->
331 </when>
332 <when value="FALSE"/>
333 </conditional>
334 <param name="fillIncompleteRows" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" label="Fill Incomplete Rows" help=" If the input dataset has incomplete rows, TRUE (default) adds the rows with intensity value=NA for missing peaks. FALSE reports error message with list of features which have incomplete rows"/>
335 <conditional name="features">
336 <param name="featureSubset" type="select" label="Features to use">
337 <option value="all" selected="true">Use all features that the data set has</option>
338 <option value="top3">Use the top 3 features which have highest average of log2(intensity) across runs</option>
339 <option value="topN">Use the top N features which have highest average of log2(intensity) across runs</option>
340 <option value="highQuality">Flag uninformative feature and outliers</option>
341 </param>
342 <when value="all"/>
343 <when value="top3"/>
344 <when value="topN">
345 <param name="n_top_feature" type="integer" value="3" min="1" label="The number of top features for featureSubset"/>
346 </when>
347 <when value="highQuality">
348 <param name="remove_uninformative_feature_outlier" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false" label="Remove features flagged with Uninformative feature_quality"/>
349 </when>
350 </conditional>
351 <conditional name="summarize"> <!-- chooses the dataProcess summaryMethod; exactly one option is pre-selected (TMP) -->
352 <param name="summaryMethod" type="select" label="Summary Method">
353 <option value="TMP" selected="true">TMP - Tukey's median polish</option>
354 <option value="linear" selected="false">linear - linear mixed model</option>
355 </param>
356 <when value="TMP">
357 <param name="MBimpute" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" label="Impute Missing Values 'NA' or '0' (depending on censoredInt option) by Accelerated failure model" help="(MBimpute) TRUE - inserts 'NA' or '0' (depending on censoredInt option). FALSE uses the values assigned by cutoffCensored"/>
358 <param name="remove50missing" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false" label="Remove runs which have more than 50% missing values"/>
359 </when>
360 <when value="linear">
361 <param name="equalFeatureVar" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false" label="Account for heterogeneous variation among intensities from different features" help="(equalFeatureVar) TRUE assumes equal variance among intensities from features. FALSE means that we cannot assume equal variance among intensities from features, then we will account for heterogeneous variation from different features"/>
362 </when>
363 </conditional>
364 <param name="censoredInt" type="select" label="Missing values to censor">
365 <help>The output from Skyline and Progenesis should use '0'</help>
366 <option value="NA" selected="true">Assume that all 'NA's in 'Intensity' column are censored</option>
367 <option value="0">Use zero intensities '0' as censored intensity</option>
368 <option value="NULL">Assume all NA intensities are randomly missing</option>
369 </param> <!-- the value NULL is emitted unquoted as censoredInt=NULL by the R script template; the other values are emitted as quoted strings -->
370 <param name="cutoffCensored" type="select" label="Cutoff value for censoring">
371 <option value="minFeature" selected="true">minimum value for each feature</option>
372 <option value="minRun">minimum value for each run</option>
373 <option value="minFeatureNRun">smallest between minimum value of corresponding feature and minimum value of corresponding run</option>
374 </param>
375 <param name="maxQuantileforCensored" type="float" value="0.999" min="0.75" max="1.0" label="Maximum quantile for deciding censored missing values"/>
376 </section>
377 <param name="selected_outputs" type="select" multiple="true" optional="false" label="Select outputs">
378 <option value="log" selected="true">MSstats log</option>
379 <option value="r_script" selected="false">MSstats Rscript</option>
380 <option value="processed_data" selected="true">MSstats ProcessedData</option>
381 <option value="runlevel_data" selected="false">MSstats RunlevelData</option>
382 <option value="qcplot" selected="true">MSstats QCPlot.pdf</option>
383 <option value="profile_plot" selected="false">MSstats ProfilePlot.pdf</option>
384 <option value="profile_wsum_plot" selected="false">MSstats ProfilePlot_wSummarization.pdf</option>
385 <option value="condition_plot" selected="false">MSstats ConditionPlot.pdf</option>
386 <option value="quant_sample_matrix" selected="false">Sample Quantification Matrix Table</option>
387 <option value="quant_sample_long" selected="false">Sample Quantification Long Table</option>
388 <option value="quant_group_matrix" selected="true">Group Quantification Matrix Table</option>
389 <option value="quant_group_long" selected="false">Group Quantification Long Table</option>
390 </param>
391
392 <conditional name="group">
393 <param name="group_comparison" type="select" label="Compare Groups">
394 <option value="no">No</option>
395 <option value="yes">Yes</option>
396 </param>
397 <when value="no"/>
398 <when value="yes">
399 <param name="comparison_matrix" type="data" format="tabular,csv" label="Comparison Matrix"/>
400 <param name="select_outputs" type="select" multiple="true" label="Select outputs">
401 <help>Heatmap requires more than one comparison</help>
402 <option value="fittedmodel" selected="true">MSstats ComparisonFittedModel.txt</option>
403 <option value="comparison_result" selected="true">MSstats ComparisonResult.tsv</option>
404 <option value="model_qc" selected="false">MSstats ModelQC.tsv</option>
405 <option value="qqplot" selected="false">MSstats QQPlot.pdf</option>
406 <option value="residualplot" selected="false">MSstats ResidualPlot.pdf</option>
407 <option value="volcanoplot" selected="true">MSstats VolcanoPlot.pdf</option>
408 <option value="heatmap" selected="false">MSstats Heatmap.pdf</option>
409 <option value="comparisonplot" selected="true">MSstats ComparisonPlot.pdf</option>
410 </param>
411 </when>
412 </conditional>
413 </inputs>
414
415 <outputs>
416 <data name="log" format="txt" label="MSstats log">
417 <filter>'log' in selected_outputs</filter>
418 </data>
419 <data name="r_script" format="txt" label="MSstats Rscript">
420 <filter>'r_script' in selected_outputs</filter>
421 </data>
422 <data name="processed_data" format="tabular" label="MSstats ProcessedData" from_work_dir="ProcessedData.tsv">
423 <filter>'processed_data' in selected_outputs</filter>
424 </data>
425 <data name="runlevel_data" format="tabular" label="MSstats RunlevelData" from_work_dir="RunlevelData.tsv">
426 <filter>'runlevel_data' in selected_outputs</filter>
427 </data>
428 <data name="qcplot" format="pdf" label="MSstats QCPlot.pdf" from_work_dir="MSStats_only_QCPlot.pdf">
429 <filter>'qcplot' in selected_outputs</filter>
430 </data>
431 <data name="profile_plot" format="pdf" label="MSstats ProfilePlot.pdf" from_work_dir="MSStats_only_ProfilePlot.pdf">
432 <filter>'profile_plot' in selected_outputs</filter>
433 </data>
434 <data name="profile_wsum_plot" format="pdf" label="MSstats ProfilePlot_wSummarization.pdf" from_work_dir="MSStats_only_ProfilePlot_wSummarization.pdf">
435 <filter>'profile_wsum_plot' in selected_outputs</filter>
436 </data>
437 <data name="condition_plot" format="pdf" label="MSstats ConditionPlot.pdf" from_work_dir="MSStats_only_ConditionPlot.pdf">
438 <filter>'condition_plot' in selected_outputs</filter>
439 </data>
440 <data name="quant_sample_matrix" format="tabular" label="MSstats SampleQuantificationMatrix.tsv" from_work_dir="SampleQuantificationMatrix.tsv">
441 <filter>'quant_sample_matrix' in selected_outputs</filter>
442 </data>
443 <data name="quant_sample_long" format="tabular" label="MSstats SampleQuantificationLong.tsv" from_work_dir="SampleQuantificationLong.tsv">
444 <filter>'quant_sample_long' in selected_outputs</filter>
445 </data>
446 <data name="quant_group_matrix" format="tabular" label="MSstats GroupQuantificationMatrix.tsv" from_work_dir="GroupQuantificationMatrix.tsv">
447 <filter>'quant_group_matrix' in selected_outputs</filter>
448 </data>
449 <data name="quant_group_long" format="tabular" label="MSstats GroupQuantificationLong.tsv" from_work_dir="GroupQuantificationLong.tsv">
450 <filter>'quant_group_long' in selected_outputs</filter>
451 </data>
452 <data name="comparison_result" format="tabular" label="MSstats ComparisonResult.tsv" from_work_dir="ComparisonResult.tsv">
453 <filter> group['group_comparison'] == 'yes' and 'comparison_result' in group['select_outputs']</filter>
454 </data>
455 <data name="fittedmodel" format="txt" label="MSstats ComparisonFittedModel.txt" from_work_dir="ComparisonFittedModel.txt">
456 <filter> group['group_comparison'] == 'yes' and 'fittedmodel' in group['select_outputs']</filter>
457 </data>
458 <data name="model_qc" format="tabular" label="MSstats ModelQC.tsv" from_work_dir="ModelQC.tsv">
459 <filter> group['group_comparison'] == 'yes' and 'model_qc' in group['select_outputs']</filter>
460 </data>
461 <data name="qqplot" format="pdf" label="MSstats QQPlot.pdf" from_work_dir="MSStats_group_QQPlot.pdf"> <!-- label matches the qqplot entry of the group select_outputs list and the collected file name -->
462 <filter> group['group_comparison'] == 'yes' and 'qqplot' in group['select_outputs']</filter>
463 </data>
464 <data name="residualplot" format="pdf" label="MSstats ResidualPlot.pdf" from_work_dir="MSStats_group_ResidualPlot.pdf">
465 <filter> group['group_comparison'] == 'yes' and 'residualplot' in group['select_outputs']</filter>
466 </data>
467 <data name="volcanoplot" format="pdf" label="MSstats VolcanoPlot.pdf" from_work_dir="MSStats_group_VolcanoPlot.pdf">
468 <filter> group['group_comparison'] == 'yes' and 'volcanoplot' in group['select_outputs']</filter>
469 </data>
470 <data name="heatmap" format="pdf" label="MSstats Heatmap.pdf" from_work_dir="MSStats_group_Heatmap.pdf">
471 <filter> group['group_comparison'] == 'yes' and 'heatmap' in group['select_outputs']</filter>
472 </data>
473 <data name="comparisonplot" format="pdf" label="MSstats ComparisonPlot.pdf" from_work_dir="MSStats_group_ComparisonPlot.pdf">
474 <filter> group['group_comparison'] == 'yes' and 'comparisonplot' in group['select_outputs']</filter>
475 </data>
476 <!--
477 Tabular file (from groupcomparison): "fittedmodel"
478 -->
479 </outputs>
480 <tests>
481
482 <test>
483 <conditional name="input">
484 <param name="input_src" value="MSstats"/>
485 <param name="msstats_input" ftype="csv" value="msstats_testfile.txt"/>
486 </conditional>
487 <param name="selected_outputs" value="processed_data,profile_plot,profile_wsum_plot,quant_sample_matrix,quant_group_long"/>
488 <output name="processed_data">
489 <assert_contents>
490 <has_text text="D.GPLTGTYR" />
491 <has_n_columns n="16" />
492 <has_n_lines n="2071" />
493 </assert_contents>
494 </output>
495 <output name="quant_sample_matrix">
496 <assert_contents>
497 <has_text text="C2_1" />
498 <has_n_columns n="7" />
499 <has_n_lines n="7" />
500 </assert_contents>
501 </output>
502 <output name="quant_group_long">
503 <assert_contents>
504 <has_text text="LogIntensity" />
505 <has_n_columns n="3" />
506 <has_n_lines n="37" />
507 </assert_contents>
508 </output>
509 <output name="profile_plot" file="MSstats ProfilePlot.pdf" compare="sim_size"/>
510 <output name="profile_wsum_plot" file="profile_wsum_plot.pdf" compare="sim_size"/>
511 </test>
512
513 <test>
514 <conditional name="input">
515 <param name="input_src" value="MSstats"/>
516 <param name="msstats_input" ftype="tabular" value="msstats_testfile.tsv"/>
517 </conditional>
518 <conditional name="group">
519 <param name="group_comparison" value="yes"/>
520 <param name="comparison_matrix" ftype="csv" value="comparison_matrix.csv"/>
521 </conditional>
522 <param name="select_outputs" value="residualplot,model_qc"/>
523 <output name="processed_data">
524 <assert_contents>
525 <has_text text="D.GPLTGTYR" />
526 <has_n_columns n="16" />
527 <has_n_lines n="2071" />
528 </assert_contents>
529 </output>
530 <output name="model_qc">
531 <assert_contents>
532 <has_text text="MissingPercentage" />
533 <has_n_columns n="15" />
534 <has_n_lines n="108" />
535 </assert_contents>
536 </output>
537 <output name="residualplot" file="residual_plot.pdf" compare="sim_size"/>
538 </test>
539
540 <test>
541 <conditional name="input">
542 <param name="input_src" value="MaxQuant"/>
543 <param name="evidence" ftype="tabular" value="test_MQ_evidence.tabular"/>
544 <param name="annotation" ftype="tabular" value="test_MQ_annotation.txt"/>
545 <param name="proteinGroups" ftype="tabular" value="test_MQ_proteingroups.tabular"/>
546 </conditional>
547 <param name="selected_outputs" value="condition_plot,processed_data,runlevel_data"/>
548 <conditional name="group">
549 <param name="group_comparison" value="yes"/>
550 <param name="comparison_matrix" ftype="csv" value="test_MQ_group12_comparison_matrix.csv"/>
551 </conditional>
552 <param name="select_outputs" value="qqplot,comparison_result"/>
553 <output name="processed_data">
554 <assert_contents>
555 <has_text text="SPILVATAVAAR" />
556 <has_n_columns n="16" />
557 <has_n_lines n="57" />
558 </assert_contents>
559 </output>
560 <output name="runlevel_data">
561 <assert_contents>
562 <has_text text="qx017084.raw.thermo" />
563 <has_n_columns n="13" />
564 <has_n_lines n="13" />
565 </assert_contents>
566 </output>
567 <output name="comparison_result">
568 <assert_contents>
569 <has_text text="r2-r1" />
570 <has_n_columns n="11" />
571 <has_n_lines n="4" />
572 </assert_contents>
573 </output>
574 <output name="condition_plot" file="condition_plot.pdf" compare="sim_size"/>
575 <output name="qqplot" file="qq_plot.pdf" compare="sim_size"/>
576 </test>
577
578
579
580
581 <!--
582 <test>
583 <conditional name="input">
584 <param name="input_src" value="OpenMS"/>
585 <param name="evidence" ftype="tabular" value=""/>
586 <param name="annotation" ftype="tabular" value=""/>
587 </conditional>
588 <output name="processed_data">
589 <assert_contents>
590 <has_text text="D.GPLTGTYR" />
591 </assert_contents>
592 </output>
593 </test>
594 -->
595
596 <test>
597 <conditional name="input">
598 <param name="input_src" value="OpenSWATH"/>
599 <param name="evidence" ftype="tabular" value="test_swath_input_data.tabular"/>
600 <param name="annotation" ftype="tabular" value="test_swath_annotations.tabular"/>
601 </conditional>
602 <output name="processed_data">
603 <assert_contents>
604 <has_text text="GETLGLIGFGR" />
605 <has_n_columns n="16" />
606 <has_n_lines n="253" />
607 </assert_contents>
608 </output>
609 <output name="qcplot" file="QC_plot.pdf" compare="sim_size"/>
610 </test>
611
612 <test>
613 <conditional name="input">
614 <param name="input_src" value="OpenSWATH"/>
615 <param name="evidence" ftype="tabular" value="test_swath_input_data.tabular"/>
616 <param name="annotation" ftype="tabular" value="test_swath_annotations.tabular"/>
617 </conditional>
618 <param name="selected_outputs" value="r_script,processed_data,quant_sample_long"/>
619 <conditional name="group">
620 <param name="group_comparison" value="yes"/>
621 <param name="comparison_matrix" ftype="csv" value="test_swath_group12_comparison_matrix.csv"/>
622 </conditional>
623 <param name="select_outputs" value="comparison_result,volcanoplot,residualplot"/>
624 <output name="processed_data">
625 <assert_contents>
626 <has_text text="GETLGLIGFGR" />
627 <has_n_columns n="16" />
628 <has_n_lines n="253" />
629 </assert_contents>
630 </output>
631 <output name="quant_sample_long">
632 <assert_contents>
633 <has_text text="NPT_96" />
634 <has_n_columns n="3" />
635 <has_n_lines n="31" />
636 </assert_contents>
637 </output>
638 <output name="comparison_result">
639 <assert_contents>
640 <has_text text="Q5VYK3" />
641 <has_n_columns n="11" />
642 <has_n_lines n="6" />
643 </assert_contents>
644 </output>
645 <output name="volcanoplot" file="volcanoplot.pdf" compare="sim_size"/>
646 <output name="residualplot" file="residualplot.pdf" compare="sim_size"/>
647 </test>
648
649 </tests>
650 <help><![CDATA[
651 MSstats is an open-source R package for statistical relative quantification of proteins and peptides in global, targeted and data-independent proteomics. `More information on MSstats <http://msstats.org/>`_
652
653 The MSstats Galaxy tool (version @VERSION@) allows the detection of differentially abundant proteins for label-free MS experiments with complex designs on data derived from open-source proteomics software available in Galaxy (e.g. MaxQuant, OpenMS, OpenSWATH). Processing functionalities such as log transformation, normalization, feature selection, missing value imputation and quantification are available as well.
654
655 -----
656
657 **Input data**
658
659 - Data in tabular or csv format, generated by spectral processing tools such as `MaxQuant <http://coxdocs.org/doku.php?id=maxquant:start/>`_ or `OpenSWATH <http://openswath.org/en/latest/>`_, will be automatically converted to the 10-column MSstats format
660
661 - MaxQuant format: evidence.txt, proteinGroups.txt
662 - OpenSWATH format: pyprophet export file
663 - MSstats format: tabular file with 10 columns, either manually curated or obtained from other sources such as the swath2stats tool, which is implemented in Pyprophet export in Galaxy. For manual curation: the header names are fixed but not case sensitive:
664
665 - ProteinName: protein ID or peptide ID for peptide-level modeling and analysis; statistical analysis will be done separately for each unique label in this column
666 - PeptideSequence: Amino acid sequence for each peptide. If the peptide sequences should be distinguished based on post-translational modifications, this column can be renamed to PeptideModifiedSequence.
667 - PrecursorCharge: charge state of precursor.
668 - FragmentIon: e.g. b4, y3, if unknown use a single value for all entries.
669 - ProductCharge: charge state of product. If unknown use 0 for all entries.
670 - IsotopeLabelType: This column indicates whether this measurement is based on the endogenous peptides (use “L”) or labeled reference peptides (use “H”).
671 - Condition: For group comparison experiments, this column indicates groups of interest (such as “Disease” or “Control”). For time-course experiments, this column indicates time points (such as “T1”, “T2”, etc). If the experimental design contains both distinct groups of subjects and multiple time points per subject, this column should indicate a combination of these values (such as “Disease_T1”, “Disease_T2”, “Control_T1”, “Control_T2”, etc.).
672 - BioReplicate: This column should contain a unique identifier for each biological replicate in the experiment. For example, in a clinical proteomic investigation this should be a unique patient id. Patients from distinct groups should have distinct ids. MSstats does not require the presence of technical replicates in the experiment. If technical replicates are present, all samples or runs from the same biological replicate should have the same id. MSstats automatically detects the presence of technical replicates and accounts for them in the model-based analysis.
673 - Run: This column contains the identifier of a mass spectrometry run. Each mass spectrometry run should have a unique identifier, regardless of the origin of the biological sample. In SRM experiments, if all the transitions of a biological or a technical replicate are split into multiple “methods” due to the technical limitations, each method should have a separate identifier. When processed by Skyline, distinct values of runs correspond to distinct input file names. It is possible to use the actual input file names as values in the column Run.
674 - Intensity: This column should contain the quantified signal of a feature in a run without any transformation (in particular, no logarithm transform). The signals can be quantified as the peak height or the peak area (area under the curve). Any other quantitative representation of abundance can also be used.
675 - Example file header:
676 ::
677
678 proteinname peptidesequence precursorcharge fragmention productcharge
679 P02768 DLGEENFK 3 y7 0
680 P02768 DLGEENFK 3 y8 0
681 P02768 ETYGEMADCCAK 2 b3 0
682 P02768 ETYGEMADCCAK 2 b4 0
683 ... ... ... ... ...
684
685 isotopelabeltype condition bioreplicate run intensity
686 L 1 ReplA 1 4298.12
687 H 1 ReplA 1 1974.59
688 L 1 ReplA 1 7183.22
689 H 1 ReplA 1 8467.58
690 ... ... ... ... ...
691
692
693 - Annotations as tabular file are needed for all input options except MSstats format
694
695 - 4 columns: Filename, Condition, Bioreplicate, Run; additional 5th column only for MaxQuant: Isotopelabeltype
696
697 - Filename: the file name has to be exactly as it appears in the other input files (e.g. S1207.raw.thermo; in/AA12_mzML.mzML)
698 - all other columns: see description above for MSstats format columns
699
700 - Comparison matrix as tabular file
701
702 - 1st column: name of comparison
703 - additionally one column for each condition that is present in the tabular file. Use 1 and -1 to indicate the conditions to compare and 0 for conditions that are not compared. Multiple groups can be combined by using 0.5.
704 - first row contains the names of the groups, they must exactly match the condition name used in the annotation file
705 - each additional row represents one comparison
706 - Example for a two group comparison
707
708 ::
709
710 names groupA groupB
711 groupA-groupB 1 -1
712
713
714 - Example for an experiment with 5 groups and 4 different comparisons
715
716 ::
717
718 names G1 G2 G3 G4 G5
719 G2-G1 -1 1 0 0 0
720 G4-G5 0 0 0 1 -1
721 G3-G5 0 0 -1 0 1
722 G1+G2-G5 0.5 0.5 0 0 -1
723
724 **Options**
725
726 - data conversion from MaxQuant and OpenSWATH to MSstats format:
727
728 - MaxQuant input: proteins marked with '+' in Contaminant, Reverse or Only.identified.by.site are automatically removed during conversion
729
730 - data processing options:
731
732 - MaxQuant input: contaminants, reverse hits and proteins only identified by site (as flagged by MaxQuant) are automatically removed
733 - log transformation
734 - normalization of MS runs
735 - Feature selection
736 - Missing value imputation:
737
738 - MaxQuant input: All missing values are NA, censoredInt must be 'NA'
739 - OpenSWATH input: censoredInt must be '0'
740 - Summary method: TMP + censoredInt = NULL: It assumes that all intensities are missing at random, therefore no action with MBimpute = FALSE or error with MBimpute = TRUE
741 - censoredInt = 'NA' or '0' & MBimpute = TRUE: AFT model-based imputation using the cutoffCensored value in the AFT model
742 - censoredInt = 'NA' or '0' & MBimpute = FALSE: censored intensities (here NA’s) will be replaced with the value specified in cutoffCensored.
743 - Summarizing intensities per MS run
744 - group comparison: automatic detection of differentially abundant proteins between two conditions, conditions have to be specified with the 'comparison matrix'
745 - quantification per sample or group
746
747 - sample: relative protein abundance in each biological replicate. If there are technical replicates for biological replicates, sample quantification will be the median among technical replicates. If there is no technical replicate for biological replicate (sample), sample quantification will be the same as run-level summarization.
748 - group: relative protein abundance in each condition, summarized over the biological replicates (median among sample quantification). In presence of completely missing values in a condition, the estimates will be zero
749
750 **Output options**
751
752 - Different outputs available. Especially for studies with many proteins, it is suggested to select only the necessary pdf outputs as many of them generate one plot per protein.
753
754 - MSstats log - check log file for warnings and information on the analysis steps (txt)
755 - r-script - can be used to re-run analysis outside Galaxy (txt)
756 - processed_data - transformed, normalized, imputed intensities (tabular)
757 - runlevel_data - summarized intensities per run (tabular)
758 - qcplot - log2 intensity boxplot for all proteins and run on first page, followed by one boxplot per protein (pdf)
759 - profile_plot - log2 intensity profiles one plot per protein and run (pdf)
760 - profile_wsum_plot - log2 intensity profiles one plot per protein and run with run summarization (pdf)
761 - condition_plot - log2 intensity range for each protein and condition (pdf)
762 - quant_sample_matrix - relative protein abundance in each biological replicate (tabular)
763 - quant_sample_long - relative protein abundance in each biological replicate, long format (tabular)
764 - quant_group_matrix - relative protein abundance in each condition (tabular)
765 - quant_group_long - relative protein abundance in each condition, long format (tabular)
766 - comparison_result - summary of statistical results per protein and comparison (tabular)
767 - model_qc - summary statistics per run (tabular)
768 - qqplot - one QQplot per protein (pdf)
769 - residualplot - one residual plot per protein (pdf)
770 - volcanoplot - one volcano plot per comparison (pdf)
771 - heatmap - needs at least 2 comparisons, one heatmap for all proteins and comparisons (pdf)
772 - comparisonplot - log2 intensity range for each protein and comparison (pdf)
773
774 For additional help please visit the `MSstats documentation <http://msstats.org/msstats-2/>`_
775
776
777 ]]></help>
778 <citations>
779 <citation type="doi">10.1093/bioinformatics/btu305</citation>
780 </citations>
781 </tool>
782