changeset 0:2bc6bbf651b9 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pyprophet commit a83d231286a8df67483df46e76b4b3a2ef90b251"
author galaxyp
date Tue, 25 Feb 2020 18:23:48 -0500
parents
children bd693a71a50b
files macros.xml pyprophet_export.xml test-data/merged.osw test-data/open_swath_output1.osw test-data/open_swath_output2.osw test-data/output.tabular test-data/patient_specific_OSW_optimized_decoys.pqp test-data/peptide1.osw test-data/peptide1.pdf test-data/peptide2.osw test-data/peptide2.pdf test-data/protein1.osw test-data/protein1.pdf test-data/protein2.osw test-data/protein2.pdf test-data/score.osw test-data/score_plots.pdf test-data/score_report.pdf test-data/study_design.tabular test-data/subsample.tabular test-data/test_data.osw
diffstat 21 files changed, 391 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Tue Feb 25 18:23:48 2020 -0500
@@ -0,0 +1,23 @@
+<macros>
+    <token name="@VERSION@">2.1.4</token>
+
+    <xml name="requirements">
+    <requirements>
+        <requirement type="package" version="2.1.4">pyprophet</requirement>
+        <yield/>
+    </requirements>
+    </xml>
+
+    <xml name="citations">
+    <citations>
+        <citation type="doi">10.1038/nmeth.4398</citation>
+        <citation type="doi">10.1038/nbt.3908</citation>
+        <citation type="doi">10.1093/bioinformatics/btu686</citation>
+        <citation type="doi">10.1038/nmeth.1584</citation>
+        <yield/>
+    </citations>
+    </xml>
+
+    <token name="@link@">http://openswath.org/en/latest/docs/pyprophet.html</token>
+
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyprophet_export.xml	Tue Feb 25 18:23:48 2020 -0500
@@ -0,0 +1,364 @@
+<tool id="pyprophet_export" name="PyProphet export" version="@VERSION@.0">
+    <description>
+    Export tabular files, optional swath2stats export
+    </description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements">
+        <requirement type="package" version="1.16.0">bioconductor-swath2stats</requirement>
+        <requirement type="package" version="0.8.4">r-dplyr</requirement>
+        <requirement type="package" version="1.12.8">r-data.table</requirement>
+        <requirement type="package" version="2.3">r-gridextra</requirement>
+    </expand>
+    <command detect_errors="aggressive">
+    <![CDATA[
+        ln -s '$input' ./input.osw &&
+        pyprophet export
+        --in=./input.osw
+        --format=$conditional_output.format
+
+        #if $conditional_output.format=='legacy_split':
+            $conditional_output.transition_quant
+            --max_transition_pep=$conditional_output.max_transition_pep
+            --ipf=$conditional_output.ipf
+            --ipf_max_peptidoform_pep=$conditional_output.ipf_max_peptidoform_pep
+            --max_rs_peakgroup_qvalue=$conditional_output.max_rs_peakgroup_qvalue
+            --max_global_peptide_qvalue=$conditional_output.max_global_peptide_qvalue
+            --max_global_protein_qvalue=$conditional_output.max_global_protein_qvalue
+
+        #elif $conditional_output.format=='legacy_merged':
+            $conditional_output.transition_quant
+            --max_transition_pep=$conditional_output.max_transition_pep
+            --ipf=$conditional_output.ipf
+            --ipf_max_peptidoform_pep=$conditional_output.ipf_max_peptidoform_pep
+            --max_rs_peakgroup_qvalue=$conditional_output.max_rs_peakgroup_qvalue
+            --max_global_peptide_qvalue=$conditional_output.max_global_peptide_qvalue
+            --max_global_protein_qvalue=$conditional_output.max_global_protein_qvalue
+
+        #elif $conditional_output.format=='matrix':
+            --ipf=$conditional_output.ipf
+            --ipf_max_peptidoform_pep=$conditional_output.ipf_max_peptidoform_pep
+            --max_rs_peakgroup_qvalue=$conditional_output.max_rs_peakgroup_qvalue
+            --max_global_peptide_qvalue=$conditional_output.max_global_peptide_qvalue
+            --max_global_protein_qvalue=$conditional_output.max_global_protein_qvalue
+        #end if
+        $peptide_error
+        $protein_error
+        --out=./output.tsv
+
+        #if $conditional_swath2stats.swath2stats=='yes_swath2stats':
+            && cat '${swath2stats}'
+            && Rscript '${swath2stats}'
+        #end if
+
+        #if $conditional_output.format=='score_plots':
+            && mv *score_plots.pdf '$score_plots'
+        #else:
+            && mv output.tsv '$export_file'
+        #end if
+
+
+    ]]>
+    </command>
+    <configfiles>
+        <configfile name="swath2stats"><![CDATA[
+
+#if $conditional_swath2stats.swath2stats=='yes_swath2stats':
+
+library("SWATH2stats")
+library("data.table")
+library("dplyr")
+library(gridExtra)
+
+########################### Input ##############################################
+
+## read in pyprophet export file
+data_me <- data.frame(fread('output.tsv', sep='\t', header=TRUE))
+
+## read in study design template
+study_design <- data.frame(fread('$conditional_swath2stats.study_design', sep='\t', header=TRUE))
+
+## merge both files on filename column
+data.annotated <- sample_annotation(data_me, study_design, column.file = "filename")
+
+
+########################### QC plots and tabular files #########################
+
+## remove decoys when generating plots
+data.annotated.nodecoy <- subset(data.annotated, decoy==FALSE)
+
+pdf("summary.pdf", fonts = "Times", pointsize = 12)
+plot(0,type='n',axes=FALSE,ann=FALSE)
+title(main="Summarized plots and tables from pyprophet export file")
+
+## Look at Numbers of peptides and proteins per run
+grid.table(count_analytes(data.annotated.nodecoy), rows= NULL)
+
+## Correlation of the intensities
+correlation_int <- plot_correlation_between_samples(data.annotated.nodecoy, column.values = 'Intensity')
+
+## Plot the correlation of the delta_rt, which is the deviation of the retention time from the expected retention time
+correlation_rt <- plot_correlation_between_samples(data.annotated.nodecoy, column.values = 'delta_rt')
+
+## Plot the variation of the signal across replicates
+variation <- plot_variation(data.annotated.nodecoy) 
+plot(0,type='n',axes=FALSE,ann=FALSE)
+grid.table(variation[[2]])
+
+## Plot the total variation versus variation within replicates
+variation_total <- plot_variation_vs_total(data.annotated.nodecoy)
+
+## Calculate the summed signal per peptide and protein across samples
+peptide_signal <- write_matrix_peptides(data.annotated.nodecoy) 
+protein_signal <- write_matrix_proteins(data.annotated.nodecoy) 
+
+
+#if str($conditional_swath2stats.conditional_fdr_replica.calc_fdr_replica) =="calc_fdr_replica_yes": 
+
+    ## Estimate the overall FDR across runs using a target decoy strategy
+    fdr_target_decoy <- assess_fdr_overall(data.annotated, n.range = $conditional_swath2stats.conditional_fdr_replica.n_range, FFT = $conditional_swath2stats.conditional_fdr_replica.fft, output = 'Rconsole')
+    print(fdr_target_decoy)
+    dev.off()
+#else
+    dev.off()
+#end if
+
+############################# Filtering ########################################
+
+data.filtered = data.annotated
+
+#if str($conditional_swath2stats.conditional_fdr_replica.calc_fdr_replica) =="calc_fdr_replica_yes": 
+
+    ## According to this FDR estimation one can filter the data with a higher mscore threshold to reach an overall protein FDR of 5%.
+    ## Check what m-score cut-off is requiered for Protein FDR of 5 %
+    cutoff_mscore = mscore4protfdr(data_me, FFT = $conditional_swath2stats.conditional_fdr_replica.fft, fdr_target = $conditional_swath2stats.conditional_fdr_replica.fdr_target)
+print(cutoff_mscore)
+    ## Filter data for values that pass the 0.001 mscore criteria in at least two replicates of one condition
+    data.filtered <- filter_mscore_condition(data.filtered, cutoff_mscore, n.replica = $conditional_swath2stats.conditional_fdr_replica.n_replica)
+#end if
+
+#if str($conditional_swath2stats.conditional_max_pep.filter_max_pep) == "filter_max_pep_yes":
+    ## Select only the 10 peptides showing strongest signal per protein
+    data.filtered <- filter_on_max_peptides(data.filtered, n_peptides = $conditional_swath2stats.conditional_max_pep.n_peptides_max)
+#end if
+
+
+#if str($conditional_swath2stats.conditional_min_pep.filter_min_pep) == "filter_min_pep_yes":
+    ## Filter for proteins that are supported by at least two peptides
+    data.filtered <- filter_on_min_peptides(data.filtered, n_peptides = $conditional_swath2stats.conditional_min_pep.n_peptides_min)
+#end if
+
+########################### Output ############################################
+## Convert the data into a transition-level format (one row per transition measured).
+data.transition <- disaggregate(data.filtered)
+
+## Convert the data into the format required by MSstats.
+MSstats.input <- convert4MSstats(data.transition)
+
+### Transitions which were found at different RT / multiple scans are combined by summarizing the Intensities
+Test = MSstats.input %>% group_by(ProteinName, PeptideSequence, PrecursorCharge, FragmentIon, ProductCharge, IsotopeLabelType, BioReplicate, Condition, Run) %>% summarise(Intensity = sum(Intensity))
+
+Test = Test[, c("ProteinName", "PeptideSequence", "PrecursorCharge", "FragmentIon", "ProductCharge", "IsotopeLabelType", "Intensity", "BioReplicate", "Condition", "Run")]
+
+write.table(Test, file="$msstats_input", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")
+write.table(peptide_signal, file="$peptide_signal", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")
+write.table(protein_signal, file="$protein_signal", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")
+
+#end if
+
+    ]]></configfile>
+    </configfiles>
+    <inputs>
+        <param name="input" type="data" format="osw" label="Input file" help="This file needs to be in OSW format (--in)" />
+        <conditional name="conditional_output">
+        <param argument="format" type="select" label="Export format, either matrix, legacy_split, legacy_merged (mProphet/PyProphet) or score_plots format" >
+            <option value="legacy_split" selected="True">legaxy_split</option>
+            <option value="legacy_merged">legacy_merged</option>
+            <option value="matrix">matrix</option>
+            <option value="score_plots">score_plots</option>
+        </param>
+            <when value="legacy_split">
+
+                <param name="transition_quant" type="boolean" truevalue="--transition_quantification" falsevalue="--no-transition_quantification" checked="True" label="Report aggregated transition-level quantification" help="(--transition_quantification / --no-transition_quantification)" />
+                <param argument="max_transition_pep" type="float" value="0.7" label="Maximum PEP to retain scored transitions for quantification (requires transition-level scoring)" />
+                <param argument="ipf" type="select" display="radio" label="Should IPF results be reported if present? 'peptidoform': Report results on peptidoform-level, 'augmented': Augment OpenSWATH results with IPF scores, 'disable': Ignore IPF results" >
+                  <option value="peptidoform" selected="True" >peptidoform </option>
+                  <option value="augmented">augmented</option>
+                  <option value="disable">disable</option>
+                </param>
+                <param argument="ipf_max_peptidoform_pep" type="float" value="0.4" label="IPF: Filter results to maximum run-specific peptidoform-level PEP" />
+                <param argument="max_rs_peakgroup_qvalue" type="float" value="0.05" label="Filter results to maximum run-specific peak group-level q-value" />
+                <param argument="max_global_peptide_qvalue" type="float" value="0.01" label="Filter results to maximum global peptide-level q-value" />
+                <param argument="max_global_protein_qvalue" type="float" value="0.01" label="ilter results to maximum global protein-level q-value" />
+            </when>
+            <when value="legacy_merged">
+
+                <param name="transition_quant" type="boolean" truevalue="--transition_quantification" falsevalue="--no-transition_quantification" checked="True" label="Report aggregated transition-level quantification" help="(--transition_quantification / --no-transition_quantification)" />
+                <param argument="max_transition_pep" type="float" value="0.7" label="Maximum PEP to retain scored transitions for quantification (requires transition-level scoring)" />
+                <param argument="ipf" type="select" display="radio" label="Should IPF results be reported if present? 'peptidoform': Report results on peptidoform-level, 'augmented': Augment OpenSWATH results with IPF scores, 'disable': Ignore IPF results" >
+                  <option value="peptidoform" selected="True">peptidoform </option>
+                  <option value="augmented">augmented</option>
+                  <option value="disable">disable</option>
+                </param>
+                <param argument="ipf_max_peptidoform_pep" type="float" value="0.4" label="IPF: Filter results to maximum run-specific peptidoform-level PEP" />
+                <param argument="max_rs_peakgroup_qvalue" type="float" value="0.05" label="Filter results to maximum run-specific peak group-level q-value" />
+                <param argument="max_global_peptide_qvalue" type="float" value="0.01" label="Filter results to maximum global peptide-level q-value" />
+                <param argument="max_global_protein_qvalue" type="float" value="0.01" label="ilter results to maximum global protein-level q-value" />
+            </when>
+            <when value="matrix">
+
+                <param argument="ipf" type="select" display="radio" label="Should IPF results be reported if present? 'peptidoform': Report results on peptidoform-level, 'augmented': Augment OpenSWATH results with IPF scores, 'disable': Ignore IPF results" >
+                  <option value="peptidoform" selected="True">peptidoform </option>
+                  <option value="augmented">augmented</option>
+                  <option value="disable">disable</option>
+                </param>
+                <param argument="ipf_max_peptidoform_pep" type="float" value="0.4" label="IPF: Filter results to maximum run-specific peptidoform-level PEP" />
+                <param argument="max_rs_peakgroup_qvalue" type="float" value="0.05" label="Filter results to maximum run-specific peak group-level q-value" />
+                <param argument="max_global_peptide_qvalue" type="float" value="0.01" label="Filter results to maximum global peptide-level q-value" />
+                <param argument="max_global_protein_qvalue" type="float" value="0.01" label="ilter results to maximum global protein-level q-value" />
+            </when>
+            <when value="score_plots"/>
+        </conditional>
+        <param name="peptide_error" type="boolean" truevalue="--peptide" falsevalue="--no-peptide"  checked="True" label="Append peptide-level error-rate estimates if available" help="(--peptide / --no-peptide)" />
+        <param name="protein_error" type="boolean" truevalue="--protein" falsevalue="--no-protein"  checked="True" label="Append protein-level error-rate estimates if available" help="(--protein / --no-protein)" />
+        <conditional name="conditional_swath2stats">
+            <param name="swath2stats" type="select" label="Use swath2stats to export file for statsics" >
+                <option value="yes_swath2stats" selected="True">yes</option>
+                <option value="no_swath2stats">no</option>
+            </param>
+                <when value="yes_swath2stats">
+                <param name="study_design" type="data" format="tabular" label="Study design tabular file" help="Needs to have columns with Filename, Condition, BioReplicate, Run" />
+                    <conditional name="conditional_fdr_replica">
+                        <param name="calc_fdr_replica" type="select" label="Filter for fdr and number of replicates" >
+                        <option value="calc_fdr_replica_yes" selected="True">Yes</option>
+                        <option value="calc_fdr_replica_no">No</option>
+                    </param>
+                        <when value="calc_fdr_replica_yes">
+                            <param name="fft" type="float" value="0.5" label="FFT. Ratio of false positives to true negatives, q-values from pyProphet stats output" help="As an approximation, the q-values of multiple runs are averaged and supplied as argument FFT. Numeric from 0 to 1."/>
+                            <param name="n_range" type="float" value="10" label="Option to set the number of magnitude for which the m_score threshold is decreased" />
+                            <param name="fdr_target" type="float" value="0.05" label="FDR target." help="An m_score cutoff achieving and FDR smaller fdr_target will be selected. Calculated as FDR = decoys*FFT/targets" />
+                            <param name="n_replica" type="integer" value="2" label="Number Replicates." help="Number of measurements within at least one condition that have to pass the mscore threshold for this transition." />
+                        </when>
+                        <when value="calc_fdr_replica_no"/>
+                    </conditional>
+                    <conditional name="conditional_max_pep">
+                        <param name="filter_max_pep" type="select" label="Filter for a maximum number of peptides per protein" >
+                        <option value="filter_max_pep_yes" selected="True">Yes</option>
+                        <option value="filter_max_pep_no">No</option>
+                    </param>
+                        <when value="filter_max_pep_yes">
+                            <param name="n_peptides_max" type="integer" value="10" label="Maximum number of peptides per protein." help="Maximum number of highest intense peptides to filter the data on." />
+                        </when>
+                        <when value="filter_max_pep_no"/>
+                    </conditional>
+                    <conditional name="conditional_min_pep">
+                        <param name="filter_min_pep" type="select" label="Filter for a proteins that are supported by a minimum number of peptides" >
+                        <option value="filter_min_pep_yes" selected="True">Yes</option>
+                        <option value="filter_min_pep_no">No</option>
+                    </param>
+                        <when value="filter_min_pep_yes">
+                            <param name="n_peptides_min" type="integer" value="2" label="Minimum number of peptides per protein" help="Number of minimal number of peptide IDs associated with a protein ID in order to be kept in the dataset." />
+                        </when>
+                        <when value="filter_min_pep_no"/>
+                    </conditional>
+                </when>
+                <when value="no_swath2stats"/>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="export_file" format="tabular" label="${tool.name} on ${on_string}: export.tabular" >
+            <filter>conditional_output['format'] != 'score_plots'</filter>
+        </data>
+        <data name="score_plots" format="pdf" label="${tool.name} on ${on_string}: score_plots.pdf" >
+            <filter>conditional_output['format'] == 'score_plots'</filter>
+        </data>
+        <data name="summary" format="pdf" from_work_dir="summary.pdf" label = "${tool.name} on ${on_string}: summary.pdf">
+            <filter>conditional_swath2stats['swath2stats'] == 'yes_swath2stats'</filter>
+        </data>
+        <data name="peptide_signal" format="tabular" label="${tool.name} on ${on_string}: peptide_signal.tabular" from_work_dir="peptide_signal.tabular" >
+            <filter>conditional_swath2stats['swath2stats'] == 'yes_swath2stats'</filter>
+        </data>
+        <data name="protein_signal" format="tabular" label="${tool.name} on ${on_string}: protein_signal.tabular" from_work_dir="protein_signal.tabular" >
+            <filter>conditional_swath2stats['swath2stats'] == 'yes_swath2stats'</filter>
+        </data>
+        <data name="msstats_input" format="tabular" label="${tool.name} on ${on_string}: msstats_input.tabular" from_work_dir="msstats_input.tabular" >
+            <filter>conditional_swath2stats['swath2stats'] == 'yes_swath2stats'</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1">
+            <param name="input" value="protein2.osw" ftype="osw" />
+            <param name="format" value="legacy_merged" />
+            <param name="max_global_peptide_qvalue" value="0.2" />
+            <conditional name="conditional_swath2stats">
+                <param name="swath2stats" value="no_swath2stats"/>
+            </conditional>
+            <output name="export_file" file="output.tabular" />
+        </test>
+        <test expect_num_outputs="1">
+            <param name="input" value="protein2.osw" ftype="osw" />
+            <param name="format" value="score_plots" />
+            <conditional name="conditional_swath2stats">
+                <param name="swath2stats" value="no_swath2stats"/>
+            </conditional>
+            <output name="score_plots" file="score_plots.pdf" />
+        </test>
+        <test expect_failure="true">
+            <param name="input" value="protein2.osw" ftype="osw" />
+            <param name="format" value="legacy_merged" />
+            <conditional name="conditional_swath2stats">
+                <param name="study_design" value="study_design.tabular" ftype="tabular" />
+                <conditional name="conditional_fdr_replica">
+                    <param name="calc_fdr_replica" value="calc_fdr_replica_no"/>
+                </conditional>
+                <conditional name="conditional_max_pep">
+                    <param name="filter_max_pep" value="filter_max_pep_no" />
+                </conditional>
+                <conditional name="conditional_min_pep">
+                    <param name="filter_min_pep" value="filter_min_pep_no" />
+                </conditional>
+            </conditional>
+            <assert_stderr>
+                <has_text text="replacement has 1 row, data has 0" />
+            </assert_stderr>
+        </test>
+    </tests>
+    <help>
+<![CDATA[
+**What it does**
+
+PyProphet: Semi-supervised learning and scoring of OpenSWATH results.
+
+Export tabular (tsv) tables. 
+
+Optional SWATH2stats output. SWATH2stats is intended to transform SWATH data from the OpenSWATH software into a format readable by other statistics packages while performing filtering, annotation and FDR estimation.
+
+**Study desing file for SWATH2stats**
+
+    - Tabular file with columns that are named: Filename, Condition, BioReplicate, Run. 
+    - The Filename should be part or the same as the original filenames used in OpenSWATH workflow
+    - The Condition should be a 
+    - The BioReplicate is corresponds to the biological replicate
+    - The Run is the number of the run in which the sample was measured
+
+        ::
+
+                      Filename       Condition    BioReplicate     Run
+                    healthy1.mzml     healthy         1             1
+                    healthy2.mzml     healthy         2             2
+                    diseased1.mzml    diseased        3             3
+                        ...
+                        ...
+
+
+PyProphet is a Python re-implementation of the mProphet algorithm (Reiter 2010 Nature Methods) optimized for SWATH-MS data acquired by data-independent acquisition (DIA). The algorithm was originally published in (Telemann 2014 Bioinformatics) and has since been extended to support new data types and analysis modes (Rosenberger 2017, Nature biotechnology and Nature methods).
+
+For more information, visit @link@
+
+]]>
+    </help>
+    <expand macro="citations">
+        <citation type="doi">10.1371/journal.pone.0153160</citation>
+    </expand>
+</tool>
Binary file test-data/merged.osw has changed
Binary file test-data/open_swath_output1.osw has changed
Binary file test-data/open_swath_output2.osw has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output.tabular	Tue Feb 25 18:23:48 2020 -0500
@@ -0,0 +1,1 @@
+transition_group_id	decoy	run_id	filename	RT	assay_rt	delta_rt	iRT	assay_iRT	delta_iRT	Sequence	FullPeptideName	Charge	mz	Intensity	aggr_prec_Peak_Area	aggr_prec_Peak_Apex	leftWidth	rightWidth	peak_group_rank	d_score	m_score	id	aggr_Peak_Area	aggr_Peak_Apex	aggr_Fragment_Annotation	ProteinName	m_score_peptide_experiment_wide	m_score_peptide_global	m_score_protein_experiment_wide	m_score_protein_global
Binary file test-data/patient_specific_OSW_optimized_decoys.pqp has changed
Binary file test-data/peptide1.osw has changed
Binary file test-data/peptide1.pdf has changed
Binary file test-data/peptide2.osw has changed
Binary file test-data/peptide2.pdf has changed
Binary file test-data/protein1.osw has changed
Binary file test-data/protein1.pdf has changed
Binary file test-data/protein2.osw has changed
Binary file test-data/protein2.pdf has changed
Binary file test-data/score.osw has changed
Binary file test-data/score_plots.pdf has changed
Binary file test-data/score_report.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/study_design.tabular	Tue Feb 25 18:23:48 2020 -0500
@@ -0,0 +1,3 @@
+Filename	Condition	BioReplicate	Run
+./TN22.mzML	late	1	1
+./TN23.mzML	early	2	2
Binary file test-data/subsample.tabular has changed
Binary file test-data/test_data.osw has changed