Mercurial > repos > galaxyp > proteomiqon_psmstatistics
changeset 0:b8590dea5e2d draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteomiqon_psmstatistics commit 7e7a2e71b21a1c92ae0c79cc4e25a7c2a5f0c4f1"
author | galaxyp |
---|---|
date | Sun, 18 Jul 2021 13:53:08 +0000 |
parents | |
children | f831f7d721f8 |
files | proteomiqon_psmstatistics.xml static/images/SemiSupervisedScoring.png test-data/result_1.json test-data/result_2.json test-data/result_3.json test-data/sample.db test-data/sample.psm |
diffstat | 7 files changed, 249 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/proteomiqon_psmstatistics.xml Sun Jul 18 13:53:08 2021 +0000
@@ -0,0 +1,189 @@
+<tool id="proteomiqon_psmstatistics" name="ProteomIQon PSMStatistics" version="@VERSION@" profile="20.05">
+    <!-- Galaxy wrapper around the proteomiqon-psmstatistics command line tool. -->
+    <description>
+        utilizes semi supervised machine learning techniques to integrate search engine scores as well as the mentioned quality scores into one single consensus score.
+    </description>
+    <macros>
+        <token name="@VERSION@">0.0.6</token>
+    </macros>
+    <requirements>
+        <requirement type="package" version="@VERSION@">proteomiqon-psmstatistics</requirement>
+    </requirements>
+    <!--
+        The PSM input is symlinked under a name derived from its element identifier
+        (characters outside [\w ,.\-+] are replaced by "_"). The qpsm output dataset is
+        symlinked under the same base name; presumably the tool writes <basename>.qpsm into
+        the output directory (-o ./) so the result lands in the Galaxy dataset (to be confirmed).
+    -->
+    <command detect_errors="exit_code"><![CDATA[
+        #import re
+        #set basename = $re.sub(r'[^\w ,.\-+]','_', $psm.element_identifier)
+        #if $outputParamfile:
+            cat '$paramfile' >> '$out_paramfile' &&
+        #end if
+        ln -s '$psm' '${basename}.psm' &&
+        ln -s '$out_qpsm' '${basename}.qpsm' &&
+        proteomiqon-psmstatistics -i './${basename}.psm' -d '$peptideDB' -p '$paramfile' -o ./
+    ]]>
+    </command>
+    <!--
+        JSON parameter file handed to the tool via -p. The regex pattern is not sanitized by
+        Galaxy, so it is emitted with json.dumps: quotes and backslashes typed by the user are
+        escaped and the file stays valid JSON.
+    -->
+    <configfiles>
+        <configfile name="paramfile">
+            <![CDATA[
+            #import json
+            {
+                "Threshold":
+                {
+                    #if $ThresholdCond.ProcessingType == "Estimate"
+                    "Case":"Estimate",
+                    "Fields":
+                    [
+                        {
+                            "QValueThreshold" : $ThresholdCond.QValueThreshold,
+                            "PepValueThreshold" : $ThresholdCond.PepValueThreshold,
+                            "MaxIterations" : $ThresholdCond.MaxIterations,
+                            "MinimumIncreaseBetweenIterations" : $ThresholdCond.MinimumIncreaseBetweenIterations,
+                            "PepValueFittingMethod" : {
+                                "Case":"$ThresholdCond.PepValueFittingMethod"
+                            }
+                        }
+                    ]
+                    #else if $ThresholdCond.ProcessingType == "Fixed"
+                    "Case":"Fixed",
+                    "Fields":
+                    [
+                        {
+                            "SequestLike":$ThresholdCond.SequestLike,
+                            "Andromeda":$ThresholdCond.Andromeda
+                        }
+                    ]
+                    #end if
+                },
+                "ParseProteinIDRegexPattern":$json.dumps(str($ParseProteinIDRegexPattern)),
+                "KeepTemporaryFiles":false
+            }
+            ]]>
+        </configfile>
+    </configfiles>
+    <inputs>
+        <param name="psm" type="data" format="tabular" label="PSM" help="Specify list of peptide spectrum matches to be scored."/>
+        <param name="peptideDB" type="data" format="sqlite" label="Peptide database" help="Specify the peptide data base."/>
+        <conditional name="ThresholdCond">
+            <param name="ProcessingType" type="select" label="Processing type" help="Specify how PSM thresholds should be estimated using semi supervised machine learning techniques (recommended) or if fixed score values should be used instead.">
+                <option value="Estimate" selected="true">Estimate</option>
+                <option value="Fixed">Fixed</option>
+            </param>
+            <when value="Estimate">
+                <param name="QValueThreshold" type="float" value="0.01" label="Q-Value threshold" />
+                <param name="PepValueThreshold" type="float" value="0.05" label="Pep-Value threshold" />
+                <param name="MaxIterations" type="integer" value="15" label="Max iterations" />
+                <param name="MinimumIncreaseBetweenIterations" type="float" value="0.005" label="Minimum increase between iterations" />
+                <param name="PepValueFittingMethod" type="select" label="Pep-Value fitting method">
+                    <option value="LinearSpline">Linear Spline</option>
+                    <option value="LogisticRegressionLogit" selected="true">Logistic Regression Logit</option>
+                </param>
+            </when>
+            <when value="Fixed">
+                <param name="SequestLike" type="float" value="5.0" label="Sequest like" />
+                <param name="Andromeda" type="float" value="40.0" label="Andromeda" />
+            </when>
+        </conditional>
+        <param name="ParseProteinIDRegexPattern" type="text" value="id" label="Parse protein ID regex pattern" help="Fasta headers do often contain additional information in addition to your protein identifier, by specifying a regex pattern the tool can extract the protein IDs. If your FASTA headers are already cleaned you can leave this field empty. Enter the pattern as a plain regex; it is JSON-escaped automatically.">
+            <sanitizer sanitize="false" />
+        </param>
+        <param name="outputParamfile" type="boolean" checked="false" label="Output parameter file"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="out_qpsm" />
+        <data format="json" name="out_paramfile">
+            <filter>outputParamfile</filter>
+        </data>
+    </outputs>
+    <tests>
+        <!-- The first test only checks the number of outputs; the others also compare the emitted parameter file with an expected JSON file. -->
+        <test expect_num_outputs="1">
+            <param name="psm" value="sample.psm"/>
+            <param name="peptideDB" value="sample.db"/>
+            <param name="ParseProteinIDRegexPattern" value="id"/>
+            <param name="outputParamfile" value="false"/>
+            <conditional name="ThresholdCond">
+                <param name="ProcessingType" value="Estimate"/>
+                <param name="QValueThreshold" value="0.01"/>
+                <param name="PepValueThreshold" value="0.05"/>
+                <param name="MaxIterations" value="15"/>
+                <param name="MinimumIncreaseBetweenIterations" value="0.005"/>
+                <param name="PepValueFittingMethod" value="LinearSpline"/>
+            </conditional>
+        </test>
+        <test expect_num_outputs="2">
+            <param name="psm" value="sample.psm"/>
+            <param name="peptideDB" value="sample.db"/>
+            <param name="ParseProteinIDRegexPattern" value="id"/>
+            <param name="outputParamfile" value="true"/>
+            <conditional name="ThresholdCond">
+                <param name="ProcessingType" value="Estimate"/>
+                <param name="QValueThreshold" value="0.01"/>
+                <param name="PepValueThreshold" value="0.05"/>
+                <param name="MaxIterations" value="15"/>
+                <param name="MinimumIncreaseBetweenIterations" value="0.005"/>
+                <param name="PepValueFittingMethod" value="LinearSpline"/>
+            </conditional>
+            <output name="out_paramfile" file="result_1.json"/>
+        </test>
+        <test expect_num_outputs="2">
+            <param name="psm" value="sample.psm"/>
+            <param name="peptideDB" value="sample.db"/>
+            <param name="ParseProteinIDRegexPattern" value="id"/>
+            <param name="outputParamfile" value="true"/>
+            <conditional name="ThresholdCond">
+                <param name="ProcessingType" value="Estimate"/>
+                <param name="QValueThreshold" value="0.01"/>
+                <param name="PepValueThreshold" value="0.05"/>
+                <param name="MaxIterations" value="15"/>
+                <param name="MinimumIncreaseBetweenIterations" value="0.005"/>
+                <param name="PepValueFittingMethod" value="LogisticRegressionLogit"/>
+            </conditional>
+            <output name="out_paramfile" file="result_2.json"/>
+        </test>
+        <test expect_num_outputs="2">
+            <param name="psm" value="sample.psm"/>
+            <param name="peptideDB" value="sample.db"/>
+            <param name="ParseProteinIDRegexPattern" value="id"/>
+            <param name="outputParamfile" value="true"/>
+            <conditional name="ThresholdCond">
+                <param name="ProcessingType" value="Fixed"/>
+                <param name="SequestLike" value="5.0"/>
+                <param name="Andromeda" value="40.0"/>
+            </conditional>
+            <output name="out_paramfile" file="result_3.json"/>
+        </test>
+    </tests>
+    <help>
+    <![CDATA[
+What It Does
+------------
+**Disclaimer** this tool needs a `peptide database <https://csbiology.github.io/ProteomIQon/tools/PeptideDB.html>`_ and `peptide spectrum matches <https://csbiology.github.io/ProteomIQon/tools/PeptideSpectrumMatching.html>`_.
+
+An established method to identify acquired MS/MS spectra is the comparison of each spectrum with peptides in a reference database.
+
+To measure the similarity of in silico generated spectra and measured MS/MS scans we use our own implementations of three established search engine scores: SEQUEST, Andromeda and XTandem.
+Additionally, we also record quality control parameters such as the mass difference between the precursor ion and the theoretically calculated mass or the uniqueness of each score in comparison to 'competing' peptides within the search space.
+The PSMStatistics tool utilizes semi supervised machine learning techniques to integrate search engine scores as well as the mentioned quality scores into one single consensus score.
+
+.. image:: SemiSupervisedScoring.png
+    :width: 768pt
+    :height: 345pt
+
+Since the search space is extended by so called decoys - reversed counterparts of peptides within the search space - we can estimate the distribution of 'true negatives' and calculate local (PEP values) and global (Q values) false discovery rates at each consensus score.
+The reported peptides at user defined local and global FDR cutoffs can then be used as inputs for any downstream analysis be it ProteinInference or PSMBasedQuantification.
+
+Further Reading
+---------------
+Additional information about the tool can be found in the `documentation <https://csbiology.github.io/ProteomIQon/tools/PSMStatistics.html>`_.
+    ]]>
+    </help>
+</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/result_1.json Sun Jul 18 13:53:08 2021 +0000 @@ -0,0 +1,24 @@ + + + { + "Threshold": + { + "Case":"Estimate", + "Fields": + [ + { + "QValueThreshold" : 0.01, + "PepValueThreshold" : 0.05, + "MaxIterations" : 15, + "MinimumIncreaseBetweenIterations" : 0.005, + "PepValueFittingMethod" : { + "Case":"LinearSpline" + } + } + ] + }, + "ParseProteinIDRegexPattern":"id", + "KeepTemporaryFiles":false + } + + \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/result_2.json Sun Jul 18 13:53:08 2021 +0000 @@ -0,0 +1,24 @@ + + + { + "Threshold": + { + "Case":"Estimate", + "Fields": + [ + { + "QValueThreshold" : 0.01, + "PepValueThreshold" : 0.05, + "MaxIterations" : 15, + "MinimumIncreaseBetweenIterations" : 0.005, + "PepValueFittingMethod" : { + "Case":"LogisticRegressionLogit" + } + } + ] + }, + "ParseProteinIDRegexPattern":"id", + "KeepTemporaryFiles":false + } + + \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/result_3.json Sun Jul 18 13:53:08 2021 +0000 @@ -0,0 +1,19 @@ + + + { + "Threshold": + { + "Case":"Fixed", + "Fields": + [ + { + "SequestLike":5.0, + "Andromeda":40.0 + } + ] + }, + "ParseProteinIDRegexPattern":"id", + "KeepTemporaryFiles":false + } + + \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample.psm Sun Jul 18 13:53:08 2021 +0000 @@ -0,0 +1,7 @@ +PSMId GlobalMod PepSequenceID ModSequenceID Label ScanNr ScanTime Charge PrecursorMZ TheoMass AbsDeltaMass PeptideLength MissCleavages SequestScore SequestNormDeltaBestToRest SequestNormDeltaNext AndroScore AndroNormDeltaBestToRest AndroNormDeltaNext XtandemScore XtandemNormDeltaBestToRest XtandemNormDeltaNext StringSequence +sample=1-period=1-cycle=2033-experiment=4_0_2_0 1 4 8 -1 0 32.9949 2 383.2268582 764.4508194 0.01165603632 7 -1 2.780764976 0.6779896705 0.0 26.3150871 0.7657267969 0.0 20.14279733 0.5594493789 0.0 ILVGDIK +sample=1-period=1-cycle=2033-experiment=4_0_2_0 1 4 8 1 0 32.9949 2 383.2268582 764.4508194 0.01165603632 7 -1 8.635639049 0.0 0.6779896705 112.3264921 0.0 0.7657267969 45.72186797 0.0 0.5594493789 ILVGDIK +sample=1-period=1-cycle=2043-experiment=5_2_2_0 0 22 87 -1 2 33.36088333 2 399.242789 796.4806883 0.009663316474 7 -1 1.412423849 0.8192994029 0.0 0.0 1.0 0.0 7.323219744 0.7506817907 0.0 ALEVIPR +sample=1-period=1-cycle=2043-experiment=5_2_2_0 0 22 87 1 2 33.36088333 2 399.242789 796.4806883 0.009663316474 7 -1 7.816376209 0.0 0.8192994029 74.99645487 0.0 1.0 29.37298389 0.0 0.7506817907 ALEVIPR +sample=1-period=1-cycle=2038-experiment=7_1_2_0 1 22 88 -1 1 33.18005 2 404.2288055 806.4510372 0.007979227388 7 -1 1.589227453 0.7970537368 0.0 11.5896755 0.8973792939 0.0 18.85459139 0.6070363597 0.0 ALEVIPR +sample=1-period=1-cycle=2038-experiment=7_1_2_0 1 22 88 1 1 33.18005 2 404.2288055 806.4510372 0.007979227388 7 -1 7.830779576 0.0 0.7970537368 112.937008 0.0 0.8973792939 47.98049859 0.0 0.6070363597 ALEVIPR