changeset 0:b8590dea5e2d draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteomiqon_psmstatistics commit 7e7a2e71b21a1c92ae0c79cc4e25a7c2a5f0c4f1"
author galaxyp
date Sun, 18 Jul 2021 13:53:08 +0000
parents
children f831f7d721f8
files proteomiqon_psmstatistics.xml static/images/SemiSupervisedScoring.png test-data/result_1.json test-data/result_2.json test-data/result_3.json test-data/sample.db test-data/sample.psm
diffstat 7 files changed, 249 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/proteomiqon_psmstatistics.xml	Sun Jul 18 13:53:08 2021 +0000
@@ -0,0 +1,175 @@
+<tool id="proteomiqon_psmstatistics" name="ProteomIQon PSMStatistics" version="@VERSION@" profile="20.05">
+    <!-- Galaxy wrapper: renders the JSON parameter file defined in configfiles and runs proteomiqon-psmstatistics on the symlinked PSM input. -->
+    <description>utilizes semi supervised machine learning techniques to integrate search engine scores as well as the mentioned quality scores into one single consensus score.</description>
+    <macros>
+        <token name="@VERSION@">0.0.6</token>
+    </macros>
+    <requirements>
+        <requirement type="package" version="@VERSION@">proteomiqon-psmstatistics</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        #import re
+        ## Replace every character outside word characters, space, comma, dot, minus and plus with an underscore to get a safe link name.
+        #set basename = $re.sub(r'[^\w ,.\-+]','_', $psm.element_identifier)
+        #if $outputParamfile:
+            cat '$paramfile' >> '$out_paramfile' &&
+        #end if
+        ## NOTE(review): linking the output dataset assumes the CLI writes a .qpsm file named after the input into the -o directory; confirm.
+        ln -s '$psm' '${basename}.psm' &&
+        ln -s '$out_qpsm' '${basename}.qpsm' &&
+        proteomiqon-psmstatistics -i './${basename}.psm' -d '$peptideDB' -p '$paramfile' -o ./
+    ]]></command>
+    <configfiles>
+        <!-- JSON parameter file handed to the CLI via -p; the regex is emitted with json.dumps so backslashes and quotes stay valid JSON. -->
+        <configfile name="paramfile">
+            <![CDATA[
+            #import json
+            {
+                "Threshold":
+                {
+                    #if $ThresholdCond.ProcessingType == "Estimate"
+                    "Case":"Estimate",
+                    "Fields":
+                    [
+                        {
+                            "QValueThreshold" : $ThresholdCond.QValueThreshold,
+                            "PepValueThreshold" : $ThresholdCond.PepValueThreshold,
+                            "MaxIterations" : $ThresholdCond.MaxIterations,
+                            "MinimumIncreaseBetweenIterations" : $ThresholdCond.MinimumIncreaseBetweenIterations,
+                            "PepValueFittingMethod" : {
+                                "Case":"$ThresholdCond.PepValueFittingMethod"
+                            }
+                        }
+                    ]
+                    #else if $ThresholdCond.ProcessingType == "Fixed"
+                    "Case":"Fixed",
+                    "Fields":
+                    [
+                        {
+                            "SequestLike":$ThresholdCond.SequestLike,
+                            "Andromeda":$ThresholdCond.Andromeda
+                        }
+                    ]
+                    #end if
+                },
+                "ParseProteinIDRegexPattern":${json.dumps(str($ParseProteinIDRegexPattern))},
+                "KeepTemporaryFiles":false
+            }
+            ]]>
+        </configfile>
+    </configfiles>
+    <inputs>
+        <param name="psm" type="data" format="tabular" label="PSM" help="Specify list of peptide spectrum matches to be scored."/>
+        <param name="peptideDB" type="data" format="sqlite" label="Peptide database" help="Specify the peptide database."/>
+        <conditional name="ThresholdCond">
+            <param name="ProcessingType" type="select" label="Processing type" help="Specify how PSM thresholds should be estimated using semi supervised machine learning techniques (recommended) or if fixed score values should be used instead.">
+                <option value="Estimate" selected="true">Estimate</option>
+                <option value="Fixed">Fixed</option>
+            </param>
+            <when value="Estimate">
+                <param name="QValueThreshold" type="float" value="0.01" label="Q-Value threshold" />
+                <param name="PepValueThreshold" type="float" value="0.05" label="Pep-Value threshold" />
+                <param name="MaxIterations" type="integer" value="15" label="Max iterations" />
+                <param name="MinimumIncreaseBetweenIterations" type="float" value="0.005" label="Minimum increase between iterations" />
+                <param name="PepValueFittingMethod" type="select" label="Pep-Value fitting method">
+                    <option value="LinearSpline">Linear Spline</option>
+                    <option value="LogisticRegressionLogit" selected="true">Logistic Regression Logit</option>
+                </param>
+            </when>
+            <when value="Fixed">
+                <param name="SequestLike" type="float" value="5.0" label="Sequest like" />
+                <param name="Andromeda" type="float" value="40.0" label="Andromeda" />
+            </when>
+        </conditional>
+        <param name="ParseProteinIDRegexPattern" type="text" value="id" label="Parse protein ID regex pattern" help="FASTA headers often contain additional information besides the protein identifier; by specifying a regex pattern the tool can extract the protein IDs. If your FASTA headers are already cleaned you can leave this field empty.">
+            <sanitizer sanitize="false" />
+        </param>
+        <param name="outputParamfile" type="boolean" checked="false" label="Output parameter file"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="out_qpsm" />
+        <data format="json" name="out_paramfile">
+            <filter>outputParamfile</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1">
+            <param name="psm" value="sample.psm"/>
+            <param name="peptideDB" value="sample.db"/>
+            <param name="ParseProteinIDRegexPattern" value="id"/>
+            <param name="outputParamfile" value="false"/>
+            <conditional name="ThresholdCond">
+                <param name="ProcessingType" value="Estimate"/>
+                <param name="QValueThreshold" value="0.01"/>
+                <param name="PepValueThreshold" value="0.05"/>
+                <param name="MaxIterations" value="15"/>
+                <param name="MinimumIncreaseBetweenIterations" value="0.005"/>
+                <param name="PepValueFittingMethod" value="LinearSpline"/>
+            </conditional>
+        </test>
+        <test expect_num_outputs="2">
+            <param name="psm" value="sample.psm"/>
+            <param name="peptideDB" value="sample.db"/>
+            <param name="ParseProteinIDRegexPattern" value="id"/>
+            <param name="outputParamfile" value="true"/>
+            <conditional name="ThresholdCond">
+                <param name="ProcessingType" value="Estimate"/>
+                <param name="QValueThreshold" value="0.01"/>
+                <param name="PepValueThreshold" value="0.05"/>
+                <param name="MaxIterations" value="15"/>
+                <param name="MinimumIncreaseBetweenIterations" value="0.005"/>
+                <param name="PepValueFittingMethod" value="LinearSpline"/>
+            </conditional>
+            <output name="out_paramfile" file="result_1.json"/>
+        </test>
+        <test expect_num_outputs="2">
+            <param name="psm" value="sample.psm"/>
+            <param name="peptideDB" value="sample.db"/>
+            <param name="ParseProteinIDRegexPattern" value="id"/>
+            <param name="outputParamfile" value="true"/>
+            <conditional name="ThresholdCond">
+                <param name="ProcessingType" value="Estimate"/>
+                <param name="QValueThreshold" value="0.01"/>
+                <param name="PepValueThreshold" value="0.05"/>
+                <param name="MaxIterations" value="15"/>
+                <param name="MinimumIncreaseBetweenIterations" value="0.005"/>
+                <param name="PepValueFittingMethod" value="LogisticRegressionLogit"/>
+            </conditional>
+            <output name="out_paramfile" file="result_2.json"/>
+        </test>
+        <test expect_num_outputs="2">
+            <param name="psm" value="sample.psm"/>
+            <param name="peptideDB" value="sample.db"/>
+            <param name="ParseProteinIDRegexPattern" value="id"/>
+            <param name="outputParamfile" value="true"/>
+            <conditional name="ThresholdCond">
+                <param name="ProcessingType" value="Fixed"/>
+                <param name="SequestLike" value="5.0"/>
+                <param name="Andromeda" value="40.0"/>
+            </conditional>
+            <output name="out_paramfile" file="result_3.json"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+What It Does
+------------
+**Disclaimer** This tool needs a `peptide database <https://csbiology.github.io/ProteomIQon/tools/PeptideDB.html>`_ and `peptide spectrum matches <https://csbiology.github.io/ProteomIQon/tools/PeptideSpectrumMatching.html>`_.
+
+An established method to identify acquired MS/MS spectra is the comparison of each spectrum with peptides in a reference database.
+
+To measure the similarity of in silico generated spectra and measured MS/MS scans we use our own implementations of three established search engine scores: SEQUEST, Andromeda and XTandem.
+Additionally, we also record quality control parameters such as the mass difference between the precursor ion and the theoretically calculated mass or the uniqueness of each score in comparison to 'competing' peptides within the search space.
+The PSMStatistics tool utilizes semi supervised machine learning techniques to integrate search engine scores as well as the mentioned quality scores into one single consensus score.
+
+.. image:: SemiSupervisedScoring.png
+            :width: 768pt
+            :height: 345pt
+
+Since the search space is extended by so called decoys - reversed counterparts of peptides within the search space - we can estimate the distribution of 'true negatives' and calculate local (PEP values) and global (Q values) false discovery rates at each consensus score.
+The reported peptides at user defined local and global FDR cutoffs can then be used as inputs for any downstream analysis be it ProteinInference or PSMBasedQuantification.
+
+Further Reading
+---------------
+Additional information about the tool can be found in the `documentation <https://csbiology.github.io/ProteomIQon/tools/PSMStatistics.html>`_.
+    ]]></help>
+</tool>
Binary file static/images/SemiSupervisedScoring.png has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/result_1.json	Sun Jul 18 13:53:08 2021 +0000
@@ -0,0 +1,24 @@
+
+            
+            {
+                "Threshold":
+                {
+                    "Case":"Estimate",
+                    "Fields":
+                    [
+                        {
+                            "QValueThreshold" : 0.01,
+                            "PepValueThreshold" : 0.05,
+                            "MaxIterations" : 15,
+                            "MinimumIncreaseBetweenIterations" : 0.005,
+                            "PepValueFittingMethod" : {
+                                "Case":"LinearSpline"
+                            }
+                        }
+                    ]
+                },
+                "ParseProteinIDRegexPattern":"id",
+                "KeepTemporaryFiles":false
+            }
+            
+        
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/result_2.json	Sun Jul 18 13:53:08 2021 +0000
@@ -0,0 +1,24 @@
+
+            
+            {
+                "Threshold":
+                {
+                    "Case":"Estimate",
+                    "Fields":
+                    [
+                        {
+                            "QValueThreshold" : 0.01,
+                            "PepValueThreshold" : 0.05,
+                            "MaxIterations" : 15,
+                            "MinimumIncreaseBetweenIterations" : 0.005,
+                            "PepValueFittingMethod" : {
+                                "Case":"LogisticRegressionLogit"
+                            }
+                        }
+                    ]
+                },
+                "ParseProteinIDRegexPattern":"id",
+                "KeepTemporaryFiles":false
+            }
+            
+        
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/result_3.json	Sun Jul 18 13:53:08 2021 +0000
@@ -0,0 +1,19 @@
+
+            
+            {
+                "Threshold":
+                {
+                    "Case":"Fixed",
+                    "Fields":
+                    [
+                        {
+                            "SequestLike":5.0,
+                            "Andromeda":40.0
+                        }
+                    ]
+                },
+                "ParseProteinIDRegexPattern":"id",
+                "KeepTemporaryFiles":false
+            }
+            
+        
\ No newline at end of file
Binary file test-data/sample.db has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample.psm	Sun Jul 18 13:53:08 2021 +0000
@@ -0,0 +1,7 @@
+PSMId	GlobalMod	PepSequenceID	ModSequenceID	Label	ScanNr	ScanTime	Charge	PrecursorMZ	TheoMass	AbsDeltaMass	PeptideLength	MissCleavages	SequestScore	SequestNormDeltaBestToRest	SequestNormDeltaNext	AndroScore	AndroNormDeltaBestToRest	AndroNormDeltaNext	XtandemScore	XtandemNormDeltaBestToRest	XtandemNormDeltaNext	StringSequence
+sample=1-period=1-cycle=2033-experiment=4_0_2_0	1	4	8	-1	0	32.9949	2	383.2268582	764.4508194	0.01165603632	7	-1	2.780764976	0.6779896705	0.0	26.3150871	0.7657267969	0.0	20.14279733	0.5594493789	0.0	ILVGDIK
+sample=1-period=1-cycle=2033-experiment=4_0_2_0	1	4	8	1	0	32.9949	2	383.2268582	764.4508194	0.01165603632	7	-1	8.635639049	0.0	0.6779896705	112.3264921	0.0	0.7657267969	45.72186797	0.0	0.5594493789	ILVGDIK
+sample=1-period=1-cycle=2043-experiment=5_2_2_0	0	22	87	-1	2	33.36088333	2	399.242789	796.4806883	0.009663316474	7	-1	1.412423849	0.8192994029	0.0	0.0	1.0	0.0	7.323219744	0.7506817907	0.0	ALEVIPR
+sample=1-period=1-cycle=2043-experiment=5_2_2_0	0	22	87	1	2	33.36088333	2	399.242789	796.4806883	0.009663316474	7	-1	7.816376209	0.0	0.8192994029	74.99645487	0.0	1.0	29.37298389	0.0	0.7506817907	ALEVIPR
+sample=1-period=1-cycle=2038-experiment=7_1_2_0	1	22	88	-1	1	33.18005	2	404.2288055	806.4510372	0.007979227388	7	-1	1.589227453	0.7970537368	0.0	11.5896755	0.8973792939	0.0	18.85459139	0.6070363597	0.0	ALEVIPR
+sample=1-period=1-cycle=2038-experiment=7_1_2_0	1	22	88	1	1	33.18005	2	404.2288055	806.4510372	0.007979227388	7	-1	7.830779576	0.0	0.7970537368	112.937008	0.0	0.8973792939	47.98049859	0.0	0.6070363597	ALEVIPR