changeset 0:8e4fb95a319a draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteomiqon_proteininference commit 1b4c15d5c84c890663475a22cc6ff71bbc9aa90c"
author galaxyp
date Mon, 26 Jul 2021 13:34:26 +0000
parents
children 7729b9043b80
files proteomiqon_proteininference.xml static/images/ProteinInference.png test-data/result_1.json test-data/result_2.json test-data/result_3.json test-data/result_4.json test-data/result_5.json test-data/result_6.json test-data/result_7.json test-data/sample.db test-data/sample_1.qpsm test-data/sample_2.qpsm test-data/sample_3.qpsm
diffstat 13 files changed, 415 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/proteomiqon_proteininference.xml	Mon Jul 26 13:34:26 2021 +0000
@@ -0,0 +1,269 @@
+<tool id="proteomiqon_proteininference" name="ProteomIQon ProteinInference" version="@VERSION@" profile="20.05">
+    <description>
+        uses identified peptides to infer proteins explaining their presence in the sample.
+    </description>
+    <!-- @VERSION@ is the single source for both the tool version (on the tool element) and the package version pinned below. -->
+    <macros>
+        <token name="@VERSION@">0.0.7</token>
+    </macros>
+    <requirements>
+        <requirement type="package" version="@VERSION@">proteomiqon-proteininference</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        ## Stage every scored PSM dataset in the working directory as '<sanitized identifier>.qpsm';
+        ## the tool is pointed at that directory through -i './' below.
+        #import re
+        #if $PSMInputModeCond.PSMInputMode == "single"
+            ## Every character outside the whitelist (word chars, space, comma, dot, minus, plus) becomes '_'.
+            #set basename = $re.sub(r'[^\w ,.\-+]','_',$PSMInputModeCond.scoredPSMs.element_identifier)
+            ## Use the fully qualified name: scoredPSMs is declared inside the PSMInputModeCond conditional.
+            ln -s '$PSMInputModeCond.scoredPSMs' '${basename}.qpsm' &&
+        #elif $PSMInputModeCond.PSMInputMode == "multi"
+            ## Same staging as above, once per selected dataset.
+            #for $psmfile in $PSMInputModeCond.scoredPSMs
+                #set basename = $re.sub(r'[^\w ,.\-+]','_',$psmfile.element_identifier)
+                ln -s '$psmfile' '${basename}.qpsm' &&
+            #end for
+        #end if
+        ## Optionally expose the rendered parameter file as the out_paramfile output.
+        #if $outputParamfile:
+            cat '$paramfile' >> '$out_paramfile' &&
+        #end if
+        ## Results are written to './out', where the outputs section looks for *.prot files.
+        proteomiqon-proteininference -i './' -d '$peptideDB' -p '$paramfile' -o './out'
+    ]]>
+    </command>
+    <configfiles>
+        <!-- paramfile: JSON settings handed to the tool via -p and, on request, copied to out_paramfile.
+             The regex is emitted through json.dumps so that backslashes and quotes typed by the user
+             (the param is not sanitized) are escaped into a valid JSON string.
+             NOTE(review): the non-LogisticRegression branch keeps a trailing comma after "Case"; that is not
+             strict JSON, but test-data/result_1, 2, 6 and 7 pin this exact text, so template and fixtures
+             have to be changed together. -->
+        <configfile name="paramfile">
+            <![CDATA[
+            #import json
+            {
+                "ProteinIdentifierRegex": $json.dumps(str($ProteinIdentifierRegex)),
+                "Protein": {
+                  "Case": "${Protein}"
+                },
+                "Peptide": {
+                  "Case": "${Peptide}"
+                },
+                "GroupFiles": ${GroupFiles},
+                #if $GetQValueCond.GetQValue == "LogisticRegression"
+                "GetQValue": {
+                  "Case": "${GetQValueCond.GetQValue}",
+                  "Fields": [
+                    {
+                      "Case": "${GetQValueCond.LogisticRegressionType}"
+                    }
+                  ]
+                }
+                #else
+                "GetQValue": {
+                    "Case": "${GetQValueCond.GetQValue}",
+                }
+                #end if
+            }
+            ]]>
+        </configfile>
+    </configfiles>
+    <inputs>
+        <param name="peptideDB" type="data" format="sqlite" label="Peptide database" help="Specify the peptide data base."/>
+        <!-- The same parameter name (scoredPSMs) is used in both branches; only the multiplicity differs. -->
+        <conditional name="PSMInputModeCond">
+            <param name="PSMInputMode" type="select" label="PSM file input mode">
+                <option value="single">Single File</option>
+                <option value="multi">Multiple Files</option>
+            </param>
+            <when value="single">
+                <param name="scoredPSMs" type="data" format="tabular" label="Scored PSM file" help="Specify peptide identifications." />
+            </when>
+            <when value="multi">
+                <param name="scoredPSMs" type="data" format="tabular" label="Scored PSM files" help="Specify list of peptide identifications." multiple="true"/>
+            </when>
+        </conditional>
+        <!-- Sanitizing is disabled so that regex metacharacters reach the parameter file unchanged. -->
+        <param name="ProteinIdentifierRegex" type="text" value="id" label="Protein identifier regex pattern" help="FASTA headers often contain additional information besides the protein identifier; by specifying a regex pattern the tool can extract the protein IDs. If your FASTA headers are already cleaned you can leave this field empty.">
+            <sanitizer sanitize="false" />
+        </param>
+        <param name="Protein" type="select" label="Protein" help="Specify how protein groups are created. For details please refer to the description below.">
+            <option value="Minimal">Minimal</option>
+            <option value="Maximal" selected="true">Maximal</option>
+        </param>
+        <param name="Peptide" type="select" label="Peptide" help="Specify how peptides are used to infer protein groups. For details please refer to the description below.">
+            <option value="Minimal" selected="true">Minimal</option>
+            <option value="Maximal">Maximal</option>
+            <option value="MaximalInverse">MaximalInverse</option>
+        </param>
+        <param name="GroupFiles" type="boolean" checked="true" label="Group files" help="If checked, protein inference is carried out using peptide information from all files simultaneously."/>
+        <!-- Only the LogisticRegression method takes an extra setting; the other two branches are empty. -->
+        <conditional name="GetQValueCond">
+            <param name="GetQValue" type="select" label="Q-Value method" help="Specify if and how q-value calculation should be carried out.">
+                <option value="Storey" selected="true">Storey</option>
+                <option value="LogisticRegression">Logistic Regression</option>
+                <option value="NoQValue">NoQValue</option>
+            </param>
+            <when value="Storey"/>
+            <when value="LogisticRegression">
+                <param name="LogisticRegressionType" type="select" label="Logistic regression type">
+                    <option value="Conservative">Conservative</option>
+                    <option value="MAYU" selected="true">MAYU</option>
+                    <option value="DecoyTargetRatio">DecoyTargetRatio</option>
+                </param>
+            </when>
+            <when value="NoQValue"/>
+        </conditional>
+        <!-- Boolean defaults are declared with "checked"; this flag gates both the cat step in the command and the out_paramfile output. -->
+        <param name="outputParamfile" type="boolean" checked="false" label="Output parameter file" help="If checked, the JSON parameter file passed to the tool is provided as an additional output."/>
+    </inputs>
+    <outputs>
+        <!-- Files in the out directory whose names end in .prot are collected as tabular datasets;
+             assign_primary_output="true" lets a discovered file take the place of out_prot itself. -->
+        <data format="tabular" name="out_prot" >
+            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.prot" ext="tabular" directory="out" visible="true"  assign_primary_output="true" />
+        </data>
+        <!-- Only present when the outputParamfile checkbox is set; filled by the cat step in the command. -->
+        <data format="json" name="out_paramfile">
+            <filter>outputParamfile</filter>
+        </data>
+    </outputs>
+    <tests>
+        <!-- NOTE(review): no test asserts the content of out_prot; where a file is compared it is the rendered parameter file. -->
+        <!-- Test 1: single PSM file, Minimal/Minimal, q-value method left at its default; only the output count is checked. -->
+        <test expect_num_outputs="1">
+            <param name="peptideDB" value="sample.db"/>
+            <conditional name="PSMInputModeCond">
+                <param name="PSMInputMode" value="single"/>
+                <param name="scoredPSMs" value="sample_1.qpsm"/>
+            </conditional>
+            <param name="ProteinIdentifierRegex" value="id"/>
+            <param name="Protein" value="Minimal"/>
+            <param name="Peptide" value="Minimal"/>
+            <param name="GroupFiles" value="true"/>
+            <param name="outputParamfile" value="false"/>
+        </test>
+        <!-- Test 2: single PSM file, Maximal/Maximal, Storey; parameter file compared with result_1.json. -->
+        <test expect_num_outputs="2">
+            <param name="peptideDB" value="sample.db"/>
+            <conditional name="PSMInputModeCond">
+                <param name="PSMInputMode" value="single"/>
+                <param name="scoredPSMs" value="sample_1.qpsm"/>
+            </conditional>
+            <param name="ProteinIdentifierRegex" value="id"/>
+            <param name="Protein" value="Maximal"/>
+            <param name="Peptide" value="Maximal"/>
+            <param name="GroupFiles" value="true"/>
+            <conditional name="GetQValueCond">
+                <param name="GetQValue" value="Storey"/>
+            </conditional>
+            <param name="outputParamfile" value="true"/>
+            <output name="out_paramfile" file="result_1.json" />
+        </test>
+        <!-- Test 3: single PSM file, Minimal/MaximalInverse, NoQValue; parameter file compared with result_2.json. -->
+        <test expect_num_outputs="2">
+            <param name="peptideDB" value="sample.db"/>
+            <conditional name="PSMInputModeCond">
+                <param name="PSMInputMode" value="single"/>
+                <param name="scoredPSMs" value="sample_1.qpsm"/>
+            </conditional>
+            <param name="ProteinIdentifierRegex" value="id"/>
+            <param name="Protein" value="Minimal"/>
+            <param name="Peptide" value="MaximalInverse"/>
+            <param name="GroupFiles" value="true"/>
+            <conditional name="GetQValueCond">
+                <param name="GetQValue" value="NoQValue"/>
+            </conditional>
+            <param name="outputParamfile" value="true"/>
+            <output name="out_paramfile" file="result_2.json" />
+        </test>
+        <!-- Test 4: LogisticRegression with type Conservative; parameter file compared with result_3.json. -->
+        <test expect_num_outputs="2">
+            <param name="peptideDB" value="sample.db"/>
+            <conditional name="PSMInputModeCond">
+                <param name="PSMInputMode" value="single"/>
+                <param name="scoredPSMs" value="sample_1.qpsm"/>
+            </conditional>
+            <param name="ProteinIdentifierRegex" value="id"/>
+            <param name="Protein" value="Minimal"/>
+            <param name="Peptide" value="MaximalInverse"/>
+            <param name="GroupFiles" value="true"/>
+            <conditional name="GetQValueCond">
+                <param name="GetQValue" value="LogisticRegression"/>
+                <param name="LogisticRegressionType" value="Conservative"/>
+            </conditional>
+            <param name="outputParamfile" value="true"/>
+            <output name="out_paramfile" file="result_3.json" />
+        </test>
+        <!-- Test 5: LogisticRegression with type MAYU; parameter file compared with result_4.json. -->
+        <test expect_num_outputs="2">
+            <param name="peptideDB" value="sample.db"/>
+            <conditional name="PSMInputModeCond">
+                <param name="PSMInputMode" value="single"/>
+                <param name="scoredPSMs" value="sample_1.qpsm"/>
+            </conditional>
+            <param name="ProteinIdentifierRegex" value="id"/>
+            <param name="Protein" value="Minimal"/>
+            <param name="Peptide" value="MaximalInverse"/>
+            <param name="GroupFiles" value="true"/>
+            <conditional name="GetQValueCond">
+                <param name="GetQValue" value="LogisticRegression"/>
+                <param name="LogisticRegressionType" value="MAYU"/>
+            </conditional>
+            <param name="outputParamfile" value="true"/>
+            <output name="out_paramfile" file="result_4.json" />
+        </test>
+        <!-- Test 6: LogisticRegression with type DecoyTargetRatio; parameter file compared with result_5.json. -->
+        <test expect_num_outputs="2">
+            <param name="peptideDB" value="sample.db"/>
+            <conditional name="PSMInputModeCond">
+                <param name="PSMInputMode" value="single"/>
+                <param name="scoredPSMs" value="sample_1.qpsm"/>
+            </conditional>
+            <param name="ProteinIdentifierRegex" value="id"/>
+            <param name="Protein" value="Minimal"/>
+            <param name="Peptide" value="MaximalInverse"/>
+            <param name="GroupFiles" value="true"/>
+            <conditional name="GetQValueCond">
+                <param name="GetQValue" value="LogisticRegression"/>
+                <param name="LogisticRegressionType" value="DecoyTargetRatio"/>
+            </conditional>
+            <param name="outputParamfile" value="true"/>
+            <output name="out_paramfile" file="result_5.json" />
+        </test>
+        <!-- Test 7: three PSM files in multi mode with GroupFiles enabled; parameter file compared with result_6.json. -->
+        <test expect_num_outputs="2">
+            <param name="peptideDB" value="sample.db"/>
+            <conditional name="PSMInputModeCond">
+                <param name="PSMInputMode" value="multi"/>
+                <param name="scoredPSMs" value="sample_1.qpsm,sample_2.qpsm,sample_3.qpsm"/>
+            </conditional>
+            <param name="ProteinIdentifierRegex" value="id"/>
+            <param name="Protein" value="Maximal"/>
+            <param name="Peptide" value="Maximal"/>
+            <param name="GroupFiles" value="true"/>
+            <conditional name="GetQValueCond">
+                <param name="GetQValue" value="NoQValue"/>
+            </conditional>
+            <param name="outputParamfile" value="true"/>
+            <output name="out_paramfile" file="result_6.json" />
+        </test>
+        <!-- Test 8: same three files with GroupFiles disabled; parameter file compared with result_7.json. -->
+        <test expect_num_outputs="2">
+            <param name="peptideDB" value="sample.db"/>
+            <conditional name="PSMInputModeCond">
+                <param name="PSMInputMode" value="multi"/>
+                <param name="scoredPSMs" value="sample_1.qpsm,sample_2.qpsm,sample_3.qpsm"/>
+            </conditional>
+            <param name="ProteinIdentifierRegex" value="id"/>
+            <param name="Protein" value="Maximal"/>
+            <param name="Peptide" value="Maximal"/>
+            <param name="GroupFiles" value="false"/>
+            <conditional name="GetQValueCond">
+                <param name="GetQValue" value="NoQValue"/>
+            </conditional>
+            <param name="outputParamfile" value="true"/>
+            <output name="out_paramfile" file="result_7.json" />
+        </test>
+    </tests>
+    <help>
+    <![CDATA[
+What It Does
+------------
+**Disclaimer** This tool needs a `peptide database <https://csbiology.github.io/ProteomIQon/tools/PeptideDB.html>`_ and `peptide spectrum matches <https://csbiology.github.io/ProteomIQon/tools/PeptideSpectrumMatching.html>`_ which `passed FDR thresholds <https://csbiology.github.io/ProteomIQon/tools/PSMStatistics.html>`_.
+
+MS-based shotgun proteomics estimates protein abundances using a proxy: peptides. The process of 'Protein Inference' is concerned with the mapping of identified peptides to the proteins they putatively originated from. This process is not as straightforward as one might think at first glance, since the peptide-to-protein mapping is not necessarily a one-to-one relationship but in many cases a one-to-many relationship. This is due to the fact that many proteins share peptides with an identical sequence, e.g. two proteins originating from two different splice variants of the same gene.
+
+One way to cope with this problem is to introduce the concept of protein groups, which allow us to report the aggregation of all peptides which map to all isoforms of a gene independently from the peptides mapping uniquely to a single isoform. 
+While this approach has its merits it leaves room for fine tuning when implemented. 
+Let's say we have two proteins pA and pB which were both discovered by one peptide uniquely mapping to each of them and additionally by a third peptide, which maps to both of them: How do we report our findings? 
+We could report both proteins separately and as a protein group, we could only report the protein group, or we could report both proteins but not the protein group. 
+A problem of comparable complexity occurs when we think about peptides when calculating the abundances for the protein group pA;pB. 
+Do we use the peptides only once, or do we also use the peptides mapping uniquely to protein pA and pB? 
+Fortunately, the tool ProteinInference gives you the possibility to choose any of the described scenarios by tuning the parameters described below. 
+The following scheme gives an overview how parameter settings influence inferred protein groups:
+
+.. image:: $PATH_TO_IMAGES/ProteinInference.png
+            :width: 1048pt
+            :height: 358pt
+
+Moreover, we report each protein group with a so-called 'Peptide evidence class'. This metric gives an indication of how pure the peptide composition of a protein group is and lets us differentiate between protein groups that consist of isoforms of a splice variant and those that contain a rather arbitrary mix of proteins. 
+In order to determine these inter-protein relationships the user can optionally supply a gff3 file.
+
+Further Reading
+---------------
+Additional information about the tool can be found in the `documentation <https://csbiology.github.io/ProteomIQon/tools/ProteinInference.html>`_.  
+    ]]>
+    </help>
+</tool>
\ No newline at end of file
Binary file static/images/ProteinInference.png has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/result_1.json	Mon Jul 26 13:34:26 2021 +0000
@@ -0,0 +1,17 @@
+
+            
+            {
+                "ProteinIdentifierRegex": "id",
+                "Protein": {
+                  "Case": "Maximal"
+                },
+                "Peptide": {
+                  "Case": "Maximal"
+                },
+                "GroupFiles": true,
+                "GetQValue": {
+                    "Case": "Storey",
+                }
+            }
+            
+        
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/result_2.json	Mon Jul 26 13:34:26 2021 +0000
@@ -0,0 +1,17 @@
+
+            
+            {
+                "ProteinIdentifierRegex": "id",
+                "Protein": {
+                  "Case": "Minimal"
+                },
+                "Peptide": {
+                  "Case": "MaximalInverse"
+                },
+                "GroupFiles": true,
+                "GetQValue": {
+                    "Case": "NoQValue",
+                }
+            }
+            
+        
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/result_3.json	Mon Jul 26 13:34:26 2021 +0000
@@ -0,0 +1,22 @@
+
+            
+            {
+                "ProteinIdentifierRegex": "id",
+                "Protein": {
+                  "Case": "Minimal"
+                },
+                "Peptide": {
+                  "Case": "MaximalInverse"
+                },
+                "GroupFiles": true,
+                "GetQValue": {
+                  "Case": "LogisticRegression",
+                  "Fields": [
+                    {
+                      "Case": "Conservative"
+                    }
+                  ]
+                }
+            }
+            
+        
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/result_4.json	Mon Jul 26 13:34:26 2021 +0000
@@ -0,0 +1,22 @@
+
+            
+            {
+                "ProteinIdentifierRegex": "id",
+                "Protein": {
+                  "Case": "Minimal"
+                },
+                "Peptide": {
+                  "Case": "MaximalInverse"
+                },
+                "GroupFiles": true,
+                "GetQValue": {
+                  "Case": "LogisticRegression",
+                  "Fields": [
+                    {
+                      "Case": "MAYU"
+                    }
+                  ]
+                }
+            }
+            
+        
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/result_5.json	Mon Jul 26 13:34:26 2021 +0000
@@ -0,0 +1,22 @@
+
+            
+            {
+                "ProteinIdentifierRegex": "id",
+                "Protein": {
+                  "Case": "Minimal"
+                },
+                "Peptide": {
+                  "Case": "MaximalInverse"
+                },
+                "GroupFiles": true,
+                "GetQValue": {
+                  "Case": "LogisticRegression",
+                  "Fields": [
+                    {
+                      "Case": "DecoyTargetRatio"
+                    }
+                  ]
+                }
+            }
+            
+        
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/result_6.json	Mon Jul 26 13:34:26 2021 +0000
@@ -0,0 +1,17 @@
+
+            
+            {
+                "ProteinIdentifierRegex": "id",
+                "Protein": {
+                  "Case": "Maximal"
+                },
+                "Peptide": {
+                  "Case": "Maximal"
+                },
+                "GroupFiles": true,
+                "GetQValue": {
+                    "Case": "NoQValue",
+                }
+            }
+            
+        
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/result_7.json	Mon Jul 26 13:34:26 2021 +0000
@@ -0,0 +1,17 @@
+
+            
+            {
+                "ProteinIdentifierRegex": "id",
+                "Protein": {
+                  "Case": "Maximal"
+                },
+                "Peptide": {
+                  "Case": "Maximal"
+                },
+                "GroupFiles": false,
+                "GetQValue": {
+                    "Case": "NoQValue",
+                }
+            }
+            
+        
\ No newline at end of file
Binary file test-data/sample.db has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_1.qpsm	Mon Jul 26 13:34:26 2021 +0000
@@ -0,0 +1,4 @@
+PSMId	GlobalMod	PepSequenceID	ModSequenceID	Label	ScanNr	ScanTime	Charge	PrecursorMZ	TheoMass	AbsDeltaMass	PeptideLength	MissCleavages	SequestScore	SequestNormDeltaBestToRest	SequestNormDeltaNext	AndroScore	AndroNormDeltaBestToRest	AndroNormDeltaNext	XtandemScore	XtandemNormDeltaBestToRest	XtandemNormDeltaNext	ModelScore	QValue	PEPValue	StringSequence	ProteinNames
+sample=1 period=1 cycle=2033 experiment=4	1	4	8	1	0	32.9949	2	383.2268582	764.4508194	0.01165603632	7	0	8.635639049	0	0.6779896705	112.3264921	0	0.7657267969	45.72186797	0	0.5594493789	NaN	NaN	NaN	ILVGDIK	Cre02.g143307.t1.1
+sample=1 period=1 cycle=2043 experiment=5	0	22	87	1	2	33.36088333	2	399.242789	796.4806883	0.009663316474	7	0	7.816376209	0	0.8192994029	74.99645487	0	1	29.37298389	0	0.7506817907	NaN	NaN	NaN	ALEVIPR	Cre01.g026550.t1.1
+sample=1 period=1 cycle=2038 experiment=7	1	22	88	1	1	33.18005	2	404.2288055	806.4510372	0.007979227388	7	0	7.830779576	0	0.7970537368	112.937008	0	0.8973792939	47.98049859	0	0.6070363597	NaN	NaN	NaN	ALEVIPR	Cre01.g026550.t1.1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_2.qpsm	Mon Jul 26 13:34:26 2021 +0000
@@ -0,0 +1,4 @@
+PSMId	GlobalMod	PepSequenceID	ModSequenceID	Label	ScanNr	ScanTime	Charge	PrecursorMZ	TheoMass	AbsDeltaMass	PeptideLength	MissCleavages	SequestScore	SequestNormDeltaBestToRest	SequestNormDeltaNext	AndroScore	AndroNormDeltaBestToRest	AndroNormDeltaNext	XtandemScore	XtandemNormDeltaBestToRest	XtandemNormDeltaNext	ModelScore	QValue	PEPValue	StringSequence	ProteinNames
+sample=1 period=1 cycle=2033 experiment=4	1	4	8	1	0	32.9949	2	383.2268582	764.4508194	0.01165603632	7	0	8.635639049	0	0.6779896705	112.3264921	0	0.7657267969	45.72186797	0	0.5594493789	NaN	NaN	NaN	ILVGDIK	Cre02.g143307.t1.1
+sample=1 period=1 cycle=2043 experiment=5	0	22	87	1	2	33.36088333	2	399.242789	796.4806883	0.009663316474	7	0	7.816376209	0	0.8192994029	74.99645487	0	1	29.37298389	0	0.7506817907	NaN	NaN	NaN	ALEVIPR	Cre01.g026550.t1.1
+sample=1 period=1 cycle=2038 experiment=7	1	22	88	1	1	33.18005	2	404.2288055	806.4510372	0.007979227388	7	0	7.830779576	0	0.7970537368	112.937008	0	0.8973792939	47.98049859	0	0.6070363597	NaN	NaN	NaN	ALEVIPR	Cre01.g026550.t1.1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_3.qpsm	Mon Jul 26 13:34:26 2021 +0000
@@ -0,0 +1,4 @@
+PSMId	GlobalMod	PepSequenceID	ModSequenceID	Label	ScanNr	ScanTime	Charge	PrecursorMZ	TheoMass	AbsDeltaMass	PeptideLength	MissCleavages	SequestScore	SequestNormDeltaBestToRest	SequestNormDeltaNext	AndroScore	AndroNormDeltaBestToRest	AndroNormDeltaNext	XtandemScore	XtandemNormDeltaBestToRest	XtandemNormDeltaNext	ModelScore	QValue	PEPValue	StringSequence	ProteinNames
+sample=1 period=1 cycle=2033 experiment=4	1	4	8	1	0	32.9949	2	383.2268582	764.4508194	0.01165603632	7	0	8.635639049	0	0.6779896705	112.3264921	0	0.7657267969	45.72186797	0	0.5594493789	NaN	NaN	NaN	ILVGDIK	Cre02.g143307.t1.1
+sample=1 period=1 cycle=2043 experiment=5	0	22	87	1	2	33.36088333	2	399.242789	796.4806883	0.009663316474	7	0	7.816376209	0	0.8192994029	74.99645487	0	1	29.37298389	0	0.7506817907	NaN	NaN	NaN	ALEVIPR	Cre01.g026550.t1.1
+sample=1 period=1 cycle=2038 experiment=7	1	22	88	1	1	33.18005	2	404.2288055	806.4510372	0.007979227388	7	0	7.830779576	0	0.7970537368	112.937008	0	0.8973792939	47.98049859	0	0.6070363597	NaN	NaN	NaN	ALEVIPR	Cre01.g026550.t1.1