diff calisp.xml @ 1:867f17ede7f3 draft default tip

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/calisp commit 42e5dfeaa309e6ac17b4616314498a3b628272d2
author galaxyp
date Thu, 14 Sep 2023 12:49:19 +0000
parents 6d93529d19d4
children
line wrap: on
line diff
--- a/calisp.xml	Thu Jun 01 08:34:14 2023 +0000
+++ b/calisp.xml	Thu Sep 14 12:49:19 2023 +0000
@@ -1,7 +1,7 @@
 <tool id="calisp" name="calisp" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
     <description>Estimate isotopic composition of peptides from proteomics mass spectrometry data</description>
     <macros>
-        <token name="@TOOL_VERSION@">3.0.10</token>
+        <token name="@TOOL_VERSION@">3.0.13</token>
         <token name="@VERSION_SUFFIX@">0</token>
         <token name="@CALISP_REPO@">https://raw.githubusercontent.com/kinestetika/Calisp/208d495674e2b52fe56cf23457c833d1c2527242</token>
         <xml name="input_macro" tokens="multiple">
@@ -30,8 +30,28 @@
     --bin_delimiter '$bin_delimiter'
     --threads "\${GALAXY_SLOTS:-1}"
     --isotope $isotope
-    $compute_clumps &&
-'$__tool_directory__/feather2tsv.py' --calisp_output calisp-output/
+    $compute_clumps
+#if $isotope_abundance_matrix
+    --isotope_abundance_matrix '$isotope_abundance_matrix'
+#end if
+
+#if $isotope_abundance_matrix
+    && ISOTOPE_ABUNDANCE_MATRIX="$isotope_abundance_matrix"
+#else
+    && ISOTOPE_ABUNDANCE_MATRIX="\$(python -c 'import site; print(f"{site.getsitepackages()[0]}/calisp/isotope_matrix.txt")')"
+#end if
+
+    && python '$__tool_directory__/benchmarking.py'
+        --input calisp-output/
+        --isotope_abundance_matrix "\$ISOTOPE_ABUNDANCE_MATRIX"
+        --isotope $isotope
+#if $benchmark_cond.benchmark == 'yes'
+        --out_filtered '$filtered'
+        --out_summary '$summary'
+        #if $benchmark_cond.nominal_values
+            --nominal_values '$benchmark_cond.nominal_values'
+        #end if
+#end if
     ]]></command>
     <inputs>
         <param argument="--spectrum_file" type="data" multiple="false" format="mzml" label="Spectrum file"/>
@@ -59,18 +79,36 @@
             <option value="36S">36S</option>
         </param>
         <param argument="--compute_clumps" type="boolean" truevalue="--compute_clumps" falsevalue="" checked="false" label="Compute clumps" help="To compute clumpiness of carbon assimilation. Only use when samples are labeled tosaturation. Estimation of clumpiness takes much additional time." />
+        <param argument="--isotope_abundance_matrix" type="data" format="tabular" optional="true" label="Custom isotope abundance matrix" help="If not given the built in matrix will be used" />
+        <conditional name="benchmark_cond">
+            <param name="benchmark" type="select" label="Run benchmarking">
+                <option value="yes">Yes</option>
+                <option value="no">No</option>
+            </param>
+            <when value="yes">
+                <param name="nominal_values" type="data" format="tabular" optional="true" label="Nominal values" help="A table containing ms_run and their nominal value (1, 5, or 10)"/>
+            </when>
+            <when value="no"/>
+        </conditional>
     </inputs>
     <outputs>
         <collection name="output" type="list">
             <discover_datasets pattern="(?P&lt;designation&gt;.*)\.tsv" format="tabular" directory="calisp-output"/>
         </collection>
+        <data name="filtered" format="tabular" label="${tool.name} on ${on_string}: filtered">
+            <filter>benchmark_cond['benchmark'] == 'yes'</filter>
+        </data>
+        <data name="summary" format="tabular" label="${tool.name} on ${on_string}: peptide summary">
+            <filter>benchmark_cond['benchmark'] == 'yes'</filter>
+        </data>
     </outputs>
     <tests>
         <!-- TODO test data to large, avilable from here: https://github.com/kinestetika/Calisp/tree/master/test
-            if possible inlcude via location in the future
-        <test expect_num_outputs="1">
-            <param name="spectrum_file" value="calisp_test_data.mzML" ftype="mzml"/>
-            <param name="peptide_file" value="calisp_test_data_TargetPeptideSpectrumMatch.txt" ftype="tabular"/>
+            if possible include via location in the future -->
+        <!-- <test expect_num_outputs="3">
+            <param name="spectrum_file" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/test/calisp_test_data.mzML" ftype="mzml"/>
+            <param name="peptide_file" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/test/calisp_test_data_TargetPeptideSpectrumMatch.txt" ftype="tabular"/>
+            <param name="benchmark" value="true"/>
             <output_collection name="output" count="1">
                 <element name="calisp_test_data">
                     <assert_contents>
@@ -80,13 +118,74 @@
                         <has_text text="P13645"/>
                         <has_text text="NHEEEMKDLR"/>
                         <has_text text="Oxidation"/>
-                        <has_n_columns n="85"/>
+                        <has_n_columns n="84"/>
                         <has_n_lines n="24"/>
                     </assert_contents>
                 </element>
             </output_collection>
-        </test>
-    -->
+            <output name="filtered" ftype="tabular">
+                <assert_contents>
+                    <has_n_lines n="24"/>
+                    <has_n_columns n="87"/>
+                </assert_contents>
+            </output>
+            <output name="summary" value="summary.tsv" ftype="tabular"/>
+        </test> -->
+
+        <!-- same test, but with isotope abundance matrix supplied by the user
+             (using the same as the built in => same results)
+            
+            TODO: test will only work with 23.1 tool-utils package available -->
+        <!-- <test expect_num_outputs="3">
+            <param name="spectrum_file" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/test/calisp_test_data.mzML" ftype="mzml"/>
+            <param name="peptide_file" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/test/calisp_test_data_TargetPeptideSpectrumMatch.txt" ftype="tabular"/>
+            <param name="isotope_abundance_matrix" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/src/calisp/isotope_matrix.txt" ftype="tabular"/>
+            <param name="benchmark" value="true"/>
+            <output_collection name="output" count="1">
+                <element name="calisp_test_data">
+                    <assert_contents>
+                        <has_text text="experiment"/>
+                        <has_text text="MKH_260min_1800ng"/>
+                        <has_text text="HOMO"/>
+                        <has_text text="P13645"/>
+                        <has_text text="NHEEEMKDLR"/>
+                        <has_text text="Oxidation"/>
+                        <has_n_columns n="84"/>
+                        <has_n_lines n="24"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <output name="filtered" ftype="tabular">
+                <assert_contents>
+                    <has_n_lines n="24"/>
+                    <has_n_columns n="87"/>
+                </assert_contents>
+            </output>
+            <output name="summary" value="summary.tsv" ftype="tabular"/>
+        </test> -->
+
+        <!-- test output filters for no benchmarking -->
+        <!-- <test expect_num_outputs="1">
+            <param name="spectrum_file" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/test/calisp_test_data.mzML" ftype="mzml"/>
+            <param name="peptide_file" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/test/calisp_test_data_TargetPeptideSpectrumMatch.txt" ftype="tabular"/>
+            <conditional name="benchmark_cond">
+                <param name="benchmark" value="no"/>
+            </conditional>
+            <output_collection name="output" count="1">
+                <element name="calisp_test_data">
+                    <assert_contents>
+                        <has_text text="experiment"/>
+                        <has_text text="MKH_260min_1800ng"/>
+                        <has_text text="HOMO"/>
+                        <has_text text="P13645"/>
+                        <has_text text="NHEEEMKDLR"/>
+                        <has_text text="Oxidation"/>
+                        <has_n_columns n="84"/>
+                        <has_n_lines n="24"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test> -->
     </tests>
     <help><![CDATA[
 Calisp (Calgary approach to isotopes in proteomics) is a program that estimates
@@ -165,8 +264,8 @@
 pattern_peak_count                         # of peaks in the pattern
 pattern_median_peak_spacing                medium mass difference between a pattern's peaks
 spectrum_mass_irregularity                 a measure for the standard deviation in the mass difference between a pattern's peaks
-ratio_na                                   the estimated isotope ratio inferred from neutron abundance (sip experiments) 
-ratio_fft                                  the estimated isotope ratio inferred by the fft method (natural isotope abundances)
+ratio_na                                   the estimated isotope ratio (in percent) inferred from neutron abundance (sip experiments) 
+ratio_fft                                  the estimated isotope ratio (in percent) inferred by the fft method (natural isotope abundances)
 error_fft                                  the remaining error after fitting the pattern with fft
 error_clumpy                               the remaining error after fitting the pattern with the clumpy carbon method
 flag_peptide_contains_sulfur               true if peptide contains sulfur
@@ -183,11 +282,123 @@
 m0 - m19                                   the masses of the first 20 peaks of the pattern
 c1 - c6                                    contributions of clumps of 1-6 carbon to ratio_na. These are the outcomes of the clumpy carbon model. These results are only meaningful if the biomass was labeled to saturation.
 ========================================== ===================
+
+Benchmarking
+============
+
+If the user chooses to run the additional benchmarking script two additional
+outputs are created as follows.
+
+Load data:
+----------
+
+- Concatenate calisp result tables
+- add column ``delta_na`` = 1000 * ``ratio_na`` / (1/factor-2)
+- add column ``delta_fft`` = 1000 * ``ratio_fft`` / (1/factor-2)
+
+Filter data:
+------------
+
+Rows are removed for which any of the following criteria applies
+
+- flag_peak_at_minus_one_pos
+- flag_pattern_is_wobbly
+- flag_psm_has_low_confidence
+- flag_psm_is_ambiguous
+- flag_pattern_is_contaminated
+- flag_peptide_assigned_to_multiple_bins
+
+Furthermore in the ``peptide`` column the strings ``"Oxidation"``, ``"Carbamidomethyl"``,
+and text in brackets (i.e. ``[]``) preceded by any number of spaces
+is removed.
+
+Benchmarking:
+-------------
+
+Iterate through all combinations of unique peptides, proteins, and samples
+and output the following tabular information
+
+=================== ===========================
+Column              Content
+=================== ===========================
+file                The name of the mzML spectrum file comprising the peptide
+bin                 bin/mag ids, separated by commas. Calisp expects the protein ids in the psm file to consist of two parts, separated by a delimiter (_ by default). The first part is the bin/mag id, and the second part is the protein id
+%label              The label percentage (≠ 0 if labelled components used during experiments)
+ratio               The natural abundance ratio of the target element (C, H, N, O, S)
+peptide             The labeled peptides
+psm_mz              psm m over z
+n(patterns)         The number of times the same pattern has been repeated for the peptide
+mean intensity      The mean of the total intensity of the pattern
+ratio_NA median     The median of the estimated isotope ratio inferred from neutron abundance (sip experiments)
+N mean              The mean of the number of neutrons inferred from custom 'neutron' modifications
+ratio_NA SEM        The standard error of the mean of the estimated isotope ratio inferred from neutron abundance (sip experiments)
+ratio_FFT median    The median of the estimated isotope ratio inferred by the fft method (natural isotope abundances)
+ratio_FFT SEM       The standard error of the mean of the estimated isotope ratio inferred by the fft method (natural isotope abundances)
+False Positive      Any false positive indications
+=================== ===========================
+
+Mean, median, and standard error values are computed for
+all entries of this sample and that have the same peptide.
+
+**Isotope abundance matrix**:
+
+The isotope abundance matrix gives the background unlabeled fraction.
+The default matrix implemented in calisp is given here: 
+https://github.com/kinestetika/Calisp/blob/v@TOOL_VERSION@/src/calisp/isotope_matrix.txt. 
+Rows specify the atom of interest and the columns the isotope, i.e. 
+rows 1-5 correspond to C, N, O, H, S. For instance 
+13C is in the 2nd column of the 1st row and 14C in the 3rd column
+of the same row. 
+
+**Benchmarking without nominal values**:
+
+If no nominal values, i.e. percentages of labeled atoms, are given,
+nominal values of 0 are assumed.
+
+The values in the `ratio` column are computed as `background_isotope / background_unlabelled * 100`
+where `background_unlabelled` is taken from the isotope abundance matrix
+according to the chosen target isotope.
+Then `background_isotope` is given by `1 - background_unlabelled`
+
+All entries of the table are considered not false positive.
+
+**Benchmarking with nominal values**:
+
+The `%label` (the nominal value) of a sample is either 0 (the default),
+1, 5, or 10 and can be provided for each sample by a tabular dataset
+(column 1 should give the sample names and column 2 the nominal value).
+
+The `ratio = I / U * 100` is given by 
+`U = unlabeled_fraction * background_unlabelled` and
+`I = nominal_value / 100 + unlabeled_fraction * background_isotope`
+where 
+`unlabeled_fraction = 1 - nominal_value / 100`
+`background_isotope = 1 - background_unlabelled`
+and `background_unlabelled` is given by the isotope abundance matrix.
+
+A peptide is considered false positive if it's not a contaminant (at the moment only K12)
+and the median of `ratio_na` values for the same peptide and sample
+is greater than a threshold depending on the nominal value:
+
+"For false positive discovery rates we set the threshold at the
+isotope/unlabelled associated with 1/4 of a generation of labeling. The E.
+coli values (1.7, 4.2 and 7.1) are for 1 generation at 1, 5 and 10% label, and
+we take the background (1.07) into account as well."
+
+============= =========
+nominal value threshold
+============= =========
+1             `1.07 + (1.7 - 1.07) / 4`
+5             `1.07 + (4.2 - 1.07) / 4`
+10            `1.07 + (7.1 - 1.07) / 4`
+============= =========
+
+File an issue at https://github.com/galaxyproteomics/tools-galaxyp/issues if
+different contaminants or thresholds should be considered.
     ]]></help>
     <citations>
         <citation type="doi">10.1186/s40168-022-01454-1</citation>
         <citation type="doi">10.1073/pnas.1722325115</citation>
-        <citation type="doi">10.1101/2021.03.29.437612</citation>
         <citation type="doi">10.1093/bioinformatics/bty046</citation>
     </citations>
-</tool>
\ No newline at end of file
+</tool>