calisp: calisp.xml comparison

comparison calisp.xml @ 1:867f17ede7f3 draft default tip

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/calisp commit 42e5dfeaa309e6ac17b4616314498a3b628272d2

author	galaxyp
date	Thu, 14 Sep 2023 12:49:19 +0000
parents	6d93529d19d4
children

comparison

equal deleted inserted replaced

-:6d93529d19d4
+:867f17ede7f3
 <tool id="calisp" name="calisp" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
 <description>Estimate isotopic composition of peptides from proteomics mass spectrometry data</description>
 <macros>
-<token name="@TOOL_VERSION@">3.0.10</token>
+<token name="@TOOL_VERSION@">3.0.13</token>
 <token name="@VERSION_SUFFIX@">0</token>
 <token name="@CALISP_REPO@">https://raw.githubusercontent.com/kinestetika/Calisp/208d495674e2b52fe56cf23457c833d1c2527242</token>
 <xml name="input_macro" tokens="multiple">
 <!-- According to the readme, mzid input is not yet implemented -->
 </xml>
 --output_file calisp-output/
 --mass_accuracy $mass_accuracy
 --bin_delimiter '$bin_delimiter'
 --threads "\${GALAXY_SLOTS:-1}"
 --isotope $isotope
-$compute_clumps &&
+$compute_clumps
-'$__tool_directory__/feather2tsv.py' --calisp_output calisp-output/
+#if $isotope_abundance_matrix
+--isotope_abundance_matrix '$isotope_abundance_matrix'
+#end if
+#if $isotope_abundance_matrix
+&& ISOTOPE_ABUNDANCE_MATRIX="$isotope_abundance_matrix"
+#else
+&& ISOTOPE_ABUNDANCE_MATRIX="\$(python -c 'import site; print(f"{site.getsitepackages()[0]}/calisp/isotope_matrix.txt")')"
+#end if
+&& python '$__tool_directory__/benchmarking.py'
+--input calisp-output/
+--isotope_abundance_matrix "\$ISOTOPE_ABUNDANCE_MATRIX"
+--isotope $isotope
+#if $benchmark_cond.benchmark == 'yes'
+--out_filtered '$filtered'
+--out_summary '$summary'
+#if $benchmark_cond.nominal_values
+--nominal_values '$benchmark_cond.nominal_values'
+#end if
+#end if
 ]]></command>
 <inputs>
 <param argument="--spectrum_file" type="data" multiple="false" format="mzml" label="Spectrum file"/>
 <param argument="--peptide_file" type="data" multiple="false" format="tabular" label="Peptide file" help="Psm file" />
 <param argument="--mass_accuracy" type="float" value="10" label="Mass accuracy" help="The maximum mass difference between theoretical mass and experimental mass of a peptide" />
 <option value="33S">33S</option>
 <option value="34S">34S</option>
 <option value="36S">36S</option>
 </param>
 <param argument="--compute_clumps" type="boolean" truevalue="--compute_clumps" falsevalue="" checked="false" label="Compute clumps" help="To compute clumpiness of carbon assimilation. Only use when samples are labeled to saturation. Estimation of clumpiness takes much additional time." />
+<param argument="--isotope_abundance_matrix" type="data" format="tabular" optional="true" label="Custom isotope abundance matrix" help="If not given the built in matrix will be used" />
+<conditional name="benchmark_cond">
+<param name="benchmark" type="select" label="Run benchmarking">
+<option value="yes">Yes</option>
+<option value="no">No</option>
+</param>
+<when value="yes">
+<param name="nominal_values" type="data" format="tabular" optional="true" label="Nominal values" help="A table containing ms_run and their nominal value (1, 5, or 10)"/>
+</when>
+<when value="no"/>
+</conditional>
 </inputs>
 <outputs>
 <collection name="output" type="list">
 <discover_datasets pattern="(?P&lt;designation&gt;.*)\.tsv" format="tabular" directory="calisp-output"/>
 </collection>
+<data name="filtered" format="tabular" label="${tool.name} on ${on_string}: filtered">
+<filter>benchmark_cond['benchmark'] == 'yes'</filter>
+</data>
+<data name="summary" format="tabular" label="${tool.name} on ${on_string}: peptide summary">
+<filter>benchmark_cond['benchmark'] == 'yes'</filter>
+</data>
 </outputs>
 <tests>
 <!-- TODO test data too large, available from here: https://github.com/kinestetika/Calisp/tree/master/test
-if possible inlcude via location in the future
+if possible include via location in the future -->
-<test expect_num_outputs="1">
+<!-- <test expect_num_outputs="3">
-<param name="spectrum_file" value="calisp_test_data.mzML" ftype="mzml"/>
+<param name="spectrum_file" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/test/calisp_test_data.mzML" ftype="mzml"/>
-<param name="peptide_file" value="calisp_test_data_TargetPeptideSpectrumMatch.txt" ftype="tabular"/>
+<param name="peptide_file" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/test/calisp_test_data_TargetPeptideSpectrumMatch.txt" ftype="tabular"/>
+<param name="benchmark" value="yes"/>
 <output_collection name="output" count="1">
 <element name="calisp_test_data">
 <assert_contents>
 <has_text text="experiment"/>
 <has_text text="MKH_260min_1800ng"/>
 <has_text text="HOMO"/>
 <has_text text="P13645"/>
 <has_text text="NHEEEMKDLR"/>
 <has_text text="Oxidation"/>
-<has_n_columns n="85"/>
+<has_n_columns n="84"/>
 <has_n_lines n="24"/>
 </assert_contents>
 </element>
 </output_collection>
-</test>
+<output name="filtered" ftype="tabular">
--->
+<assert_contents>
+<has_n_lines n="24"/>
+<has_n_columns n="87"/>
+</assert_contents>
+</output>
+<output name="summary" value="summary.tsv" ftype="tabular"/>
+</test> -->
+<!-- same test, but with isotope abundance matrix supplied by the user
+(using the same as the built in => same results)
+TODO: test will only work with 23.1 tool-utils package available -->
+<!-- <test expect_num_outputs="3">
+<param name="spectrum_file" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/test/calisp_test_data.mzML" ftype="mzml"/>
+<param name="peptide_file" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/test/calisp_test_data_TargetPeptideSpectrumMatch.txt" ftype="tabular"/>
+<param name="isotope_abundance_matrix" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/src/calisp/isotope_matrix.txt" ftype="tabular"/>
+<param name="benchmark" value="yes"/>
+<output_collection name="output" count="1">
+<element name="calisp_test_data">
+<assert_contents>
+<has_text text="experiment"/>
+<has_text text="MKH_260min_1800ng"/>
+<has_text text="HOMO"/>
+<has_text text="P13645"/>
+<has_text text="NHEEEMKDLR"/>
+<has_text text="Oxidation"/>
+<has_n_columns n="84"/>
+<has_n_lines n="24"/>
+</assert_contents>
+</element>
+</output_collection>
+<output name="filtered" ftype="tabular">
+<assert_contents>
+<has_n_lines n="24"/>
+<has_n_columns n="87"/>
+</assert_contents>
+</output>
+<output name="summary" value="summary.tsv" ftype="tabular"/>
+</test> -->
+<!-- test output filters for no benchmarking -->
+<!-- <test expect_num_outputs="1">
+<param name="spectrum_file" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/test/calisp_test_data.mzML" ftype="mzml"/>
+<param name="peptide_file" location="https://raw.githubusercontent.com/kinestetika/Calisp/v@TOOL_VERSION@/test/calisp_test_data_TargetPeptideSpectrumMatch.txt" ftype="tabular"/>
+<conditional name="benchmark_cond">
+<param name="benchmark" value="no"/>
+</conditional>
+<output_collection name="output" count="1">
+<element name="calisp_test_data">
+<assert_contents>
+<has_text text="experiment"/>
+<has_text text="MKH_260min_1800ng"/>
+<has_text text="HOMO"/>
+<has_text text="P13645"/>
+<has_text text="NHEEEMKDLR"/>
+<has_text text="Oxidation"/>
+<has_n_columns n="84"/>
+<has_n_lines n="24"/>
+</assert_contents>
+</element>
+</output_collection>
+</test> -->
 </tests>
 <help><![CDATA[
 Calisp (Calgary approach to isotopes in proteomics) is a program that estimates
 isotopic composition (e.g. 13C/12C, delta13C, 15N/14N etc) of peptides from
 proteomics mass spectrometry data. Input data consist of mzML files and files
 pattern_precursor_id                       id of the ms1 spectrum that was the source of the pattern
 pattern_total_intensity                    total intensity of the pattern
 pattern_peak_count                         # of peaks in the pattern
 pattern_median_peak_spacing                median mass difference between a pattern's peaks
 spectrum_mass_irregularity                 a measure for the standard deviation in the mass difference between a pattern's peaks
-ratio_na                                   the estimated isotope ratio inferred from neutron abundance (sip experiments)
+ratio_na                                   the estimated isotope ratio (in percent) inferred from neutron abundance (sip experiments)
-ratio_fft                                  the estimated isotope ratio inferred by the fft method (natural isotope abundances)
+ratio_fft                                  the estimated isotope ratio (in percent) inferred by the fft method (natural isotope abundances)
 error_fft                                  the remaining error after fitting the pattern with fft
 error_clumpy                               the remaining error after fitting the pattern with the clumpy carbon method
 flag_peptide_contains_sulfur               true if peptide contains sulfur
 flag_peptide_has_modifications             true if peptide has modifications
 flag_peptide_assigned_to_multiple_bins     true if peptide is associated with multiple proteins from different bins/mags
 flag_peak_at_minus_one_pos                 true if a peak was detected immediately before the monoisotopic peak, could indicate overlap with another pattern
 i0 - i19                                   the intensities of the first 20 peaks of the pattern
 m0 - m19                                   the masses of the first 20 peaks of the pattern
 c1 - c6                                    contributions of clumps of 1-6 carbon to ratio_na. These are the outcomes of the clumpy carbon model. These results are only meaningful if the biomass was labeled to saturation.
 ========================================== ===================
+Benchmarking
+============
+If the user chooses to run the additional benchmarking script two additional
+outputs are created as follows.
+Load data:
+----------
+- Concatenate calisp result tables
+- add column ``delta_na`` = 1000 * ``ratio_na`` / (1/factor-2)
+- add column ``delta_fft`` = 1000 * ``ratio_fft`` / (1/factor-2)
+Filter data:
+------------
+Rows are removed for which any of the following criteria applies
+- flag_peak_at_minus_one_pos
+- flag_pattern_is_wobbly
+- flag_psm_has_low_confidence
+- flag_psm_is_ambiguous
+- flag_pattern_is_contaminated
+- flag_peptide_assigned_to_multiple_bins
+Furthermore in the ``peptide`` column the strings ``"Oxidation"``, ``"Carbamidomethyl"``,
+and text in brackets (i.e. ``[]``) preceded by any number of spaces
+is removed.
+Benchmarking:
+-------------
+Iterate through all combinations of unique peptides, proteins, and samples
+and output the following tabular information
+=================== ===========================
+Column              Content
+=================== ===========================
+file                The name of the mzML spectrum file comprising the peptide
+bin                 bin/mag ids, separated by commas. Calisp expects the protein ids in the psm file to consist of two parts, separated by a delimiter (_ by default). The first part is the bin/mag id, and the second part is the protein id
+%label              The label percentage (≠ 0 if labelled components used during experiments)
+ratio               The natural abundance ratio of the target element (C, H, N, O, S)
+peptide             The labeled peptides
+psm_mz              psm m over z
+n(patterns)         The number of times the same pattern has been repeated for the peptide
+mean intensity      The mean of the total intensity of the pattern
+ratio_NA median     The median of the estimated isotope ratio inferred from neutron abundance (sip experiments)
+N mean              The mean of the number of neutrons inferred from custom 'neutron' modifications
+ratio_NA SEM        The standard error of the mean of the estimated isotope ratio inferred from neutron abundance (sip experiments)
+ratio_FFT median    The median of the estimated isotope ratio inferred by the fft method (natural isotope abundances)
+ratio_FFT SEM       The standard error of the mean of the estimated isotope ratio inferred by the fft method (natural isotope abundances)
+False Positive      Any false positive indications
+=================== ===========================
+Mean, median, and standard error values are computed for
+all entries of this sample and that have the same peptide.
+**Isotope abundance matrix**:
+The isotope abundance matrix gives the background unlabeled fraction.
+The default matrix implemented in calisp is given here:
+https://github.com/kinestetika/Calisp/blob/v@TOOL_VERSION@/src/calisp/isotope_matrix.txt.
+Rows specify the atom of interest and the columns the isotope, i.e.
+rows 1-5 correspond to C, N, O, H, S. For instance
+13C is in the 2nd column of the 1st row and 14C in the 3rd column
+of the same row.
+**Benchmarking without nominal values**:
+If no nominal values, i.e. percentage of labeled atoms are given,
+nominal values of 0 are assumed.
+The values in the `ratio` column are computed as `background_isotope / background_unlabelled * 100`
+where `background_unlabelled` is taken from the isotope abundance matrix
+according to the chosen target isotope.
+Then `background_isotope` is given by `1 - background_unlabelled`
+All entries of the table are considered not false positive.
+**Benchmarking with nominal values**:
+The `%label` (the nominal value) of a sample is either 0 (the default),
+1, 5, or 10 and can be provided for each sample by a tabular dataset
+(column 1 should give the sample names and column 2 the nominal value).
+The `ratio = I / U * 100` is given by
+`U = unlabeled_fraction * background_unlabelled` and
+`I = nominal_value / 100 + unlabeled_fraction * background_isotope`
+where
+`unlabeled_fraction = 1 - nominal_value / 100`
+`background_isotope = 1 - background_unlabelled`
+and `background_unlabelled` is given by the isotope abundance matrix.
+A peptide is considered false positive if it's not a contaminant (at the moment only K12)
+and the median of `ratio_na` values for the same peptide and sample
+is greater than a threshold depending on the nominal value:
+"For false positive discovery rates we set the threshold at the
+isotope/unlabelled associated with 1/4 of a generation of labeling. The E.
+coli values (1.7, 4.2 and 7.1) are for 1 generation at 1, 5 and 10% label, and
+we take the background (1.07) into account as well."
+============= =========
+nominal value threshold
+============= =========
+1             `1.07 + (1.7 - 1.07) / 4`
+5             `1.07 + (4.2 - 1.07) / 4`
+10            `1.07 + (7.1 - 1.07) / 4`
+============= =========
+File an issue at https://github.com/galaxyproteomics/tools-galaxyp/issues if
+different contaminants or thresholds should be considered.
 ]]></help>
 <citations>
 <citation type="doi">10.1186/s40168-022-01454-1</citation>
 <citation type="doi">10.1073/pnas.1722325115</citation>
-<citation type="doi">10.1101/2021.03.29.437612</citation>
 <citation type="doi">10.1093/bioinformatics/bty046</citation>
 </citations>
 </tool>

Mercurial > repos > galaxyp > calisp

comparison calisp.xml @ 1:867f17ede7f3 draft default tip