changeset 0:6d93529d19d4 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/calisp commit 060699366b6dd19ad6c3ef3f332f63cc55d75dce
author galaxyp
date Thu, 01 Jun 2023 08:34:14 +0000
parents
children 867f17ede7f3
files calisp.xml feather2tsv.py
diffstat 2 files changed, 228 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/calisp.xml	Thu Jun 01 08:34:14 2023 +0000
@@ -0,0 +1,193 @@
+<tool id="calisp" name="calisp" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
+    <description>Estimate isotopic composition of peptides from proteomics mass spectrometry data</description>
+    <macros>
+        <token name="@TOOL_VERSION@">3.0.10</token>
+        <token name="@VERSION_SUFFIX@">0</token>
+        <token name="@CALISP_REPO@">https://raw.githubusercontent.com/kinestetika/Calisp/208d495674e2b52fe56cf23457c833d1c2527242</token>
+        <xml name="input_macro" tokens="multiple">
+            <!-- According to readme mzid input is not yet implented -->
+        </xml>
+    </macros>
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">calisp</requirement>
+    </requirements>
+    <command detect_errors="aggressive"><![CDATA[
+#import re
+
+mkdir -p spectra &&
+#set escaped_specs = re.sub('[^\w\-\.,:]', '_', str($spectrum_file.element_identifier))
+ln -s '$spectrum_file' spectra/'$escaped_specs' &&
+
+mkdir -p psms &&
+#set escaped_peps = re.sub('[^\w\-\.,:]', '_', str($peptide_file.element_identifier))
+ln -s '$peptide_file' psms/'$escaped_peps' &&
+
+calisp 
+    --spectrum_file spectra/
+    --peptide_file psms/
+    --output_file calisp-output/
+    --mass_accuracy $mass_accuracy
+    --bin_delimiter '$bin_delimiter'
+    --threads "\${GALAXY_SLOTS:-1}"
+    --isotope $isotope
+    $compute_clumps &&
+'$__tool_directory__/feather2tsv.py' --calisp_output calisp-output/
+    ]]></command>
+    <inputs>
+        <param argument="--spectrum_file" type="data" multiple="false" format="mzml" label="Spectrum file"/>
+        <param argument="--peptide_file" type="data" multiple="false" format="tabular" label="Peptide file" help="Psm file" />
+        <param argument="--mass_accuracy" type="float" value="10" label="Mass accuracy" help="The maximum mass difference between theoretical mass and experimental mass of a peptide" />
+        <param argument="--bin_delimiter" type="text" value="_" label="Bin delimiter" help="For metagenomic data, the delimiter that separates the bin ID from the protein ID (default: &quot;_&quot;). Use &quot;-&quot; to ignore bins ID entirely.">
+           <sanitizer invalid_char="">
+                <valid initial="string.ascii_letters,string.digits">
+                    <add value="_" />
+                    <add value="-" />
+                    <add value=":" />
+                </valid>
+            </sanitizer> 
+        </param>
+        <param argument="--isotope" type="select" label="Target isotope">
+            <option value="13C" selected="true">13C</option>
+            <option value="14C">14C</option>
+            <option value="15N">15N</option>
+            <option value="17O">17O</option>
+            <option value="18O">18O</option>
+            <option value="2H">2H</option>
+            <option value="3H">3H</option>
+            <option value="33S">33S</option>
+            <option value="34S">34S</option>
+            <option value="36S">36S</option>
+        </param>
+        <param argument="--compute_clumps" type="boolean" truevalue="--compute_clumps" falsevalue="" checked="false" label="Compute clumps" help="To compute clumpiness of carbon assimilation. Only use when samples are labeled tosaturation. Estimation of clumpiness takes much additional time." />
+    </inputs>
+    <outputs>
+        <collection name="output" type="list">
+            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.tsv" format="tabular" directory="calisp-output"/>
+        </collection>
+    </outputs>
+    <tests>
+        <!-- TODO test data to large, avilable from here: https://github.com/kinestetika/Calisp/tree/master/test
+            if possible inlcude via location in the future
+        <test expect_num_outputs="1">
+            <param name="spectrum_file" value="calisp_test_data.mzML" ftype="mzml"/>
+            <param name="peptide_file" value="calisp_test_data_TargetPeptideSpectrumMatch.txt" ftype="tabular"/>
+            <output_collection name="output" count="1">
+                <element name="calisp_test_data">
+                    <assert_contents>
+                        <has_text text="experiment"/>
+                        <has_text text="MKH_260min_1800ng"/>
+                        <has_text text="HOMO"/>
+                        <has_text text="P13645"/>
+                        <has_text text="NHEEEMKDLR"/>
+                        <has_text text="Oxidation"/>
+                        <has_n_columns n="85"/>
+                        <has_n_lines n="24"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+    -->
+    </tests>
+    <help><![CDATA[
+Calisp (Calgary approach to isotopes in proteomics) is a program that estimates
+isotopic composition (e.g. 13C/12C, delta13C, 15N/14N etc) of peptides from
+proteomics mass spectrometry data. Input data consist of mzML files and files
+with peptide spectrum matches.
+
+Calisp was originally developed in Java. This Galaxy tool uses the python 
+reimplementation https://github.com/kinestetika/Calisp.
+Note that, in contrast to the Java version the python reimplementation does
+not use ``mcl`` .
+Compared to Java versions of calisp, the workflow has been simplified.
+Calisp does not filter out any isotopic patterns, or adds up isotopic
+patterns to reduce noise - like the Java version does. It simply estimates the
+ratio for the target isotopes (e.g. 13C/12C) for every isotopic pattern it can
+subsample. It estimates this ratio based on neutron abundance and using fast
+fourier transforms. The former applies to stable isotope probing experiments.
+The latter applies to natural abundances, or to isotope probing experiments with
+very little added label (e.g. using substrates with <1% additional 13C). The
+motivation for omitting filtering is that keeping all subsampled isotopic
+patterns, including bad ones, will enable training of machine learning
+classifiers. Also, because it was shown that the median provides better
+estimates for species in microbial communities than the mean, adding up isotopic
+patterns to improve precision has lost its purpose. There is more power (and
+sensitivity) in numbers.
+
+Because no data are filtered out and no isotopic patterns get added up,
+calisp analyzes at least ten times as many isotopic patterns compared to the
+Java version. That means calisp.py is about ten times slower, it takes about
+5-10 min per .mzML file on a Desktop computer. For natural
+abundance data, it works well to only use those spectra that have a FFT fitting
+error ("error_fft") of less than 0.001. Note that this threshold is less
+stringent then thew one used by the java program.
+
+Input
+=====
+
+Calisp needs two inputs: a spectra file in mzML format and tabular peptipe file (PSM).
+The PSM file contains a column "Spectrum File" that links the peptides to the
+original spectra files. The mzML files are identified by the run id
+information stored in the mzML files or the file name. 
+In order to make the association via the file name work in Galaxy one can either
+
+- use collections where the element identifiers are equal to the data in the column
+- make sure that dataset names are equal to the data in this column
+
+Output table
+============
+
+Each row contains one isotopic pattern, defined by the following columns:
+
+========================================== ===================
+Header name                                Content
+========================================== ===================
+experiment                                 filename of the peptide spectrum match (psm) file
+ms_run                                     filename of the .mzml file
+bins                                       bin/mag ids, separated by commas. Calisp expects the protein ids in the psm file to consist of two parts, separated by a delimiter (_ by default). The first part is the bin/mag id, the second part the protein id
+proteins                                   the ids of the proteins associated with the pattern (without the bin id)
+peptide                                    the aminoacid sequence of the peptide
+peptide_mass                               the mass of the peptide
+C                                          # of carbon atoms in the peptide
+N                                          # of nitrogen atoms in the peptide
+O                                          # of oxygen atoms in the peptide
+H                                          # of hydrogen atoms in the peptide
+S                                          # of sulfur atoms in the peptide
+psm_id                                     psm id
+psm_mz                                     psm m over z
+psm_charge                                 psm charge
+psm_neutrons                               number of neutrons inferred from custom 'neutron' modifications 
+psm_rank                                   rank of the psm
+psm_precursor_id                           id of the ms1 spectrum that was the source of the psm 
+psm_precursor_mz                           mass over charge of the precursor of the psm
+pattern_charge                             charge of the pattern
+pattern_precursor_id                       id of the ms1 spectrum that was the source of the pattern
+pattern_total_intensity                    total intensity of the pattern
+pattern_peak_count                         # of peaks in the pattern
+pattern_median_peak_spacing                medium mass difference between a pattern's peaks
+spectrum_mass_irregularity                 a measure for the standard deviation in the mass difference between a pattern's peaks
+ratio_na                                   the estimated isotope ratio inferred from neutron abundance (sip experiments) 
+ratio_fft                                  the estimated isotope ratio inferred by the fft method (natural isotope abundances)
+error_fft                                  the remaining error after fitting the pattern with fft
+error_clumpy                               the remaining error after fitting the pattern with the clumpy carbon method
+flag_peptide_contains_sulfur               true if peptide contains sulfur
+flag_peptide_has_modifications             true if peptide has no modifications
+flag_peptide_assigned_to_multiple_bins     true if peptide is associated with multiple proteins from different bins/mags
+flag_peptide_assigned_to_multiple_proteins true if peptide is associated with multiple proteins
+flag_peptide_mass_and_elements_undefined   true if peptide has unknown mass and elemental composition
+flag_psm_has_low_confidence                true if psm was flagged as having low confidence (peptide identity uncertain)
+flag_psm_is_ambiguous                      true if psm could not be assigned with certainty
+flag_pattern_is_contaminated               true if multiple patterns have one or more shared peaks
+flag_pattern_is_wobbly                     true if pattern_median_peak_spacing exceeds a treshold
+flag_peak_at_minus_one_pos                 true if a peak was detected immediately before the monoisotopic peak, could indicate overlap with another pattern
+i0 - i19                                   the intensities of the first 20 peaks of the pattern  
+m0 - m19                                   the masses of the first 20 peaks of the pattern
+c1 - c6                                    contributions of clumps of 1-6 carbon to ratio_na. These are the outcomes of the clumpy carbon model. These results are only meaningful if the biomass was labeled to saturation.
+========================================== ===================
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1186/s40168-022-01454-1</citation>
+        <citation type="doi">10.1073/pnas.1722325115</citation>
+        <citation type="doi">10.1101/2021.03.29.437612</citation>
+        <citation type="doi">10.1093/bioinformatics/bty046</citation>
+    </citations>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/feather2tsv.py	Thu Jun 01 08:34:14 2023 +0000
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+"""
+based on https://github.com/kinestetika/Calisp/blob/master/benchmarking/sip%20benchmarking.ipynb
+"""
+
+import argparse
+import os
+
+import pandas as pd
+
+
+def load_calisp_data(filename):
+
+    # (1) load data
+    if os.path.isdir(filename):
+        file_data = []
+        for f in os.listdir(filename):
+            if not f.endswith(".feather"):
+                continue
+            f = os.path.join(filename, f)
+            file_data.append(pd.read_feather(f))
+            base, _ = os.path.splitext(f)
+            file_data[-1].to_csv(f"{base}.tsv", sep="\t")
+        data = pd.concat(file_data)
+    else:
+        data = pd.read_feather(filename)
+        base, _ = os.path.splitext(filename)
+        data.to_csv(f"{base}.tsv", sep="\t")
+
+
+parser = argparse.ArgumentParser(description='feather2tsv')
+parser.add_argument('--calisp_output', required=True, help='feather file')
+args = parser.parse_args()
+
+data = load_calisp_data(args.calisp_output)