Mercurial > repos > galaxyp > calisp
view calisp.xml @ 0:6d93529d19d4 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/calisp commit 060699366b6dd19ad6c3ef3f332f63cc55d75dce
author | galaxyp |
---|---|
date | Thu, 01 Jun 2023 08:34:14 +0000 |
parents | |
children | 867f17ede7f3 |
line wrap: on
line source
<tool id="calisp" name="calisp" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
    <description>Estimate isotopic composition of peptides from proteomics mass spectrometry data</description>
    <macros>
        <!-- The wrapper version is assembled from these two tokens (see the version attribute of the tool element). -->
        <token name="@TOOL_VERSION@">3.0.10</token>
        <token name="@VERSION_SUFFIX@">0</token>
        <token name="@CALISP_REPO@">https://raw.githubusercontent.com/kinestetika/Calisp/208d495674e2b52fe56cf23457c833d1c2527242</token>
        <xml name="input_macro" tokens="multiple">
            <!-- According to the readme, mzid input is not yet implemented -->
        </xml>
    </macros>
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">calisp</requirement>
    </requirements>
    <command detect_errors="aggressive"><![CDATA[
        #import re

        ## calisp is pointed at directories (spectra/ and psms/), so each input dataset
        ## is symlinked into its directory under a name derived from its element
        ## identifier. Every character outside [A-Za-z0-9_], '-', '.', ',' and ':' is
        ## replaced by '_' to obtain a safe file name.
        mkdir -p spectra &&
        #set escaped_specs = re.sub('[^\w\-\.,:]', '_', str($spectrum_file.element_identifier))
        ln -s '$spectrum_file' spectra/'$escaped_specs' &&

        mkdir -p psms &&
        #set escaped_peps = re.sub('[^\w\-\.,:]', '_', str($peptide_file.element_identifier))
        ln -s '$peptide_file' psms/'$escaped_peps' &&

        calisp
            --spectrum_file spectra/
            --peptide_file psms/
            --output_file calisp-output/
            --mass_accuracy $mass_accuracy
            --bin_delimiter '$bin_delimiter'
            --threads "\${GALAXY_SLOTS:-1}"
            --isotope $isotope
            $compute_clumps
        &&

        ## NOTE(review): presumably converts calisp's output in calisp-output/ into the
        ## .tsv files that the output collection discovers; confirm against feather2tsv.py.
        '$__tool_directory__/feather2tsv.py' --calisp_output calisp-output/
    ]]></command>
    <inputs>
        <param argument="--spectrum_file" type="data" multiple="false" format="mzml" label="Spectrum file"/>
        <param argument="--peptide_file" type="data" multiple="false" format="tabular" label="Peptide file" help="Psm file"/>
        <param argument="--mass_accuracy" type="float" value="10" label="Mass accuracy" help="The maximum mass difference between theoretical mass and experimental mass of a peptide"/>
        <param argument="--bin_delimiter" type="text" value="_" label="Bin delimiter" help="For metagenomic data, the delimiter that separates the bin ID from the protein ID (default: &quot;_&quot;). Use &quot;-&quot; to ignore bin IDs entirely.">
            <!-- Only letters, digits, "_", "-" and ":" are accepted; any other character is removed. -->
            <sanitizer invalid_char="">
                <valid initial="string.ascii_letters,string.digits">
                    <add value="_"/>
                    <add value="-"/>
                    <add value=":"/>
                </valid>
            </sanitizer>
        </param>
        <param argument="--isotope" type="select" label="Target isotope">
            <option value="13C" selected="true">13C</option>
            <option value="14C">14C</option>
            <option value="15N">15N</option>
            <option value="17O">17O</option>
            <option value="18O">18O</option>
            <option value="2H">2H</option>
            <option value="3H">3H</option>
            <option value="33S">33S</option>
            <option value="34S">34S</option>
            <option value="36S">36S</option>
        </param>
        <param argument="--compute_clumps" type="boolean" truevalue="--compute_clumps" falsevalue="" checked="false" label="Compute clumps" help="To compute clumpiness of carbon assimilation. Only use when samples are labeled to saturation. Estimation of clumpiness takes much additional time."/>
    </inputs>
    <outputs>
        <!-- One list element per .tsv file found in calisp-output; the file name without the extension becomes the element identifier. -->
        <collection name="output" type="list">
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.tsv" format="tabular" directory="calisp-output"/>
        </collection>
    </outputs>
    <tests>
        <!-- TODO test data too large, available from here: https://github.com/kinestetika/Calisp/tree/master/test
             if possible include via location in the future
        <test expect_num_outputs="1">
            <param name="spectrum_file" value="calisp_test_data.mzML" ftype="mzml"/>
            <param name="peptide_file" value="calisp_test_data_TargetPeptideSpectrumMatch.txt" ftype="tabular"/>
            <output_collection name="output" count="1">
                <element name="calisp_test_data">
                    <assert_contents>
                        <has_text text="experiment"/>
                        <has_text text="MKH_260min_1800ng"/>
                        <has_text text="HOMO"/>
                        <has_text text="P13645"/>
                        <has_text text="NHEEEMKDLR"/>
                        <has_text text="Oxidation"/>
                        <has_n_columns n="85"/>
                        <has_n_lines n="24"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        -->
    </tests>
    <help><![CDATA[
Calisp (Calgary approach to isotopes in proteomics) is a program that estimates
isotopic composition (e.g. 13C/12C, delta13C, 15N/14N etc) of peptides from
proteomics mass spectrometry data. Input data consist of mzML files and files
with peptide spectrum matches.

Calisp was originally developed in Java. This Galaxy tool uses the python
reimplementation https://github.com/kinestetika/Calisp. Note that, in contrast
to the Java version, the python reimplementation does not use ``mcl``.

Compared to Java versions of calisp, the workflow has been simplified. Calisp
does not filter out any isotopic patterns, nor does it add up isotopic patterns
to reduce noise, as the Java version does. It simply estimates the ratio for the
target isotopes (e.g. 13C/12C) for every isotopic pattern it can subsample. It
estimates this ratio based on neutron abundance and using fast fourier
transforms. The former applies to stable isotope probing experiments. The latter
applies to natural abundances, or to isotope probing experiments with very
little added label (e.g. using substrates with <1% additional 13C).

The motivation for omitting filtering is that keeping all subsampled isotopic
patterns, including bad ones, will enable training of machine learning
classifiers. Also, because it was shown that the median provides better
estimates for species in microbial communities than the mean, adding up isotopic
patterns to improve precision has lost its purpose. There is more power (and
sensitivity) in numbers.

Because no data are filtered out and no isotopic patterns get added up, calisp
analyzes at least ten times as many isotopic patterns compared to the Java
version. That means calisp.py is about ten times slower; it takes about 5-10 min
per .mzML file on a Desktop computer. For natural abundance data, it works well
to only use those spectra that have a FFT fitting error ("error_fft") of less
than 0.001. Note that this threshold is less stringent than the one used by the
Java program.

Input
=====

Calisp needs two inputs: a spectra file in mzML format and a tabular peptide
file (PSM). The PSM file contains a column "Spectrum File" that links the
peptides to the original spectra files. The mzML files are identified by the run
id information stored in the mzML files or the file name.

In order to make the association via the file name work in Galaxy one can either

- use collections where the element identifiers are equal to the data in the column
- make sure that dataset names are equal to the data in this column

Output table
============

Each row contains one isotopic pattern, defined by the following columns:

========================================== ===================
Header name                                Content
========================================== ===================
experiment                                 filename of the peptide spectrum match (psm) file
ms_run                                     filename of the .mzml file
bins                                       bin/mag ids, separated by commas. Calisp expects the protein ids in the psm file to consist of two parts, separated by a delimiter (_ by default). The first part is the bin/mag id, the second part the protein id
proteins                                   the ids of the proteins associated with the pattern (without the bin id)
peptide                                    the aminoacid sequence of the peptide
peptide_mass                               the mass of the peptide
C                                          # of carbon atoms in the peptide
N                                          # of nitrogen atoms in the peptide
O                                          # of oxygen atoms in the peptide
H                                          # of hydrogen atoms in the peptide
S                                          # of sulfur atoms in the peptide
psm_id                                     psm id
psm_mz                                     psm m over z
psm_charge                                 psm charge
psm_neutrons                               number of neutrons inferred from custom 'neutron' modifications
psm_rank                                   rank of the psm
psm_precursor_id                           id of the ms1 spectrum that was the source of the psm
psm_precursor_mz                           mass over charge of the precursor of the psm
pattern_charge                             charge of the pattern
pattern_precursor_id                       id of the ms1 spectrum that was the source of the pattern
pattern_total_intensity                    total intensity of the pattern
pattern_peak_count                         # of peaks in the pattern
pattern_median_peak_spacing                median mass difference between a pattern's peaks
spectrum_mass_irregularity                 a measure for the standard deviation in the mass difference between a pattern's peaks
ratio_na                                   the estimated isotope ratio inferred from neutron abundance (sip experiments)
ratio_fft                                  the estimated isotope ratio inferred by the fft method (natural isotope abundances)
error_fft                                  the remaining error after fitting the pattern with fft
error_clumpy                               the remaining error after fitting the pattern with the clumpy carbon method
flag_peptide_contains_sulfur               true if peptide contains sulfur
flag_peptide_has_modifications             true if peptide has no modifications
flag_peptide_assigned_to_multiple_bins     true if peptide is associated with multiple proteins from different bins/mags
flag_peptide_assigned_to_multiple_proteins true if peptide is associated with multiple proteins
flag_peptide_mass_and_elements_undefined   true if peptide has unknown mass and elemental composition
flag_psm_has_low_confidence                true if psm was flagged as having low confidence (peptide identity uncertain)
flag_psm_is_ambiguous                      true if psm could not be assigned with certainty
flag_pattern_is_contaminated               true if multiple patterns have one or more shared peaks
flag_pattern_is_wobbly                     true if pattern_median_peak_spacing exceeds a threshold
flag_peak_at_minus_one_pos                 true if a peak was detected immediately before the monoisotopic peak, could indicate overlap with another pattern
i0 - i19                                   the intensities of the first 20 peaks of the pattern
m0 - m19                                   the masses of the first 20 peaks of the pattern
c1 - c6                                    contributions of clumps of 1-6 carbon to ratio_na. These are the outcomes of the clumpy carbon model. These results are only meaningful if the biomass was labeled to saturation.
========================================== ===================
    ]]></help>
    <citations>
        <citation type="doi">10.1186/s40168-022-01454-1</citation>
        <citation type="doi">10.1073/pnas.1722325115</citation>
        <citation type="doi">10.1101/2021.03.29.437612</citation>
        <citation type="doi">10.1093/bioinformatics/bty046</citation>
    </citations>
</tool>