view flashlfq.xml @ 2:e70198847b2a draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/flashlfq commit 38eecdb36b6b505afcdd99de7351eed9c3729843
author galaxyp
date Thu, 25 Jan 2018 20:37:09 -0500
parents 1e2fc34b1f20
children ff1148892ce3
line wrap: on
line source

<tool id="flashlfq" name="FlashLFQ" version="0.1.99">
    <description>ultrafast label-free quantification for mass-spectrometry proteomics</description>
    <!-- Package dependency resolved by Galaxy; the version is pinned to match the tool version. -->
    <requirements>
        <requirement version="0.1.99" type="package">flashlfq</requirement>
    </requirements>
    <!--
        Stages the identification file and the spectrum files under predictable names,
        runs FlashLFQ on them, and copies the result files to the Galaxy output datasets.
        detect_errors="exit_code" makes Galaxy judge the job by the exit status of the
        "&&" chain below instead of the legacy stderr-based check.
    -->
    <command detect_errors="exit_code"><![CDATA[
        #import re
        ## Link the identification file under its display name, with the last extension
        ## replaced by .psmtsv and whitespace replaced by underscores.
        #set $idt_path = $re.sub('\s','_',$re.sub('[.][^.]*$','',$idt.display_name.split('/')[-1])) + ".psmtsv"
        ln -s '${idt}' '${idt_path}' &&
        mkdir spectrum_dir &&
        ## Link every spectrum file into spectrum_dir, with its last extension replaced by .mzML.
        #for $peak_list in $peak_lists:
            #set $input_name = $re.sub('[.][^.]*$','',$peak_list.display_name.split('/')[-1]) + ".mzML"
            ln -s '${peak_list}' 'spectrum_dir/${input_name}' &&
        #end for

        FlashLFQ
        --idt '$idt_path'
        --rep spectrum_dir
        --ppm $ppm
        --iso $iso
        --nis $nis
        ## The int and chg flags are only passed when the non-default select option is chosen.
        #if $intensity == 'integrate':
            --int true
        #end if
        #if $charge == 'precursor':
            --chg true
        #end if
        ## rmm and mbr are boolean params that expand to the complete flag and value.
        $rmm $mbr
        ## Never pause at the end of the run; the job is non-interactive.
        --pau false
        ## Trim the text between the summary prefix and "working" plus one character from the log
        ## (presumably the job working directory path; confirm against a real log).
        && sed 's/\(Analysis summary for:\).*working./\1 /' *_FlashLFQ_Log.txt > '$log'
        && cp *_FlashLFQ_QuantifiedBaseSequences.tsv '$quantifiedBaseSequences'
        && cp *_FlashLFQ_QuantifiedModifiedSequences.tsv '$quantifiedModifiedSequences'
        && cp *_FlashLFQ_QuantifiedPeaks.tsv '$quantifiedPeaks'
        ## create issue for FlashLFQ to name column headers correctly
        ## Until then, drop any line that consists solely of the word test.
        && grep -v '^test$' *_FlashLFQ_QuantifiedProteins.tsv > '$quantifiedProteins'
    ]]></command>
    <inputs>
        <!-- Input datasets: one identification table and one or more mzML spectrum files. -->
        <param name="idt" type="data" format="tabular"
               label="identification file"
               help="MetaMorpheus,Morpheus"/>
        <param name="peak_lists" type="data" format="mzml" multiple="true"
               label="spectrum files"/>

        <!-- Numeric settings, each forwarded to the FlashLFQ option of the same name. -->
        <param name="ppm" type="float" min="1" max="20" value="10"
               label="monoisotopic ppm tolerance"/>
        <param name="iso" type="float" min="1" max="10" value="5"
               label="isotopic distribution tolerance in ppm"/>
        <param name="nis" type="integer" min="1" max="30" value="2"
               label="number of isotopes required to be observed"/>

        <!-- Select options; the command only adds a flag for the non-default choice. -->
        <param name="intensity" type="select" label="intensity">
            <option value="apex" selected="true">use the apex intensity</option>
            <option value="integrate">integrate chromatographic peak intensity</option>
        </param>
        <param name="charge" type="select" label="charge">
            <option value="all" selected="true">use all identification detected charge states</option>
            <option value="precursor">use precursor charge</option>
        </param>

        <!-- Boolean switches whose true/false values are the complete command-line fragment. -->
        <param name="rmm" type="boolean" checked="true"
               truevalue="--rmm true" falsevalue="--rmm false"
               label="require observed monoisotopic mass peak"/>
        <param name="mbr" type="boolean" checked="false"
               truevalue="--mbr true" falsevalue="--mbr false"
               label="match between runs"/>
    </inputs>
    <outputs>
        <!-- Run log written by the command from the FlashLFQ log file. -->
        <data name="log" format="txt"
              label="${tool.name} on ${on_string}: Log"/>

        <!-- Per-peak table; the column names are a fixed list. -->
        <data name="quantifiedPeaks" format="tabular"
              label="${tool.name} on ${on_string}: QuantifiedPeaks.tsv">
            <actions>
                <action type="metadata" name="column_names"
                        default="File Name,Base Sequence,Full Sequence,Protein Group,Peptide Monoisotopic Mass,MS2 Retention Time,Precursor Charge,Theoretical MZ,Peak intensity,Peak RT Start,Peak RT Apex,Peak RT End,Peak MZ,Peak Charge,Num Charge States Observed,Peak Detection Type,PSMs Mapped,Base Sequences Mapped,Full Sequences Mapped,Peak Split Valley RT,Peak Apex Mass Error (ppm)"/>
            </actions>
        </data>

        <!-- Per-sequence tables; one Intensity and one Detection Type column name per spectrum file. -->
        <data name="quantifiedBaseSequences" format="tabular"
              label="${tool.name} on ${on_string}: QuantifiedBaseSequences.tsv">
            <actions>
                <action type="metadata" name="column_names"
                        default="Sequence,Protein Group,${','.join(['Intensity_' + i.name for i in $peak_lists])},${','.join(['Detection Type_' + i.name for i in $peak_lists])}"/>
            </actions>
        </data>
        <data name="quantifiedModifiedSequences" format="tabular"
              label="${tool.name} on ${on_string}: QuantifiedModifiedSequences.tsv">
            <actions>
                <action type="metadata" name="column_names"
                        default="Sequence,Protein Group,${','.join(['Intensity_' + i.name for i in $peak_lists])},${','.join(['Detection Type_' + i.name for i in $peak_lists])}"/>
            </actions>
        </data>

        <!-- Per-protein table; one column name per spectrum file. -->
        <data name="quantifiedProteins" format="tabular"
              label="${tool.name} on ${on_string}: QuantifiedProteins.tsv">
            <actions>
                <action type="metadata" name="column_names"
                        default="Protein,${','.join([i.name for i in $peak_lists])}"/>
            </actions>
        </data>
    </outputs>
    <tests>
        <!-- Runs with non-default tolerances and checks that both values are reported in the log. -->
        <test>
            <param name="idt" ftype="tabular" value="aggregatePSMs_5ppmAroundZero.psmtsv"/>
            <param name="peak_lists" ftype="mzml" value="sliced-mzml.mzML"/>
            <param name="ppm" value="12"/>
            <param name="iso" value="6"/>
            <output name="log">
                <assert_contents>
                    <has_text text="ppmTolerance = 12"/>
                    <has_text text="isotopePpmTolerance = 6"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[

**FlashLFQ** is an ultrafast label-free quantification tool for mass-spectrometry proteomics.

**Accepted command-line arguments:**

::

    --idt [string | identification file path (TSV format)]
    --raw [string | MS data file (.raw or .mzML)]
    --rep [string | directory containing MS data files]
    --ppm [double | monoisotopic ppm tolerance] (default = 10)
    --iso [double | isotopic distribution tolerance in ppm] (default = 5)
    --sil [boolean | silent mode; no console output] (default = false)
    --pau [boolean | pause at end of run] (default = true)
    --int [boolean | integrate chromatographic peak intensity instead of using 
          the apex intensity] (default = false)
    --chg [boolean | use only precursor charge state; when set to false, FlashLFQ looks 
          for all charge states detected in the MS/MS identification file for each peptide] (default = false)
    --mbr [boolean | match between runs]
    --rmm [boolean | require observed monoisotopic mass peak]
    --nis [integer | number of isotopes required to be observed]


**Tab-Delimited Identification Text File**

The first line of the text file should contain column headers identifying what each column is. Note that MetaMorpheus (.psmtsv), Morpheus, MaxQuant (msms.txt), and TDPortal tab-delimited column headers are supported natively and such files can be read without modification. For search software that lists decoys and PSMs above 1% FDR (e.g., MetaMorpheus), you may want to remove these prior to FlashLFQ analysis. FlashLFQ will probably crash if ambiguous PSMs are passed into it (e.g., a PSM with more than 2 peptides listed in one line).

The following headers are required in the list of MS/MS identifications:

  - **File Name** - File extensions should be tolerated, but no extension is tested more extensively (e.g. use MyFile and not MyFile.mzML)
  - **Base Sequence** - Should only contain amino acid sequences, or it will likely result in a crash
  - **Full Sequence** - Modified sequence. Can contain any letters, but must be consistent between the same peptidoform to get accurate results
  - **Peptide Monoisotopic Mass** - Theoretical monoisotopic mass, including modification mass
  - **Scan Retention Time** - MS/MS identification scan retention time
  - **Precursor Charge** - Charge of the ion selected for MS/MS resulting in the identification
  - **Protein Accession** - Protein accession(s) for the peptide; protein quantification is still preliminary


**Outputs**:

  - **QuantifiedProteins.tsv** - Protein intensities are summed here within a run. 

  - **QuantifiedPeaks.tsv** - Each chromatographic peak is shown here, even peaks that were not quantifiable (peak intensity = 0). Details about each peak, such as number of PSMs mapped, start/apex/end retention times, ppm error, etc., are contained in this file. A peptide can have multiple peaks over the course of a run (e.g., oxidized peptidoforms elute at different times, etc.). Ambiguous peaks are displayed with a | (pipe) delimiter to indicate more than one peptide mapped to that peak.

  - **QuantifiedModifiedSequences.tsv** - Similar to QuantifiedBaseSequences, but instead of being summed by Base Sequence, peptide intensities are summed by modified sequence; this makes it convenient to compare modified peptidoform intensities across runs.

  - **QuantifiedBaseSequences.tsv** - Peptide intensities are summed here within a run (including differently-modified forms of the same amino acid sequence) and displayed in a convenient format for comparing across runs. The identification type (MS/MS or MBR) is also indicated. A peptide with more than 30% of its intensity coming from ambiguous peak(s) is considered not quantifiable and is given an intensity of -1.


  - **Log.txt** - Log of the FlashLFQ run. Includes timestamps and quantification time for each file, total analysis time, directories used, and settings.


    ]]></help>
    <!-- Publication to cite for this tool, referenced by DOI. -->
    <citations>
        <citation type="doi">10.1021/acs.jproteome.7b00608</citation>
    </citations>
</tool>