view validate_fasta_headers.xml @ 5:d4bd627618e5 draft

planemo upload commit ebb3b91b99aaf358d44432c9dc1f0a4d771970cd-dirty
author caleb-easterly
date Wed, 28 Jun 2017 16:05:07 -0400
parents e5a59fabeeba
children
line wrap: on
line source

<tool id="validate_fasta_database" name="Validate FASTA Headers" version="0.1.2">
    <requirements>
        <!-- No packaged dependencies are declared. The command invokes `java` directly,
             so a Java runtime is presumably expected on the job's PATH (TODO confirm). -->
    </requirements>
    <!-- Exit status 1 from the command is reported as a fatal job error. -->
    <stdio>
        <exit_code
            range="1"
            level="fatal"
            description="Invalid FASTA headers detected, was asked to fail"/>
    </stdio>
    <!--
        Runs the FastaHeader jar shipped in the tool directory.
        Positional arguments, in order: input FASTA, passed-sequences output,
        failed-sequences output, crashIfInvalid flag, checkIsProtein flag,
        checkLength flag, and (only when checkLength is true) the minimum length.
        Every argument is single-quoted so paths containing spaces survive shell
        word splitting; the boolean is compared as a string so the conditional
        does not depend on how the parameter wrapper evaluates truthiness.
    -->
    <command detect_errors="exit_code"><![CDATA[
        java -jar '$__tool_directory__/FastaHeader-1.0-SNAPSHOT.jar'
            '$FASTA'
            '$goodFasta'
            '$badFasta'
            '$crashIfInvalid'
            '$checkIsProtein'
            '$checkLength.checkLength'
        #if str($checkLength.checkLength) == 'true'
            '$checkLength.minimumLength'
        #end if
    ]]></command>
    <!--
        Tool inputs: the FASTA dataset to validate, two boolean switches passed
        straight to the command line, and an optional minimum-length filter whose
        threshold is only requested when the filter is enabled.
    -->
    <inputs>
        <param name="FASTA" type="data" format="fasta" label="Select input FASTA dataset"/>
        <param name="crashIfInvalid" type="boolean" label="Fail job if invalid FASTA headers detected?"/>
        <param name="checkIsProtein" type="boolean" label="Ensure that sequence is not DNA or RNA?"/>
        <conditional name="checkLength">
            <!-- Boolean test parameter: <option> children are only meaningful on
                 select parameters, so none are declared here. -->
            <param name="checkLength" type="boolean" label="Filter out sequences below a minimum length?"/>
            <when value="true">
                <param name="minimumLength" type="integer" value="0" min="0" label="Minimum length an amino acid sequence must have"/>
            </when>
            <when value="false"/>
        </conditional>
    </inputs>
    <!-- Two FASTA datasets: sequences that passed validation and sequences that failed. -->
    <outputs>
        <data
            name="goodFasta"
            format="fasta"
            label="Validate FASTA: Passed Sequences"/>
        <data
            name="badFasta"
            format="fasta"
            label="Validate FASTA: Failed Sequences"/>
    </outputs>
    <tests>
        <!-- Default settings (no failure on invalid headers, no protein check,
             no length filter), spelled out explicitly; both outputs are compared
             against the reference files. -->
        <test>
            <param name="FASTA" value="fastaFilteringTest_IN.fasta"/>
            <param name="crashIfInvalid" value="false"/>
            <param name="checkIsProtein" value="false"/>
            <conditional name="checkLength">
                <param name="checkLength" value="false"/>
            </conditional>
            <output name="goodFasta" file="fastaFilteringTest_OUT1.fasta"/>
            <output name="badFasta" file="fastaFilteringTest_OUT2.fasta"/>
        </test>
    </tests>
    <help>
<![CDATA[
**Notes**

Takes a FASTA database and validates the headers using the Compomics (developers of SearchGUI and PeptideShaker) schema. 
Custom FASTA databases may be in an invalid format, which causes SearchGUI to crash.
        
**Output**

The main output of this tool, "Validate FASTA: Passed Sequences", is a FASTA database that can be run through SearchGUI without error.
The failed sequences may be examined for typos and other errors. 

In addition, the tool will print the databases assigned by the Compomics utility (e.g., UniProt), for a quick check of the validity of the custom FASTA database.

Sequences that may cause the tool to report an exception are those that are not valid examples of the following formats:

* UniProt
* SwissProt (starts with ">sw|" or ">SW|")
* NCBI (starts with ">gi|" or ">GI|")
* Halobacterium from Max Planck (starts with "OE")
* H Influenza, from Novartis (starts with ">hflu_")
* C Trachomatis (starts with ">C.tr\_" or "C\_trachomatis\_")
* M Tuberculosis (starts with ">M. tub")
* Saccharomyces Genome Database (contains "SGDID")
* Genome translation (ex. ">dm345\_3L-sense [2343534-234353938]")
* Genome Annotation Framework for Flexible Analysis (GAFFA) (starts with ">GAFFA")
* UPS (contains "\_HUMAN\_UPS")
           
Many sequences are reported as Generic, which may or may not allow for extraction of the accession number. 
]]>
    </help>
    <citations>
        <!-- Self-citation for the tool as a BibTeX @misc entry; no DOI is given. -->
        <citation type="bibtex">
            @misc{fastaValidation,
            author = {The GalaxyP Team},
            date = {22 June 2017},
            title = {FASTA Database Validation Tool}
            }
        </citation>
    </citations>
</tool>