view tools/ncbi_blast_plus/ncbi_makeblastdb.xml @ 11:4c4a0da938ff draft

Uploaded v0.0.22, now wraps BLAST+ 2.2.28 allowing extended tabular output to include the hit descriptions as column 25. Supports $GALAXY_SLOTS. Includes more tests and heavy use of macros.
author peterjc
date Thu, 05 Dec 2013 06:55:59 -0500
parents 70e7dcbf6573
children 623f727cdff1
line wrap: on
line source

<tool id="ncbi_makeblastdb" name="NCBI BLAST+ makeblastdb" version="0.0.22">
    <description>Make BLAST database</description>
    <!-- Macro setup: defines the @BINARY@ token as "makeblastdb" and imports
         ncbi_macros.xml. The token is not referenced in this file's own text;
         presumably the imported macros consume it (confirm in ncbi_macros.xml). -->
    <macros>
        <token name="@BINARY@">makeblastdb</token>
        <import>ncbi_macros.xml</import>
    </macros>
    <!-- Expands the "requirements" macro; its definition is not in this file
         (presumably provided by ncbi_macros.xml). -->
    <expand macro="requirements" />
    <command interpreter="python">check_no_duplicates.py
##Step 1: run check_no_duplicates.py over every input FASTA file, since
##BLAST+ 2.2.28 fails to check for duplicates itself. The shell "and" operator
##(ampersand ampersand) below aborts before makeblastdb if any are found.
#for $i in $in
"${i.file}"
#end for
&amp;&amp;
##Step 2: build the database inside the output dataset's extra files directory
##using the fixed base name 'blastdb'.
makeblastdb -out "${os.path.join($outfile.extra_files_path,'blastdb')}"
$parse_seqids
$hash_index
## Single call to -in with multiple filenames space separated with outer quotes
## (presumably any filenames with spaces would be a problem). Note this gives
## some extra spaces, e.g. -in " file1 file2 file3  " but BLAST seems happy:
-in "
#for $i in $in
${i.file}
#end for
"
#if $title:
-title "$title"
#else:
##Would default to being based on the cryptic Galaxy filenames, which is unhelpful
-title "BLAST Database"
#end if
-dbtype $dbtype
## Build one "-mask_data file1,file2,..." argument; the string stays empty when
## no masking files were supplied. The dataset parameter inside the mask_data
## repeat is named mask_data_file, so that is the attribute that must be read
## here (reading .file fails because the repeat has no such parameter).
#set $mask_string = ''
#set $sep = '-mask_data '
#for $i in $mask_data
#set $mask_string += $sep + str($i.mask_data_file)
#set $sep = ','
#end for
$mask_string
## Disabled drafts for the GI mask and taxonomy options (their inputs are
## still commented out as TODO items in the inputs section):
## #set $gi_mask_string = ''
## #set $sep = '-gi_mask -gi_mask_name '
## #for $i in $gi_mask
## #set $gi_mask_string += $sep + str($i.file)
## #set $sep = ','
## #end for
## $gi_mask_string
## #if $tax.select == 'id':
## -taxid $tax.id
## #else if $tax.select == 'map':
## -taxid_map $tax.map
## #end if
## --------------------------------------------------------------------
## Capture the stdout log information to the primary file (plain text):
&gt;&gt; "$outfile"
    </command>
    <!-- Expands the "stdio" macro; its definition is not in this file
         (presumably provided by the imported ncbi_macros.xml). -->
    <expand macro="stdio" />
    <inputs>
        <!-- Molecule type; passed to makeblastdb as -dbtype and also used to pick the output datatype. -->
        <param name="dbtype" type="select" display="radio" label="Molecule type of input">
            <option value="prot">protein</option>
            <option value="nucl">nucleotide</option>
        </param>
        <!-- TODO Allow merging of existing BLAST databases (conditional on the database type)
             NOTE Double check the new database would be self contained first
        <repeat name="in" title="BLAST or FASTA Database" min="1">
            <param name="file" type="data" format="fasta,blastdbn,blastdbp" label="BLAST or FASTA database" />
        </repeat>
        -->
        <!-- TODO Switch this to using <param ... multiple="true" /> instead of <repeat> block? -->
        <!-- One or more FASTA files; at least one is required (min="1"). -->
        <repeat name="in" title="FASTA file" min="1">
            <param name="file" type="data" format="fasta" label="Input FASTA file" />
        </repeat>
        <!-- Optional title; when left empty the command template falls back to "BLAST Database". -->
        <param name="title" type="text" value="" label="Title for BLAST database" help="This is the database name shown in BLAST search output" />
        <!-- Boolean switches: the truevalue is inserted verbatim into the command line, the falsevalue adds nothing. -->
        <param name="parse_seqids" type="boolean" truevalue="-parse_seqids" falsevalue="" checked="false" label="Parse the sequence identifiers" help="This is only advised if your FASTA file follows the NCBI naming conventions using pipe '|' symbols" />
        <param name="hash_index" type="boolean" truevalue="-hash_index" falsevalue="" checked="true" label="Enable the creation of sequence hash values" help="These hash values can then be used to quickly determine if a given sequence data exists in this BLAST database." />
        <!-- SEQUENCE MASKING OPTIONS -->
        <!-- Zero or more masking files; the dataset parameter inside each repeat entry is named mask_data_file. -->
        <repeat name="mask_data" title="Masking data file">
            <param name="mask_data_file" type="data" format="maskinfo-asn1,maskinfo-asn1-binary" label="ASN.1 file containing masking data" help="As produced by NCBI masking applications (e.g. dustmasker, segmasker, windowmasker)" />
        </repeat>
        <!-- TODO
        <repeat name="gi_mask" title="Create GI indexed masking data">
            <param name="gi_mask_file" type="data" format="asnb" label="Masking data output file" />
        </repeat>
        -->

        <!-- TAXONOMY OPTIONS -->
        <!-- TODO
             NOTE The disabled draft in the command template reads $tax.map for the
             mapping file, while the parameter below is named "file"; reconcile the
             two names before enabling this block.
        <conditional name="tax">
            <param name="select" type="select" label="Taxonomy options">
                <option value="">Do not assign sequences to Taxonomy IDs</option>
                <option value="id">Assign all sequences to one Taxonomy ID</option>
                <option value="map">Supply text file mapping sequence IDs to taxonomy IDs</option>
            </param>
            <when value="">
            </when>
            <when value="id">
                <param name="id" type="integer" value="" label="NCBI taxonomy ID" help="Integer &gt;=0" />
            </when>
            <when value="map">
                <param name="file" type="data" format="txt" label="Seq ID : Tax ID mapping file" help="Format: SequenceId TaxonomyId" />
            </when>
        </conditional>
        -->
    </inputs>
    <outputs>
        <!-- Several FASTA files may be supplied, so the history label is built from
             the molecule type and the input summary rather than one file's human name. -->
        <data name="outfile"
              format="data"
              label="${dbtype.value_label} BLAST database from ${on_string}">
            <!-- Replace the generic datatype with the one matching the selected molecule type. -->
            <change_format>
                <when input="dbtype" value="prot" format="blastdbp" />
                <when input="dbtype" value="nucl" format="blastdbn" />
            </change_format>
        </data>
    </outputs>
    <tests>
        <!-- Note the (two line) PIN file is not reproducible run to run.
        -->
        <!-- Builds a protein database from a single FASTA file, then compares the
             captured log and each generated database file against stored copies. -->
        <test>
            <param name="dbtype" value="prot" />
            <param name="file" value="four_human_proteins.fasta" ftype="fasta" />
            <param name="title" value="Just 4 human proteins" />
            <param name="parse_seqids" value="" />
            <param name="hash_index" value="true" />
            <!-- The name here must match the output declared in the outputs section ("outfile"). -->
            <output name="outfile" file="four_human_proteins.fasta.log" ftype="blastdbp" lines_diff="6">
                <extra_files type="file" value="four_human_proteins.fasta.phr" name="blastdb.phr" />
                <extra_files type="file" value="four_human_proteins.fasta.pin" name="blastdb.pin" lines_diff="2" />
                <extra_files type="file" value="four_human_proteins.fasta.psq" name="blastdb.psq" />
                <extra_files type="file" value="four_human_proteins.fasta.pog" name="blastdb.pog" />
                <extra_files type="file" value="four_human_proteins.fasta.phd" name="blastdb.phd" />
                <extra_files type="file" value="four_human_proteins.fasta.phi" name="blastdb.phi" />
                <extra_files type="file" value="four_human_proteins.fasta.psd" name="blastdb.psd" />
                <extra_files type="file" value="four_human_proteins.fasta.psi" name="blastdb.psi" />
            </output>
        </test>
    </tests>
    <help>
**What it does**

Make BLAST database from one or more FASTA files.

This is a wrapper for the NCBI BLAST+ tool 'makeblastdb', which is the
replacement for the 'formatdb' tool in the NCBI 'legacy' BLAST suite.

<!--
Applying masks to an existing BLAST database will not change the original database; a new database will be created.
For this reason, it's best to apply all masks at once to minimize the number of unnecessary intermediate databases.
-->

**Documentation**

http://www.ncbi.nlm.nih.gov/books/NBK1763/

**References**

If you use this Galaxy tool in work leading to a scientific publication please
cite the following papers:

@REFERENCES@
    </help>
</tool>