view tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml @ 13:623f727cdff1 draft

Uploaded v0.1.00, uses BLAST+ 2.2.29, allows custom column selection for tabular output - including taxonomy fields.
author peterjc
date Fri, 14 Mar 2014 07:40:46 -0400
parents 4c4a0da938ff
children 2fe07f50a41e
line wrap: on
line source

<tool id="ncbi_blastdbcmd_wrapper" name="NCBI BLAST+ blastdbcmd entry(s)" version="0.1.00">
    <description>Extract sequence(s) from BLAST database</description>
    <macros>
        <token name="@BINARY@">blastdbcmd</token>
        <import>ncbi_macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command>
## The command is a Cheetah template which allows some Python based syntax.
## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
blastdbcmd -dbtype $db_opts.db_type -db "${db_opts.database.fields.path}"

##TODO: What about -ctrl_a and -target_only as advanced options?

#if $id_opts.id_type=="file":
-entry_batch "$id_opts.entries"
#else:
##Perform some simple search/replaces to remove whitespace
##and make it comma separated, and escape any pipe characters
-entry "$id_opts.entries.replace('\r',',').replace('\n',',').replace(' ','').replace(',,',',').replace(',,',',').strip(',').replace('|','\|')"
#end if

##When building a BLAST database, to ensure unique IDs makeblastdb will
##do things like turning a FASTA entry with ID of ERP44 into lcl|ERP44
##(if using -parse_seqids) or simply assign it an ID using the record
##number like gnl|BL_ORD_ID|123 (to cope with duplicate IDs in the FASTA
##file). In -parse_seqids mode, a duplicate FASTA ID gives an error.
##
##The BLAST plain text and XML output will contain these BLAST IDs, but
##the tabular output does not (at least, not in BLAST 2.2.25+).
##Therefore in general, Galaxy users won't care about the (internal)
##BLAST identifiers.
##
##The blastdbcmd FASTA output will also contain these IDs, but in the
##context of the BLAST tabular output they are not helpful. Therefore
##to recover the original ID as used in the FASTA file for makeblastdb
##we need a litte post processing.
##
##We remove the NCBI's lcl|... or gnl|BL_ORD_ID|123 prefixes
##using sed, however the exact syntax differs for Mac OS X's sed

#if str($outfmt)=="blastid":
-out "$seq"
#else if sys.platform == "darwin":
| sed -E 's/^>(lcl\||gnl\|BL_ORD_ID\|[0-9]* )/>/1' > "$seq"
#else:
| sed 's/>\(lcl|\|gnl|BL_ORD_ID|[0-9]* \)/>/1' > "$seq"
#end if
    </command>
    <expand macro="stdio" />
    <inputs>
        <expand macro="input_conditional_choose_db_type" />
        <conditional name="id_opts">
            <param name="id_type" type="select" label="Type of identifier list">
              <option value="file">From file</option>
              <option value="prompt">User entered</option>
            </param>
            <when value="file">
                <param name="entries" type="data" format="txt,tabular" label="Sequence identifier(s)" help="Plain text file with one ID per line (i.e. single column tabular file)"/>
            </when>
            <when value="prompt">
                <param name="entries" type="text" label="Sequence identifier(s)" help="Comma or new line separated list." optional="False" area="True" size="10x30"/>
            </when>
        </conditional>
        <param name="outfmt" type="select" label="Output format">
          <option value="original">FASTA with original identifiers</option>
          <option value="blastid">FASTA with BLAST assigned identifiers</option>
        </param>
    </inputs>
    <outputs>
        <data name="seq" format="fasta" label="Sequences from ${db_opts.database.fields.name}" />
    </outputs>
    <help>
    
**What it does**

Extracts FASTA formatted sequences from a BLAST database
using the NCBI BLAST+ blastdbcmd command line tool.

.. class:: warningmark

**BLAST assigned identifiers**

When a BLAST database is constructed from a FASTA file, the
original identifiers can be replaced with BLAST assigned
identifiers, partly to ensure uniqueness. e.g. Sometimes
a prefix of 'lcl|' is added (lcl is short for local),
or an arbitrary name starting 'gnl|BL_ORD_ID|' is created.

If you are using the tabular output from BLAST, it will contain
the original identifiers - not the BLAST assigned identifiers
suitable for use with the blastdbcmd tool.

If you are using the XML or plain text output, this will also
contain the BLAST assigned identifiers. However, this means
getting a list of BLAST assigned identifiers isn't straightforward.

-------

**References**

If you use this Galaxy tool in work leading to a scientific publication please
cite the following papers:

@REFERENCES@
    </help>
</tool>