view tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml @ 9:9dabbfd73c8a draft

Uploaded v0.0.19, adds wrappers for rpsblast and rpstblastn with new blastdb_d.loc file for their protein domain database. Also includes other minor improvements.
author peterjc
date Thu, 25 Apr 2013 09:38:37 -0400
parents 393a7a35383c
children 70e7dcbf6573
line wrap: on
line source

<tool id="ncbi_blastdbcmd_wrapper" name="NCBI BLAST+ blastdbcmd entry(s)" version="0.0.5">
    <description>Extract sequence(s) from BLAST database</description>
    <requirements>
        <requirement type="binary">blastdbcmd</requirement>
        <requirement type="package" version="2.2.26+">blast+</requirement>
    </requirements>
    <version_command>blastdbcmd -version</version_command>
    <command>
## The command is a Cheetah template which allows some Python based syntax.
## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
blastdbcmd -dbtype $db_opts.db_type -db "${db_opts.database.fields.path}"

##TODO: What about -ctrl_a and -target_only as advanced options?

#if $id_opts.id_type=="file":
-entry_batch "$id_opts.entries"
#else:
##Perform some simple search/replaces to remove whitespace
##and make it comma separated, and escape any pipe characters
-entry "$id_opts.entries.replace('\r',',').replace('\n',',').replace(' ','').replace(',,',',').replace(',,',',').strip(',').replace('|','\|')"
#end if

##When building a BLAST database, to ensure unique IDs makeblastdb will
##do things like turning a FASTA entry with ID of ERP44 into lcl|ERP44
##(if using -parse_seqids) or simply assign it an ID using the record
##number like gnl|BL_ORD_ID|123 (to cope with duplicate IDs in the FASTA
##file). In -parse_seqids mode, a duplicate FASTA ID gives an error.
##
##The BLAST plain text and XML output will contain these BLAST IDs, but
##the tabular output does not (at least, not in BLAST 2.2.25+).
##Therefore in general, Galaxy users won't care about the (internal)
##BLAST identifiers.
##
##The blastdbcmd FASTA output will also contain these IDs, but in the
##context of the BLAST tabular output they are not helpful. Therefore
##to recover the original ID as used in the FASTA file for makeblastdb
##we need a litte post processing.
##
##We remove the NCBI's lcl|... or gnl|BL_ORD_ID|123 prefixes
##using sed, however the exact syntax differs for Mac OS X's sed

#if str($outfmt)=="blastid":
-out "$seq"
#else if sys.platform == "darwin":
| sed -E 's/^>(lcl\||gnl\|BL_ORD_ID\|[0-9]* )/>/1' > "$seq"
#else:
| sed 's/>\(lcl|\|gnl|BL_ORD_ID|[0-9]* \)/>/1' > "$seq"
#end if
    </command>
    <stdio>
        <!-- Anything other than zero is an error -->
        <exit_code range="1:" />
        <exit_code range=":-1" />
	<!-- Suspect blastdbcmd sometimes fails to set error level -->
	<regex match="Error:" />
	<regex match="Exception:" />
    </stdio>
    <inputs>
        <conditional name="db_opts">
            <param name="db_type" type="select" label="Type of BLAST database">
              <option value="nucl" selected="True">Nucleotide</option>
              <option value="prot">Protein</option>
            </param>
            <when value="nucl">
                <param name="database" type="select" label="Nucleotide BLAST database">
                    <options from_file="blastdb.loc">
                      <column name="value" index="0"/>
                      <column name="name" index="1"/>
                      <column name="path" index="2"/>
                    </options>
                </param>
            </when>
            <when value="prot">
                <param name="database" type="select" label="Protein BLAST database">
                    <options from_file="blastdb_p.loc">
                      <column name="value" index="0"/>
                      <column name="name" index="1"/>
                      <column name="path" index="2"/>
                    </options>
                </param>
            </when>
        </conditional>
        <conditional name="id_opts">
            <param name="id_type" type="select" label="Type of identifier list">
              <option value="file">From file</option>
              <option value="prompt">User entered</option>
            </param>
            <when value="file">
                <param name="entries" type="data" format="txt,tabular" label="Sequence identifier(s)" help="Plain text file with one ID per line (i.e. single column tabular file)"/>
            </when>
            <when value="prompt">
                <param name="entries" type="text" label="Sequence identifier(s)" help="Comma or new line separated list." optional="False" area="True" size="10x30"/>
            </when>
        </conditional>
        <param name="outfmt" type="select" label="Output format">
          <option value="original">FASTA with original identifiers</option>
          <option value="blastid">FASTA with BLAST assigned identifiers</option>
        </param>
    </inputs>
    <outputs>
        <data name="seq" format="fasta" label="Sequences from ${db_opts.database.fields.name}" />
    </outputs>
    <help>
    
**What it does**

Extracts FASTA formatted sequences from a BLAST database
using the NCBI BLAST+ blastdbcmd command line tool.

.. class:: warningmark

**BLAST assigned identifiers**

When a BLAST database is constructed from a FASTA file, the
original identifiers can be replaced with BLAST assigned
identifiers, partly to ensure uniqueness. e.g. Sometimes
a prefix of 'lcl|' is added (lcl is short for local),
or an arbitrary name starting 'gnl|BL_ORD_ID|' is created.

If you are using the tabular output from BLAST, it will contain
the original identifiers - not the BLAST assigned identifiers
suitable for use with the blastdbcmd tool.

If you are using the XML or plain text output, this will also
contain the BLAST assigned identifiers. However, this means
getting a list of BLAST assigned identifiers isn't straightforward.

-------

**References**

Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402.

Schaffer et al. Improving the accuracy of PSI-BLAST protein database searches with composition-based statistics and other refinements. 2001. Nucleic Acids Res. 29:2994-3005.

    </help>
</tool>