Mercurial > repos > devteam > ncbi_blast_plus

<tool id="ncbi_blastdbcmd_wrapper" name="NCBI BLAST+ blastdbcmd entry(s)" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Extract sequence(s) from BLAST database</description>
    <macros>
        <token name="@BINARY@">blastdbcmd</token>
        <import>ncbi_macros.xml</import>
    </macros>
    <expand macro="preamble" />
    <command detect_errors="aggressive" strict="true">
## The command is a Cheetah template which allows some Python based syntax.
## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
##
## Overview of the generated command line:
##   1. blastdbcmd plus the shared database options token, which is
##      defined in ncbi_macros.xml (not shown in this file).
##   2. The identifiers to fetch, either from a dataset (-entry_batch)
##      or from the free-text box (-entry).
##   3. The output: written directly with -out when the BLAST assigned
##      identifiers are wanted, otherwise piped through sed to strip
##      the BLAST prefixes before redirecting into the output dataset.
blastdbcmd
@DBCMD_OPTS@

##TODO: What about -ctrl_a and -target_only as advanced options?

#if $id_opts.id_type=="file":
-entry_batch '$id_opts.entries'
#else:
##Perform some simple search/replaces to remove whitespace
##and make it comma separated. Quoted so don't escape pipes.
-entry "$id_opts.entries.replace('\r',',').replace('\n',',').replace(' ','').replace(',,',',').replace(',,',',').strip(',')"
#end if

##When building a BLAST database, to ensure unique IDs makeblastdb will
##do things like turning a FASTA entry with ID of ERP44 into lcl|ERP44
##(if using -parse_seqids) or simply assign it an ID using the record
##number like gnl|BL_ORD_ID|123 (to cope with duplicate IDs in the FASTA
##file). In -parse_seqids mode, a duplicate FASTA ID gives an error.
##
##The BLAST plain text and XML output will contain these BLAST IDs, but
##the tabular output does not (at least, not in BLAST 2.2.25+).
##Therefore in general, Galaxy users won't care about the (internal)
##BLAST identifiers.
##
##The blastdbcmd FASTA output will also contain these IDs, but in the
##context of the BLAST tabular output they are not helpful. Therefore
##to recover the original ID as used in the FASTA file for makeblastdb
##we need a little post processing.
##
##We remove the NCBI's lcl|... or gnl|BL_ORD_ID|123 prefixes
##using sed, however the exact syntax differs for Mac OS X's sed
##
##Both sed expressions are anchored to the start of the line so that
##only the leading identifier of a FASTA header is rewritten, and both
##platforms therefore behave the same way. The output path is single
##quoted, matching the -entry_batch and -out arguments.
##
##NOTE(review): sys.platform is evaluated where this template is
##rendered, which is presumably the Galaxy server and may differ from
##the node that runs the job - confirm for mixed-OS deployments.

#if str($outfmt)=="blastid":
-out '$seq'
#else if sys.platform == "darwin":
| sed -E 's/^&gt;(lcl\||gnl\|BL_ORD_ID\|[0-9]* )/&gt;/1' &gt; '$seq'
#else:
| sed 's/^&gt;\(lcl|\|gnl|BL_ORD_ID|[0-9]* \)/&gt;/1' &gt; '$seq'
#end if
    </command>
    <inputs>
        <!-- Database selection; the macro is defined in ncbi_macros.xml (not
             shown here). The tests address its parameters as db_opts|... -->
        <expand macro="input_conditional_choose_db_type" />
        <!-- How the identifiers are supplied: as a dataset (passed with
             -entry_batch) or typed into a text box (passed with -entry). -->
        <conditional name="id_opts">
            <param name="id_type" type="select" label="Type of identifier list">
                <option value="file">From file</option>
                <option value="prompt">User entered</option>
            </param>
            <when value="file">
                <param name="entries" argument="-entry_batch" type="data" format="txt,tabular" label="Sequence identifier(s)" help="Plain text file with one ID per line, optionally with space separated range, strand, and algorithm."/>
            </when>
            <when value="prompt">
                <param name="entries" argument="-entry" type="text" optional="false" area="true" size="10x30" label="Sequence identifier(s)" help="Comma or new line separated list">
                    <!-- Reject an empty box up front rather than running
                         blastdbcmd with an empty -entry argument. -->
                    <validator type="empty_field" message="Please enter at least one sequence identifier" />
                </param>
            </when>
        </conditional>
        <!-- "original" strips the BLAST assigned prefixes from the FASTA
             headers using sed; "blastid" writes the blastdbcmd output
             unchanged via -out. -->
        <param name="outfmt" type="select" label="Output format">
            <option value="original">FASTA with original identifiers</option>
            <option value="blastid">FASTA with BLAST assigned identifiers</option>
        </param>
    </inputs>
    <outputs>
        <!-- Single FASTA dataset holding the extracted sequence(s); the label
             suffix is a token expanded from the imported macros. -->
        <data name="seq" format="fasta" label="Sequences from blastdbcmd @ON_DBCMD_OPTS@" />
    </outputs>
    <tests>
        <test>
            <!-- Protein database, user-entered entry "all", original identifiers:
                 output is compared against four_human_proteins.fasta.
                 This is the only test that sets db_origin_selector explicitly. -->
            <param name="db_opts|db_type" value="prot" />
            <param name="db_opts|db_origin|database" value="four_human_proteins" />
            <param name="db_opts|db_origin|db_origin_selector" value="db" />
            <param name="id_opts|id_type" value="prompt" />
            <param name="id_opts|entries" value="all" />
            <param name="outfmt" value="original" />
            <output name="seq" file="four_human_proteins.fasta" ftype="fasta" />
        </test>
        <test>
            <!-- Nucleotide database, user-entered entry "all", original identifiers. -->
            <!-- This used to recover the original FASTA file, but had GI numbers -->
            <param name="db_opts|db_type" value="nucl" />
            <param name="db_opts|db_origin|database" value="rhodopsin_nucs" />
            <param name="id_opts|id_type" value="prompt" />
            <param name="id_opts|entries" value="all" />
            <param name="outfmt" value="original" />
            <output name="seq" file="rhodopsin_nucs.no_gi.fasta" ftype="fasta" />
        </test>
        <test>
            <!-- Same database and expected output as the previous test, but the
                 identifiers come from a text dataset (the -entry_batch code path). -->
            <!-- This uses various start end frame combinations but all recover full sequence -->
            <param name="db_opts|db_type" value="nucl" />
            <param name="db_opts|db_origin|database" value="rhodopsin_nucs" />
            <param name="id_opts|id_type" value="file" />
            <param name="id_opts|entries" value="rhodopsin_nucs.blastdbcmd.txt" ftype="txt" />
            <param name="outfmt" value="original" />
            <output name="seq" file="rhodopsin_nucs.no_gi.fasta" ftype="fasta" />
        </test>
        <test>
            <!-- Single user-entered accession (U59921.1) from one nucleotide database. -->
            <param name="db_opts|db_type" value="nucl" />
            <param name="db_opts|db_origin|database" value="rhodopsin_nucs" />
            <param name="id_opts|id_type" value="prompt" />
            <param name="id_opts|entries" value="U59921.1" />
            <param name="outfmt" value="original" />
            <output name="seq" file="rhodopsin_bufo.fasta" ftype="fasta" />
        </test>
        <test>
            <!-- Single user-entered identifier given in its full pipe-delimited
                 form; expects the same FASTA file as the previous test. -->
            <param name="db_opts|db_type" value="nucl" />
            <!-- look in two databases for this entry -->
            <param name="db_opts|db_origin|database" value="rhodopsin_nucs,three_human_mRNA" />
            <param name="id_opts|id_type" value="prompt" />
            <param name="id_opts|entries" value="gi|2734705|gb|U59921.1|BBU59921" />
            <param name="outfmt" value="original" />
            <output name="seq" file="rhodopsin_bufo.fasta" ftype="fasta" />
        </test>
    </tests>
    <help>

**What it does**

Extracts FASTA formatted sequences from a BLAST database
using the NCBI BLAST+ blastdbcmd command line tool.

When giving a text file of entries, use one line per sequence.
Optional values should be space separated - the simplest syntax
is ``identifier start-end`` (where ``end`` can be just ``-``),
or ``identifier start-end strand`` (where the strand is given as
either ``+`` or ``-``).

.. class:: warningmark

**BLAST assigned identifiers**

When a BLAST database is constructed from a FASTA file, the
original identifiers can be replaced with BLAST assigned
identifiers, partly to ensure uniqueness. e.g. Sometimes
a prefix of 'lcl|' is added (lcl is short for local),
or an arbitrary name starting 'gnl|BL_ORD_ID|' is created.

If you are using the tabular output from BLAST, it will contain
the original identifiers - not the BLAST assigned identifiers
suitable for use with the blastdbcmd tool.

If you are using the XML or plain text output, this will also
contain the BLAST assigned identifiers. However, this means
getting a list of BLAST assigned identifiers isn't straightforward.

-------

@CLI_OPTIONS@

-------

**References**

If you use this Galaxy tool in work leading to a scientific publication please
cite the following papers:

@REFERENCES@
    </help>
    <expand macro="blast_citations" />
</tool>
author	peterjc
date	Wed, 09 Sep 2020 15:32:17 +0000
parents	2889433c7ae1
children	87a7ee4cb36f