Mercurial > repos > devteam > ncbi_blast_plus
diff tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml @ 5:393a7a35383c draft
Uploaded v0.0.14 adding local BLAST database support.
This *requires* the matching update to the blast_datatypes repository. This adds basic wrappers for makeblastdb and blastdbinfo.
This update includes work by Edward Kirton.
author | peterjc |
---|---|
date | Fri, 09 Nov 2012 06:53:55 -0500 |
parents | |
children | 9dabbfd73c8a |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml Fri Nov 09 06:53:55 2012 -0500 @@ -0,0 +1,135 @@ +<tool id="ncbi_blastdbcmd_wrapper" name="NCBI BLAST+ blastdbcmd entry(s)" version="0.0.3"> + <description>Extract sequence(s) from BLAST database</description> + <command> +## The command is a Cheetah template which allows some Python based syntax. +## Lines starting hash hash are comments. Galaxy will turn newlines into spaces +blastdbcmd -dbtype $db_opts.db_type -db "${db_opts.database.fields.path}" + +##TODO: What about -ctrl_a and -target_only as advanced options? + +#if $id_opts.id_type=="file": +-entry_batch "$id_opts.entries" +#else: +##Perform some simple search/replaces to remove whitespace +##and make it comma separated, and escape any pipe characters +-entry "$id_opts.entries.replace('\r',',').replace('\n',',').replace(' ','').replace(',,',',').replace(',,',',').strip(',').replace('|','\|')" +#end if + +##When building a BLAST database, to ensure unique IDs makeblastdb will +##do things like turning a FASTA entry with ID of ERP44 into lcl|ERP44 +##(if using -parse_seqids) or simply assign it an ID using the record +##number like gnl|BL_ORD_ID|123 (to cope with duplicate IDs in the FASTA +##file). In -parse_seqids mode, a duplicate FASTA ID gives an error. +## +##The BLAST plain text and XML output will contain these BLAST IDs, but +##the tabular output does not (at least, not in BLAST 2.2.25+). +##Therefore in general, Galaxy users won't care about the (internal) +##BLAST identifiers. +## +##The blastdbcmd FASTA output will also contain these IDs, but in the +##context of the BLAST tabular output they are not helpful. Therefore +##to recover the original ID as used in the FASTA file for makeblastdb +##we need a litte post processing. +## +##We remove the NCBI's lcl|... or gnl|BL_ORD_ID|123 prefixes +##using sed, however the exact syntax differs for Mac OS X's sed + +#if str($outfmt)=="blastid": +-out "$seq" +#else if sys.platform == "darwin": +| sed -E 's/^>(lcl\||gnl\|BL_ORD_ID\|[0-9]* )/>/1' > "$seq" +#else: +| sed 's/>\(lcl|\|gnl|BL_ORD_ID|[0-9]* \)/>/1' > "$seq" +#end if + </command> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + <!-- Suspect blastdbcmd sometimes fails to set error level --> + <regex match="Error:" /> + <regex match="EXception:" /> + </stdio> + <inputs> + <conditional name="db_opts"> + <param name="db_type" type="select" label="Type of BLAST database"> + <option value="nucl" selected="True">Nucleotide</option> + <option value="prot">Protein</option> + </param> + <when value="nucl"> + <param name="database" type="select" label="Nucleotide BLAST database"> + <options from_file="blastdb.loc"> + <column name="value" index="0"/> + <column name="name" index="1"/> + <column name="path" index="2"/> + </options> + </param> + </when> + <when value="prot"> + <param name="database" type="select" label="Protein BLAST database"> + <options from_file="blastdb_p.loc"> + <column name="value" index="0"/> + <column name="name" index="1"/> + <column name="path" index="2"/> + </options> + </param> + </when> + </conditional> + <conditional name="id_opts"> + <param name="id_type" type="select" label="Type of identifier list"> + <option value="file">From file</option> + <option value="prompt">User entered</option> + </param> + <when value="file"> + <param name="entries" type="data" format="txt,tabular" label="Sequence identifier(s)" help="Plain text file with one ID per line (i.e. single column tabular file)"/> + </when> + <when value="prompt"> + <param name="entries" type="text" label="Sequence identifier(s)" help="Comma or new line separated list." optional="False" area="True" size="10x30"/> + </when> + </conditional> + <param name="outfmt" type="select" label="Output format"> + <option value="original">FASTA with original identifiers</option> + <option value="blastid">FASTA with BLAST assigned identifiers</option> + </param> + </inputs> + <outputs> + <data name="seq" format="fasta" label="Sequences from ${db_opts.database.fields.name}" /> + </outputs> + <requirements> + <requirement type="binary">blastdbcmd</requirement> + </requirements> + <help> + +**What it does** + +Extracts FASTA formatted sequences from a BLAST database +using the NCBI BLAST+ blastdbcmd command line tool. + +.. class:: warningmark + +**BLAST assigned identifiers** + +When a BLAST database is constructed from a FASTA file, the +original identifiers can be replaced with BLAST assigned +identifiers, partly to ensure uniqueness. e.g. Sometimes +a prefix of 'lcl|' is added (lcl is short for local), +or an arbitrary name starting 'gnl|BL_ORD_ID|' is created. + +If you are using the tabular output from BLAST, it will contain +the original identifiers - not the BLAST assigned identifiers +suitable for use with the blastdbcmd tool. + +If you are using the XML or plain text output, this will also +contain the BLAST assigned identifiers. However, this means +getting a list of BLAST assigned identifiers isn't straightforward. + +------- + +**References** + +Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402. + +Schaffer et al. Improving the accuracy of PSI-BLAST protein database searches with composition-based statistics and other refinements. 2001. Nucleic Acids Res. 29:2994-3005. + + </help> +</tool>