diff tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml @ 5:393a7a35383c draft

Uploaded v0.0.14 adding local BLAST database support. This *requires* the matching update to the blast_datatypes repository. This adds basic wrappers for makeblastdb and blastdbinfo. This update includes work by Edward Kirton.
author peterjc
date Fri, 09 Nov 2012 06:53:55 -0500
parents
children 9dabbfd73c8a
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml	Fri Nov 09 06:53:55 2012 -0500
@@ -0,0 +1,135 @@
+<tool id="ncbi_blastdbcmd_wrapper" name="NCBI BLAST+ blastdbcmd entry(s)" version="0.0.3">
+    <description>Extract sequence(s) from BLAST database</description>
+    <command>
+## The command is a Cheetah template which allows some Python based syntax.
+## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
+##
+## Overall shape of the generated command line:
+##   1. blastdbcmd is pointed at the selected database (type and path);
+##   2. identifiers are passed either as a file (-entry_batch) or inline (-entry);
+##   3. output is written directly with -out, or piped through sed to strip
+##      the BLAST assigned identifier prefixes before being saved.
+blastdbcmd -dbtype $db_opts.db_type -db "${db_opts.database.fields.path}"
+
+##TODO: What about -ctrl_a and -target_only as advanced options?
+
+#if $id_opts.id_type=="file":
+-entry_batch "$id_opts.entries"
+#else:
+##Perform some simple search/replaces to remove whitespace
+##and make it comma separated, and escape any pipe characters
+-entry "$id_opts.entries.replace('\r',',').replace('\n',',').replace(' ','').replace(',,',',').replace(',,',',').strip(',').replace('|','\|')"
+#end if
+
+##When building a BLAST database, to ensure unique IDs makeblastdb will
+##do things like turning a FASTA entry with ID of ERP44 into lcl|ERP44
+##(if using -parse_seqids) or simply assign it an ID using the record
+##number like gnl|BL_ORD_ID|123 (to cope with duplicate IDs in the FASTA
+##file). In -parse_seqids mode, a duplicate FASTA ID gives an error.
+##
+##The BLAST plain text and XML output will contain these BLAST IDs, but
+##the tabular output does not (at least, not in BLAST 2.2.25+).
+##Therefore in general, Galaxy users won't care about the (internal)
+##BLAST identifiers.
+##
+##The blastdbcmd FASTA output will also contain these IDs, but in the
+##context of the BLAST tabular output they are not helpful. Therefore
+##to recover the original ID as used in the FASTA file for makeblastdb
+##we need a little post processing.
+##
+##We remove the NCBI's lcl|... or gnl|BL_ORD_ID|123 prefixes
+##using sed, however the exact syntax differs for Mac OS X's sed
+##
+##Both expressions are anchored with ^ so that only the identifier at the
+##very start of a FASTA header line is rewritten; a later occurrence of
+##">lcl|" or ">gnl|BL_ORD_ID|N " inside the description is left untouched.
+
+#if str($outfmt)=="blastid":
+-out "$seq"
+#else if sys.platform == "darwin":
+| sed -E 's/^>(lcl\||gnl\|BL_ORD_ID\|[0-9]* )/>/1' > "$seq"
+#else:
+| sed 's/^>\(lcl|\|gnl|BL_ORD_ID|[0-9]* \)/>/1' > "$seq"
+#end if
+    </command>
+    <stdio>
+        <!-- Anything other than zero is an error -->
+        <exit_code range="1:" />
+        <exit_code range=":-1" />
+        <!-- Suspect blastdbcmd sometimes fails to set error level, so the
+             tool output is additionally scanned for these failure markers -->
+        <regex match="Error:" />
+        <regex match="Exception:" />
+    </stdio>
+    <inputs>
+        <!-- Database selection: the chosen database type decides which .loc
+             file supplies the list of databases. Each .loc row provides a
+             value (column 0), a display name (column 1) and a path (column 2). -->
+        <conditional name="db_opts">
+            <param name="db_type" type="select" label="Type of BLAST database">
+                <option value="nucl" selected="True">Nucleotide</option>
+                <option value="prot">Protein</option>
+            </param>
+            <when value="nucl">
+                <param name="database"
+                       type="select"
+                       label="Nucleotide BLAST database">
+                    <options from_file="blastdb.loc">
+                        <column name="value" index="0"/>
+                        <column name="name" index="1"/>
+                        <column name="path" index="2"/>
+                    </options>
+                </param>
+            </when>
+            <when value="prot">
+                <param name="database"
+                       type="select"
+                       label="Protein BLAST database">
+                    <options from_file="blastdb_p.loc">
+                        <column name="value" index="0"/>
+                        <column name="name" index="1"/>
+                        <column name="path" index="2"/>
+                    </options>
+                </param>
+            </when>
+        </conditional>
+        <!-- Identifier source: either a dataset holding one ID per line, or
+             free text typed by the user. Both branches expose the value under
+             the same parameter name, "entries". -->
+        <conditional name="id_opts">
+            <param name="id_type" type="select" label="Type of identifier list">
+                <option value="file">From file</option>
+                <option value="prompt">User entered</option>
+            </param>
+            <when value="file">
+                <param name="entries"
+                       type="data"
+                       format="txt,tabular"
+                       label="Sequence identifier(s)"
+                       help="Plain text file with one ID per line (i.e. single column tabular file)"/>
+            </when>
+            <when value="prompt">
+                <param name="entries"
+                       type="text"
+                       label="Sequence identifier(s)"
+                       help="Comma or new line separated list."
+                       optional="False"
+                       area="True"
+                       size="10x30"/>
+            </when>
+        </conditional>
+        <!-- Output flavour: "blastid" keeps the BLAST assigned identifiers,
+             "original" has the command strip them again. -->
+        <param name="outfmt" type="select" label="Output format">
+            <option value="original">FASTA with original identifiers</option>
+            <option value="blastid">FASTA with BLAST assigned identifiers</option>
+        </param>
+    </inputs>
+    <outputs>
+        <!-- Single FASTA dataset; its label embeds the display name of the
+             database chosen in the db_opts conditional. -->
+        <data name="seq"
+              format="fasta"
+              label="Sequences from ${db_opts.database.fields.name}" />
+    </outputs>
+    <requirements>
+        <!-- The command template invokes the blastdbcmd executable -->
+        <requirement type="binary">blastdbcmd</requirement>
+    </requirements>
+    <help>
+    
+**What it does**
+
+Extracts FASTA formatted sequences from a BLAST database
+using the NCBI BLAST+ blastdbcmd command line tool.
+
+.. class:: warningmark
+
+**BLAST assigned identifiers**
+
+When a BLAST database is constructed from a FASTA file, the
+original identifiers can be replaced with BLAST assigned
+identifiers, partly to ensure uniqueness. For example, sometimes
+a prefix of 'lcl|' is added (lcl is short for local),
+or an arbitrary name starting 'gnl|BL_ORD_ID|' is created.
+
+If you are using the tabular output from BLAST, it will contain
+the original identifiers - not the BLAST assigned identifiers
+suitable for use with the blastdbcmd tool.
+
+If you are using the XML or plain text output, this will also
+contain the BLAST assigned identifiers. However, extracting a
+list of BLAST assigned identifiers from these formats isn't straightforward.
+
+-------
+
+**References**
+
+Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402.
+
+Schaffer et al. Improving the accuracy of PSI-BLAST protein database searches with composition-based statistics and other refinements. 2001. Nucleic Acids Res. 29:2994-3005.
+
+    </help>
+</tool>