5
|
<!--
    Galaxy tool wrapper around the NCBI BLAST+ "blastdbcmd" command line tool.
    It extracts FASTA records for a set of identifiers (given in a file or
    typed in by the user) from a BLAST database listed in blastdb.loc
    (nucleotide) or blastdb_p.loc (protein), writing them to one FASTA output.
-->
<tool id="ncbi_blastdbcmd_wrapper" name="NCBI BLAST+ blastdbcmd entry(s)" version="0.0.3">
    <description>Extract sequence(s) from BLAST database</description>
    <!-- NOTE: keep XML comments out of the command element; use Cheetah
         (hash hash) comments there instead so the command text stays intact. -->
    <command>
## The command is a Cheetah template which allows some Python based syntax.
## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
blastdbcmd -dbtype $db_opts.db_type -db "${db_opts.database.fields.path}"

##TODO: What about -ctrl_a and -target_only as advanced options?

#if $id_opts.id_type=="file":
##Identifiers come from a history dataset, one ID per line
-entry_batch "$id_opts.entries"
#else:
##Perform some simple search/replaces to remove whitespace
##and make it comma separated, and escape any pipe characters
-entry "$id_opts.entries.replace('\r',',').replace('\n',',').replace(' ','').replace(',,',',').replace(',,',',').strip(',').replace('|','\|')"
#end if

##When building a BLAST database, to ensure unique IDs makeblastdb will
##do things like turning a FASTA entry with ID of ERP44 into lcl|ERP44
##(if using -parse_seqids) or simply assign it an ID using the record
##number like gnl|BL_ORD_ID|123 (to cope with duplicate IDs in the FASTA
##file). In -parse_seqids mode, a duplicate FASTA ID gives an error.
##
##The BLAST plain text and XML output will contain these BLAST IDs, but
##the tabular output does not (at least, not in BLAST 2.2.25+).
##Therefore in general, Galaxy users won't care about the (internal)
##BLAST identifiers.
##
##The blastdbcmd FASTA output will also contain these IDs, but in the
##context of the BLAST tabular output they are not helpful. Therefore
##to recover the original ID as used in the FASTA file for makeblastdb
##we need a little post processing.
##
##We remove the NCBI's lcl|... or gnl|BL_ORD_ID|123 prefixes
##using sed, however the exact syntax differs for Mac OS X's sed
##(extended regular expressions via -E) versus the default sed call
##(basic regular expressions). Both patterns are anchored to the start
##of the line so that only the leading identifier of a FASTA header is
##rewritten, never a later greater-than sign within the description.
##
##NOTE(review): when piping through sed the shell normally reports the
##exit status of sed rather than blastdbcmd, so the stdio regex checks
##are presumably what catches blastdbcmd failures in that case.

#if str($outfmt)=="blastid":
-out "$seq"
#else if sys.platform == "darwin":
| sed -E 's/^>(lcl\||gnl\|BL_ORD_ID\|[0-9]* )/>/1' > "$seq"
#else:
| sed 's/^>\(lcl|\|gnl|BL_ORD_ID|[0-9]* \)/>/1' > "$seq"
#end if
    </command>
    <!-- Error detection: non-zero exit codes plus tell-tale strings in the
         tool's output streams. -->
    <stdio>
        <!-- Anything other than zero is an error -->
        <exit_code range="1:" />
        <exit_code range=":-1" />
        <!-- Suspect blastdbcmd sometimes fails to set error level -->
        <regex match="Error:" />
        <regex match="Exception:" />
    </stdio>
    <inputs>
        <!-- Database choice: the db_type selector decides which .loc file
             supplies the list of databases; column 2 of that file is the
             path passed to blastdbcmd via -db. -->
        <conditional name="db_opts">
            <param name="db_type" type="select" label="Type of BLAST database">
                <option value="nucl" selected="True">Nucleotide</option>
                <option value="prot">Protein</option>
            </param>
            <when value="nucl">
                <param name="database" type="select" label="Nucleotide BLAST database">
                    <options from_file="blastdb.loc">
                        <column name="value" index="0"/>
                        <column name="name" index="1"/>
                        <column name="path" index="2"/>
                    </options>
                </param>
            </when>
            <when value="prot">
                <param name="database" type="select" label="Protein BLAST database">
                    <options from_file="blastdb_p.loc">
                        <column name="value" index="0"/>
                        <column name="name" index="1"/>
                        <column name="path" index="2"/>
                    </options>
                </param>
            </when>
        </conditional>
        <!-- Identifier source: either a dataset (used with -entry_batch) or
             free text typed by the user (cleaned up and used with -entry). -->
        <conditional name="id_opts">
            <param name="id_type" type="select" label="Type of identifier list">
                <option value="file">From file</option>
                <option value="prompt">User entered</option>
            </param>
            <when value="file">
                <param name="entries" type="data" format="txt,tabular" label="Sequence identifier(s)" help="Plain text file with one ID per line (i.e. single column tabular file)"/>
            </when>
            <when value="prompt">
                <param name="entries" type="text" label="Sequence identifier(s)" help="Comma or new line separated list." optional="False" area="True" size="10x30"/>
            </when>
        </conditional>
        <!-- "original" strips the BLAST assigned ID prefixes with sed;
             "blastid" writes the blastdbcmd output unmodified. -->
        <param name="outfmt" type="select" label="Output format">
            <option value="original">FASTA with original identifiers</option>
            <option value="blastid">FASTA with BLAST assigned identifiers</option>
        </param>
    </inputs>
    <outputs>
        <!-- Single FASTA dataset, labelled with the chosen database's name -->
        <data name="seq" format="fasta" label="Sequences from ${db_opts.database.fields.name}" />
    </outputs>
    <requirements>
        <requirement type="binary">blastdbcmd</requirement>
    </requirements>
    <help>

**What it does**

Extracts FASTA formatted sequences from a BLAST database
using the NCBI BLAST+ blastdbcmd command line tool.

.. class:: warningmark

**BLAST assigned identifiers**

When a BLAST database is constructed from a FASTA file, the
original identifiers can be replaced with BLAST assigned
identifiers, partly to ensure uniqueness. e.g. Sometimes
a prefix of 'lcl|' is added (lcl is short for local),
or an arbitrary name starting 'gnl|BL_ORD_ID|' is created.

If you are using the tabular output from BLAST, it will contain
the original identifiers - not the BLAST assigned identifiers
suitable for use with the blastdbcmd tool.

If you are using the XML or plain text output, this will also
contain the BLAST assigned identifiers. However, getting a list
of BLAST assigned identifiers out of these formats isn't
straightforward.

-------

**References**

Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402.

Schaffer et al. Improving the accuracy of PSI-BLAST protein database searches with composition-based statistics and other refinements. 2001. Nucleic Acids Res. 29:2994-3005.

    </help>
</tool>
|