Mercurial > repos > devteam > ncbi_blast_plus
comparison tools/ncbi_blast_plus/ncbi_rpsblast_wrapper.xml @ 9:9dabbfd73c8a draft
Uploaded v0.0.19, adds wrappers for rpsblast and rpstblastn with new blastdb_d.loc file for their protein domain database.
Also includes other minor improvements.
author | peterjc |
---|---|
date | Thu, 25 Apr 2013 09:38:37 -0400 |
parents | |
children | 70e7dcbf6573 |
comparison
equal
deleted
inserted
replaced
8:1f546099212f | 9:9dabbfd73c8a |
---|---|
1 <tool id="ncbi_rpsblast_wrapper" name="NCBI BLAST+ rpsblast" version="0.0.3"> | |
2 <description>Search protein domain database (PSSMs) with protein query sequence(s)</description> | |
3 <!-- If job splitting is enabled, break up the query file into parts (split_mode to_size, split_size 1000) and merge the per-part output1 results --> | |
4 <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" shared_inputs="subject" merge_outputs="output1"></parallelism> | |
5 <requirements> | |
6 <requirement type="binary">rpsblast</requirement> | |
7 <requirement type="package" version="2.2.26+">blast+</requirement> | |
8 </requirements> | |
9 <version_command>rpsblast -version</version_command> | |
10 <command> | |
11 ## The command is a Cheetah template which allows some Python based syntax. | |
12 ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces. The thread count follows the GALAXY_SLOTS environment variable (falling back to 8 when it is unset) instead of a hard-coded value, so the job only uses the cores it was allocated. | |
13 rpsblast | |
14 -query "$query" | |
15 #if $db_opts.db_opts_selector == "db": | |
16 -db "${db_opts.database.fields.path}" | |
17 #elif $db_opts.db_opts_selector == "histdb": | |
18 -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}" | |
19 #end if | |
20 -evalue $evalue_cutoff | |
21 -out "$output1" | |
22 ##Set the extended list here so if/when we add things, saved workflows are not affected | |
23 #if str($out_format)=="ext": | |
24 -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" | |
25 #else: | |
26 -outfmt $out_format | |
27 #end if | |
28 -num_threads "\${GALAXY_SLOTS:-8}" | |
29 #if $adv_opts.adv_opts_selector=="advanced": | |
30 $adv_opts.filter_query | |
31 ## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string | |
32 ## Note -max_target_seqs overrides -num_descriptions and -num_alignments | |
33 #if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): | |
34 -max_target_seqs $adv_opts.max_hits | |
35 #end if | |
36 #if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): | |
37 -word_size $adv_opts.word_size | |
38 #end if | |
39 $adv_opts.parse_deflines | |
40 ## End of advanced options: | |
41 #end if | |
42 </command> | |
43 <stdio> | |
44 <!-- Anything other than zero (positive or negative exit code) is an error --> | |
45 <exit_code range="1:" /> | |
46 <exit_code range=":-1" /> | |
47 <!-- In case the return code has not been set properly, check stderr for these markers too --> | |
48 <regex match="Error:" /> | |
49 <regex match="Exception:" /> | |
50 </stdio> | |
51 <inputs> | |
52 <param name="query" type="data" format="fasta" label="Protein query sequence(s)"/> | |
53 <conditional name="db_opts"> | |
54 <param name="db_opts_selector" type="select" label="Protein domain database (PSSM)"> | |
55 <option value="db" selected="True">Locally installed BLAST database</option> | |
56 <!-- TODO - define new datatype | |
57 <option value="histdb">BLAST protein domain database from your history</option> | |
58 --> | |
59 </param> | |
60 <when value="db"> | |
61 <param name="database" type="select" label="Protein domain database"> | |
62 <options from_file="blastdb_d.loc"> | |
63 <column name="value" index="0"/> | |
64 <column name="name" index="1"/> | |
65 <column name="path" index="2"/> | |
66 </options> | |
67 </param> | |
68 <param name="histdb" type="hidden" value="" /> | |
69 <param name="subject" type="hidden" value="" /> | |
70 </when> | |
71 <!-- TODO - define new datatype | |
72 <when value="histdb"> | |
73 <param name="database" type="hidden" value="" /> | |
74 <param name="histdb" type="data" format="blastdbd" label="Protein domain database" /> | |
75 <param name="subject" type="hidden" value="" /> | |
76 </when> | |
77 --> | |
78 </conditional> | |
79 <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" /> | |
80 <param name="out_format" type="select" label="Output format"> | |
81 <option value="6">Tabular (standard 12 columns)</option> | |
82 <option value="ext" selected="True">Tabular (extended 24 columns)</option> | |
83 <option value="5">BLAST XML</option> | |
84 <option value="0">Pairwise text</option> | |
85 <option value="0 -html">Pairwise HTML</option> | |
86 <option value="2">Query-anchored text</option> | |
87 <option value="2 -html">Query-anchored HTML</option> | |
88 <option value="4">Flat query-anchored text</option> | |
89 <option value="4 -html">Flat query-anchored HTML</option> | |
90 <!-- | |
91 <option value="-outfmt 11">BLAST archive format (ASN.1)</option> | |
92 --> | |
93 </param> | |
94 <conditional name="adv_opts"> | |
95 <param name="adv_opts_selector" type="select" label="Advanced Options"> | |
96 <option value="basic" selected="True">Hide Advanced Options</option> | |
97 <option value="advanced">Show Advanced Options</option> | |
98 </param> | |
99 <when value="basic" /> | |
100 <when value="advanced"> | |
101 <!-- Low-complexity filtering is a plain on/off flag (-seg yes or -seg no). Could use a select (yes, no, other) where other allows setting 'window locut hicut' --> | |
102 <param name="filter_query" type="boolean" label="Filter out low complexity regions (with SEG)" truevalue="-seg yes" falsevalue="-seg no" checked="false" /> | |
103 <!-- max_hits: zero means use the default limits (the command template only passes -max_target_seqs when the value is above zero). Why doesn't optional override a validator? I want to accept an empty string OR a non-negative integer --> | |
104 <param name="max_hits" type="integer" value="0" label="Maximum hits to show" help="Use zero for default limits"> | |
105 <validator type="in_range" min="0" /> | |
106 </param> | |
107 <!-- I'd like word_size to be optional, with minimum 2 for rpsblast. NOTE(review): the in_range validator below only enforces min 0, so a value of 1 is accepted here even though the help text gives 2 as the minimum. --> | |
108 <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 2."> | |
109 <validator type="in_range" min="0" /> | |
110 </param> | |
111 <param name="parse_deflines" type="boolean" label="Should the query and subject defline(s) be parsed?" truevalue="-parse_deflines" falsevalue="" checked="false" help="This affects the formatting of the query/subject ID strings"/> | |
112 </when> | |
113 </conditional> | |
114 </inputs> | |
115 <outputs> | |
116 <data name="output1" format="tabular" label="rpsblast on ${on_string}"> | |
117 <change_format><!-- Tabular is the declared default; each non-tabular out_format value below switches the dataset to txt, html or blastxml --> | |
118 <when input="out_format" value="0" format="txt"/> | |
119 <when input="out_format" value="0 -html" format="html"/> | |
120 <when input="out_format" value="2" format="txt"/> | |
121 <when input="out_format" value="2 -html" format="html"/> | |
122 <when input="out_format" value="4" format="txt"/> | |
123 <when input="out_format" value="4 -html" format="html"/> | |
124 <when input="out_format" value="5" format="blastxml"/> | |
125 </change_format> | |
126 </data> | |
127 </outputs> | |
128 <help> | |
129 | |
130 .. class:: warningmark | |
131 | |
132 **Note**. Database searches may take a substantial amount of time. | |
133 For large input datasets it is advisable to allow overnight processing. | |
134 | |
135 ----- | |
136 | |
137 **What it does** | |
138 | |
139 Search a *protein domain database* using a *protein query*, | |
140 using the NCBI BLAST+ rpsblast command line tool. | |
141 | |
142 The protein domain databases use position-specific scoring matrices | |
143 (PSSMs) and are available for a number of domain collections including: | |
144 | |
145 *CDD* - NCBI curated meta-collection of domains, see | |
146 http://www.ncbi.nlm.nih.gov/Structure/cdd/cdd_help.shtml#NCBI_curated_domains | |
147 | |
148 *Kog* - PSSMs from automatically aligned sequences and sequence | |
149 fragments classified in the KOGs resource, the eukaryotic | |
150 counterpart to COGs, see http://www.ncbi.nlm.nih.gov/COG/new/ | |
151 | |
152 *Cog* - PSSMs from automatically aligned sequences and sequence | |
153 fragments classified in the COGs resource, which focuses primarily | |
154 on prokaryotes, see http://www.ncbi.nlm.nih.gov/COG/new/ | |
155 | |
156 *Pfam* - PSSMs from Pfam-A seed alignment database, see | |
157 http://pfam.sanger.ac.uk/ | |
158 | |
159 *Smart* - PSSMs from SMART domain alignment database, see | |
160 http://smart.embl-heidelberg.de/ | |
161 | |
162 *Tigr* - PSSMs from TIGRFAM database of protein families, see | |
163 http://www.jcvi.org/cms/research/projects/tigrfams/overview/ | |
164 | |
165 *Prk* - PSSMs from automatically aligned stable clusters in the | |
166 Protein Clusters database, see | |
167 http://www.ncbi.nlm.nih.gov/proteinclusters?cmd=search&amp;db=proteinclusters | |
168 | |
169 The exact list of domain databases offered will depend on how your | |
170 local Galaxy has been configured. | |
171 | |
172 ----- | |
173 | |
174 **Output format** | |
175 | |
176 Because Galaxy focuses on processing tabular data, the default output of this | |
177 tool is tabular. The standard BLAST+ tabular output contains 12 columns: | |
178 | |
179 ====== ========= ============================================ | |
180 Column NCBI name Description | |
181 ------ --------- -------------------------------------------- | |
182 1 qseqid Query Seq-id (ID of your sequence) | |
183 2 sseqid Subject Seq-id (ID of the database hit) | |
184 3 pident Percentage of identical matches | |
185 4 length Alignment length | |
186 5 mismatch Number of mismatches | |
187 6 gapopen Number of gap openings | |
188 7 qstart Start of alignment in query | |
189 8 qend End of alignment in query | |
190 9 sstart Start of alignment in subject (database hit) | |
191 10 send End of alignment in subject (database hit) | |
192 11 evalue Expectation value (E-value) | |
193 12 bitscore Bit score | |
194 ====== ========= ============================================ | |
195 | |
196 The BLAST+ tools can optionally output additional columns of information, | |
197 but this takes longer to calculate. Most (but not all) of these columns are | |
198 included by selecting the extended tabular output. The extra columns are | |
199 included *after* the standard 12 columns. This is so that you can write | |
200 workflow filtering steps that accept either the 12 or 24 column tabular | |
201 BLAST output. Galaxy now uses this extended 24 column output by default. | |
202 | |
203 ====== ============= =========================================== | |
204 Column NCBI name Description | |
205 ------ ------------- ------------------------------------------- | |
206 13 sallseqid All subject Seq-id(s), separated by a ';' | |
207 14 score Raw score | |
208 15 nident Number of identical matches | |
209 16 positive Number of positive-scoring matches | |
210 17 gaps Total number of gaps | |
211 18 ppos Percentage of positive-scoring matches | |
212 19 qframe Query frame | |
213 20 sframe Subject frame | |
214 21 qseq Aligned part of query sequence | |
215 22 sseq Aligned part of subject sequence | |
216 23 qlen Query sequence length | |
217 24 slen Subject sequence length | |
218 ====== ============= =========================================== | |
219 | |
220 The third option is BLAST XML output, which is designed to be parsed by | |
221 another program, and is understood by some Galaxy tools. | |
222 | |
223 You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). | |
224 The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. | |
225 The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. | |
226 The two query anchored outputs show a multiple sequence alignment between the query and all the matches, | |
227 and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). | |
228 | |
229 ------- | |
230 | |
231 **References** | |
232 | |
233 Marchler-Bauer A, Bryant SH. CD-Search: protein domain annotations on the fly. Nucleic Acids Res. 2004 Jul 1;32(Web Server issue):W327-31. | |
234 | |
235 </help> | |
236 </tool> |