Mercurial > repos > devteam > ncbi_blast_plus
comparison tools/ncbi_blast_plus/ncbi_makeblastdb.xml @ 13:623f727cdff1 draft
Uploaded v0.1.00, uses BLAST+ 2.2.29, allows custom column selection for tabular output - including taxonomy fields.
author | peterjc |
---|---|
date | Fri, 14 Mar 2014 07:40:46 -0400 |
parents | 4c4a0da938ff |
children | 2fe07f50a41e |
comparison
equal
deleted
inserted
replaced
12:6560192c5098 | 13:623f727cdff1 |
---|---|
1 <tool id="ncbi_makeblastdb" name="NCBI BLAST+ makeblastdb" version="0.0.22"> | 1 <tool id="ncbi_makeblastdb" name="NCBI BLAST+ makeblastdb" version="0.1.00"> |
2 <description>Make BLAST database</description> | 2 <description>Make BLAST database</description> |
3 <macros> | 3 <macros> |
4 <token name="@BINARY@">makeblastdb</token> | 4 <token name="@BINARY@">makeblastdb</token> |
5 <import>ncbi_macros.xml</import> | 5 <import>ncbi_macros.xml</import> |
6 </macros> | 6 </macros> |
7 <expand macro="requirements" /> | 7 <expand macro="requirements" /> |
8 <command interpreter="python">check_no_duplicates.py | 8 <command interpreter="python">check_no_duplicates.py |
9 ##First check for duplicates (since BLAST+ 2.2.28 fails to do so) | 9 ##First check for duplicates (since BLAST+ 2.2.28 fails to do so) |
10 ##and abort (via the ampersand ampersand trick) if any are found. | 10 ##and abort (via the ampersand ampersand trick) if any are found. |
11 #for $i in $in | 11 #for i in $input_file#"${i}" #end for# |
12 "${i.file}" | |
13 #end for | |
14 && | 12 && |
15 makeblastdb -out "${os.path.join($outfile.extra_files_path,'blastdb')}" | 13 makeblastdb -out "${os.path.join($outfile.extra_files_path,'blastdb')}" |
16 $parse_seqids | 14 $parse_seqids |
17 $hash_index | 15 $hash_index |
18 ## Single call to -in with multiple filenames space separated with outer quotes | 16 ## Single call to -in with multiple filenames space separated with outer quotes |
19 ## (presumably any filenames with spaces would be a problem). Note this gives | 17 ## (presumably any filenames with spaces would be a problem). Note this gives |
20 ## some extra spaces, e.g. -in " file1 file2 file3 " but BLAST seems happy: | 18 ## some extra spaces, e.g. -in "file1 file2 file3 " but BLAST seems happy: |
21 -in " | 19 -in "#for i in $input_file#${i} #end for#" |
22 #for $i in $in | |
23 ${i.file} | |
24 #end for | |
25 " | |
26 #if $title: | 20 #if $title: |
27 -title "$title" | 21 -title "$title" |
28 #else: | 22 #else: |
29 ##Would default to being based on the cryptic Galaxy filenames, which is unhelpful | 23 ##Would default to being based on the cryptic Galaxy filenames, which is unhelpful |
30 -title "BLAST Database" | 24 -title "BLAST Database" |
31 #end if | 25 #end if |
32 -dbtype $dbtype | 26 -dbtype $dbtype |
33 #set $mask_string = '' | 27 ## -------------------------------------------------------------------- |
34 #set $sep = '-mask_data ' | 28 ## Masking |
35 #for $i in $mask_data | 29 ## -------------------------------------------------------------------- |
36 #set $mask_string += $sep + str($i.file) | 30 ## HACK: If no mask files, evaluates as a list with just None in it: |
37 #set $sep = ',' | 31 ## See Trello issue https://trello.com/c/lp5YmA1O |
32 #if ' '.join( map(str, $mask_data_file) ) != 'None': | |
33 #for i in $mask_data_file: | |
34 -mask_data "${i}" | |
38 #end for | 35 #end for |
39 $mask_string | 36 #end if |
40 ## #set $gi_mask_string = '' | 37 ## -------------------------------------------------------------------- |
41 ## #set $sep = '-gi_mask -gi_mask_name ' | 38 ## Taxonomy |
42 ## #for $i in $gi_mask | 39 ## -------------------------------------------------------------------- |
43 ## #set $gi_mask_string += $sep + str($i.file) | 40 #if $tax.taxselect == 'id': |
44 ## #set $sep = ',' | 41 -taxid $tax.taxid |
45 ## #end for | 42 ## TODO - Can we use a tabular file for the taxonomy mapping? |
46 ## $gi_mask_string | 43 ## #else if $tax.taxselect == 'map': |
47 ## #if $tax.select == 'id': | 44 ## -taxid_map $tax.taxmap |
48 ## -taxid $tax.id | 45 #end if |
49 ## #else if $tax.select == 'map': | |
50 ## -taxid_map $tax.map | |
51 ## #end if | |
52 ## -------------------------------------------------------------------- | 46 ## -------------------------------------------------------------------- |
53 ## Capture the stdout log information to the primary file (plain text): | 47 ## Capture the stdout log information to the primary file (plain text): |
54 >> "$outfile" | 48 > "$outfile" |
55 </command> | 49 </command> |
56 <expand macro="stdio" /> | 50 <expand macro="stdio" /> |
57 <inputs> | 51 <inputs> |
58 <param name="dbtype" type="select" display="radio" label="Molecule type of input"> | 52 <param name="dbtype" type="select" display="radio" label="Molecule type of input"> |
59 <option value="prot">protein</option> | 53 <option value="prot">protein</option> |
60 <option value="nucl">nucleotide</option> | 54 <option value="nucl">nucleotide</option> |
61 </param> | 55 </param> |
62 <!-- TODO Allow merging of existing BLAST databases (conditional on the database type) | 56 <!-- TODO Allow merging of existing BLAST databases (conditional on the database type)? |
63 NOTE Double check the new database would be self contained first | 57 NOTE Double check the new database would be self contained first |
64 <repeat name="in" title="BLAST or FASTA Database" min="1"> | |
65 <param name="file" type="data" format="fasta,blastdbn,blastdbp" label="BLAST or FASTA database" /> | |
66 </repeat> | |
67 --> | 58 --> |
68 <!-- TODO Switch this to using <param ... multiple="true" /> instead of <repeat> block? --> | 59 <!-- Note this is a mandatory parameter - default should be most recent FASTA file --> |
69 <repeat name="in" title="FASTA file" min="1"> | 60 <param name="input_file" type="data" multiple="true" optional="false" format="fasta" label="Input FASTA files(s)" help="One or more FASTA files" /> |
70 <param name="file" type="data" format="fasta" /> | |
71 </repeat> | |
72 <param name="title" type="text" value="" label="Title for BLAST database" help="This is the database name shown in BLAST search output" /> | 61 <param name="title" type="text" value="" label="Title for BLAST database" help="This is the database name shown in BLAST search output" /> |
73 <param name="parse_seqids" type="boolean" truevalue="-parse_seqids" falsevalue="" checked="False" label="Parse the sequence identifiers" help="This is only advised if your FASTA file follows the NCBI naming conventions using pipe '\|' symbols" /> | 62 <param name="parse_seqids" type="boolean" truevalue="-parse_seqids" falsevalue="" checked="False" label="Parse the sequence identifiers" help="This is only advised if your FASTA file follows the NCBI naming conventions using pipe '\|' symbols" /> |
74 <param name="hash_index" type="boolean" truevalue="-hash_index" falsevalue="" checked="true" label="Enable the creation of sequence hash values" help="These hash values can then be used to quickly determine if a given sequence data exists in this BLAST database." /> | 63 <param name="hash_index" type="boolean" truevalue="-hash_index" falsevalue="" checked="true" label="Enable the creation of sequence hash values" help="These hash values can then be used to quickly determine if a given sequence data exists in this BLAST database." /> |
75 <!-- SEQUENCE MASKING OPTIONS --> | 64 <!-- SEQUENCE MASKING OPTIONS --> |
76 <repeat name="mask_data" title="Masking data file"> | 65 <!-- Note this is an optional parameter - default should be NO files --> |
77 <param name="mask_data_file" type="data" format="maskinfo-asn1,maskinfo-asn1-binary" label="ASN.1 file containing masking data" help="As produced by NCBI masking applications (e.g. dustmasker, segmasker, windowmasker)" /> | 66 <param name="mask_data_file" type="data" multiple="true" optional="true" value="" format="maskinfo-asn1,maskinfo-asn1-binary" label="Optional ASN.1 file(s) containing masking data" help="As produced by NCBI masking applications (e.g. dustmasker, segmasker, windowmasker)" /> |
78 </repeat> | 67 <!-- TODO - Option to create GI indexed masking data? via -gi_mask and -gi_mask_name? --> |
79 <!-- TODO | |
80 <repeat name="gi_mask" title="Create GI indexed masking data"> | |
81 <param name="gi_mask_file" type="data" format="asnb" label="Masking data output file" /> | |
82 </repeat> | |
83 --> | |
84 | |
85 <!-- TAXONOMY OPTIONS --> | 68 <!-- TAXONOMY OPTIONS --> |
86 <!-- TODO | |
87 <conditional name="tax"> | 69 <conditional name="tax"> |
88 <param name="select" type="select" label="Taxonomy options"> | 70 <param name="taxselect" type="select" label="Taxonomy options"> |
89 <option value="">Do not assign sequences to Taxonomy IDs</option> | 71 <option value="">Do not assign a Taxonomy ID to the sequences</option> |
90 <option value="id">Assign all sequences to one Taxonomy ID</option> | 72 <option value="id">Assign the same Taxonomy ID to all the sequences</option> |
73 <!-- | |
91 <option value="map">Supply text file mapping sequence IDs to taxnomy IDs</option> | 74 <option value="map">Supply text file mapping sequence IDs to taxnomy IDs</option> |
75 --> | |
92 </param> | 76 </param> |
93 <when value=""> | 77 <when value=""> |
94 </when> | 78 </when> |
95 <when value="id"> | 79 <when value="id"> |
96 <param name="id" type="integer" value="" label="NCBI taxonomy ID" help="Integer >=0" /> | 80 <param name="taxid" type="integer" value="" label="NCBI taxonomy ID" help="Integer >=0, e.g. 9606 for Homo sapiens" min="0" /> |
97 </when> | 81 </when> |
82 <!-- TODO: File format? | |
98 <when value="map"> | 83 <when value="map"> |
99 <param name="file" type="data" format="txt" label="Seq ID : Tax ID mapping file" help="Format: SequenceId TaxonomyId" /> | 84 <param name="taxmap" type="data" format="txt" label="Seq ID : Tax ID mapping file" help="Format: SequenceId TaxonomyId" /> |
100 </when> | 85 </when> |
86 --> | |
101 </conditional> | 87 </conditional> |
102 --> | |
103 </inputs> | 88 </inputs> |
104 <outputs> | 89 <outputs> |
105 <!-- If we only accepted one FASTA file, we could use its human name here... --> | 90 <!-- If we only accepted one FASTA file, we could use its human name here... --> |
106 <data name="outfile" format="data" label="${dbtype.value_label} BLAST database from ${on_string}"> | 91 <data name="outfile" format="data" label="${dbtype.value_label} BLAST database from ${on_string}"> |
107 <change_format> | 92 <change_format> |
110 </change_format> | 95 </change_format> |
111 </data> | 96 </data> |
112 </outputs> | 97 </outputs> |
113 <tests> | 98 <tests> |
114 <!-- Note the (two line) PIN file is not reproducible run to run. | 99 <!-- Note the (two line) PIN file is not reproducible run to run. |
100 Likewise there is a datestamp in the log file as well, so use contains comparison | |
101 With and without the masking makes no difference. | |
102 With and without the taxid the only real difference is in the *.phr file. | |
115 --> | 103 --> |
116 <test> | 104 <test> |
117 <param name="dbtype" value="prot" /> | 105 <param name="dbtype" value="prot" /> |
118 <param name="file" value="four_human_proteins.fasta" ftype="fasta" /> | 106 <param name="input_file" value="four_human_proteins.fasta" ftype="fasta" /> |
119 <param name="title" value="Just 4 human proteins" /> | 107 <param name="title" value="Just 4 human proteins" /> |
120 <param name="parse_seqids" value="" /> | 108 <param name="parse_seqids" value="" /> |
121 <param name="hash_index" value="true" /> | 109 <param name="hash_index" value="true" /> |
122 <output name="out_file" file="four_human_proteins.fasta.log" ftype="blastdbp" lines_diff="6"> | 110 <output name="out_file" compare="contains" file="four_human_proteins.fasta.log.txt" ftype="blastdbp"> |
111 <extra_files type="file" value="four_human_proteins.fasta.phr" name="blastdb.phr" /> | |
112 <extra_files type="file" value="four_human_proteins.fasta.pin" name="blastdb.pin" lines_diff="2" /> | |
113 <extra_files type="file" value="four_human_proteins.fasta.psq" name="blastdb.psq" /> | |
114 <extra_files type="file" value="four_human_proteins.fasta.pog" name="blastdb.pog" /> | |
115 <extra_files type="file" value="four_human_proteins.fasta.phd" name="blastdb.phd" /> | |
116 <extra_files type="file" value="four_human_proteins.fasta.phi" name="blastdb.phi" /> | |
117 <extra_files type="file" value="four_human_proteins.fasta.psd" name="blastdb.psd" /> | |
118 <extra_files type="file" value="four_human_proteins.fasta.psi" name="blastdb.psi" /> | |
119 </output> | |
120 </test> | |
121 <test> | |
122 <param name="dbtype" value="prot" /> | |
123 <param name="input_file" value="four_human_proteins.fasta" ftype="fasta" /> | |
124 <param name="title" value="Just 4 human proteins" /> | |
125 <param name="parse_seqids" value="" /> | |
126 <param name="hash_index" value="true" /> | |
127 <param name="taxselect" value="id" /> | |
128 <param name="taxid" value="9606" /> | |
129 <output name="out_file" compare="contains" file="four_human_proteins_taxid.fasta.log.txt" ftype="blastdbp"> | |
130 <extra_files type="file" value="four_human_proteins_taxid.fasta.phr" name="blastdb.phr" /> | |
131 <extra_files type="file" value="four_human_proteins_taxid.fasta.pin" name="blastdb.pin" lines_diff="2" /> | |
132 <extra_files type="file" value="four_human_proteins_taxid.fasta.psq" name="blastdb.psq" /> | |
133 <extra_files type="file" value="four_human_proteins_taxid.fasta.pog" name="blastdb.pog" /> | |
134 <extra_files type="file" value="four_human_proteins_taxid.fasta.phd" name="blastdb.phd" /> | |
135 <extra_files type="file" value="four_human_proteins_taxid.fasta.phi" name="blastdb.phi" /> | |
136 <extra_files type="file" value="four_human_proteins_taxid.fasta.psd" name="blastdb.psd" /> | |
137 <extra_files type="file" value="four_human_proteins_taxid.fasta.psi" name="blastdb.psi" /> | |
138 </output> | |
139 </test> | |
140 <test> | |
141 <param name="dbtype" value="prot" /> | |
142 <param name="input_file" value="four_human_proteins.fasta" ftype="fasta" /> | |
143 <param name="title" value="Just 4 human proteins" /> | |
144 <param name="parse_seqids" value="" /> | |
145 <param name="hash_index" value="true" /> | |
146 <param name="mask_data_file" value="segmasker_four_human.maskinfo-asn1" ftype="maskinfo-asn1" /> | |
147 <output name="out_file" compare="contains" file="four_human_proteins.fasta.log.txt" ftype="blastdbp"> | |
123 <extra_files type="file" value="four_human_proteins.fasta.phr" name="blastdb.phr" /> | 148 <extra_files type="file" value="four_human_proteins.fasta.phr" name="blastdb.phr" /> |
124 <extra_files type="file" value="four_human_proteins.fasta.pin" name="blastdb.pin" lines_diff="2" /> | 149 <extra_files type="file" value="four_human_proteins.fasta.pin" name="blastdb.pin" lines_diff="2" /> |
125 <extra_files type="file" value="four_human_proteins.fasta.psq" name="blastdb.psq" /> | 150 <extra_files type="file" value="four_human_proteins.fasta.psq" name="blastdb.psq" /> |
126 <extra_files type="file" value="four_human_proteins.fasta.pog" name="blastdb.pog" /> | 151 <extra_files type="file" value="four_human_proteins.fasta.pog" name="blastdb.pog" /> |
127 <extra_files type="file" value="four_human_proteins.fasta.phd" name="blastdb.phd" /> | 152 <extra_files type="file" value="four_human_proteins.fasta.phd" name="blastdb.phd" /> |