# HG changeset patch # User peterjc # Date 1352462035 18000 # Node ID 393a7a35383cd19dd11d1c528bc5c7529d67b35d # Parent 9d5beacae92b094c22b439cf0eb9e51edada34b9 Uploaded v0.0.14 adding local BLAST database support. This *requires* the matching update to the blast_datatypes repository. This adds basic wrappers for makeblastdb and blastdbinfo. This update includes work by Edward Kirton. diff -r 9d5beacae92b -r 393a7a35383c tools/ncbi_blast_plus/ncbi_blast_plus.txt --- a/tools/ncbi_blast_plus/ncbi_blast_plus.txt Wed Sep 19 13:08:31 2012 -0400 +++ b/tools/ncbi_blast_plus/ncbi_blast_plus.txt Fri Nov 09 06:53:55 2012 -0500 @@ -42,6 +42,8 @@ e-values v0.0.13 - Use the new error handling options in Galaxy (the previously bundled hide_stderr.py script is no longer needed). +v0.0.14 - Support for makeblastdb and local BLAST databases in the history + (using work from Edward Kirton). Developers diff -r 9d5beacae92b -r 393a7a35383c tools/ncbi_blast_plus/ncbi_blastdbcmd_info.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ncbi_blast_plus/ncbi_blastdbcmd_info.xml Fri Nov 09 06:53:55 2012 -0500 @@ -0,0 +1,63 @@ + + Show BLAST database information from blastdbcmd + +blastdbcmd -dbtype $db_opts.db_type -db "${db_opts.database.fields.path}" -info -out $info + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + blastdbcmd + + + +**What it does** + +Calls the NCBI BLAST+ blastdbcmd command line tool with the -info +switch to give summary information about a BLAST database, such as +the size (number of sequences and total length) and date. + +------- + +**References** + +Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402. + +Schaffer et al. Improving the accuracy of PSI-BLAST protein database searches with composition-based statistics and other refinements. 2001. Nucleic Acids Res. 29:2994-3005. 
+ + + diff -r 9d5beacae92b -r 393a7a35383c tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml Fri Nov 09 06:53:55 2012 -0500 @@ -0,0 +1,135 @@ + + Extract sequence(s) from BLAST database + +## The command is a Cheetah template which allows some Python based syntax. +## Lines starting hash hash are comments. Galaxy will turn newlines into spaces +blastdbcmd -dbtype $db_opts.db_type -db "${db_opts.database.fields.path}" + +##TODO: What about -ctrl_a and -target_only as advanced options? + +#if $id_opts.id_type=="file": +-entry_batch "$id_opts.entries" +#else: +##Perform some simple search/replaces to remove whitespace +##and make it comma separated, and escape any pipe characters +-entry "$id_opts.entries.replace('\r',',').replace('\n',',').replace(' ','').replace(',,',',').replace(',,',',').strip(',').replace('|','\|')" +#end if + +##When building a BLAST database, to ensure unique IDs makeblastdb will +##do things like turning a FASTA entry with ID of ERP44 into lcl|ERP44 +##(if using -parse_seqids) or simply assign it an ID using the record +##number like gnl|BL_ORD_ID|123 (to cope with duplicate IDs in the FASTA +##file). In -parse_seqids mode, a duplicate FASTA ID gives an error. +## +##The BLAST plain text and XML output will contain these BLAST IDs, but +##the tabular output does not (at least, not in BLAST 2.2.25+). +##Therefore in general, Galaxy users won't care about the (internal) +##BLAST identifiers. +## +##The blastdbcmd FASTA output will also contain these IDs, but in the +##context of the BLAST tabular output they are not helpful. Therefore +##to recover the original ID as used in the FASTA file for makeblastdb +##we need a little post processing. +## +##We remove the NCBI's lcl|... 
or gnl|BL_ORD_ID|123 prefixes +##using sed, however the exact syntax differs for Mac OS X's sed + +#if str($outfmt)=="blastid": +-out "$seq" +#else if sys.platform == "darwin": +| sed -E 's/^>(lcl\||gnl\|BL_ORD_ID\|[0-9]* )/>/1' > "$seq" +#else: +| sed 's/>\(lcl|\|gnl|BL_ORD_ID|[0-9]* \)/>/1' > "$seq" +#end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + blastdbcmd + + + +**What it does** + +Extracts FASTA formatted sequences from a BLAST database +using the NCBI BLAST+ blastdbcmd command line tool. + +.. class:: warningmark + +**BLAST assigned identifiers** + +When a BLAST database is constructed from a FASTA file, the +original identifiers can be replaced with BLAST assigned +identifiers, partly to ensure uniqueness. e.g. Sometimes +a prefix of 'lcl|' is added (lcl is short for local), +or an arbitrary name starting 'gnl|BL_ORD_ID|' is created. + +If you are using the tabular output from BLAST, it will contain +the original identifiers - not the BLAST assigned identifiers +suitable for use with the blastdbcmd tool. + +If you are using the XML or plain text output, this will also +contain the BLAST assigned identifiers. However, this means +getting a list of BLAST assigned identifiers isn't straightforward. + +------- + +**References** + +Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402. + +Schaffer et al. Improving the accuracy of PSI-BLAST protein database searches with composition-based statistics and other refinements. 2001. Nucleic Acids Res. 29:2994-3005. 
+ + + diff -r 9d5beacae92b -r 393a7a35383c tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml --- a/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml Wed Sep 19 13:08:31 2012 -0400 +++ b/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml Fri Nov 09 06:53:55 2012 -0500 @@ -1,7 +1,7 @@ - + Search nucleotide database with nucleotide query sequence(s) - + blastn -version ## The command is a Cheetah template which allows some Python based syntax. @@ -10,6 +10,8 @@ -query "$query" #if $db_opts.db_opts_selector == "db": -db "${db_opts.database.fields.path}" +#elif $db_opts.db_opts_selector == "histdb": + -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}" #else: -subject "$db_opts.subject" #end if @@ -40,15 +42,20 @@ #end if + - + + + + - + + @@ -58,10 +65,17 @@ + + + + + + - + + diff -r 9d5beacae92b -r 393a7a35383c tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml --- a/tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml Wed Sep 19 13:08:31 2012 -0400 +++ b/tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml Fri Nov 09 06:53:55 2012 -0500 @@ -1,7 +1,7 @@ - + Search protein database with protein query sequence(s) - + blastp -version ## The command is a Cheetah template which allows some Python based syntax. @@ -10,6 +10,8 @@ -query "$query" #if $db_opts.db_opts_selector == "db": -db "${db_opts.database.fields.path}" +#elif $db_opts.db_opts_selector == "histdb": + -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}" #else: -subject "$db_opts.subject" #end if @@ -41,15 +43,20 @@ #end if + - + + + + - + + @@ -59,10 +66,17 @@ + + + + + + + diff -r 9d5beacae92b -r 393a7a35383c tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml --- a/tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml Wed Sep 19 13:08:31 2012 -0400 +++ b/tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml Fri Nov 09 06:53:55 2012 -0500 @@ -1,7 +1,7 @@ - + Search protein database with translated nucleotide query sequence(s) - + blastx -version ## The command is a Cheetah template which allows some Python based syntax. 
@@ -10,6 +10,8 @@ -query "$query" #if $db_opts.db_opts_selector == "db": -db "${db_opts.database.fields.path}" +#elif $db_opts.db_opts_selector == "histdb": + -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}" #else: -subject "$db_opts.subject" #end if @@ -41,15 +43,20 @@ #end if + - + + + + - + + @@ -59,10 +66,17 @@ - + + + + + + + - + + diff -r 9d5beacae92b -r 393a7a35383c tools/ncbi_blast_plus/ncbi_makeblastdb.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ncbi_blast_plus/ncbi_makeblastdb.xml Fri Nov 09 06:53:55 2012 -0500 @@ -0,0 +1,121 @@ + +Make BLAST database +makeblastdb -version + +makeblastdb -out ${os.path.join($outfile.extra_files_path,'blastdb')} +$parse_seqids +$hash_index +## Single call to -in with multiple filenames space separated with outer quotes +## (presumably any filenames with spaces would be a problem). Note this gives +## some extra spaces, e.g. -in " file1 file2 file3 " but BLAST seems happy: +-in " +#for $i in $in +${i.file} #end for +" +#if $title: +-title "$title" +#else: +##Would default to being based on the cryptic Galaxy filenames, which is unhelpful +-title "BLAST Database" +#end if +-dbtype $dbtype +## #set $sep = '-mask_data ' +## #for $i in $mask_data +## $sep${i.file} +## #set $set = ', ' +## #end for +## #set $sep = '-gi_mask -gi_mask_name ' +## #for $i in $gi_mask +## $sep${i.file} +## #set $set = ', ' +## #end for +## #if $tax.select == 'id': +## -taxid $tax.id +## #else if $tax.select == 'map': +## -taxid_map $tax.map +## #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + makeblastdb + + +**What it does** + +Make BLAST database from one or more FASTA files and/or BLAST databases. + +This is a wrapper for the NCBI BLAST+ tool 'makeblastdb', which is the +replacement for the 'formatdb' tool in the NCBI 'legacy' BLAST suite. 
+ + + +**Documentation** + +http://www.ncbi.nlm.nih.gov/books/NBK1763/ + + diff -r 9d5beacae92b -r 393a7a35383c tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml --- a/tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml Wed Sep 19 13:08:31 2012 -0400 +++ b/tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml Fri Nov 09 06:53:55 2012 -0500 @@ -1,7 +1,7 @@ - + Search translated nucleotide database with protein query sequence(s) - + tblastn -version ## The command is a Cheetah template which allows some Python based syntax. @@ -10,6 +10,8 @@ -query "$query" #if $db_opts.db_opts_selector == "db": -db "${db_opts.database.fields.path}" +#elif $db_opts.db_opts_selector == "histdb": + -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}" #else: -subject "$db_opts.subject" #end if @@ -41,15 +43,20 @@ #end if + - + + + + - + + @@ -59,10 +66,17 @@ - + + + + + + + - + + diff -r 9d5beacae92b -r 393a7a35383c tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml --- a/tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml Wed Sep 19 13:08:31 2012 -0400 +++ b/tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml Fri Nov 09 06:53:55 2012 -0500 @@ -1,7 +1,7 @@ - + Search translated nucleotide database with translated nucleotide query sequence(s) - + tblastx -version ## The command is a Cheetah template which allows some Python based syntax. @@ -10,6 +10,8 @@ -query "$query" #if $db_opts.db_opts_selector == "db": -db "${db_opts.database.fields.path}" +#elif $db_opts.db_opts_selector == "histdb": + -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}" #else: -subject "$db_opts.subject" #end if @@ -41,15 +43,20 @@ #end if + - + + + + - + + @@ -59,10 +66,17 @@ - + + + + + + + - + +