# HG changeset patch # User peterjc # Date 1366897117 14400 # Node ID 9dabbfd73c8ac5b9b08a02ffa9a1927e11494a4e # Parent 1f546099212f04f539fd44e97f15b57a84756065 Uploaded v0.0.19, adds wrappers for rpsblast and rpstblastn with new blastdb_d.loc file for their protein domain database. Also includes other minor improvements. diff -r 1f546099212f -r 9dabbfd73c8a tool-data/blastdb.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/blastdb.loc.sample Thu Apr 25 09:38:37 2013 -0400 @@ -0,0 +1,39 @@ +#This is a sample file distributed with Galaxy that is used to define a +#list of nucleotide BLAST databases, using three columns tab separated +#(longer whitespace are TAB characters): +# +# +# +#The captions typically contain spaces and might end with the build date. +#It is important that the actual database name does not have a space in +#it, and that there are only two tabs on each line. +# +#So, for example, if your database is nt and the path to your base name +#is /depot/data2/galaxy/blastdb/nt/nt.chunk, then the blastdb.loc entry +#would look like this: +# +#nt_02_Dec_2009 nt 02 Dec 2009 /depot/data2/galaxy/blastdb/nt/nt.chunk +# +#and your /depot/data2/galaxy/blastdb/nt directory would contain all of +#your "base names" (e.g.): +# +#-rw-r--r-- 1 wychung galaxy 23437408 2008-04-09 11:26 nt.chunk.00.nhr +#-rw-r--r-- 1 wychung galaxy 3689920 2008-04-09 11:26 nt.chunk.00.nin +#-rw-r--r-- 1 wychung galaxy 251215198 2008-04-09 11:26 nt.chunk.00.nsq +#...etc... +# +#Your blastdb.loc file should include an entry per line for each "base name" +#you have stored. For example: +# +#nt_02_Dec_2009 nt 02 Dec 2009 /depot/data2/galaxy/blastdb/nt/nt.chunk +#wgs_30_Nov_2009 wgs 30 Nov 2009 /depot/data2/galaxy/blastdb/wgs/wgs.chunk +#test_20_Sep_2008 test 20 Sep 2008 /depot/data2/galaxy/blastdb/test/test +#...etc... 
+# +#You can download the NCBI provided nucleotide databases like NT from here: +#ftp://ftp.ncbi.nlm.nih.gov/blast/db/ +# +#See also blastdb_p.loc which is for any protein BLAST database, and +#blastdb_d.loc which is for any protein domains databases (like CDD). + + diff -r 1f546099212f -r 9dabbfd73c8a tool-data/blastdb_d.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/blastdb_d.loc.sample Thu Apr 25 09:38:37 2013 -0400 @@ -0,0 +1,35 @@ +#This is a sample file distributed with Galaxy that is used to define a +#list of protein domain databases, using three columns tab separated +#(longer whitespace are TAB characters): +# +# +# +#The captions typically contain spaces and might end with the build date. +#It is important that the actual database name does not have a space in it, +#and that there are only two tabs on each line. +# +#You can download the NCBI provided databases as tar-balls from here: +#ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/ +# +#So, for example, if your database is CDD and the path to your base name +#is /data/blastdb/Cdd, then the blastdb_d.loc entry would look like this: +# +#Cdd{tab}NCBI Conserved Domains Database (CDD){tab}/data/blastdb/Cdd +# +#and your /data/blastdb directory would contain all of the files associated +#with the database, /data/blastdb/Cdd.*. +# +#Your blastdb_d.loc file should include an entry per line for each "base name" +#you have stored. For example: +# +#Cdd NCBI CDD /data/blastdb/domains/Cdd +#Kog KOG (eukaryotes) /data/blastdb/domains/Kog +#Cog COG (prokaryotes) /data/blastdb/domains/Cog +#Pfam Pfam-A /data/blastdb/domains/Pfam +#Smart SMART /data/blastdb/domains/Smart +#Tigr TIGR /data/blastdb/domains/Tigr +#Prk Protein Clusters database /data/blastdb/domains/Prk +#...etc... +# +#See also blastdb.loc which is for any nucleotide BLAST database, and +#blastdb_p.loc which is for any protein BLAST databases. 
diff -r 1f546099212f -r 9dabbfd73c8a tool-data/blastdb_p.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/blastdb_p.loc.sample Thu Apr 25 09:38:37 2013 -0400 @@ -0,0 +1,30 @@ +#This is a sample file distributed with Galaxy that is used to define a +#list of protein BLAST databases, using three columns tab separated +#(longer whitespace are TAB characters): +# +# +# +#The captions typically contain spaces and might end with the build date. +#It is important that the actual database name does not have a space in +#it, and that there are only two tabs on each line. +# +#So, for example, if your database is NR and the path to your base name +#is /data/blastdb/nr, then the blastdb_p.loc entry would look like this: +# +#nr{tab}NCBI NR (non redundant){tab}/data/blastdb/nr +# +#and your /data/blastdb directory would contain all of the files associated +#with the database, /data/blastdb/nr.*. +# +#Your blastdb_p.loc file should include an entry per line for each "base name" +#you have stored. For example: +# +#nr_05Jun2010 NCBI NR (non redundant) 05 Jun 2010 /data/blastdb/05Jun2010/nr +#nr_15Aug2010 NCBI NR (non redundant) 15 Aug 2010 /data/blastdb/15Aug2010/nr +#...etc... +# +#You can download the NCBI provided protein databases like NR from here: +#ftp://ftp.ncbi.nlm.nih.gov/blast/db/ +# +#See also blastdb.loc which is for any nucleotide BLAST database, and +#blastdb_d.loc which is for any protein domains databases (like CDD). diff -r 1f546099212f -r 9dabbfd73c8a tools/ncbi_blast_plus/blastdb.loc.sample --- a/tools/ncbi_blast_plus/blastdb.loc.sample Tue Feb 19 12:49:43 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ -#This is a sample file distributed with Galaxy that is used to define a -#list of nucleotide BLAST databases, using three columns tab separated -#(longer whitespace are TAB characters): -# -# -# -#The captions typically contain spaces and might end with the build date. 
-#It is important that the actual database name does not have a space in it, -#and that the first tab that appears in the line is right before the path. -# -#So, for example, if your database is nt and the path to your base name -#is /depot/data2/galaxy/blastdb/nt/nt.chunk, then the blastdb.loc entry -#would look like this: -# -#nt_02_Dec_2009 nt 02 Dec 2009 /depot/data2/galaxy/blastdb/nt/nt.chunk -# -#and your /depot/data2/galaxy/blastdb/nt directory would contain all of -#your "base names" (e.g.): -# -#-rw-r--r-- 1 wychung galaxy 23437408 2008-04-09 11:26 nt.chunk.00.nhr -#-rw-r--r-- 1 wychung galaxy 3689920 2008-04-09 11:26 nt.chunk.00.nin -#-rw-r--r-- 1 wychung galaxy 251215198 2008-04-09 11:26 nt.chunk.00.nsq -#...etc... -# -#Your blastdb.loc file should include an entry per line for each "base name" -#you have stored. For example: -# -#nt_02_Dec_2009 nt 02 Dec 2009 /depot/data2/galaxy/blastdb/nt/nt.chunk -#wgs_30_Nov_2009 wgs 30 Nov 2009 /depot/data2/galaxy/blastdb/wgs/wgs.chunk -#test_20_Sep_2008 test 20 Sep 2008 /depot/data2/galaxy/blastdb/test/test -#...etc... -# -#See also blastdb_p.loc which is for any protein BLAST database. -# -#Note that for backwards compatibility with workflows, the unique ID of -#an entry must be the path that was in the original loc file, because that -#is the value stored in the workflow for that parameter. -# diff -r 1f546099212f -r 9dabbfd73c8a tools/ncbi_blast_plus/blastdb_p.loc.sample --- a/tools/ncbi_blast_plus/blastdb_p.loc.sample Tue Feb 19 12:49:43 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ -#This is a sample file distributed with Galaxy that is used to define a -#list of protein BLAST databases, using three columns tab separated -#(longer whitespace are TAB characters): -# -# -# -#The captions typically contain spaces and might end with the build date. 
-#It is important that the actual database name does not have a space in it, -#and that the first tab that appears in the line is right before the path. -# -#So, for example, if your database is NR and the path to your base name -#is /data/blastdb/nr, then the blastdb_p.loc entry would look like this: -# -#nr NCBI NR (non redundant) /data/blastdb/nr -# -#and your /data/blastdb directory would contain all of the files associated -#with the database, /data/blastdb/nr.*. -# -#Your blastdb_p.loc file should include an entry per line for each "base name" -#you have stored. For example: -# -#nr_05Jun2010 NCBI NR (non redundant) 05 Jun 2010 /data/blastdb/05Jun2010/nr -#nr_15Aug2010 NCBI NR (non redundant) 15 Aug 2010 /data/blastdb/15Aug2010/nr -#...etc... -# -#See also blastdb.loc which is for any nucleotide BLAST database. -# diff -r 1f546099212f -r 9dabbfd73c8a tools/ncbi_blast_plus/ncbi_blast_plus.txt --- a/tools/ncbi_blast_plus/ncbi_blast_plus.txt Tue Feb 19 12:49:43 2013 -0500 +++ b/tools/ncbi_blast_plus/ncbi_blast_plus.txt Thu Apr 25 09:38:37 2013 -0400 @@ -6,9 +6,9 @@ See the licence text below. Currently tested with NCBI BLAST 2.2.26+ (i.e. version 2.2.26 of BLAST+), -and do not work with the NCBI 'legacy' BLAST suite (e.g. blastall). +and does not work with the NCBI 'legacy' BLAST suite (e.g. blastall). -Note that these wrappers (and the associated datetypes) were originally +Note that these wrappers (and the associated datatypes) were originally distributed as part of the main Galaxy repository, but as of August 2012 moved to the Galaxy Tool Shed as 'ncbi_blast_plus' (and 'blast_datatypes'). My thanks to Dannon Baker from the Galaxy development team for his assistance @@ -25,16 +25,21 @@ You must tell Galaxy about any system level BLAST databases using configuration files blastdb.loc (nucleotide databases like NT) and blastdb_p.loc (protein -databases like NR), located in the tool-data folder. Sample fils are included -which explain the tab based format to use. 
+databases like NR), and blastdb_d.loc (protein domain databases like CDD or +SMART) which are located in the tool-data/ folder. Sample files are included +which explain the tab-based format to use. + +You can download the NCBI provided databases as tar-balls from here: +ftp://ftp.ncbi.nlm.nih.gov/blast/db/ (nucleotide and protein databases like NR) +ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/ (domain databases like CDD) Manual Installation =================== For those not using Galaxy's automated installation from the Tool Shed, put -the XML and Python files under tools/ncbi_blast_plus and add the XML files -to your tool_conf.xml as normal (and do the same in tool_conf.xml.sample +the XML and Python files in the tools/ncbi_blast_plus/ folder and add the XML +files to your tool_conf.xml as normal (and do the same in tool_conf.xml.sample in order to run the unit tests). For example, use:
@@ -46,6 +51,8 @@ + +
@@ -53,15 +60,13 @@ defines the BLAST XML file format ('blastxml') and protein and nucleotide BLAST databases composite file formats ('blastdbp' and 'blastdbn'). -You must tell Galaxy about any system level BLAST databases using configuration -files blastdb.loc (nucleotide databases like NT) and blastdb_p.loc (protein -databases like NR), located in the tool-data folder. Sample fils are included -which explain the tab based format to use. +As described above for an automated installation, you must also tell Galaxy +about any system level BLAST databases using the tool-data/blastdb*.loc files. You must install the NCBI BLAST+ standalone tools somewhere on the system path. Currently the unit tests are written using "BLAST 2.2.26+". -Run the functional tests (adusting the section identifier to match your +Run the functional tests (adjusting the section identifier to match your tool_conf.xml.sample file): ./run_functional_tests.sh -sid NCBI_BLAST+-ncbi_blast_plus_tools @@ -89,20 +94,28 @@ v0.0.17 - The BLAST+ search tools now default to extended tabular output (all too often our users where having to re-run searches just to get one of the missing columns like query or subject length) +v0.0.18 - Defensive quoting of filenames in case of spaces (where possible, + BLAST+ handling of some multi-file arguments is problematic). +v0.0.19 - Added wrappers for rpsblast and rpstblastn, and new blastdb_d.loc + for the domain databases they use (e.g. CDD, PFAM or SMART). + - Correct case of exception regular expression (for error handling + fall-back in case the return code is not set properly). + - Clearer naming of output files. 
Developers ========== -This script and related tools are being developed on the following hg branch: -http://bitbucket.org/peterjc/galaxy-central/src/tools +This script and related tools are being developed on the 'tools' branch of the +following Mercurial repository: +https://bitbucket.org/peterjc/galaxy-central/ -For making the "Galaxy Tool Shed" http://community.g2.bx.psu.edu/ tarball I use +For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball I use the following command from the Galaxy root folder: $ ./tools/ncbi_blast_plus/make_ncbi_blast_plus.sh -This similifies ensuring a consistent set of files is bundled each time, +This simplifies ensuring a consistent set of files is bundled each time, including all the relevant test files. diff -r 1f546099212f -r 9dabbfd73c8a tools/ncbi_blast_plus/ncbi_blastdbcmd_info.xml --- a/tools/ncbi_blast_plus/ncbi_blastdbcmd_info.xml Tue Feb 19 12:49:43 2013 -0500 +++ b/tools/ncbi_blast_plus/ncbi_blastdbcmd_info.xml Thu Apr 25 09:38:37 2013 -0400 @@ -1,7 +1,12 @@ - + Show BLAST database information from blastdbcmd + + blastdbcmd + blast+ + + blastdbcmd -version -blastdbcmd -dbtype $db_opts.db_type -db "${db_opts.database.fields.path}" -info -out $info +blastdbcmd -dbtype $db_opts.db_type -db "${db_opts.database.fields.path}" -info -out "$info" @@ -9,7 +14,7 @@ - + @@ -40,9 +45,6 @@ - - blastdbcmd - **What it does** diff -r 1f546099212f -r 9dabbfd73c8a tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml --- a/tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml Tue Feb 19 12:49:43 2013 -0500 +++ b/tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml Thu Apr 25 09:38:37 2013 -0400 @@ -1,5 +1,10 @@ - + Extract sequence(s) from BLAST database + + blastdbcmd + blast+ + + blastdbcmd -version ## The command is a Cheetah template which allows some Python based syntax. ## Lines starting hash hash are comments. 
Galaxy will turn newlines into spaces @@ -48,7 +53,7 @@ - + @@ -95,9 +100,6 @@ - - blastdbcmd - **What it does** diff -r 1f546099212f -r 9dabbfd73c8a tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml --- a/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml Tue Feb 19 12:49:43 2013 -0500 +++ b/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml Thu Apr 25 09:38:37 2013 -0400 @@ -1,7 +1,11 @@ - + Search nucleotide database with nucleotide query sequence(s) + + blastn + blast+ + blastn -version ## The command is a Cheetah template which allows some Python based syntax. @@ -17,7 +21,7 @@ #end if -task $blast_type -evalue $evalue_cutoff --out $output1 +-out "$output1" ##Set the extended list here so if/when we add things, saved workflows are not affected #if str($out_format)=="ext": -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" @@ -47,13 +51,13 @@ - + - + @@ -132,7 +136,7 @@ - + @@ -144,9 +148,6 @@ - - blastn - .. class:: warningmark diff -r 1f546099212f -r 9dabbfd73c8a tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml --- a/tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml Tue Feb 19 12:49:43 2013 -0500 +++ b/tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml Thu Apr 25 09:38:37 2013 -0400 @@ -1,7 +1,11 @@ - + Search protein database with protein query sequence(s) + + blastp + blast+ + blastp -version ## The command is a Cheetah template which allows some Python based syntax. 
@@ -17,7 +21,7 @@ #end if -task $blast_type -evalue $evalue_cutoff --out $output1 +-out "$output1" ##Set the extended list here so if/when we add things, saved workflows are not affected #if str($out_format)=="ext": -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" @@ -48,13 +52,13 @@ - + - + @@ -137,7 +141,7 @@ - + @@ -149,9 +153,6 @@ - - blastp - diff -r 1f546099212f -r 9dabbfd73c8a tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml --- a/tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml Tue Feb 19 12:49:43 2013 -0500 +++ b/tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml Thu Apr 25 09:38:37 2013 -0400 @@ -1,7 +1,11 @@ - + Search protein database with translated nucleotide query sequence(s) + + blastx + blast+ + blastx -version ## The command is a Cheetah template which allows some Python based syntax. @@ -17,7 +21,7 @@ #end if -query_gencode $query_gencode -evalue $evalue_cutoff --out $output1 +-out "$output1" ##Set the extended list here so if/when we add things, saved workflows are not affected #if str($out_format)=="ext": -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" @@ -48,13 +52,13 @@ - + - + @@ -154,7 +158,7 @@ - + @@ -166,9 +170,6 @@ - - blastx - diff -r 1f546099212f -r 9dabbfd73c8a tools/ncbi_blast_plus/ncbi_makeblastdb.xml --- a/tools/ncbi_blast_plus/ncbi_makeblastdb.xml Tue Feb 19 12:49:43 2013 -0500 +++ b/tools/ncbi_blast_plus/ncbi_makeblastdb.xml Thu Apr 25 09:38:37 2013 -0400 @@ -1,8 +1,12 @@ - -Make BLAST database -makeblastdb -version - -makeblastdb -out ${os.path.join($outfile.extra_files_path,'blastdb')} + + Make BLAST database + + makeblastdb + blast+ + + makeblastdb -version + +makeblastdb -out "${os.path.join($outfile.extra_files_path,'blastdb')}" $parse_seqids $hash_index ## Single call to -in with multiple filenames space separated with outer quotes @@ -41,7 +45,7 @@ - + @@ -91,16 +95,13 @@ - + - - makeblastdb - **What it does** diff -r 1f546099212f -r 9dabbfd73c8a 
tools/ncbi_blast_plus/ncbi_rpsblast_wrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ncbi_blast_plus/ncbi_rpsblast_wrapper.xml Thu Apr 25 09:38:37 2013 -0400 @@ -0,0 +1,236 @@ + + Search protein domain database (PSSMs) with protein query sequence(s) + + + + rpsblast + blast+ + + rpsblast -version + +## The command is a Cheetah template which allows some Python based syntax. +## Lines starting hash hash are comments. Galaxy will turn newlines into spaces +rpsblast +-query "$query" +#if $db_opts.db_opts_selector == "db": + -db "${db_opts.database.fields.path}" +#elif $db_opts.db_opts_selector == "histdb": + -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}" +#end if +-evalue $evalue_cutoff +-out "$output1" +##Set the extended list here so if/when we add things, saved workflows are not affected +#if str($out_format)=="ext": + -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" +#else: + -outfmt $out_format +#end if +-num_threads 8 +#if $adv_opts.adv_opts_selector=="advanced": +$adv_opts.filter_query +## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string +## Note -max_target_seqs overrides -num_descriptions and -num_alignments +#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): +-max_target_seqs $adv_opts.max_hits +#end if +#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): +-word_size $adv_opts.word_size +#end if +$adv_opts.parse_deflines +## End of advanced options: +#end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: warningmark + +**Note**. Database searches may take a substantial amount of time. +For large input datasets it is advisable to allow overnight processing. 
+ +----- + +**What it does** + +Search a *protein domain database* using a *protein query*, +using the NCBI BLAST+ rpsblast command line tool. + +The protein domain databases use position-specific scoring matrices +(PSSMs) and are available for a number of domain collections including: + +*CDD* - NCBI curated meta-collection of domains, see +http://www.ncbi.nlm.nih.gov/Structure/cdd/cdd_help.shtml#NCBI_curated_domains + +*Kog* - PSSMs from automatically aligned sequences and sequence +fragments classified in the KOGs resource, the eukaryotic +counterpart to COGs, see http://www.ncbi.nlm.nih.gov/COG/new/ + +*Cog* - PSSMs from automatically aligned sequences and sequence +fragments classified in the COGs resource, which focuses primarily +on prokaryotes, see http://www.ncbi.nlm.nih.gov/COG/new/ + +*Pfam* - PSSMs from Pfam-A seed alignment database, see +http://pfam.sanger.ac.uk/ + +*Smart* - PSSMs from SMART domain alignment database, see +http://smart.embl-heidelberg.de/ + +*Tigr* - PSSMs from TIGRFAM database of protein families, see +http://www.jcvi.org/cms/research/projects/tigrfams/overview/ + +*Prk* - PSSMs from automatically aligned stable clusters in the +Protein Clusters database, see +http://www.ncbi.nlm.nih.gov/proteinclusters?cmd=search&db=proteinclusters + +The exact list of domain databases offered will depend on how your +local Galaxy has been configured. + +----- + +**Output format** + +Because Galaxy focuses on processing tabular data, the default output of this +tool is tabular. 
The standard BLAST+ tabular output contains 12 columns: + +====== ========= ============================================ +Column NCBI name Description +------ --------- -------------------------------------------- + 1 qseqid Query Seq-id (ID of your sequence) + 2 sseqid Subject Seq-id (ID of the database hit) + 3 pident Percentage of identical matches + 4 length Alignment length + 5 mismatch Number of mismatches + 6 gapopen Number of gap openings + 7 qstart Start of alignment in query + 8 qend End of alignment in query + 9 sstart Start of alignment in subject (database hit) + 10 send End of alignment in subject (database hit) + 11 evalue Expectation value (E-value) + 12 bitscore Bit score +====== ========= ============================================ + +The BLAST+ tools can optionally output additional columns of information, +but this takes longer to calculate. Most (but not all) of these columns are +included by selecting the extended tabular output. The extra columns are +included *after* the standard 12 columns. This is so that you can write +workflow filtering steps that accept either the 12 or 24 column tabular +BLAST output. Galaxy now uses this extended 24 column output by default. 
+ +====== ============= =========================================== +Column NCBI name Description +------ ------------- ------------------------------------------- + 13 sallseqid All subject Seq-id(s), separated by a ';' + 14 score Raw score + 15 nident Number of identical matches + 16 positive Number of positive-scoring matches + 17 gaps Total number of gaps + 18 ppos Percentage of positive-scoring matches + 19 qframe Query frame + 20 sframe Subject frame + 21 qseq Aligned part of query sequence + 22 sseq Aligned part of subject sequence + 23 qlen Query sequence length + 24 slen Subject sequence length +====== ============= =========================================== + +The third option is BLAST XML output, which is designed to be parsed by +another program, and is understood by some Galaxy tools. + +You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). +The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. +The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. +The two query anchored outputs show a multiple sequence alignment between the query and all the matches, +and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). + +------- + +**References** + +Marchler-Bauer A, Bryant SH. CD-Search: protein domain annotations on the fly. Nucleic Acids Res. 2004 Jul 1;32(Web Server issue):W327-31. 
+ + + diff -r 1f546099212f -r 9dabbfd73c8a tools/ncbi_blast_plus/ncbi_rpstblastn_wrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ncbi_blast_plus/ncbi_rpstblastn_wrapper.xml Thu Apr 25 09:38:37 2013 -0400 @@ -0,0 +1,237 @@ + + Search protein domain database (PSSMs) with translated nucleotide query sequence(s) + + + + rpstblastn + blast+ + + rpstblastn -version + +## The command is a Cheetah template which allows some Python based syntax. +## Lines starting hash hash are comments. Galaxy will turn newlines into spaces +rpstblastn +-query "$query" +#if $db_opts.db_opts_selector == "db": + -db "${db_opts.database.fields.path}" +#elif $db_opts.db_opts_selector == "histdb": + -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}" +#end if +-evalue $evalue_cutoff +-out "$output1" +##Set the extended list here so if/when we add things, saved workflows are not affected +#if str($out_format)=="ext": + -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" +#else: + -outfmt $out_format +#end if +##Seems rpstblastn does not currently support multiple threads :( +##-num_threads 8 +#if $adv_opts.adv_opts_selector=="advanced": +$adv_opts.filter_query +## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string +## Note -max_target_seqs overrides -num_descriptions and -num_alignments +#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): +-max_target_seqs $adv_opts.max_hits +#end if +#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): +-word_size $adv_opts.word_size +#end if +$adv_opts.parse_deflines +## End of advanced options: +#end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: warningmark + +**Note**. Database searches may take a substantial amount of time. 
+For large input datasets it is advisable to allow overnight processing. + +----- + +**What it does** + +Search a *protein domain database* using a *nucleotide query*, +using the NCBI BLAST+ rpstblastn command line tool. + +The protein domain databases use position-specific scoring matrices +(PSSMs) and are available for a number of domain collections including: + +*CDD* - NCBI curated meta-collection of domains, see +http://www.ncbi.nlm.nih.gov/Structure/cdd/cdd_help.shtml#NCBI_curated_domains + +*Kog* - PSSMs from automatically aligned sequences and sequence +fragments classified in the KOGs resource, the eukaryotic +counterpart to COGs, see http://www.ncbi.nlm.nih.gov/COG/new/ + +*Cog* - PSSMs from automatically aligned sequences and sequence +fragments classified in the COGs resource, which focuses primarily +on prokaryotes, see http://www.ncbi.nlm.nih.gov/COG/new/ + +*Pfam* - PSSMs from Pfam-A seed alignment database, see +http://pfam.sanger.ac.uk/ + +*Smart* - PSSMs from SMART domain alignment database, see +http://smart.embl-heidelberg.de/ + +*Tigr* - PSSMs from TIGRFAM database of protein families, see +http://www.jcvi.org/cms/research/projects/tigrfams/overview/ + +*Prk* - PSSMs from automatically aligned stable clusters in the +Protein Clusters database, see +http://www.ncbi.nlm.nih.gov/proteinclusters?cmd=search&db=proteinclusters + +The exact list of domain databases offered will depend on how your +local Galaxy has been configured. + +----- + +**Output format** + +Because Galaxy focuses on processing tabular data, the default output of this +tool is tabular. 
The standard BLAST+ tabular output contains 12 columns: + +====== ========= ============================================ +Column NCBI name Description +------ --------- -------------------------------------------- + 1 qseqid Query Seq-id (ID of your sequence) + 2 sseqid Subject Seq-id (ID of the database hit) + 3 pident Percentage of identical matches + 4 length Alignment length + 5 mismatch Number of mismatches + 6 gapopen Number of gap openings + 7 qstart Start of alignment in query + 8 qend End of alignment in query + 9 sstart Start of alignment in subject (database hit) + 10 send End of alignment in subject (database hit) + 11 evalue Expectation value (E-value) + 12 bitscore Bit score +====== ========= ============================================ + +The BLAST+ tools can optionally output additional columns of information, +but this takes longer to calculate. Most (but not all) of these columns are +included by selecting the extended tabular output. The extra columns are +included *after* the standard 12 columns. This is so that you can write +workflow filtering steps that accept either the 12 or 24 column tabular +BLAST output. Galaxy now uses this extended 24 column output by default. 
+ +====== ============= =========================================== +Column NCBI name Description +------ ------------- ------------------------------------------- + 13 sallseqid All subject Seq-id(s), separated by a ';' + 14 score Raw score + 15 nident Number of identical matches + 16 positive Number of positive-scoring matches + 17 gaps Total number of gaps + 18 ppos Percentage of positive-scoring matches + 19 qframe Query frame + 20 sframe Subject frame + 21 qseq Aligned part of query sequence + 22 sseq Aligned part of subject sequence + 23 qlen Query sequence length + 24 slen Subject sequence length +====== ============= =========================================== + +The third option is BLAST XML output, which is designed to be parsed by +another program, and is understood by some Galaxy tools. + +You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). +The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. +The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. +The two query anchored outputs show a multiple sequence alignment between the query and all the matches, +and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). + +------- + +**References** + +Marchler-Bauer A, Bryant SH. CD-Search: protein domain annotations on the fly. Nucleic Acids Res. 2004 Jul 1;32(Web Server issue):W327-31. 
+ + + diff -r 1f546099212f -r 9dabbfd73c8a tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml --- a/tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml Tue Feb 19 12:49:43 2013 -0500 +++ b/tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml Thu Apr 25 09:38:37 2013 -0400 @@ -1,7 +1,11 @@ - + Search translated nucleotide database with protein query sequence(s) + + tblastn + blast+ + tblastn -version ## The command is a Cheetah template which allows some Python based syntax. @@ -16,7 +20,7 @@ -subject "$db_opts.subject" #end if -evalue $evalue_cutoff --out $output1 +-out "$output1" ##Set the extended list here so if/when we add things, saved workflows are not affected #if str($out_format)=="ext": -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" @@ -48,13 +52,13 @@ - + - + @@ -154,7 +158,7 @@ - + @@ -166,9 +170,6 @@ - - tblastn - diff -r 1f546099212f -r 9dabbfd73c8a tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml --- a/tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml Tue Feb 19 12:49:43 2013 -0500 +++ b/tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml Thu Apr 25 09:38:37 2013 -0400 @@ -1,7 +1,11 @@ - + Search translated nucleotide database with translated nucleotide query sequence(s) + + tblastx + blast+ + tblastx -version ## The command is a Cheetah template which allows some Python based syntax. @@ -17,7 +21,7 @@ #end if -query_gencode $query_gencode -evalue $evalue_cutoff --out $output1 +-out "$output1" ##Set the extended list here so if/when we add things, saved workflows are not affected #if str($out_format)=="ext": -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" @@ -48,13 +52,13 @@ - + - + @@ -174,7 +178,7 @@ - + @@ -186,9 +190,6 @@ - - tblastx - .. class:: warningmark