annotate megablast_wrapper.py @ 1:fb2e0e1dac89 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/megablast_wrapper commit a1517c9d22029095120643bbe2c8fa53754dd2b7
author devteam
date Wed, 11 Nov 2015 12:20:16 -0500
parents dc7b4acb3fa6
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
1 #!/usr/bin/env python
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
2 """
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
3 run megablast for metagenomics data
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
4
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
5 usage: %prog [options]
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
6 -d, --db_build=d: The database to use
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
7 -i, --input=i: Input FASTQ candidate file
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
8 -w, --word_size=w: Size of best perfect match
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
9 -c, --identity_cutoff=c: Report hits at or above this identity
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
10 -e, --eval_cutoff=e: Expectation value cutoff
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
11 -f, --filter_query=f: Filter out low complexity regions
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
12 -x, --index_dir=x: Data index directory
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
13 -o, --output=o: Output file
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
14
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
15 usage: %prog db_build input_file word_size identity_cutoff eval_cutoff filter_query index_dir output_file
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
16 """
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
17
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
18 # This version (April 26, 2012) replaces megablast with blast+ blastn
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
19 # There is now no need to augment NCBI-formatted databases and these can be
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
20 # directly downloaded from NCBI ftp site
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
21
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
22 import os, subprocess, sys, tempfile
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
23 from bx.cookbook import doc_optparse
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
24
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
25 assert sys.version_info[:2] >= ( 2, 4 )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
26
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
27 def stop_err( msg ):
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
28 sys.stderr.write( "%s\n" % msg )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
29 sys.exit()
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
30
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
31 def __main__():
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
32 #Parse Command Line
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
33 options, args = doc_optparse.parse( __doc__ )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
34 query_filename = options.input.strip() # -query
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
35 output_filename = options.output.strip() # -out
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
36 mega_word_size = options.word_size # -word_size
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
37 mega_iden_cutoff = options.identity_cutoff # -perc_identity
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
38 mega_evalue_cutoff = options.eval_cutoff # -evalue
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
39 mega_temp_output = tempfile.NamedTemporaryFile().name
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
40 GALAXY_DATA_INDEX_DIR = options.index_dir
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
41 DB_LOC = "%s/blastdb.loc" % GALAXY_DATA_INDEX_DIR
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
42
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
43 # megablast parameters
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
44 try:
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
45 int( mega_word_size )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
46 except:
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
47 stop_err( 'Invalid value for word size' )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
48 try:
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
49 float( mega_iden_cutoff )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
50 except:
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
51 stop_err( 'Invalid value for identity cut-off' )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
52 try:
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
53 float( mega_evalue_cutoff )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
54 except:
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
55 stop_err( 'Invalid value for Expectation value' )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
56
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
57 if not os.path.exists( os.path.split( options.db_build )[0] ):
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
58 stop_err( 'Cannot locate the target database directory. Please check your location file.' )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
59
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
60 try:
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
61 threads = int( os.environ['GALAXY_SLOTS'])
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
62 except Exception:
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
63 threads = 8
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
64 # arguments for megablast
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
65 megablast_command = "blastn -task megablast -db %s -query %s -out %s -outfmt '6 qseqid sgi slen ppos length mismatch gaps qstart qend sstart send evalue bitscore' -num_threads %d -word_size %s -perc_identity %s -evalue %s -dust %s > /dev/null" \
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
66 % ( options.db_build, query_filename, mega_temp_output, threads, mega_word_size, mega_iden_cutoff, mega_evalue_cutoff, options.filter_query )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
67
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
68 print megablast_command
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
69
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
70 tmp = tempfile.NamedTemporaryFile().name
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
71 try:
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
72 tmp_stderr = open( tmp, 'wb' )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
73 proc = subprocess.Popen( args=megablast_command, shell=True, stderr=tmp_stderr.fileno() )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
74 returncode = proc.wait()
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
75 tmp_stderr.close()
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
76 # get stderr, allowing for case where it's very large
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
77 tmp_stderr = open( tmp, 'rb' )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
78 stderr = ''
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
79 buffsize = 1048576
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
80 try:
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
81 while True:
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
82 stderr += tmp_stderr.read( buffsize )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
83 if not stderr or len( stderr ) % buffsize != 0:
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
84 break
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
85 except OverflowError:
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
86 pass
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
87 tmp_stderr.close()
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
88 if returncode != 0:
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
89 raise Exception, stderr
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
90 if os.path.exists( tmp ):
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
91 os.unlink( tmp )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
92 except Exception, e:
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
93 if os.path.exists( mega_temp_output ):
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
94 os.unlink( mega_temp_output )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
95 if os.path.exists( tmp ):
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
96 os.unlink( tmp )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
97 stop_err( 'Cannot execute megaablast. ' + str( e ) )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
98
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
99 output = open( output_filename, 'w' )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
100 invalid_lines = 0
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
101 for i, line in enumerate( file( mega_temp_output ) ):
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
102 line = line.rstrip( '\r\n' )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
103 fields = line.split()
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
104 try:
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
105 # convert the last column (bit-score as this is causing problem in filter tool) to float
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
106 fields[-1] = float( fields[-1] )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
107 new_line = "%s\t%0.1f" % ( '\t'.join( fields[:-1] ), fields[-1] )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
108 except:
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
109 new_line = line
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
110 invalid_lines += 1
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
111 output.write( "%s\n" % new_line )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
112 output.close()
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
113
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
114 if os.path.exists( mega_temp_output ):
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
115 os.unlink( mega_temp_output ) #remove the tempfile that we just reformatted the contents of
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
116
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
117 if invalid_lines:
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
118 print "Unable to parse %d lines. Keep the default format." % invalid_lines
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
119
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
120 # megablast generates a file called error.log, if empty, delete it, if not, show the contents
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
121 if os.path.exists( './error.log' ):
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
122 for i, line in enumerate( file( './error.log' ) ):
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
123 line = line.rstrip( '\r\n' )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
124 print line
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
125 os.remove( './error.log' )
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
126
dc7b4acb3fa6 Imported from capsule None
devteam
parents:
diff changeset
127 if __name__ == "__main__" : __main__()