annotate tools/metag_tools/megablast_wrapper.py @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 run megablast for metagenomics data
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 usage: %prog [options]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 -d, --db_build=d: The database to use
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 -i, --input=i: Input FASTQ candidate file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 -w, --word_size=w: Size of best perfect match
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 -c, --identity_cutoff=c: Report hits at or above this identity
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 -e, --eval_cutoff=e: Expectation value cutoff
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 -f, --filter_query=f: Filter out low complexity regions
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 -x, --index_dir=x: Data index directory
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 -o, --output=o: Output file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 usage: %prog db_build input_file word_size identity_cutoff eval_cutoff filter_query index_dir output_file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 import os, subprocess, sys, tempfile
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 from galaxy import eggs
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 import pkg_resources; pkg_resources.require( "bx-python" )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 from bx.cookbook import doc_optparse
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 assert sys.version_info[:2] >= ( 2, 4 )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 def stop_err( msg ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 sys.stderr.write( "%s\n" % msg )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 sys.exit()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 def __main__():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 #Parse Command Line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 options, args = doc_optparse.parse( __doc__ )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 query_filename = options.input.strip()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 output_filename = options.output.strip()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 mega_word_size = options.word_size # -W
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 mega_iden_cutoff = options.identity_cutoff # -p
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 mega_evalue_cutoff = options.eval_cutoff # -e
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 mega_temp_output = tempfile.NamedTemporaryFile().name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 GALAXY_DATA_INDEX_DIR = options.index_dir
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 DB_LOC = "%s/blastdb.loc" % GALAXY_DATA_INDEX_DIR
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 # megablast parameters
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 int( mega_word_size )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 stop_err( 'Invalid value for word size' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 float( mega_iden_cutoff )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 stop_err( 'Invalid value for identity cut-off' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 float( mega_evalue_cutoff )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 stop_err( 'Invalid value for Expectation value' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 if not os.path.exists( os.path.split( options.db_build )[0] ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 stop_err( 'Cannot locate the target database directory. Please check your location file.' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 # arguments for megablast
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 megablast_command = "megablast -d %s -i %s -o %s -m 8 -a 8 -W %s -p %s -e %s -F %s > /dev/null" \
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 % ( options.db_build, query_filename, mega_temp_output, mega_word_size, mega_iden_cutoff, mega_evalue_cutoff, options.filter_query )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 print megablast_command
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 tmp = tempfile.NamedTemporaryFile().name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 tmp_stderr = open( tmp, 'wb' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 proc = subprocess.Popen( args=megablast_command, shell=True, stderr=tmp_stderr.fileno() )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 returncode = proc.wait()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 tmp_stderr.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 # get stderr, allowing for case where it's very large
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 tmp_stderr = open( tmp, 'rb' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 stderr = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 buffsize = 1048576
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 while True:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 stderr += tmp_stderr.read( buffsize )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 if not stderr or len( stderr ) % buffsize != 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 except OverflowError:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 pass
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 tmp_stderr.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 if returncode != 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 raise Exception, stderr
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 if os.path.exists( tmp ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 os.unlink( tmp )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 except Exception, e:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 if os.path.exists( mega_temp_output ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 os.unlink( mega_temp_output )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 if os.path.exists( tmp ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 os.unlink( tmp )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 stop_err( 'Error indexing reference sequence. ' + str( e ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 output = open( output_filename, 'w' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 invalid_lines = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 for i, line in enumerate( file( mega_temp_output ) ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 line = line.rstrip( '\r\n' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 fields = line.split()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 # get gi and length of that gi seq
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 gi, gi_len = fields[1].split( '_' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101 # convert the last column (causing problem in filter tool) to float
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102 fields[-1] = float( fields[-1] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103 new_line = "%s\t%s\t%s\t%s\t%0.1f" % ( fields[0], gi, gi_len, '\t'.join( fields[2:-1] ), fields[-1] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 new_line = line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 invalid_lines += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 output.write( "%s\n" % new_line )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108 output.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110 if os.path.exists( mega_temp_output ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 os.unlink( mega_temp_output ) #remove the tempfile that we just reformatted the contents of
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 if invalid_lines:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 print "Unable to parse %d lines. Keep the default format." % invalid_lines
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116 # megablast generates a file called error.log, if empty, delete it, if not, show the contents
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 if os.path.exists( './error.log' ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118 for i, line in enumerate( file( './error.log' ) ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119 line = line.rstrip( '\r\n' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120 print line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121 os.remove( './error.log' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 if __name__ == "__main__" : __main__()