comparison tools/metag_tools/megablast_wrapper.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 #!/usr/bin/env python
2 """
3 run megablast for metagenomics data
4
5 usage: %prog [options]
6 -d, --db_build=d: The database to use
7 -i, --input=i: Input FASTQ candidate file
8 -w, --word_size=w: Size of best perfect match
9 -c, --identity_cutoff=c: Report hits at or above this identity
10 -e, --eval_cutoff=e: Expectation value cutoff
11 -f, --filter_query=f: Filter out low complexity regions
12 -x, --index_dir=x: Data index directory
13 -o, --output=o: Output file
14
15 usage: %prog db_build input_file word_size identity_cutoff eval_cutoff filter_query index_dir output_file
16 """
17
18 import os, subprocess, sys, tempfile
19 from galaxy import eggs
20 import pkg_resources; pkg_resources.require( "bx-python" )
21 from bx.cookbook import doc_optparse
22
23 assert sys.version_info[:2] >= ( 2, 4 )
24
25 def stop_err( msg ):
26 sys.stderr.write( "%s\n" % msg )
27 sys.exit()
28
29 def __main__():
30 #Parse Command Line
31 options, args = doc_optparse.parse( __doc__ )
32 query_filename = options.input.strip()
33 output_filename = options.output.strip()
34 mega_word_size = options.word_size # -W
35 mega_iden_cutoff = options.identity_cutoff # -p
36 mega_evalue_cutoff = options.eval_cutoff # -e
37 mega_temp_output = tempfile.NamedTemporaryFile().name
38 GALAXY_DATA_INDEX_DIR = options.index_dir
39 DB_LOC = "%s/blastdb.loc" % GALAXY_DATA_INDEX_DIR
40
41 # megablast parameters
42 try:
43 int( mega_word_size )
44 except:
45 stop_err( 'Invalid value for word size' )
46 try:
47 float( mega_iden_cutoff )
48 except:
49 stop_err( 'Invalid value for identity cut-off' )
50 try:
51 float( mega_evalue_cutoff )
52 except:
53 stop_err( 'Invalid value for Expectation value' )
54
55 if not os.path.exists( os.path.split( options.db_build )[0] ):
56 stop_err( 'Cannot locate the target database directory. Please check your location file.' )
57
58 # arguments for megablast
59 megablast_command = "megablast -d %s -i %s -o %s -m 8 -a 8 -W %s -p %s -e %s -F %s > /dev/null" \
60 % ( options.db_build, query_filename, mega_temp_output, mega_word_size, mega_iden_cutoff, mega_evalue_cutoff, options.filter_query )
61
62 print megablast_command
63
64 tmp = tempfile.NamedTemporaryFile().name
65 try:
66 tmp_stderr = open( tmp, 'wb' )
67 proc = subprocess.Popen( args=megablast_command, shell=True, stderr=tmp_stderr.fileno() )
68 returncode = proc.wait()
69 tmp_stderr.close()
70 # get stderr, allowing for case where it's very large
71 tmp_stderr = open( tmp, 'rb' )
72 stderr = ''
73 buffsize = 1048576
74 try:
75 while True:
76 stderr += tmp_stderr.read( buffsize )
77 if not stderr or len( stderr ) % buffsize != 0:
78 break
79 except OverflowError:
80 pass
81 tmp_stderr.close()
82 if returncode != 0:
83 raise Exception, stderr
84 if os.path.exists( tmp ):
85 os.unlink( tmp )
86 except Exception, e:
87 if os.path.exists( mega_temp_output ):
88 os.unlink( mega_temp_output )
89 if os.path.exists( tmp ):
90 os.unlink( tmp )
91 stop_err( 'Error indexing reference sequence. ' + str( e ) )
92
93 output = open( output_filename, 'w' )
94 invalid_lines = 0
95 for i, line in enumerate( file( mega_temp_output ) ):
96 line = line.rstrip( '\r\n' )
97 fields = line.split()
98 try:
99 # get gi and length of that gi seq
100 gi, gi_len = fields[1].split( '_' )
101 # convert the last column (causing problem in filter tool) to float
102 fields[-1] = float( fields[-1] )
103 new_line = "%s\t%s\t%s\t%s\t%0.1f" % ( fields[0], gi, gi_len, '\t'.join( fields[2:-1] ), fields[-1] )
104 except:
105 new_line = line
106 invalid_lines += 1
107 output.write( "%s\n" % new_line )
108 output.close()
109
110 if os.path.exists( mega_temp_output ):
111 os.unlink( mega_temp_output ) #remove the tempfile that we just reformatted the contents of
112
113 if invalid_lines:
114 print "Unable to parse %d lines. Keep the default format." % invalid_lines
115
116 # megablast generates a file called error.log, if empty, delete it, if not, show the contents
117 if os.path.exists( './error.log' ):
118 for i, line in enumerate( file( './error.log' ) ):
119 line = line.rstrip( '\r\n' )
120 print line
121 os.remove( './error.log' )
122
123 if __name__ == "__main__" : __main__()