0
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 run megablast for metagenomics data
|
|
4
|
|
5 usage: %prog [options]
|
|
6 -d, --db_build=d: The database to use
|
|
7 -i, --input=i: Input FASTQ candidate file
|
|
8 -w, --word_size=w: Size of best perfect match
|
|
9 -c, --identity_cutoff=c: Report hits at or above this identity
|
|
10 -e, --eval_cutoff=e: Expectation value cutoff
|
|
11 -f, --filter_query=f: Filter out low complexity regions
|
|
12 -x, --index_dir=x: Data index directory
|
|
13 -o, --output=o: Output file
|
|
14
|
|
15 usage: %prog db_build input_file word_size identity_cutoff eval_cutoff filter_query index_dir output_file
|
|
16 """
|
|
17
|
|
18 import os, subprocess, sys, tempfile
|
|
19 from galaxy import eggs
|
|
20 import pkg_resources; pkg_resources.require( "bx-python" )
|
|
21 from bx.cookbook import doc_optparse
|
|
22
|
|
23 assert sys.version_info[:2] >= ( 2, 4 )
|
|
24
|
|
25 def stop_err( msg ):
|
|
26 sys.stderr.write( "%s\n" % msg )
|
|
27 sys.exit()
|
|
28
|
|
29 def __main__():
|
|
30 #Parse Command Line
|
|
31 options, args = doc_optparse.parse( __doc__ )
|
|
32 query_filename = options.input.strip()
|
|
33 output_filename = options.output.strip()
|
|
34 mega_word_size = options.word_size # -W
|
|
35 mega_iden_cutoff = options.identity_cutoff # -p
|
|
36 mega_evalue_cutoff = options.eval_cutoff # -e
|
|
37 mega_temp_output = tempfile.NamedTemporaryFile().name
|
|
38 GALAXY_DATA_INDEX_DIR = options.index_dir
|
|
39 DB_LOC = "%s/blastdb.loc" % GALAXY_DATA_INDEX_DIR
|
|
40
|
|
41 # megablast parameters
|
|
42 try:
|
|
43 int( mega_word_size )
|
|
44 except:
|
|
45 stop_err( 'Invalid value for word size' )
|
|
46 try:
|
|
47 float( mega_iden_cutoff )
|
|
48 except:
|
|
49 stop_err( 'Invalid value for identity cut-off' )
|
|
50 try:
|
|
51 float( mega_evalue_cutoff )
|
|
52 except:
|
|
53 stop_err( 'Invalid value for Expectation value' )
|
|
54
|
|
55 if not os.path.exists( os.path.split( options.db_build )[0] ):
|
|
56 stop_err( 'Cannot locate the target database directory. Please check your location file.' )
|
|
57
|
|
58 # arguments for megablast
|
|
59 megablast_command = "megablast -d %s -i %s -o %s -m 8 -a 8 -W %s -p %s -e %s -F %s > /dev/null" \
|
|
60 % ( options.db_build, query_filename, mega_temp_output, mega_word_size, mega_iden_cutoff, mega_evalue_cutoff, options.filter_query )
|
|
61
|
|
62 print megablast_command
|
|
63
|
|
64 tmp = tempfile.NamedTemporaryFile().name
|
|
65 try:
|
|
66 tmp_stderr = open( tmp, 'wb' )
|
|
67 proc = subprocess.Popen( args=megablast_command, shell=True, stderr=tmp_stderr.fileno() )
|
|
68 returncode = proc.wait()
|
|
69 tmp_stderr.close()
|
|
70 # get stderr, allowing for case where it's very large
|
|
71 tmp_stderr = open( tmp, 'rb' )
|
|
72 stderr = ''
|
|
73 buffsize = 1048576
|
|
74 try:
|
|
75 while True:
|
|
76 stderr += tmp_stderr.read( buffsize )
|
|
77 if not stderr or len( stderr ) % buffsize != 0:
|
|
78 break
|
|
79 except OverflowError:
|
|
80 pass
|
|
81 tmp_stderr.close()
|
|
82 if returncode != 0:
|
|
83 raise Exception, stderr
|
|
84 if os.path.exists( tmp ):
|
|
85 os.unlink( tmp )
|
|
86 except Exception, e:
|
|
87 if os.path.exists( mega_temp_output ):
|
|
88 os.unlink( mega_temp_output )
|
|
89 if os.path.exists( tmp ):
|
|
90 os.unlink( tmp )
|
|
91 stop_err( 'Error indexing reference sequence. ' + str( e ) )
|
|
92
|
|
93 output = open( output_filename, 'w' )
|
|
94 invalid_lines = 0
|
|
95 for i, line in enumerate( file( mega_temp_output ) ):
|
|
96 line = line.rstrip( '\r\n' )
|
|
97 fields = line.split()
|
|
98 try:
|
|
99 # get gi and length of that gi seq
|
|
100 gi, gi_len = fields[1].split( '_' )
|
|
101 # convert the last column (causing problem in filter tool) to float
|
|
102 fields[-1] = float( fields[-1] )
|
|
103 new_line = "%s\t%s\t%s\t%s\t%0.1f" % ( fields[0], gi, gi_len, '\t'.join( fields[2:-1] ), fields[-1] )
|
|
104 except:
|
|
105 new_line = line
|
|
106 invalid_lines += 1
|
|
107 output.write( "%s\n" % new_line )
|
|
108 output.close()
|
|
109
|
|
110 if os.path.exists( mega_temp_output ):
|
|
111 os.unlink( mega_temp_output ) #remove the tempfile that we just reformatted the contents of
|
|
112
|
|
113 if invalid_lines:
|
|
114 print "Unable to parse %d lines. Keep the default format." % invalid_lines
|
|
115
|
|
116 # megablast generates a file called error.log, if empty, delete it, if not, show the contents
|
|
117 if os.path.exists( './error.log' ):
|
|
118 for i, line in enumerate( file( './error.log' ) ):
|
|
119 line = line.rstrip( '\r\n' )
|
|
120 print line
|
|
121 os.remove( './error.log' )
|
|
122
|
|
123 if __name__ == "__main__" : __main__()
|