Mercurial > repos > devteam > megablast_wrapper
comparison megablast_wrapper.py @ 0:dc7b4acb3fa6 draft
Imported from capsule None
author | devteam |
---|---|
date | Mon, 19 May 2014 12:33:49 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dc7b4acb3fa6 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 run megablast for metagenomics data | |
4 | |
5 usage: %prog [options] | |
6 -d, --db_build=d: The database to use | |
7 -i, --input=i: Input FASTQ candidate file | |
8 -w, --word_size=w: Size of best perfect match | |
9 -c, --identity_cutoff=c: Report hits at or above this identity | |
10 -e, --eval_cutoff=e: Expectation value cutoff | |
11 -f, --filter_query=f: Filter out low complexity regions | |
12 -x, --index_dir=x: Data index directory | |
13 -o, --output=o: Output file | |
14 | |
15 usage: %prog db_build input_file word_size identity_cutoff eval_cutoff filter_query index_dir output_file | |
16 """ | |
17 | |
18 # This version (April 26, 2012) replaces megablast with blast+ blastn | |
19 # There is now no need to augment NCBI-formatted databases and these can be | |
20 # directly downloaded from NCBI ftp site | |
21 | |
22 import os, subprocess, sys, tempfile | |
23 from bx.cookbook import doc_optparse | |
24 | |
25 assert sys.version_info[:2] >= ( 2, 4 ) | |
26 | |
27 def stop_err( msg ): | |
28 sys.stderr.write( "%s\n" % msg ) | |
29 sys.exit() | |
30 | |
31 def __main__(): | |
32 #Parse Command Line | |
33 options, args = doc_optparse.parse( __doc__ ) | |
34 query_filename = options.input.strip() # -query | |
35 output_filename = options.output.strip() # -out | |
36 mega_word_size = options.word_size # -word_size | |
37 mega_iden_cutoff = options.identity_cutoff # -perc_identity | |
38 mega_evalue_cutoff = options.eval_cutoff # -evalue | |
39 mega_temp_output = tempfile.NamedTemporaryFile().name | |
40 GALAXY_DATA_INDEX_DIR = options.index_dir | |
41 DB_LOC = "%s/blastdb.loc" % GALAXY_DATA_INDEX_DIR | |
42 | |
43 # megablast parameters | |
44 try: | |
45 int( mega_word_size ) | |
46 except: | |
47 stop_err( 'Invalid value for word size' ) | |
48 try: | |
49 float( mega_iden_cutoff ) | |
50 except: | |
51 stop_err( 'Invalid value for identity cut-off' ) | |
52 try: | |
53 float( mega_evalue_cutoff ) | |
54 except: | |
55 stop_err( 'Invalid value for Expectation value' ) | |
56 | |
57 if not os.path.exists( os.path.split( options.db_build )[0] ): | |
58 stop_err( 'Cannot locate the target database directory. Please check your location file.' ) | |
59 | |
60 try: | |
61 threads = int( os.environ['GALAXY_SLOTS']) | |
62 except Exception: | |
63 threads = 8 | |
64 # arguments for megablast | |
65 megablast_command = "blastn -task megablast -db %s -query %s -out %s -outfmt '6 qseqid sgi slen ppos length mismatch gaps qstart qend sstart send evalue bitscore' -num_threads %d -word_size %s -perc_identity %s -evalue %s -dust %s > /dev/null" \ | |
66 % ( options.db_build, query_filename, mega_temp_output, threads, mega_word_size, mega_iden_cutoff, mega_evalue_cutoff, options.filter_query ) | |
67 | |
68 print megablast_command | |
69 | |
70 tmp = tempfile.NamedTemporaryFile().name | |
71 try: | |
72 tmp_stderr = open( tmp, 'wb' ) | |
73 proc = subprocess.Popen( args=megablast_command, shell=True, stderr=tmp_stderr.fileno() ) | |
74 returncode = proc.wait() | |
75 tmp_stderr.close() | |
76 # get stderr, allowing for case where it's very large | |
77 tmp_stderr = open( tmp, 'rb' ) | |
78 stderr = '' | |
79 buffsize = 1048576 | |
80 try: | |
81 while True: | |
82 stderr += tmp_stderr.read( buffsize ) | |
83 if not stderr or len( stderr ) % buffsize != 0: | |
84 break | |
85 except OverflowError: | |
86 pass | |
87 tmp_stderr.close() | |
88 if returncode != 0: | |
89 raise Exception, stderr | |
90 if os.path.exists( tmp ): | |
91 os.unlink( tmp ) | |
92 except Exception, e: | |
93 if os.path.exists( mega_temp_output ): | |
94 os.unlink( mega_temp_output ) | |
95 if os.path.exists( tmp ): | |
96 os.unlink( tmp ) | |
97 stop_err( 'Cannot execute megaablast. ' + str( e ) ) | |
98 | |
99 output = open( output_filename, 'w' ) | |
100 invalid_lines = 0 | |
101 for i, line in enumerate( file( mega_temp_output ) ): | |
102 line = line.rstrip( '\r\n' ) | |
103 fields = line.split() | |
104 try: | |
105 # convert the last column (bit-score as this is causing problem in filter tool) to float | |
106 fields[-1] = float( fields[-1] ) | |
107 new_line = "%s\t%0.1f" % ( '\t'.join( fields[:-1] ), fields[-1] ) | |
108 except: | |
109 new_line = line | |
110 invalid_lines += 1 | |
111 output.write( "%s\n" % new_line ) | |
112 output.close() | |
113 | |
114 if os.path.exists( mega_temp_output ): | |
115 os.unlink( mega_temp_output ) #remove the tempfile that we just reformatted the contents of | |
116 | |
117 if invalid_lines: | |
118 print "Unable to parse %d lines. Keep the default format." % invalid_lines | |
119 | |
120 # megablast generates a file called error.log, if empty, delete it, if not, show the contents | |
121 if os.path.exists( './error.log' ): | |
122 for i, line in enumerate( file( './error.log' ) ): | |
123 line = line.rstrip( '\r\n' ) | |
124 print line | |
125 os.remove( './error.log' ) | |
126 | |
127 if __name__ == "__main__" : __main__() |