Mercurial > repos > devteam > megablast_wrapper
comparison megablast_wrapper.py @ 0:dc7b4acb3fa6 draft
Imported from capsule None
| author | devteam |
|---|---|
| date | Mon, 19 May 2014 12:33:49 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dc7b4acb3fa6 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 run megablast for metagenomics data | |
| 4 | |
| 5 usage: %prog [options] | |
| 6 -d, --db_build=d: The database to use | |
| 7 -i, --input=i: Input FASTQ candidate file | |
| 8 -w, --word_size=w: Size of best perfect match | |
| 9 -c, --identity_cutoff=c: Report hits at or above this identity | |
| 10 -e, --eval_cutoff=e: Expectation value cutoff | |
| 11 -f, --filter_query=f: Filter out low complexity regions | |
| 12 -x, --index_dir=x: Data index directory | |
| 13 -o, --output=o: Output file | |
| 14 | |
| 15 usage: %prog db_build input_file word_size identity_cutoff eval_cutoff filter_query index_dir output_file | |
| 16 """ | |
| 17 | |
| 18 # This version (April 26, 2012) replaces megablast with blast+ blastn | |
| 19 # There is now no need to augment NCBI-formatted databases and these can be | |
| 20 # directly downloaded from NCBI ftp site | |
| 21 | |
| 22 import os, subprocess, sys, tempfile | |
| 23 from bx.cookbook import doc_optparse | |
| 24 | |
| 25 assert sys.version_info[:2] >= ( 2, 4 ) | |
| 26 | |
| 27 def stop_err( msg ): | |
| 28 sys.stderr.write( "%s\n" % msg ) | |
| 29 sys.exit() | |
| 30 | |
| 31 def __main__(): | |
| 32 #Parse Command Line | |
| 33 options, args = doc_optparse.parse( __doc__ ) | |
| 34 query_filename = options.input.strip() # -query | |
| 35 output_filename = options.output.strip() # -out | |
| 36 mega_word_size = options.word_size # -word_size | |
| 37 mega_iden_cutoff = options.identity_cutoff # -perc_identity | |
| 38 mega_evalue_cutoff = options.eval_cutoff # -evalue | |
| 39 mega_temp_output = tempfile.NamedTemporaryFile().name | |
| 40 GALAXY_DATA_INDEX_DIR = options.index_dir | |
| 41 DB_LOC = "%s/blastdb.loc" % GALAXY_DATA_INDEX_DIR | |
| 42 | |
| 43 # megablast parameters | |
| 44 try: | |
| 45 int( mega_word_size ) | |
| 46 except: | |
| 47 stop_err( 'Invalid value for word size' ) | |
| 48 try: | |
| 49 float( mega_iden_cutoff ) | |
| 50 except: | |
| 51 stop_err( 'Invalid value for identity cut-off' ) | |
| 52 try: | |
| 53 float( mega_evalue_cutoff ) | |
| 54 except: | |
| 55 stop_err( 'Invalid value for Expectation value' ) | |
| 56 | |
| 57 if not os.path.exists( os.path.split( options.db_build )[0] ): | |
| 58 stop_err( 'Cannot locate the target database directory. Please check your location file.' ) | |
| 59 | |
| 60 try: | |
| 61 threads = int( os.environ['GALAXY_SLOTS']) | |
| 62 except Exception: | |
| 63 threads = 8 | |
| 64 # arguments for megablast | |
| 65 megablast_command = "blastn -task megablast -db %s -query %s -out %s -outfmt '6 qseqid sgi slen ppos length mismatch gaps qstart qend sstart send evalue bitscore' -num_threads %d -word_size %s -perc_identity %s -evalue %s -dust %s > /dev/null" \ | |
| 66 % ( options.db_build, query_filename, mega_temp_output, threads, mega_word_size, mega_iden_cutoff, mega_evalue_cutoff, options.filter_query ) | |
| 67 | |
| 68 print megablast_command | |
| 69 | |
| 70 tmp = tempfile.NamedTemporaryFile().name | |
| 71 try: | |
| 72 tmp_stderr = open( tmp, 'wb' ) | |
| 73 proc = subprocess.Popen( args=megablast_command, shell=True, stderr=tmp_stderr.fileno() ) | |
| 74 returncode = proc.wait() | |
| 75 tmp_stderr.close() | |
| 76 # get stderr, allowing for case where it's very large | |
| 77 tmp_stderr = open( tmp, 'rb' ) | |
| 78 stderr = '' | |
| 79 buffsize = 1048576 | |
| 80 try: | |
| 81 while True: | |
| 82 stderr += tmp_stderr.read( buffsize ) | |
| 83 if not stderr or len( stderr ) % buffsize != 0: | |
| 84 break | |
| 85 except OverflowError: | |
| 86 pass | |
| 87 tmp_stderr.close() | |
| 88 if returncode != 0: | |
| 89 raise Exception, stderr | |
| 90 if os.path.exists( tmp ): | |
| 91 os.unlink( tmp ) | |
| 92 except Exception, e: | |
| 93 if os.path.exists( mega_temp_output ): | |
| 94 os.unlink( mega_temp_output ) | |
| 95 if os.path.exists( tmp ): | |
| 96 os.unlink( tmp ) | |
| 97 stop_err( 'Cannot execute megaablast. ' + str( e ) ) | |
| 98 | |
| 99 output = open( output_filename, 'w' ) | |
| 100 invalid_lines = 0 | |
| 101 for i, line in enumerate( file( mega_temp_output ) ): | |
| 102 line = line.rstrip( '\r\n' ) | |
| 103 fields = line.split() | |
| 104 try: | |
| 105 # convert the last column (bit-score as this is causing problem in filter tool) to float | |
| 106 fields[-1] = float( fields[-1] ) | |
| 107 new_line = "%s\t%0.1f" % ( '\t'.join( fields[:-1] ), fields[-1] ) | |
| 108 except: | |
| 109 new_line = line | |
| 110 invalid_lines += 1 | |
| 111 output.write( "%s\n" % new_line ) | |
| 112 output.close() | |
| 113 | |
| 114 if os.path.exists( mega_temp_output ): | |
| 115 os.unlink( mega_temp_output ) #remove the tempfile that we just reformatted the contents of | |
| 116 | |
| 117 if invalid_lines: | |
| 118 print "Unable to parse %d lines. Keep the default format." % invalid_lines | |
| 119 | |
| 120 # megablast generates a file called error.log, if empty, delete it, if not, show the contents | |
| 121 if os.path.exists( './error.log' ): | |
| 122 for i, line in enumerate( file( './error.log' ) ): | |
| 123 line = line.rstrip( '\r\n' ) | |
| 124 print line | |
| 125 os.remove( './error.log' ) | |
| 126 | |
| 127 if __name__ == "__main__" : __main__() |
