Mercurial > repos > xuebing > sharplabtool
diff tools/ngs_simulation/ngs_simulation.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ngs_simulation/ngs_simulation.py Fri Mar 09 19:37:19 2012 -0500 @@ -0,0 +1,280 @@ +#!/usr/bin/env python + +""" +Runs Ben's simulation. + +usage: %prog [options] + -i, --input=i: Input genome (FASTA format) + -g, --genome=g: If built-in, the genome being used + -l, --read_len=l: Read length + -c, --avg_coverage=c: Average coverage + -e, --error_rate=e: Error rate (0-1) + -n, --num_sims=n: Number of simulations to run + -p, --polymorphism=p: Frequency/ies for minor allele (comma-separate list of 0-1) + -d, --detection_thresh=d: Detection thresholds (comma-separate list of 0-1) + -p, --output_png=p: Plot output + -s, --summary_out=s: Whether or not to output a file with summary of all simulations + -m, --output_summary=m: File name for output summary of all simulations + -f, --new_file_path=f: Directory for summary output files + +""" +# removed output of all simulation results on request (not working) +# -r, --sim_results=r: Output all tabular simulation results (number of polymorphisms times number of detection thresholds) +# -o, --output=o: Base name for summary output for each run + +from rpy import * +import os +import random, sys, tempfile +from galaxy import eggs +import pkg_resources; pkg_resources.require( "bx-python" ) +from bx.cookbook import doc_optparse + +def stop_err( msg ): + sys.stderr.write( '%s\n' % msg ) + sys.exit() + +def __main__(): + #Parse Command Line + options, args = doc_optparse.parse( __doc__ ) + # validate parameters + error = '' + try: + read_len = int( options.read_len ) + if read_len <= 0: + raise Exception, ' greater than 0' + except TypeError, e: + error = ': %s' % str( e ) + if error: + stop_err( 'Make sure your number of reads is an integer value%s' % error ) + error = '' + try: + avg_coverage = int( options.avg_coverage ) + if avg_coverage <= 0: + raise Exception, ' greater than 0' + except Exception, e: + error = ': %s' % str( e ) + if error: + stop_err( 'Make sure your average coverage is an integer value%s' % error ) + error = '' + try: + error_rate = float( options.error_rate ) + if error_rate >= 1.0: + error_rate = 10 ** ( -error_rate / 10.0 ) + elif error_rate < 0: + raise Exception, ' between 0 and 1' + except Exception, e: + error = ': %s' % str( e ) + if error: + stop_err( 'Make sure the error rate is a decimal value%s or the quality score is at least 1' % error ) + try: + num_sims = int( options.num_sims ) + except TypeError, e: + stop_err( 'Make sure the number of simulations is an integer value: %s' % str( e ) ) + if len( options.polymorphism ) > 0: + polymorphisms = [ float( p ) for p in options.polymorphism.split( ',' ) ] + else: + stop_err( 'Select at least one polymorphism value to use' ) + if len( options.detection_thresh ) > 0: + detection_threshes = [ float( dt ) for dt in options.detection_thresh.split( ',' ) ] + else: + stop_err( 'Select at least one detection threshold to use' ) + + # mutation dictionaries + hp_dict = { 'A':'G', 'G':'A', 'C':'T', 'T':'C', 'N':'N' } # heteroplasmy dictionary + mt_dict = { 'A':'C', 'C':'A', 'G':'T', 'T':'G', 'N':'N'} # misread dictionary + + # read fasta file to seq string + all_lines = open( options.input, 'rb' ).readlines() + seq = '' + for line in all_lines: + line = line.rstrip() + if line.startswith('>'): + pass + else: + seq += line.upper() + seq_len = len( seq ) + + # output file name template +# removed output of all simulation results on request (not working) +# if options.sim_results == "true": +# out_name_template = os.path.join( options.new_file_path, 'primary_output%s_' + options.output + '_visible_tabular' ) +# else: +# out_name_template = tempfile.NamedTemporaryFile().name + '_%s' + out_name_template = tempfile.NamedTemporaryFile().name + '_%s' + print 'out_name_template:', out_name_template + + # set up output files + outputs = {} + i = 1 + for p in polymorphisms: + outputs[ p ] = {} + for d in detection_threshes: + outputs[ p ][ d ] = out_name_template % i + i += 1 + + # run sims + for polymorphism in polymorphisms: + for detection_thresh in detection_threshes: + output = open( outputs[ polymorphism ][ detection_thresh ], 'wb' ) + output.write( 'FP\tFN\tGENOMESIZE=%s\n' % seq_len ) + sim_count = 0 + while sim_count < num_sims: + # randomly pick heteroplasmic base index + hbase = random.choice( range( 0, seq_len ) ) + #hbase = seq_len/2#random.randrange( 0, seq_len ) + # create 2D quasispecies list + qspec = map( lambda x: [], [0] * seq_len ) + # simulate read indices and assign to quasispecies + i = 0 + while i < ( avg_coverage * ( seq_len / read_len ) ): # number of reads (approximates coverage) + start = random.choice( range( 0, seq_len ) ) + #start = seq_len/2#random.randrange( 0, seq_len ) # assign read start + if random.random() < 0.5: # positive sense read + end = start + read_len # assign read end + if end > seq_len: # overshooting origin + read = range( start, seq_len ) + range( 0, ( end - seq_len ) ) + else: # regular read + read = range( start, end ) + else: # negative sense read + end = start - read_len # assign read end + if end < -1: # overshooting origin + read = range( start, -1, -1) + range( ( seq_len - 1 ), ( seq_len + end ), -1 ) + else: # regular read + read = range( start, end, -1 ) + # assign read to quasispecies list by index + for j in read: + if j == hbase and random.random() < polymorphism: # heteroplasmic base is variant with p = het + ref = hp_dict[ seq[ j ] ] + else: # ref is the verbatim reference nucleotide (all positions) + ref = seq[ j ] + if random.random() < error_rate: # base in read is misread with p = err + qspec[ j ].append( mt_dict[ ref ] ) + else: # otherwise we carry ref through to the end + qspec[ j ].append(ref) + # last but not least + i += 1 + bases, fpos, fneg = {}, 0, 0 # last two will be outputted to summary file later + for i, nuc in enumerate( seq ): + cov = len( qspec[ i ] ) + bases[ 'A' ] = qspec[ i ].count( 'A' ) + bases[ 'C' ] = qspec[ i ].count( 'C' ) + bases[ 'G' ] = qspec[ i ].count( 'G' ) + bases[ 'T' ] = qspec[ i ].count( 'T' ) + # calculate max NON-REF deviation + del bases[ nuc ] + maxdev = float( max( bases.values() ) ) / cov + # deal with non-het sites + if i != hbase: + if maxdev >= detection_thresh: # greater than detection threshold = false positive + fpos += 1 + # deal with het sites + if i == hbase: + hnuc = hp_dict[ nuc ] # let's recover het variant + if ( float( bases[ hnuc ] ) / cov ) < detection_thresh: # less than detection threshold = false negative + fneg += 1 + del bases[ hnuc ] # ignore het variant + maxdev = float( max( bases.values() ) ) / cov # check other non-ref bases at het site + if maxdev >= detection_thresh: # greater than detection threshold = false positive (possible) + fpos += 1 + # output error sums and genome size to summary file + output.write( '%d\t%d\n' % ( fpos, fneg ) ) + sim_count += 1 + # close output up + output.close() + + # Parameters (heteroplasmy, error threshold, colours) + r( ''' + het=c(%s) + err=c(%s) + grade = (0:32)/32 + hues = rev(gray(grade)) + ''' % ( ','.join( [ str( p ) for p in polymorphisms ] ), ','.join( [ str( d ) for d in detection_threshes ] ) ) ) + + # Suppress warnings + r( 'options(warn=-1)' ) + + # Create allsum (for FP) and allneg (for FN) objects + r( 'allsum <- data.frame()' ) + for polymorphism in polymorphisms: + for detection_thresh in detection_threshes: + output = outputs[ polymorphism ][ detection_thresh ] + cmd = ''' + ngsum = read.delim('%s', header=T) + ngsum$fprate <- ngsum$FP/%s + ngsum$hetcol <- %s + ngsum$errcol <- %s + allsum <- rbind(allsum, ngsum) + ''' % ( output, seq_len, polymorphism, detection_thresh ) + r( cmd ) + + if os.path.getsize( output ) == 0: + for p in outputs.keys(): + for d in outputs[ p ].keys(): + sys.stderr.write(outputs[ p ][ d ] + ' '+str( os.path.getsize( outputs[ p ][ d ] ) )+'\n') + + if options.summary_out == "true": + r( 'write.table(summary(ngsum), file="%s", quote=FALSE, sep="\t", row.names=FALSE)' % options.output_summary ) + + # Summary objects (these could be printed) + r( ''' + tr_pos <- tapply(allsum$fprate,list(allsum$hetcol,allsum$errcol), mean) + tr_neg <- tapply(allsum$FN,list(allsum$hetcol,allsum$errcol), mean) + cat('\nFalse Positive Rate Summary\n\t', file='%s', append=T, sep='\t') + write.table(format(tr_pos, digits=4), file='%s', append=T, quote=F, sep='\t') + cat('\nFalse Negative Rate Summary\n\t', file='%s', append=T, sep='\t') + write.table(format(tr_neg, digits=4), file='%s', append=T, quote=F, sep='\t') + ''' % tuple( [ options.output_summary ] * 4 ) ) + + # Setup graphs + #pdf(paste(prefix,'_jointgraph.pdf',sep=''), 15, 10) + r( ''' + png('%s', width=800, height=500, units='px', res=250) + layout(matrix(data=c(1,2,1,3,1,4), nrow=2, ncol=3), widths=c(4,6,2), heights=c(1,10,10)) + ''' % options.output_png ) + + # Main title + genome = '' + if options.genome: + genome = '%s: ' % options.genome + r( ''' + par(mar=c(0,0,0,0)) + plot(1, type='n', axes=F, xlab='', ylab='') + text(1,1,paste('%sVariation in False Positives and Negatives (', %s, ' simulations, coverage ', %s,')', sep=''), font=2, family='sans', cex=0.7) + ''' % ( genome, options.num_sims, options.avg_coverage ) ) + + # False positive boxplot + r( ''' + par(mar=c(5,4,2,2), las=1, cex=0.35) + boxplot(allsum$fprate ~ allsum$errcol, horizontal=T, ylim=rev(range(allsum$fprate)), cex.axis=0.85) + title(main='False Positives', xlab='false positive rate', ylab='') + ''' ) + + # False negative heatmap (note zlim command!) + num_polys = len( polymorphisms ) + num_dets = len( detection_threshes ) + r( ''' + par(mar=c(5,4,2,1), las=1, cex=0.35) + image(1:%s, 1:%s, tr_neg, zlim=c(0,1), col=hues, xlab='', ylab='', axes=F, border=1) + axis(1, at=1:%s, labels=rownames(tr_neg), lwd=1, cex.axis=0.85, axs='i') + axis(2, at=1:%s, labels=colnames(tr_neg), lwd=1, cex.axis=0.85) + title(main='False Negatives', xlab='minor allele frequency', ylab='detection threshold') + ''' % ( num_polys, num_dets, num_polys, num_dets ) ) + + # Scale alongside + r( ''' + par(mar=c(2,2,2,3), las=1) + image(1, grade, matrix(grade, ncol=length(grade), nrow=1), col=hues, xlab='', ylab='', xaxt='n', las=1, cex.axis=0.85) + title(main='Key', cex=0.35) + mtext('false negative rate', side=1, cex=0.35) + ''' ) + + # Close graphics + r( ''' + layout(1) + dev.off() + ''' ) + + # Tidy up +# r( 'rm(folder,prefix,sim,cov,het,err,grade,hues,i,j,ngsum)' ) + +if __name__ == "__main__" : __main__()