| 0 | 1 #!/usr/bin/env python | 
|  | 2 | 
|  | 3 """ | 
|  | 4 Runs DWGSIM | 
|  | 5 | 
|  | 6 usage: dwgsim_wrapper.py [options] | 
|  | 7     -e,--errorOne=e: base/color error rate of the first read [0.020] | 
|  | 8     -E,--errorTwo=E: base/color error rate of the second read [0.020] | 
|  | 9     -d,--innertDist=d: inner distance between the two ends [500] | 
|  | 10     -s,--stdev=s: standard deviation [50] | 
|  | 11     -N,--numReads=N: number of read pairs [1000000] | 
|  | 12     -1,--lengthOne=1: length of the first read [70] | 
|  | 13     -2,--lengthTwo=2: length of the second read [70] | 
|  | 14     -r,--mutRate=r: rate of mutations [0.0010] | 
|  | 15     -R,--fracIndels=R: fraction of indels [0.10] | 
|  | 16     -X,--indelExt=X: probability an indel is extended [0.30] | 
|  | 17     -y,--randProb=y: probability of a random DNA read [0.10] | 
|  | 18     -n,--maxN=n: maximum number of Ns allowed in a given read [0] | 
|  | 19     -c,--colorSpace=c: generate reads in color space (SOLiD reads) | 
|  | 20     -S,--strand=S: strand 0: default, 1: same strand, 2: opposite strand | 
|  | 21     -H,--haploid=H: haploid mode | 
|  | 22     -f,--fasta=f: the reference genome FASTA | 
|  | 23     -3,--outputBFAST=3: the BFAST output FASTQ | 
|  | 24     -4,--outputBWA1=4: the first BWA output FASTQ | 
|  | 25     -5,--outputBWA2=5: the second BWA output FASTQ | 
|  | 26     -6,--outputMutations=6: the output mutations TXT | 
|  | 27 """ | 
|  | 28 | 
|  | 29 import optparse, os, shutil, subprocess, sys, tempfile | 
|  | 30 | 
|  | 31 def stop_err( msg ): | 
|  | 32     sys.stderr.write( '%s\n' % msg ) | 
|  | 33     sys.exit() | 
|  | 34 | 
|  | 35 def run_process ( cmd, name, tmp_dir, buffsize ): | 
|  | 36     try: | 
|  | 37         tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name | 
|  | 38         tmp_stderr = open( tmp, 'wb' ) | 
|  | 39         proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) | 
|  | 40         returncode = proc.wait() | 
|  | 41         tmp_stderr.close() | 
|  | 42         # get stderr, allowing for case where it's very large | 
|  | 43         tmp_stderr = open( tmp, 'rb' ) | 
|  | 44         stderr = '' | 
|  | 45         try: | 
|  | 46             while True: | 
|  | 47                 stderr += tmp_stderr.read( buffsize ) | 
|  | 48                 if not stderr or len( stderr ) % buffsize != 0: | 
|  | 49                     break | 
|  | 50         except OverflowError: | 
|  | 51             pass | 
|  | 52         tmp_stderr.close() | 
|  | 53         if returncode != 0: | 
|  | 54             raise Exception, stderr | 
|  | 55     except Exception, e: | 
|  | 56         raise Exception, 'Error in \'' + name + '\'. \n' + str( e ) | 
|  | 57 | 
|  | 58 def check_output ( output, canBeEmpty ): | 
|  | 59     if 0 < os.path.getsize( output ): | 
|  | 60         return True | 
|  | 61     elif False == canBeEmpty: | 
|  | 62         raise Exception, 'The output file is empty:' + output | 
|  | 63 | 
|  | 64 def __main__(): | 
|  | 65     #Parse Command Line | 
|  | 66     parser = optparse.OptionParser() | 
|  | 67     parser.add_option( '-e', '--errorOne', dest='errorOne', type='float', help='base/color error rate of the first read' ) | 
|  | 68     parser.add_option( '-E', '--errorTwo', dest='errorTwo', type='float', help='base/color error rate of the second read' ) | 
|  | 69     parser.add_option( '-d', '--innertDist', dest='innertDist', type='int', help='inner distance between the two ends' ) | 
|  | 70     parser.add_option( '-s', '--stdev', dest='stdev', type='float', help='standard deviation' ) | 
|  | 71     parser.add_option( '-N', '--numReads', dest='numReads', type='int', help='number of read pairs' ) | 
|  | 72     parser.add_option( '-1', '--lengthOne', dest='lengthOne', type='int', help='length of the first read' ) | 
|  | 73     parser.add_option( '-2', '--lengthTwo', dest='lengthTwo', type='int', help='length of the second read' ) | 
|  | 74     parser.add_option( '-r', '--mutRate', dest='mutRate', type='float', help='rate of mutations' ) | 
|  | 75     parser.add_option( '-R', '--fracIndels', dest='fracIndels', type='float', help='fraction of indels' ) | 
|  | 76     parser.add_option( '-X', '--indelExt', dest='indelExt', type='float', help='probability an indel is extended' ) | 
|  | 77     parser.add_option( '-y', '--randProb', dest='randProb', type='float', help='probability of a random DNA read' ) | 
|  | 78     parser.add_option( '-n', '--maxN', dest='maxN', type='int', help='maximum number of Ns allowed in a given read' ) | 
|  | 79     parser.add_option( '-c', '--colorSpace', action='store_true', dest='colorSpace', default=False, help='generate reads in color space (SOLiD reads)' ) | 
|  | 80     parser.add_option( '-S', '--strand', dest='strand', type='choice', default='0', choices=('0', '1', '2'), help='strand 0: default, 1: same strand, 2: opposite strand' ) | 
|  | 81     parser.add_option( '-H', '--haploid', action='store_true', dest='haploid', default=False, help='haploid mode' ) | 
|  | 82     parser.add_option( '-f', '--fasta', dest='fasta', help='The reference genome fasta' ) | 
|  | 83     parser.add_option( '-3', '--outputBFAST', dest='outputBFAST', help='the BFAST output FASTQ' ) | 
|  | 84     parser.add_option( '-4', '--outputBWA1', dest='outputBWA1', help='the first BWA output FASTQ' ) | 
|  | 85     parser.add_option( '-5', '--outputBWA2', dest='outputBWA2', help='the second BWA output FASTQ' ) | 
|  | 86     parser.add_option( '-6', '--outputMutations', dest='outputMutations', help='the output mutations TXT' ) | 
|  | 87 | 
|  | 88     (options, args) = parser.parse_args() | 
|  | 89 | 
|  | 90     # output version # of tool | 
|  | 91     try: | 
|  | 92         tmp = tempfile.NamedTemporaryFile().name | 
|  | 93         tmp_stdout = open( tmp, 'wb' ) | 
|  | 94         proc = subprocess.Popen( args='dwgsim 2>&1', shell=True, stdout=tmp_stdout ) | 
|  | 95         tmp_stdout.close() | 
|  | 96         returncode = proc.wait() | 
|  | 97         stdout = None | 
|  | 98         for line in open( tmp_stdout.name, 'rb' ): | 
|  | 99             if line.lower().find( 'version' ) >= 0: | 
|  | 100                 stdout = line.strip() | 
|  | 101                 break | 
|  | 102         if stdout: | 
|  | 103             sys.stdout.write( '%s\n' % stdout ) | 
|  | 104         else: | 
|  | 105             raise Exception | 
|  | 106     except: | 
|  | 107         sys.stdout.write( 'Could not determine DWGSIM version\n' ) | 
|  | 108 | 
|  | 109     buffsize = 1048576 | 
|  | 110 | 
|  | 111     # make temp directory for dwgsim, requires trailing slash | 
|  | 112     tmp_dir = '%s/' % tempfile.mkdtemp() | 
|  | 113 | 
|  | 114     #'generic' options used in all dwgsim commands here | 
|  | 115 | 
|  | 116     try: | 
|  | 117         reference_filepath = tempfile.NamedTemporaryFile( dir=tmp_dir, suffix='.fa' ).name | 
|  | 118         os.symlink( options.fasta, reference_filepath ) | 
|  | 119         assert reference_filepath and os.path.exists( reference_filepath ), 'A valid genome reference was not provided.' | 
|  | 120         tmp_dir = '%s/' % tempfile.mkdtemp() | 
|  | 121         dwgsim_output_prefix = "dwgsim_output_prefix" | 
|  | 122         dwgsim_cmd = 'dwgsim -e %s -E %s -d %s -s %s -N %s -1 %s -2 %s -r %s -R %s -X %s -y %s -n %s' % \ | 
|  | 123                 (options.errorOne, \ | 
|  | 124                 options.errorTwo, \ | 
|  | 125                 options.innertDist, \ | 
|  | 126                 options.stdev, \ | 
|  | 127                 options.numReads, \ | 
|  | 128                 options.lengthOne, \ | 
|  | 129                 options.lengthTwo, \ | 
|  | 130                 options.mutRate, \ | 
|  | 131                 options.fracIndels, \ | 
|  | 132                 options.indelExt, \ | 
|  | 133                 options.randProb, \ | 
|  | 134                 options.maxN) | 
|  | 135         if options.colorSpace: | 
|  | 136             dwgsim_cmd = dwgsim_cmd + ' -c' | 
|  | 137         if options.haploid: | 
|  | 138             dwgsim_cmd = dwgsim_cmd + ' -H' | 
|  | 139         dwgsim_cmd = dwgsim_cmd + ' ' + options.fasta | 
|  | 140         dwgsim_cmd = dwgsim_cmd + ' ' + tmp_dir + '/' + dwgsim_output_prefix | 
|  | 141 | 
|  | 142         # need to nest try-except in try-finally to handle 2.4 | 
|  | 143         try: | 
|  | 144             # dwgsim | 
|  | 145             run_process ( dwgsim_cmd, 'dwgsim', tmp_dir, buffsize ) | 
|  | 146             # Move files | 
|  | 147             cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.mutations.txt' + ' ' + options.outputMutations | 
|  | 148             run_process ( cmd, 'mv #1', tmp_dir, buffsize ) | 
|  | 149             cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.bfast.fastq' + ' ' + options.outputBFAST | 
|  | 150             run_process ( cmd, 'mv #2', tmp_dir, buffsize ) | 
|  | 151             cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.bwa.read1.fastq' + ' ' + options.outputBWA1 | 
|  | 152             run_process ( cmd, 'mv #3', tmp_dir, buffsize ) | 
|  | 153             cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.bwa.read2.fastq' + ' ' + options.outputBWA2 | 
|  | 154             run_process ( cmd, 'mv #4', tmp_dir, buffsize ) | 
|  | 155             # check that there are results in the output file | 
|  | 156             check_output ( options.outputMutations, True ) | 
|  | 157             check_output ( options.outputBFAST, False ) | 
|  | 158             check_output ( options.outputBWA1, False ) | 
|  | 159             check_output ( options.outputBWA2, False ) | 
|  | 160             sys.stdout.write( 'DWGSIM successful' ) | 
|  | 161         except Exception, e: | 
|  | 162             stop_err( 'DWGSIM failed.\n' + str( e ) ) | 
|  | 163     finally: | 
|  | 164         # clean up temp dir | 
|  | 165         if os.path.exists( tmp_dir ): | 
|  | 166             shutil.rmtree( tmp_dir ) | 
|  | 167 | 
|  | 168 if __name__=="__main__": __main__() |