Next changeset 1:512671604bab (2011-08-14) |
Commit message:
Uploaded |
added:
dwgsim_wrapper.py |
b |
diff -r 000000000000 -r 1f7032731f66 dwgsim_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dwgsim_wrapper.py Sun Aug 14 20:07:45 2011 -0400 |
[ |
b'@@ -0,0 +1,168 @@\n+#!/usr/bin/env python\n+\n+"""\n+Runs DWGSIM\n+\n+usage: dwgsim_wrapper.py [options]\n+ -e,--errorOne=e: base/color error rate of the first read [0.020]\n+ -E,--errorTwo=E: base/color error rate of the second read [0.020]\n+ -d,--innertDist=d: inner distance between the two ends [500]\n+ -s,--stdev=s: standard deviation [50]\n+ -N,--numReads=N: number of read pairs [1000000]\n+ -1,--lengthOne=1: length of the first read [70]\n+ -2,--lengthTwo=2: length of the second read [70]\n+ -r,--mutRate=r: rate of mutations [0.0010]\n+ -R,--fracIndels=R: fraction of indels [0.10]\n+ -X,--indelExt=X: probability an indel is extended [0.30]\n+ -y,--randProb=y: probability of a random DNA read [0.10]\n+ -n,--maxN=n: maximum number of Ns allowed in a given read [0]\n+ -c,--colorSpace=c: generate reads in color space (SOLiD reads)\n+ -S,--strand=S: strand 0: default, 1: same strand, 2: opposite strand\n+ -H,--haploid=H: haploid mode\n+ -f,--fasta=f: the reference genome FASTA\n+ -3,--outputBFAST=3: the BFAST output FASTQ\n+ -4,--outputBWA1=4: the first BWA output FASTQ\n+ -5,--outputBWA2=5: the second BWA output FASTQ\n+ -6,--outputMutations=6: the output mutations TXT\n+"""\n+\n+import optparse, os, shutil, subprocess, sys, tempfile\n+\n+def stop_err( msg ):\n+ sys.stderr.write( \'%s\\n\' % msg )\n+ sys.exit()\n+\n+def run_process ( cmd, name, tmp_dir, buffsize ):\n+ try:\n+ tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name\n+ tmp_stderr = open( tmp, \'wb\' )\n+ proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )\n+ returncode = proc.wait()\n+ tmp_stderr.close()\n+ # get stderr, allowing for case where it\'s very large\n+ tmp_stderr = open( tmp, \'rb\' )\n+ stderr = \'\'\n+ try:\n+ while True:\n+ stderr += tmp_stderr.read( buffsize )\n+ if not stderr or len( stderr ) % buffsize != 0:\n+ break\n+ except OverflowError:\n+ pass\n+ tmp_stderr.close()\n+ if returncode != 0:\n+ raise Exception, stderr\n+ except Exception, e:\n+ raise Exception, \'Error in \\\'\' + name + \'\\\'. \\n\' + str( e )\n+\n+def check_output ( output, canBeEmpty ):\n+ if 0 < os.path.getsize( output ):\n+ return True\n+ elif False == canBeEmpty:\n+ raise Exception, \'The output file is empty:\' + output\n+\n+def __main__():\n+ #Parse Command Line\n+ parser = optparse.OptionParser()\n+ parser.add_option( \'-e\', \'--errorOne\', dest=\'errorOne\', type=\'float\', help=\'base/color error rate of the first read\' )\n+ parser.add_option( \'-E\', \'--errorTwo\', dest=\'errorTwo\', type=\'float\', help=\'base/color error rate of the second read\' )\n+ parser.add_option( \'-d\', \'--innertDist\', dest=\'innertDist\', type=\'int\', help=\'inner distance between the two ends\' )\n+ parser.add_option( \'-s\', \'--stdev\', dest=\'stdev\', type=\'float\', help=\'standard deviation\' )\n+ parser.add_option( \'-N\', \'--numReads\', dest=\'numReads\', type=\'int\', help=\'number of read pairs\' )\n+ parser.add_option( \'-1\', \'--lengthOne\', dest=\'lengthOne\', type=\'int\', help=\'length of the first read\' )\n+ parser.add_option( \'-2\', \'--lengthTwo\', dest=\'lengthTwo\', type=\'int\', help=\'length of the second read\' )\n+ parser.add_option( \'-r\', \'--mutRate\', dest=\'mutRate\', type=\'float\', help=\'rate of mutations\' )\n+ parser.add_option( \'-R\', \'--fracIndels\', dest=\'fracIndels\', type=\'float\', help=\'fraction of indels\' )\n+ parser.add_option( \'-X\', \'--indelExt\', dest=\'indelExt\', type=\'float\', help=\'probability an indel is extended\' )\n+ parser.add_option( \'-y\', \'--randProb\', dest=\'randProb\', type=\'float\', help=\'probability of a random DNA read\' )\n+ parser.add_option( \'-n\', \'--maxN\', dest=\'maxN\', type=\'int\', help=\'maximum number of Ns allowed in a given read\' )\n+ parser.add_option( \'-c\', \'--colorSpace\', action=\'store_true\', dest=\'colorSpace\', default=False, help=\'generate reads in colo'..b'dd_option( \'-3\', \'--outputBFAST\', dest=\'outputBFAST\', help=\'the BFAST output FASTQ\' )\n+ parser.add_option( \'-4\', \'--outputBWA1\', dest=\'outputBWA1\', help=\'the first BWA output FASTQ\' )\n+ parser.add_option( \'-5\', \'--outputBWA2\', dest=\'outputBWA2\', help=\'the second BWA output FASTQ\' )\n+ parser.add_option( \'-6\', \'--outputMutations\', dest=\'outputMutations\', help=\'the output mutations TXT\' )\n+ \n+ (options, args) = parser.parse_args()\n+\n+ # output version # of tool\n+ try:\n+ tmp = tempfile.NamedTemporaryFile().name\n+ tmp_stdout = open( tmp, \'wb\' )\n+ proc = subprocess.Popen( args=\'dwgsim 2>&1\', shell=True, stdout=tmp_stdout )\n+ tmp_stdout.close()\n+ returncode = proc.wait()\n+ stdout = None\n+ for line in open( tmp_stdout.name, \'rb\' ):\n+ if line.lower().find( \'version\' ) >= 0:\n+ stdout = line.strip()\n+ break\n+ if stdout:\n+ sys.stdout.write( \'%s\\n\' % stdout )\n+ else:\n+ raise Exception\n+ except:\n+ sys.stdout.write( \'Could not determine DWGSIM version\\n\' )\n+\n+ buffsize = 1048576\n+\n+ # make temp directory for dwgsim, requires trailing slash\n+ tmp_dir = \'%s/\' % tempfile.mkdtemp()\n+\n+ #\'generic\' options used in all dwgsim commands here\n+\n+ try:\n+ reference_filepath = tempfile.NamedTemporaryFile( dir=tmp_dir, suffix=\'.fa\' ).name\n+ os.symlink( options.fasta, reference_filepath )\n+ assert reference_filepath and os.path.exists( reference_filepath ), \'A valid genome reference was not provided.\'\n+ tmp_dir = \'%s/\' % tempfile.mkdtemp()\n+ dwgsim_output_prefix = "dwgsim_output_prefix"\n+ dwgsim_cmd = \'dwgsim -e %s -E %s -d %s -s %s -N %s -1 %s -2 %s -r %s -R %s -X %s -y %s -n %s\' % \\\n+ (options.errorOne, \\\n+ options.errorTwo, \\\n+ options.innertDist, \\\n+ options.stdev, \\\n+ options.numReads, \\\n+ options.lengthOne, \\\n+ options.lengthTwo, \\\n+ options.mutRate, \\\n+ options.fracIndels, \\\n+ options.indelExt, \\\n+ options.randProb, \\\n+ options.maxN)\n+ if options.colorSpace:\n+ dwgsim_cmd = dwgsim_cmd + \' -c\'\n+ if options.haploid:\n+ dwgsim_cmd = dwgsim_cmd + \' -H\'\n+ dwgsim_cmd = dwgsim_cmd + \' \' + options.fasta\n+ dwgsim_cmd = dwgsim_cmd + \' \' + tmp_dir + \'/\' + dwgsim_output_prefix\n+\n+ # need to nest try-except in try-finally to handle 2.4\n+ try:\n+ # dwgsim\n+ run_process ( dwgsim_cmd, \'dwgsim\', tmp_dir, buffsize )\n+ # Move files\n+ cmd = \'mv \' + tmp_dir + \'/\' + dwgsim_output_prefix + \'.mutations.txt\' + \' \' + options.outputMutations\n+ run_process ( cmd, \'mv #1\', tmp_dir, buffsize )\n+ cmd = \'mv \' + tmp_dir + \'/\' + dwgsim_output_prefix + \'.bfast.fastq\' + \' \' + options.outputBFAST\n+ run_process ( cmd, \'mv #2\', tmp_dir, buffsize )\n+ cmd = \'mv \' + tmp_dir + \'/\' + dwgsim_output_prefix + \'.bwa.read1.fastq\' + \' \' + options.outputBWA1\n+ run_process ( cmd, \'mv #3\', tmp_dir, buffsize )\n+ cmd = \'mv \' + tmp_dir + \'/\' + dwgsim_output_prefix + \'.bwa.read2.fastq\' + \' \' + options.outputBWA2\n+ run_process ( cmd, \'mv #4\', tmp_dir, buffsize )\n+ # check that there are results in the output file\n+ check_output ( options.outputMutations, True )\n+ check_output ( options.outputBFAST, False )\n+ check_output ( options.outputBWA1, False )\n+ check_output ( options.outputBWA2, False )\n+ sys.stdout.write( \'DWGSIM successful\' )\n+ except Exception, e:\n+ stop_err( \'DWGSIM failed.\\n\' + str( e ) )\n+ finally:\n+ # clean up temp dir\n+ if os.path.exists( tmp_dir ):\n+ shutil.rmtree( tmp_dir )\n+\n+if __name__=="__main__": __main__()\n' |