Mercurial > repos > bgruening > bismark
comparison bismark_wrapper.py @ 0:62c6da72dd4a draft
Uploaded
| author | bgruening |
|---|---|
| date | Sat, 06 Jul 2013 09:57:36 -0400 |
| parents | |
| children | 82814a8a2395 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:62c6da72dd4a |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import argparse | |
| 4 import os | |
| 5 import shutil | |
| 6 import subprocess | |
| 7 import sys | |
| 8 import shlex | |
| 9 import tempfile | |
| 10 import fileinput | |
| 11 import fileinput | |
| 12 from glob import glob | |
| 13 | |
| 14 def stop_err( msg ): | |
| 15 sys.stderr.write( "%s\n" % msg ) | |
| 16 sys.exit() | |
| 17 | |
| 18 def __main__(): | |
| 19 | |
| 20 print 'tempfile_location',tempfile.gettempdir() | |
| 21 #Parse Command Line | |
| 22 parser = argparse.ArgumentParser(description='Wrapper for the bismark bisulfite mapper.') | |
| 23 parser.add_argument( '-p', '--num-threads', dest='num_threads', | |
| 24 type=int, default=4, help='Use this many threads to align reads. The default is 4.' ) | |
| 25 | |
| 26 parser.add_argument( '--bismark_path', dest='bismark_path', help='Path to the bismark perl scripts' ) | |
| 27 | |
| 28 parser.add_argument( '--bowtie2', action='store_true', default=False, help='Running bismark with bowtie2 and not with bowtie.' ) | |
| 29 | |
| 30 # input options | |
| 31 parser.add_argument( '--own-file', dest='own_file', help='' ) | |
| 32 parser.add_argument( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' ) | |
| 33 parser.add_argument( '-O', '--output', dest='output' ) | |
| 34 | |
| 35 | |
| 36 parser.add_argument( '--output-report-file', dest='output_report_file' ) | |
| 37 parser.add_argument( '--suppress-header', dest='suppress_header', action="store_true" ) | |
| 38 | |
| 39 parser.add_argument( '--mate-paired', dest='mate_paired', action='store_true', help='Reads are mate-paired', default=False) | |
| 40 | |
| 41 | |
| 42 parser.add_argument( '-1', '--mate1', dest='mate1', | |
| 43 help='The forward reads file in Sanger FASTQ or FASTA format.' ) | |
| 44 parser.add_argument( '-2', '--mate2', dest='mate2', | |
| 45 help='The reverse reads file in Sanger FASTQ or FASTA format.' ) | |
| 46 | |
| 47 parser.add_argument( '--output-unmapped-reads', dest='output_unmapped_reads', | |
| 48 help='Additional output file with unmapped reads (single-end).' ) | |
| 49 parser.add_argument( '--output-unmapped-reads-l', dest='output_unmapped_reads_l', | |
| 50 help='File name for unmapped reads (left, paired-end).' ) | |
| 51 parser.add_argument( '--output-unmapped-reads-r', dest='output_unmapped_reads_r', | |
| 52 help='File name for unmapped reads (right, paired-end).' ) | |
| 53 | |
| 54 | |
| 55 parser.add_argument( '--output-suppressed-reads', dest='output_suppressed_reads', | |
| 56 help='Additional output file with suppressed reads (single-end).' ) | |
| 57 parser.add_argument( '--output-suppressed-reads-l', dest='output_suppressed_reads_l', | |
| 58 help='File name for suppressed reads (left, paired-end).' ) | |
| 59 parser.add_argument( '--output-suppressed-reads-r', dest='output_suppressed_reads_r', | |
| 60 help='File name for suppressed reads (right, paired-end).' ) | |
| 61 parser.add_argument( '--stdout', dest='output_stdout', | |
| 62 help='File name for the standard output of bismark.' ) | |
| 63 | |
| 64 | |
| 65 parser.add_argument( '--single-paired', dest='single_paired', | |
| 66 help='The single-end reads file in Sanger FASTQ or FASTA format.' ) | |
| 67 | |
| 68 parser.add_argument( '--fastq', action='store_true', help='Query filetype is in FASTQ format') | |
| 69 parser.add_argument( '--fasta', action='store_true', help='Query filetype is in FASTA format') | |
| 70 parser.add_argument( '--phred64-quals', dest='phred64', action="store_true" ) | |
| 71 | |
| 72 | |
| 73 parser.add_argument( '--skip-reads', dest='skip_reads', type=int ) | |
| 74 parser.add_argument( '--qupto', type=int) | |
| 75 | |
| 76 | |
| 77 # paired end options | |
| 78 parser.add_argument( '-I', '--minins', dest='min_insert' ) | |
| 79 parser.add_argument( '-X', '--maxins', dest='max_insert' ) | |
| 80 parser.add_argument( '--no-mixed', dest='no_mixed', action="store_true" ) | |
| 81 parser.add_argument( '--no-discordant', dest='no_discordant', action="store_true" ) | |
| 82 | |
| 83 #parse general options | |
| 84 # default 20 | |
| 85 parser.add_argument( '--seed-len', dest='seed_len', type=int) | |
| 86 # default 15 | |
| 87 parser.add_argument( '--seed-extention-attempts', dest='seed_extention_attempts', type=int ) | |
| 88 # default 0 | |
| 89 parser.add_argument( '--seed-mismatches', dest='seed_mismatches', type=int ) | |
| 90 # default 2 | |
| 91 parser.add_argument( '--max-reseed', dest='max_reseed', type=int ) | |
| 92 """ | |
| 93 # default 70 | |
| 94 parser.add_argument( '--maqerr', dest='maqerr', type=int ) | |
| 95 """ | |
| 96 | |
| 97 """ | |
| 98 The number of megabytes of memory a given thread is given to store path | |
| 99 descriptors in --best mode. Best-first search must keep track of many paths | |
| 100 at once to ensure it is always extending the path with the lowest cumulative | |
| 101 cost. Bowtie tries to minimize the memory impact of the descriptors, but | |
| 102 they can still grow very large in some cases. If you receive an error message | |
| 103 saying that chunk memory has been exhausted in --best mode, try adjusting | |
| 104 this parameter up to dedicate more memory to the descriptors. Default: 512. | |
| 105 """ | |
| 106 parser.add_argument( '--chunkmbs', type=int, default=512 ) | |
| 107 | |
| 108 args = parser.parse_args() | |
| 109 | |
| 110 # Create bismark index if necessary. | |
| 111 index_dir = "" | |
| 112 if args.own_file: | |
| 113 """ | |
| 114 Create a temporary index with the offered files from the user. | |
| 115 Utilizing the script: bismark_genome_preparation | |
| 116 bismark_genome_preparation --bowtie2 hg19/ | |
| 117 """ | |
| 118 tmp_index_dir = tempfile.mkdtemp() | |
| 119 index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( args.own_file )[1].split( '.' )[:-1] ) ) | |
| 120 try: | |
| 121 """ | |
| 122 Create a hard link pointing to args.own_file named 'index_path'.fa. | |
| 123 """ | |
| 124 os.symlink( args.own_file, index_path + '.fa' ) | |
| 125 except Exception, e: | |
| 126 if os.path.exists( tmp_index_dir ): | |
| 127 shutil.rmtree( tmp_index_dir ) | |
| 128 stop_err( 'Error in linking the reference database.\n' + str( e ) ) | |
| 129 # bismark_genome_preparation needs the complete path to the folder in which the database is stored | |
| 130 if args.bowtie2: | |
| 131 cmd_index = 'bismark_genome_preparation --bowtie2 %s ' % ( tmp_index_dir ) | |
| 132 else: | |
| 133 cmd_index = 'bismark_genome_preparation %s ' % ( tmp_index_dir ) | |
| 134 if args.bismark_path: | |
| 135 if os.path.exists(args.bismark_path): | |
| 136 # add the path to the bismark perl scripts, that is needed for galaxy | |
| 137 cmd_index = os.path.join(args.bismark_path, cmd_index) | |
| 138 else: | |
| 139 # assume the same directory as that script | |
| 140 cmd_index = 'perl %s' % os.path.join(os.path.realpath(os.path.dirname(__file__)), cmd_index) | |
| 141 try: | |
| 142 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name | |
| 143 tmp_stderr = open( tmp, 'wb' ) | |
| 144 proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stdout=open(os.devnull, 'wb'), stderr=tmp_stderr.fileno() ) | |
| 145 returncode = proc.wait() | |
| 146 tmp_stderr.close() | |
| 147 # get stderr, allowing for case where it's very large | |
| 148 tmp_stderr = open( tmp, 'rb' ) | |
| 149 stderr = '' | |
| 150 buffsize = 1048576 | |
| 151 try: | |
| 152 while True: | |
| 153 stderr += tmp_stderr.read( buffsize ) | |
| 154 if not stderr or len( stderr ) % buffsize != 0: | |
| 155 break | |
| 156 except OverflowError: | |
| 157 pass | |
| 158 tmp_stderr.close() | |
| 159 if returncode != 0: | |
| 160 raise Exception, stderr | |
| 161 except Exception, e: | |
| 162 if os.path.exists( tmp_index_dir ): | |
| 163 shutil.rmtree( tmp_index_dir ) | |
| 164 stop_err( 'Error indexing reference sequence\n' + str( e ) ) | |
| 165 index_dir = tmp_index_dir | |
| 166 else: | |
| 167 # bowtie path is the path to the index directory and the first path of the index file name | |
| 168 index_dir = os.path.dirname( args.index_path ) | |
| 169 | |
| 170 # Build bismark command | |
| 171 | |
| 172 """ | |
| 173 Bismark requires a large amount of temporary disc space. If that is not available, for example on a cluster you can hardcode the | |
| 174 TMP to some larger space. It's not recommended but it works. | |
| 175 """ | |
| 176 #tmp_bismark_dir = tempfile.mkdtemp( dir='/data/0/galaxy_db/tmp/' ) | |
| 177 tmp_bismark_dir = tempfile.mkdtemp() | |
| 178 output_dir = os.path.join( tmp_bismark_dir, 'results') | |
| 179 cmd = 'bismark %(args)s --bam --temp_dir %(tmp_bismark_dir)s -o %(output_dir)s --quiet %(genome_folder)s %(reads)s' | |
| 180 | |
| 181 if args.fasta: | |
| 182 # he query input files (specified as mate1,mate2 or singles) are FastA | |
| 183 cmd = '%s %s' % (cmd, '--fasta') | |
| 184 elif args.fastq: | |
| 185 cmd = '%s %s' % (cmd, '--fastq') | |
| 186 | |
| 187 if args.bismark_path: | |
| 188 # add the path to the bismark perl scripts, that is needed for galaxy | |
| 189 if os.path.exists(args.bismark_path): | |
| 190 cmd = os.path.join(args.bismark_path, cmd) | |
| 191 else: | |
| 192 # assume the same directory as that script | |
| 193 cmd = 'perl %s' % os.path.join(os.path.realpath(os.path.dirname(__file__)), cmd) | |
| 194 | |
| 195 arguments = { | |
| 196 'genome_folder': index_dir, | |
| 197 'args': '', | |
| 198 'tmp_bismark_dir': tmp_bismark_dir, | |
| 199 'output_dir': output_dir, | |
| 200 } | |
| 201 | |
| 202 additional_opts = '' | |
| 203 # Set up the reads | |
| 204 if args.mate_paired: | |
| 205 # paired-end reads library | |
| 206 reads = '-1 %s ' % ( args.mate1 ) | |
| 207 reads += ' -2 %s ' % ( args.mate2 ) | |
| 208 additional_opts += ' -I %s -X %s ' % (args.min_insert, args.max_insert) | |
| 209 else: | |
| 210 # single paired reads library | |
| 211 reads = ' %s ' % ( args.single_paired ) | |
| 212 | |
| 213 | |
| 214 if not args.bowtie2: | |
| 215 # use bowtie specific options | |
| 216 #additional_opts += ' --best ' # bug in bismark, --best is not available as option. Only --non-best, best-mode is activated by default | |
| 217 if args.seed_mismatches: | |
| 218 # --seedmms | |
| 219 additional_opts += ' -n %s ' % args.seed_mismatches | |
| 220 if args.seed_len: | |
| 221 # --seedlen | |
| 222 additional_opts += ' -l %s ' % args.seed_len | |
| 223 | |
| 224 # alignment options | |
| 225 if args.bowtie2: | |
| 226 additional_opts += ' -p %s --bowtie2 ' % args.num_threads | |
| 227 if args.seed_mismatches: | |
| 228 additional_opts += ' -N %s ' % args.seed_mismatches | |
| 229 if args.seed_len: | |
| 230 additional_opts += ' -L %s ' % args.seed_len | |
| 231 if args.seed_extention_attempts: | |
| 232 additional_opts += ' -D %s ' % args.seed_extention_attempts | |
| 233 if args.max_reseed: | |
| 234 additional_opts += ' -R %s ' % args.max_reseed | |
| 235 if args.no_discordant: | |
| 236 additional_opts += ' --no-discordant ' | |
| 237 if args.no_mixed: | |
| 238 additional_opts += ' --no-mixed ' | |
| 239 """ | |
| 240 if args.maqerr: | |
| 241 additional_opts += ' --maqerr %s ' % args.maqerr | |
| 242 """ | |
| 243 if args.skip_reads: | |
| 244 additional_opts += ' --skip %s ' % args.skip_reads | |
| 245 if args.qupto: | |
| 246 additional_opts += ' --qupto %s ' % args.qupto | |
| 247 if args.phred64: | |
| 248 additional_opts += ' --phred64-quals ' | |
| 249 if args.suppress_header: | |
| 250 additional_opts += ' --sam-no-hd ' | |
| 251 if args.output_unmapped_reads or ( args.output_unmapped_reads_l and args.output_unmapped_reads_r): | |
| 252 additional_opts += ' --un ' | |
| 253 if args.output_suppressed_reads or ( args.output_suppressed_reads_l and args.output_suppressed_reads_r): | |
| 254 additional_opts += ' --ambiguous ' | |
| 255 | |
| 256 arguments.update( {'args': additional_opts, 'reads': reads} ) | |
| 257 | |
| 258 # Final bismark command: | |
| 259 cmd = cmd % arguments | |
| 260 print 'bismark_cmd:', cmd | |
| 261 #sys.stderr.write( cmd ) | |
| 262 #sys.exit(1) | |
| 263 # Run | |
| 264 try: | |
| 265 tmp_out = tempfile.NamedTemporaryFile().name | |
| 266 tmp_stdout = open( tmp_out, 'wb' ) | |
| 267 tmp_err = tempfile.NamedTemporaryFile().name | |
| 268 tmp_stderr = open( tmp_err, 'wb' ) | |
| 269 proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr ) | |
| 270 returncode = proc.wait() | |
| 271 | |
| 272 if returncode != 0: | |
| 273 tmp_stdout.close() | |
| 274 tmp_stderr.close() | |
| 275 # get stderr, allowing for case where it's very large | |
| 276 tmp_stderr = open( tmp_err, 'rb' ) | |
| 277 stderr = '' | |
| 278 buffsize = 1048576 | |
| 279 try: | |
| 280 while True: | |
| 281 stderr += tmp_stderr.read( buffsize ) | |
| 282 if not stderr or len( stderr ) % buffsize != 0: | |
| 283 break | |
| 284 except OverflowError: | |
| 285 pass | |
| 286 | |
| 287 raise Exception, stderr | |
| 288 | |
| 289 tmp_stdout.close() | |
| 290 tmp_stderr.close() | |
| 291 | |
| 292 # TODO: look for errors in program output. | |
| 293 except Exception, e: | |
| 294 stop_err( 'Error in bismark:\n' + str( e ) ) | |
| 295 | |
| 296 # collect and copy output files | |
| 297 if args.output_report_file: | |
| 298 output_report_file = open(args.output_report_file, 'w+') | |
| 299 for line in fileinput.input(glob( os.path.join( output_dir, '*report.txt') )): | |
| 300 output_report_file.write(line) | |
| 301 output_report_file.close() | |
| 302 | |
| 303 | |
| 304 if args.output_suppressed_reads: | |
| 305 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads.txt'))[0], args.output_suppressed_reads ) | |
| 306 if args.output_suppressed_reads_l: | |
| 307 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_1.txt'))[0], args.output_suppressed_reads_l ) | |
| 308 if args.output_suppressed_reads_r: | |
| 309 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_2.txt'))[0], args.output_suppressed_reads_r ) | |
| 310 | |
| 311 if args.output_unmapped_reads: | |
| 312 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads.txt'))[0], args.output_unmapped_reads ) | |
| 313 if args.output_unmapped_reads_l: | |
| 314 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_1.txt'))[0], args.output_unmapped_reads_l ) | |
| 315 if args.output_unmapped_reads_r: | |
| 316 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_2.txt'))[0], args.output_unmapped_reads_r ) | |
| 317 | |
| 318 try: | |
| 319 """ | |
| 320 merge all bam files | |
| 321 """ | |
| 322 #tmp_out = tempfile.NamedTemporaryFile( dir=output_dir ).name | |
| 323 tmp_stdout = open( tmp_out, 'wab' ) | |
| 324 tmp_err = tempfile.NamedTemporaryFile( dir=output_dir ).name | |
| 325 tmp_stderr = open( tmp_err, 'wb' ) | |
| 326 | |
| 327 tmp_res = tempfile.NamedTemporaryFile( dir= output_dir).name | |
| 328 | |
| 329 bam_files = glob( os.path.join( output_dir, '*.bam') ) | |
| 330 if len( bam_files ) > 1: | |
| 331 cmd = 'samtools merge -@ %s -f - %s ' % ( args.num_threads, ' '.join( bam_files ) ) | |
| 332 | |
| 333 p1 = subprocess.Popen( args=shlex.split( cmd ), stdout=subprocess.PIPE ) | |
| 334 proc = subprocess.Popen( ['samtools', 'sort', '-@', str(args.num_threads), '-', tmp_res], stdin=p1.stdout, stdout=tmp_stdout, stderr=tmp_stderr ) | |
| 335 else: | |
| 336 proc = subprocess.Popen( ['samtools', 'sort', '-@', str(args.num_threads), bam_files[0], tmp_res], stdout=tmp_stdout, stderr=tmp_stderr ) | |
| 337 | |
| 338 returncode = proc.wait() | |
| 339 tmp_stdout.close() | |
| 340 tmp_stderr.close() | |
| 341 if returncode != 0: | |
| 342 raise Exception, open( tmp_stderr.name ).read() | |
| 343 | |
| 344 bam_path = "%s.bam" % tmp_res | |
| 345 if os.path.exists( bam_path ): | |
| 346 shutil.copy( bam_path, args.output ) | |
| 347 else: | |
| 348 stop_err( 'BAM file no found:\n' + str( bam_path ) ) | |
| 349 | |
| 350 # TODO: look for errors in program output. | |
| 351 except Exception, e: | |
| 352 stop_err( 'Error in merging bam files:\n' + str( e ) ) | |
| 353 | |
| 354 | |
| 355 if args.output_stdout: | |
| 356 # copy the temporary saved stdout from bismark | |
| 357 shutil.move( tmp_out, args.output_stdout ) | |
| 358 | |
| 359 # Clean up temp dirs | |
| 360 if args.own_file: | |
| 361 if os.path.exists( tmp_index_dir ): | |
| 362 shutil.rmtree( tmp_index_dir ) | |
| 363 if os.path.exists( tmp_bismark_dir ): | |
| 364 shutil.rmtree( tmp_bismark_dir ) | |
| 365 if os.path.exists( output_dir ): | |
| 366 shutil.rmtree( output_dir ) | |
| 367 | |
| 368 if __name__=="__main__": __main__() |
