comparison bismark_wrapper.py @ 0:62c6da72dd4a draft

Uploaded
author bgruening
date Sat, 06 Jul 2013 09:57:36 -0400
parents
children 82814a8a2395
comparison
equal deleted inserted replaced
-1:000000000000 0:62c6da72dd4a
1 #!/usr/bin/env python
2
3 import argparse
4 import os
5 import shutil
6 import subprocess
7 import sys
8 import shlex
9 import tempfile
10 import fileinput
11 import fileinput
12 from glob import glob
13
14 def stop_err( msg ):
15 sys.stderr.write( "%s\n" % msg )
16 sys.exit()
17
18 def __main__():
19
20 print 'tempfile_location',tempfile.gettempdir()
21 #Parse Command Line
22 parser = argparse.ArgumentParser(description='Wrapper for the bismark bisulfite mapper.')
23 parser.add_argument( '-p', '--num-threads', dest='num_threads',
24 type=int, default=4, help='Use this many threads to align reads. The default is 4.' )
25
26 parser.add_argument( '--bismark_path', dest='bismark_path', help='Path to the bismark perl scripts' )
27
28 parser.add_argument( '--bowtie2', action='store_true', default=False, help='Running bismark with bowtie2 and not with bowtie.' )
29
30 # input options
31 parser.add_argument( '--own-file', dest='own_file', help='' )
32 parser.add_argument( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' )
33 parser.add_argument( '-O', '--output', dest='output' )
34
35
36 parser.add_argument( '--output-report-file', dest='output_report_file' )
37 parser.add_argument( '--suppress-header', dest='suppress_header', action="store_true" )
38
39 parser.add_argument( '--mate-paired', dest='mate_paired', action='store_true', help='Reads are mate-paired', default=False)
40
41
42 parser.add_argument( '-1', '--mate1', dest='mate1',
43 help='The forward reads file in Sanger FASTQ or FASTA format.' )
44 parser.add_argument( '-2', '--mate2', dest='mate2',
45 help='The reverse reads file in Sanger FASTQ or FASTA format.' )
46
47 parser.add_argument( '--output-unmapped-reads', dest='output_unmapped_reads',
48 help='Additional output file with unmapped reads (single-end).' )
49 parser.add_argument( '--output-unmapped-reads-l', dest='output_unmapped_reads_l',
50 help='File name for unmapped reads (left, paired-end).' )
51 parser.add_argument( '--output-unmapped-reads-r', dest='output_unmapped_reads_r',
52 help='File name for unmapped reads (right, paired-end).' )
53
54
55 parser.add_argument( '--output-suppressed-reads', dest='output_suppressed_reads',
56 help='Additional output file with suppressed reads (single-end).' )
57 parser.add_argument( '--output-suppressed-reads-l', dest='output_suppressed_reads_l',
58 help='File name for suppressed reads (left, paired-end).' )
59 parser.add_argument( '--output-suppressed-reads-r', dest='output_suppressed_reads_r',
60 help='File name for suppressed reads (right, paired-end).' )
61 parser.add_argument( '--stdout', dest='output_stdout',
62 help='File name for the standard output of bismark.' )
63
64
65 parser.add_argument( '--single-paired', dest='single_paired',
66 help='The single-end reads file in Sanger FASTQ or FASTA format.' )
67
68 parser.add_argument( '--fastq', action='store_true', help='Query filetype is in FASTQ format')
69 parser.add_argument( '--fasta', action='store_true', help='Query filetype is in FASTA format')
70 parser.add_argument( '--phred64-quals', dest='phred64', action="store_true" )
71
72
73 parser.add_argument( '--skip-reads', dest='skip_reads', type=int )
74 parser.add_argument( '--qupto', type=int)
75
76
77 # paired end options
78 parser.add_argument( '-I', '--minins', dest='min_insert' )
79 parser.add_argument( '-X', '--maxins', dest='max_insert' )
80 parser.add_argument( '--no-mixed', dest='no_mixed', action="store_true" )
81 parser.add_argument( '--no-discordant', dest='no_discordant', action="store_true" )
82
83 #parse general options
84 # default 20
85 parser.add_argument( '--seed-len', dest='seed_len', type=int)
86 # default 15
87 parser.add_argument( '--seed-extention-attempts', dest='seed_extention_attempts', type=int )
88 # default 0
89 parser.add_argument( '--seed-mismatches', dest='seed_mismatches', type=int )
90 # default 2
91 parser.add_argument( '--max-reseed', dest='max_reseed', type=int )
92 """
93 # default 70
94 parser.add_argument( '--maqerr', dest='maqerr', type=int )
95 """
96
97 """
98 The number of megabytes of memory a given thread is given to store path
99 descriptors in --best mode. Best-first search must keep track of many paths
100 at once to ensure it is always extending the path with the lowest cumulative
101 cost. Bowtie tries to minimize the memory impact of the descriptors, but
102 they can still grow very large in some cases. If you receive an error message
103 saying that chunk memory has been exhausted in --best mode, try adjusting
104 this parameter up to dedicate more memory to the descriptors. Default: 512.
105 """
106 parser.add_argument( '--chunkmbs', type=int, default=512 )
107
108 args = parser.parse_args()
109
110 # Create bismark index if necessary.
111 index_dir = ""
112 if args.own_file:
113 """
114 Create a temporary index with the offered files from the user.
115 Utilizing the script: bismark_genome_preparation
116 bismark_genome_preparation --bowtie2 hg19/
117 """
118 tmp_index_dir = tempfile.mkdtemp()
119 index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( args.own_file )[1].split( '.' )[:-1] ) )
120 try:
121 """
122 Create a hard link pointing to args.own_file named 'index_path'.fa.
123 """
124 os.symlink( args.own_file, index_path + '.fa' )
125 except Exception, e:
126 if os.path.exists( tmp_index_dir ):
127 shutil.rmtree( tmp_index_dir )
128 stop_err( 'Error in linking the reference database.\n' + str( e ) )
129 # bismark_genome_preparation needs the complete path to the folder in which the database is stored
130 if args.bowtie2:
131 cmd_index = 'bismark_genome_preparation --bowtie2 %s ' % ( tmp_index_dir )
132 else:
133 cmd_index = 'bismark_genome_preparation %s ' % ( tmp_index_dir )
134 if args.bismark_path:
135 if os.path.exists(args.bismark_path):
136 # add the path to the bismark perl scripts, that is needed for galaxy
137 cmd_index = os.path.join(args.bismark_path, cmd_index)
138 else:
139 # assume the same directory as that script
140 cmd_index = 'perl %s' % os.path.join(os.path.realpath(os.path.dirname(__file__)), cmd_index)
141 try:
142 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
143 tmp_stderr = open( tmp, 'wb' )
144 proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stdout=open(os.devnull, 'wb'), stderr=tmp_stderr.fileno() )
145 returncode = proc.wait()
146 tmp_stderr.close()
147 # get stderr, allowing for case where it's very large
148 tmp_stderr = open( tmp, 'rb' )
149 stderr = ''
150 buffsize = 1048576
151 try:
152 while True:
153 stderr += tmp_stderr.read( buffsize )
154 if not stderr or len( stderr ) % buffsize != 0:
155 break
156 except OverflowError:
157 pass
158 tmp_stderr.close()
159 if returncode != 0:
160 raise Exception, stderr
161 except Exception, e:
162 if os.path.exists( tmp_index_dir ):
163 shutil.rmtree( tmp_index_dir )
164 stop_err( 'Error indexing reference sequence\n' + str( e ) )
165 index_dir = tmp_index_dir
166 else:
167 # bowtie path is the path to the index directory and the first path of the index file name
168 index_dir = os.path.dirname( args.index_path )
169
170 # Build bismark command
171
172 """
173 Bismark requires a large amount of temporary disc space. If that is not available, for example on a cluster you can hardcode the
174 TMP to some larger space. It's not recommended but it works.
175 """
176 #tmp_bismark_dir = tempfile.mkdtemp( dir='/data/0/galaxy_db/tmp/' )
177 tmp_bismark_dir = tempfile.mkdtemp()
178 output_dir = os.path.join( tmp_bismark_dir, 'results')
179 cmd = 'bismark %(args)s --bam --temp_dir %(tmp_bismark_dir)s -o %(output_dir)s --quiet %(genome_folder)s %(reads)s'
180
181 if args.fasta:
182 # he query input files (specified as mate1,mate2 or singles) are FastA
183 cmd = '%s %s' % (cmd, '--fasta')
184 elif args.fastq:
185 cmd = '%s %s' % (cmd, '--fastq')
186
187 if args.bismark_path:
188 # add the path to the bismark perl scripts, that is needed for galaxy
189 if os.path.exists(args.bismark_path):
190 cmd = os.path.join(args.bismark_path, cmd)
191 else:
192 # assume the same directory as that script
193 cmd = 'perl %s' % os.path.join(os.path.realpath(os.path.dirname(__file__)), cmd)
194
195 arguments = {
196 'genome_folder': index_dir,
197 'args': '',
198 'tmp_bismark_dir': tmp_bismark_dir,
199 'output_dir': output_dir,
200 }
201
202 additional_opts = ''
203 # Set up the reads
204 if args.mate_paired:
205 # paired-end reads library
206 reads = '-1 %s ' % ( args.mate1 )
207 reads += ' -2 %s ' % ( args.mate2 )
208 additional_opts += ' -I %s -X %s ' % (args.min_insert, args.max_insert)
209 else:
210 # single paired reads library
211 reads = ' %s ' % ( args.single_paired )
212
213
214 if not args.bowtie2:
215 # use bowtie specific options
216 #additional_opts += ' --best ' # bug in bismark, --best is not available as option. Only --non-best, best-mode is activated by default
217 if args.seed_mismatches:
218 # --seedmms
219 additional_opts += ' -n %s ' % args.seed_mismatches
220 if args.seed_len:
221 # --seedlen
222 additional_opts += ' -l %s ' % args.seed_len
223
224 # alignment options
225 if args.bowtie2:
226 additional_opts += ' -p %s --bowtie2 ' % args.num_threads
227 if args.seed_mismatches:
228 additional_opts += ' -N %s ' % args.seed_mismatches
229 if args.seed_len:
230 additional_opts += ' -L %s ' % args.seed_len
231 if args.seed_extention_attempts:
232 additional_opts += ' -D %s ' % args.seed_extention_attempts
233 if args.max_reseed:
234 additional_opts += ' -R %s ' % args.max_reseed
235 if args.no_discordant:
236 additional_opts += ' --no-discordant '
237 if args.no_mixed:
238 additional_opts += ' --no-mixed '
239 """
240 if args.maqerr:
241 additional_opts += ' --maqerr %s ' % args.maqerr
242 """
243 if args.skip_reads:
244 additional_opts += ' --skip %s ' % args.skip_reads
245 if args.qupto:
246 additional_opts += ' --qupto %s ' % args.qupto
247 if args.phred64:
248 additional_opts += ' --phred64-quals '
249 if args.suppress_header:
250 additional_opts += ' --sam-no-hd '
251 if args.output_unmapped_reads or ( args.output_unmapped_reads_l and args.output_unmapped_reads_r):
252 additional_opts += ' --un '
253 if args.output_suppressed_reads or ( args.output_suppressed_reads_l and args.output_suppressed_reads_r):
254 additional_opts += ' --ambiguous '
255
256 arguments.update( {'args': additional_opts, 'reads': reads} )
257
258 # Final bismark command:
259 cmd = cmd % arguments
260 print 'bismark_cmd:', cmd
261 #sys.stderr.write( cmd )
262 #sys.exit(1)
263 # Run
264 try:
265 tmp_out = tempfile.NamedTemporaryFile().name
266 tmp_stdout = open( tmp_out, 'wb' )
267 tmp_err = tempfile.NamedTemporaryFile().name
268 tmp_stderr = open( tmp_err, 'wb' )
269 proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr )
270 returncode = proc.wait()
271
272 if returncode != 0:
273 tmp_stdout.close()
274 tmp_stderr.close()
275 # get stderr, allowing for case where it's very large
276 tmp_stderr = open( tmp_err, 'rb' )
277 stderr = ''
278 buffsize = 1048576
279 try:
280 while True:
281 stderr += tmp_stderr.read( buffsize )
282 if not stderr or len( stderr ) % buffsize != 0:
283 break
284 except OverflowError:
285 pass
286
287 raise Exception, stderr
288
289 tmp_stdout.close()
290 tmp_stderr.close()
291
292 # TODO: look for errors in program output.
293 except Exception, e:
294 stop_err( 'Error in bismark:\n' + str( e ) )
295
296 # collect and copy output files
297 if args.output_report_file:
298 output_report_file = open(args.output_report_file, 'w+')
299 for line in fileinput.input(glob( os.path.join( output_dir, '*report.txt') )):
300 output_report_file.write(line)
301 output_report_file.close()
302
303
304 if args.output_suppressed_reads:
305 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads.txt'))[0], args.output_suppressed_reads )
306 if args.output_suppressed_reads_l:
307 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_1.txt'))[0], args.output_suppressed_reads_l )
308 if args.output_suppressed_reads_r:
309 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_2.txt'))[0], args.output_suppressed_reads_r )
310
311 if args.output_unmapped_reads:
312 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads.txt'))[0], args.output_unmapped_reads )
313 if args.output_unmapped_reads_l:
314 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_1.txt'))[0], args.output_unmapped_reads_l )
315 if args.output_unmapped_reads_r:
316 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_2.txt'))[0], args.output_unmapped_reads_r )
317
318 try:
319 """
320 merge all bam files
321 """
322 #tmp_out = tempfile.NamedTemporaryFile( dir=output_dir ).name
323 tmp_stdout = open( tmp_out, 'wab' )
324 tmp_err = tempfile.NamedTemporaryFile( dir=output_dir ).name
325 tmp_stderr = open( tmp_err, 'wb' )
326
327 tmp_res = tempfile.NamedTemporaryFile( dir= output_dir).name
328
329 bam_files = glob( os.path.join( output_dir, '*.bam') )
330 if len( bam_files ) > 1:
331 cmd = 'samtools merge -@ %s -f - %s ' % ( args.num_threads, ' '.join( bam_files ) )
332
333 p1 = subprocess.Popen( args=shlex.split( cmd ), stdout=subprocess.PIPE )
334 proc = subprocess.Popen( ['samtools', 'sort', '-@', str(args.num_threads), '-', tmp_res], stdin=p1.stdout, stdout=tmp_stdout, stderr=tmp_stderr )
335 else:
336 proc = subprocess.Popen( ['samtools', 'sort', '-@', str(args.num_threads), bam_files[0], tmp_res], stdout=tmp_stdout, stderr=tmp_stderr )
337
338 returncode = proc.wait()
339 tmp_stdout.close()
340 tmp_stderr.close()
341 if returncode != 0:
342 raise Exception, open( tmp_stderr.name ).read()
343
344 bam_path = "%s.bam" % tmp_res
345 if os.path.exists( bam_path ):
346 shutil.copy( bam_path, args.output )
347 else:
348 stop_err( 'BAM file no found:\n' + str( bam_path ) )
349
350 # TODO: look for errors in program output.
351 except Exception, e:
352 stop_err( 'Error in merging bam files:\n' + str( e ) )
353
354
355 if args.output_stdout:
356 # copy the temporary saved stdout from bismark
357 shutil.move( tmp_out, args.output_stdout )
358
359 # Clean up temp dirs
360 if args.own_file:
361 if os.path.exists( tmp_index_dir ):
362 shutil.rmtree( tmp_index_dir )
363 if os.path.exists( tmp_bismark_dir ):
364 shutil.rmtree( tmp_bismark_dir )
365 if os.path.exists( output_dir ):
366 shutil.rmtree( output_dir )
367
368 if __name__=="__main__": __main__()