comparison bismark_mapping/bismark_wrapper.py @ 7:fcadce4d9a06 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author bgruening
date Sat, 06 May 2017 13:18:09 -0400
parents
children
comparison
equal deleted inserted replaced
6:0f8646f22b8d 7:fcadce4d9a06
1 #!/usr/bin/env python
2
3 import argparse
4 import os
5 import shutil
6 import subprocess
7 import sys
8 import shlex
9 import tempfile
10 import fileinput
11 import fileinput
12 from glob import glob
13
14 def stop_err( msg ):
15 sys.stderr.write( "%s\n" % msg )
16 sys.exit()
17
18 def __main__():
19
20 #Parse Command Line
21 parser = argparse.ArgumentParser(description='Wrapper for the bismark bisulfite mapper.')
22 parser.add_argument( '-p', '--num-threads', dest='num_threads',
23 type=int, default=4, help='Use this many threads to align reads. The default is 4.' )
24
25 parser.add_argument( '--bismark_path', dest='bismark_path', help='Path to the bismark perl scripts' )
26
27 parser.add_argument( '--bowtie2', action='store_true', default=False, help='Running bismark with bowtie2 and not with bowtie.' )
28
29 # input options
30 parser.add_argument( '--own-file', dest='own_file', help='' )
31 parser.add_argument( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' )
32 parser.add_argument( '-O', '--output', dest='output' )
33
34
35 parser.add_argument( '--output-report-file', dest='output_report_file' )
36 parser.add_argument( '--suppress-header', dest='suppress_header', action="store_true" )
37
38 parser.add_argument( '--mate-paired', dest='mate_paired', action='store_true', help='Reads are mate-paired', default=False)
39
40
41 parser.add_argument( '-1', '--mate1', dest='mate1',
42 help='The forward reads file in Sanger FASTQ or FASTA format.' )
43 parser.add_argument( '-2', '--mate2', dest='mate2',
44 help='The reverse reads file in Sanger FASTQ or FASTA format.' )
45 parser.add_argument( '--sort-bam', dest='sort_bam', action="store_true" )
46
47 parser.add_argument( '--output-unmapped-reads', dest='output_unmapped_reads',
48 help='Additional output file with unmapped reads (single-end).' )
49 parser.add_argument( '--output-unmapped-reads-l', dest='output_unmapped_reads_l',
50 help='File name for unmapped reads (left, paired-end).' )
51 parser.add_argument( '--output-unmapped-reads-r', dest='output_unmapped_reads_r',
52 help='File name for unmapped reads (right, paired-end).' )
53
54
55 parser.add_argument( '--output-suppressed-reads', dest='output_suppressed_reads',
56 help='Additional output file with suppressed reads (single-end).' )
57 parser.add_argument( '--output-suppressed-reads-l', dest='output_suppressed_reads_l',
58 help='File name for suppressed reads (left, paired-end).' )
59 parser.add_argument( '--output-suppressed-reads-r', dest='output_suppressed_reads_r',
60 help='File name for suppressed reads (right, paired-end).' )
61 parser.add_argument( '--stdout', dest='output_stdout',
62 help='File name for the standard output of bismark.' )
63
64
65 parser.add_argument( '--single-paired', dest='single_paired',
66 help='The single-end reads file in Sanger FASTQ or FASTA format.' )
67
68 parser.add_argument( '--fastq', action='store_true', help='Query filetype is in FASTQ format')
69 parser.add_argument( '--fasta', action='store_true', help='Query filetype is in FASTA format')
70 parser.add_argument( '--phred64-quals', dest='phred64', action="store_true" )
71
72
73 parser.add_argument( '--skip-reads', dest='skip_reads', type=int )
74 parser.add_argument( '--qupto', type=int)
75
76
77 # paired end options
78 parser.add_argument( '-I', '--minins', dest='min_insert' )
79 parser.add_argument( '-X', '--maxins', dest='max_insert' )
80 parser.add_argument( '--no-mixed', dest='no_mixed', action="store_true" )
81 parser.add_argument( '--no-discordant', dest='no_discordant', action="store_true" )
82
83 #parse general options
84 # default 20
85 parser.add_argument( '--seed-len', dest='seed_len', type=int)
86 # default 15
87 parser.add_argument( '--seed-extention-attempts', dest='seed_extention_attempts', type=int )
88 # default 0
89 parser.add_argument( '--seed-mismatches', dest='seed_mismatches', type=int )
90 # default 2
91 parser.add_argument( '--max-reseed', dest='max_reseed', type=int )
92 """
93 # default 70
94 parser.add_argument( '--maqerr', dest='maqerr', type=int )
95 """
96
97 """
98 The number of megabytes of memory a given thread is given to store path
99 descriptors in --best mode. Best-first search must keep track of many paths
100 at once to ensure it is always extending the path with the lowest cumulative
101 cost. Bowtie tries to minimize the memory impact of the descriptors, but
102 they can still grow very large in some cases. If you receive an error message
103 saying that chunk memory has been exhausted in --best mode, try adjusting
104 this parameter up to dedicate more memory to the descriptors. Default: 512.
105 """
106 parser.add_argument( '--chunkmbs', type=int, default=512 )
107
108 args = parser.parse_args()
109
110 # Create bismark index if necessary.
111 index_dir = ""
112 if args.own_file:
113 """
114 Create a temporary index with the offered files from the user.
115 Utilizing the script: bismark_genome_preparation
116 bismark_genome_preparation --bowtie2 hg19/
117 """
118 tmp_index_dir = tempfile.mkdtemp()
119 index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( args.own_file )[1].split( '.' )[:-1] ) )
120 try:
121 """
122 Create a hard link pointing to args.own_file named 'index_path'.fa.
123 """
124 os.symlink( args.own_file, index_path + '.fa' )
125 except Exception, e:
126 if os.path.exists( tmp_index_dir ):
127 shutil.rmtree( tmp_index_dir )
128 stop_err( 'Error in linking the reference database.\n' + str( e ) )
129 # bismark_genome_preparation needs the complete path to the folder in which the database is stored
130 if args.bowtie2:
131 cmd_index = 'bismark_genome_preparation --bowtie2 %s ' % ( tmp_index_dir )
132 else:
133 cmd_index = 'bismark_genome_preparation %s ' % ( tmp_index_dir )
134 if args.bismark_path:
135 if os.path.exists(args.bismark_path):
136 # add the path to the bismark perl scripts, that is needed for galaxy
137 cmd_index = os.path.join(args.bismark_path, cmd_index)
138 else:
139 # assume the same directory as that script
140 cmd_index = 'perl %s' % os.path.join(os.path.realpath(os.path.dirname(__file__)), cmd_index)
141 try:
142 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
143 tmp_stderr = open( tmp, 'wb' )
144 proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stdout=open(os.devnull, 'wb'), stderr=tmp_stderr.fileno() )
145 returncode = proc.wait()
146 tmp_stderr.close()
147 # get stderr, allowing for case where it's very large
148 tmp_stderr = open( tmp, 'rb' )
149 stderr = ''
150 buffsize = 1048576
151 try:
152 while True:
153 stderr += tmp_stderr.read( buffsize )
154 if not stderr or len( stderr ) % buffsize != 0:
155 break
156 except OverflowError:
157 pass
158 tmp_stderr.close()
159 if returncode != 0:
160 raise Exception, stderr
161 except Exception, e:
162 if os.path.exists( tmp_index_dir ):
163 shutil.rmtree( tmp_index_dir )
164 stop_err( 'Error indexing reference sequence\n' + str( e ) )
165 index_dir = tmp_index_dir
166 else:
167 # bowtie path is the path to the index directory and the first path of the index file name
168 index_dir = os.path.dirname( args.index_path )
169
170 # Build bismark command
171
172 """
173 Bismark requires a large amount of temporary disc space. If that is not available, for example on a cluster you can hardcode the
174 TMP to some larger space. It's not recommended but it works.
175 """
176 #tmp_bismark_dir = tempfile.mkdtemp( dir='/data/0/galaxy_db/tmp/' )
177 tmp_bismark_dir = tempfile.mkdtemp()
178 output_dir = os.path.join( tmp_bismark_dir, 'results')
179 cmd = 'bismark %(args)s --bam --temp_dir %(tmp_bismark_dir)s --gzip -o %(output_dir)s --quiet %(genome_folder)s %(reads)s'
180
181 if args.fasta:
182 # the query input files (specified as mate1,mate2 or singles) are FastA
183 cmd = '%s %s' % (cmd, '--fasta')
184 elif args.fastq:
185 cmd = '%s %s' % (cmd, '--fastq')
186
187 if args.bismark_path:
188 # add the path to the bismark perl scripts, that is needed for galaxy
189 if os.path.exists(args.bismark_path):
190 cmd = os.path.join(args.bismark_path, cmd)
191 else:
192 # assume the same directory as that script
193 cmd = 'perl %s' % os.path.join(os.path.realpath(os.path.dirname(__file__)), cmd)
194
195 arguments = {
196 'genome_folder': index_dir,
197 'args': '',
198 'tmp_bismark_dir': tmp_bismark_dir,
199 'output_dir': output_dir,
200 }
201
202 additional_opts = ''
203 # Set up the reads
204 if args.mate_paired:
205 # paired-end reads library
206 reads = '-1 %s ' % ( args.mate1 )
207 reads += ' -2 %s ' % ( args.mate2 )
208 additional_opts += ' -I %s -X %s ' % (args.min_insert, args.max_insert)
209 else:
210 # single paired reads library
211 reads = ' %s ' % ( args.single_paired )
212
213
214 if not args.bowtie2:
215 # use bowtie specific options
216 #additional_opts += ' --best ' # bug in bismark, --best is not available as option. Only --non-best, best-mode is activated by default
217 if args.seed_mismatches:
218 # --seedmms
219 additional_opts += ' -n %s ' % args.seed_mismatches
220 if args.seed_len:
221 # --seedlen
222 additional_opts += ' -l %s ' % args.seed_len
223
224 # alignment options
225 if args.bowtie2:
226 additional_opts += ' -p %s --bowtie2 ' % (int(args.num_threads/2)) #divides by 2 here since bismark will spawn 2 (original top and original bottom) jobs with -p threads each
227 if args.seed_mismatches:
228 additional_opts += ' -N %s ' % args.seed_mismatches
229 if args.seed_len:
230 additional_opts += ' -L %s ' % args.seed_len
231 if args.seed_extention_attempts:
232 additional_opts += ' -D %s ' % args.seed_extention_attempts
233 if args.max_reseed:
234 additional_opts += ' -R %s ' % args.max_reseed
235 if args.no_discordant:
236 additional_opts += ' --no-discordant '
237 if args.no_mixed:
238 additional_opts += ' --no-mixed '
239 """
240 if args.maqerr:
241 additional_opts += ' --maqerr %s ' % args.maqerr
242 """
243 if args.skip_reads:
244 additional_opts += ' --skip %s ' % args.skip_reads
245 if args.qupto:
246 additional_opts += ' --qupto %s ' % args.qupto
247 if args.phred64:
248 additional_opts += ' --phred64-quals '
249 if args.suppress_header:
250 additional_opts += ' --sam-no-hd '
251 if args.output_unmapped_reads or ( args.output_unmapped_reads_l and args.output_unmapped_reads_r):
252 additional_opts += ' --un '
253 if args.output_suppressed_reads or ( args.output_suppressed_reads_l and args.output_suppressed_reads_r):
254 additional_opts += ' --ambiguous '
255
256 arguments.update( {'args': additional_opts, 'reads': reads} )
257
258 # Final bismark command:
259 cmd = cmd % arguments
260 # Run
261 try:
262 tmp_out = tempfile.NamedTemporaryFile().name
263 tmp_stdout = open( tmp_out, 'wb' )
264 tmp_err = tempfile.NamedTemporaryFile().name
265 tmp_stderr = open( tmp_err, 'wb' )
266 proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr )
267 returncode = proc.wait()
268
269 if returncode != 0:
270 tmp_stdout.close()
271 tmp_stderr.close()
272 # get stderr, allowing for case where it's very large
273 tmp_stderr = open( tmp_err, 'rb' )
274 stderr = ''
275 buffsize = 1048576
276 try:
277 while True:
278 stderr += tmp_stderr.read( buffsize )
279 if not stderr or len( stderr ) % buffsize != 0:
280 break
281 except OverflowError:
282 pass
283
284 raise Exception, stderr
285 tmp_stdout.close()
286 tmp_stderr.close()
287
288 # TODO: look for errors in program output.
289 except Exception, e:
290 stop_err( 'Error in bismark:\n' + str( e ) )
291
292 # collect and copy output files
293 if args.output_report_file:
294 output_report_file = open(args.output_report_file, 'w+')
295 for line in fileinput.input(glob( os.path.join( output_dir, '*report.txt') )):
296 output_report_file.write(line)
297 output_report_file.close()
298
299
300 if args.output_suppressed_reads:
301 if glob(os.path.join( output_dir, '*ambiguous_reads.txt')):
302 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads.txt'))[0], args.output_suppressed_reads )
303 if args.output_suppressed_reads_l:
304 if glob(os.path.join(output_dir, '*ambiguous_reads_1.txt')):
305 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_1.txt'))[0], args.output_suppressed_reads_l )
306 if args.output_suppressed_reads_r:
307 if glob(os.path.join(output_dir, '*ambiguous_reads_2.txt')):
308 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_2.txt'))[0], args.output_suppressed_reads_r )
309
310 if args.output_unmapped_reads:
311 if glob(os.path.join(output_dir, '*unmapped_reads.txt')):
312 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads.txt'))[0], args.output_unmapped_reads )
313 if args.output_unmapped_reads_l:
314 if glob(os.path.join(output_dir, '*unmapped_reads_1.txt')):
315 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_1.txt'))[0], args.output_unmapped_reads_l )
316 if args.output_unmapped_reads_r:
317 if glob(os.path.join(output_dir, '*unmapped_reads_2.txt')):
318 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_2.txt'))[0], args.output_unmapped_reads_r )
319
320 try:
321 """
322 merge all bam files
323 """
324 #tmp_out = tempfile.NamedTemporaryFile( dir=output_dir ).name
325 tmp_stdout = open( tmp_out, 'wab' )
326 #tmp_err = tempfile.NamedTemporaryFile( dir=output_dir ).name
327 tmp_stderr = open( tmp_err, 'wab' )
328
329 tmp_res = tempfile.NamedTemporaryFile( dir= output_dir).name
330
331 bam_files = glob( os.path.join( output_dir, '*.bam') )
332 if len( bam_files ) > 1:
333 cmd = 'samtools merge -@ %s -f %s %s ' % ( args.num_threads, tmp_res, ' '.join( bam_files ) )
334
335 proc = subprocess.Popen( args=shlex.split( cmd ), stdout=subprocess.PIPE )
336
337 returncode = proc.wait()
338 tmp_stdout.close()
339 tmp_stderr.close()
340 if returncode != 0:
341 raise Exception, open( tmp_stderr.name ).read()
342 else:
343 tmp_res = bam_files[0]
344
345 bam_path = "%s" % tmp_res
346
347 if os.path.exists( bam_path ):
348 if args.sort_bam:
349 cmd = 'samtools sort -@ %s %s sorted_bam' % (args.num_threads, bam_path)
350 proc = subprocess.Popen( args=shlex.split( cmd ) )
351 returncode = proc.wait()
352 if returncode != 0:
353 raise Exception("Error during '%s'" % cmd)
354 shutil.move( 'sorted_bam.bam', args.output )
355 else:
356 shutil.move( bam_path, args.output )
357 else:
358 stop_err( 'BAM file no found:\n' + str( bam_path ) )
359
360
361 # TODO: look for errors in program output.
362 except Exception, e:
363 stop_err( 'Error in merging bam files:\n' + str( e ) )
364
365
366 if args.output_stdout:
367 # copy the temporary saved stdout from bismark
368 shutil.move( tmp_out, args.output_stdout )
369
370 # Clean up temp dirs
371 if args.own_file:
372 if os.path.exists( tmp_index_dir ):
373 shutil.rmtree( tmp_index_dir )
374 if os.path.exists( tmp_bismark_dir ):
375 shutil.rmtree( tmp_bismark_dir )
376 if os.path.exists( output_dir ):
377 shutil.rmtree( output_dir )
378
379 if __name__=="__main__": __main__()