comparison bismark_wrapper.py @ 0:36d124f44c0a draft

inital commit
author bjoern-gruening
date Tue, 25 Dec 2012 05:45:46 -0500
parents
children 427fb56f2e41
comparison
equal deleted inserted replaced
-1:000000000000 0:36d124f44c0a
1 #!/usr/bin/env python
2
3 import argparse, os, shutil, subprocess, sys, tempfile, fileinput
4 import fileinput
5 from glob import glob
6
7 def stop_err( msg ):
8 sys.stderr.write( "%s\n" % msg )
9 sys.exit()
10
11 def __main__():
12 #Parse Command Line
13 parser = argparse.ArgumentParser(description='Wrapper for the bismark bisulfite mapper.')
14 parser.add_argument( '-p', '--num-threads', dest='num_threads',
15 type=int, default=4, help='Use this many threads to align reads. The default is 4.' )
16
17 parser.add_argument( '--bismark_path', dest='bismark_path', help='Path to the bismark perl scripts' )
18
19 parser.add_argument( '--bowtie2', action='store_true', default=False, help='Running bismark with bowtie2 and not with bowtie.' )
20
21 # input options
22 parser.add_argument( '--own-file', dest='own_file', help='' )
23 parser.add_argument( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' )
24 parser.add_argument( '-O', '--output', dest='output' )
25 parser.add_argument( '--output-report-file', dest='output_report_file' )
26 parser.add_argument( '--suppress-header', dest='suppress_header', action="store_true" )
27
28 parser.add_argument( '--mate-paired', dest='mate_paired', action='store_true', help='Reads are mate-paired', default=False)
29
30
31 parser.add_argument( '-1', '--mate1', dest='mate1',
32 help='The forward reads file in Sanger FASTQ or FASTA format.' )
33 parser.add_argument( '-2', '--mate2', dest='mate2',
34 help='The reverse reads file in Sanger FASTQ or FASTA format.' )
35
36 parser.add_argument( '--output-unmapped-reads', dest='output_unmapped_reads',
37 help='Additional output file with unmapped reads (single-end).' )
38 parser.add_argument( '--output-unmapped-reads-l', dest='output_unmapped_reads_l',
39 help='File name for unmapped reads (left, paired-end).' )
40 parser.add_argument( '--output-unmapped-reads-r', dest='output_unmapped_reads_r',
41 help='File name for unmapped reads (right, paired-end).' )
42
43
44 parser.add_argument( '--output-suppressed-reads', dest='output_suppressed_reads',
45 help='Additional output file with suppressed reads (single-end).' )
46 parser.add_argument( '--output-suppressed-reads-l', dest='output_suppressed_reads_l',
47 help='File name for suppressed reads (left, paired-end).' )
48 parser.add_argument( '--output-suppressed-reads-r', dest='output_suppressed_reads_r',
49 help='File name for suppressed reads (right, paired-end).' )
50
51
52 parser.add_argument( '--single-paired', dest='single_paired',
53 help='The single-end reads file in Sanger FASTQ or FASTA format.' )
54
55 parser.add_argument( '--fastq', action='store_true', help='Query filetype is in FASTQ format')
56 parser.add_argument( '--fasta', action='store_true', help='Query filetype is in FASTA format')
57 parser.add_argument( '--phred64-quals', dest='phred64', action="store_true" )
58
59
60 parser.add_argument( '--skip-reads', dest='skip_reads', type=int )
61 parser.add_argument( '--qupto', type=int)
62
63
64 # paired end options
65 parser.add_argument( '-I', '--minins', dest='min_insert' )
66 parser.add_argument( '-X', '--maxins', dest='max_insert' )
67 parser.add_argument( '--no-mixed', dest='no_mixed', action="store_true" )
68 parser.add_argument( '--no-discordant', dest='no_discordant', action="store_true" )
69
70 #parse general options
71 # default 20
72 parser.add_argument( '--seed-len', dest='seed_len', type=int)
73 # default 15
74 parser.add_argument( '--seed-extention-attempts', dest='seed_extention_attempts', type=int )
75 # default 0
76 parser.add_argument( '--seed-mismatches', dest='seed_mismatches', type=int )
77 # default 2
78 parser.add_argument( '--max-reseed', dest='max_reseed', type=int )
79 """
80 # default 70
81 parser.add_argument( '--maqerr', dest='maqerr', type=int )
82 """
83
84 """
85 The number of megabytes of memory a given thread is given to store path
86 descriptors in --best mode. Best-first search must keep track of many paths
87 at once to ensure it is always extending the path with the lowest cumulative
88 cost. Bowtie tries to minimize the memory impact of the descriptors, but
89 they can still grow very large in some cases. If you receive an error message
90 saying that chunk memory has been exhausted in --best mode, try adjusting
91 this parameter up to dedicate more memory to the descriptors. Default: 512.
92 """
93 parser.add_argument( '--chunkmbs', type=int, default=512 )
94
95 args = parser.parse_args()
96
97 # Create bismark index if necessary.
98 index_dir = ""
99 if args.own_file:
100 """
101 Create a temporary index with the offered files from the user.
102 Utilizing the script: bismark_genome_preparation
103 bismark_genome_preparation --bowtie2 hg19/
104 """
105 tmp_index_dir = tempfile.mkdtemp()
106 index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( args.own_file )[1].split( '.' )[:-1] ) )
107 try:
108 """
109 Create a hard link pointing to args.own_file named 'index_path'.fa.
110 """
111 os.symlink( args.own_file, index_path + '.fa' )
112 except Exception, e:
113 if os.path.exists( tmp_index_dir ):
114 shutil.rmtree( tmp_index_dir )
115 stop_err( 'Error in linking the reference database.\n' + str( e ) )
116 # bismark_genome_preparation needs the complete path to the folder in which the database is stored
117 if args.bowtie2:
118 cmd_index = 'bismark_genome_preparation --bowtie2 %s ' % ( tmp_index_dir )
119 else:
120 cmd_index = 'bismark_genome_preparation %s ' % ( tmp_index_dir )
121 if args.bismark_path:
122 # add the path to the bismark perl scripts, that is needed for galaxy
123 cmd_index = '%s/%s' % (args.bismark_path, cmd_index)
124 try:
125 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
126 tmp_stderr = open( tmp, 'wb' )
127 proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stdout=open(os.devnull, 'wb'), stderr=tmp_stderr.fileno() )
128 returncode = proc.wait()
129 tmp_stderr.close()
130 # get stderr, allowing for case where it's very large
131 tmp_stderr = open( tmp, 'rb' )
132 stderr = ''
133 buffsize = 1048576
134 try:
135 while True:
136 stderr += tmp_stderr.read( buffsize )
137 if not stderr or len( stderr ) % buffsize != 0:
138 break
139 except OverflowError:
140 pass
141 tmp_stderr.close()
142 if returncode != 0:
143 raise Exception, stderr
144 except Exception, e:
145 if os.path.exists( tmp_index_dir ):
146 shutil.rmtree( tmp_index_dir )
147 stop_err( 'Error indexing reference sequence\n' + str( e ) )
148 index_dir = tmp_index_dir
149 else:
150 index_dir = args.index_path
151
152 # Build bismark command
153 tmp_bismark_dir = tempfile.mkdtemp()
154 output_dir = os.path.join( tmp_bismark_dir, 'results')
155 cmd = 'bismark %(args)s --temp_dir %(tmp_bismark_dir)s -o %(output_dir)s --quiet %(genome_folder)s %(reads)s'
156 if args.bismark_path:
157 # add the path to the bismark perl scripts, that is needed for galaxy
158 cmd = '%s/%s' % (args.bismark_path, cmd)
159
160 arguments = {
161 'genome_folder': index_dir,
162 'args': '',
163 'tmp_bismark_dir': tmp_bismark_dir,
164 'output_dir': output_dir,
165 }
166
167 additional_opts = ''
168 # Set up the reads
169 if args.mate_paired:
170 # paired-end reads library
171 reads = '-1 %s ' % ( args.mate1 )
172 reads += ' -2 %s ' % ( args.mate2 )
173 additional_opts += ' -I %s -X %s ' % (args.min_insert, args.max_insert)
174 else:
175 # single paired reads library
176 reads = ' %s ' % ( args.single_paired )
177
178
179 if not args.bowtie2:
180 # use bowtie specific options
181 additional_opts += ' --best '
182 if args.seed_mismatches:
183 # --seedmms
184 additional_opts += ' -n %s ' % args.seed_mismatches
185 if args.seed_len:
186 # --seedlen
187 additional_opts += ' -l %s ' % args.seed_len
188
189 # alignment options
190 if args.bowtie2:
191 additional_opts += ' -p %s --bowtie2 ' % args.num_threads
192 if args.seed_mismatches:
193 additional_opts += ' -N %s ' % args.seed_mismatches
194 if args.seed_len:
195 additional_opts += ' -L %s ' % args.seed_len
196 if args.seed_extention_attempts:
197 additional_opts += ' -D %s ' % args.seed_extention_attempts
198 if args.max_reseed:
199 additional_opts += ' -R %s ' % args.max_reseed
200 if args.no_discordant:
201 additional_opts += ' --no-discordant '
202 if args.no_mixed:
203 additional_opts += ' --no-mixed '
204 """
205 if args.maqerr:
206 additional_opts += ' --maqerr %s ' % args.maqerr
207 """
208 if args.skip_reads:
209 additional_opts += ' --skip %s ' % args.skip_reads
210 if args.qupto:
211 additional_opts += ' --qupto %s ' % args.qupto
212 if args.phred64:
213 additional_opts += ' --phred64-quals '
214 if args.suppress_header:
215 additional_opts += ' --sam-no-hd '
216 if args.output_unmapped_reads or ( args.output_unmapped_reads_l and args.output_unmapped_reads_r):
217 additional_opts += ' --un '
218 if args.output_suppressed_reads or ( args.output_suppressed_reads_l and args.output_suppressed_reads_r):
219 additional_opts += ' --ambiguous '
220
221 arguments.update( {'args': additional_opts, 'reads': reads} )
222
223 # Final command:
224 cmd = cmd % arguments
225
226 # Run
227 try:
228 tmp_out = tempfile.NamedTemporaryFile().name
229 tmp_stdout = open( tmp_out, 'wb' )
230 tmp_err = tempfile.NamedTemporaryFile().name
231 tmp_stderr = open( tmp_err, 'wb' )
232 proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr )
233 returncode = proc.wait()
234 tmp_stderr.close()
235 # get stderr, allowing for case where it's very large
236 tmp_stderr = open( tmp_err, 'rb' )
237 stderr = ''
238 buffsize = 1048576
239 try:
240 while True:
241 stderr += tmp_stderr.read( buffsize )
242 if not stderr or len( stderr ) % buffsize != 0:
243 break
244 except OverflowError:
245 pass
246 tmp_stdout.close()
247 tmp_stderr.close()
248 if returncode != 0:
249 raise Exception, stderr
250
251 # TODO: look for errors in program output.
252 except Exception, e:
253 stop_err( 'Error in bismark:\n' + str( e ) )
254
255
256 # collect and copy output files
257 """
258 if args.output_report_file:
259 output_report_file = open(args.output_report_file, 'w+')
260 for line in fileinput.input(glob( os.path.join( output_dir, '*.txt') )):
261 output_report_file.write(line)
262 output_report_file.close()
263 """
264
265 if args.output_suppressed_reads:
266 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads.txt'))[0], args.output_suppressed_reads )
267 if args.output_suppressed_reads_l:
268 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_1.txt'))[0], args.output_suppressed_reads_l )
269 if args.output_suppressed_reads_r:
270 shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_2.txt'))[0], args.output_suppressed_reads_r )
271
272 if args.output_unmapped_reads:
273 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads.txt'))[0], args.output_unmapped_reads )
274 if args.output_unmapped_reads_l:
275 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_1.txt'))[0], args.output_unmapped_reads_l )
276 if args.output_unmapped_reads_r:
277 shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_2.txt'))[0], args.output_unmapped_reads_r )
278
279 shutil.move( glob( os.path.join( output_dir, '*.sam'))[0] , args.output)
280
281 # Clean up temp dirs
282 if args.own_file:
283 if os.path.exists( tmp_index_dir ):
284 shutil.rmtree( tmp_index_dir )
285 if os.path.exists( tmp_bismark_dir ):
286 shutil.rmtree( tmp_bismark_dir )
287
288 if __name__=="__main__": __main__()