Mercurial > repos > iuc > varscan_mpileup
comparison varscan.py @ 2:d062703d6f13 draft
planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit 30867f1f022bed18ba1c3b8dc9c54226890b3a9c
author | iuc |
---|---|
date | Tue, 04 Dec 2018 05:16:18 -0500 |
parents | |
children | ab2f908f0b08 |
comparison
equal
deleted
inserted
replaced
1:0bc800d67a0e | 2:d062703d6f13 |
---|---|
1 #!/usr/bin/env python3 | |
2 from __future__ import print_function | |
3 | |
4 import argparse | |
5 import io | |
6 import os | |
7 import subprocess | |
8 import sys | |
9 import tempfile | |
10 import time | |
11 from contextlib import ExitStack | |
12 from functools import partial | |
13 from threading import Thread | |
14 | |
15 import pysam | |
16 | |
17 | |
class VariantCallingError (RuntimeError):
    """Exception class for issues with samtools and varscan subprocesses."""

    def __init__(self, message=None, call='', error=''):
        # message: human-readable explanation appended to the header
        # call: the failing command line (whitespace-stripped for display)
        # error: captured stderr of the failing call, if any
        self.message = message
        self.call = call.strip()
        self.error = error.strip()

    def __str__(self):
        """Return a readable report of the failed call, or '' w/o message."""
        if self.message is None:
            return ''
        if self.error:
            msg_header = '"{0}" failed with:\n{1}\n\n'.format(
                self.call, self.error
            )
        else:
            # Bug fix: previously the second string literal together with
            # its .format(...) call formed a standalone expression
            # statement, so msg_header stayed the literal, unformatted
            # '{0} failed.\n'. Parenthesize so the implicit literal
            # concatenation and the format call apply to the whole text.
            msg_header = (
                '{0} failed.\n'
                'No further information about this error is available.\n\n'
                .format(self.call)
            )
        return msg_header + self.message
39 | |
40 | |
41 class VarScanCaller (object): | |
    def __init__(self, ref_genome, bam_input_files,
                 max_depth=None,
                 min_mapqual=None, min_basequal=None,
                 threads=1, verbose=False, quiet=True
                 ):
        """Prepare a per-contig parallel "samtools mpileup | varscan somatic" runner.

        :param ref_genome: path to the reference genome fasta (must be
            readable by pysam.FastaFile, i.e. faidx-indexed)
        :param bam_input_files: list of BAM file paths; other methods of
            this class assume [normal, tumor] order
        :param max_depth: samtools mpileup -d setting (max per-file depth)
        :param min_mapqual: samtools mpileup -q (min mapping quality)
        :param min_basequal: samtools mpileup -Q and varscan --min-avg-qual
        :param threads: number of contig jobs to run concurrently
        :param verbose: print progress messages to stdout
        :param quiet: suppress stderr output of the wrapped tools
        """
        self.ref_genome = ref_genome
        self.bam_input_files = bam_input_files
        self.max_depth = max_depth
        self.min_mapqual = min_mapqual
        self.min_basequal = min_basequal
        self.threads = threads
        self.verbose = verbose
        self.quiet = quiet

        # Record contig names and lengths once; they drive both the
        # per-contig parallelization and the output VCF headers.
        with pysam.FastaFile(ref_genome) as ref_fa:
            self.ref_contigs = ref_fa.references
            self.ref_lengths = ref_fa.lengths

        self.pileup_engine = ['samtools', 'mpileup']
        self.varcall_engine = ['varscan', 'somatic']
        # varscan somatic writes its output files itself, so stdout of the
        # variant caller does not need to be redirected to the temp file
        self.requires_stdout_redirect = False
        # Factory for per-contig temporary output files in the working
        # directory; delete=False because the variant caller reopens the
        # file by name after we close it.
        self.TemporaryContigVCF = partial(
            tempfile.NamedTemporaryFile,
            mode='wb', suffix='', delete=False, dir=os.getcwd()
        )
        # populated by varcall_parallel with successfully finished outputs
        self.tmpfiles = []
68 | |
69 def _get_pysam_pileup_args(self): | |
70 param_dict = {} | |
71 if self.max_depth is not None: | |
72 param_dict['max_depth'] = self.max_depth | |
73 if self.min_mapqual is not None: | |
74 param_dict['min_mapping_quality'] = self.min_mapqual | |
75 if self.min_basequal is not None: | |
76 param_dict['min_base_quality'] = self.min_basequal | |
77 param_dict['compute_baq'] = False | |
78 param_dict['stepper'] = 'samtools' | |
79 return param_dict | |
80 | |
    def varcall_parallel(self, normal_purity=None, tumor_purity=None,
                         min_coverage=None,
                         min_var_count=None,
                         min_var_freq=None, min_hom_freq=None,
                         p_value=None, somatic_p_value=None,
                         threads=None, verbose=None, quiet=None
                         ):
        """Run `samtools mpileup | varscan somatic` for every ref contig.

        Up to *threads* contig pipelines are kept running concurrently.
        On success, the names of the per-contig output files (in launch
        order) are stored in self.tmpfiles for merge_and_postprocess.

        :raises VariantCallingError: if any subprocess exits non-zero.
        """
        if not threads:
            threads = self.threads
        if verbose is None:
            verbose = self.verbose
        if quiet is None:
            quiet = self.quiet
        # mapping of method parameters to varcall engine command line options
        varcall_engine_option_mapping = [
            ('--normal-purity', normal_purity),
            ('--tumor-purity', tumor_purity),
            ('--min-coverage', min_coverage),
            ('--min-reads2', min_var_count),
            ('--min-var-freq', min_var_freq),
            ('--min-freq-for-hom', min_hom_freq),
            ('--p-value', p_value),
            ('--somatic-p-value', somatic_p_value),
            ('--min-avg-qual', self.min_basequal)
        ]
        varcall_engine_options = []
        for option, value in varcall_engine_option_mapping:
            if value is not None:
                varcall_engine_options += [option, str(value)]
        # -B: disable BAQ computation (matches compute_baq=False elsewhere)
        pileup_engine_options = ['-B']
        if self.max_depth is not None:
            pileup_engine_options += ['-d', str(self.max_depth)]
        if self.min_mapqual is not None:
            pileup_engine_options += ['-q', str(self.min_mapqual)]
        if self.min_basequal is not None:
            pileup_engine_options += ['-Q', str(self.min_basequal)]

        # Create a tuple of calls to samtools mpileup and varscan for
        # each contig. The contig name is stored as the third element of
        # that tuple.
        # The calls are stored in the reverse order of the contig list so
        # that they can be popped off later in the original order.
        # Note: '-r contig:' restricts mpileup to the whole contig.
        calls = [(
            self.pileup_engine + pileup_engine_options + [
                '-r', contig + ':',
                '-f', self.ref_genome
            ] + self.bam_input_files,
            self.varcall_engine + [
                '-', '{out}', '--output-vcf', '1', '--mpileup', '1'
            ] + varcall_engine_options,
            contig
        ) for contig in self.ref_contigs[::-1]]

        if verbose:
            print('Starting variant calling ..')

        # launch subprocesses and monitor their status
        subprocesses = []
        error_table = {}
        tmp_io_started = []
        tmp_io_finished = []
        self.tmpfiles = []

        def enqueue_stderr_output(out, stderr_buffer):
            # Runs on a daemon thread and drains *out* so the subprocess
            # cannot block on a full stderr pipe.
            for line in iter(out.readline, b''):
                # Eventually we are going to print the contents of
                # the stderr_buffer to sys.stderr so we can
                # decode things here using its encoding.
                # We do a 'backslashreplace' just to be on the safe side.
                stderr_buffer.write(line.decode(sys.stderr.encoding,
                                                'backslashreplace'))
            out.close()

        try:
            while subprocesses or calls:
                while calls and len(subprocesses) < threads:
                    # There are still calls waiting for execution and we
                    # have unoccupied threads so we launch a new combined
                    # call to samtools mpileup and the variant caller.

                    # pop the call arguments from our call stack
                    call = calls.pop()
                    # get the name of the contig that this call is going
                    # to work on
                    contig = call[2]
                    # Based on the contig name, generate a readable and
                    # file system-compatible prefix and use it to create
                    # a named temporary file, to which the call output
                    # will be redirected.
                    # At the moment we create the output file we add it to
                    # the list of all temporary output files so that we can
                    # remove it eventually during cleanup.
                    call_out = self.TemporaryContigVCF(
                        prefix=''.join(
                            c if c.isalnum() else '_' for c in contig
                        ) + '_',
                    )
                    # maintain a list of variant call outputs
                    # in the order the subprocesses got launched
                    tmp_io_started.append(call_out.name)
                    if self.requires_stdout_redirect:
                        # redirect stdout to the temporary file just created
                        stdout_p2 = call_out
                        stderr_p2 = subprocess.PIPE
                    else:
                        # variant caller wants to write output to file
                        # directly; substitute the real file name for the
                        # '{out}' placeholder and close our handle so the
                        # caller can reopen the file by name
                        stdout_p2 = subprocess.PIPE
                        stderr_p2 = subprocess.STDOUT
                        call[1][call[1].index('{out}')] = call_out.name
                        call_out.close()
                    # for reporting purposes, join the arguments for the
                    # samtools and the variant caller calls into readable
                    # strings
                    c_str = (' '.join(call[0]), ' '.join(call[1]))
                    error_table[c_str] = [io.StringIO(), io.StringIO()]
                    # start the subprocesses
                    p1 = subprocess.Popen(
                        call[0],
                        stdout=subprocess.PIPE, stderr=subprocess.PIPE
                    )
                    p2 = subprocess.Popen(
                        call[1],
                        stdin=p1.stdout,
                        stdout=stdout_p2,
                        stderr=stderr_p2
                    )
                    # subprocess bookkeeping
                    subprocesses.append((c_str, p1, p2, call_out, contig))
                    # make sure our newly launched call does not block
                    # because its buffers fill up; closing our copy of
                    # p1.stdout also lets p1 receive SIGPIPE if p2 dies
                    p1.stdout.close()
                    t1 = Thread(
                        target=enqueue_stderr_output,
                        args=(p1.stderr, error_table[c_str][0])
                    )
                    t2 = Thread(
                        target=enqueue_stderr_output,
                        args=(
                            p2.stderr
                            if self.requires_stdout_redirect else
                            p2.stdout,
                            error_table[c_str][1]
                        )
                    )
                    t1.daemon = t2.daemon = True
                    t1.start()
                    t2.start()

                    if verbose:
                        print(
                            'Calling variants for contig: {0}'
                            .format(call[2])
                        )

                # monitor all running calls to see if any of them are done
                for call, p1, p2, ofo, contig in subprocesses:
                    p1_stat = p1.poll()
                    p2_stat = p2.poll()
                    if p1_stat is not None or p2_stat is not None:
                        # There is an outcome for this process!
                        # Lets see what it is
                        if p1_stat or p2_stat:
                            # non-zero exit status of either process
                            print()
                            print(
                                error_table[call][0].getvalue(),
                                error_table[call][1].getvalue(),
                                file=sys.stderr
                            )
                            raise VariantCallingError(
                                'Variant Calling for contig {0} failed.'
                                .format(contig),
                                call='{0} | {1}'.format(call[0], call[1])
                            )
                        if p1_stat == 0 and p2_stat is None:
                            # VarScan is not handling the no output from
                            # samtools mpileup situation correctly so maybe
                            # that's the issue here
                            last_words = error_table[call][1].getvalue(
                            ).splitlines()[-4:]
                            if len(last_words) < 4 or any(
                                not msg.startswith('Input stream not ready')
                                for msg in last_words
                            ):
                                # lets give this process a bit more time
                                break
                            # VarScan is waiting for input it will never
                            # get, stop it.
                            p2.terminate()
                            # NOTE: the output is deliberately NOT added to
                            # tmp_io_finished, so it gets discarded later
                            subprocesses.remove((call, p1, p2, ofo, contig))
                            ofo.close()
                            break
                        if p2_stat == 0:
                            # Things went fine.
                            # maintain a list of variant call outputs
                            # that finished successfully (in the order
                            # they finished)
                            tmp_io_finished.append(ofo.name)
                            if verbose:
                                print()
                                print('Contig {0} finished.'.format(contig))
                            if not quiet:
                                print()
                                print(
                                    'stderr output from samtools mpileup/'
                                    'bcftools:'.upper(),
                                    file=sys.stderr
                                )
                                print(
                                    error_table[call][0].getvalue(),
                                    error_table[call][1].getvalue(),
                                    file=sys.stderr
                                )
                            # Discard the collected stderr output from
                            # the call, remove the call from the list of
                            # running calls and close its output file.
                            del error_table[call]
                            subprocesses.remove((call, p1, p2, ofo, contig))
                            # Closing the output file is important or we
                            # may hit the file system limit for open files
                            # if there are lots of contigs.
                            ofo.close()
                            break
                # wait a bit in between monitoring cycles
                time.sleep(2)
        finally:
            for call, p1, p2, ofo, contig in subprocesses:
                # make sure we do not leave running subprocesses behind
                for proc in (p1, p2):
                    try:
                        proc.terminate()
                    except Exception:
                        pass
                # close currently open files
                ofo.close()
            # store the files with finished content in the order that
            # the corresponding jobs were launched
            self.tmpfiles = [f for f in tmp_io_started if f in tmp_io_finished]
            # Make sure remaining buffered stderr output of
            # subprocesses does not get lost.
            # Currently, we don't screen this output for real errors,
            # but simply rewrite everything.
            if not quiet and error_table:
                print()
                print(
                    'stderr output from samtools mpileup/bcftools:'.upper(),
                    file=sys.stderr
                )
                for call, errors in error_table.items():
                    print(' | '.join(call), ':', file=sys.stderr)
                    print('-' * 20, file=sys.stderr)
                    print('samtools mpileup output:', file=sys.stderr)
                    print(errors[0].getvalue(), file=sys.stderr)
                    print('varscan somatic output:', file=sys.stderr)
                    print(errors[1].getvalue(), file=sys.stderr)
335 | |
336 def _add_ref_contigs_to_header(self, header): | |
337 for chrom, length in zip(self.ref_contigs, self.ref_lengths): | |
338 header.add_meta( | |
339 'contig', | |
340 items=[('ID', chrom), ('length', length)] | |
341 ) | |
342 | |
343 def _add_filters_to_header(self, header): | |
344 varscan_fpfilters = { | |
345 'VarCount': 'Fewer than {min_var_count2} variant-supporting reads', | |
346 'VarFreq': 'Variant allele frequency below {min_var_freq2}', | |
347 'VarAvgRL': | |
348 'Average clipped length of variant-supporting reads < ' | |
349 '{min_var_len}', | |
350 'VarReadPos': 'Relative average read position < {min_var_readpos}', | |
351 'VarDist3': | |
352 'Average distance to effective 3\' end < {min_var_dist3}', | |
353 'VarMMQS': | |
354 'Average mismatch quality sum for variant reads > ' | |
355 '{max_var_mmqs}', | |
356 'VarMapQual': | |
357 'Average mapping quality of variant reads < {min_var_mapqual}', | |
358 'VarBaseQual': | |
359 'Average base quality of variant reads < {min_var_basequal}', | |
360 'Strand': | |
361 'Strand representation of variant reads < {min_strandedness}', | |
362 'RefAvgRL': | |
363 'Average clipped length of ref-supporting reads < ' | |
364 '{min_ref_len}', | |
365 'RefReadPos': | |
366 'Relative average read position < {min_ref_readpos}', | |
367 'RefDist3': | |
368 'Average distance to effective 3\' end < {min_ref_dist3}', | |
369 'RefMapQual': | |
370 'Average mapping quality of reference reads < ' | |
371 '{min_ref_mapqual}', | |
372 'RefBaseQual': | |
373 'Average base quality of ref-supporting reads < ' | |
374 '{min_ref_basequal}', | |
375 'RefMMQS': | |
376 'Average mismatch quality sum for ref-supporting reads > ' | |
377 '{max_ref_mmqs}', | |
378 'MMQSdiff': | |
379 'Mismatch quality sum difference (var - ref) > ' | |
380 '{max_mmqs_diff}', | |
381 'MinMMQSdiff': | |
382 'Mismatch quality sum difference (var - ref) < ' | |
383 '{max_mmqs_diff}', | |
384 'MapQualDiff': | |
385 'Mapping quality difference (ref - var) > {max_mapqual_diff}', | |
386 'MaxBAQdiff': | |
387 'Average base quality difference (ref - var) > ' | |
388 '{max_basequal_diff}', | |
389 'ReadLenDiff': | |
390 'Average supporting read length difference (ref - var) > ' | |
391 '{max_relative_len_diff}', | |
392 } | |
393 for filter_id, description in varscan_fpfilters.items(): | |
394 header.filters.add(filter_id, None, None, description) | |
395 | |
    def _add_indel_info_flag_to_header(self, header):
        """Declare the INDEL INFO flag used to tag indel records in *header*."""
        header.info.add(
            'INDEL', 0, 'Flag', 'Indicates that the variant is an INDEL'
        )
400 | |
    def _compile_common_header(self, varcall_template, no_filters=False):
        """Build the VCF header shared by all merged output records.

        Combines reference/contig/filter/INFO metadata generated here
        with the samples and remaining metadata taken from
        *varcall_template*, a VCF file produced by VarScan.

        :param varcall_template: path of a VarScan-generated VCF file
        :param no_filters: if True, omit the fpfilter FILTER declarations
        :return: a pysam.VariantHeader instance
        """
        # fix the header generated by VarScan
        # by adding reference and contig information
        common_header = pysam.VariantHeader()
        common_header.add_meta('reference', value=self.ref_genome)
        self._add_ref_contigs_to_header(common_header)
        if not no_filters:
            # add filter info
            self._add_filters_to_header(common_header)
        # change the source information
        common_header.add_meta('source', value='varscan.py')
        # declare an INDEL flag for record INFO fields
        self._add_indel_info_flag_to_header(common_header)
        # take the remaining metadata from the template header produced by
        # VarScan
        with pysam.VariantFile(varcall_template, 'r') as original_data:
            varscan_header = original_data.header
            for sample in varscan_header.samples:
                common_header.samples.add(sample)
            common_header.merge(varscan_header)
        return common_header
422 | |
423 def pileup_masker(self, mask): | |
424 def apply_mask_on_pileup(piled_items): | |
425 for item, status in zip(piled_items, mask): | |
426 if status: | |
427 yield item | |
428 return apply_mask_on_pileup | |
429 | |
    def get_allele_specific_pileup_column_stats(
        self, allele, pile_column, ref_fetch
    ):
        """Compute fpfilter-style read statistics for *allele* at one site.

        :param allele: the upper-case base whose supporting reads are
            analyzed (reads on the minus strand are reported lower-case
            by pysam and matched via base.upper())
        :param pile_column: pysam pileup column at the variant position
        :param ref_fetch: callable (start, end) -> reference sequence,
            used to detect mismatches of the supporting reads
        :return: None if no read supports the allele; otherwise a 9-tuple
            (avg mapping quality, avg base quality, plus-strand read
            count, minus-strand read count, avg relative distance from
            read center, avg mismatch fraction, avg mismatch quality sum,
            avg clipped length, avg distance to effective 3' end)
        """
        # number of reads supporting the given allele on
        # forward and reverse strand, and in total
        var_reads_plus = var_reads_minus = 0
        var_supp_read_mask = []
        for base in pile_column.get_query_sequences():
            if base == allele:
                # allele supporting read on + strand
                var_reads_plus += 1
                var_supp_read_mask.append(True)
            elif base.upper() == allele:
                # allele supporting read on - strand
                var_reads_minus += 1
                var_supp_read_mask.append(True)
            else:
                var_supp_read_mask.append(False)
        var_reads_total = var_reads_plus + var_reads_minus

        if var_reads_total == 0:
            # No stats without reads!
            return None

        var_supp_only = self.pileup_masker(var_supp_read_mask)

        # average mapping quality of the reads supporting the
        # given allele
        avg_mapping_quality = sum(
            mq for mq in var_supp_only(
                pile_column.get_mapping_qualities()
            )
        ) / var_reads_total

        # for the remaining stats we need access to complete
        # read information
        piled_reads = [
            p for p in var_supp_only(pile_column.pileups)
        ]
        assert len(piled_reads) == var_reads_total
        sum_avg_base_qualities = 0
        sum_dist_from_center = 0
        sum_dist_from_3prime = 0
        sum_clipped_length = 0
        sum_unclipped_length = 0
        sum_num_mismatches_as_fraction = 0
        sum_mismatch_qualities = 0

        for p in piled_reads:
            sum_avg_base_qualities += sum(
                p.alignment.query_qualities
            ) / p.alignment.infer_query_length()
            sum_clipped_length += p.alignment.query_alignment_length
            unclipped_length = p.alignment.infer_read_length()
            sum_unclipped_length += unclipped_length
            read_center = p.alignment.query_alignment_length / 2
            # 1 at the exact center of the aligned part, 0 at its edges
            sum_dist_from_center += 1 - abs(
                p.query_position - read_center
            ) / read_center
            # distance to the effective 3' end depends on read orientation
            if p.alignment.is_reverse:
                sum_dist_from_3prime += p.query_position / unclipped_length
            else:
                sum_dist_from_3prime += 1 - p.query_position / unclipped_length

            sum_num_mismatches = 0
            for qpos, rpos in p.alignment.get_aligned_pairs():
                # only consider aligned (non-indel, non-clipped) positions
                if qpos is not None and rpos is not None:
                    if p.alignment.query_sequence[qpos] != ref_fetch(
                        rpos, rpos + 1
                    ).upper():  # ref bases can be lowercase!
                        sum_num_mismatches += 1
                        sum_mismatch_qualities += p.alignment.query_qualities[
                            qpos
                        ]
            sum_num_mismatches_as_fraction += (
                sum_num_mismatches / p.alignment.query_alignment_length
            )
        avg_basequality = sum_avg_base_qualities / var_reads_total
        avg_pos_as_fraction = sum_dist_from_center / var_reads_total
        avg_num_mismatches_as_fraction = (
            sum_num_mismatches_as_fraction / var_reads_total
        )
        avg_sum_mismatch_qualities = sum_mismatch_qualities / var_reads_total
        avg_clipped_length = sum_clipped_length / var_reads_total
        avg_distance_to_effective_3p_end = (
            sum_dist_from_3prime / var_reads_total
        )

        return (
            avg_mapping_quality,
            avg_basequality,
            var_reads_plus,
            var_reads_minus,
            avg_pos_as_fraction,
            avg_num_mismatches_as_fraction,
            avg_sum_mismatch_qualities,
            avg_clipped_length,
            avg_distance_to_effective_3p_end
        )
529 | |
    def _postprocess_variant_records(self, invcf,
                                     min_var_count2, min_var_count2_lc,
                                     min_var_freq2, max_somatic_p,
                                     max_somatic_p_depth,
                                     min_ref_readpos, min_var_readpos,
                                     min_ref_dist3, min_var_dist3,
                                     min_ref_len, min_var_len,
                                     max_relative_len_diff,
                                     min_strandedness, min_strand_reads,
                                     min_ref_basequal, min_var_basequal,
                                     max_basequal_diff,
                                     min_ref_mapqual, min_var_mapqual,
                                     max_mapqual_diff,
                                     max_ref_mmqs, max_var_mmqs,
                                     min_mmqs_diff, max_mmqs_diff,
                                     **args):
        """Yield records from *invcf* with fpfilter-style FILTER entries added.

        For each SNV record, a pileup of the sample of interest (tumor
        for somatic calls, normal for germline/LOH) is generated at the
        variant site and per-allele read statistics are tested against
        the given thresholds; failing tests are recorded in the FILTER
        column. Indel records and records with other SS status codes are
        passed through unchanged. Extra keyword arguments are accepted
        and ignored.
        """
        # set FILTER field according to Varscan criteria
        # multiple FILTER entries must be separated by semicolons
        # No filters applied should be indicated with MISSING

        # since posterior filters are always applied to just one sample,
        # a better place to store the info is in the FT genotype field:
        # can be PASS, '.' to indicate that filters have not been applied,
        # or a semicolon-separated list of filters that failed
        # unfortunately, gemini does not support this field

        with ExitStack() as io_stack:
            # bam_input_files is assumed to be [normal, tumor]
            normal_reads, tumor_reads = (
                io_stack.enter_context(
                    pysam.Samfile(fn, 'rb')) for fn in self.bam_input_files
            )
            refseq = io_stack.enter_context(pysam.FastaFile(self.ref_genome))
            pileup_args = self._get_pysam_pileup_args()
            for record in invcf:
                if any(len(allele) > 1 for allele in record.alleles):
                    # skip indel postprocessing for the moment
                    yield record
                    continue
                # get pileup for genomic region affected by this variant
                if record.info['SS'] == '2':
                    # a somatic variant => generate pileup from tumor data
                    pile = tumor_reads.pileup(
                        record.chrom, record.start, record.stop,
                        **pileup_args
                    )
                    sample_of_interest = 'TUMOR'
                elif record.info['SS'] in ['1', '3']:
                    # a germline or LOH variant => pileup from normal data
                    pile = normal_reads.pileup(
                        record.chrom, record.start, record.stop,
                        **pileup_args
                    )
                    sample_of_interest = 'NORMAL'
                else:
                    # TO DO: figure out if there is anything interesting to do
                    # for SS status codes 0 (reference) and 5 (unknown)
                    yield record
                    continue

                # apply false-positive filtering a la varscan fpfilter
                # find the variant site in the pileup columns
                for pile_column in pile:
                    if pile_column.reference_pos == record.start:
                        break
                # extract required information
                # overall read depth at the site
                read_depth = pile_column.get_num_aligned()
                assert read_depth > 0
                # no multiallelic sites in varscan
                assert len(record.alleles) == 2
                if record.samples[sample_of_interest]['RD'] > 0:
                    # compute stats for both the ref and the alt allele
                    ref_stats, alt_stats = [
                        self.get_allele_specific_pileup_column_stats(
                            allele,
                            pile_column,
                            partial(
                                pysam.FastaFile.fetch, refseq, record.chrom
                            )
                        )
                        for allele in record.alleles
                    ]
                else:
                    # no reference-supporting reads reported by VarScan
                    ref_stats = None
                    alt_stats = self.get_allele_specific_pileup_column_stats(
                        record.alleles[1],
                        pile_column,
                        partial(pysam.FastaFile.fetch, refseq, record.chrom)
                    )
                ref_count = 0
                if ref_stats:
                    ref_count = ref_stats[2] + ref_stats[3]
                    if ref_stats[1] < min_ref_basequal:
                        record.filter.add('RefBaseQual')
                    if ref_count >= 2:
                        if ref_stats[0] < min_ref_mapqual:
                            record.filter.add('RefMapQual')
                        if ref_stats[4] < min_ref_readpos:
                            record.filter.add('RefReadPos')
                        # ref_stats[5] (avg_num_mismatches_as_fraction)
                        # is not a filter criterion in VarScan fpfilter
                        if ref_stats[6] > max_ref_mmqs:
                            record.filter.add('RefMMQS')
                        if ref_stats[7] < min_ref_len:
                            # VarScan fpfilter does not apply this filter
                            # for indels, but there is no reason
                            # not to do it.
                            record.filter.add('RefAvgRL')
                        if ref_stats[8] < min_ref_dist3:
                            record.filter.add('RefDist3')
                if alt_stats:
                    alt_count = alt_stats[2] + alt_stats[3]
                    if (
                        alt_count < min_var_count2_lc
                    ) or (
                        read_depth >= max_somatic_p_depth and
                        alt_count < min_var_count2
                    ):
                        record.filter.add('VarCount')
                    if alt_count / read_depth < min_var_freq2:
                        record.filter.add('VarFreq')
                    if alt_stats[1] < min_var_basequal:
                        record.filter.add('VarBaseQual')
                    if alt_count > min_strand_reads:
                        if (
                            alt_stats[2] / alt_count < min_strandedness
                        ) or (
                            alt_stats[3] / alt_count < min_strandedness
                        ):
                            record.filter.add('Strand')
                    if alt_stats[2] + alt_stats[3] >= 2:
                        if alt_stats[0] < min_var_mapqual:
                            record.filter.add('VarMapQual')
                        if alt_stats[4] < min_var_readpos:
                            record.filter.add('VarReadPos')
                        # alt_stats[5] (avg_num_mismatches_as_fraction)
                        # is not a filter criterion in VarScan fpfilter
                        if alt_stats[6] > max_var_mmqs:
                            record.filter.add('VarMMQS')
                        if alt_stats[7] < min_var_len:
                            # VarScan fpfilter does not apply this filter
                            # for indels, but there is no reason
                            # not to do it.
                            record.filter.add('VarAvgRL')
                        if alt_stats[8] < min_var_dist3:
                            record.filter.add('VarDist3')
                    if ref_count >= 2 and alt_count >= 2:
                        # comparative ref vs alt filters require enough
                        # reads on both sides
                        if (ref_stats[0] - alt_stats[0]) > max_mapqual_diff:
                            record.filter.add('MapQualDiff')
                        if (ref_stats[1] - alt_stats[1]) > max_basequal_diff:
                            record.filter.add('MaxBAQdiff')
                        mmqs_diff = alt_stats[6] - ref_stats[6]
                        if mmqs_diff < min_mmqs_diff:
                            record.filter.add('MinMMQSdiff')
                        if mmqs_diff > max_mmqs_diff:
                            record.filter.add('MMQSdiff')
                        if (
                            1 - alt_stats[7] / ref_stats[7]
                        ) > max_relative_len_diff:
                            record.filter.add('ReadLenDiff')
                else:
                    # No variant-supporting reads for this record!
                    # This can happen in rare cases because of
                    # samtools mpileup issues, but indicates a
                    # rather unreliable variant call.
                    record.filter.add('VarCount')
                    record.filter.add('VarFreq')
                yield record
697 | |
698 def _indel_flagged_records(self, vcf): | |
699 for record in vcf: | |
700 record.info['INDEL'] = True | |
701 yield record | |
702 | |
703 def _merge_generator(self, vcf1, vcf2): | |
704 try: | |
705 record1 = next(vcf1) | |
706 except StopIteration: | |
707 for record2 in vcf2: | |
708 yield record2 | |
709 return | |
710 try: | |
711 record2 = next(vcf2) | |
712 except StopIteration: | |
713 yield record1 | |
714 for record1 in vcf1: | |
715 yield record1 | |
716 return | |
717 while True: | |
718 if (record1.start, record1.stop) < (record2.start, record2.stop): | |
719 yield record1 | |
720 try: | |
721 record1 = next(vcf1) | |
722 except StopIteration: | |
723 yield record2 | |
724 for record2 in vcf2: | |
725 yield record2 | |
726 return | |
727 else: | |
728 yield record2 | |
729 try: | |
730 record2 = next(vcf2) | |
731 except StopIteration: | |
732 yield record1 | |
733 for record1 in vcf1: | |
734 yield record1 | |
735 return | |
736 | |
737 def merge_and_postprocess(self, snps_out, indels_out=None, | |
738 no_filters=False, **filter_args): | |
739 temporary_data = self.tmpfiles | |
740 self.tmpfiles = [] | |
741 temporary_snp_files = [f + '.snp.vcf' for f in temporary_data] | |
742 temporary_indel_files = [f + '.indel.vcf' for f in temporary_data] | |
743 | |
744 for f in temporary_data: | |
745 try: | |
746 os.remove(f) | |
747 except Exception: | |
748 pass | |
749 | |
750 def noop_gen(data, **kwargs): | |
751 for d in data: | |
752 yield d | |
753 | |
754 if no_filters: | |
755 apply_filters = noop_gen | |
756 else: | |
757 apply_filters = self._postprocess_variant_records | |
758 | |
759 output_header = self._compile_common_header( | |
760 temporary_snp_files[0], | |
761 no_filters | |
762 ) | |
763 if indels_out is None: | |
764 with open(snps_out, 'w') as o: | |
765 o.write(str(output_header).format(**filter_args)) | |
766 for snp_f, indel_f in zip( | |
767 temporary_snp_files, temporary_indel_files | |
768 ): | |
769 with pysam.VariantFile(snp_f, 'r') as snp_invcf: | |
770 # fix the input header on the fly | |
771 # to avoid Warnings from htslib about missing contig | |
772 # info | |
773 self._add_ref_contigs_to_header(snp_invcf.header) | |
774 self._add_filters_to_header(snp_invcf.header) | |
775 self._add_indel_info_flag_to_header(snp_invcf.header) | |
776 with pysam.VariantFile(indel_f, 'r') as indel_invcf: | |
777 # fix the input header on the fly | |
778 # to avoid Warnings from htslib about missing | |
779 # contig info | |
780 self._add_ref_contigs_to_header(indel_invcf.header) | |
781 self._add_filters_to_header(indel_invcf.header) | |
782 self._add_indel_info_flag_to_header( | |
783 indel_invcf.header | |
784 ) | |
785 for record in apply_filters( | |
786 self._merge_generator( | |
787 snp_invcf, | |
788 self._indel_flagged_records(indel_invcf) | |
789 ), | |
790 **filter_args | |
791 ): | |
792 o.write(str(record)) | |
793 try: | |
794 os.remove(snp_f) | |
795 except Exception: | |
796 pass | |
797 try: | |
798 os.remove(indel_f) | |
799 except Exception: | |
800 pass | |
801 | |
802 else: | |
803 with open(snps_out, 'w') as o: | |
804 o.write(str(output_header).format(**filter_args)) | |
805 for f in temporary_snp_files: | |
806 with pysam.VariantFile(f, 'r') as invcf: | |
807 # fix the input header on the fly | |
808 # to avoid Warnings from htslib about missing | |
809 # contig info and errors because of undeclared | |
810 # filters | |
811 self._add_ref_contigs_to_header(invcf.header) | |
812 self._add_filters_to_header(invcf.header) | |
813 for record in apply_filters( | |
814 invcf, **filter_args | |
815 ): | |
816 o.write(str(record)) | |
817 try: | |
818 os.remove(f) | |
819 except Exception: | |
820 pass | |
821 with open(indels_out, 'w') as o: | |
822 o.write(str(output_header)) | |
823 for f in temporary_indel_files: | |
824 with pysam.VariantFile(f, 'r') as invcf: | |
825 # fix the input header on the fly | |
826 # to avoid Warnings from htslib about missing | |
827 # contig info and errors because of undeclared | |
828 # filters | |
829 self._add_ref_contigs_to_header(invcf.header) | |
830 self._add_filters_to_header(invcf.header) | |
831 self._add_indel_info_flag_to_header(invcf.header) | |
832 for record in apply_filters( | |
833 self._indel_flagged_records(invcf), **filter_args | |
834 ): | |
835 o.write(str(record)) | |
836 try: | |
837 os.remove(f) | |
838 except Exception: | |
839 pass | |
840 | |
841 | |
def varscan_call(ref_genome, normal, tumor, output_path, **args):
    """Preparse arguments and orchestrate calling and postprocessing."""

    # decide on the output file name(s) first
    if args.pop('split_output'):
        if '%T' in output_path:
            out = (
                output_path.replace('%T', 'snp'),
                output_path.replace('%T', 'indel')
            )
        else:
            out = (output_path + '.snp', output_path + '.indel')
    else:
        out = (output_path, None)

    # split the remaining arguments into those consumed by the
    # VarScanCaller constructor and those passed to varcall_parallel;
    # whatever is left over goes to merge_and_postprocess
    instance_keys = (
        'max_depth',
        'min_mapqual',
        'min_basequal',
        'threads',
        'verbose',
        'quiet',
    )
    somatic_keys = (
        'normal_purity',
        'tumor_purity',
        'min_coverage',
        'min_var_count',
        'min_var_freq',
        'min_hom_freq',
        'somatic_p_value',
        'p_value',
    )
    instance_args = {key: args.pop(key) for key in instance_keys}
    varscan_somatic_args = {key: args.pop(key) for key in somatic_keys}

    caller = VarScanCaller(ref_genome, [normal, tumor], **instance_args)
    caller.varcall_parallel(**varscan_somatic_args)
    caller.merge_and_postprocess(*out, **args)
885 | |
886 | |
if __name__ == '__main__':
    # Command-line interface: positional reference genome, required input
    # BAMs and output path, then two option groups mirroring VarScan's
    # "somatic" calling parameters and this wrapper's posterior filters.
    p = argparse.ArgumentParser()
    p.add_argument(
        'ref_genome',
        metavar='reference_genome',
        help='the reference genome (in fasta format)'
    )
    p.add_argument(
        '--normal',
        metavar='BAM_file', required=True,
        help='the BAM input file of aligned reads from the normal sample'
    )
    p.add_argument(
        '--tumor',
        metavar='BAM_file', required=True,
        help='the BAM input file of aligned reads from the tumor sample'
    )
    p.add_argument(
        '-o', '--ofile', required=True,
        metavar='OFILE', dest='output_path',
        help='Name of the variant output file. With --split-output, the name '
             'may use the %%T replacement token or will be used as the '
             'basename for the two output files to be generated (see '
             '-s|--split-output below).'
    )
    p.add_argument(
        '-s', '--split-output',
        dest='split_output', action='store_true', default=False,
        help='indicate that separate output files for SNPs and indels '
             'should be generated (original VarScan behavior). If specified, '
             '%%T in the --ofile file name will be replaced with "snp" and '
             '"indel" to generate the names of the SNP and indel output '
             'files, respectively. If %%T is not found in the file name, it '
             'will get interpreted as a basename to which ".snp"/".indel" '
             'will be appended.'
    )
    p.add_argument(
        '-t', '--threads',
        type=int, default=1,
        help='level of parallelism'
    )
    p.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='be verbose about progress'
    )
    p.add_argument(
        '-q', '--quiet',
        action='store_true',
        help='suppress output from wrapped tools'
    )
    # Options forwarded (mostly verbatim) to "varscan somatic" and to the
    # samtools mpileup step.
    call_group = p.add_argument_group('Variant calling parameters')
    call_group.add_argument(
        '--normal-purity',
        dest='normal_purity', type=float,
        default=1.0,
        help='Estimated purity of the normal sample (default: 1.0)'
    )
    call_group.add_argument(
        '--tumor-purity',
        dest='tumor_purity', type=float,
        default=1.0,
        help='Estimated purity of the tumor sample (default: 1.0)'
    )
    call_group.add_argument(
        '--max-pileup-depth',
        dest='max_depth', type=int, default=8000,
        help='Maximum depth of generated pileups (samtools mpileup -d option; '
             'default: 8000)'
    )
    call_group.add_argument(
        '--min-basequal',
        dest='min_basequal', type=int,
        default=13,
        help='Minimum base quality at the variant position to use a read '
             '(default: 13)'
    )
    call_group.add_argument(
        '--min-mapqual',
        dest='min_mapqual', type=int,
        default=0,
        help='Minimum mapping quality required to use a read '
             '(default: 0)'
    )
    call_group.add_argument(
        '--min-coverage',
        dest='min_coverage', type=int,
        default=8,
        help='Minimum site coverage required in the normal and in the tumor '
             'sample to call a variant (default: 8)'
    )
    call_group.add_argument(
        '--min-var-count',
        dest='min_var_count', type=int,
        default=2,
        help='Minimum number of variant-supporting reads required to call a '
             'variant (default: 2)'
    )
    call_group.add_argument(
        '--min-var-freq',
        dest='min_var_freq', type=float,
        default=0.1,
        help='Minimum variant allele frequency for calling (default: 0.1)'
    )
    call_group.add_argument(
        '--min-hom-freq',
        dest='min_hom_freq', type=float,
        default=0.75,
        help='Minimum variant allele frequency for homozygous call '
             '(default: 0.75)'
    )
    call_group.add_argument(
        '--p-value',
        dest='p_value', type=float,
        default=0.99,
        help='P-value threshold for heterozygous call (default: 0.99)'
    )
    call_group.add_argument(
        '--somatic-p-value',
        dest='somatic_p_value', type=float,
        default=0.05,
        help='P-value threshold for somatic call (default: 0.05)'
    )
    # Options consumed by the wrapper's own postprocessing/filtering step
    # (applied to VarScan's raw calls after merging).
    filter_group = p.add_argument_group('Posterior variant filter parameters')
    filter_group.add_argument(
        '--no-filters',
        dest='no_filters', action='store_true',
        help='Disable all posterior variant filters. '
             'If specified, all following options will be ignored'
    )
    filter_group.add_argument(
        '--min-var-count2',
        dest='min_var_count2', type=int,
        default=4,
        help='Minimum number of variant-supporting reads (default: 4)'
    )
    filter_group.add_argument(
        '--min-var-count2-lc',
        dest='min_var_count2_lc', type=int,
        default=2,
        # fixed help text: the referenced option is --max-somatic-p-depth,
        # not the non-existent --somatic-p-depth
        help='Minimum number of variant-supporting reads when depth below '
             '--max-somatic-p-depth (default: 2)'
    )
    filter_group.add_argument(
        '--min-var-freq2',
        dest='min_var_freq2', type=float,
        default=0.05,
        help='Minimum variant allele frequency (default: 0.05)'
    )
    filter_group.add_argument(
        '--max-somatic-p',
        dest='max_somatic_p', type=float,
        default=0.05,
        help='Maximum somatic p-value (default: 0.05)'
    )
    filter_group.add_argument(
        '--max-somatic-p-depth',
        dest='max_somatic_p_depth', type=int,
        default=10,
        help='Depth required to run --max-somatic-p filter (default: 10)'
    )
    filter_group.add_argument(
        '--min-ref-readpos',
        dest='min_ref_readpos', type=float,
        default=0.1,
        help='Minimum average relative distance of site from the ends of '
             'ref-supporting reads (default: 0.1)'
    )
    filter_group.add_argument(
        '--min-var-readpos',
        dest='min_var_readpos', type=float,
        default=0.1,
        help='Minimum average relative distance of site from the ends of '
             'variant-supporting reads (default: 0.1)'
    )
    filter_group.add_argument(
        '--min-ref-dist3',
        dest='min_ref_dist3', type=float,
        default=0.1,
        help='Minimum average relative distance of site from the effective '
             '3\'end of ref-supporting reads (default: 0.1)'
    )
    filter_group.add_argument(
        '--min-var-dist3',
        dest='min_var_dist3', type=float,
        default=0.1,
        help='Minimum average relative distance of site from the effective '
             '3\'end of variant-supporting reads (default: 0.1)'
    )
    filter_group.add_argument(
        '--min-ref-len',
        dest='min_ref_len', type=int,
        default=90,
        help='Minimum average trimmed length of reads supporting the ref '
             'allele (default: 90)'
    )
    filter_group.add_argument(
        '--min-var-len',
        dest='min_var_len', type=int,
        default=90,
        help='Minimum average trimmed length of reads supporting the variant '
             'allele (default: 90)'
    )
    filter_group.add_argument(
        '--max-len-diff',
        dest='max_relative_len_diff', type=float,
        default=0.25,
        help='Maximum average relative read length difference (ref - var; '
             'default: 0.25)'
    )
    filter_group.add_argument(
        '--min-strandedness',
        dest='min_strandedness', type=float,
        default=0.01,
        help='Minimum fraction of variant reads from each strand '
             '(default: 0.01)'
    )
    filter_group.add_argument(
        '--min-strand-reads',
        dest='min_strand_reads', type=int,
        default=5,
        help='Minimum allele depth required to run --min-strandedness filter '
             '(default: 5)'
    )
    filter_group.add_argument(
        '--min-ref-basequal',
        dest='min_ref_basequal', type=int,
        default=15,
        help='Minimum average base quality for the ref allele (default: 15)'
    )
    filter_group.add_argument(
        '--min-var-basequal',
        dest='min_var_basequal', type=int,
        default=15,
        help='Minimum average base quality for the variant allele '
             '(default: 15)'
    )
    filter_group.add_argument(
        '--max-basequal-diff',
        dest='max_basequal_diff', type=int,
        default=50,
        help='Maximum average base quality diff (ref - var; default: 50)'
    )
    filter_group.add_argument(
        '--min-ref-mapqual',
        dest='min_ref_mapqual', type=int,
        default=15,
        help='Minimum average mapping quality of reads supporting the ref '
             'allele (default: 15)'
    )
    filter_group.add_argument(
        '--min-var-mapqual',
        dest='min_var_mapqual', type=int,
        default=15,
        help='Minimum average mapping quality of reads supporting the variant '
             'allele (default: 15)'
    )
    filter_group.add_argument(
        '--max-mapqual-diff',
        dest='max_mapqual_diff', type=int,
        default=50,
        help='Maximum average mapping quality difference (ref - var; '
             'default: 50)'
    )
    filter_group.add_argument(
        '--max-ref-mmqs',
        dest='max_ref_mmqs', type=int,
        default=100,
        help='Maximum mismatch quality sum of reads supporting the ref '
             'allele (default: 100)'
    )
    filter_group.add_argument(
        '--max-var-mmqs',
        dest='max_var_mmqs', type=int,
        default=100,
        help='Maximum mismatch quality sum of reads supporting the variant '
             'allele (default: 100)'
    )
    filter_group.add_argument(
        '--min-mmqs-diff',
        dest='min_mmqs_diff', type=int,
        default=0,
        help='Minimum mismatch quality sum difference (var - ref; default: 0)'
    )
    filter_group.add_argument(
        '--max-mmqs-diff',
        dest='max_mmqs_diff', type=int,
        default=50,
        help='Maximum mismatch quality sum difference (var - ref; default: 50)'
    )
    # Hand every parsed option to the orchestration function as keywords.
    args = vars(p.parse_args())
    varscan_call(**args)