Mercurial > repos > iuc > varscan_mpileup
comparison varscan.py @ 9:e3f170cc4f95 draft
"planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit fcf5ac14629c694f0f64773fab0428b1e78fe156"
author | iuc |
---|---|
date | Fri, 16 Aug 2019 15:50:25 -0400 |
parents | 45288fb1f3f5 |
children | db94eadb92c1 |
comparison
equal
deleted
inserted
replaced
8:45288fb1f3f5 | 9:e3f170cc4f95 |
---|---|
6 import os | 6 import os |
7 import subprocess | 7 import subprocess |
8 import sys | 8 import sys |
9 import tempfile | 9 import tempfile |
10 import time | 10 import time |
11 from collections import namedtuple | |
11 from contextlib import ExitStack | 12 from contextlib import ExitStack |
12 from functools import partial | 13 from functools import partial |
13 from threading import Thread | 14 from threading import Thread |
14 | 15 |
15 import pysam | 16 import pysam |
17 | |
18 | |
19 AlleleStats = namedtuple( | |
20 "AlleleStats", | |
21 [ | |
22 'reads_total', | |
23 'reads_fw', | |
24 'reads_rv', | |
25 'avg_mapqual', | |
26 'avg_basequal', | |
27 'avg_dist_from_center', | |
28 'avg_mismatch_fraction', | |
29 'avg_mismatch_qualsum', | |
30 'avg_clipped_len', | |
31 'avg_dist_from_3prime' | |
32 ] | |
33 ) | |
34 | |
35 | |
36 def _get_allele_specific_pileup_column_stats(pileups, ref_fetch, | |
37 ignore_md, ignore_nm, | |
38 mm_runs, detect_q2_runs): | |
39 var_reads_plus = var_reads_minus = 0 | |
40 sum_mapping_qualities = 0 | |
41 sum_base_qualities = 0 | |
42 sum_dist_from_center = 0 | |
43 sum_dist_from_3prime = 0 | |
44 sum_clipped_length = 0 | |
45 sum_unclipped_length = 0 | |
46 sum_mismatch_fractions = 0 | |
47 sum_mismatch_qualities = 0 | |
48 | |
49 for p in pileups: | |
50 if p.alignment.is_reverse: | |
51 var_reads_minus += 1 | |
52 else: | |
53 var_reads_plus += 1 | |
54 sum_mapping_qualities += p.alignment.mapping_quality | |
55 sum_base_qualities += p.alignment.query_qualities[p.query_position] | |
56 sum_clipped_length += p.alignment.query_alignment_length | |
57 unclipped_length = p.alignment.query_length | |
58 sum_unclipped_length += unclipped_length | |
59 # The following calculations are all in 1-based coordinates | |
60 # with respect to the physical 5'-end of the read sequence. | |
61 if p.alignment.is_reverse: | |
62 read_base_pos = unclipped_length - p.query_position | |
63 read_first_aln_pos = ( | |
64 unclipped_length - p.alignment.query_alignment_end + 1 | |
65 ) | |
66 read_last_aln_pos = ( | |
67 unclipped_length - p.alignment.query_alignment_start | |
68 ) | |
69 qualities_3to5prime = p.alignment.query_alignment_qualities | |
70 else: | |
71 read_base_pos = p.query_position + 1 | |
72 read_first_aln_pos = p.alignment.query_alignment_start + 1 | |
73 read_last_aln_pos = p.alignment.query_alignment_end | |
74 qualities_3to5prime = reversed( | |
75 p.alignment.query_alignment_qualities | |
76 ) | |
77 | |
78 read_last_effective_aln_pos = read_last_aln_pos | |
79 if detect_q2_runs: | |
80 # Note: the original bam-readcount algorithm always takes | |
81 # terminal q2 runs into account when determining the | |
82 # effective 3'-ends of reads. | |
83 # However, q2 runs have no special meaning since Illumina | |
84 # pipeline version 1.8 so detecting them is optional | |
85 # in this code. | |
86 for qual in qualities_3to5prime: | |
87 if qual != 2: | |
88 break | |
89 read_last_effective_aln_pos -= 1 | |
90 | |
91 # Note: the original bam-readcount algorithm defines the | |
92 # read center as: | |
93 # read_center = p.alignment.query_alignment_length / 2 | |
94 # This is less accurate than the implementation here. | |
95 read_center = (read_first_aln_pos + read_last_aln_pos) / 2 | |
96 sum_dist_from_center += 1 - abs( | |
97 read_base_pos - read_center | |
98 ) / (read_center - 1) | |
99 # To calculate the distance of base positions from the 3'-ends | |
100 # of reads bam-readcount uses the formula: | |
101 # sum_dist_from_3prime += abs( | |
102 # read_last_effective_aln_pos - read_base_pos | |
103 # ) / (unclipped_length - 1) | |
104 # , which treats base positions on both sides of the effective | |
105 # 3'-end equally. Since this seems hard to justify, we cap | |
106 # the distance calculation at 0 for base positions past the | |
107 # effective 3'-end (which, in turn, should only be possible | |
108 # with detection of q2 runs). | |
109 if read_last_effective_aln_pos > read_base_pos: | |
110 sum_dist_from_3prime += ( | |
111 read_last_effective_aln_pos - read_base_pos | |
112 ) / (unclipped_length - 1) | |
113 | |
114 if sum_mismatch_fractions >= 0 or sum_mismatch_qualities >= 0: | |
115 # sum_mismatch_fractions and sum_mismatch_qualities will be | |
116 # set to float('nan') if reference info (in form of an MD tag or | |
117 # an actual reference sequence) was not available for any previous | |
118 # read in the pileup. | |
119 # In that case, there is no point to trying to calculate | |
120 # mismatch stats for other reads anymore. | |
121 | |
122 # The following mismatch calculations use this logic: | |
123 # 1. For determining the edit distance between an aligned read and | |
124 # the reference, we follow the authoritative definition of the | |
125 # NM tag calculation in | |
126 # http://samtools.github.io/hts-specs/SAMtags.pdf | |
127 # | |
128 # For historical reasons, the result of this calculation will | |
129 # disagree more often than not with NM tag values calculated by | |
130 # other tools. | |
131 # If precalculated NM tag values are present on the aligned | |
132 # reads, these can be given preference through the use_nm flag. | |
 133 # Doing so will mimic the behavior of bam-readcount, which | |
134 # requires and always just looks at NM tags. | |
135 # 2. For determining mismatch quality sums, a mismatch is defined | |
136 # differently and in accordance with the implementation in | |
137 # bam-readcount: | |
138 # - only mismatches (not inserted or deleted bases) are | |
139 # considered | |
140 # - 'N' in the reference is considered a match for any read base | |
141 # - any matching (case-insensitive) base in reference and read | |
142 # is considered a match, even if that base is not one of | |
143 # A, C, G or T. | |
144 # In both 1. and 2. above a '=' in the read is always considered a | |
145 # match, irrespective of the reference base. | |
146 | |
147 num_mismatches = 0 | |
148 if not ignore_md: | |
149 try: | |
150 # see if the read has an MD tag, in which case pysam can | |
151 # calculate the reference sequence for us | |
152 ref_seq = p.alignment.get_reference_sequence().upper() | |
153 except ValueError: | |
154 ignore_md = True | |
155 if not ignore_nm: | |
156 try: | |
157 num_mismatches = p.alignment.get_tag('NM') | |
158 except KeyError: | |
159 ignore_nm = True | |
160 | |
161 if ignore_md: | |
162 if not ref_fetch: | |
163 # cannot calculate mismatch stats without ref info | |
164 sum_mismatch_qualities = float('nan') | |
165 if ignore_nm: | |
166 sum_mismatch_fractions = float('nan') | |
167 else: | |
168 sum_mismatch_fractions += ( | |
169 num_mismatches / p.alignment.query_alignment_length | |
170 ) | |
171 continue | |
172 # Without an MD tag we need to extract the relevant part | |
173 # of the reference from the full sequence by position. | |
174 ref_positions = p.alignment.get_reference_positions() | |
175 ref_seq = ref_fetch( | |
176 ref_positions[0], ref_positions[-1] + 1 | |
177 ).upper() | |
178 | |
179 potential_matches = {'A', 'C', 'G', 'T'} | |
180 aligned_pairs = p.alignment.get_aligned_pairs(matches_only=True) | |
181 ref_offset = aligned_pairs[0][1] | |
182 last_mismatch_pos = None | |
183 mismatch_run_quals = [] | |
184 for qpos, rpos in aligned_pairs: | |
185 read_base = p.alignment.query_sequence[qpos].upper() | |
186 if read_base == '=': | |
187 # always treat the special read base '=' as a | |
188 # match, irrespective of the reference base | |
189 continue | |
190 ref_base = ref_seq[rpos - ref_offset] | |
191 # see if we have a mismatch to use for mismatch quality sum | |
192 # calculation | |
193 if ref_base != 'N' and ref_base != read_base: | |
194 if mm_runs: | |
195 if ( | |
196 last_mismatch_pos is None | |
197 ) or ( | |
198 last_mismatch_pos + 1 == qpos | |
199 ): | |
200 mismatch_run_quals.append( | |
201 p.alignment.query_qualities[qpos] | |
202 ) | |
203 else: | |
204 sum_mismatch_qualities += max(mismatch_run_quals) | |
205 mismatch_run_quals = [ | |
206 p.alignment.query_qualities[qpos] | |
207 ] | |
208 last_mismatch_pos = qpos | |
209 else: | |
210 sum_mismatch_qualities += ( | |
211 p.alignment.query_qualities[qpos] | |
212 ) | |
213 if ignore_nm: | |
214 # see if we have a mismatch that increases the edit | |
215 # distance according to the SAMtags specs | |
216 if ( | |
217 read_base not in potential_matches | |
218 ) or ( | |
219 ref_base not in potential_matches | |
220 ) or ( | |
221 read_base != ref_base | |
222 ): | |
223 num_mismatches += 1 | |
224 | |
225 if mismatch_run_quals: | |
226 sum_mismatch_qualities += max(mismatch_run_quals) | |
227 if ignore_nm: | |
228 # use the number of mismatches calculated above, | |
229 # but add inserted and deleted bases to it | |
230 cigar_stats = p.alignment.get_cigar_stats()[0] | |
231 num_mismatches += cigar_stats[1] + cigar_stats[2] | |
232 sum_mismatch_fractions += ( | |
233 num_mismatches / p.alignment.query_alignment_length | |
234 ) | |
235 | |
236 return ( | |
237 var_reads_plus, | |
238 var_reads_minus, | |
239 sum_mapping_qualities, | |
240 sum_base_qualities, | |
241 sum_dist_from_center, | |
242 sum_mismatch_fractions, | |
243 sum_mismatch_qualities, | |
244 sum_clipped_length, | |
245 sum_dist_from_3prime | |
246 ) | |
247 | |
248 | |
249 def get_allele_stats(reads, pos, alleles, | |
250 ref=None, ignore_md=True, ignore_nm=True, | |
251 mm_runs=True, detect_q2_runs=False, | |
252 pileup_args=None): | |
253 chrom, start, stop = pos | |
254 if pileup_args is None: | |
255 pileup_args = {} | |
256 if pileup_args.get('stepper') == 'samtools': | |
257 if pileup_args.get('compute_baq', True) is not False: | |
258 if 'fastafile' not in pileup_args: | |
259 pileup_args['fastafile'] = ref | |
260 # be careful when passing in a custom 'fastafile' option: | |
261 # providing it slows down pysam tremendously even if the option | |
262 # isn't actually required. | |
263 | |
264 pile = reads.pileup( | |
265 chrom, start, stop, | |
266 **pileup_args | |
267 ) | |
268 # apply false-positive filtering a la varscan fpfilter | |
269 # find the variant site in the pileup columns | |
270 for pile_column in pile: | |
271 if pile_column.reference_pos == start: | |
272 break | |
273 else: | |
274 # With no reads covering the genomic position | |
275 # we can only return "empty" allele stats | |
276 return [ | |
277 AlleleStats(0, 0, 0, *[float('nan')] * 7) | |
278 for allele in alleles | |
279 ] | |
280 | |
281 # extract required information | |
282 # overall read depth at the site | |
283 read_depth = pile_column.get_num_aligned() | |
284 assert read_depth > 0 | |
285 | |
286 alleles_stats = [] | |
287 for allele in alleles: | |
288 if '-' in allele: | |
289 allele, indel_type, indel_after = allele.partition('-') | |
290 indel_len = -len(indel_after) | |
291 elif '+' in allele: | |
292 allele, indel_type, indel_after = allele.partition('+') | |
293 indel_len = len(indel_after) | |
294 else: | |
295 indel_type = None | |
296 | |
297 stats_it = ( | |
298 p for p in pile_column.pileups | |
299 # skip reads that don't support the allele we're looking for | |
300 if (p.query_position is not None) | |
301 and (p.alignment.query_sequence[p.query_position] == allele) | |
302 ) | |
303 if indel_type == '-': | |
304 stats_it = ( | |
305 p for p in stats_it if p.indel == indel_len | |
306 ) | |
307 elif indel_type == '+': | |
308 stats_it = ( | |
309 p for p in stats_it if ( | |
310 p.indel == indel_len | |
311 ) and ( | |
312 p.alignment.query_sequence[ | |
313 p.query_position + 1: | |
314 p.query_position + 1 + indel_len | |
315 ] == indel_after | |
316 ) | |
317 ) | |
318 allele_stats = _get_allele_specific_pileup_column_stats( | |
319 stats_it, | |
320 partial( | |
321 pysam.FastaFile.fetch, ref, chrom | |
322 ) if ref else None, | |
323 ignore_md, | |
324 ignore_nm, | |
325 mm_runs, | |
326 detect_q2_runs | |
327 ) | |
328 | |
329 allele_reads_total = allele_stats[0] + allele_stats[1] | |
330 if allele_reads_total == 0: | |
331 # No stats without reads! | |
332 alleles_stats.append( | |
333 AlleleStats(read_depth, 0, 0, *[float('nan')] * 7) | |
334 ) | |
335 else: | |
336 alleles_stats.append( | |
337 AlleleStats( | |
338 read_depth, allele_stats[0], allele_stats[1], | |
339 *(i / allele_reads_total for i in allele_stats[2:]) | |
340 ) | |
341 ) | |
342 return alleles_stats | |
16 | 343 |
17 | 344 |
18 class VariantCallingError (RuntimeError): | 345 class VariantCallingError (RuntimeError): |
19 """Exception class for issues with samtools and varscan subprocesses.""" | 346 """Exception class for issues with samtools and varscan subprocesses.""" |
20 | 347 |
38 return msg_header + self.message | 365 return msg_header + self.message |
39 | 366 |
40 | 367 |
41 class VarScanCaller (object): | 368 class VarScanCaller (object): |
42 def __init__(self, ref_genome, bam_input_files, | 369 def __init__(self, ref_genome, bam_input_files, |
43 max_depth=None, | 370 max_depth=None, count_orphans=False, detect_overlaps=True, |
44 min_mapqual=None, min_basequal=None, | 371 min_mapqual=None, min_basequal=None, |
45 threads=1, verbose=False, quiet=True | 372 threads=1, verbose=False, quiet=True |
46 ): | 373 ): |
47 self.ref_genome = ref_genome | 374 self.ref_genome = ref_genome |
48 self.bam_input_files = bam_input_files | 375 self.bam_input_files = bam_input_files |
49 self.max_depth = max_depth | 376 self.max_depth = max_depth |
377 self.count_orphans = count_orphans | |
378 self.detect_overlaps = detect_overlaps | |
50 self.min_mapqual = min_mapqual | 379 self.min_mapqual = min_mapqual |
51 self.min_basequal = min_basequal | 380 self.min_basequal = min_basequal |
52 self.threads = threads | 381 self.threads = threads |
53 self.verbose = verbose | 382 self.verbose = verbose |
54 self.quiet = quiet | 383 self.quiet = quiet |
65 mode='wb', suffix='', delete=False, dir=os.getcwd() | 394 mode='wb', suffix='', delete=False, dir=os.getcwd() |
66 ) | 395 ) |
67 self.tmpfiles = [] | 396 self.tmpfiles = [] |
68 | 397 |
69 def _get_pysam_pileup_args(self): | 398 def _get_pysam_pileup_args(self): |
70 param_dict = {} | 399 # Compute the pileup args dynamically to account for possibly updated |
400 # instance attributes. | |
401 | |
402 # fixed default parameters | |
403 # Note on the fixed default for 'ignore_overlaps': | |
404 # In order to use the exact same pileup parameters during variant | |
405 # calling and postprocessing, we would have to compute the setting | |
406 # dynamically like so: | |
407 # if not self.detect_overlaps: | |
408 # param_dict['ignore_overlaps'] = False | |
409 # However, samtools/pysam implement overlap correction by manipulating | |
410 # (setting to zero) the lower qualities of bases that are part of an | |
411 # overlap (see | |
412 # https://sourceforge.net/p/samtools/mailman/message/32793789/). This | |
413 # manipulation has such a big undesirable effect on base quality stats | |
414 # calculated during postprocessing that calculating the stats on a | |
415 # slightly different pileup seems like the lesser evil. | |
416 | |
417 param_dict = { | |
418 'ignore_overlaps': False, | |
419 'compute_baq': False, | |
420 'stepper': 'samtools', | |
421 } | |
422 # user-controllable parameters | |
423 if self.count_orphans: | |
424 param_dict['ignore_orphans'] = False | |
71 if self.max_depth is not None: | 425 if self.max_depth is not None: |
72 param_dict['max_depth'] = self.max_depth | 426 param_dict['max_depth'] = self.max_depth |
73 if self.min_mapqual is not None: | 427 if self.min_mapqual is not None: |
74 param_dict['min_mapping_quality'] = self.min_mapqual | 428 param_dict['min_mapping_quality'] = self.min_mapqual |
75 if self.min_basequal is not None: | 429 if self.min_basequal is not None: |
76 param_dict['min_base_quality'] = self.min_basequal | 430 param_dict['min_base_quality'] = self.min_basequal |
77 param_dict['compute_baq'] = False | |
78 param_dict['stepper'] = 'samtools' | |
79 return param_dict | 431 return param_dict |
80 | 432 |
81 def varcall_parallel(self, normal_purity=None, tumor_purity=None, | 433 def varcall_parallel(self, normal_purity=None, tumor_purity=None, |
82 min_coverage=None, | 434 min_coverage=None, |
83 min_var_count=None, | 435 min_var_count=None, |
106 varcall_engine_options = [] | 458 varcall_engine_options = [] |
107 for option, value in varcall_engine_option_mapping: | 459 for option, value in varcall_engine_option_mapping: |
108 if value is not None: | 460 if value is not None: |
109 varcall_engine_options += [option, str(value)] | 461 varcall_engine_options += [option, str(value)] |
110 pileup_engine_options = ['-B'] | 462 pileup_engine_options = ['-B'] |
463 if self.count_orphans: | |
464 pileup_engine_options += ['-A'] | |
465 if not self.detect_overlaps: | |
466 pileup_engine_options += ['-x'] | |
111 if self.max_depth is not None: | 467 if self.max_depth is not None: |
112 pileup_engine_options += ['-d', str(self.max_depth)] | 468 pileup_engine_options += ['-d', str(self.max_depth)] |
113 if self.min_mapqual is not None: | 469 if self.min_mapqual is not None: |
114 pileup_engine_options += ['-q', str(self.min_mapqual)] | 470 pileup_engine_options += ['-q', str(self.min_mapqual)] |
115 if self.min_basequal is not None: | 471 if self.min_basequal is not None: |
504 # FREQ is easy to calculate from AD | 860 # FREQ is easy to calculate from AD |
505 del record.format['FREQ'] | 861 del record.format['FREQ'] |
506 del record.format['RD'] | 862 del record.format['RD'] |
507 del record.format['DP4'] | 863 del record.format['DP4'] |
508 | 864 |
509 def get_allele_specific_pileup_column_stats( | |
510 self, allele, pile_column, ref_fetch, use_md=True | |
511 ): | |
512 var_reads_plus = var_reads_minus = 0 | |
513 sum_mapping_qualities = 0 | |
514 sum_base_qualities = 0 | |
515 sum_dist_from_center = 0 | |
516 sum_dist_from_3prime = 0 | |
517 sum_clipped_length = 0 | |
518 sum_unclipped_length = 0 | |
519 sum_num_mismatches_as_fraction = 0 | |
520 sum_mismatch_qualities = 0 | |
521 | |
522 for p in pile_column.pileups: | |
523 # skip reads that don't support the allele we're looking for | |
524 if p.query_position is None: | |
525 continue | |
526 if p.alignment.query_sequence[p.query_position] != allele: | |
527 continue | |
528 | |
529 if p.alignment.is_reverse: | |
530 var_reads_minus += 1 | |
531 else: | |
532 var_reads_plus += 1 | |
533 sum_mapping_qualities += p.alignment.mapping_quality | |
534 sum_base_qualities += p.alignment.query_qualities[p.query_position] | |
535 sum_clipped_length += p.alignment.query_alignment_length | |
536 unclipped_length = p.alignment.query_length | |
537 sum_unclipped_length += unclipped_length | |
538 # The following calculations are all in 1-based coordinates | |
539 # with respect to the physical 5'-end of the read sequence. | |
540 if p.alignment.is_reverse: | |
541 read_base_pos = unclipped_length - p.query_position | |
542 read_first_aln_pos = ( | |
543 unclipped_length - p.alignment.query_alignment_end + 1 | |
544 ) | |
545 read_last_aln_pos = ( | |
546 unclipped_length - p.alignment.query_alignment_start | |
547 ) | |
548 else: | |
549 read_base_pos = p.query_position + 1 | |
550 read_first_aln_pos = p.alignment.query_alignment_start + 1 | |
551 read_last_aln_pos = p.alignment.query_alignment_end | |
552 # Note: the original bam-readcount algorithm uses the less accurate | |
553 # p.alignment.query_alignment_length / 2 as the read center | |
554 read_center = (read_first_aln_pos + read_last_aln_pos) / 2 | |
555 sum_dist_from_center += 1 - abs( | |
556 read_base_pos - read_center | |
557 ) / (read_center - 1) | |
558 # Note: the original bam-readcount algorithm uses a more | |
559 # complicated definition of the 3'-end of a read sequence, which | |
560 # clips off runs of bases with a quality scores of exactly 2. | |
561 sum_dist_from_3prime += ( | |
562 read_last_aln_pos - read_base_pos | |
563 ) / (unclipped_length - 1) | |
564 | |
565 sum_num_mismatches = 0 | |
566 if use_md: | |
567 try: | |
568 # see if the read has an MD tag, in which case pysam can | |
569 # calculate the reference sequence for us | |
570 aligned_pairs = p.alignment.get_aligned_pairs( | |
571 matches_only=True, with_seq=True | |
572 ) | |
573 except ValueError: | |
574 use_md = False | |
575 if use_md: | |
576 # The ref sequence got calculated from alignment and MD tag. | |
577 for qpos, rpos, ref_base in aligned_pairs: | |
578 # pysam uses lowercase ref bases to indicate mismatches | |
579 if ( | |
580 ref_base == 'a' or ref_base == 't' or | |
581 ref_base == 'g' or ref_base == 'c' | |
582 ): | |
583 sum_num_mismatches += 1 | |
584 sum_mismatch_qualities += p.alignment.query_qualities[ | |
585 qpos | |
586 ] | |
587 else: | |
588 # We need to obtain the aligned portion of the reference | |
589 # sequence. | |
590 aligned_pairs = p.alignment.get_aligned_pairs( | |
591 matches_only=True | |
592 ) | |
593 # note that ref bases can be lowercase | |
594 ref_seq = ref_fetch( | |
595 aligned_pairs[0][1], aligned_pairs[-1][1] + 1 | |
596 ).upper() | |
597 ref_offset = aligned_pairs[0][1] | |
598 | |
599 for qpos, rpos in p.alignment.get_aligned_pairs( | |
600 matches_only=True | |
601 ): | |
602 # see if we have a mismatch to the reference at this | |
603 # position, but note that | |
604 # - the query base may be lowercase (SAM/BAM spec) | |
605 # - an '=', if present in the query seq, indicates a match | |
606 # to the reference (SAM/BAM spec) | |
607 # - there cannot be a mismatch with an N in the reference | |
608 ref_base = ref_seq[rpos - ref_offset] | |
609 read_base = p.alignment.query_sequence[qpos].upper() | |
610 if ( | |
611 read_base != '=' and ref_base != 'N' | |
612 ) and ( | |
613 ref_base != read_base | |
614 ): | |
615 sum_num_mismatches += 1 | |
616 sum_mismatch_qualities += p.alignment.query_qualities[ | |
617 qpos | |
618 ] | |
619 sum_num_mismatches_as_fraction += ( | |
620 sum_num_mismatches / p.alignment.query_alignment_length | |
621 ) | |
622 | |
623 var_reads_total = var_reads_plus + var_reads_minus | |
624 if var_reads_total == 0: | |
625 # No stats without reads! | |
626 return None | |
627 avg_mapping_quality = sum_mapping_qualities / var_reads_total | |
628 avg_basequality = sum_base_qualities / var_reads_total | |
629 avg_pos_as_fraction = sum_dist_from_center / var_reads_total | |
630 avg_num_mismatches_as_fraction = ( | |
631 sum_num_mismatches_as_fraction / var_reads_total | |
632 ) | |
633 avg_sum_mismatch_qualities = sum_mismatch_qualities / var_reads_total | |
634 avg_clipped_length = sum_clipped_length / var_reads_total | |
635 avg_distance_to_effective_3p_end = ( | |
636 sum_dist_from_3prime / var_reads_total | |
637 ) | |
638 | |
639 return ( | |
640 avg_mapping_quality, | |
641 avg_basequality, | |
642 var_reads_plus, | |
643 var_reads_minus, | |
644 avg_pos_as_fraction, | |
645 avg_num_mismatches_as_fraction, | |
646 avg_sum_mismatch_qualities, | |
647 avg_clipped_length, | |
648 avg_distance_to_effective_3p_end | |
649 ) | |
650 | |
651 def _postprocess_variant_records(self, invcf, | 865 def _postprocess_variant_records(self, invcf, |
652 min_var_count2, min_var_count2_lc, | 866 min_var_count2, min_var_count2_lc, |
653 min_var_freq2, max_somatic_p, | 867 min_var_freq2, min_var_count2_depth, |
654 max_somatic_p_depth, | |
655 min_ref_readpos, min_var_readpos, | 868 min_ref_readpos, min_var_readpos, |
656 min_ref_dist3, min_var_dist3, | 869 min_ref_dist3, min_var_dist3, |
870 detect_q2_runs, | |
657 min_ref_len, min_var_len, | 871 min_ref_len, min_var_len, |
658 max_relative_len_diff, | 872 max_relative_len_diff, |
659 min_strandedness, min_strand_reads, | 873 min_strandedness, min_strand_reads, |
660 min_ref_basequal, min_var_basequal, | 874 min_ref_basequal, min_var_basequal, |
661 max_basequal_diff, | 875 max_basequal_diff, |
662 min_ref_mapqual, min_var_mapqual, | 876 min_ref_mapqual, min_var_mapqual, |
663 max_mapqual_diff, | 877 max_mapqual_diff, |
664 max_ref_mmqs, max_var_mmqs, | 878 max_ref_mmqs, max_var_mmqs, |
665 min_mmqs_diff, max_mmqs_diff, | 879 min_mmqs_diff, max_mmqs_diff, |
880 ignore_md, | |
666 **args): | 881 **args): |
667 # set FILTER field according to Varscan criteria | 882 # set FILTER field according to Varscan criteria |
668 # multiple FILTER entries must be separated by semicolons | 883 # multiple FILTER entries must be separated by semicolons |
669 # No filters applied should be indicated with MISSING | 884 # No filters applied should be indicated with MISSING |
670 | 885 |
679 io_stack.enter_context( | 894 io_stack.enter_context( |
680 pysam.Samfile(fn, 'rb')) for fn in self.bam_input_files | 895 pysam.Samfile(fn, 'rb')) for fn in self.bam_input_files |
681 ) | 896 ) |
682 refseq = io_stack.enter_context(pysam.FastaFile(self.ref_genome)) | 897 refseq = io_stack.enter_context(pysam.FastaFile(self.ref_genome)) |
683 pileup_args = self._get_pysam_pileup_args() | 898 pileup_args = self._get_pysam_pileup_args() |
899 _get_stats = get_allele_stats | |
684 for record in invcf: | 900 for record in invcf: |
685 if any(len(allele) > 1 for allele in record.alleles): | 901 is_indel = False |
686 # skip indel postprocessing for the moment | |
687 yield record | |
688 continue | |
689 if record.alleles[0] == 'N': | 902 if record.alleles[0] == 'N': |
690 # It makes no sense to call SNPs against an unknown | 903 # It makes no sense to call SNPs against an unknown |
691 # reference base | 904 # reference base |
692 continue | 905 continue |
906 if len(record.alleles[0]) > 1: | |
907 # a deletion | |
908 is_indel = True | |
909 alleles = [ | |
910 record.alleles[1], record.alleles[0].replace( | |
911 record.alleles[1], record.alleles[1] + '-', 1 | |
912 ) | |
913 ] | |
914 elif len(record.alleles[1]) > 1: | |
915 # an insertion | |
916 is_indel = True | |
917 alleles = [ | |
918 record.alleles[0], | |
919 record.alleles[1].replace( | |
920 record.alleles[0], record.alleles[0] + '+', 1 | |
921 ) | |
922 ] | |
923 else: | |
924 # a regular SNV | |
925 alleles = record.alleles[:2] | |
693 # get pileup for genomic region affected by this variant | 926 # get pileup for genomic region affected by this variant |
694 if record.info['SS'] == '2': | 927 if record.info['SS'] == '2': |
695 # a somatic variant => generate pileup from tumor data | 928 # a somatic variant => generate pileup from tumor data |
696 pile = tumor_reads.pileup( | 929 reads_of_interest = tumor_reads |
697 record.chrom, record.start, record.stop, | 930 calculate_ref_stats = record.samples['TUMOR']['RD'] > 0 |
698 **pileup_args | |
699 ) | |
700 sample_of_interest = 'TUMOR' | |
701 elif record.info['SS'] in ['1', '3']: | 931 elif record.info['SS'] in ['1', '3']: |
702 # a germline or LOH variant => pileup from normal data | 932 # a germline or LOH variant => pileup from normal data |
703 pile = normal_reads.pileup( | 933 reads_of_interest = normal_reads |
704 record.chrom, record.start, record.stop, | 934 calculate_ref_stats = record.samples['NORMAL']['RD'] > 0 |
705 **pileup_args | |
706 ) | |
707 sample_of_interest = 'NORMAL' | |
708 else: | 935 else: |
709 # TO DO: figure out if there is anything interesting to do | 936 # TO DO: figure out if there is anything interesting to do |
710 # for SS status codes 0 (reference) and 5 (unknown) | 937 # for SS status codes 0 (reference) and 5 (unknown) |
711 yield record | 938 yield record |
712 continue | 939 continue |
713 | 940 |
714 # apply false-positive filtering a la varscan fpfilter | |
715 # find the variant site in the pileup columns | |
716 for pile_column in pile: | |
717 if pile_column.reference_pos == record.start: | |
718 break | |
719 # extract required information | |
720 # overall read depth at the site | |
721 read_depth = pile_column.get_num_aligned() | |
722 assert read_depth > 0 | |
723 # no multiallelic sites in varscan | 941 # no multiallelic sites in varscan |
724 assert len(record.alleles) == 2 | 942 assert len(record.alleles) == 2 |
725 if record.samples[sample_of_interest]['RD'] > 0: | 943 |
726 ref_stats, alt_stats = [ | 944 if calculate_ref_stats: |
727 self.get_allele_specific_pileup_column_stats( | 945 ref_stats, alt_stats = _get_stats( |
728 allele, | 946 reads_of_interest, |
729 pile_column, | 947 (record.chrom, record.start, record.stop), |
730 partial( | 948 alleles, |
731 pysam.FastaFile.fetch, refseq, record.chrom | 949 refseq, |
732 ) | 950 ignore_md=ignore_md, |
733 ) | 951 ignore_nm=False, |
734 for allele in record.alleles | 952 mm_runs=True, |
735 ] | 953 detect_q2_runs=detect_q2_runs, |
954 pileup_args=pileup_args | |
955 ) | |
736 else: | 956 else: |
737 ref_stats = None | 957 ref_stats = None |
738 alt_stats = self.get_allele_specific_pileup_column_stats( | 958 alt_stats = _get_stats( |
739 record.alleles[1], | 959 reads_of_interest, |
740 pile_column, | 960 (record.chrom, record.start, record.stop), |
741 partial(pysam.FastaFile.fetch, refseq, record.chrom) | 961 alleles[1:2], |
742 ) | 962 refseq, |
963 ignore_md=ignore_md, | |
964 ignore_nm=False, | |
965 mm_runs=True, | |
966 detect_q2_runs=detect_q2_runs, | |
967 pileup_args=pileup_args | |
968 )[0] | |
743 ref_count = 0 | 969 ref_count = 0 |
744 if ref_stats: | 970 if ref_stats: |
745 ref_count = ref_stats[2] + ref_stats[3] | 971 ref_count = ref_stats.reads_fw + ref_stats.reads_rv |
746 if ref_stats[1] < min_ref_basequal: | 972 if ref_stats.avg_basequal < min_ref_basequal: |
747 record.filter.add('RefBaseQual') | 973 record.filter.add('RefBaseQual') |
748 if ref_count >= 2: | 974 if ref_count >= 2: |
749 if ref_stats[0] < min_ref_mapqual: | 975 if ref_stats.avg_mapqual < min_ref_mapqual: |
750 record.filter.add('RefMapQual') | 976 record.filter.add('RefMapQual') |
751 if ref_stats[4] < min_ref_readpos: | 977 if ref_stats.avg_dist_from_center < min_ref_readpos: |
752 record.filter.add('RefReadPos') | 978 record.filter.add('RefReadPos') |
753 # ref_stats[5] (avg_num_mismatches_as_fraction | 979 # ref_stats.avg_mismatch_fraction |
754 # is not a filter criterion in VarScan fpfilter | 980 # is not a filter criterion in VarScan fpfilter |
755 if ref_stats[6] > max_ref_mmqs: | 981 if ref_stats.avg_mismatch_qualsum > max_ref_mmqs: |
756 record.filter.add('RefMMQS') | 982 record.filter.add('RefMMQS') |
757 if ref_stats[7] < min_ref_len: | 983 if not is_indel and ( |
984 ref_stats.avg_clipped_len < min_ref_len | |
985 ): | |
758 # VarScan fpfilter does not apply this filter | 986 # VarScan fpfilter does not apply this filter |
759 # for indels, but there is no reason | 987 # for indels, so we do not do it either. |
760 # not to do it. | |
761 record.filter.add('RefAvgRL') | 988 record.filter.add('RefAvgRL') |
762 if ref_stats[8] < min_ref_dist3: | 989 if ref_stats.avg_dist_from_3prime < min_ref_dist3: |
763 record.filter.add('RefDist3') | 990 record.filter.add('RefDist3') |
764 if alt_stats: | 991 if alt_stats: |
765 alt_count = alt_stats[2] + alt_stats[3] | 992 alt_count = alt_stats.reads_fw + alt_stats.reads_rv |
766 if ( | 993 if alt_count < min_var_count2: |
767 alt_count < min_var_count2_lc | 994 if ( |
768 ) or ( | 995 (alt_count + ref_count) >= min_var_count2_depth |
769 read_depth >= max_somatic_p_depth and | 996 ) or ( |
770 alt_count < min_var_count2 | 997 alt_count < min_var_count2_lc |
998 ): | |
999 record.filter.add('VarCount') | |
1000 if alt_count / alt_stats.reads_total < min_var_freq2: | |
1001 record.filter.add('VarFreq') | |
1002 if not is_indel and ( | |
1003 alt_stats.avg_basequal < min_var_basequal | |
771 ): | 1004 ): |
772 record.filter.add('VarCount') | |
773 if alt_count / read_depth < min_var_freq2: | |
774 record.filter.add('VarFreq') | |
775 if alt_stats[1] < min_var_basequal: | |
776 record.filter.add('VarBaseQual') | 1005 record.filter.add('VarBaseQual') |
777 if alt_count > min_strand_reads: | 1006 if alt_count >= min_strand_reads: |
778 if ( | 1007 if ( |
779 alt_stats[2] / alt_count < min_strandedness | 1008 alt_stats.reads_fw / alt_count < min_strandedness |
780 ) or ( | 1009 ) or ( |
781 alt_stats[3] / alt_count < min_strandedness | 1010 alt_stats.reads_rv / alt_count < min_strandedness |
782 ): | 1011 ): |
783 record.filter.add('Strand') | 1012 record.filter.add('Strand') |
784 if alt_stats[2] + alt_stats[3] >= 2: | 1013 if alt_count >= 2: |
785 if alt_stats[0] < min_var_mapqual: | 1014 if alt_stats.avg_mapqual < min_var_mapqual: |
786 record.filter.add('VarMapQual') | 1015 record.filter.add('VarMapQual') |
787 if alt_stats[4] < min_var_readpos: | 1016 if alt_stats.avg_dist_from_center < min_var_readpos: |
788 record.filter.add('VarReadPos') | 1017 record.filter.add('VarReadPos') |
789 # alt_stats[5] (avg_num_mismatches_as_fraction | 1018 # alt_stats.avg_mismatch_fraction |
790 # is not a filter criterion in VarScan fpfilter | 1019 # is not a filter criterion in VarScan fpfilter |
791 if alt_stats[6] > max_var_mmqs: | 1020 if alt_stats.avg_mismatch_qualsum > max_var_mmqs: |
792 record.filter.add('VarMMQS') | 1021 record.filter.add('VarMMQS') |
793 if alt_stats[7] < min_var_len: | 1022 if not is_indel and ( |
1023 alt_stats.avg_clipped_len < min_var_len | |
1024 ): | |
794 # VarScan fpfilter does not apply this filter | 1025 # VarScan fpfilter does not apply this filter |
795 # for indels, but there is no reason | 1026 # for indels, so we do not do it either. |
796 # not to do it. | |
797 record.filter.add('VarAvgRL') | 1027 record.filter.add('VarAvgRL') |
798 if alt_stats[8] < min_var_dist3: | 1028 if alt_stats.avg_dist_from_3prime < min_var_dist3: |
799 record.filter.add('VarDist3') | 1029 record.filter.add('VarDist3') |
800 if ref_count >= 2 and alt_count >= 2: | 1030 if ref_count >= 2 and alt_count >= 2: |
801 if (ref_stats[0] - alt_stats[0]) > max_mapqual_diff: | 1031 if ( |
1032 ref_stats.avg_mapqual - alt_stats.avg_mapqual | |
1033 ) > max_mapqual_diff: | |
802 record.filter.add('MapQualDiff') | 1034 record.filter.add('MapQualDiff') |
803 if (ref_stats[1] - alt_stats[1]) > max_basequal_diff: | 1035 if ( |
1036 ref_stats.avg_basequal - alt_stats.avg_basequal | |
1037 ) > max_basequal_diff: | |
804 record.filter.add('MaxBAQdiff') | 1038 record.filter.add('MaxBAQdiff') |
805 mmqs_diff = alt_stats[6] - ref_stats[6] | 1039 mmqs_diff = ( |
1040 alt_stats.avg_mismatch_qualsum | |
1041 - ref_stats.avg_mismatch_qualsum | |
1042 ) | |
806 if mmqs_diff < min_mmqs_diff: | 1043 if mmqs_diff < min_mmqs_diff: |
807 record.filter.add('MinMMQSdiff') | 1044 record.filter.add('MinMMQSdiff') |
808 if mmqs_diff > max_mmqs_diff: | 1045 if mmqs_diff > max_mmqs_diff: |
809 record.filter.add('MMQSdiff') | 1046 record.filter.add('MMQSdiff') |
810 if ( | 1047 if ( |
811 1 - alt_stats[7] / ref_stats[7] | 1048 1 - |
1049 alt_stats.avg_clipped_len | |
1050 / ref_stats.avg_clipped_len | |
812 ) > max_relative_len_diff: | 1051 ) > max_relative_len_diff: |
813 record.filter.add('ReadLenDiff') | 1052 record.filter.add('ReadLenDiff') |
814 else: | 1053 else: |
815 # No variant-supporting reads for this record! | 1054 # No variant-supporting reads for this record! |
816 # This can happen in rare cases because of | 1055 # This can happen in rare cases because of |
992 out = (output_path, None) | 1231 out = (output_path, None) |
993 | 1232 |
994 instance_args = { | 1233 instance_args = { |
995 k: args.pop(k) for k in [ | 1234 k: args.pop(k) for k in [ |
996 'max_depth', | 1235 'max_depth', |
1236 'count_orphans', | |
1237 'detect_overlaps', | |
997 'min_mapqual', | 1238 'min_mapqual', |
998 'min_basequal', | 1239 'min_basequal', |
999 'threads', | 1240 'threads', |
1000 'verbose', | 1241 'verbose', |
1001 'quiet' | 1242 'quiet' |
1044 'basename for the two output files to be generated (see ' | 1285 'basename for the two output files to be generated (see ' |
1045 '-s|--split-output below).' | 1286 '-s|--split-output below).' |
1046 ) | 1287 ) |
1047 p.add_argument( | 1288 p.add_argument( |
1048 '-s', '--split-output', | 1289 '-s', '--split-output', |
1049 dest='split_output', action='store_true', default=False, | 1290 dest='split_output', action='store_true', |
1050 help='indicate that separate output files for SNPs and indels ' | 1291 help='indicate that separate output files for SNPs and indels ' |
1051 'should be generated (original VarScan behavior). If specified, ' | 1292 'should be generated (original VarScan behavior). If specified, ' |
1052 '%%T in the --ofile file name will be replaced with "snp" and ' | 1293 '%%T in the --ofile file name will be replaced with "snp" and ' |
1053 '"indel" to generate the names of the SNP and indel output ' | 1294 '"indel" to generate the names of the SNP and indel output ' |
1054 'files, respectively. If %%T is not found in the file name, it ' | 1295 'files, respectively. If %%T is not found in the file name, it ' |
1088 dest='max_depth', type=int, default=8000, | 1329 dest='max_depth', type=int, default=8000, |
1089 help='Maximum depth of generated pileups (samtools mpileup -d option; ' | 1330 help='Maximum depth of generated pileups (samtools mpileup -d option; ' |
1090 'default: 8000)' | 1331 'default: 8000)' |
1091 ) | 1332 ) |
1092 call_group.add_argument( | 1333 call_group.add_argument( |
1334 '--count-orphans', | |
1335 dest='count_orphans', action='store_true', | |
1336 help='Use anomalous read pairs in variant calling ' | |
1337 '(samtools mpileup -A option; default: ignore anomalous pairs)' | |
1338 ) | |
1339 call_group.add_argument( | |
1340 '--no-detect-overlaps', | |
1341 dest='detect_overlaps', action='store_false', | |
1342 help='Disable automatic read-pair overlap detection by samtools ' | |
1343 'mpileup. Overlap detection tries to avoid counting duplicated ' | |
1344 'bases twice during variant calling. ' | |
1345 '(samtools mpileup -x option; default: use overlap detection)' | |
1346 ) | |
1347 call_group.add_argument( | |
1093 '--min-basequal', | 1348 '--min-basequal', |
1094 dest='min_basequal', type=int, | 1349 dest='min_basequal', type=int, |
1095 default=13, | 1350 default=13, |
1096 help='Minimum base quality at the variant position to use a read ' | 1351 help='Minimum base quality at the variant position to use a read ' |
1097 '(default: 13)' | 1352 '(default: 13)' |
1158 filter_group.add_argument( | 1413 filter_group.add_argument( |
1159 '--min-var-count2-lc', | 1414 '--min-var-count2-lc', |
1160 dest='min_var_count2_lc', type=int, | 1415 dest='min_var_count2_lc', type=int, |
1161 default=2, | 1416 default=2, |
1162 help='Minimum number of variant-supporting reads when depth below ' | 1417 help='Minimum number of variant-supporting reads when depth below ' |
1163 '--somatic-p-depth (default: 2)' | 1418 '--min-var-count2-depth (default: 2)' |
1419 ) | |
1420 filter_group.add_argument( | |
1421 '--min-var-count2-depth', | |
1422 dest='min_var_count2_depth', type=int, | |
1423 default=10, | |
1424 help='Combined depth of ref- and variant-supporting reads required to ' | |
1425 'apply the --min-var-count filter instead of --min-var-count-lc ' | |
1426 '(default: 10)' | |
1164 ) | 1427 ) |
1165 filter_group.add_argument( | 1428 filter_group.add_argument( |
1166 '--min-var-freq2', | 1429 '--min-var-freq2', |
1167 dest='min_var_freq2', type=float, | 1430 dest='min_var_freq2', type=float, |
1168 default=0.05, | 1431 default=0.05, |
1169 help='Minimum variant allele frequency (default: 0.05)' | 1432 help='Minimum variant allele frequency (default: 0.05)' |
1170 ) | |
1171 filter_group.add_argument( | |
1172 '--max-somatic-p', | |
1173 dest='max_somatic_p', type=float, | |
1174 default=0.05, | |
1175 help='Maximum somatic p-value (default: 0.05)' | |
1176 ) | |
1177 filter_group.add_argument( | |
1178 '--max-somatic-p-depth', | |
1179 dest='max_somatic_p_depth', type=int, | |
1180 default=10, | |
1181 help='Depth required to run --max-somatic-p filter (default: 10)' | |
1182 ) | 1433 ) |
1183 filter_group.add_argument( | 1434 filter_group.add_argument( |
1184 '--min-ref-readpos', | 1435 '--min-ref-readpos', |
1185 dest='min_ref_readpos', type=float, | 1436 dest='min_ref_readpos', type=float, |
1186 default=0.1, | 1437 default=0.1, |
1207 default=0.1, | 1458 default=0.1, |
1208 help='Minimum average relative distance of site from the effective ' | 1459 help='Minimum average relative distance of site from the effective ' |
1209 '3\'end of variant-supporting reads (default: 0.1)' | 1460 '3\'end of variant-supporting reads (default: 0.1)' |
1210 ) | 1461 ) |
1211 filter_group.add_argument( | 1462 filter_group.add_argument( |
1463 '--detect-q2-runs', | |
1464 dest='detect_q2_runs', action='store_true', | |
1465 help='Look for 3\'-terminal q2 runs and take their lengths into ' | |
1466 'account for determining the effective 3\'end of reads ' | |
1467 '(default: off)' | |
1468 ) | |
1469 filter_group.add_argument( | |
1212 '--min-ref-len', | 1470 '--min-ref-len', |
1213 dest='min_ref_len', type=int, | 1471 dest='min_ref_len', type=int, |
1214 default=90, | 1472 default=90, |
1215 help='Minimum average trimmed length of reads supporting the ref ' | 1473 help='Minimum average trimmed length of reads supporting the ref ' |
1216 'allele (default: 90)' | 1474 'allele (default: 90)' |
1307 '--max-mmqs-diff', | 1565 '--max-mmqs-diff', |
1308 dest='max_mmqs_diff', type=int, | 1566 dest='max_mmqs_diff', type=int, |
1309 default=50, | 1567 default=50, |
1310 help='Maximum mismatch quality sum difference (var - ref; default: 50)' | 1568 help='Maximum mismatch quality sum difference (var - ref; default: 50)' |
1311 ) | 1569 ) |
1570 filter_group.add_argument( | |
1571 '--ignore-md', | |
1572 dest='ignore_md', action='store_true', | |
1573 help='Do not rely on read MD tag, even if it is present on the mapped ' | |
1574 'reads, and recalculate mismatch quality stats from ref ' | |
1575 'alignments instead.' | |
1576 ) | |
1577 | |
1312 args = vars(p.parse_args()) | 1578 args = vars(p.parse_args()) |
1313 varscan_call(**args) | 1579 varscan_call(**args) |