changeset 9:0bc56f894766 draft
"planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit fcf5ac14629c694f0f64773fab0428b1e78fe156"
author   | iuc
date     | Fri, 16 Aug 2019 15:50:10 -0400
parents  | c24910ad5441
children | 9f8dbbbc0267
files    | macros.xml varscan.py
diffstat | 2 files changed, 507 insertions(+), 241 deletions(-)
--- a/macros.xml Thu Mar 28 18:19:36 2019 -0400 +++ b/macros.xml Fri Aug 16 15:50:10 2019 -0400 @@ -56,9 +56,9 @@ <param argument="--min-coverage" name="min_coverage" type="integer" value="8" min="1" max="200" label="Minimum coverage" help="@HELP@"/> </xml> - <xml name="min_reads2"> + <xml name="min_reads2" token_help="Minimum number (default: 2) of variant-supporting reads at a position required to make a call"> <param argument="--min-reads2" name="min_reads2" type="integer" value="2" min="1" max="200" - label="Minimum supporting reads" help="Minimum number of variant-supporting reads at a position required to make a call"/> + label="Minimum supporting reads" help="@HELP@"/> </xml> <xml name="min_avg_qual"> <param argument="--min-avg-qual" name="min_avg_qual" type="integer" value="15" min="1" max="50" @@ -68,12 +68,12 @@ <xml name="min_var_freq" token_value="0.01"> <param argument="--min-var-freq" name="min_var_freq" type="float" value="@VALUE@" min="0" max="1" label="Minimum variant allele frequency" - help="Minimum variant allele frequency required for calling a variant"/> + help="Minimum variant allele frequency (default: @VALUE@) required for calling a variant"/> </xml> <xml name="min_freq_for_hom"> <param argument="--min-freq-for-hom" name="min_freq_for_hom" type="float" value="0.75" min="0" max="1" label="Minimum homozygous variant allele frequency" - help="Minimum variant allele frequency required for calling a homozygous genotype" /> + help="Minimum variant allele frequency (default: 0.75) required for calling a homozygous genotype" /> </xml> <xml name="p_value" token_value="0.01" token_label="p-value threshold for calling variants"
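The macros.xml hunks replace hard-coded help strings with @HELP@ and @VALUE@ tokens so that the default value of each parameter stays visible in the displayed help text. A minimal Python sketch of the substitution idea (the helper is hypothetical, not Galaxy's actual macro implementation):

    def expand_tokens(body, tokens):
        # Replace each @NAME@ placeholder with its token value, e.g.
        # tokens={'help': 'Minimum number (default: 2) ...'} fills @HELP@.
        for name, value in tokens.items():
            body = body.replace('@' + name.upper() + '@', str(value))
        return body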
--- a/varscan.py Thu Mar 28 18:19:36 2019 -0400 +++ b/varscan.py Fri Aug 16 15:50:10 2019 -0400 @@ -8,6 +8,7 @@ import sys import tempfile import time +from collections import namedtuple from contextlib import ExitStack from functools import partial from threading import Thread @@ -15,6 +16,332 @@ import pysam +AlleleStats = namedtuple( + "AlleleStats", + [ + 'reads_total', + 'reads_fw', + 'reads_rv', + 'avg_mapqual', + 'avg_basequal', + 'avg_dist_from_center', + 'avg_mismatch_fraction', + 'avg_mismatch_qualsum', + 'avg_clipped_len', + 'avg_dist_from_3prime' + ] +) + + +def _get_allele_specific_pileup_column_stats(pileups, ref_fetch, + ignore_md, ignore_nm, + mm_runs, detect_q2_runs): + var_reads_plus = var_reads_minus = 0 + sum_mapping_qualities = 0 + sum_base_qualities = 0 + sum_dist_from_center = 0 + sum_dist_from_3prime = 0 + sum_clipped_length = 0 + sum_unclipped_length = 0 + sum_mismatch_fractions = 0 + sum_mismatch_qualities = 0 + + for p in pileups: + if p.alignment.is_reverse: + var_reads_minus += 1 + else: + var_reads_plus += 1 + sum_mapping_qualities += p.alignment.mapping_quality + sum_base_qualities += p.alignment.query_qualities[p.query_position] + sum_clipped_length += p.alignment.query_alignment_length + unclipped_length = p.alignment.query_length + sum_unclipped_length += unclipped_length + # The following calculations are all in 1-based coordinates + # with respect to the physical 5'-end of the read sequence. + if p.alignment.is_reverse: + read_base_pos = unclipped_length - p.query_position + read_first_aln_pos = ( + unclipped_length - p.alignment.query_alignment_end + 1 + ) + read_last_aln_pos = ( + unclipped_length - p.alignment.query_alignment_start + ) + qualities_3to5prime = p.alignment.query_alignment_qualities + else: + read_base_pos = p.query_position + 1 + read_first_aln_pos = p.alignment.query_alignment_start + 1 + read_last_aln_pos = p.alignment.query_alignment_end + qualities_3to5prime = reversed( + p.alignment.query_alignment_qualities + ) + + read_last_effective_aln_pos = read_last_aln_pos + if detect_q2_runs: + # Note: the original bam-readcount algorithm always takes + # terminal q2 runs into account when determining the + # effective 3'-ends of reads. + # However, q2 runs have no special meaning since Illumina + # pipeline version 1.8 so detecting them is optional + # in this code. + for qual in qualities_3to5prime: + if qual != 2: + break + read_last_effective_aln_pos -= 1 + + # Note: the original bam-readcount algorithm defines the + # read center as: + # read_center = p.alignment.query_alignment_length / 2 + # This is less accurate than the implementation here. + read_center = (read_first_aln_pos + read_last_aln_pos) / 2 + sum_dist_from_center += 1 - abs( + read_base_pos - read_center + ) / (read_center - 1) + # To calculate the distance of base positions from the 3'-ends + # of reads bam-readcount uses the formula: + # sum_dist_from_3prime += abs( + # read_last_effective_aln_pos - read_base_pos + # ) / (unclipped_length - 1) + # , which treats base positions on both sides of the effective + # 3'-end equally. Since this seems hard to justify, we cap + # the distance calculation at 0 for base positions past the + # effective 3'-end (which, in turn, should only be possible + # with detection of q2 runs). 
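The coordinate arithmetic above is easiest to verify with concrete numbers. A toy recalculation for a hypothetical forward-strand read (values invented; the reverse-strand branch just mirrors the positions first):

    unclipped_length = 100     # p.alignment.query_length
    query_alignment_start = 5  # 0-based, pysam convention
    query_alignment_end = 95   # 0-based, exclusive
    query_position = 20        # 0-based offset of the variant base

    read_base_pos = query_position + 1                          # 21
    read_first_aln_pos = query_alignment_start + 1              # 6
    read_last_aln_pos = query_alignment_end                     # 95
    read_center = (read_first_aln_pos + read_last_aln_pos) / 2  # 50.5
    dist_from_center = 1 - abs(read_base_pos - read_center) / (read_center - 1)
    # ~0.40: the score is 1.0 at the read center and falls off towards
    # the alignment edges
    dist_from_3prime = max(read_last_aln_pos - read_base_pos, 0) / (
        unclipped_length - 1
    )                                                           # ~0.75
    # without q2-run detection the effective 3'-end equals
    # read_last_aln_pos; the max(..., 0) cap mirrors the comment above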
+ if read_last_effective_aln_pos > read_base_pos: + sum_dist_from_3prime += ( + read_last_effective_aln_pos - read_base_pos + ) / (unclipped_length - 1) + + if sum_mismatch_fractions >= 0 or sum_mismatch_qualities >= 0: + # sum_mismatch_fractions and sum_mismatch_qualities will be + # set to float('nan') if reference info (in form of an MD tag or + # an actual reference sequence) was not available for any previous + # read in the pileup. + # In that case, there is no point to trying to calculate + # mismatch stats for other reads anymore. + + # The following mismatch calculations use this logic: + # 1. For determining the edit distance between an aligned read and + # the reference, we follow the authoritative definition of the + # NM tag calculation in + # http://samtools.github.io/hts-specs/SAMtags.pdf + # + # For historical reasons, the result of this calculation will + # disagree more often than not with NM tag values calculated by + # other tools. + # If precalculated NM tag values are present on the aligned + # reads, these can be given preference through the use_nm flag. + # Doing so will mimick the behavior of bam-readcount, which + # requires and always just looks at NM tags. + # 2. For determining mismatch quality sums, a mismatch is defined + # differently and in accordance with the implementation in + # bam-readcount: + # - only mismatches (not inserted or deleted bases) are + # considered + # - 'N' in the reference is considered a match for any read base + # - any matching (case-insensitive) base in reference and read + # is considered a match, even if that base is not one of + # A, C, G or T. + # In both 1. and 2. above a '=' in the read is always considered a + # match, irrespective of the reference base. + + num_mismatches = 0 + if not ignore_md: + try: + # see if the read has an MD tag, in which case pysam can + # calculate the reference sequence for us + ref_seq = p.alignment.get_reference_sequence().upper() + except ValueError: + ignore_md = True + if not ignore_nm: + try: + num_mismatches = p.alignment.get_tag('NM') + except KeyError: + ignore_nm = True + + if ignore_md: + if not ref_fetch: + # cannot calculate mismatch stats without ref info + sum_mismatch_qualities = float('nan') + if ignore_nm: + sum_mismatch_fractions = float('nan') + else: + sum_mismatch_fractions += ( + num_mismatches / p.alignment.query_alignment_length + ) + continue + # Without an MD tag we need to extract the relevant part + # of the reference from the full sequence by position. 
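The two mismatch definitions spelled out in the comment block above can be distilled into two predicates (helper names are ours; both assume uppercase bases and that '=' has already been skipped as an unconditional match):

    ACGT = {'A', 'C', 'G', 'T'}

    def counts_toward_nm(read_base, ref_base):
        # edit-distance mismatch per the SAMtags spec: any non-ACGT base
        # on either side counts, as does a plain base difference
        return (read_base not in ACGT or ref_base not in ACGT
                or read_base != ref_base)

    def counts_toward_mmqs(read_base, ref_base):
        # bam-readcount-style mismatch for quality sums: reference 'N'
        # matches anything; identical non-ACGT bases still match
        return ref_base != 'N' and ref_base != read_base

    # e.g. read 'R' aligned to ref 'R' increases the edit distance but
    # contributes nothing to the mismatch quality sum
    assert counts_toward_nm('R', 'R') and not counts_toward_mmqs('R', 'R')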
+ ref_positions = p.alignment.get_reference_positions() + ref_seq = ref_fetch( + ref_positions[0], ref_positions[-1] + 1 + ).upper() + + potential_matches = {'A', 'C', 'G', 'T'} + aligned_pairs = p.alignment.get_aligned_pairs(matches_only=True) + ref_offset = aligned_pairs[0][1] + last_mismatch_pos = None + mismatch_run_quals = [] + for qpos, rpos in aligned_pairs: + read_base = p.alignment.query_sequence[qpos].upper() + if read_base == '=': + # always treat the special read base '=' as a + # match, irrespective of the reference base + continue + ref_base = ref_seq[rpos - ref_offset] + # see if we have a mismatch to use for mismatch quality sum + # calculation + if ref_base != 'N' and ref_base != read_base: + if mm_runs: + if ( + last_mismatch_pos is None + ) or ( + last_mismatch_pos + 1 == qpos + ): + mismatch_run_quals.append( + p.alignment.query_qualities[qpos] + ) + else: + sum_mismatch_qualities += max(mismatch_run_quals) + mismatch_run_quals = [ + p.alignment.query_qualities[qpos] + ] + last_mismatch_pos = qpos + else: + sum_mismatch_qualities += ( + p.alignment.query_qualities[qpos] + ) + if ignore_nm: + # see if we have a mismatch that increases the edit + # distance according to the SAMtags specs + if ( + read_base not in potential_matches + ) or ( + ref_base not in potential_matches + ) or ( + read_base != ref_base + ): + num_mismatches += 1 + + if mismatch_run_quals: + sum_mismatch_qualities += max(mismatch_run_quals) + if ignore_nm: + # use the number of mismatches calculated above, + # but add inserted and deleted bases to it + cigar_stats = p.alignment.get_cigar_stats()[0] + num_mismatches += cigar_stats[1] + cigar_stats[2] + sum_mismatch_fractions += ( + num_mismatches / p.alignment.query_alignment_length + ) + + return ( + var_reads_plus, + var_reads_minus, + sum_mapping_qualities, + sum_base_qualities, + sum_dist_from_center, + sum_mismatch_fractions, + sum_mismatch_qualities, + sum_clipped_length, + sum_dist_from_3prime + ) + + +def get_allele_stats(reads, pos, alleles, + ref=None, ignore_md=True, ignore_nm=True, + mm_runs=True, detect_q2_runs=False, + pileup_args=None): + chrom, start, stop = pos + if pileup_args is None: + pileup_args = {} + if pileup_args.get('stepper') == 'samtools': + if pileup_args.get('compute_baq', True) is not False: + if 'fastafile' not in pileup_args: + pileup_args['fastafile'] = ref + # be careful when passing in a custom 'fastafile' option: + # providing it slows down pysam tremendously even if the option + # isn't actually required. 
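With this logic factored out of VarScanCaller into a module-level function, the stats can be driven directly from pysam objects. A hypothetical invocation (file names and coordinates are placeholders; assumes get_allele_stats imported from this module):

    import pysam

    reads = pysam.AlignmentFile('tumor.bam', 'rb')
    ref = pysam.FastaFile('reference.fa')
    ref_stats, alt_stats = get_allele_stats(
        reads, ('chr1', 11999, 12000), ['C', 'T'],
        ref=ref, ignore_md=False, ignore_nm=False,
        mm_runs=True, detect_q2_runs=False,
        pileup_args={'stepper': 'samtools', 'compute_baq': False}
    )
    # one AlleleStats namedtuple is returned per requested allele, e.g.
    # alt_stats.reads_fw, alt_stats.avg_basequal,
    # alt_stats.avg_dist_from_3prime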
+ + pile = reads.pileup( + chrom, start, stop, + **pileup_args + ) + # apply false-positive filtering a la varscan fpfilter + # find the variant site in the pileup columns + for pile_column in pile: + if pile_column.reference_pos == start: + break + else: + # With no reads covering the genomic position + # we can only return "empty" allele stats + return [ + AlleleStats(0, 0, 0, *[float('nan')] * 7) + for allele in alleles + ] + + # extract required information + # overall read depth at the site + read_depth = pile_column.get_num_aligned() + assert read_depth > 0 + + alleles_stats = [] + for allele in alleles: + if '-' in allele: + allele, indel_type, indel_after = allele.partition('-') + indel_len = -len(indel_after) + elif '+' in allele: + allele, indel_type, indel_after = allele.partition('+') + indel_len = len(indel_after) + else: + indel_type = None + + stats_it = ( + p for p in pile_column.pileups + # skip reads that don't support the allele we're looking for + if (p.query_position is not None) + and (p.alignment.query_sequence[p.query_position] == allele) + ) + if indel_type == '-': + stats_it = ( + p for p in stats_it if p.indel == indel_len + ) + elif indel_type == '+': + stats_it = ( + p for p in stats_it if ( + p.indel == indel_len + ) and ( + p.alignment.query_sequence[ + p.query_position + 1: + p.query_position + 1 + indel_len + ] == indel_after + ) + ) + allele_stats = _get_allele_specific_pileup_column_stats( + stats_it, + partial( + pysam.FastaFile.fetch, ref, chrom + ) if ref else None, + ignore_md, + ignore_nm, + mm_runs, + detect_q2_runs + ) + + allele_reads_total = allele_stats[0] + allele_stats[1] + if allele_reads_total == 0: + # No stats without reads! + alleles_stats.append( + AlleleStats(read_depth, 0, 0, *[float('nan')] * 7) + ) + else: + alleles_stats.append( + AlleleStats( + read_depth, allele_stats[0], allele_stats[1], + *(i / allele_reads_total for i in allele_stats[2:]) + ) + ) + return alleles_stats + + class VariantCallingError (RuntimeError): """Exception class for issues with samtools and varscan subprocesses.""" @@ -40,13 +367,15 @@ class VarScanCaller (object): def __init__(self, ref_genome, bam_input_files, - max_depth=None, + max_depth=None, count_orphans=False, detect_overlaps=True, min_mapqual=None, min_basequal=None, threads=1, verbose=False, quiet=True ): self.ref_genome = ref_genome self.bam_input_files = bam_input_files self.max_depth = max_depth + self.count_orphans = count_orphans + self.detect_overlaps = detect_overlaps self.min_mapqual = min_mapqual self.min_basequal = min_basequal self.threads = threads @@ -67,15 +396,38 @@ self.tmpfiles = [] def _get_pysam_pileup_args(self): - param_dict = {} + # Compute the pileup args dynamically to account for possibly updated + # instance attributes. + + # fixed default parameters + # Note on the fixed default for 'ignore_overlaps': + # In order to use the exact same pileup parameters during variant + # calling and postprocessing, we would have to compute the setting + # dynamically like so: + # if not self.detect_overlaps: + # param_dict['ignore_overlaps'] = False + # However, samtools/pysam implement overlap correction by manipulating + # (setting to zero) the lower qualities of bases that are part of an + # overlap (see + # https://sourceforge.net/p/samtools/mailman/message/32793789/). This + # manipulation has such a big undesirable effect on base quality stats + # calculated during postprocessing that calculating the stats on a + # slightly different pileup seems like the lesser evil. 
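During variant calling the same user settings become samtools mpileup flags, while fpfilter postprocessing feeds them to pysam's pileup as keyword arguments. Side by side, with illustrative values (only options appearing in this changeset are listed):

    # samtools mpileup side (variant calling): -B disables BAQ, -A keeps
    # anomalous pairs, -x disables overlap detection, -d caps depth
    pileup_engine_options = ['-B', '-A', '-x', '-d', '8000']

    # pysam side (fpfilter postprocessing):
    pysam_pileup_args = {
        'ignore_overlaps': False,   # fixed; see the rationale above
        'compute_baq': False,       # the -B equivalent
        'stepper': 'samtools',
        'ignore_orphans': False,    # set when --count-orphans is given
        'max_depth': 8000,
    }
    # note the asymmetry: --no-detect-overlaps adds -x during calling,
    # but ignore_overlaps=False is applied unconditionally here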
+ + param_dict = { + 'ignore_overlaps': False, + 'compute_baq': False, + 'stepper': 'samtools', + } + # user-controllable parameters + if self.count_orphans: + param_dict['ignore_orphans'] = False if self.max_depth is not None: param_dict['max_depth'] = self.max_depth if self.min_mapqual is not None: param_dict['min_mapping_quality'] = self.min_mapqual if self.min_basequal is not None: param_dict['min_base_quality'] = self.min_basequal - param_dict['compute_baq'] = False - param_dict['stepper'] = 'samtools' return param_dict def varcall_parallel(self, normal_purity=None, tumor_purity=None, @@ -108,6 +460,10 @@ if value is not None: varcall_engine_options += [option, str(value)] pileup_engine_options = ['-B'] + if self.count_orphans: + pileup_engine_options += ['-A'] + if not self.detect_overlaps: + pileup_engine_options += ['-x'] if self.max_depth is not None: pileup_engine_options += ['-d', str(self.max_depth)] if self.min_mapqual is not None: @@ -506,154 +862,12 @@ del record.format['RD'] del record.format['DP4'] - def get_allele_specific_pileup_column_stats( - self, allele, pile_column, ref_fetch, use_md=True - ): - var_reads_plus = var_reads_minus = 0 - sum_mapping_qualities = 0 - sum_base_qualities = 0 - sum_dist_from_center = 0 - sum_dist_from_3prime = 0 - sum_clipped_length = 0 - sum_unclipped_length = 0 - sum_num_mismatches_as_fraction = 0 - sum_mismatch_qualities = 0 - - for p in pile_column.pileups: - # skip reads that don't support the allele we're looking for - if p.query_position is None: - continue - if p.alignment.query_sequence[p.query_position] != allele: - continue - - if p.alignment.is_reverse: - var_reads_minus += 1 - else: - var_reads_plus += 1 - sum_mapping_qualities += p.alignment.mapping_quality - sum_base_qualities += p.alignment.query_qualities[p.query_position] - sum_clipped_length += p.alignment.query_alignment_length - unclipped_length = p.alignment.query_length - sum_unclipped_length += unclipped_length - # The following calculations are all in 1-based coordinates - # with respect to the physical 5'-end of the read sequence. - if p.alignment.is_reverse: - read_base_pos = unclipped_length - p.query_position - read_first_aln_pos = ( - unclipped_length - p.alignment.query_alignment_end + 1 - ) - read_last_aln_pos = ( - unclipped_length - p.alignment.query_alignment_start - ) - else: - read_base_pos = p.query_position + 1 - read_first_aln_pos = p.alignment.query_alignment_start + 1 - read_last_aln_pos = p.alignment.query_alignment_end - # Note: the original bam-readcount algorithm uses the less accurate - # p.alignment.query_alignment_length / 2 as the read center - read_center = (read_first_aln_pos + read_last_aln_pos) / 2 - sum_dist_from_center += 1 - abs( - read_base_pos - read_center - ) / (read_center - 1) - # Note: the original bam-readcount algorithm uses a more - # complicated definition of the 3'-end of a read sequence, which - # clips off runs of bases with a quality scores of exactly 2. - sum_dist_from_3prime += ( - read_last_aln_pos - read_base_pos - ) / (unclipped_length - 1) - - sum_num_mismatches = 0 - if use_md: - try: - # see if the read has an MD tag, in which case pysam can - # calculate the reference sequence for us - aligned_pairs = p.alignment.get_aligned_pairs( - matches_only=True, with_seq=True - ) - except ValueError: - use_md = False - if use_md: - # The ref sequence got calculated from alignment and MD tag. 
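Both the old and the new mismatch code first try to let pysam reconstruct the reference from the read's MD tag. A self-contained sketch of that pysam behavior (function name is ours):

    import pysam

    def md_mismatch_positions(aln: pysam.AlignedSegment):
        # Raises ValueError when the read carries no MD tag, the very
        # condition both code versions catch before falling back to
        # fetching the reference sequence by position.
        return [
            (qpos, rpos)
            for qpos, rpos, ref_base in aln.get_aligned_pairs(
                matches_only=True, with_seq=True
            )
            if ref_base.islower()  # pysam lowercases mismatched ref bases
        ]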
- for qpos, rpos, ref_base in aligned_pairs: - # pysam uses lowercase ref bases to indicate mismatches - if ( - ref_base == 'a' or ref_base == 't' or - ref_base == 'g' or ref_base == 'c' - ): - sum_num_mismatches += 1 - sum_mismatch_qualities += p.alignment.query_qualities[ - qpos - ] - else: - # We need to obtain the aligned portion of the reference - # sequence. - aligned_pairs = p.alignment.get_aligned_pairs( - matches_only=True - ) - # note that ref bases can be lowercase - ref_seq = ref_fetch( - aligned_pairs[0][1], aligned_pairs[-1][1] + 1 - ).upper() - ref_offset = aligned_pairs[0][1] - - for qpos, rpos in p.alignment.get_aligned_pairs( - matches_only=True - ): - # see if we have a mismatch to the reference at this - # position, but note that - # - the query base may be lowercase (SAM/BAM spec) - # - an '=', if present in the query seq, indicates a match - # to the reference (SAM/BAM spec) - # - there cannot be a mismatch with an N in the reference - ref_base = ref_seq[rpos - ref_offset] - read_base = p.alignment.query_sequence[qpos].upper() - if ( - read_base != '=' and ref_base != 'N' - ) and ( - ref_base != read_base - ): - sum_num_mismatches += 1 - sum_mismatch_qualities += p.alignment.query_qualities[ - qpos - ] - sum_num_mismatches_as_fraction += ( - sum_num_mismatches / p.alignment.query_alignment_length - ) - - var_reads_total = var_reads_plus + var_reads_minus - if var_reads_total == 0: - # No stats without reads! - return None - avg_mapping_quality = sum_mapping_qualities / var_reads_total - avg_basequality = sum_base_qualities / var_reads_total - avg_pos_as_fraction = sum_dist_from_center / var_reads_total - avg_num_mismatches_as_fraction = ( - sum_num_mismatches_as_fraction / var_reads_total - ) - avg_sum_mismatch_qualities = sum_mismatch_qualities / var_reads_total - avg_clipped_length = sum_clipped_length / var_reads_total - avg_distance_to_effective_3p_end = ( - sum_dist_from_3prime / var_reads_total - ) - - return ( - avg_mapping_quality, - avg_basequality, - var_reads_plus, - var_reads_minus, - avg_pos_as_fraction, - avg_num_mismatches_as_fraction, - avg_sum_mismatch_qualities, - avg_clipped_length, - avg_distance_to_effective_3p_end - ) - def _postprocess_variant_records(self, invcf, min_var_count2, min_var_count2_lc, - min_var_freq2, max_somatic_p, - max_somatic_p_depth, + min_var_freq2, min_var_count2_depth, min_ref_readpos, min_var_readpos, min_ref_dist3, min_var_dist3, + detect_q2_runs, min_ref_len, min_var_len, max_relative_len_diff, min_strandedness, min_strand_reads, @@ -663,6 +877,7 @@ max_mapqual_diff, max_ref_mmqs, max_var_mmqs, min_mmqs_diff, max_mmqs_diff, + ignore_md, **args): # set FILTER field according to Varscan criteria # multiple FILTER entries must be separated by semicolons @@ -681,134 +896,158 @@ ) refseq = io_stack.enter_context(pysam.FastaFile(self.ref_genome)) pileup_args = self._get_pysam_pileup_args() + _get_stats = get_allele_stats for record in invcf: - if any(len(allele) > 1 for allele in record.alleles): - # skip indel postprocessing for the moment - yield record - continue + is_indel = False if record.alleles[0] == 'N': # It makes no sense to call SNPs against an unknown # reference base continue + if len(record.alleles[0]) > 1: + # a deletion + is_indel = True + alleles = [ + record.alleles[1], record.alleles[0].replace( + record.alleles[1], record.alleles[1] + '-', 1 + ) + ] + elif len(record.alleles[1]) > 1: + # an insertion + is_indel = True + alleles = [ + record.alleles[0], + record.alleles[1].replace( + record.alleles[0], 
record.alleles[0] + '+', 1 + ) + ] + else: + # a regular SNV + alleles = record.alleles[:2] # get pileup for genomic region affected by this variant if record.info['SS'] == '2': # a somatic variant => generate pileup from tumor data - pile = tumor_reads.pileup( - record.chrom, record.start, record.stop, - **pileup_args - ) - sample_of_interest = 'TUMOR' + reads_of_interest = tumor_reads + calculate_ref_stats = record.samples['TUMOR']['RD'] > 0 elif record.info['SS'] in ['1', '3']: # a germline or LOH variant => pileup from normal data - pile = normal_reads.pileup( - record.chrom, record.start, record.stop, - **pileup_args - ) - sample_of_interest = 'NORMAL' + reads_of_interest = normal_reads + calculate_ref_stats = record.samples['NORMAL']['RD'] > 0 else: # TO DO: figure out if there is anything interesting to do # for SS status codes 0 (reference) and 5 (unknown) yield record continue - # apply false-positive filtering a la varscan fpfilter - # find the variant site in the pileup columns - for pile_column in pile: - if pile_column.reference_pos == record.start: - break - # extract required information - # overall read depth at the site - read_depth = pile_column.get_num_aligned() - assert read_depth > 0 # no multiallelic sites in varscan assert len(record.alleles) == 2 - if record.samples[sample_of_interest]['RD'] > 0: - ref_stats, alt_stats = [ - self.get_allele_specific_pileup_column_stats( - allele, - pile_column, - partial( - pysam.FastaFile.fetch, refseq, record.chrom - ) - ) - for allele in record.alleles - ] + + if calculate_ref_stats: + ref_stats, alt_stats = _get_stats( + reads_of_interest, + (record.chrom, record.start, record.stop), + alleles, + refseq, + ignore_md=ignore_md, + ignore_nm=False, + mm_runs=True, + detect_q2_runs=detect_q2_runs, + pileup_args=pileup_args + ) else: ref_stats = None - alt_stats = self.get_allele_specific_pileup_column_stats( - record.alleles[1], - pile_column, - partial(pysam.FastaFile.fetch, refseq, record.chrom) - ) + alt_stats = _get_stats( + reads_of_interest, + (record.chrom, record.start, record.stop), + alleles[1:2], + refseq, + ignore_md=ignore_md, + ignore_nm=False, + mm_runs=True, + detect_q2_runs=detect_q2_runs, + pileup_args=pileup_args + )[0] ref_count = 0 if ref_stats: - ref_count = ref_stats[2] + ref_stats[3] - if ref_stats[1] < min_ref_basequal: + ref_count = ref_stats.reads_fw + ref_stats.reads_rv + if ref_stats.avg_basequal < min_ref_basequal: record.filter.add('RefBaseQual') if ref_count >= 2: - if ref_stats[0] < min_ref_mapqual: + if ref_stats.avg_mapqual < min_ref_mapqual: record.filter.add('RefMapQual') - if ref_stats[4] < min_ref_readpos: + if ref_stats.avg_dist_from_center < min_ref_readpos: record.filter.add('RefReadPos') - # ref_stats[5] (avg_num_mismatches_as_fraction + # ref_stats.avg_mismatch_fraction # is not a filter criterion in VarScan fpfilter - if ref_stats[6] > max_ref_mmqs: + if ref_stats.avg_mismatch_qualsum > max_ref_mmqs: record.filter.add('RefMMQS') - if ref_stats[7] < min_ref_len: + if not is_indel and ( + ref_stats.avg_clipped_len < min_ref_len + ): # VarScan fpfilter does not apply this filter - # for indels, but there is no reason - # not to do it. + # for indels, so we do not do it either. 
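The allele recoding at the top of this hunk turns VCF REF/ALT pairs into the single-string spec that get_allele_stats parses back with partition(). Toy values:

    # deletion:  REF='ACG', ALT='A'   ->  ['A', 'A-CG']; matching reads
    #            must show p.indel == -2 at the site
    # insertion: REF='A',   ALT='AGT' ->  ['A', 'A+GT']; matching reads
    #            must show p.indel == +2 with read bases 'GT' following
    #            the anchor base
    ref_allele, alt_allele = 'ACG', 'A'
    alleles = [alt_allele,
               ref_allele.replace(alt_allele, alt_allele + '-', 1)]
    assert alleles == ['A', 'A-CG']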
record.filter.add('RefAvgRL') - if ref_stats[8] < min_ref_dist3: + if ref_stats.avg_dist_from_3prime < min_ref_dist3: record.filter.add('RefDist3') if alt_stats: - alt_count = alt_stats[2] + alt_stats[3] - if ( - alt_count < min_var_count2_lc - ) or ( - read_depth >= max_somatic_p_depth and - alt_count < min_var_count2 + alt_count = alt_stats.reads_fw + alt_stats.reads_rv + if alt_count < min_var_count2: + if ( + (alt_count + ref_count) >= min_var_count2_depth + ) or ( + alt_count < min_var_count2_lc + ): + record.filter.add('VarCount') + if alt_count / alt_stats.reads_total < min_var_freq2: + record.filter.add('VarFreq') + if not is_indel and ( + alt_stats.avg_basequal < min_var_basequal ): - record.filter.add('VarCount') - if alt_count / read_depth < min_var_freq2: - record.filter.add('VarFreq') - if alt_stats[1] < min_var_basequal: record.filter.add('VarBaseQual') - if alt_count > min_strand_reads: + if alt_count >= min_strand_reads: if ( - alt_stats[2] / alt_count < min_strandedness + alt_stats.reads_fw / alt_count < min_strandedness ) or ( - alt_stats[3] / alt_count < min_strandedness + alt_stats.reads_rv / alt_count < min_strandedness ): record.filter.add('Strand') - if alt_stats[2] + alt_stats[3] >= 2: - if alt_stats[0] < min_var_mapqual: + if alt_count >= 2: + if alt_stats.avg_mapqual < min_var_mapqual: record.filter.add('VarMapQual') - if alt_stats[4] < min_var_readpos: + if alt_stats.avg_dist_from_center < min_var_readpos: record.filter.add('VarReadPos') - # alt_stats[5] (avg_num_mismatches_as_fraction + # alt_stats.avg_mismatch_fraction # is not a filter criterion in VarScan fpfilter - if alt_stats[6] > max_var_mmqs: + if alt_stats.avg_mismatch_qualsum > max_var_mmqs: record.filter.add('VarMMQS') - if alt_stats[7] < min_var_len: + if not is_indel and ( + alt_stats.avg_clipped_len < min_var_len + ): # VarScan fpfilter does not apply this filter - # for indels, but there is no reason - # not to do it. + # for indels, so we do not do it either. record.filter.add('VarAvgRL') - if alt_stats[8] < min_var_dist3: + if alt_stats.avg_dist_from_3prime < min_var_dist3: record.filter.add('VarDist3') if ref_count >= 2 and alt_count >= 2: - if (ref_stats[0] - alt_stats[0]) > max_mapqual_diff: + if ( + ref_stats.avg_mapqual - alt_stats.avg_mapqual + ) > max_mapqual_diff: record.filter.add('MapQualDiff') - if (ref_stats[1] - alt_stats[1]) > max_basequal_diff: + if ( + ref_stats.avg_basequal - alt_stats.avg_basequal + ) > max_basequal_diff: record.filter.add('MaxBAQdiff') - mmqs_diff = alt_stats[6] - ref_stats[6] + mmqs_diff = ( + alt_stats.avg_mismatch_qualsum + - ref_stats.avg_mismatch_qualsum + ) if mmqs_diff < min_mmqs_diff: record.filter.add('MinMMQSdiff') if mmqs_diff > max_mmqs_diff: record.filter.add('MMQSdiff') if ( - 1 - alt_stats[7] / ref_stats[7] + 1 - + alt_stats.avg_clipped_len + / ref_stats.avg_clipped_len ) > max_relative_len_diff: record.filter.add('ReadLenDiff') else: @@ -994,6 +1233,8 @@ instance_args = { k: args.pop(k) for k in [ 'max_depth', + 'count_orphans', + 'detect_overlaps', 'min_mapqual', 'min_basequal', 'threads', @@ -1046,7 +1287,7 @@ ) p.add_argument( '-s', '--split-output', - dest='split_output', action='store_true', default=False, + dest='split_output', action='store_true', help='indicate that separate output files for SNPs and indels ' 'should be generated (original VarScan behavior). 
If specified, ' '%%T in the --ofile file name will be replaced with "snp" and ' @@ -1090,6 +1331,20 @@ 'default: 8000)' ) call_group.add_argument( + '--count-orphans', + dest='count_orphans', action='store_true', + help='Use anomalous read pairs in variant calling ' + '(samtools mpileup -A option; default: ignore anomalous pairs)' + ) + call_group.add_argument( + '--no-detect-overlaps', + dest='detect_overlaps', action='store_false', + help='Disable automatic read-pair overlap detection by samtools ' + 'mpileup. Overlap detection tries to avoid counting duplicated ' + 'bases twice during variant calling. ' + '(samtools mpileup -x option; default: use overlap detection)' + ) + call_group.add_argument( '--min-basequal', dest='min_basequal', type=int, default=13, @@ -1160,7 +1415,15 @@ dest='min_var_count2_lc', type=int, default=2, help='Minimum number of variant-supporting reads when depth below ' - '--somatic-p-depth (default: 2)' + '--min-var-count2-depth (default: 2)' + ) + filter_group.add_argument( + '--min-var-count2-depth', + dest='min_var_count2_depth', type=int, + default=10, + help='Combined depth of ref- and variant-supporting reads required to ' + 'apply the --min-var-count filter instead of --min-var-count-lc ' + '(default: 10)' ) filter_group.add_argument( '--min-var-freq2', @@ -1169,18 +1432,6 @@ help='Minimum variant allele frequency (default: 0.05)' ) filter_group.add_argument( - '--max-somatic-p', - dest='max_somatic_p', type=float, - default=0.05, - help='Maximum somatic p-value (default: 0.05)' - ) - filter_group.add_argument( - '--max-somatic-p-depth', - dest='max_somatic_p_depth', type=int, - default=10, - help='Depth required to run --max-somatic-p filter (default: 10)' - ) - filter_group.add_argument( '--min-ref-readpos', dest='min_ref_readpos', type=float, default=0.1, @@ -1209,6 +1460,13 @@ '3\'end of variant-supporting reads (default: 0.1)' ) filter_group.add_argument( + '--detect-q2-runs', + dest='detect_q2_runs', action='store_true', + help='Look for 3\'-terminal q2 runs and take their lengths into ' + 'account for determining the effective 3\'end of reads ' + '(default: off)' + ) + filter_group.add_argument( '--min-ref-len', dest='min_ref_len', type=int, default=90, @@ -1309,5 +1567,13 @@ default=50, help='Maximum mismatch quality sum difference (var - ref; default: 50)' ) + filter_group.add_argument( + '--ignore-md', + dest='ignore_md', action='store_true', + help='Do not rely on read MD tag, even if it is present on the mapped ' + 'reads, and recalculate mismatch quality stats from ref ' + 'alignments instead.' + ) + args = vars(p.parse_args()) varscan_call(**args)
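The reworked VarCount test ties the three --min-var-count2* options together, replacing the dropped --max-somatic-p/--max-somatic-p-depth pair. Restated as a standalone predicate (ours; defaults taken from the new argparse options, while --min-var-count2's own default is not shown in this hunk):

    def fails_varcount(alt_count, ref_count, min_var_count2,
                       min_var_count2_lc=2, min_var_count2_depth=10):
        # The strict threshold applies once combined ref+alt support
        # reaches min_var_count2_depth; below that depth the more
        # lenient min_var_count2_lc threshold is used instead.
        if alt_count >= min_var_count2:
            return False
        return ((alt_count + ref_count) >= min_var_count2_depth
                or alt_count < min_var_count2_lc)

    # e.g. with min_var_count2=4: 3 alt reads fail at combined depth 23,
    # but pass at combined depth 8 thanks to the low-coverage leniency
    assert fails_varcount(3, 20, 4) and not fails_varcount(3, 5, 4)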