# HG changeset patch # User iuc # Date 1565984994 14400 # Node ID 4e97191a1ff7adff761829b7405b7bf58f5673a1 # Parent b79bb8b09822420dbaa349a22b23d2b00208f8a9 "planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit fcf5ac14629c694f0f64773fab0428b1e78fe156" diff -r b79bb8b09822 -r 4e97191a1ff7 macros.xml --- a/macros.xml Thu Mar 28 18:19:00 2019 -0400 +++ b/macros.xml Fri Aug 16 15:49:54 2019 -0400 @@ -56,9 +56,9 @@ - + + label="Minimum supporting reads" help="@HELP@"/> + help="Minimum variant allele frequency (default: @VALUE@) required for calling a variant"/> + help="Minimum variant allele frequency (default: 0.75) required for calling a homozygous genotype" /> read_base_pos: + sum_dist_from_3prime += ( + read_last_effective_aln_pos - read_base_pos + ) / (unclipped_length - 1) + + if sum_mismatch_fractions >= 0 or sum_mismatch_qualities >= 0: + # sum_mismatch_fractions and sum_mismatch_qualities will be + # set to float('nan') if reference info (in form of an MD tag or + # an actual reference sequence) was not available for any previous + # read in the pileup. + # In that case, there is no point to trying to calculate + # mismatch stats for other reads anymore. + + # The following mismatch calculations use this logic: + # 1. For determining the edit distance between an aligned read and + # the reference, we follow the authoritative definition of the + # NM tag calculation in + # http://samtools.github.io/hts-specs/SAMtags.pdf + # + # For historical reasons, the result of this calculation will + # disagree more often than not with NM tag values calculated by + # other tools. + # If precalculated NM tag values are present on the aligned + # reads, these can be given preference through the use_nm flag. + # Doing so will mimick the behavior of bam-readcount, which + # requires and always just looks at NM tags. + # 2. For determining mismatch quality sums, a mismatch is defined + # differently and in accordance with the implementation in + # bam-readcount: + # - only mismatches (not inserted or deleted bases) are + # considered + # - 'N' in the reference is considered a match for any read base + # - any matching (case-insensitive) base in reference and read + # is considered a match, even if that base is not one of + # A, C, G or T. + # In both 1. and 2. above a '=' in the read is always considered a + # match, irrespective of the reference base. + + num_mismatches = 0 + if not ignore_md: + try: + # see if the read has an MD tag, in which case pysam can + # calculate the reference sequence for us + ref_seq = p.alignment.get_reference_sequence().upper() + except ValueError: + ignore_md = True + if not ignore_nm: + try: + num_mismatches = p.alignment.get_tag('NM') + except KeyError: + ignore_nm = True + + if ignore_md: + if not ref_fetch: + # cannot calculate mismatch stats without ref info + sum_mismatch_qualities = float('nan') + if ignore_nm: + sum_mismatch_fractions = float('nan') + else: + sum_mismatch_fractions += ( + num_mismatches / p.alignment.query_alignment_length + ) + continue + # Without an MD tag we need to extract the relevant part + # of the reference from the full sequence by position. + ref_positions = p.alignment.get_reference_positions() + ref_seq = ref_fetch( + ref_positions[0], ref_positions[-1] + 1 + ).upper() + + potential_matches = {'A', 'C', 'G', 'T'} + aligned_pairs = p.alignment.get_aligned_pairs(matches_only=True) + ref_offset = aligned_pairs[0][1] + last_mismatch_pos = None + mismatch_run_quals = [] + for qpos, rpos in aligned_pairs: + read_base = p.alignment.query_sequence[qpos].upper() + if read_base == '=': + # always treat the special read base '=' as a + # match, irrespective of the reference base + continue + ref_base = ref_seq[rpos - ref_offset] + # see if we have a mismatch to use for mismatch quality sum + # calculation + if ref_base != 'N' and ref_base != read_base: + if mm_runs: + if ( + last_mismatch_pos is None + ) or ( + last_mismatch_pos + 1 == qpos + ): + mismatch_run_quals.append( + p.alignment.query_qualities[qpos] + ) + else: + sum_mismatch_qualities += max(mismatch_run_quals) + mismatch_run_quals = [ + p.alignment.query_qualities[qpos] + ] + last_mismatch_pos = qpos + else: + sum_mismatch_qualities += ( + p.alignment.query_qualities[qpos] + ) + if ignore_nm: + # see if we have a mismatch that increases the edit + # distance according to the SAMtags specs + if ( + read_base not in potential_matches + ) or ( + ref_base not in potential_matches + ) or ( + read_base != ref_base + ): + num_mismatches += 1 + + if mismatch_run_quals: + sum_mismatch_qualities += max(mismatch_run_quals) + if ignore_nm: + # use the number of mismatches calculated above, + # but add inserted and deleted bases to it + cigar_stats = p.alignment.get_cigar_stats()[0] + num_mismatches += cigar_stats[1] + cigar_stats[2] + sum_mismatch_fractions += ( + num_mismatches / p.alignment.query_alignment_length + ) + + return ( + var_reads_plus, + var_reads_minus, + sum_mapping_qualities, + sum_base_qualities, + sum_dist_from_center, + sum_mismatch_fractions, + sum_mismatch_qualities, + sum_clipped_length, + sum_dist_from_3prime + ) + + +def get_allele_stats(reads, pos, alleles, + ref=None, ignore_md=True, ignore_nm=True, + mm_runs=True, detect_q2_runs=False, + pileup_args=None): + chrom, start, stop = pos + if pileup_args is None: + pileup_args = {} + if pileup_args.get('stepper') == 'samtools': + if pileup_args.get('compute_baq', True) is not False: + if 'fastafile' not in pileup_args: + pileup_args['fastafile'] = ref + # be careful when passing in a custom 'fastafile' option: + # providing it slows down pysam tremendously even if the option + # isn't actually required. + + pile = reads.pileup( + chrom, start, stop, + **pileup_args + ) + # apply false-positive filtering a la varscan fpfilter + # find the variant site in the pileup columns + for pile_column in pile: + if pile_column.reference_pos == start: + break + else: + # With no reads covering the genomic position + # we can only return "empty" allele stats + return [ + AlleleStats(0, 0, 0, *[float('nan')] * 7) + for allele in alleles + ] + + # extract required information + # overall read depth at the site + read_depth = pile_column.get_num_aligned() + assert read_depth > 0 + + alleles_stats = [] + for allele in alleles: + if '-' in allele: + allele, indel_type, indel_after = allele.partition('-') + indel_len = -len(indel_after) + elif '+' in allele: + allele, indel_type, indel_after = allele.partition('+') + indel_len = len(indel_after) + else: + indel_type = None + + stats_it = ( + p for p in pile_column.pileups + # skip reads that don't support the allele we're looking for + if (p.query_position is not None) + and (p.alignment.query_sequence[p.query_position] == allele) + ) + if indel_type == '-': + stats_it = ( + p for p in stats_it if p.indel == indel_len + ) + elif indel_type == '+': + stats_it = ( + p for p in stats_it if ( + p.indel == indel_len + ) and ( + p.alignment.query_sequence[ + p.query_position + 1: + p.query_position + 1 + indel_len + ] == indel_after + ) + ) + allele_stats = _get_allele_specific_pileup_column_stats( + stats_it, + partial( + pysam.FastaFile.fetch, ref, chrom + ) if ref else None, + ignore_md, + ignore_nm, + mm_runs, + detect_q2_runs + ) + + allele_reads_total = allele_stats[0] + allele_stats[1] + if allele_reads_total == 0: + # No stats without reads! + alleles_stats.append( + AlleleStats(read_depth, 0, 0, *[float('nan')] * 7) + ) + else: + alleles_stats.append( + AlleleStats( + read_depth, allele_stats[0], allele_stats[1], + *(i / allele_reads_total for i in allele_stats[2:]) + ) + ) + return alleles_stats + + class VariantCallingError (RuntimeError): """Exception class for issues with samtools and varscan subprocesses.""" @@ -40,13 +367,15 @@ class VarScanCaller (object): def __init__(self, ref_genome, bam_input_files, - max_depth=None, + max_depth=None, count_orphans=False, detect_overlaps=True, min_mapqual=None, min_basequal=None, threads=1, verbose=False, quiet=True ): self.ref_genome = ref_genome self.bam_input_files = bam_input_files self.max_depth = max_depth + self.count_orphans = count_orphans + self.detect_overlaps = detect_overlaps self.min_mapqual = min_mapqual self.min_basequal = min_basequal self.threads = threads @@ -67,15 +396,38 @@ self.tmpfiles = [] def _get_pysam_pileup_args(self): - param_dict = {} + # Compute the pileup args dynamically to account for possibly updated + # instance attributes. + + # fixed default parameters + # Note on the fixed default for 'ignore_overlaps': + # In order to use the exact same pileup parameters during variant + # calling and postprocessing, we would have to compute the setting + # dynamically like so: + # if not self.detect_overlaps: + # param_dict['ignore_overlaps'] = False + # However, samtools/pysam implement overlap correction by manipulating + # (setting to zero) the lower qualities of bases that are part of an + # overlap (see + # https://sourceforge.net/p/samtools/mailman/message/32793789/). This + # manipulation has such a big undesirable effect on base quality stats + # calculated during postprocessing that calculating the stats on a + # slightly different pileup seems like the lesser evil. + + param_dict = { + 'ignore_overlaps': False, + 'compute_baq': False, + 'stepper': 'samtools', + } + # user-controllable parameters + if self.count_orphans: + param_dict['ignore_orphans'] = False if self.max_depth is not None: param_dict['max_depth'] = self.max_depth if self.min_mapqual is not None: param_dict['min_mapping_quality'] = self.min_mapqual if self.min_basequal is not None: param_dict['min_base_quality'] = self.min_basequal - param_dict['compute_baq'] = False - param_dict['stepper'] = 'samtools' return param_dict def varcall_parallel(self, normal_purity=None, tumor_purity=None, @@ -108,6 +460,10 @@ if value is not None: varcall_engine_options += [option, str(value)] pileup_engine_options = ['-B'] + if self.count_orphans: + pileup_engine_options += ['-A'] + if not self.detect_overlaps: + pileup_engine_options += ['-x'] if self.max_depth is not None: pileup_engine_options += ['-d', str(self.max_depth)] if self.min_mapqual is not None: @@ -506,154 +862,12 @@ del record.format['RD'] del record.format['DP4'] - def get_allele_specific_pileup_column_stats( - self, allele, pile_column, ref_fetch, use_md=True - ): - var_reads_plus = var_reads_minus = 0 - sum_mapping_qualities = 0 - sum_base_qualities = 0 - sum_dist_from_center = 0 - sum_dist_from_3prime = 0 - sum_clipped_length = 0 - sum_unclipped_length = 0 - sum_num_mismatches_as_fraction = 0 - sum_mismatch_qualities = 0 - - for p in pile_column.pileups: - # skip reads that don't support the allele we're looking for - if p.query_position is None: - continue - if p.alignment.query_sequence[p.query_position] != allele: - continue - - if p.alignment.is_reverse: - var_reads_minus += 1 - else: - var_reads_plus += 1 - sum_mapping_qualities += p.alignment.mapping_quality - sum_base_qualities += p.alignment.query_qualities[p.query_position] - sum_clipped_length += p.alignment.query_alignment_length - unclipped_length = p.alignment.query_length - sum_unclipped_length += unclipped_length - # The following calculations are all in 1-based coordinates - # with respect to the physical 5'-end of the read sequence. - if p.alignment.is_reverse: - read_base_pos = unclipped_length - p.query_position - read_first_aln_pos = ( - unclipped_length - p.alignment.query_alignment_end + 1 - ) - read_last_aln_pos = ( - unclipped_length - p.alignment.query_alignment_start - ) - else: - read_base_pos = p.query_position + 1 - read_first_aln_pos = p.alignment.query_alignment_start + 1 - read_last_aln_pos = p.alignment.query_alignment_end - # Note: the original bam-readcount algorithm uses the less accurate - # p.alignment.query_alignment_length / 2 as the read center - read_center = (read_first_aln_pos + read_last_aln_pos) / 2 - sum_dist_from_center += 1 - abs( - read_base_pos - read_center - ) / (read_center - 1) - # Note: the original bam-readcount algorithm uses a more - # complicated definition of the 3'-end of a read sequence, which - # clips off runs of bases with a quality scores of exactly 2. - sum_dist_from_3prime += ( - read_last_aln_pos - read_base_pos - ) / (unclipped_length - 1) - - sum_num_mismatches = 0 - if use_md: - try: - # see if the read has an MD tag, in which case pysam can - # calculate the reference sequence for us - aligned_pairs = p.alignment.get_aligned_pairs( - matches_only=True, with_seq=True - ) - except ValueError: - use_md = False - if use_md: - # The ref sequence got calculated from alignment and MD tag. - for qpos, rpos, ref_base in aligned_pairs: - # pysam uses lowercase ref bases to indicate mismatches - if ( - ref_base == 'a' or ref_base == 't' or - ref_base == 'g' or ref_base == 'c' - ): - sum_num_mismatches += 1 - sum_mismatch_qualities += p.alignment.query_qualities[ - qpos - ] - else: - # We need to obtain the aligned portion of the reference - # sequence. - aligned_pairs = p.alignment.get_aligned_pairs( - matches_only=True - ) - # note that ref bases can be lowercase - ref_seq = ref_fetch( - aligned_pairs[0][1], aligned_pairs[-1][1] + 1 - ).upper() - ref_offset = aligned_pairs[0][1] - - for qpos, rpos in p.alignment.get_aligned_pairs( - matches_only=True - ): - # see if we have a mismatch to the reference at this - # position, but note that - # - the query base may be lowercase (SAM/BAM spec) - # - an '=', if present in the query seq, indicates a match - # to the reference (SAM/BAM spec) - # - there cannot be a mismatch with an N in the reference - ref_base = ref_seq[rpos - ref_offset] - read_base = p.alignment.query_sequence[qpos].upper() - if ( - read_base != '=' and ref_base != 'N' - ) and ( - ref_base != read_base - ): - sum_num_mismatches += 1 - sum_mismatch_qualities += p.alignment.query_qualities[ - qpos - ] - sum_num_mismatches_as_fraction += ( - sum_num_mismatches / p.alignment.query_alignment_length - ) - - var_reads_total = var_reads_plus + var_reads_minus - if var_reads_total == 0: - # No stats without reads! - return None - avg_mapping_quality = sum_mapping_qualities / var_reads_total - avg_basequality = sum_base_qualities / var_reads_total - avg_pos_as_fraction = sum_dist_from_center / var_reads_total - avg_num_mismatches_as_fraction = ( - sum_num_mismatches_as_fraction / var_reads_total - ) - avg_sum_mismatch_qualities = sum_mismatch_qualities / var_reads_total - avg_clipped_length = sum_clipped_length / var_reads_total - avg_distance_to_effective_3p_end = ( - sum_dist_from_3prime / var_reads_total - ) - - return ( - avg_mapping_quality, - avg_basequality, - var_reads_plus, - var_reads_minus, - avg_pos_as_fraction, - avg_num_mismatches_as_fraction, - avg_sum_mismatch_qualities, - avg_clipped_length, - avg_distance_to_effective_3p_end - ) - def _postprocess_variant_records(self, invcf, min_var_count2, min_var_count2_lc, - min_var_freq2, max_somatic_p, - max_somatic_p_depth, + min_var_freq2, min_var_count2_depth, min_ref_readpos, min_var_readpos, min_ref_dist3, min_var_dist3, + detect_q2_runs, min_ref_len, min_var_len, max_relative_len_diff, min_strandedness, min_strand_reads, @@ -663,6 +877,7 @@ max_mapqual_diff, max_ref_mmqs, max_var_mmqs, min_mmqs_diff, max_mmqs_diff, + ignore_md, **args): # set FILTER field according to Varscan criteria # multiple FILTER entries must be separated by semicolons @@ -681,134 +896,158 @@ ) refseq = io_stack.enter_context(pysam.FastaFile(self.ref_genome)) pileup_args = self._get_pysam_pileup_args() + _get_stats = get_allele_stats for record in invcf: - if any(len(allele) > 1 for allele in record.alleles): - # skip indel postprocessing for the moment - yield record - continue + is_indel = False if record.alleles[0] == 'N': # It makes no sense to call SNPs against an unknown # reference base continue + if len(record.alleles[0]) > 1: + # a deletion + is_indel = True + alleles = [ + record.alleles[1], record.alleles[0].replace( + record.alleles[1], record.alleles[1] + '-', 1 + ) + ] + elif len(record.alleles[1]) > 1: + # an insertion + is_indel = True + alleles = [ + record.alleles[0], + record.alleles[1].replace( + record.alleles[0], record.alleles[0] + '+', 1 + ) + ] + else: + # a regular SNV + alleles = record.alleles[:2] # get pileup for genomic region affected by this variant if record.info['SS'] == '2': # a somatic variant => generate pileup from tumor data - pile = tumor_reads.pileup( - record.chrom, record.start, record.stop, - **pileup_args - ) - sample_of_interest = 'TUMOR' + reads_of_interest = tumor_reads + calculate_ref_stats = record.samples['TUMOR']['RD'] > 0 elif record.info['SS'] in ['1', '3']: # a germline or LOH variant => pileup from normal data - pile = normal_reads.pileup( - record.chrom, record.start, record.stop, - **pileup_args - ) - sample_of_interest = 'NORMAL' + reads_of_interest = normal_reads + calculate_ref_stats = record.samples['NORMAL']['RD'] > 0 else: # TO DO: figure out if there is anything interesting to do # for SS status codes 0 (reference) and 5 (unknown) yield record continue - # apply false-positive filtering a la varscan fpfilter - # find the variant site in the pileup columns - for pile_column in pile: - if pile_column.reference_pos == record.start: - break - # extract required information - # overall read depth at the site - read_depth = pile_column.get_num_aligned() - assert read_depth > 0 # no multiallelic sites in varscan assert len(record.alleles) == 2 - if record.samples[sample_of_interest]['RD'] > 0: - ref_stats, alt_stats = [ - self.get_allele_specific_pileup_column_stats( - allele, - pile_column, - partial( - pysam.FastaFile.fetch, refseq, record.chrom - ) - ) - for allele in record.alleles - ] + + if calculate_ref_stats: + ref_stats, alt_stats = _get_stats( + reads_of_interest, + (record.chrom, record.start, record.stop), + alleles, + refseq, + ignore_md=ignore_md, + ignore_nm=False, + mm_runs=True, + detect_q2_runs=detect_q2_runs, + pileup_args=pileup_args + ) else: ref_stats = None - alt_stats = self.get_allele_specific_pileup_column_stats( - record.alleles[1], - pile_column, - partial(pysam.FastaFile.fetch, refseq, record.chrom) - ) + alt_stats = _get_stats( + reads_of_interest, + (record.chrom, record.start, record.stop), + alleles[1:2], + refseq, + ignore_md=ignore_md, + ignore_nm=False, + mm_runs=True, + detect_q2_runs=detect_q2_runs, + pileup_args=pileup_args + )[0] ref_count = 0 if ref_stats: - ref_count = ref_stats[2] + ref_stats[3] - if ref_stats[1] < min_ref_basequal: + ref_count = ref_stats.reads_fw + ref_stats.reads_rv + if ref_stats.avg_basequal < min_ref_basequal: record.filter.add('RefBaseQual') if ref_count >= 2: - if ref_stats[0] < min_ref_mapqual: + if ref_stats.avg_mapqual < min_ref_mapqual: record.filter.add('RefMapQual') - if ref_stats[4] < min_ref_readpos: + if ref_stats.avg_dist_from_center < min_ref_readpos: record.filter.add('RefReadPos') - # ref_stats[5] (avg_num_mismatches_as_fraction + # ref_stats.avg_mismatch_fraction # is not a filter criterion in VarScan fpfilter - if ref_stats[6] > max_ref_mmqs: + if ref_stats.avg_mismatch_qualsum > max_ref_mmqs: record.filter.add('RefMMQS') - if ref_stats[7] < min_ref_len: + if not is_indel and ( + ref_stats.avg_clipped_len < min_ref_len + ): # VarScan fpfilter does not apply this filter - # for indels, but there is no reason - # not to do it. + # for indels, so we do not do it either. record.filter.add('RefAvgRL') - if ref_stats[8] < min_ref_dist3: + if ref_stats.avg_dist_from_3prime < min_ref_dist3: record.filter.add('RefDist3') if alt_stats: - alt_count = alt_stats[2] + alt_stats[3] - if ( - alt_count < min_var_count2_lc - ) or ( - read_depth >= max_somatic_p_depth and - alt_count < min_var_count2 + alt_count = alt_stats.reads_fw + alt_stats.reads_rv + if alt_count < min_var_count2: + if ( + (alt_count + ref_count) >= min_var_count2_depth + ) or ( + alt_count < min_var_count2_lc + ): + record.filter.add('VarCount') + if alt_count / alt_stats.reads_total < min_var_freq2: + record.filter.add('VarFreq') + if not is_indel and ( + alt_stats.avg_basequal < min_var_basequal ): - record.filter.add('VarCount') - if alt_count / read_depth < min_var_freq2: - record.filter.add('VarFreq') - if alt_stats[1] < min_var_basequal: record.filter.add('VarBaseQual') - if alt_count > min_strand_reads: + if alt_count >= min_strand_reads: if ( - alt_stats[2] / alt_count < min_strandedness + alt_stats.reads_fw / alt_count < min_strandedness ) or ( - alt_stats[3] / alt_count < min_strandedness + alt_stats.reads_rv / alt_count < min_strandedness ): record.filter.add('Strand') - if alt_stats[2] + alt_stats[3] >= 2: - if alt_stats[0] < min_var_mapqual: + if alt_count >= 2: + if alt_stats.avg_mapqual < min_var_mapqual: record.filter.add('VarMapQual') - if alt_stats[4] < min_var_readpos: + if alt_stats.avg_dist_from_center < min_var_readpos: record.filter.add('VarReadPos') - # alt_stats[5] (avg_num_mismatches_as_fraction + # alt_stats.avg_mismatch_fraction # is not a filter criterion in VarScan fpfilter - if alt_stats[6] > max_var_mmqs: + if alt_stats.avg_mismatch_qualsum > max_var_mmqs: record.filter.add('VarMMQS') - if alt_stats[7] < min_var_len: + if not is_indel and ( + alt_stats.avg_clipped_len < min_var_len + ): # VarScan fpfilter does not apply this filter - # for indels, but there is no reason - # not to do it. + # for indels, so we do not do it either. record.filter.add('VarAvgRL') - if alt_stats[8] < min_var_dist3: + if alt_stats.avg_dist_from_3prime < min_var_dist3: record.filter.add('VarDist3') if ref_count >= 2 and alt_count >= 2: - if (ref_stats[0] - alt_stats[0]) > max_mapqual_diff: + if ( + ref_stats.avg_mapqual - alt_stats.avg_mapqual + ) > max_mapqual_diff: record.filter.add('MapQualDiff') - if (ref_stats[1] - alt_stats[1]) > max_basequal_diff: + if ( + ref_stats.avg_basequal - alt_stats.avg_basequal + ) > max_basequal_diff: record.filter.add('MaxBAQdiff') - mmqs_diff = alt_stats[6] - ref_stats[6] + mmqs_diff = ( + alt_stats.avg_mismatch_qualsum + - ref_stats.avg_mismatch_qualsum + ) if mmqs_diff < min_mmqs_diff: record.filter.add('MinMMQSdiff') if mmqs_diff > max_mmqs_diff: record.filter.add('MMQSdiff') if ( - 1 - alt_stats[7] / ref_stats[7] + 1 - + alt_stats.avg_clipped_len + / ref_stats.avg_clipped_len ) > max_relative_len_diff: record.filter.add('ReadLenDiff') else: @@ -994,6 +1233,8 @@ instance_args = { k: args.pop(k) for k in [ 'max_depth', + 'count_orphans', + 'detect_overlaps', 'min_mapqual', 'min_basequal', 'threads', @@ -1046,7 +1287,7 @@ ) p.add_argument( '-s', '--split-output', - dest='split_output', action='store_true', default=False, + dest='split_output', action='store_true', help='indicate that separate output files for SNPs and indels ' 'should be generated (original VarScan behavior). If specified, ' '%%T in the --ofile file name will be replaced with "snp" and ' @@ -1090,6 +1331,20 @@ 'default: 8000)' ) call_group.add_argument( + '--count-orphans', + dest='count_orphans', action='store_true', + help='Use anomalous read pairs in variant calling ' + '(samtools mpileup -A option; default: ignore anomalous pairs)' + ) + call_group.add_argument( + '--no-detect-overlaps', + dest='detect_overlaps', action='store_false', + help='Disable automatic read-pair overlap detection by samtools ' + 'mpileup. Overlap detection tries to avoid counting duplicated ' + 'bases twice during variant calling. ' + '(samtools mpileup -x option; default: use overlap detection)' + ) + call_group.add_argument( '--min-basequal', dest='min_basequal', type=int, default=13, @@ -1160,7 +1415,15 @@ dest='min_var_count2_lc', type=int, default=2, help='Minimum number of variant-supporting reads when depth below ' - '--somatic-p-depth (default: 2)' + '--min-var-count2-depth (default: 2)' + ) + filter_group.add_argument( + '--min-var-count2-depth', + dest='min_var_count2_depth', type=int, + default=10, + help='Combined depth of ref- and variant-supporting reads required to ' + 'apply the --min-var-count filter instead of --min-var-count-lc ' + '(default: 10)' ) filter_group.add_argument( '--min-var-freq2', @@ -1169,18 +1432,6 @@ help='Minimum variant allele frequency (default: 0.05)' ) filter_group.add_argument( - '--max-somatic-p', - dest='max_somatic_p', type=float, - default=0.05, - help='Maximum somatic p-value (default: 0.05)' - ) - filter_group.add_argument( - '--max-somatic-p-depth', - dest='max_somatic_p_depth', type=int, - default=10, - help='Depth required to run --max-somatic-p filter (default: 10)' - ) - filter_group.add_argument( '--min-ref-readpos', dest='min_ref_readpos', type=float, default=0.1, @@ -1209,6 +1460,13 @@ '3\'end of variant-supporting reads (default: 0.1)' ) filter_group.add_argument( + '--detect-q2-runs', + dest='detect_q2_runs', action='store_true', + help='Look for 3\'-terminal q2 runs and take their lengths into ' + 'account for determining the effective 3\'end of reads ' + '(default: off)' + ) + filter_group.add_argument( '--min-ref-len', dest='min_ref_len', type=int, default=90, @@ -1309,5 +1567,13 @@ default=50, help='Maximum mismatch quality sum difference (var - ref; default: 50)' ) + filter_group.add_argument( + '--ignore-md', + dest='ignore_md', action='store_true', + help='Do not rely on read MD tag, even if it is present on the mapped ' + 'reads, and recalculate mismatch quality stats from ref ' + 'alignments instead.' + ) + args = vars(p.parse_args()) varscan_call(**args) diff -r b79bb8b09822 -r 4e97191a1ff7 varscan_somatic.xml --- a/varscan_somatic.xml Thu Mar 28 18:19:00 2019 -0400 +++ b/varscan_somatic.xml Fri Aug 16 15:49:54 2019 -0400 @@ -1,4 +1,4 @@ - + Call germline/somatic and LOH variants from tumor-normal sample pairs macros.xml @@ -28,6 +28,16 @@ text="##FILTER=<ID=RefDist3,Description=" /> + +
+ + + + +
+
python @@ -63,8 +73,11 @@ --threads \${GALAXY_SLOTS:-2} #if str($call_params.settings) == "custom": ## samtools mpileup parameters - --min-basequal ${call_params.min_avg_qual} - --min-mapqual ${call_params.min_mapqual} + --min-basequal ${call_params.read_selection.min_basequal} + --min-mapqual ${call_params.read_selection.min_mapqual} + ${call_params.read_selection.count_orphans} + ${call_params.read_selection.detect_overlaps} + --max-pileup-depth ${call_params.read_selection.max_pileup_depth} ## VarScan parameters --min-coverage ${call_params.min_coverage} --min-var-count ${call_params.min_reads2} @@ -75,56 +88,61 @@ #end if #if str($filter_params.settings) == "no_filter": --no-filters - #elif str($filter_params.settings) == "dream3_settings": - --min-var-count2 3 - --min-var-count2-lc 1 - --min-var-freq2 0.05 - --max-somatic-p 0.05 - --max-somatic-p-depth 10 - --min-ref-readpos 0.2 - --min-var-readpos 0.15 - --min-ref-dist3 0.2 - --min-var-dist3 0.15 - --min-ref-len 90 - --min-var-len 90 - --max-len-diff 0.05 - --min-strandedness 0 - --min-strand-reads 5 - --min-ref-basequal 15 - --min-var-basequal 30 - --max-basequal-diff 50 - --min-ref-mapqual 20 - --min-var-mapqual 30 - --max-mapqual-diff 10 - --max-ref-mmqs 50 - --max-var-mmqs 100 - --min-mmqs-diff 0 - --max-mmqs-diff 50 - #elif str($filter_params.settings) == "custom": - --min-var-count2 ${filter_params.min_var_count} - --min-var-count2-lc ${filter_params.min_var_count_lc} - --min-var-freq2 ${filter_params.min_var_freq2} - --max-somatic-p ${filter_params.max_somatic_p} - --max-somatic-p-depth ${filter_params.max_somatic_p_depth} - --min-ref-readpos ${filter_params.min_ref_readpos} - --min-var-readpos ${filter_params.min_var_readpos} - --min-ref-dist3 ${filter_params.min_ref_dist3} - --min-var-dist3 ${filter_params.min_var_dist3} - --min-ref-len ${filter_params.min_ref_len} - --min-var-len ${filter_params.min_var_len} - --max-len-diff ${filter_params.max_len_diff} - --min-strandedness ${filter_params.min_strandedness} - --min-strand-reads ${filter_params.min_strand_reads} - --min-ref-basequal ${filter_params.min_ref_basequal} - --min-var-basequal ${filter_params.min_var_basequal} - --max-basequal-diff ${filter_params.max_basequal_diff} - --min-ref-mapqual ${filter_params.min_ref_mapqual} - --min-var-mapqual ${filter_params.min_var_mapqual} - --max-mapqual-diff ${filter_params.max_mapqual_diff} - --max-ref-mmqs ${filter_params.max_ref_mmqs} - --max-var-mmqs ${filter_params.max_var_mmqs} - --min-mmqs-diff ${filter_params.min_mmqs_diff} - --max-mmqs-diff ${filter_params.max_mmqs_diff} + #else: + #if str($filter_params.settings) == "dream3_settings": + --min-var-count2 3 + --min-var-count2-lc 1 + --min-var-count2-depth 10 + --min-var-freq2 0.05 + --min-ref-readpos 0.2 + --min-var-readpos 0.15 + --min-ref-dist3 0.2 + --min-var-dist3 0.15 + --min-ref-len 90 + --min-var-len 90 + --max-len-diff 0.05 + --min-strandedness 0 + --min-strand-reads 5 + --min-ref-basequal 15 + --min-var-basequal 30 + --max-basequal-diff 50 + --min-ref-mapqual 20 + --min-var-mapqual 30 + --max-mapqual-diff 10 + --max-ref-mmqs 50 + --max-var-mmqs 100 + --min-mmqs-diff 0 + --max-mmqs-diff 50 + #elif str($filter_params.settings) == "custom": + --min-var-count2 ${filter_params.min_var_count} + --min-var-count2-lc ${filter_params.min_var_count_lc} + --min-var-count2-depth ${filter_params.min_var_count_depth} + --min-var-freq2 ${filter_params.min_var_freq2} + --min-ref-readpos ${filter_params.min_ref_readpos} + --min-var-readpos ${filter_params.min_var_readpos} + --min-ref-dist3 ${filter_params.min_ref_dist3} + --min-var-dist3 ${filter_params.min_var_dist3} + --min-ref-len ${filter_params.min_ref_len} + --min-var-len ${filter_params.min_var_len} + --max-len-diff ${filter_params.max_len_diff} + --min-strandedness ${filter_params.min_strandedness} + --min-strand-reads ${filter_params.min_strand_reads} + --min-ref-basequal ${filter_params.min_ref_basequal} + --min-var-basequal ${filter_params.min_var_basequal} + --max-basequal-diff ${filter_params.max_basequal_diff} + --min-ref-mapqual ${filter_params.min_ref_mapqual} + --min-var-mapqual ${filter_params.min_var_mapqual} + --max-mapqual-diff ${filter_params.max_mapqual_diff} + --max-ref-mmqs ${filter_params.max_ref_mmqs} + --max-var-mmqs ${filter_params.max_var_mmqs} + --min-mmqs-diff ${filter_params.min_mmqs_diff} + --max-mmqs-diff ${filter_params.max_mmqs_diff} + #end if + #if $filter_params.experts_only.compat_opts: + #for $opt in str($filter_params.experts_only.compat_opts).split(','): + $opt + #end for + #end if #end if --verbose '$ref_genome' @@ -166,22 +184,35 @@ - - +
+ + + + + +
- + help="Minimum site coverage (default: 8) required in the normal and in the tumor sample to call a variant. This threshold gets applied after eliminating reads based on the read selection criteria above." /> + + help="The p-value threshold (default: 0.99) used to determine if a variant should be called for either sample" /> + help="The p-value threshold (default: 0.05) used to determine if read count differences between the normal and the tumor sample justify classification of a variant as somatic or as an LOH event" />
@@ -192,82 +223,84 @@ - - + + + + + + + help="(default: 4)" /> + help="This setting (default: 2) will be applied instead of the --min-var-count limit for sites with poor overall (see threshold below) coverage." /> + - - + help="(default: 0.05)" /> + help="The minimum average relative distance from either end of ref-supporting reads (default: 0.1) required for variant sites" /> + help="The minimum average relative distance from either end of variant-supporting reads (default: 0.1) required for variant sites" /> + help="The minimum average relative distance from the effective 3'end of ref-supporting reads (default: 0.1) required for variant sites. The effective 3'end is defined by the end of the alignment of the read to the reference (or, if the Illumina 1.5 compatibility setting is used, by the first base in 3'->5' direction with a base quality > 2)." /> + help="The minimum average relative distance from the effective 3'end of variant-supporting reads (default: 0.1) required for variant sites. The effective 3'end is defined as above." /> + help="The minimum average trimmed length (default: 90) required for reads supporting the reference allele" /> + help="The minimum average trimmed length (default: 90) required for reads supporting the variant allele" /> + help="The maximum allowed difference (default: 0.25) in the average relative read length (ref - var) between reads supporting the reference and the variant allele" /> + help="The minimum fraction (default: 0.01) of variant reads that are required to come from the forward and from the reverse strand" /> + help="(default: 5)" /> + help="The minimum average base quality (default: 15) required at the variant site for reads supporting the reference allele" /> + help="The minimum average base quality (default: 15) required at the variant site for reads supporting the variant allele" /> + help="The maximum difference (default: 50) in the average base quality (ref - var) allowed between the variant site positions of reads supporting the reference and the variant allele" /> + help="The minimum average mapping quality (default: 15) required for reads supporting the reference allele" /> + help="The minimum average mapping quality (default: 15) required for reads supporting the variant allele" /> + help="The maximum difference (default: 50) in the average mapping quality (ref - var) allowed between reads supporting the reference and the variant allele" /> + help="The maximum mismatch base quality sum (default: 100) allowed for reads supporting the reference allele" /> + help="The maximum mismatch base quality sum (default: 100) allowed for reads supporting the variant allele" /> + help="The minimum difference (default: 0) in the mismatch base quality sums (var - ref) required between reads supporting the variant and the reference allele" /> + help="The maximum difference (default: 50) in the mismatch base quality sums (var - ref) allowed between reads supporting the variant and the reference allele" /> + @@ -286,6 +319,7 @@ + @@ -305,6 +339,7 @@ + @@ -324,6 +359,7 @@ + @@ -347,6 +383,7 @@ + @@ -355,10 +392,12 @@ +
+ +
- @@ -395,6 +434,7 @@
+ @@ -414,6 +454,7 @@ + @@ -433,6 +474,7 @@ + @@ -490,13 +532,6 @@ This tool wraps the functionality of the ``varscan somatic`` and the ``varscan fpfilter`` command line tools. -.. class:: infomark - - The wrapper aims at providing the same functionality as the - ``varscan fpfilter`` tool, but implements it using ``pysam`` internally. - Note that, as one limitation compared to the original ``varscan`` tool, - the current version does not apply filters to indels! - The tool is designed to detect genetic variants in a **pair of samples** representing normal and tumor tissue from the same individual. It classifies the variants, according to their most likely origin, as **somatic** (variant is