Mercurial > repos > iuc > varscan_mpileup

--- a/test-data/hg19_chrM.fa	Mon Jan 21 12:05:47 2019 -0500
+++ b/test-data/hg19_chrM.fa	Thu Mar 28 18:19:19 2019 -0400
@@ -1,5 +1,5 @@
 >chrM
-GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT
+NNNCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT
 TTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTG
 GAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATT
 CTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACCTACTA
Binary file test-data/high_cov_chrM.bam has changed
--- a/test-data/test-cache/hg19_chrM.fa	Mon Jan 21 12:05:47 2019 -0500
+++ b/test-data/test-cache/hg19_chrM.fa	Thu Mar 28 18:19:19 2019 -0400
@@ -1,5 +1,5 @@
 >chrM
-GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT
+NNNCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT
 TTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTG
 GAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATT
 CTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACCTACTA
--- a/varscan.py	Mon Jan 21 12:05:47 2019 -0500
+++ b/varscan.py	Thu Mar 28 18:19:19 2019 -0400
@@ -402,7 +402,12 @@
         """Add standard FORMAT key declarations to a VCF header."""

         format_field_specs = [
-            ('GT', '1', 'String', 'Genotype'),
+            # pysam will not add quotes around single-word descriptions,
+            # which is a violation of the VCF specification.
+            # To work around this we are using two words as the
+            # GT format description (instead of the commonly used "Genotype").
+            # This could be changed once pysam learns to quote correctly.
+            ('GT', '1', 'String', 'Genotype code'),
             ('GQ', '1', 'Integer', 'Genotype quality'),
             ('DP', '1', 'Integer', 'Read depth'),
             ('AD', 'R', 'Integer', 'Read depth for each allele'),
@@ -423,24 +428,7 @@
         header.merge(temp_header)

     def _compile_common_header(self, varcall_template, no_filters=False):
-        # fix the header generated by VarScan
-        # by adding reference and contig information
-        common_header = pysam.VariantHeader()
-        common_header.add_meta('reference', value=self.ref_genome)
-        self._add_ref_contigs_to_header(common_header)
-        if not no_filters:
-            # add filter info
-            self._add_filters_to_header(common_header)
-        # change the source information
-        common_header.add_meta('source', value='varscan.py')
-        # declare an INDEL flag for record INFO fields
-        self._add_indel_info_flag_to_header(common_header)
-        # generate standard FORMAT field declarations
-        # including a correct AD declaration to prevent VarScan's
-        # non-standard one from getting propagated
-        self._standardize_format_fields(common_header)
-        # take the remaining metadata from the template header produced by
-        # VarScan
+        # read the header produced by VarScan for use as a template
         with pysam.VariantFile(varcall_template, 'r') as original_data:
             varscan_header = original_data.header
         # don't propagate non-standard and redundant FORMAT keys written
@@ -449,9 +437,29 @@
         varscan_header.formats.remove_header('FREQ')
         varscan_header.formats.remove_header('RD')
         varscan_header.formats.remove_header('DP4')
+        # build a new header containing information not written by VarScan
+        common_header = pysam.VariantHeader()
+        # copy over sample info from the VarScan template
         for sample in varscan_header.samples:
             common_header.samples.add(sample)
+        # add reference information
+        common_header.add_meta('reference', value=self.ref_genome)
+        # change the source information
+        common_header.add_meta('source', value='varscan.py')
+        # add contig information
+        self._add_ref_contigs_to_header(common_header)
+        # declare an INDEL flag for record INFO fields
+        self._add_indel_info_flag_to_header(common_header)
+        # merge in remaining information from the VarScan template
         common_header.merge(varscan_header)
+        # add additional FILTER declarations
+        if not no_filters:
+            # add filter info
+            self._add_filters_to_header(common_header)
+        # generate standard FORMAT field declarations
+        # including a correct AD declaration to prevent VarScan's
+        # non-standard one from getting propagated
+        self._standardize_format_fields(common_header)
         return common_header

     def _fix_record_gt_fields(self, record):
@@ -498,54 +506,12 @@
         del record.format['RD']
         del record.format['DP4']

-    def pileup_masker(self, mask):
-        def apply_mask_on_pileup(piled_items):
-            for item, status in zip(piled_items, mask):
-                if status:
-                    yield item
-        return apply_mask_on_pileup
-
     def get_allele_specific_pileup_column_stats(
-        self, allele, pile_column, ref_fetch
+        self, allele, pile_column, ref_fetch, use_md=True
     ):
-        # number of reads supporting the given allele on
-        # forward and reverse strand, and in total
         var_reads_plus = var_reads_minus = 0
-        var_supp_read_mask = []
-        for base in pile_column.get_query_sequences():
-            if base == allele:
-                # allele supporting read on + strand
-                var_reads_plus += 1
-                var_supp_read_mask.append(True)
-            elif base.upper() == allele:
-                # allele supporting read on - strand
-                var_reads_minus += 1
-                var_supp_read_mask.append(True)
-            else:
-                var_supp_read_mask.append(False)
-        var_reads_total = var_reads_plus + var_reads_minus
-
-        if var_reads_total == 0:
-            # No stats without reads!
-            return None
-
-        var_supp_only = self.pileup_masker(var_supp_read_mask)
-
-        # average mapping quality of the reads supporting the
-        # given allele
-        avg_mapping_quality = sum(
-            mq for mq in var_supp_only(
-                pile_column.get_mapping_qualities()
-            )
-        ) / var_reads_total
-
-        # for the remaining stats we need access to complete
-        # read information
-        piled_reads = [
-            p for p in var_supp_only(pile_column.pileups)
-        ]
-        assert len(piled_reads) == var_reads_total
-        sum_avg_base_qualities = 0
+        sum_mapping_qualities = 0
+        sum_base_qualities = 0
         sum_dist_from_center = 0
         sum_dist_from_3prime = 0
         sum_clipped_length = 0
@@ -553,28 +519,99 @@
         sum_num_mismatches_as_fraction = 0
         sum_mismatch_qualities = 0

-        for p in piled_reads:
-            sum_avg_base_qualities += sum(
-                p.alignment.query_qualities
-            ) / p.alignment.infer_query_length()
+        for p in pile_column.pileups:
+            # skip reads that don't support the allele we're looking for
+            if p.query_position is None:
+                continue
+            if p.alignment.query_sequence[p.query_position] != allele:
+                continue
+
+            if p.alignment.is_reverse:
+                var_reads_minus += 1
+            else:
+                var_reads_plus += 1
+            sum_mapping_qualities += p.alignment.mapping_quality
+            sum_base_qualities += p.alignment.query_qualities[p.query_position]
             sum_clipped_length += p.alignment.query_alignment_length
-            unclipped_length = p.alignment.infer_read_length()
+            unclipped_length = p.alignment.query_length
             sum_unclipped_length += unclipped_length
-            read_center = p.alignment.query_alignment_length / 2
+            # The following calculations are all in 1-based coordinates
+            # with respect to the physical 5'-end of the read sequence.
+            if p.alignment.is_reverse:
+                read_base_pos = unclipped_length - p.query_position
+                read_first_aln_pos = (
+                    unclipped_length - p.alignment.query_alignment_end + 1
+                )
+                read_last_aln_pos = (
+                    unclipped_length - p.alignment.query_alignment_start
+                )
+            else:
+                read_base_pos = p.query_position + 1
+                read_first_aln_pos = p.alignment.query_alignment_start + 1
+                read_last_aln_pos = p.alignment.query_alignment_end
+            # Note: the original bam-readcount algorithm uses the less accurate
+            # p.alignment.query_alignment_length / 2 as the read center
+            read_center = (read_first_aln_pos + read_last_aln_pos) / 2
             sum_dist_from_center += 1 - abs(
-                p.query_position - read_center
-            ) / read_center
-            if p.alignment.is_reverse:
-                sum_dist_from_3prime += p.query_position / unclipped_length
-            else:
-                sum_dist_from_3prime += 1 - p.query_position / unclipped_length
+                read_base_pos - read_center
+            ) / (read_center - 1)
+            # Note: the original bam-readcount algorithm uses a more
+            # complicated definition of the 3'-end of a read sequence, which
+            # clips off runs of bases with a quality scores of exactly 2.
+            sum_dist_from_3prime += (
+                read_last_aln_pos - read_base_pos
+            ) / (unclipped_length - 1)

             sum_num_mismatches = 0
-            for qpos, rpos in p.alignment.get_aligned_pairs():
-                if qpos is not None and rpos is not None:
-                    if p.alignment.query_sequence[qpos] != ref_fetch(
-                        rpos, rpos + 1
-                    ).upper():  # ref bases can be lowercase!
+            if use_md:
+                try:
+                    # see if the read has an MD tag, in which case pysam can
+                    # calculate the reference sequence for us
+                    aligned_pairs = p.alignment.get_aligned_pairs(
+                        matches_only=True, with_seq=True
+                    )
+                except ValueError:
+                    use_md = False
+            if use_md:
+                # The ref sequence got calculated from alignment and MD tag.
+                for qpos, rpos, ref_base in aligned_pairs:
+                    # pysam uses lowercase ref bases to indicate mismatches
+                    if (
+                        ref_base == 'a' or ref_base == 't' or
+                        ref_base == 'g' or ref_base == 'c'
+                    ):
+                        sum_num_mismatches += 1
+                        sum_mismatch_qualities += p.alignment.query_qualities[
+                            qpos
+                        ]
+            else:
+                # We need to obtain the aligned portion of the reference
+                # sequence.
+                aligned_pairs = p.alignment.get_aligned_pairs(
+                    matches_only=True
+                )
+                # note that ref bases can be lowercase
+                ref_seq = ref_fetch(
+                    aligned_pairs[0][1], aligned_pairs[-1][1] + 1
+                ).upper()
+                ref_offset = aligned_pairs[0][1]
+
+                for qpos, rpos in p.alignment.get_aligned_pairs(
+                    matches_only=True
+                ):
+                    # see if we have a mismatch to the reference at this
+                    # position, but note that
+                    # - the query base may be lowercase (SAM/BAM spec)
+                    # - an '=', if present in the query seq, indicates a match
+                    #   to the reference (SAM/BAM spec)
+                    # - there cannot be a mismatch with an N in the reference
+                    ref_base = ref_seq[rpos - ref_offset]
+                    read_base = p.alignment.query_sequence[qpos].upper()
+                    if (
+                        read_base != '=' and ref_base != 'N'
+                    ) and (
+                        ref_base != read_base
+                    ):
                         sum_num_mismatches += 1
                         sum_mismatch_qualities += p.alignment.query_qualities[
                             qpos
@@ -582,7 +619,13 @@
             sum_num_mismatches_as_fraction += (
                 sum_num_mismatches / p.alignment.query_alignment_length
             )
-        avg_basequality = sum_avg_base_qualities / var_reads_total
+
+        var_reads_total = var_reads_plus + var_reads_minus
+        if var_reads_total == 0:
+            # No stats without reads!
+            return None
+        avg_mapping_quality = sum_mapping_qualities / var_reads_total
+        avg_basequality = sum_base_qualities / var_reads_total
         avg_pos_as_fraction = sum_dist_from_center / var_reads_total
         avg_num_mismatches_as_fraction = (
             sum_num_mismatches_as_fraction / var_reads_total
@@ -643,6 +686,10 @@
                     # skip indel postprocessing for the moment
                     yield record
                     continue
+                if record.alleles[0] == 'N':
+                    # It makes no sense to call SNPs against an unknown
+                    # reference base
+                    continue
                 # get pileup for genomic region affected by this variant
                 if record.info['SS'] == '2':
                     # a somatic variant => generate pileup from tumor data
@@ -825,12 +872,15 @@
             except Exception:
                 pass

-        def noop_gen(data, **kwargs):
-            for d in data:
-                yield d
+        def filter_minimal(data, **kwargs):
+            for record in data:
+                if record.alleles[0] != 'N' or len(record.alleles[1]) > 1:
+                    # Yield everything except SNPs called against an unknown
+                    # reference base
+                    yield record

         if no_filters:
-            apply_filters = noop_gen
+            apply_filters = filter_minimal
         else:
             apply_filters = self._postprocess_variant_records