Repository 'varscan_mpileup'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/varscan_mpileup

Changeset 8:45288fb1f3f5 (2019-03-28)
Previous changeset 7:7c5e867820cf (2019-01-21) Next changeset 9:e3f170cc4f95 (2019-08-16)
Commit message:
planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit bd1034af3217cd9fc654d7187bf674a89465755b
modified:
test-data/hg19_chrM.fa
test-data/test-cache/hg19_chrM.fa
varscan.py
added:
test-data/high_cov_chrM.bam
diff -r 7c5e867820cf -r 45288fb1f3f5 test-data/hg19_chrM.fa
--- a/test-data/hg19_chrM.fa Mon Jan 21 12:05:47 2019 -0500
+++ b/test-data/hg19_chrM.fa Thu Mar 28 18:19:19 2019 -0400
@@ -1,5 +1,5 @@
 >chrM
-GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT
+NNNCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT
 TTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTG
 GAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATT
 CTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACCTACTA
diff -r 7c5e867820cf -r 45288fb1f3f5 test-data/high_cov_chrM.bam
Binary file test-data/high_cov_chrM.bam has changed
diff -r 7c5e867820cf -r 45288fb1f3f5 test-data/test-cache/hg19_chrM.fa
--- a/test-data/test-cache/hg19_chrM.fa Mon Jan 21 12:05:47 2019 -0500
+++ b/test-data/test-cache/hg19_chrM.fa Thu Mar 28 18:19:19 2019 -0400
@@ -1,5 +1,5 @@
 >chrM
-GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT
+NNNCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT
 TTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTG
 GAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATT
 CTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACCTACTA
diff -r 7c5e867820cf -r 45288fb1f3f5 varscan.py
--- a/varscan.py Mon Jan 21 12:05:47 2019 -0500
+++ b/varscan.py Thu Mar 28 18:19:19 2019 -0400
@@ -402,7 +402,12 @@
         """Add standard FORMAT key declarations to a VCF header."""
 
         format_field_specs = [
-            ('GT', '1', 'String', 'Genotype'),
+            # pysam will not add quotes around single-word descriptions,
+            # which is a violation of the VCF specification.
+            # To work around this we are using two words as the
+            # GT format description (instead of the commonly used "Genotype").
+            # This could be changed once pysam learns to quote correctly.
+            ('GT', '1', 'String', 'Genotype code'),
             ('GQ', '1', 'Integer', 'Genotype quality'),
             ('DP', '1', 'Integer', 'Read depth'),
             ('AD', 'R', 'Integer', 'Read depth for each allele'),
@@ -423,24 +428,7 @@
         header.merge(temp_header)
 
     def _compile_common_header(self, varcall_template, no_filters=False):
-        # fix the header generated by VarScan
-        # by adding reference and contig information
-        common_header = pysam.VariantHeader()
-        common_header.add_meta('reference', value=self.ref_genome)
-        self._add_ref_contigs_to_header(common_header)
-        if not no_filters:
-            # add filter info
-            self._add_filters_to_header(common_header)
-        # change the source information
-        common_header.add_meta('source', value='varscan.py')
-        # declare an INDEL flag for record INFO fields
-        self._add_indel_info_flag_to_header(common_header)
-        # generate standard FORMAT field declarations
-        # including a correct AD declaration to prevent VarScan's
-        # non-standard one from getting propagated
-        self._standardize_format_fields(common_header)
-        # take the remaining metadata from the template header produced by
-        # VarScan
+        # read the header produced by VarScan for use as a template
         with pysam.VariantFile(varcall_template, 'r') as original_data:
             varscan_header = original_data.header
         # don't propagate non-standard and redundant FORMAT keys written
@@ -449,9 +437,29 @@
         varscan_header.formats.remove_header('FREQ')
         varscan_header.formats.remove_header('RD')
         varscan_header.formats.remove_header('DP4')
+        # build a new header containing information not written by VarScan
+        common_header = pysam.VariantHeader()
+        # copy over sample info from the VarScan template
         for sample in varscan_header.samples:
             common_header.samples.add(sample)
+        # add reference information
+        common_header.add_meta('reference', value=self.ref_genome)
+        # change the source information
+        common_header.add_meta('source', value='varscan.py')
+        # add contig information
+        self._add_ref_contigs_to_header(common_header)
+        # declare an INDEL flag for record INFO fields
+        self._add_indel_info_flag_to_header(common_header)
+        # merge in remaining information from the VarScan template
        common_header.merge(varscan_header)
+        # add additional FILTER declarations
+        if not no_filters:
+            # add filter info
+            self._add_filters_to_header(common_header)
+        # generate standard FORMAT field declarations
+        # including a correct AD declaration to prevent VarScan's
+        # non-standard one from getting propagated
+        self._standardize_format_fields(common_header)
         return common_header
 
     def _fix_record_gt_fields(self, record):
@@ -498,54 +506,12 @@
         del record.format['RD']
         del record.format['DP4']
 
-    def pileup_masker(self, mask):
-        def apply_mask_on_pileup(piled_items):
-            for item, status in zip(piled_items, mask):
-                if status:
-                    yield item
-        return apply_mask_on_pileup
-
     def get_allele_specific_pileup_column_stats(
-        self, allele, pile_column, ref_fetch
+        self, allele, pile_colum
[... diff truncated ...]
+            if use_md:
+                # The ref sequence got calculated from alignment and MD tag.
+                for qpos, rpos, ref_base in aligned_pairs:
+                    # pysam uses lowercase ref bases to indicate mismatches
+                    if (
+                        ref_base == 'a' or ref_base == 't' or
+                        ref_base == 'g' or ref_base == 'c'
+                    ):
+                        sum_num_mismatches += 1
+                        sum_mismatch_qualities += p.alignment.query_qualities[
+                            qpos
+                        ]
+            else:
+                # We need to obtain the aligned portion of the reference
+                # sequence.
+                aligned_pairs = p.alignment.get_aligned_pairs(
+                    matches_only=True
+                )
+                # note that ref bases can be lowercase
+                ref_seq = ref_fetch(
+                    aligned_pairs[0][1], aligned_pairs[-1][1] + 1
+                ).upper()
+                ref_offset = aligned_pairs[0][1]
+
+                for qpos, rpos in p.alignment.get_aligned_pairs(
+                    matches_only=True
+                ):
+                    # see if we have a mismatch to the reference at this
+                    # position, but note that
+                    # - the query base may be lowercase (SAM/BAM spec)
+                    # - an '=', if present in the query seq, indicates a match
+                    #   to the reference (SAM/BAM spec)
+                    # - there cannot be a mismatch with an N in the reference
+                    ref_base = ref_seq[rpos - ref_offset]
+                    read_base = p.alignment.query_sequence[qpos].upper()
+                    if (
+                        read_base != '=' and ref_base != 'N'
+                    ) and (
+                        ref_base != read_base
+                    ):
                         sum_num_mismatches += 1
                         sum_mismatch_qualities += p.alignment.query_qualities[
                             qpos
@@ -582,7 +619,13 @@
             sum_num_mismatches_as_fraction += (
                 sum_num_mismatches / p.alignment.query_alignment_length
             )
-        avg_basequality = sum_avg_base_qualities / var_reads_total
+
+        var_reads_total = var_reads_plus + var_reads_minus
+        if var_reads_total == 0:
+            # No stats without reads!
+            return None
+        avg_mapping_quality = sum_mapping_qualities / var_reads_total
+        avg_basequality = sum_base_qualities / var_reads_total
         avg_pos_as_fraction = sum_dist_from_center / var_reads_total
         avg_num_mismatches_as_fraction = (
             sum_num_mismatches_as_fraction / var_reads_total
@@ -643,6 +686,10 @@
                     # skip indel postprocessing for the moment
                     yield record
                     continue
+                if record.alleles[0] == 'N':
+                    # It makes no sense to call SNPs against an unknown
+                    # reference base
+                    continue
                 # get pileup for genomic region affected by this variant
                 if record.info['SS'] == '2':
                     # a somatic variant => generate pileup from tumor data
@@ -825,12 +872,15 @@
             except Exception:
                 pass
 
-        def noop_gen(data, **kwargs):
-            for d in data:
-                yield d
+        def filter_minimal(data, **kwargs):
+            for record in data:
+                if record.alleles[0] != 'N' or len(record.alleles[1]) > 1:
+                    # Yield everything except SNPs called against an unknown
+                    # reference base
+                    yield record
 
         if no_filters:
-            apply_filters = noop_gen
+            apply_filters = filter_minimal
         else:
             apply_filters = self._postprocess_variant_records
 
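
Note: the refactored _compile_common_header() builds the VCF header with pysam's VariantHeader API, and the 'Genotype code' description is a workaround for pysam writing one-word descriptions without surrounding quotes (per the comment added in this changeset). The following stand-alone sketch only illustrates how such a FORMAT declaration can be made with pysam; it is not part of the changeset, and the exact call used inside _standardize_format_fields() is not shown in this diff.

    import pysam

    # Declare the GT FORMAT field the way the format_field_specs tuple
    # describes it. With a one-word description such as 'Genotype', pysam
    # would (per the changeset comment) emit the ##FORMAT line without
    # quotes around the description, violating the VCF specification.
    header = pysam.VariantHeader()
    header.add_meta('FORMAT', items=[
        ('ID', 'GT'),
        ('Number', '1'),
        ('Type', 'String'),
        ('Description', 'Genotype code'),  # two words => quoted on output
    ])
    print(str(header))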
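Note: the new use_md branch of get_allele_specific_pileup_column_stats() counts mismatches from reference bases reconstructed from each read's MD tag, with pysam reporting mismatched reference bases in lowercase. The sketch below is an illustration of that idea only (presumably via get_aligned_pairs(with_seq=True), which the truncated part of the diff does not show verbatim); it assumes pysam is installed and that the test BAM file added in this changeset has an index next to it.

    import pysam

    def count_read_mismatches(read):
        """Count mismatches of one read using the reference bases pysam
        reconstructs from its MD tag (mismatches are lowercase)."""
        mismatches = 0
        for qpos, rpos, ref_base in read.get_aligned_pairs(
            matches_only=True, with_seq=True
        ):
            if ref_base in 'acgt':
                mismatches += 1
        return mismatches

    # hypothetical usage on the test data added in this changeset
    with pysam.AlignmentFile('test-data/high_cov_chrM.bam', 'rb') as bam:
        for read in bam.fetch('chrM'):
            if read.has_tag('MD'):
                print(read.query_name, count_read_mismatches(read))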