diff varscan.py @ 6:ab2f908f0b08 draft

planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit 4b7660df2384d8c8c6b5a6f02a00d13211e6ff6c
author iuc
date Fri, 18 Jan 2019 19:41:30 -0500
parents d062703d6f13
children 7c5e867820cf
line wrap: on
line diff
--- a/varscan.py	Wed Dec 05 11:17:06 2018 -0500
+++ b/varscan.py	Fri Jan 18 19:41:30 2019 -0500
@@ -398,6 +398,30 @@
             'INDEL', 0, 'Flag', 'Indicates that the variant is an INDEL'
         )
 
+    def _standardize_format_fields(self, header):
+        """Add standard FORMAT key declarations to a VCF header."""
+
+        format_field_specs = [
+            ('GT', '1', 'String', 'Genotype'),
+            ('GQ', '1', 'Integer', 'Genotype quality'),
+            ('DP', '1', 'Integer', 'Read depth'),
+            ('AD', 'R', 'Integer', 'Read depth for each allele'),
+            ('ADF', 'R', 'Integer',
+             'Read depth for each allele on the forward strand'),
+            ('ADR', 'R', 'Integer',
+             'Read depth for each allele on the reverse strand')
+        ]
+        # Existing FORMAT declarations cannot be overwritten.
+        # The only viable strategy is to mark them as removed,
+        # then merge the header with another one containing the
+        # correct declarations. This is what is implemented below.
+        temp_header = pysam.VariantHeader()
+        for spec in format_field_specs:
+            temp_header.formats.add(*spec)
+            if spec[0] in header.formats:
+                header.formats.remove_header(spec[0])
+        header.merge(temp_header)
+
     def _compile_common_header(self, varcall_template, no_filters=False):
         # fix the header generated by VarScan
         # by adding reference and contig information
@@ -411,6 +435,10 @@
         common_header.add_meta('source', value='varscan.py')
         # declare an INDEL flag for record INFO fields
         self._add_indel_info_flag_to_header(common_header)
+        # generate standard FORMAT field declarations
+        # including a correct AD declaration to prevent VarScan's
+        # non-standard one from getting propagated
+        self._standardize_format_fields(common_header)
         # take the remaining metadata from the template header produced by
         # VarScan
         with pysam.VariantFile(varcall_template, 'r') as original_data:
@@ -418,8 +446,34 @@
         for sample in varscan_header.samples:
             common_header.samples.add(sample)
         common_header.merge(varscan_header)
+        # don't propagate non-standard and redundant FORMAT keys written
+        # by VarScan
+        common_header.formats.remove_header('FREQ')
+        common_header.formats.remove_header('RD')
+        common_header.formats.remove_header('DP4')
         return common_header
 
+    def _fix_read_gt_fields(self, read):
+        """Migrate non-standard genotype fields to standard ones."""
+
+        for gt_field in read.samples.values():
+            # generate a standard AD field by combining VarScan's
+            # RD and non-standard AD fields
+            gt_field['AD'] = (gt_field['RD'], gt_field['AD'][0])
+            # split VarScan's DP4 field into the standard fields
+            # ADF and ADR
+            gt_field['ADF'] = (
+                int(gt_field['DP4'][0]), int(gt_field['DP4'][2])
+            )
+            gt_field['ADR'] = (
+                int(gt_field['DP4'][1]), int(gt_field['DP4'][3])
+            )
+        # remove redundant fields
+        # FREQ is easy to calculate from AD
+        del read.format['FREQ']
+        del read.format['RD']
+        del read.format['DP4']
+
     def pileup_masker(self, mask):
         def apply_mask_on_pileup(piled_items):
             for item, status in zip(piled_items, mask):
@@ -773,6 +827,7 @@
                         self._add_ref_contigs_to_header(snp_invcf.header)
                         self._add_filters_to_header(snp_invcf.header)
                         self._add_indel_info_flag_to_header(snp_invcf.header)
+                        self._standardize_format_fields(snp_invcf.header)
                         with pysam.VariantFile(indel_f, 'r') as indel_invcf:
                             # fix the input header on the fly
                             # to avoid Warnings from htslib about missing
@@ -782,6 +837,7 @@
                             self._add_indel_info_flag_to_header(
                                 indel_invcf.header
                             )
+                            self._standardize_format_fields(indel_invcf.header)
                             for record in apply_filters(
                                 self._merge_generator(
                                     snp_invcf,
@@ -789,6 +845,7 @@
                                 ),
                                 **filter_args
                             ):
+                                self._fix_read_gt_fields(record)
                                 o.write(str(record))
                     try:
                         os.remove(snp_f)
@@ -810,9 +867,11 @@
                         # filters
                         self._add_ref_contigs_to_header(invcf.header)
                         self._add_filters_to_header(invcf.header)
+                        self._standardize_format_fields(invcf.header)
                         for record in apply_filters(
                             invcf, **filter_args
                         ):
+                            self._fix_read_gt_fields(record)
                             o.write(str(record))
                     try:
                         os.remove(f)
@@ -829,9 +888,11 @@
                         self._add_ref_contigs_to_header(invcf.header)
                         self._add_filters_to_header(invcf.header)
                         self._add_indel_info_flag_to_header(invcf.header)
+                        self._standardize_format_fields(invcf.header)
                         for record in apply_filters(
                             self._indel_flagged_records(invcf), **filter_args
                         ):
+                            self._fix_read_gt_fields(record)
                             o.write(str(record))
                     try:
                         os.remove(f)