diff mut2read.py @ 2:3f1dbd2c59bf draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/variant_analyzer commit f492e9717cb946f0eb5689cd7b6eb8067abf6468"
author iuc
date Tue, 10 Nov 2020 12:55:29 +0000
parents 3556001ff2db
children
line wrap: on
line diff
--- a/mut2read.py	Wed Dec 04 16:21:17 2019 -0500
+++ b/mut2read.py	Tue Nov 10 12:55:29 2020 +0000
@@ -11,11 +11,10 @@
 
 =======  ==========  =================  ================================
 Version  Date        Author             Description
-0.2.1    2019-10-27  Gundula Povysil    -
+2.0.0    2020-10-30  Gundula Povysil    -
 =======  ==========  =================  ================================
 
-USAGE: python mut2read.py DCS_Mutations.tabular DCS.bam Aligned_Families.tabular Interesting_Reads.fastq
-                          tag_count_dict.json
+USAGE: python mut2read.py DCS_Mutations.tabular DCS.bam Aligned_Families.tabular Interesting_Reads.fastq tag_count_dict.json
 """
 
 import argparse
@@ -25,12 +24,13 @@
 
 import numpy as np
 import pysam
+from cyvcf2 import VCF
 
 
 def make_argparser():
-    parser = argparse.ArgumentParser(description='Takes a tabular file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file and creates a fastq file of reads of tags with mutation.')
+    parser = argparse.ArgumentParser(description='Takes a vcf file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file and creates a fastq file of reads of tags with mutation.')
     parser.add_argument('--mutFile',
-                        help='TABULAR file with DCS mutations.')
+                        help='VCF file with DCS mutations.')
     parser.add_argument('--bamFile',
                         help='BAM file with aligned DCS reads.')
     parser.add_argument('--familiesFile',
@@ -61,71 +61,62 @@
     if os.path.isfile(file3) is False:
         sys.exit("Error: Could not find '{}'".format(file3))
 
-    # read mut file
-    with open(file1, 'r') as mut:
-        mut_array = np.genfromtxt(mut, skip_header=1, delimiter='\t', comments='#', dtype=str)
-
     # read dcs bam file
-    # pysam.index(file2)
     bam = pysam.AlignmentFile(file2, "rb")
 
     # get tags
     tag_dict = {}
     cvrg_dict = {}
 
-    if len(mut_array) == 13:
-        mut_array = mut_array.reshape((1, len(mut_array)))
-
-    for m in range(len(mut_array[:, 0])):
-        print(str(m + 1) + " of " + str(len(mut_array[:, 0])))
-        chrom = mut_array[m, 1]
-        stop_pos = mut_array[m, 2].astype(int)
+    for variant in VCF(file1):
+        chrom = variant.CHROM
+        stop_pos = variant.start
         chrom_stop_pos = str(chrom) + "#" + str(stop_pos)
-        ref = mut_array[m, 9]
-        alt = mut_array[m, 10]
-
+        ref = variant.REF
+        alt = variant.ALT[0]
         dcs_len = []
-
-        for pileupcolumn in bam.pileup(chrom, stop_pos - 2, stop_pos, max_depth=100000000):
+        if len(ref) == len(alt):
+            for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000):
+                if pileupcolumn.reference_pos == stop_pos:
+                    count_alt = 0
+                    count_ref = 0
+                    count_indel = 0
+                    count_n = 0
+                    count_other = 0
+                    count_lowq = 0
+                    print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups),
+                          "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n)
+                    for pileupread in pileupcolumn.pileups:
+                        if not pileupread.is_del and not pileupread.is_refskip:
+                            # query position is None if is_del or is_refskip is set.
+                            nuc = pileupread.alignment.query_sequence[pileupread.query_position]
+                            dcs_len.append(len(pileupread.alignment.query_sequence))
+                            if nuc == alt:
+                                count_alt += 1
+                                tag = pileupread.alignment.query_name
+                                if tag in tag_dict:
+                                    tag_dict[tag][chrom_stop_pos] = alt
+                                else:
+                                    tag_dict[tag] = {}
+                                    tag_dict[tag][chrom_stop_pos] = alt
+                            elif nuc == ref:
+                                count_ref += 1
+                            elif nuc == "N":
+                                count_n += 1
+                            elif nuc == "lowQ":
+                                count_lowq += 1
+                            else:
+                                count_other += 1
+                        else:
+                            count_indel += 1
+                    dcs_median = np.median(np.array(dcs_len))
+                    cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median)
 
-            if pileupcolumn.reference_pos == stop_pos - 1:
-                count_alt = 0
-                count_ref = 0
-                count_indel = 0
-                count_n = 0
-                count_other = 0
-                count_lowq = 0
-                print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups),
-                      "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n)
-                for pileupread in pileupcolumn.pileups:
-                    if not pileupread.is_del and not pileupread.is_refskip:
-                        # query position is None if is_del or is_refskip is set.
-                        nuc = pileupread.alignment.query_sequence[pileupread.query_position]
-                        dcs_len.append(len(pileupread.alignment.query_sequence))
-                        if nuc == alt:
-                            count_alt += 1
-                            tag = pileupread.alignment.query_name
-                            if tag in tag_dict:
-                                tag_dict[tag][chrom_stop_pos] = alt
-                            else:
-                                tag_dict[tag] = {}
-                                tag_dict[tag][chrom_stop_pos] = alt
-                        elif nuc == ref:
-                            count_ref += 1
-                        elif nuc == "N":
-                            count_n += 1
-                        elif nuc == "lowQ":
-                            count_lowq += 1
-                        else:
-                            count_other += 1
-                    else:
-                        count_indel += 1
-                dcs_median = np.median(np.array(dcs_len))
-                cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median)
-
-                print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" %
-                      (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n,
-                       count_indel, count_lowq, dcs_median))
+                    print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" %
+                          (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n,
+                           count_indel, count_lowq, dcs_median))
+        else:
+            print("indels are currently not evaluated")
     bam.close()
 
     with open(json_file, "w") as f: