comparison mut2read.py @ 6:11a2a34f8a2b draft

planemo upload for repository https://github.com/gpovysil/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
author mheinzl
date Mon, 18 Jan 2021 09:49:15 +0000
parents e5953c54cfb5
children ded0dc6a20d3
comparison
equal deleted inserted replaced
5:d9cbf833624e 6:11a2a34f8a2b
12 ======= ========== ================= ================================ 12 ======= ========== ================= ================================
13 Version Date Author Description 13 Version Date Author Description
14 0.2.1 2019-10-27 Gundula Povysil - 14 0.2.1 2019-10-27 Gundula Povysil -
15 ======= ========== ================= ================================ 15 ======= ========== ================= ================================
16 16
17 USAGE: python mut2read.py DCS_Mutations.tabular DCS.bam Aligned_Families.tabular Interesting_Reads.fastq 17 USAGE: python mut2read.py DCS_Mutations.tabular DCS.bam Aligned_Families.tabular Interesting_Reads.fastq tag_count_dict.json
18 tag_count_dict.json
19 """ 18 """
20 19
21 import argparse 20 import argparse
22 import json 21 import json
23 import os 22 import os
24 import sys 23 import sys
25 24
26 import numpy as np 25 import numpy as np
27 import pysam 26 import pysam
27 from cyvcf2 import VCF
28 28
29 29
30 def make_argparser(): 30 def make_argparser():
31 parser = argparse.ArgumentParser(description='Takes a tabular file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file and creates a fastq file of reads of tags with mutation.') 31 parser = argparse.ArgumentParser(description='Takes a vcf file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file and creates a fastq file of reads of tags with mutation.')
32 parser.add_argument('--mutFile', 32 parser.add_argument('--mutFile',
33 help='TABULAR file with DCS mutations.') 33 help='VCF file with DCS mutations.')
34 parser.add_argument('--bamFile', 34 parser.add_argument('--bamFile',
35 help='BAM file with aligned DCS reads.') 35 help='BAM file with aligned DCS reads.')
36 parser.add_argument('--familiesFile', 36 parser.add_argument('--familiesFile',
37 help='TABULAR file with aligned families.') 37 help='TABULAR file with aligned families.')
38 parser.add_argument('--outputFastq', 38 parser.add_argument('--outputFastq',
59 sys.exit("Error: Could not find '{}'".format(file2)) 59 sys.exit("Error: Could not find '{}'".format(file2))
60 60
61 if os.path.isfile(file3) is False: 61 if os.path.isfile(file3) is False:
62 sys.exit("Error: Could not find '{}'".format(file3)) 62 sys.exit("Error: Could not find '{}'".format(file3))
63 63
64 # read mut file
65 with open(file1, 'r') as mut:
66 mut_array = np.genfromtxt(mut, skip_header=1, delimiter='\t', comments='#', dtype=str)
67
68 # read dcs bam file 64 # read dcs bam file
69 # pysam.index(file2) 65 # pysam.index(file2)
70 bam = pysam.AlignmentFile(file2, "rb") 66 bam = pysam.AlignmentFile(file2, "rb")
71 67
72 # get tags 68 # get tags
73 tag_dict = {} 69 tag_dict = {}
74 cvrg_dict = {} 70 cvrg_dict = {}
75 71
76 if mut_array.shape == (1,13): 72 for variant in VCF(file1):
77 mut_array = mut_array.reshape((1, len(mut_array))) 73 chrom = variant.CHROM
78 74 stop_pos = variant.start
79 for m in range(len(mut_array[:, 0])):
80 print(str(m + 1) + " of " + str(len(mut_array[:, 0])))
81 chrom = mut_array[m, 1]
82 stop_pos = mut_array[m, 2].astype(int)
83 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) 75 chrom_stop_pos = str(chrom) + "#" + str(stop_pos)
84 ref = mut_array[m, 9] 76 ref = variant.REF
85 alt = mut_array[m, 10] 77 alt = variant.ALT[0]
78 # nc = variant.format('NC')
79 ad = variant.format('AD')
86 80
87 dcs_len = [] 81 dcs_len = []
82 if len(ref) == len(alt):
83 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000):
88 84
89 for pileupcolumn in bam.pileup(chrom.tostring(), stop_pos - 2, stop_pos, max_depth=100000000): 85 if pileupcolumn.reference_pos == stop_pos:
86 count_alt = 0
87 count_ref = 0
88 count_indel = 0
89 count_n = 0
90 count_other = 0
91 count_lowq = 0
92 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups),
93 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n)
94 for pileupread in pileupcolumn.pileups:
95 if not pileupread.is_del and not pileupread.is_refskip:
96 # query position is None if is_del or is_refskip is set.
97 nuc = pileupread.alignment.query_sequence[pileupread.query_position]
98 dcs_len.append(len(pileupread.alignment.query_sequence))
99 if nuc == alt:
100 count_alt += 1
101 tag = pileupread.alignment.query_name
102 if tag in tag_dict:
103 tag_dict[tag][chrom_stop_pos] = alt
104 else:
105 tag_dict[tag] = {}
106 tag_dict[tag][chrom_stop_pos] = alt
107 elif nuc == ref:
108 count_ref += 1
109 elif nuc == "N":
110 count_n += 1
111 elif nuc == "lowQ":
112 count_lowq += 1
113 else:
114 count_other += 1
115 else:
116 count_indel += 1
117 dcs_median = np.median(np.array(dcs_len))
118 cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median)
90 119
91 if pileupcolumn.reference_pos == stop_pos - 1: 120 print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" %
92 count_alt = 0 121 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n,
93 count_ref = 0 122 count_indel, count_lowq, dcs_median))
94 count_indel = 0 123 else:
95 count_n = 0 124 print("indels are currently not evaluated")
96 count_other = 0
97 count_lowq = 0
98 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups),
99 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n)
100 for pileupread in pileupcolumn.pileups:
101 if not pileupread.is_del and not pileupread.is_refskip:
102 # query position is None if is_del or is_refskip is set.
103 nuc = pileupread.alignment.query_sequence[pileupread.query_position]
104 dcs_len.append(len(pileupread.alignment.query_sequence))
105 if nuc == alt:
106 count_alt += 1
107 tag = pileupread.alignment.query_name
108 if tag in tag_dict:
109 tag_dict[tag][chrom_stop_pos] = alt
110 else:
111 tag_dict[tag] = {}
112 tag_dict[tag][chrom_stop_pos] = alt
113 elif nuc == ref:
114 count_ref += 1
115 elif nuc == "N":
116 count_n += 1
117 elif nuc == "lowQ":
118 count_lowq += 1
119 else:
120 count_other += 1
121 else:
122 count_indel += 1
123 dcs_median = np.median(np.array(dcs_len))
124 cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median)
125
126 print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" %
127 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n,
128 count_indel, count_lowq, dcs_median))
129 bam.close() 125 bam.close()
130 126
131 with open(json_file, "w") as f: 127 with open(json_file, "w") as f:
132 json.dump((tag_dict, cvrg_dict), f) 128 json.dump((tag_dict, cvrg_dict), f)
133 129
151 out.write(curr_qual + "\n") 147 out.write(curr_qual + "\n")
152 148
153 149
154 if __name__ == '__main__': 150 if __name__ == '__main__':
155 sys.exit(mut2read(sys.argv)) 151 sys.exit(mut2read(sys.argv))
152