comparison mut2sscs.py @ 6:11a2a34f8a2b draft

planemo upload for repository https://github.com/gpovysil/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
author mheinzl
date Mon, 18 Jan 2021 09:49:15 +0000
parents e5953c54cfb5
children ded0dc6a20d3
comparison
equal deleted inserted replaced
5:d9cbf833624e 6:11a2a34f8a2b
25 import os 25 import os
26 import sys 26 import sys
27 27
28 import numpy as np 28 import numpy as np
29 import pysam 29 import pysam
30 from cyvcf2 import VCF
30 31
31 32
32 def make_argparser(): 33 def make_argparser():
33 parser = argparse.ArgumentParser(description='Takes a tabular file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file.') 34 parser = argparse.ArgumentParser(description='Takes a vcf file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file.')
34 parser.add_argument('--mutFile', 35 parser.add_argument('--mutFile',
35 help='TABULAR file with DCS mutations.') 36 help='VCR file with DCS mutations.')
36 parser.add_argument('--bamFile', 37 parser.add_argument('--bamFile',
37 help='BAM file with aligned SSCS reads.') 38 help='BAM file with aligned SSCS reads.')
38 parser.add_argument('--outputJson', 39 parser.add_argument('--outputJson',
39 help='Output JSON file to store SSCS counts.') 40 help='Output JSON file to store SSCS counts.')
40 return parser 41 return parser
52 sys.exit("Error: Could not find '{}'".format(file1)) 53 sys.exit("Error: Could not find '{}'".format(file1))
53 54
54 if os.path.isfile(file2) is False: 55 if os.path.isfile(file2) is False:
55 sys.exit("Error: Could not find '{}'".format(file2)) 56 sys.exit("Error: Could not find '{}'".format(file2))
56 57
57 # 1. read mut file 58 # read SSCS bam file
58 with open(file1, 'r') as mut: 59 # pysam.index(file2)
59 mut_array = np.genfromtxt(mut, skip_header=1, delimiter='\t', comments='#', dtype=str)
60
61 # 2 read SSCS bam file
62 # pysam.index(file2)
63 bam = pysam.AlignmentFile(file2, "rb") 60 bam = pysam.AlignmentFile(file2, "rb")
64 61
65 # get tags 62 # get tags
66 mut_pos_dict = {} 63 mut_pos_dict = {}
67 ref_pos_dict = {} 64 ref_pos_dict = {}
68 if mut_array.shape == (1,13):
69 mut_array = mut_array.reshape((1, len(mut_array)))
70 65
71 for m in range(0, len(mut_array[:, 0])): 66 for variant in VCF(file1):
72 print(str(m + 1) + " of " + str(len(mut_array[:, 0]))) 67 chrom = variant.CHROM
73 chrom = mut_array[m, 1] 68 stop_pos = variant.start
74 stop_pos = mut_array[m, 2].astype(int)
75 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) 69 chrom_stop_pos = str(chrom) + "#" + str(stop_pos)
76 ref = mut_array[m, 9] 70 ref = variant.REF
77 alt = mut_array[m, 10] 71 alt = variant.ALT[0]
72 # nc = variant.format('NC')
73 ad = variant.format('AD')
78 74
79 for pileupcolumn in bam.pileup(chrom.tostring(), stop_pos - 2, stop_pos, max_depth=1000000000): 75 if len(ref) == len(alt):
80 if pileupcolumn.reference_pos == stop_pos - 1: 76
81 count_alt = 0 77 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000):
82 count_ref = 0 78 if pileupcolumn.reference_pos == stop_pos:
83 count_indel = 0 79 count_alt = 0
84 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), 80 count_ref = 0
85 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) 81 count_indel = 0
86 for pileupread in pileupcolumn.pileups: 82 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups),
87 if not pileupread.is_del and not pileupread.is_refskip: 83 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n)
88 tag = pileupread.alignment.query_name 84 for pileupread in pileupcolumn.pileups:
89 abba = tag[-2:] 85 if not pileupread.is_del and not pileupread.is_refskip:
90 # query position is None if is_del or is_refskip is set. 86 tag = pileupread.alignment.query_name
91 if pileupread.alignment.query_sequence[pileupread.query_position] == alt: 87 abba = tag[-2:]
92 count_alt += 1 88 # query position is None if is_del or is_refskip is set.
93 if chrom_stop_pos in mut_pos_dict: 89 if pileupread.alignment.query_sequence[pileupread.query_position] == alt:
94 if abba in mut_pos_dict[chrom_stop_pos]: 90 count_alt += 1
95 mut_pos_dict[chrom_stop_pos][abba] += 1 91 if chrom_stop_pos in mut_pos_dict:
92 if abba in mut_pos_dict[chrom_stop_pos]:
93 mut_pos_dict[chrom_stop_pos][abba] += 1
94 else:
95 mut_pos_dict[chrom_stop_pos][abba] = 1
96 else: 96 else:
97 mut_pos_dict[chrom_stop_pos] = {}
97 mut_pos_dict[chrom_stop_pos][abba] = 1 98 mut_pos_dict[chrom_stop_pos][abba] = 1
98 else: 99 if chrom_stop_pos not in ref_pos_dict:
99 mut_pos_dict[chrom_stop_pos] = {} 100 ref_pos_dict[chrom_stop_pos] = {}
100 mut_pos_dict[chrom_stop_pos][abba] = 1 101 ref_pos_dict[chrom_stop_pos][abba] = 0
101 elif pileupread.alignment.query_sequence[pileupread.query_position] == ref: 102
102 count_ref += 1 103 elif pileupread.alignment.query_sequence[pileupread.query_position] == ref:
103 if chrom_stop_pos in ref_pos_dict: 104 count_ref += 1
104 if abba in ref_pos_dict[chrom_stop_pos]: 105 if chrom_stop_pos in ref_pos_dict:
105 ref_pos_dict[chrom_stop_pos][abba] += 1 106 if abba in ref_pos_dict[chrom_stop_pos]:
107 ref_pos_dict[chrom_stop_pos][abba] += 1
108 else:
109 ref_pos_dict[chrom_stop_pos][abba] = 1
106 else: 110 else:
111 ref_pos_dict[chrom_stop_pos] = {}
107 ref_pos_dict[chrom_stop_pos][abba] = 1 112 ref_pos_dict[chrom_stop_pos][abba] = 1
108 else: 113 else:
109 ref_pos_dict[chrom_stop_pos] = {} 114 count_indel += 1
110 ref_pos_dict[chrom_stop_pos][abba] = 1
111 else:
112 count_indel += 1
113 115
114 print("coverage at pos %s = %s, ref = %s, alt = %s, indel = %s,\n" % 116 print("coverage at pos %s = %s, ref = %s, alt = %s, indel = %s,\n" %
115 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_indel)) 117 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_indel))
116 118
117 # if mutation is in DCS file but not in SSCS, then set counts to NA 119 # if mutation is in DCS file but not in SSCS, then set counts to NA
118 if chrom_stop_pos not in mut_pos_dict.keys(): 120 if chrom_stop_pos not in mut_pos_dict.keys():
119 mut_pos_dict[chrom_stop_pos] = {} 121 mut_pos_dict[chrom_stop_pos] = {}
120 mut_pos_dict[chrom_stop_pos]["ab"] = 0 122 mut_pos_dict[chrom_stop_pos]["ab"] = 0
121 mut_pos_dict[chrom_stop_pos]["ba"] = 0 123 mut_pos_dict[chrom_stop_pos]["ba"] = 0
122 ref_pos_dict[chrom_stop_pos] = {} 124 ref_pos_dict[chrom_stop_pos] = {}
123 ref_pos_dict[chrom_stop_pos]["ab"] = 0 125 ref_pos_dict[chrom_stop_pos]["ab"] = 0
124 ref_pos_dict[chrom_stop_pos]["ba"] = 0 126 ref_pos_dict[chrom_stop_pos]["ba"] = 0
127 else:
128 print("indels are currently not evaluated")
125 bam.close() 129 bam.close()
126 130
127 # save counts 131 # save counts
128 with open(sscs_counts_json, "w") as f: 132 with open(sscs_counts_json, "w") as f:
129 json.dump((mut_pos_dict, ref_pos_dict), f) 133 json.dump((mut_pos_dict, ref_pos_dict), f)
130 134
131 135
132 if __name__ == '__main__': 136 if __name__ == '__main__':
133 sys.exit(mut2sscs(sys.argv)) 137 sys.exit(mut2sscs(sys.argv))
138