Mercurial > repos > mheinzl > variant_analyzer2
comparison mut2sscs.py @ 6:11a2a34f8a2b draft
planemo upload for repository https://github.com/gpovysil/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
author | mheinzl |
---|---|
date | Mon, 18 Jan 2021 09:49:15 +0000 |
parents | e5953c54cfb5 |
children | ded0dc6a20d3 |
comparison
equal
deleted
inserted
replaced
5:d9cbf833624e | 6:11a2a34f8a2b |
---|---|
25 import os | 25 import os |
26 import sys | 26 import sys |
27 | 27 |
28 import numpy as np | 28 import numpy as np |
29 import pysam | 29 import pysam |
30 from cyvcf2 import VCF | |
30 | 31 |
31 | 32 |
32 def make_argparser(): | 33 def make_argparser(): |
33 parser = argparse.ArgumentParser(description='Takes a tabular file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file.') | 34 parser = argparse.ArgumentParser(description='Takes a vcf file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file.') |
34 parser.add_argument('--mutFile', | 35 parser.add_argument('--mutFile', |
35 help='TABULAR file with DCS mutations.') | 36 help='VCR file with DCS mutations.') |
36 parser.add_argument('--bamFile', | 37 parser.add_argument('--bamFile', |
37 help='BAM file with aligned SSCS reads.') | 38 help='BAM file with aligned SSCS reads.') |
38 parser.add_argument('--outputJson', | 39 parser.add_argument('--outputJson', |
39 help='Output JSON file to store SSCS counts.') | 40 help='Output JSON file to store SSCS counts.') |
40 return parser | 41 return parser |
52 sys.exit("Error: Could not find '{}'".format(file1)) | 53 sys.exit("Error: Could not find '{}'".format(file1)) |
53 | 54 |
54 if os.path.isfile(file2) is False: | 55 if os.path.isfile(file2) is False: |
55 sys.exit("Error: Could not find '{}'".format(file2)) | 56 sys.exit("Error: Could not find '{}'".format(file2)) |
56 | 57 |
57 # 1. read mut file | 58 # read SSCS bam file |
58 with open(file1, 'r') as mut: | 59 # pysam.index(file2) |
59 mut_array = np.genfromtxt(mut, skip_header=1, delimiter='\t', comments='#', dtype=str) | |
60 | |
61 # 2 read SSCS bam file | |
62 # pysam.index(file2) | |
63 bam = pysam.AlignmentFile(file2, "rb") | 60 bam = pysam.AlignmentFile(file2, "rb") |
64 | 61 |
65 # get tags | 62 # get tags |
66 mut_pos_dict = {} | 63 mut_pos_dict = {} |
67 ref_pos_dict = {} | 64 ref_pos_dict = {} |
68 if mut_array.shape == (1,13): | |
69 mut_array = mut_array.reshape((1, len(mut_array))) | |
70 | 65 |
71 for m in range(0, len(mut_array[:, 0])): | 66 for variant in VCF(file1): |
72 print(str(m + 1) + " of " + str(len(mut_array[:, 0]))) | 67 chrom = variant.CHROM |
73 chrom = mut_array[m, 1] | 68 stop_pos = variant.start |
74 stop_pos = mut_array[m, 2].astype(int) | |
75 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) | 69 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) |
76 ref = mut_array[m, 9] | 70 ref = variant.REF |
77 alt = mut_array[m, 10] | 71 alt = variant.ALT[0] |
72 # nc = variant.format('NC') | |
73 ad = variant.format('AD') | |
78 | 74 |
79 for pileupcolumn in bam.pileup(chrom.tostring(), stop_pos - 2, stop_pos, max_depth=1000000000): | 75 if len(ref) == len(alt): |
80 if pileupcolumn.reference_pos == stop_pos - 1: | 76 |
81 count_alt = 0 | 77 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000): |
82 count_ref = 0 | 78 if pileupcolumn.reference_pos == stop_pos: |
83 count_indel = 0 | 79 count_alt = 0 |
84 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), | 80 count_ref = 0 |
85 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) | 81 count_indel = 0 |
86 for pileupread in pileupcolumn.pileups: | 82 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), |
87 if not pileupread.is_del and not pileupread.is_refskip: | 83 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) |
88 tag = pileupread.alignment.query_name | 84 for pileupread in pileupcolumn.pileups: |
89 abba = tag[-2:] | 85 if not pileupread.is_del and not pileupread.is_refskip: |
90 # query position is None if is_del or is_refskip is set. | 86 tag = pileupread.alignment.query_name |
91 if pileupread.alignment.query_sequence[pileupread.query_position] == alt: | 87 abba = tag[-2:] |
92 count_alt += 1 | 88 # query position is None if is_del or is_refskip is set. |
93 if chrom_stop_pos in mut_pos_dict: | 89 if pileupread.alignment.query_sequence[pileupread.query_position] == alt: |
94 if abba in mut_pos_dict[chrom_stop_pos]: | 90 count_alt += 1 |
95 mut_pos_dict[chrom_stop_pos][abba] += 1 | 91 if chrom_stop_pos in mut_pos_dict: |
92 if abba in mut_pos_dict[chrom_stop_pos]: | |
93 mut_pos_dict[chrom_stop_pos][abba] += 1 | |
94 else: | |
95 mut_pos_dict[chrom_stop_pos][abba] = 1 | |
96 else: | 96 else: |
97 mut_pos_dict[chrom_stop_pos] = {} | |
97 mut_pos_dict[chrom_stop_pos][abba] = 1 | 98 mut_pos_dict[chrom_stop_pos][abba] = 1 |
98 else: | 99 if chrom_stop_pos not in ref_pos_dict: |
99 mut_pos_dict[chrom_stop_pos] = {} | 100 ref_pos_dict[chrom_stop_pos] = {} |
100 mut_pos_dict[chrom_stop_pos][abba] = 1 | 101 ref_pos_dict[chrom_stop_pos][abba] = 0 |
101 elif pileupread.alignment.query_sequence[pileupread.query_position] == ref: | 102 |
102 count_ref += 1 | 103 elif pileupread.alignment.query_sequence[pileupread.query_position] == ref: |
103 if chrom_stop_pos in ref_pos_dict: | 104 count_ref += 1 |
104 if abba in ref_pos_dict[chrom_stop_pos]: | 105 if chrom_stop_pos in ref_pos_dict: |
105 ref_pos_dict[chrom_stop_pos][abba] += 1 | 106 if abba in ref_pos_dict[chrom_stop_pos]: |
107 ref_pos_dict[chrom_stop_pos][abba] += 1 | |
108 else: | |
109 ref_pos_dict[chrom_stop_pos][abba] = 1 | |
106 else: | 110 else: |
111 ref_pos_dict[chrom_stop_pos] = {} | |
107 ref_pos_dict[chrom_stop_pos][abba] = 1 | 112 ref_pos_dict[chrom_stop_pos][abba] = 1 |
108 else: | 113 else: |
109 ref_pos_dict[chrom_stop_pos] = {} | 114 count_indel += 1 |
110 ref_pos_dict[chrom_stop_pos][abba] = 1 | |
111 else: | |
112 count_indel += 1 | |
113 | 115 |
114 print("coverage at pos %s = %s, ref = %s, alt = %s, indel = %s,\n" % | 116 print("coverage at pos %s = %s, ref = %s, alt = %s, indel = %s,\n" % |
115 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_indel)) | 117 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_indel)) |
116 | 118 |
117 # if mutation is in DCS file but not in SSCS, then set counts to NA | 119 # if mutation is in DCS file but not in SSCS, then set counts to NA |
118 if chrom_stop_pos not in mut_pos_dict.keys(): | 120 if chrom_stop_pos not in mut_pos_dict.keys(): |
119 mut_pos_dict[chrom_stop_pos] = {} | 121 mut_pos_dict[chrom_stop_pos] = {} |
120 mut_pos_dict[chrom_stop_pos]["ab"] = 0 | 122 mut_pos_dict[chrom_stop_pos]["ab"] = 0 |
121 mut_pos_dict[chrom_stop_pos]["ba"] = 0 | 123 mut_pos_dict[chrom_stop_pos]["ba"] = 0 |
122 ref_pos_dict[chrom_stop_pos] = {} | 124 ref_pos_dict[chrom_stop_pos] = {} |
123 ref_pos_dict[chrom_stop_pos]["ab"] = 0 | 125 ref_pos_dict[chrom_stop_pos]["ab"] = 0 |
124 ref_pos_dict[chrom_stop_pos]["ba"] = 0 | 126 ref_pos_dict[chrom_stop_pos]["ba"] = 0 |
127 else: | |
128 print("indels are currently not evaluated") | |
125 bam.close() | 129 bam.close() |
126 | 130 |
127 # save counts | 131 # save counts |
128 with open(sscs_counts_json, "w") as f: | 132 with open(sscs_counts_json, "w") as f: |
129 json.dump((mut_pos_dict, ref_pos_dict), f) | 133 json.dump((mut_pos_dict, ref_pos_dict), f) |
130 | 134 |
131 | 135 |
132 if __name__ == '__main__': | 136 if __name__ == '__main__': |
133 sys.exit(mut2sscs(sys.argv)) | 137 sys.exit(mut2sscs(sys.argv)) |
138 |