comparison mut2sscs.py @ 0:e5953c54cfb5 draft

planemo upload for repository https://github.com/gpovysil/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
author mheinzl
date Sun, 04 Oct 2020 17:19:39 +0000
parents
children 11a2a34f8a2b
comparison
equal deleted inserted replaced
-1:000000000000 0:e5953c54cfb5
1 #!/usr/bin/env python
2
3 """mut2sscs.py
4
5 Author -- Gundula Povysil
6 Contact -- povysil@bioinf.jku.at
7
8 Takes a tabular file with mutations from DCS and a BAM file of SSCS as input
9 and extracts all tags of reads that carry the mutation.
10 Calculates statistics about number of ab/ba/duplex per mutation.
11
12 ======= ========== ================= ================================
13 Version Date Author Description
14 0.2.1 2019-10-27 Gundula Povysil -
15 ======= ========== ================= ================================
16
17 USAGE: python mut2sscs.py --mutFile DCS_Mutations.tabular --bamFile SSCS.bam --outputJson SSCS_counts.json
18
19 """
20
21 from __future__ import division
22
23 import argparse
24 import json
25 import os
26 import sys
27
28 import numpy as np
29 import pysam
30
31
def make_argparser():
    """Build the command-line parser for mut2sscs.

    All three options are mandatory: without them the script would only
    fail later with an unhelpful TypeError when checking the paths, so
    argparse is asked to reject incomplete command lines up front.

    Returns:
        argparse.ArgumentParser exposing --mutFile, --bamFile and
        --outputJson (available as args.mutFile, args.bamFile and
        args.outputJson after parsing).
    """
    parser = argparse.ArgumentParser(description='Takes a tabular file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file.')
    parser.add_argument('--mutFile', required=True,
                        help='TABULAR file with DCS mutations.')
    parser.add_argument('--bamFile', required=True,
                        help='BAM file with aligned SSCS reads.')
    parser.add_argument('--outputJson', required=True,
                        help='Output JSON file to store SSCS counts.')
    return parser
41
42
def _add_tag_count(counts, position_key, strand_tag):
    """Increment the counter of *strand_tag* under *position_key* in *counts*."""
    per_position = counts.setdefault(position_key, {})
    per_position[strand_tag] = per_position.get(strand_tag, 0) + 1


def mut2sscs(argv):
    """Count SSCS reads supporting the alt and ref allele of each DCS mutation.

    For every row of the mutation table the SSCS BAM file is piled up at the
    mutation position. Each read covering the position is classified by its
    base (alt, ref or anything else) and counted per family tag, where the
    tag is the last two characters of the read name (presumably "ab"/"ba";
    those are the keys used for the zero defaults below). The two resulting
    dictionaries, keyed by "<chrom>#<position>", are written as a JSON pair
    ``[mut_pos_dict, ref_pos_dict]`` to the output file.

    Args:
        argv: full command line including the program name (sys.argv style);
            options are parsed by make_argparser().

    Returns:
        None. Results are written to the file given by --outputJson and
        progress information is printed to stdout.

    Raises:
        SystemExit: if the mutation file or the BAM file does not exist.
    """
    parser = make_argparser()
    args = parser.parse_args(argv[1:])

    file1 = args.mutFile
    file2 = args.bamFile
    sscs_counts_json = args.outputJson

    if not os.path.isfile(file1):
        sys.exit("Error: Could not find '{}'".format(file1))

    if not os.path.isfile(file2):
        sys.exit("Error: Could not find '{}'".format(file2))

    # 1. read mut file
    with open(file1, 'r') as mut:
        mut_array = np.genfromtxt(mut, skip_header=1, delimiter='\t', comments='#', dtype=str)

    # genfromtxt returns a 1-D array when the table holds a single data row
    # (and an empty 1-D array when it holds none); normalise to 2-D so that
    # every mutation is one row regardless of how many there are.
    if mut_array.ndim == 1:
        if mut_array.size > 0:
            mut_array = mut_array.reshape((1, len(mut_array)))
        else:
            mut_array = mut_array.reshape((0, 0))
    n_mutations = len(mut_array)

    # 2. read SSCS bam file
    # Region-based pileup is used below, which presumably requires the BAM
    # index to exist already; it is not created here.
    bam = pysam.AlignmentFile(file2, "rb")

    # get tags
    mut_pos_dict = {}
    ref_pos_dict = {}
    try:
        for m, row in enumerate(mut_array):
            print(str(m + 1) + " of " + str(n_mutations))
            # Plain Python types: pysam expects a str contig name, and the
            # raw buffer of a NumPy unicode scalar is not a valid name.
            chrom = str(row[1])
            stop_pos = int(row[2])
            chrom_stop_pos = chrom + "#" + str(stop_pos)
            ref = str(row[9])
            alt = str(row[10])

            for pileupcolumn in bam.pileup(chrom, stop_pos - 2, stop_pos, max_depth=1000000000):
                # the table position is 1-based, pysam columns are 0-based
                if pileupcolumn.reference_pos != stop_pos - 1:
                    continue

                count_alt = 0
                count_ref = 0
                count_indel = 0
                print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups),
                      "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n)
                for pileupread in pileupcolumn.pileups:
                    # query position is None if is_del or is_refskip is set.
                    if pileupread.is_del or pileupread.is_refskip:
                        continue
                    tag = pileupread.alignment.query_name
                    abba = tag[-2:]
                    base = pileupread.alignment.query_sequence[pileupread.query_position]
                    if base == alt:
                        count_alt += 1
                        _add_tag_count(mut_pos_dict, chrom_stop_pos, abba)
                    elif base == ref:
                        count_ref += 1
                        _add_tag_count(ref_pos_dict, chrom_stop_pos, abba)
                    else:
                        # NOTE(review): nesting of this branch is reconstructed;
                        # it only feeds the diagnostic "indel" figure below.
                        count_indel += 1

                print("coverage at pos %s = %s, ref = %s, alt = %s, indel = %s,\n" %
                      (pileupcolumn.reference_pos, count_ref + count_alt, count_ref, count_alt, count_indel))

            # if mutation is in DCS file but not in SSCS, then set counts to 0
            # NOTE(review): this also overwrites any ref counts collected for
            # the position when no alt read was seen -- kept as in the
            # original, confirm that downstream tools expect this.
            if chrom_stop_pos not in mut_pos_dict:
                mut_pos_dict[chrom_stop_pos] = {"ab": 0, "ba": 0}
                ref_pos_dict[chrom_stop_pos] = {"ab": 0, "ba": 0}
    finally:
        # release the BAM handle even if a row or the pileup fails
        bam.close()

    # save counts
    with open(sscs_counts_json, "w") as f:
        json.dump((mut_pos_dict, ref_pos_dict), f)
130
131
if __name__ == '__main__':
    # Script entry point: run the tool on the raw command line and hand its
    # return value to sys.exit (mut2sscs has no return statement, so a
    # normal run exits with status 0).
    exit_status = mut2sscs(sys.argv)
    sys.exit(exit_status)