Mercurial > repos > iuc > variant_analyzer
comparison mut2sscs.py @ 2:3f1dbd2c59bf draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/variant_analyzer commit f492e9717cb946f0eb5689cd7b6eb8067abf6468"
author | iuc |
---|---|
date | Tue, 10 Nov 2020 12:55:29 +0000 |
parents | 3556001ff2db |
children |
comparison
equal
deleted
inserted
replaced
1:3556001ff2db | 2:3f1dbd2c59bf |
---|---|
9 and extracts all tags of reads that carry the mutation. | 9 and extracts all tags of reads that carry the mutation. |
10 Calculates statistics about number of ab/ba/duplex per mutation. | 10 Calculates statistics about number of ab/ba/duplex per mutation. |
11 | 11 |
12 ======= ========== ================= ================================ | 12 ======= ========== ================= ================================ |
13 Version Date Author Description | 13 Version Date Author Description |
14 0.2.1 2019-10-27 Gundula Povysil - | 14 2.0.0 2020-10-30 Gundula Povysil - |
15 ======= ========== ================= ================================ | 15 ======= ========== ================= ================================ |
16 | 16 |
17 USAGE: python mut2sscs.py DCS_Mutations.tabular SSCS.bam SSCS_counts.json | 17 USAGE: python mut2sscs.py DCS_Mutations.tabular SSCS.bam SSCS_counts.json |
18 | 18 |
19 """ | 19 """ |
23 import argparse | 23 import argparse |
24 import json | 24 import json |
25 import os | 25 import os |
26 import sys | 26 import sys |
27 | 27 |
28 import numpy as np | |
29 import pysam | 28 import pysam |
29 from cyvcf2 import VCF | |
30 | 30 |
31 | 31 |
32 def make_argparser(): | 32 def make_argparser(): |
33 parser = argparse.ArgumentParser(description='Takes a tabular file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file.') | 33 parser = argparse.ArgumentParser(description='Takes a vcf file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file.') |
34 parser.add_argument('--mutFile', | 34 parser.add_argument('--mutFile', |
35 help='TABULAR file with DCS mutations.') | 35 help='VCR file with DCS mutations.') |
36 parser.add_argument('--bamFile', | 36 parser.add_argument('--bamFile', |
37 help='BAM file with aligned SSCS reads.') | 37 help='BAM file with aligned SSCS reads.') |
38 parser.add_argument('--outputJson', | 38 parser.add_argument('--outputJson', |
39 help='Output JSON file to store SSCS counts.') | 39 help='Output JSON file to store SSCS counts.') |
40 return parser | 40 return parser |
52 sys.exit("Error: Could not find '{}'".format(file1)) | 52 sys.exit("Error: Could not find '{}'".format(file1)) |
53 | 53 |
54 if os.path.isfile(file2) is False: | 54 if os.path.isfile(file2) is False: |
55 sys.exit("Error: Could not find '{}'".format(file2)) | 55 sys.exit("Error: Could not find '{}'".format(file2)) |
56 | 56 |
57 # 1. read mut file | 57 # read SSCS bam file |
58 with open(file1, 'r') as mut: | |
59 mut_array = np.genfromtxt(mut, skip_header=1, delimiter='\t', comments='#', dtype=str) | |
60 | |
61 # 2 read SSCS bam file | |
62 # pysam.index(file2) | |
63 bam = pysam.AlignmentFile(file2, "rb") | 58 bam = pysam.AlignmentFile(file2, "rb") |
64 | 59 |
65 # get tags | 60 # get tags |
66 mut_pos_dict = {} | 61 mut_pos_dict = {} |
67 ref_pos_dict = {} | 62 ref_pos_dict = {} |
68 if len(mut_array) == 13: | |
69 mut_array = mut_array.reshape((1, len(mut_array))) | |
70 | 63 |
71 for m in range(0, len(mut_array[:, 0])): | 64 for variant in VCF(file1): |
72 print(str(m + 1) + " of " + str(len(mut_array[:, 0]))) | 65 chrom = variant.CHROM |
73 chrom = mut_array[m, 1] | 66 stop_pos = variant.start |
74 stop_pos = mut_array[m, 2].astype(int) | |
75 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) | 67 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) |
76 ref = mut_array[m, 9] | 68 ref = variant.REF |
77 alt = mut_array[m, 10] | 69 alt = variant.ALT[0] |
78 | 70 |
79 for pileupcolumn in bam.pileup(chrom, stop_pos - 2, stop_pos, max_depth=1000000000): | 71 if len(ref) == len(alt): |
80 if pileupcolumn.reference_pos == stop_pos - 1: | 72 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000): |
81 count_alt = 0 | 73 if pileupcolumn.reference_pos == stop_pos: |
82 count_ref = 0 | 74 count_alt = 0 |
83 count_indel = 0 | 75 count_ref = 0 |
84 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), | 76 count_indel = 0 |
85 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) | 77 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), |
86 for pileupread in pileupcolumn.pileups: | 78 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) |
87 if not pileupread.is_del and not pileupread.is_refskip: | 79 for pileupread in pileupcolumn.pileups: |
88 tag = pileupread.alignment.query_name | 80 if not pileupread.is_del and not pileupread.is_refskip: |
89 abba = tag[-2:] | 81 tag = pileupread.alignment.query_name |
90 # query position is None if is_del or is_refskip is set. | 82 abba = tag[-2:] |
91 if pileupread.alignment.query_sequence[pileupread.query_position] == alt: | 83 # query position is None if is_del or is_refskip is set. |
92 count_alt += 1 | 84 if pileupread.alignment.query_sequence[pileupread.query_position] == alt: |
93 if chrom_stop_pos in mut_pos_dict: | 85 count_alt += 1 |
94 if abba in mut_pos_dict[chrom_stop_pos]: | 86 if chrom_stop_pos in mut_pos_dict: |
95 mut_pos_dict[chrom_stop_pos][abba] += 1 | 87 if abba in mut_pos_dict[chrom_stop_pos]: |
88 mut_pos_dict[chrom_stop_pos][abba] += 1 | |
89 else: | |
90 mut_pos_dict[chrom_stop_pos][abba] = 1 | |
96 else: | 91 else: |
92 mut_pos_dict[chrom_stop_pos] = {} | |
97 mut_pos_dict[chrom_stop_pos][abba] = 1 | 93 mut_pos_dict[chrom_stop_pos][abba] = 1 |
98 else: | 94 if chrom_stop_pos not in ref_pos_dict: |
99 mut_pos_dict[chrom_stop_pos] = {} | 95 ref_pos_dict[chrom_stop_pos] = {} |
100 mut_pos_dict[chrom_stop_pos][abba] = 1 | 96 ref_pos_dict[chrom_stop_pos][abba] = 0 |
101 elif pileupread.alignment.query_sequence[pileupread.query_position] == ref: | 97 |
102 count_ref += 1 | 98 elif pileupread.alignment.query_sequence[pileupread.query_position] == ref: |
103 if chrom_stop_pos in ref_pos_dict: | 99 count_ref += 1 |
104 if abba in ref_pos_dict[chrom_stop_pos]: | 100 if chrom_stop_pos in ref_pos_dict: |
105 ref_pos_dict[chrom_stop_pos][abba] += 1 | 101 if abba in ref_pos_dict[chrom_stop_pos]: |
102 ref_pos_dict[chrom_stop_pos][abba] += 1 | |
103 else: | |
104 ref_pos_dict[chrom_stop_pos][abba] = 1 | |
106 else: | 105 else: |
106 ref_pos_dict[chrom_stop_pos] = {} | |
107 ref_pos_dict[chrom_stop_pos][abba] = 1 | 107 ref_pos_dict[chrom_stop_pos][abba] = 1 |
108 else: | 108 else: |
109 ref_pos_dict[chrom_stop_pos] = {} | 109 count_indel += 1 |
110 ref_pos_dict[chrom_stop_pos][abba] = 1 | |
111 else: | |
112 count_indel += 1 | |
113 | 110 |
114 print("coverage at pos %s = %s, ref = %s, alt = %s, indel = %s,\n" % | 111 print("coverage at pos %s = %s, ref = %s, alt = %s, indel = %s,\n" % |
115 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_indel)) | 112 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_indel)) |
116 | 113 |
117 # if mutation is in DCS file but not in SSCS, then set counts to NA | 114 # if mutation is in DCS file but not in SSCS, then set counts to NA |
118 if chrom_stop_pos not in mut_pos_dict.keys(): | 115 if chrom_stop_pos not in mut_pos_dict.keys(): |
119 mut_pos_dict[chrom_stop_pos] = {} | 116 mut_pos_dict[chrom_stop_pos] = {} |
120 mut_pos_dict[chrom_stop_pos]["ab"] = 0 | 117 mut_pos_dict[chrom_stop_pos]["ab"] = 0 |
121 mut_pos_dict[chrom_stop_pos]["ba"] = 0 | 118 mut_pos_dict[chrom_stop_pos]["ba"] = 0 |
122 ref_pos_dict[chrom_stop_pos] = {} | 119 ref_pos_dict[chrom_stop_pos] = {} |
123 ref_pos_dict[chrom_stop_pos]["ab"] = 0 | 120 ref_pos_dict[chrom_stop_pos]["ab"] = 0 |
124 ref_pos_dict[chrom_stop_pos]["ba"] = 0 | 121 ref_pos_dict[chrom_stop_pos]["ba"] = 0 |
122 else: | |
123 print("indels are currently not evaluated") | |
125 bam.close() | 124 bam.close() |
126 | 125 |
127 # save counts | 126 # save counts |
128 with open(sscs_counts_json, "w") as f: | 127 with open(sscs_counts_json, "w") as f: |
129 json.dump((mut_pos_dict, ref_pos_dict), f) | 128 json.dump((mut_pos_dict, ref_pos_dict), f) |