Mercurial > repos > davidvanzessen > shm_csr
comparison shm_csr.py @ 48:c5295dd10dfc draft
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 08 May 2017 09:27:27 -0400 |
parents | 64711f461c8e |
children | aa8d37bd1930 |
comparison
equal
deleted
inserted
replaced
47:64711f461c8e | 48:c5295dd10dfc |
---|---|
112 | 112 |
113 | 113 |
114 #tandem mutation stuff | 114 #tandem mutation stuff |
115 tandem_frequency = defaultdict(int) | 115 tandem_frequency = defaultdict(int) |
116 mutation_frequency = defaultdict(int) | 116 mutation_frequency = defaultdict(int) |
117 | 117 |
| | 118 mutations_by_id_dic = {} |
| | 119 first = True |
| | 120 mutation_by_id_file = os.path.join(os.path.dirname(outfile), "mutation_by_id.txt") |
| | 121 with open(mutation_by_id_file, 'r') as mutation_by_id: |
| | 122 for l in mutation_by_id: |
| | 123 if first: |
| | 124 first = False |
| | 125 continue |
| | 126 splt = l.split("\t") |
| | 127 mutations_by_id_dic[splt[0]] = int(splt[1]) |
| | 128 |
118 tandem_file = os.path.join(os.path.dirname(outfile), "tandems_by_id.txt") | 129 tandem_file = os.path.join(os.path.dirname(outfile), "tandems_by_id.txt") |
119 with open(tandem_file, 'w') as o: | 130 with open(tandem_file, 'w') as o: |
120 highest_tandem_length = 0 | 131 highest_tandem_length = 0 |
121 | 132 |
122 o.write("Sequence.ID\tnumber_of_mutations\tnumber_of_tandems\tregion_length\texpected_tandems\tlongest_tandem\ttandems\n") | 133 o.write("Sequence.ID\tnumber_of_mutations\tnumber_of_tandems\tregion_length\texpected_tandems\tlongest_tandem\ttandems\n") |
157 if highest_tandem_length < len(tandem_muts): | 168 if highest_tandem_length < len(tandem_muts): |
158 highest_tandem_length = len(tandem_muts) | 169 highest_tandem_length = len(tandem_muts) |
159 | 170 |
160 region_length = fr1LengthDict[ID] + cdr1LengthDic[ID] + fr2LengthDict[ID] + cdr2LengthDic[ID] + fr3LengthDict[ID] | 171 region_length = fr1LengthDict[ID] + cdr1LengthDic[ID] + fr2LengthDict[ID] + cdr2LengthDic[ID] + fr3LengthDict[ID] |
161 longest_tandem = max(tandem_muts, key=lambda x: x[1]) if len(tandem_muts) else (0, 0) | 172 longest_tandem = max(tandem_muts, key=lambda x: x[1]) if len(tandem_muts) else (0, 0) |
162 num_mutations = len(mutations) | 173 num_mutations = mutations_by_id_dic[ID] # len(mutations) |
163 f_num_mutations = float(num_mutations) | 174 f_num_mutations = float(num_mutations) |
164 num_tandem_muts = len(tandem_muts) | 175 num_tandem_muts = len(tandem_muts) |
165 expected_tandem_muts = f_num_mutations * (f_num_mutations - 1.0) / float(region_length) | 176 expected_tandem_muts = f_num_mutations * (f_num_mutations - 1.0) / float(region_length) |
166 o.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(ID, | 177 o.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(ID, |
167 str(num_mutations), | 178 str(num_mutations), |
195 with open(tandem_freq_file, 'w') as o: | 206 with open(tandem_freq_file, 'w') as o: |
196 for frq in sorted([int(x) for x in tandem_frequency.keys()]): | 207 for frq in sorted([int(x) for x in tandem_frequency.keys()]): |
197 o.write("{0}\t{1}\n".format(frq, tandem_frequency[str(frq)])) | 208 o.write("{0}\t{1}\n".format(frq, tandem_frequency[str(frq)])) |
198 | 209 |
199 tandem_row = [] | 210 tandem_row = [] |
200 print genes | |
201 print tandem_sum_by_class | |
202 print expected_tandem_sum_by_class | |
203 genes_extra = list(genes) | 211 genes_extra = list(genes) |
204 genes_extra.append("all") | 212 genes_extra.append("all") |
205 for x, y, in zip([tandem_sum_by_class[x] for x in genes_extra], [expected_tandem_sum_by_class[x] for x in genes_extra]): | 213 for x, y, in zip([tandem_sum_by_class[x] for x in genes_extra], [expected_tandem_sum_by_class[x] for x in genes_extra]): |
206 if y != 0: | 214 if y != 0: |
207 tandem_row += [x, round(y, 2), round(x / y, 2)] | 215 tandem_row += [x, round(y, 2), round(x / y, 2)] |
208 else: | 216 else: |
209 tandem_row += [x, round(y, 2), 0] | 217 tandem_row += [x, round(y, 2), 0] |
210 | |
211 """ | |
212 print tandem_row | |
213 tandem_row += tandem_row[-3:] | |
214 print tandem_row | |
215 all_expected_tandem = expected_tandem_sum_by_class["all"] | |
216 all_tandem = tandem_sum_by_class["all"] | |
217 if all_expected_tandem == 0: | |
218 tandem_row[-6:-3] = [all_tandem, round(all_expected_tandem, 2), 0] | |
219 else: | |
220 tandem_row[-6:-3] = [all_tandem, round(all_expected_tandem, 2), round(all_tandem / all_expected_tandem, 2)] | |
221 print tandem_row | |
222 """ | |
223 for i in range(len(genes_extra)): | |
224 gene = genes_extra[i] | |
225 print gene, tandem_row[i*3:i*3+3] | |
226 | 218 |
227 tandem_freq_file = os.path.join(os.path.dirname(outfile), "shm_overview_tandem_row.txt") | 219 tandem_freq_file = os.path.join(os.path.dirname(outfile), "shm_overview_tandem_row.txt") |
228 with open(tandem_freq_file, 'w') as o: | 220 with open(tandem_freq_file, 'w') as o: |
229 o.write("Tandems/Expected (ratio),{0}\n".format(",".join([str(x) for x in tandem_row]))) | 221 o.write("Tandems/Expected (ratio),{0}\n".format(",".join([str(x) for x in tandem_row]))) |
230 | 222 |