comparison shm_csr.py @ 48:c5295dd10dfc draft

Uploaded
author davidvanzessen
date Mon, 08 May 2017 09:27:27 -0400
parents 64711f461c8e
children aa8d37bd1930
comparison
equal deleted inserted replaced
47:64711f461c8e 48:c5295dd10dfc
112 112
113 113
114 #tandem mutation stuff 114 #tandem mutation stuff
115 tandem_frequency = defaultdict(int) 115 tandem_frequency = defaultdict(int)
116 mutation_frequency = defaultdict(int) 116 mutation_frequency = defaultdict(int)
117 117
118 mutations_by_id_dic = {}
119 first = True
120 mutation_by_id_file = os.path.join(os.path.dirname(outfile), "mutation_by_id.txt")
121 with open(mutation_by_id_file, 'r') as mutation_by_id:
122 for l in mutation_by_id:
123 if first:
124 first = False
125 continue
126 splt = l.split("\t")
127 mutations_by_id_dic[splt[0]] = int(splt[1])
128
118 tandem_file = os.path.join(os.path.dirname(outfile), "tandems_by_id.txt") 129 tandem_file = os.path.join(os.path.dirname(outfile), "tandems_by_id.txt")
119 with open(tandem_file, 'w') as o: 130 with open(tandem_file, 'w') as o:
120 highest_tandem_length = 0 131 highest_tandem_length = 0
121 132
122 o.write("Sequence.ID\tnumber_of_mutations\tnumber_of_tandems\tregion_length\texpected_tandems\tlongest_tandem\ttandems\n") 133 o.write("Sequence.ID\tnumber_of_mutations\tnumber_of_tandems\tregion_length\texpected_tandems\tlongest_tandem\ttandems\n")
157 if highest_tandem_length < len(tandem_muts): 168 if highest_tandem_length < len(tandem_muts):
158 highest_tandem_length = len(tandem_muts) 169 highest_tandem_length = len(tandem_muts)
159 170
160 region_length = fr1LengthDict[ID] + cdr1LengthDic[ID] + fr2LengthDict[ID] + cdr2LengthDic[ID] + fr3LengthDict[ID] 171 region_length = fr1LengthDict[ID] + cdr1LengthDic[ID] + fr2LengthDict[ID] + cdr2LengthDic[ID] + fr3LengthDict[ID]
161 longest_tandem = max(tandem_muts, key=lambda x: x[1]) if len(tandem_muts) else (0, 0) 172 longest_tandem = max(tandem_muts, key=lambda x: x[1]) if len(tandem_muts) else (0, 0)
162 num_mutations = len(mutations) 173 num_mutations = mutations_by_id_dic[ID] # len(mutations)
163 f_num_mutations = float(num_mutations) 174 f_num_mutations = float(num_mutations)
164 num_tandem_muts = len(tandem_muts) 175 num_tandem_muts = len(tandem_muts)
165 expected_tandem_muts = f_num_mutations * (f_num_mutations - 1.0) / float(region_length) 176 expected_tandem_muts = f_num_mutations * (f_num_mutations - 1.0) / float(region_length)
166 o.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(ID, 177 o.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(ID,
167 str(num_mutations), 178 str(num_mutations),
195 with open(tandem_freq_file, 'w') as o: 206 with open(tandem_freq_file, 'w') as o:
196 for frq in sorted([int(x) for x in tandem_frequency.keys()]): 207 for frq in sorted([int(x) for x in tandem_frequency.keys()]):
197 o.write("{0}\t{1}\n".format(frq, tandem_frequency[str(frq)])) 208 o.write("{0}\t{1}\n".format(frq, tandem_frequency[str(frq)]))
198 209
199 tandem_row = [] 210 tandem_row = []
200 print genes
201 print tandem_sum_by_class
202 print expected_tandem_sum_by_class
203 genes_extra = list(genes) 211 genes_extra = list(genes)
204 genes_extra.append("all") 212 genes_extra.append("all")
205 for x, y, in zip([tandem_sum_by_class[x] for x in genes_extra], [expected_tandem_sum_by_class[x] for x in genes_extra]): 213 for x, y, in zip([tandem_sum_by_class[x] for x in genes_extra], [expected_tandem_sum_by_class[x] for x in genes_extra]):
206 if y != 0: 214 if y != 0:
207 tandem_row += [x, round(y, 2), round(x / y, 2)] 215 tandem_row += [x, round(y, 2), round(x / y, 2)]
208 else: 216 else:
209 tandem_row += [x, round(y, 2), 0] 217 tandem_row += [x, round(y, 2), 0]
210
211 """
212 print tandem_row
213 tandem_row += tandem_row[-3:]
214 print tandem_row
215 all_expected_tandem = expected_tandem_sum_by_class["all"]
216 all_tandem = tandem_sum_by_class["all"]
217 if all_expected_tandem == 0:
218 tandem_row[-6:-3] = [all_tandem, round(all_expected_tandem, 2), 0]
219 else:
220 tandem_row[-6:-3] = [all_tandem, round(all_expected_tandem, 2), round(all_tandem / all_expected_tandem, 2)]
221 print tandem_row
222 """
223 for i in range(len(genes_extra)):
224 gene = genes_extra[i]
225 print gene, tandem_row[i*3:i*3+3]
226 218
227 tandem_freq_file = os.path.join(os.path.dirname(outfile), "shm_overview_tandem_row.txt") 219 tandem_freq_file = os.path.join(os.path.dirname(outfile), "shm_overview_tandem_row.txt")
228 with open(tandem_freq_file, 'w') as o: 220 with open(tandem_freq_file, 'w') as o:
229 o.write("Tandems/Expected (ratio),{0}\n".format(",".join([str(x) for x in tandem_row]))) 221 o.write("Tandems/Expected (ratio),{0}\n".format(",".join([str(x) for x in tandem_row])))
230 222