Mercurial > repos > davidvanzessen > shm_csr
comparison shm_csr.py @ 48:c5295dd10dfc draft
Uploaded
| author | davidvanzessen |
|---|---|
| date | Mon, 08 May 2017 09:27:27 -0400 |
| parents | 64711f461c8e |
| children | aa8d37bd1930 |
comparison
Legend (diff row types):
- equal
- deleted
- inserted
- replaced
| 47:64711f461c8e | 48:c5295dd10dfc |
|---|---|
| 112 | 112 |
| 113 | 113 |
| 114 #tandem mutation stuff | 114 #tandem mutation stuff |
| 115 tandem_frequency = defaultdict(int) | 115 tandem_frequency = defaultdict(int) |
| 116 mutation_frequency = defaultdict(int) | 116 mutation_frequency = defaultdict(int) |
| 117 | 117 |
| 118 mutations_by_id_dic = {} | |
| 119 first = True | |
| 120 mutation_by_id_file = os.path.join(os.path.dirname(outfile), "mutation_by_id.txt") | |
| 121 with open(mutation_by_id_file, 'r') as mutation_by_id: | |
| 122 for l in mutation_by_id: | |
| 123 if first: | |
| 124 first = False | |
| 125 continue | |
| 126 splt = l.split("\t") | |
| 127 mutations_by_id_dic[splt[0]] = int(splt[1]) | |
| 128 | |
| 118 tandem_file = os.path.join(os.path.dirname(outfile), "tandems_by_id.txt") | 129 tandem_file = os.path.join(os.path.dirname(outfile), "tandems_by_id.txt") |
| 119 with open(tandem_file, 'w') as o: | 130 with open(tandem_file, 'w') as o: |
| 120 highest_tandem_length = 0 | 131 highest_tandem_length = 0 |
| 121 | 132 |
| 122 o.write("Sequence.ID\tnumber_of_mutations\tnumber_of_tandems\tregion_length\texpected_tandems\tlongest_tandem\ttandems\n") | 133 o.write("Sequence.ID\tnumber_of_mutations\tnumber_of_tandems\tregion_length\texpected_tandems\tlongest_tandem\ttandems\n") |
| 157 if highest_tandem_length < len(tandem_muts): | 168 if highest_tandem_length < len(tandem_muts): |
| 158 highest_tandem_length = len(tandem_muts) | 169 highest_tandem_length = len(tandem_muts) |
| 159 | 170 |
| 160 region_length = fr1LengthDict[ID] + cdr1LengthDic[ID] + fr2LengthDict[ID] + cdr2LengthDic[ID] + fr3LengthDict[ID] | 171 region_length = fr1LengthDict[ID] + cdr1LengthDic[ID] + fr2LengthDict[ID] + cdr2LengthDic[ID] + fr3LengthDict[ID] |
| 161 longest_tandem = max(tandem_muts, key=lambda x: x[1]) if len(tandem_muts) else (0, 0) | 172 longest_tandem = max(tandem_muts, key=lambda x: x[1]) if len(tandem_muts) else (0, 0) |
| 162 num_mutations = len(mutations) | 173 num_mutations = mutations_by_id_dic[ID] # len(mutations) |
| 163 f_num_mutations = float(num_mutations) | 174 f_num_mutations = float(num_mutations) |
| 164 num_tandem_muts = len(tandem_muts) | 175 num_tandem_muts = len(tandem_muts) |
| 165 expected_tandem_muts = f_num_mutations * (f_num_mutations - 1.0) / float(region_length) | 176 expected_tandem_muts = f_num_mutations * (f_num_mutations - 1.0) / float(region_length) |
| 166 o.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(ID, | 177 o.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(ID, |
| 167 str(num_mutations), | 178 str(num_mutations), |
| 195 with open(tandem_freq_file, 'w') as o: | 206 with open(tandem_freq_file, 'w') as o: |
| 196 for frq in sorted([int(x) for x in tandem_frequency.keys()]): | 207 for frq in sorted([int(x) for x in tandem_frequency.keys()]): |
| 197 o.write("{0}\t{1}\n".format(frq, tandem_frequency[str(frq)])) | 208 o.write("{0}\t{1}\n".format(frq, tandem_frequency[str(frq)])) |
| 198 | 209 |
| 199 tandem_row = [] | 210 tandem_row = [] |
| 200 print genes | |
| 201 print tandem_sum_by_class | |
| 202 print expected_tandem_sum_by_class | |
| 203 genes_extra = list(genes) | 211 genes_extra = list(genes) |
| 204 genes_extra.append("all") | 212 genes_extra.append("all") |
| 205 for x, y, in zip([tandem_sum_by_class[x] for x in genes_extra], [expected_tandem_sum_by_class[x] for x in genes_extra]): | 213 for x, y, in zip([tandem_sum_by_class[x] for x in genes_extra], [expected_tandem_sum_by_class[x] for x in genes_extra]): |
| 206 if y != 0: | 214 if y != 0: |
| 207 tandem_row += [x, round(y, 2), round(x / y, 2)] | 215 tandem_row += [x, round(y, 2), round(x / y, 2)] |
| 208 else: | 216 else: |
| 209 tandem_row += [x, round(y, 2), 0] | 217 tandem_row += [x, round(y, 2), 0] |
| 210 | |
| 211 """ | |
| 212 print tandem_row | |
| 213 tandem_row += tandem_row[-3:] | |
| 214 print tandem_row | |
| 215 all_expected_tandem = expected_tandem_sum_by_class["all"] | |
| 216 all_tandem = tandem_sum_by_class["all"] | |
| 217 if all_expected_tandem == 0: | |
| 218 tandem_row[-6:-3] = [all_tandem, round(all_expected_tandem, 2), 0] | |
| 219 else: | |
| 220 tandem_row[-6:-3] = [all_tandem, round(all_expected_tandem, 2), round(all_tandem / all_expected_tandem, 2)] | |
| 221 print tandem_row | |
| 222 """ | |
| 223 for i in range(len(genes_extra)): | |
| 224 gene = genes_extra[i] | |
| 225 print gene, tandem_row[i*3:i*3+3] | |
| 226 | 218 |
| 227 tandem_freq_file = os.path.join(os.path.dirname(outfile), "shm_overview_tandem_row.txt") | 219 tandem_freq_file = os.path.join(os.path.dirname(outfile), "shm_overview_tandem_row.txt") |
| 228 with open(tandem_freq_file, 'w') as o: | 220 with open(tandem_freq_file, 'w') as o: |
| 229 o.write("Tandems/Expected (ratio),{0}\n".format(",".join([str(x) for x in tandem_row]))) | 221 o.write("Tandems/Expected (ratio),{0}\n".format(",".join([str(x) for x in tandem_row]))) |
| 230 | 222 |
