# HG changeset patch # User davidvanzessen # Date 1494250047 14400 # Node ID c5295dd10dfc1993d756f2f88ca58b07b1fd865d # Parent 64711f461c8e7fa71cf6d6f9475f472ff4ab391f Uploaded diff -r 64711f461c8e -r c5295dd10dfc shm_csr.py --- a/shm_csr.py Thu May 04 07:43:09 2017 -0400 +++ b/shm_csr.py Mon May 08 09:27:27 2017 -0400 @@ -114,7 +114,18 @@ #tandem mutation stuff tandem_frequency = defaultdict(int) mutation_frequency = defaultdict(int) - + + mutations_by_id_dic = {} + first = True + mutation_by_id_file = os.path.join(os.path.dirname(outfile), "mutation_by_id.txt") + with open(mutation_by_id_file, 'r') as mutation_by_id: + for l in mutation_by_id: + if first: + first = False + continue + splt = l.split("\t") + mutations_by_id_dic[splt[0]] = int(splt[1]) + tandem_file = os.path.join(os.path.dirname(outfile), "tandems_by_id.txt") with open(tandem_file, 'w') as o: highest_tandem_length = 0 @@ -159,7 +170,7 @@ region_length = fr1LengthDict[ID] + cdr1LengthDic[ID] + fr2LengthDict[ID] + cdr2LengthDic[ID] + fr3LengthDict[ID] longest_tandem = max(tandem_muts, key=lambda x: x[1]) if len(tandem_muts) else (0, 0) - num_mutations = len(mutations) + num_mutations = mutations_by_id_dic[ID] # len(mutations) f_num_mutations = float(num_mutations) num_tandem_muts = len(tandem_muts) expected_tandem_muts = f_num_mutations * (f_num_mutations - 1.0) / float(region_length) @@ -197,9 +208,6 @@ o.write("{0}\t{1}\n".format(frq, tandem_frequency[str(frq)])) tandem_row = [] - print genes - print tandem_sum_by_class - print expected_tandem_sum_by_class genes_extra = list(genes) genes_extra.append("all") for x, y, in zip([tandem_sum_by_class[x] for x in genes_extra], [expected_tandem_sum_by_class[x] for x in genes_extra]): @@ -207,22 +215,6 @@ tandem_row += [x, round(y, 2), round(x / y, 2)] else: tandem_row += [x, round(y, 2), 0] - - """ - print tandem_row - tandem_row += tandem_row[-3:] - print tandem_row - all_expected_tandem = expected_tandem_sum_by_class["all"] - all_tandem = tandem_sum_by_class["all"] - if all_expected_tandem == 0: - tandem_row[-6:-3] = [all_tandem, round(all_expected_tandem, 2), 0] - else: - tandem_row[-6:-3] = [all_tandem, round(all_expected_tandem, 2), round(all_tandem / all_expected_tandem, 2)] - print tandem_row - """ - for i in range(len(genes_extra)): - gene = genes_extra[i] - print gene, tandem_row[i*3:i*3+3] tandem_freq_file = os.path.join(os.path.dirname(outfile), "shm_overview_tandem_row.txt") with open(tandem_freq_file, 'w') as o: