shm_csr: shm_csr.py comparison

comparison shm_csr.py @ 48:c5295dd10dfc draft

Uploaded

author	davidvanzessen
date	Mon, 08 May 2017 09:27:27 -0400
parents	64711f461c8e
children	aa8d37bd1930

comparison

Legend: equal | deleted | inserted | replaced

-:64711f461c8e
+:c5295dd10dfc
 	#tandem mutation stuff
 	tandem_frequency = defaultdict(int)
 	mutation_frequency = defaultdict(int)
+	mutations_by_id_dic = {}
+	first = True
+	mutation_by_id_file = os.path.join(os.path.dirname(outfile), "mutation_by_id.txt")
+	with open(mutation_by_id_file, 'r') as mutation_by_id:
+		for l in mutation_by_id:
+			if first:
+				first = False
+				continue
+			splt = l.split("\t")
+			mutations_by_id_dic[splt[0]] = int(splt[1])
 	tandem_file = os.path.join(os.path.dirname(outfile), "tandems_by_id.txt")
 	with open(tandem_file, 'w') as o:
 		highest_tandem_length = 0
 		o.write("Sequence.ID\tnumber_of_mutations\tnumber_of_tandems\tregion_length\texpected_tandems\tlongest_tandem\ttandems\n")
 				if highest_tandem_length < len(tandem_muts):
 					highest_tandem_length = len(tandem_muts)
 			region_length = fr1LengthDict[ID] + cdr1LengthDic[ID] + fr2LengthDict[ID] + cdr2LengthDic[ID] + fr3LengthDict[ID]
 			longest_tandem = max(tandem_muts, key=lambda x: x[1]) if len(tandem_muts) else (0, 0)
-			num_mutations = len(mutations)
+			num_mutations = mutations_by_id_dic[ID] # len(mutations)
 			f_num_mutations = float(num_mutations)
 			num_tandem_muts = len(tandem_muts)
 			expected_tandem_muts = f_num_mutations * (f_num_mutations - 1.0) / float(region_length)
 			o.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(ID,
 																str(num_mutations),
 	with open(tandem_freq_file, 'w') as o:
 		for frq in sorted([int(x) for x in tandem_frequency.keys()]):
 			o.write("{0}\t{1}\n".format(frq, tandem_frequency[str(frq)]))
 	tandem_row = []
-	print genes
-	print tandem_sum_by_class
-	print expected_tandem_sum_by_class
 	genes_extra = list(genes)
 	genes_extra.append("all")
 	for x, y, in zip([tandem_sum_by_class[x] for x in genes_extra], [expected_tandem_sum_by_class[x] for x in genes_extra]):
 		if y != 0:
 			tandem_row += [x, round(y, 2), round(x / y, 2)]
 		else:
 			tandem_row += [x, round(y, 2), 0]
-	"""
-	print tandem_row
-	tandem_row += tandem_row[-3:]
-	print tandem_row
-	all_expected_tandem = expected_tandem_sum_by_class["all"]
-	all_tandem = tandem_sum_by_class["all"]
-	if all_expected_tandem == 0:
-		tandem_row[-6:-3] = [all_tandem, round(all_expected_tandem, 2), 0]
-	else:
-		tandem_row[-6:-3] = [all_tandem, round(all_expected_tandem, 2), round(all_tandem / all_expected_tandem, 2)]
-	print tandem_row
-	"""
-	for i in range(len(genes_extra)):
-		gene = genes_extra[i]
-		print gene, tandem_row[i*3:i*3+3]
 	tandem_freq_file = os.path.join(os.path.dirname(outfile), "shm_overview_tandem_row.txt")
 	with open(tandem_freq_file, 'w') as o:
 		o.write("Tandems/Expected (ratio),{0}\n".format(",".join([str(x) for x in tandem_row])))

Mercurial > repos > davidvanzessen > shm_csr

comparison shm_csr.py @ 48:c5295dd10dfc draft