Mercurial > repos > kaymccoy > consolidate_fitnesses

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/consol_fit.py	Fri Aug 12 16:44:28 2016 -0400
@@ -0,0 +1,301 @@
+# Consol_fit! It's a script & it'll consolidate your fitness values if you got them from a looping trimming pipeline instead of the standard split-by-transposon pipeline. That's all.
+# K. McCoy
+
+import math
+import csv
+
+
+
+
+
+
+
+
+
+
+##### ARGUMENTS #####
+
+def print_usage():
+	print "\n" + "You are missing one or more required flags. A complete list of flags accepted by calc_fitness is as follows:" + "\n\n"
+	print "\033[1m" + "Required" + "\033[0m" + "\n"
+	print "-i" + "\t\t" + "The calc_fit file to be consolidated" + "\n"
+	print "-out" + "\t\t" + "Name of a file to enter the .csv output." + "\n"
+	print "-out2" + "\t\t" + "Name of a file to put the percent blank score in (used in aggregate)." + "\n"
+	print "-calctxt" + "\t\t" + "The txt file output from calc_fit" + "\n"
+	print "-normalize" + "\t" + "A file that contains a list of genes that should have a fitness of 1" + "\n"
+	print "\n"
+	print "\033[1m" + "Optional" + "\033[0m" + "\n"
+	print "-cutoff" + "\t\t" + "Discard any positions where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\n"
+	print "-cutoff2" + "\t\t" + "Discard any positions within the normalization genes where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\n"
+	print "-wig" + "\t\t" + "Create a wiggle file for viewing in a genome browser. Provide a filename." + "\n"
+	print "-maxweight" + "\t" + "The maximum weight a transposon gene can have in normalization calculations" + "\n"
+	print "-multiply" + "\t" + "Multiply all fitness scores by a certain value (e.g., the fitness of a knockout). You should normalize the data." + "\n"
+	print "\n"
+
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument("-calctxt", action="store", dest="calctxt")
+parser.add_argument("-normalize", action="store", dest="normalize")
+parser.add_argument("-i", action="store", dest="input")
+parser.add_argument("-out", action="store", dest="outfile")
+parser.add_argument("-out2", action="store", dest="outfile2")
+parser.add_argument("-cutoff", action="store", dest="cutoff")
+parser.add_argument("-cutoff2", action="store", dest="cutoff2")
+parser.add_argument("-wig", action="store", dest="wig")
+parser.add_argument("-maxweight", action="store", dest="max_weight")
+parser.add_argument("-multiply", action="store", dest="multiply")
+arguments = parser.parse_args()
+
+if (not arguments.input or not arguments.outfile or not arguments.calctxt):
+	print_usage()
+	quit()
+
+if (not arguments.max_weight):
+	arguments.max_weight = 75
+
+if (not arguments.cutoff):
+	arguments.cutoff = 0
+
+# Cutoff2 only has an effect if it's larger than cutoff, since the normalization step references a list of insertions already affected by cutoff.
+
+if (not arguments.cutoff2):
+	arguments.cutoff2 = 10
+
+#Gets total & refname from calc_fit outfile2
+
+with open(arguments.calctxt) as file:
+	calctxt = file.readlines()
+total = float(calctxt[1].split()[1])
+refname = calctxt[2].split()[1]
+
+
+
+
+
+
+
+
+
+
+##### CONSOLIDATING THE CALC_FIT FILE #####
+
+with open(arguments.input) as file:
+	input = file.readlines()
+results = [["position", "strand", "count_1", "count_2", "ratio", "mt_freq_t1", "mt_freq_t2", "pop_freq_t1", "pop_freq_t2", "gene", "D", "W", "nW"]]
+i = 1
+d = float(input[i].split(",")[10])
+while i < len(input):
+	position = float(input[i].split(",")[0])
+	strands = input[i].split(",")[1]
+	c1 = float(input[i].split(",")[2])
+	c2 = float(input[i].split(",")[3])
+	gene = input[i].split(",")[9]
+	while i + 1 < len(input) and float(input[i+1].split(",")[0]) - position <= 4:
+		if i + 1 < len(input):
+			i += 1
+			c1 += float(input[i].split(",")[2])
+			c2 += float(input[i].split(",")[3])
+			strands = input[i].split(",")[1]
+			if strands[0] == 'b':
+				new_strands = 'b/'
+			elif strands[0] == '+':
+				if input[i].split(",")[1][0] == 'b':
+					new_strands = 'b/'
+				elif input[i].split(",")[1][0] == '+':
+					new_strands = '+/'
+				elif input[i].split(",")[1][0] == '-':
+					new_strands = 'b/'
+			elif strands[0] == '-':
+				if input[i].split(",")[1][0] == 'b':
+					new_strands = 'b/'
+				elif input[i].split(",")[1][0] == '+':
+					new_strands = 'b/'
+				elif input[i].split(",")[1][0] == '-':
+					new_strands = '-/'
+			if len(strands) == 3:
+				if len(input[i].split(",")[1]) < 3:
+					new_strands += strands[2]
+				elif strands[0] == 'b':
+					new_strands += 'b'
+				elif strands[0] == '+':
+					if input[i].split(",")[1][2] == 'b':
+						new_strands += 'b'
+					elif input[i].split(",")[1][2] == '+':
+						new_strands += '+'
+					elif input[i].split(",")[1][2] == '-':
+						new_strands += 'b'
+				elif strands[0] == '-':
+					if input[i].split(",")[1][2] == 'b':
+						new_strands += 'b'
+					elif input[i].split(",")[1][2] == '+':
+						new_strands += 'b'
+					elif input[i].split(",")[1][2] == '-':
+						new_strands += '-'
+			else:
+				if len(input[i].split(",")[1]) == 3:
+						new_strands += input[i].split(",")[1][2]
+			strands = new_strands
+	i +=1
+	if c2 != 0:
+		ratio = c2/c1
+	else:
+		ratio = 0
+	mt_freq_t1 = c1/total
+	mt_freq_t2 = c2/total
+	pop_freq_t1 = 1 - mt_freq_t1
+	pop_freq_t2 = 1 - mt_freq_t2
+	w = 0
+	if mt_freq_t2 != 0:
+		top_w = math.log(mt_freq_t2*(d/mt_freq_t1))
+		bot_w = math.log(pop_freq_t2*(d/pop_freq_t1))
+		w = top_w/bot_w
+	row = [position, strands, c1, c2, ratio, mt_freq_t1, mt_freq_t2, pop_freq_t1, pop_freq_t2, gene, d, w, w]
+	results.append(row)
+with open(arguments.outfile, "wb") as csvfile:
+    writer = csv.writer(csvfile)
+    writer.writerows(results)
+
+
+
+
+
+
+
+
+
+
+##### REDOING NORMALIZATION #####
+
+# The header below is just in a typical WIG file format; if you'd like to look into this more UCSC has notes on formatting WIG files on their site.
+
+if (arguments.wig):
+	wigstring = "track type=wiggle_0 name=" + arguments.wig + "\n" + "variableStep chrom=" + refname + "\n"
+
+if (arguments.normalize):
+	with open(arguments.normalize) as file:
+		transposon_genes = file.read().splitlines()
+	print "Normalize genes loaded" + "\n"
+	blank_ws = 0
+	sum = 0
+	count = 0
+	weights = []
+	scores = []
+	for list in results:
+		if list[9] != '' and list[9] in transposon_genes and list[11]:
+			c1 = list[2]
+			c2 = list[3]
+			score = list[11]
+			avg = (c1 + c2)/2
+
+# Skips over those insertion locations with too few insertions - their fitness values are less accurate because they're based on such small insertion numbers.
+
+			if float(c1) >= float(arguments.cutoff2):
+
+# Sets a max weight, to prevent insertion location scores with huge weights from unbalancing the normalization.
+
+				if (avg >= float(arguments.max_weight)):
+					avg = float(arguments.max_weight)
+
+# Tallies how many w values are 0 within the blank_ws value; you might get many transposon genes with a w value of 0 if a bottleneck occurs, which is especially common with in vivo experiments.
+# For example, when studying a nasal infection in a mouse model, what bacteria "sticks" and is able to survive and what bacteria is swallowed and killed or otherwise flushed out tends to be a matter of chance not fitness; all mutants with an insertion in a specific transposon gene could be flushed out by chance!
+
+				if score == 0:
+					blank_ws += 1
+				sum += score
+				count += 1
+				weights.append(avg)
+				scores.append(score)
+
+				print str(list[9]) + " " + str(score) + " " + str(c1)
+
+# Counts and removes all "blank" fitness values of normalization genes - those that = 0 - because they most likely don't really have a fitness value of 0, and you just happened to not get any reads from that location at t2.
+
+	blank_count = 0
+	original_count = len(scores)
+	i = 0
+	while i < original_count:
+		w_value = scores[i]
+		if w_value == 0:
+			blank_count += 1
+			weights.pop[i]
+			scores.pop[i]
+			i-=1
+		i += 1
+
+# If no normalization genes can pass the cutoff, normalization cannot occur, so this ends the script advises the user to try again and lower cutoff and/or cutoff2.
+
+	if len(scores) == 0:
+		print 'ERROR: The normalization genes do not have enough reads to pass cutoff and/or cutoff2; please lower one or both of those arguments.' + "\n"
+		quit()
+
+	pc_blank_normals = float(blank_count) / float(original_count)
+	print "# blank out of " + str(original_count) + ": " + str(pc_blank_normals) + "\n"
+	with open(arguments.outfile2, "w") as f:
+		f.write("blanks: " + str(pc_blank_normals) + "\n" + "total: " + str(total) + "\n" + "refname: " + refname)
+
+	average = sum / count
+	i = 0
+	weighted_sum = 0
+	weight_sum = 0
+	while i < len(weights):
+		weighted_sum += weights[i]*scores[i]
+		weight_sum += weights[i]
+		i += 1
+	weighted_average = weighted_sum/weight_sum
+
+	print "Normalization step:" + "\n"
+	print "Regular average: " + str(average) + "\n"
+	print "Weighted Average: " + str(weighted_average) + "\n"
+	print "Total Insertions: " + str(count) + "\n"
+
+	old_ws = 0
+	new_ws = 0
+	wcount = 0
+	for list in results:
+		if list[11] == 'W':
+			continue
+		new_w = float(list[11])/weighted_average
+
+# Sometimes you want to multiply all the fitness values by a constant; this does that.
+# For example you might multiply all the values by a constant for a genetic interaction screen - where Tn-Seq is performed as usual except there's one background knockout all the mutants share.
+
+		if arguments.multiply:
+			new_w *= float(arguments.multiply)
+
+		if float(list[11]) > 0:
+			old_ws += float(list[11])
+			new_ws += new_w
+			wcount += 1
+
+		list[12] = new_w
+
+		if (arguments.wig):
+			wigstring += str(list[0]) + " " + str(new_w) + "\n"
+
+	old_w_mean = old_ws / wcount
+	new_w_mean = new_ws / wcount
+	print "Old W Average: " + str(old_w_mean) + "\n"
+	print "New W Average: " + str(new_w_mean) + "\n"
+
+with open(arguments.outfile, "wb") as csvfile:
+    writer = csv.writer(csvfile)
+    writer.writerows(results)
+
+if (arguments.wig):
+	if (arguments.normalize):
+		with open(arguments.wig, "wb") as wigfile:
+			wigfile.write(wigstring)
+	else:
+		for list in results:
+			wigstring += str(list[0]) + " " + str(list[11]) + "\n"
+		with open(arguments.wig, "wb") as wigfile:
+				wigfile.write(wigstring)
+
+
+#   ___       ___       ___            ___       ___       ___       ___       ___
+#   /\__\     /\  \     /\__\          /\__\     /\  \     /\  \     /\  \     /\__\
+#  /:/ _/_   /::\  \   |::L__L        /::L_L_   /::\  \   /::\  \   /::\  \   |::L__L
+# /::-"\__\ /::\:\__\  |:::\__\      /:/L:\__\ /:/\:\__\ /:/\:\__\ /:/\:\__\  |:::\__\
+# \;:;-",-" \/\::/  /  /:;;/__/      \/_/:/  / \:\ \/__/ \:\ \/__/ \:\/:/  /  /:;;/__/
+#  |:|  |     /:/  /   \/__/           /:/  /   \:\__\    \:\__\    \::/  /   \/__/
+#   \|__|     \/__/                    \/__/     \/__/     \/__/     \/__/
\ No newline at end of file