annotate aggregate.py @ 4:5ff57a3d0af2 draft

Uploaded
author kaymccoy
date Sun, 11 Dec 2016 17:01:25 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
1 # A translation of aggregate.pl into python! For analysis of Tn-Seq.
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
2 # This script requires BioPython just like calc_fitness.py, so you need it installed along with its dependencies if you want to run these scripts on your own.
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
3 # How to install BioPython and a list of its dependencies can be found here: http://biopython.org/DIST/docs/install/Installation.html
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
4 # K. McCoy
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
5
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
6
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
7
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
8
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
9
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
10
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
11
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
12
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
13
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
14 ##### ARGUMENTS #####
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
15
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
16 def print_usage():
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
17 print "Aggregate.py's usage is as follows:" + "\n\n"
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
18 print "\033[1m" + "Required" + "\033[0m" + "\n"
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
19 print "-o" + "\t\t" + "Output file for aggregated data." + "\n"
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
20 print "\n"
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
21 print "\033[1m" + "Optional" + "\033[0m" + "\n"
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
22 print "-c" + "\t\t" + "Check for missing genes in the data set - provide a reference genome in genbank format. Missing genes will be sent to stdout." + "\n"
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
23 print "-m" + "\t\t" + "Place a mark in an extra column for this set of genes. Provide a file with a list of genes seperated by newlines." + "\n"
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
24 print "-x" + "\t\t" + "Cutoff: Don't include fitness scores with average counts (c1+c2)/2 < x (default: 0)" + "\n"
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
25 print "-b" + "\t\t" + "Bottleneck value: The percentage of insertions randomly lost, which will be discounted for all genes (for example, 20% would be entered as 0.20; default 0.0)" + "\n"
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
26 print "-f" + "\t\t" + "An in-between file carrying information on the blank count found from calc_fitness or consol_fitness; one of two ways to pass a blank count to this script" + "\n"
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
27 print "-w" + "\t\t" + "Use weighted algorithm to calculate averages, variance, sd, se" + "\n"
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
28 print "-l" + "\t\t" + "Weight ceiling: maximum value to use as a weight (default: 999,999)" + "\n"
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
29 print "\n"
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
30 print "All remainder arguements will be treated as fitness files (those files created by calc_fitness.py)" + "\n"
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
31 print "\n"
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
32
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
33 import argparse
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
34 parser = argparse.ArgumentParser()
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
35 parser.add_argument("-o", action="store", dest="summary")
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
36 parser.add_argument("-c", action="store", dest="find_missing")
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
37 parser.add_argument("-m", action="store", dest="marked")
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
38 parser.add_argument("-x", action="store", dest="cutoff")
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
39 parser.add_argument("-b", action="store", dest="blank_pc")
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
40 parser.add_argument("-f", action="store", dest="blank_file")
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
41 parser.add_argument("-w", action="store", dest="weighted")
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
42 parser.add_argument("-l", action="store", dest="weight_ceiling")
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
43 parser.add_argument("fitnessfiles", nargs=argparse.REMAINDER)
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
44
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
45 arguments = parser.parse_args()
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
46
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
47 if not arguments.summary:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
48 print "\n" + "You are missing a value for the -o flag. "
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
49 print_usage()
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
50 quit()
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
51
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
52 if not arguments.fitnessfiles:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
53 print "\n" + "You are missing fitness file(s); these should be entered immediately after all the flags. "
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
54 print_usage()
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
55 quit()
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
56
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
57 # 999,999 is a trivial placeholder number
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
58
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
59 if (not arguments.weight_ceiling):
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
60 arguments.weight_ceiling = 999999
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
61
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
62 # Cutoff exists to discard positions with a low number of counted transcripts, because their fitness may not be as accurate - for the same reasoning that studies with low sample sizes can be innacurate.
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
63
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
64 if (not arguments.cutoff):
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
65 arguments.cutoff = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
66
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
67 # Gets information from the txt output file of calc_fit / consol, if inputted
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
68
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
69 if arguments.blank_file:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
70 with open(arguments.blank_file) as file:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
71 blank_pc = file.read().splitlines()
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
72 arguments.blank_pc = float(blank_pc[0].split()[1])
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
73
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
74 if (not arguments.blank_pc):
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
75 arguments.blank_pc = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
76
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
77
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
78
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
79
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
80
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
81 ##### SUBROUTINES #####
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
82
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
83 # A subroutine that calculates the average, variance, standard deviation (sd), and standard error (se) of a group of scores; for use when aggregating scores by gene later on
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
84
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
85 import math
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
86 def unweighted_average(scores):
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
87 sum = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
88 num = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
89 i = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
90 while i < len(scores):
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
91 if not scores[i]:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
92 scores[i] = 0.0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
93 sum += float(scores[i])
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
94 num += 1
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
95 i += 1
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
96 average = sum/num
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
97 xminusxbars = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
98 while i < len(scores):
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
99 xminusxbars += (float(scores[i]) - average)**2
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
100 if num <= 1:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
101 variance = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
102 else:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
103 variance = xminusxbars/(num-1)
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
104 sd = math.sqrt(variance)
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
105 se = sd / math.sqrt(num)
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
106 return (average, variance, sd, se)
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
107
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
108 # A subroutine that calculates the weighted average, variance, standard deviation (sd), and standard error (se) of a group of scores; the weights come from the number of reads each insertion location has
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
109 # For use when aggregating scores by gene later on, if the weighted argument is called
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
110
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
111 def weighted_average(scores,weights):
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
112 sum = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
113 weighted_average = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
114 weighted_variance = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
115 top = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
116 bottom = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
117 i = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
118 while i < len(weights):
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
119 if not scores[i]:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
120 scores[i] = 0.0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
121 top += float(weights[i])*float(scores[i])
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
122 bottom += float(weights[i])
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
123 i += 1
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
124 if bottom == 0:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
125 return 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
126 weighted_average = top/bottom
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
127 top = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
128 bottom = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
129 i = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
130 while i < len(weights):
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
131 top += float(weights[i]) * (float(scores[i]) - weighted_average)**2
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
132 bottom += float(weights[i])
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
133 i += 1
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
134 weighted_variance = top/bottom
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
135 weighted_stdev = math.sqrt(weighted_variance)
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
136 weighted_stder = weighted_stdev/math.sqrt(len(scores))
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
137 return (weighted_average, weighted_variance, weighted_stdev, weighted_stder)
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
138
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
139
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
140
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
141
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
142
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
143
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
144
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
145
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
146
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
147
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
148 ##### AGGREGATION / CALCULATIONS #####
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
149
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
150 #Reads the genes which should be marked in the final aggregate file into an array
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
151
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
152 import os.path
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
153 if arguments.marked:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
154 with open(arguments.marked) as file:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
155 marked_set = file.read().splitlines()
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
156
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
157 #Creates a dictionary of dictionaries to contain a summary of all genes and their fitness values
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
158 #The fitness values and weights match up, so that the weight of gene_summary[locus]["w"][2] would be gene_summary[locus]["s"][2]
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
159
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
160 import csv
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
161 gene_summary = {}
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
162 for eachfile in arguments.fitnessfiles:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
163 with open(eachfile) as csvfile:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
164 lines = csv.reader(csvfile)
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
165 for line in lines:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
166 locus = line[9]
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
167 w = line[12]
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
168 if w == 'nW':
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
169 continue
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
170 if not w:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
171 w == 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
172 c1 = float(line[2])
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
173 c2 = float(line[3])
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
174 avg = (c1+c2)/2
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
175 if avg < float(arguments.cutoff):
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
176 continue
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
177 if avg > float(arguments.weight_ceiling):
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
178 avg = arguments.weight_ceiling
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
179 if locus not in gene_summary:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
180 gene_summary[locus] = {"w" : [], "s": []}
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
181 gene_summary[locus]["w"].append(w)
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
182 gene_summary[locus]["s"].append(avg)
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
183
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
184 #If finding any missing gene loci is requested in the arguments, starts out by loading all the known features from a genbank file
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
185
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
186 from Bio import SeqIO
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
187 if (arguments.find_missing):
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
188 output = [["locus","mean","var","sd","se","gene","Total","Blank","Not Blank","Blank Removed","M\n"]]
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
189 handle = open(arguments.find_missing, "rU")
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
190 for record in SeqIO.parse(handle, "genbank"):
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
191 refname = record.id
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
192 features = record.features
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
193 handle.close()
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
194
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
195 #Goes through the features to find which are genes
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
196
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
197 for feature in features:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
198 gene = ""
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
199 if feature.type == "gene":
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
200 locus = "".join(feature.qualifiers["locus_tag"])
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
201 if "gene" in feature.qualifiers:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
202 gene = "".join(feature.qualifiers["gene"])
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
203 else:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
204 continue
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
205
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
206 #Goes through the fitness scores of insertions within each gene, and removes whatever % of blank fitness scores were requested along with their corresponding weights
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
207
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
208 sum = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
209 num = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
210 avgsum = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
211 blank_ws = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
212 i = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
213 if locus in gene_summary.keys():
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
214 for w in gene_summary[locus]["w"]:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
215 if float(w) == 0:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
216 blank_ws += 1
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
217 else:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
218 sum += float(w)
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
219 num += 1
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
220 count = num + blank_ws
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
221 removed = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
222 to_remove = int(float(arguments.blank_pc)*count)
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
223 if blank_ws > 0:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
224 i = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
225 while i < len(gene_summary[locus]["w"]):
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
226 w = gene_summary[locus]["w"][i]
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
227 if removed == to_remove:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
228 break
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
229 if float(w) == 0:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
230 del gene_summary[locus]["w"][i]
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
231 del gene_summary[locus]["s"][i]
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
232 removed += 1
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
233 i -= 1
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
234 i += 1
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
235
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
236 #If all the fitness values within a gene are empty, sets mean/var to 0.10 and Xs out sd/se; marks the gene if that's requested
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
237
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
238 if num == 0:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
239 if (arguments.marked and locus in marked_set):
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
240 output.append([locus, "0.10", "0.10", "X", "X", gene, count, blank_ws, num, removed, "M", "\n"])
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
241 else:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
242 output.append([locus, "0.10", "0.10", "X", "X", gene, count, blank_ws, num, removed, "\n"])
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
243
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
244 #Otherwise calls average() or weighted_average() to find the aggregate w / count / standard deviation / standard error of the insertions within each gene; marks the gene if that's requested
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
245
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
246 else:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
247 if not arguments.weighted:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
248 (average, variance, stdev, stderr) = unweighted_average(gene_summary[locus]["w"])
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
249 else:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
250 (average, variance, stdev, stderr) = weighted_average(gene_summary[locus]["w"],gene_summary[locus]["s"])
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
251 if (arguments.marked and locus in marked_set):
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
252 output.append([locus, average, variance, stdev, stderr, gene, count, blank_ws, num, removed, "M", "\n"])
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
253 else:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
254 output.append([locus, average, variance, stdev, stderr, gene, count, blank_ws, num, removed, "\n"])
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
255
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
256 #If a gene doesn't have any insertions, sets mean/var to 0.10 and Xs out sd/se, plus leaves count through removed blank because there were no reads.
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
257
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
258 else:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
259 if (arguments.marked and locus in marked_set):
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
260 output.append([locus, "0.10", "0.10", "X", "X", gene, "", "", "", "", "M", "\n"])
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
261 else:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
262 output.append([locus, "0.10", "0.10", "X", "X", gene, "", "", "", "", "\n"])
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
263
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
264 #Writes the aggregated fitness file
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
265
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
266 with open(arguments.summary, "wb") as csvfile:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
267 writer = csv.writer(csvfile)
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
268 writer.writerows(output)
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
269
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
270 #If finding missing genes is not requested, just finds the aggregate w / count / standard deviation / standard error of the insertions within each gene, and writes them to a file, plus marks the genes requested
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
271 #This is never called through Galaxy since finding missing genes is just better than not finding them.
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
272
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
273 else:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
274 output = [["Locus","W","Count","SD","SE","M\n"]]
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
275 for gene in gene_summary.keys():
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
276 sum = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
277 num = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
278 average = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
279 if "w" not in gene_summary[gene]:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
280 continue
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
281 for i in gene_summary[gene]["w"]:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
282 sum += i
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
283 num += 1
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
284 average = sum/num
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
285 xminusxbars = 0
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
286 for i in w:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
287 xminusxbars += (i-average)**2
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
288 if num > 1:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
289 sd = math.sqrt(xminusxbars/(num-1))
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
290 se = sd / math.sqrt(num)
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
291 if (arguments.marked and locus in marked_set):
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
292 output.append([gene, average, num, sd, se, "M", "\n"])
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
293 else:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
294 output.append([gene, average, num, sd, se, "\n"])
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
295 with open(arguments.summary, "wb") as csvfile:
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
296 writer = csv.writer(csvfile)
5ff57a3d0af2 Uploaded
kaymccoy
parents:
diff changeset
297 writer.writerows(output)