Mercurial > repos > kaymccoy > aggregate_fitness
comparison aggregate.py @ 9:95d062ea06c3 draft
Uploaded
author | kaymccoy |
---|---|
date | Mon, 01 May 2017 22:56:32 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
8:6587180179b6 | 9:95d062ea06c3 |
---|---|
1 # A translation of aggregate.pl into python! For analysis of Tn-Seq. | |
2 # This script requires BioPython just like calc_fitness.py, so you need it installed along with its dependencies if you want to run these scripts on your own. | |
3 # How to install BioPython and a list of its dependencies can be found here: http://biopython.org/DIST/docs/install/Installation.html | |
4 # K. McCoy | |
5 | |
6 | |
7 | |
8 | |
9 | |
10 | |
11 | |
12 | |
13 | |
14 ##### ARGUMENTS ##### | |
15 | |
def print_usage():
    """Print the command-line usage summary for aggregate.py to stdout.

    Every chunk is printed with its own trailing newline(s) plus the newline
    that print itself appends, which gives the help text its spacing.
    """
    bold_on = "\033[1m"
    bold_off = "\033[0m"

    # (flag, description) pairs listed under the "Optional" heading.
    optional_flags = (
        ("-c", "Check for missing genes in the data set - provide a reference genome in genbank format. Missing genes will be sent to stdout."),
        ("-m", "Place a mark in an extra column for this set of genes. Provide a file with a list of genes seperated by newlines."),
        ("-x", "Cutoff: Don't include fitness scores with average counts (c1+c2)/2 < x (default: 0)"),
        ("-b", "Bottleneck value: The percentage of insertions randomly lost, which will be discounted for all genes (for example, 20% would be entered as 0.20; default 0.0)"),
        ("-f", "An in-between file carrying information on the blank count found from calc_fitness; one of two ways to pass a blank count to this script"),
        ("-w", "Use weighted algorithm to calculate averages, variance, sd, se"),
        ("-l", "Weight ceiling: maximum value to use as a weight (default: 999,999)"),
    )

    chunks = [
        "Aggregate.py's usage is as follows:" + "\n\n",
        bold_on + "Required" + bold_off + "\n",
        "-o" + "\t\t" + "Output file for aggregated data." + "\n",
        "\n",
        bold_on + "Optional" + bold_off + "\n",
    ]
    for flag, description in optional_flags:
        chunks.append(flag + "\t\t" + description + "\n")
    chunks.append("\n")
    chunks.append("All remainder arguements will be treated as fitness files (those files created by calc_fitness.py)" + "\n")
    chunks.append("\n")

    # A single parenthesised argument prints identically under the
    # Python 2 print statement and the print function.
    for chunk in chunks:
        print(chunk)
32 | |
33 import argparse | |
# Every option takes a value and is stored under the listed attribute name.
parser = argparse.ArgumentParser()
for flag, destination in (
    ("-o", "summary"),
    ("-c", "find_missing"),
    ("-m", "marked"),
    ("-x", "cutoff"),
    ("-b", "blank_pc"),
    ("-f", "blank_file"),
    ("-w", "weighted"),
    ("-l", "weight_ceiling"),
):
    parser.add_argument(flag, action="store", dest=destination)

# Everything after the flags is collected as the list of fitness files.
parser.add_argument("fitnessfiles", nargs=argparse.REMAINDER)

arguments = parser.parse_args()

# The output file and at least one fitness file are mandatory; without them
# the usage text is shown and the script stops.
if not arguments.summary:
    print("\n" + "You are missing a value for the -o flag. ")
    print_usage()
    quit()

if not arguments.fitnessfiles:
    print("\n" + "You are missing fitness file(s); these should be entered immediately after all the flags. ")
    print_usage()
    quit()

# 999,999 is a trivial placeholder number used when no weight ceiling is given.
arguments.weight_ceiling = arguments.weight_ceiling or 999999

# Cutoff exists to discard positions with a low number of counted transcripts,
# because their fitness may not be as accurate - for the same reasoning that
# studies with low sample sizes can be inaccurate.
arguments.cutoff = arguments.cutoff or 0

# If a blank-count file (the txt output of calc_fit / consol) was given, the
# second whitespace-separated token of its first line replaces any -b value.
if arguments.blank_file:
    with open(arguments.blank_file) as blank_handle:
        first_line = blank_handle.read().splitlines()[0]
    arguments.blank_pc = float(first_line.split()[1])

arguments.blank_pc = arguments.blank_pc or 0
76 | |
77 | |
78 | |
79 | |
80 | |
81 ##### SUBROUTINES ##### | |
82 | |
83 # A subroutine that calculates the average, variance, standard deviation (sd), and standard error (se) of a group of scores; for use when aggregating scores by gene later on | |
84 | |
85 import math | |
def unweighted_average(scores):
    """Return the mean, sample variance, sd and se of a group of scores.

    Parameters:
        scores: sequence of fitness values, given as numbers or numeric
            strings. Falsy entries (such as "" or None) are counted as 0.0.
            The sequence itself is not modified.

    Returns:
        A tuple (average, variance, sd, se). The variance is the sample
        variance (sum of squared deviations divided by n - 1) and is 0 when
        there are fewer than two scores. An empty sequence gives
        (0.0, 0, 0.0, 0.0) instead of dividing by zero.
    """
    values = [float(score) if score else 0.0 for score in scores]
    num = len(values)
    if num == 0:
        return (0.0, 0, 0.0, 0.0)

    # Explicit accumulation: the surrounding script rebinds the name `sum`
    # at module level, so the builtin cannot be relied on here.
    total = 0.0
    for value in values:
        total += value
    average = total / num

    # Sum of squared deviations from the mean, taken over every value.
    xminusxbars = 0.0
    for value in values:
        xminusxbars += (value - average) ** 2

    if num <= 1:
        variance = 0
    else:
        variance = xminusxbars / (num - 1)
    sd = math.sqrt(variance)
    se = sd / math.sqrt(num)
    return (average, variance, sd, se)
107 | |
108 # A subroutine that calculates the weighted average, variance, standard deviation (sd), and standard error (se) of a group of scores; the weights come from the number of reads each insertion location has | |
109 # For use when aggregating scores by gene later on, if the weighted argument is called | |
110 | |
def weighted_average(scores,weights):
    """Return the weighted mean, variance, sd and se of a group of scores.

    Parameters:
        scores: sequence of fitness values, given as numbers or numeric
            strings. Falsy entries (such as "" or None) are counted as 0.0.
            The sequence itself is not modified.
        weights: sequence of weights, one per score, as numbers or numeric
            strings. Scores and weights are paired positionally.

    Returns:
        A tuple (weighted_average, weighted_variance, weighted_stdev,
        weighted_stder). The variance is the weight-normalised sum of squared
        deviations from the weighted mean, and the standard error divides the
        standard deviation by sqrt(len(scores)). When the weights sum to zero
        the result is (0, 0, 0, 0), so callers can always unpack four values.
    """
    pairs = [(float(score) if score else 0.0, float(weight))
             for score, weight in zip(scores, weights)]

    top = 0.0
    bottom = 0.0
    for score, weight in pairs:
        top += weight * score
        bottom += weight
    if bottom == 0:
        # No usable weight: return zeros in the same four-value shape as the
        # normal result rather than a bare scalar.
        return (0, 0, 0, 0)
    mean = top / bottom

    # Weighted sum of squared deviations, normalised by the same total weight.
    top = 0.0
    for score, weight in pairs:
        top += weight * (score - mean) ** 2
    weighted_variance = top / bottom

    weighted_stdev = math.sqrt(weighted_variance)
    weighted_stder = weighted_stdev / math.sqrt(len(scores))
    return (mean, weighted_variance, weighted_stdev, weighted_stder)
138 | |
139 | |
140 | |
141 | |
142 | |
143 | |
144 | |
145 | |
146 | |
147 | |
148 ##### AGGREGATION / CALCULATIONS ##### | |
149 | |
#Reads the genes which should be marked in the final aggregate file into a set,
#one gene per line, so that the per-gene membership checks later on are cheap

import os.path
if arguments.marked:
    with open(arguments.marked) as marked_file:
        marked_set = set(marked_file.read().splitlines())
156 | |
157 #Creates a dictionary of dictionaries to contain a summary of all genes and their fitness values | |
158 #The fitness values and weights match up, so that the weight of gene_summary[locus]["w"][2] would be gene_summary[locus]["s"][2] | |
159 | |
160 import csv | |
gene_summary = {}

# The cutoff and ceiling may arrive as strings from the command line; convert
# them once instead of once per row.
min_avg_count = float(arguments.cutoff)
max_weight = float(arguments.weight_ceiling)

for eachfile in arguments.fitnessfiles:
    with open(eachfile) as csvfile:
        for line in csv.reader(csvfile):
            # A completely blank row has no columns to index.
            if not line:
                continue
            locus = line[9]
            w = line[12]
            # Rows whose fitness column holds the literal text 'nW' are skipped.
            if w == 'nW':
                continue
            # An empty fitness cell is stored as 0 so that later float()
            # conversions succeed and the insertion is counted as blank.
            if not w:
                w = 0
            c1 = float(line[2])
            c2 = float(line[3])
            avg = (c1+c2)/2
            if avg < min_avg_count:
                continue
            # The average count doubles as the weight, capped at the ceiling.
            if avg > max_weight:
                avg = max_weight
            entry = gene_summary.setdefault(locus, {"w" : [], "s": []})
            entry["w"].append(w)
            entry["s"].append(avg)
183 | |
184 #If finding any missing gene loci is requested in the arguments, starts out by loading all the known features from a genbank file | |
185 | |
186 from Bio import SeqIO | |
def _is_blank(w):
    """Return True when a stored fitness value is empty or numerically zero."""
    return (not w) or float(w) == 0


def _finish_row(cells, locus):
    """Return cells as an output row, inserting the "M" mark for marked loci."""
    if arguments.marked and locus in marked_set:
        return cells + ["M", "\n"]
    return cells + ["\n"]


if arguments.find_missing:
    output = [["locus","mean","var","sd","se","gene","Total","Blank","Not Blank","Blank Removed","M\n"]]

    # Gather the features of every record in the genbank file, so that a
    # multi-record reference is not reduced to its last record and an empty
    # file simply produces no gene rows.
    features = []
    with open(arguments.find_missing, "rU") as handle:
        for record in SeqIO.parse(handle, "genbank"):
            features.extend(record.features)

    for feature in features:
        # Only gene features are reported.
        if feature.type != "gene":
            continue
        locus = "".join(feature.qualifiers["locus_tag"])
        gene = ""
        if "gene" in feature.qualifiers:
            gene = "".join(feature.qualifiers["gene"])

        # A gene without any insertions: mean/var are set to 0.10, sd/se are
        # X'd out, and count through removed are left blank.
        if locus not in gene_summary:
            output.append(_finish_row([locus, "0.10", "0.10", "X", "X", gene, "", "", "", ""], locus))
            continue

        fitness_values = gene_summary[locus]["w"]
        weights = gene_summary[locus]["s"]

        # Count blank (zero) fitness scores against non-blank ones.
        blank_ws = 0
        for w in fitness_values:
            if _is_blank(w):
                blank_ws += 1
        count = len(fitness_values)
        num = count - blank_ws

        # Drop the first to_remove blank scores together with their weights.
        # The lists are updated in place so both stay index-aligned.
        removed = 0
        to_remove = int(float(arguments.blank_pc)*count)
        kept_w = []
        kept_s = []
        for w, s in zip(fitness_values, weights):
            if removed != to_remove and _is_blank(w):
                removed += 1
            else:
                kept_w.append(w)
                kept_s.append(s)
        fitness_values[:] = kept_w
        weights[:] = kept_s

        if num == 0:
            # Every fitness value in the gene was blank.
            output.append(_finish_row([locus, "0.10", "0.10", "X", "X", gene, count, blank_ws, num, removed], locus))
        else:
            if not arguments.weighted:
                stats = unweighted_average(fitness_values)
            else:
                stats = weighted_average(fitness_values, weights)
            # Guard against a scalar 0 result (no usable weight) so the row
            # can still be written.
            if stats == 0:
                stats = (0, 0, 0, 0)
            (average, variance, stdev, stderr) = stats
            output.append(_finish_row([locus, average, variance, stdev, stderr, gene, count, blank_ws, num, removed], locus))

#If finding missing genes is not requested, just finds the aggregate w / count / standard deviation / standard error of the insertions within each gene, plus marks the genes requested
#This is never called through Galaxy since finding missing genes is just better than not finding them.

else:
    output = [["Locus","W","Count","SD","SE","M\n"]]
    for gene in gene_summary:
        if "w" not in gene_summary[gene]:
            continue
        # Stored values may be strings read from the CSV; empty ones count as 0.
        values = [float(w) if w else 0.0 for w in gene_summary[gene]["w"]]
        num = len(values)
        if num == 0:
            continue

        total = 0.0
        for value in values:
            total += value
        average = total/num

        xminusxbars = 0.0
        for value in values:
            xminusxbars += (value-average)**2

        # With a single value there is no spread to estimate.
        if num > 1:
            sd = math.sqrt(xminusxbars/(num-1))
            se = sd / math.sqrt(num)
        else:
            sd = 0.0
            se = 0.0
        output.append(_finish_row([gene, average, num, sd, se], gene))

#Writes the aggregated fitness file

with open(arguments.summary, "wb") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(output)