annotate aggregate.py @ 5:74ef799815e5 draft

Uploaded
author kaymccoy
date Thu, 11 Aug 2016 18:30:39 -0400
parents 9dd574c52765
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
1 # A translation of aggregate.pl into python! For analysis of Tn-Seq.
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
2 # This script requires BioPython just like calc_fitness.py, so you need it installed along with its dependencies if you want to run these scripts on your own.
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
3 # How to install BioPython and a list of its dependencies can be found here: http://biopython.org/DIST/docs/install/Installation.html
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
4
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
5
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
6
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
7
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
8
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
9
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
10
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
11
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
12
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
13
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
14 ##### ARGUMENTS #####
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
15
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
16 # Prints basic instructions on / options for using this code; called when the user forgets to enter an output file or fitness file(s)
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
17
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
18 def print_usage():
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
19 print "Aggregate.py's usage is as follows:" + "\n\n"
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
20 print "\033[1m" + "Required" + "\033[0m" + "\n"
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
21 print "-o" + "\t\t" + "Output file for aggregated data." + "\n"
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
22 print "\n"
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
23 print "\033[1m" + "Optional" + "\033[0m" + "\n"
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
24 print "-c" + "\t\t" + "Check for missing genes in the data set - provide a reference genome in genbank format. Missing genes will be sent to stdout." + "\n"
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
25 print "-m" + "\t\t" + "Place a mark in an extra column for this set of genes. Provide a file with a list of genes seperated by newlines." + "\n"
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
26 print "-x" + "\t\t" + "Cutoff: Don't include fitness scores with average counts (c1+c2)/2 < x (default: 0)" + "\n"
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
27 print "-b" + "\t\t" + "Blanks: Exclude -b % of blank fitness scores (scores where c2 = 0) (default: 0 = 0%)" + "\n"
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
28 print "-w" + "\t\t" + "Use weighted algorithm to calculate averages, variance, sd, se" + "\n"
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
29 print "-l" + "\t\t" + "Weight ceiling: maximum value to use as a weight (default: 999,999)" + "\n"
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
30 print "\n"
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
31 print "All remainder arguements will be treated as fitness files (those files created by calc_fitness.py)" + "\n"
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
32 print "\n"
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
33
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
34 # Turns the arguments entered in the command line into variables so that they can actually be called.
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
35
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
36 import argparse
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
37 parser = argparse.ArgumentParser()
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
38 parser.add_argument("-o", action="store", dest="summary")
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
39 parser.add_argument("-c", action="store", dest="find_missing")
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
40 parser.add_argument("-m", action="store", dest="marked")
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
41 parser.add_argument("-x", action="store", dest="cutoff")
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
42 parser.add_argument("-b", action="store", dest="blank_pc")
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
43 parser.add_argument("-w", action="store", dest="weighted")
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
44 parser.add_argument("-l", action="store", dest="weight_ceiling")
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
45 parser.add_argument("fitnessfiles", nargs=argparse.REMAINDER)
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
46
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
47 #Shortens the names of those variables created from the arguments - from parser.parse_args().ref_genome to arguments.ref_genome, for example.
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
48
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
49 arguments = parser.parse_args()
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
50
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
51 #Checks that the required arguments have been entered; if not informs the user they need to provide a name for the output file / the fitness file(s) and print's aggregate.py's usage
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
52
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
53 if not arguments.summary:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
54 print "\n" + "You are missing a value for the -o flag. "
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
55 print_usage()
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
56 quit()
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
57
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
58 if not arguments.fitnessfiles:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
59 print "\n" + "You are missing fitness file(s); these should be entered immediately after all the flags. "
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
60 print_usage()
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
61 quit()
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
62
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
63 #Sets the maximum weight of a fitness value, if that wasn't specified in the command line.
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
64
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
65 if (not arguments.weight_ceiling):
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
66 arguments.max_weight = 999999
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
67
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
68 #Sets the cutoff to a default value of 0 if it wasn't specified in the command line.
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
69 #Cutoff exists to discard positions with a low number of counted transcripts, because their fitness may not be as accurate - for the same reasoning that studies with low sample sizes can be innacurate.
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
70
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
71 if (not arguments.cutoff):
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
72 arguments.cutoff = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
73
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
74 #Sets the % of blank fitness values to exclude to a default value of 0 if it wasn't specified in the command line. This can be found in the 2nd output of calc_fitness.
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
75
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
76 if (not arguments.blank_pc):
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
77 arguments.blank_pc = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
78
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
79 # gets blank_pc from the output file of calc_fit / consol
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
80
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
81 if arguments.blank_pc:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
82 with open(arguments.blank_pc) as file:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
83 blank_pc = file.read().splitlines()
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
84 arguments.blank_pc = float(blank_pc[1].split()[1])
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
85
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
86
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
87
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
88
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
89
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
90
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
91
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
92 ##### SUBROUTINES #####
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
93
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
94 #A subroutine that calculates the average, variance, standard deviation (sd), and standard error (se) of a group of scores; for use when aggregating scores by gene later on
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
95
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
96 import math
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
97 def average(scores):
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
98 sum = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
99 num = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
100
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
101 #Finds the average of the scores
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
102
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
103 for i in scores:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
104 sum += i
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
105 num += 1
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
106 average = sum/num
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
107
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
108 #Finds the variance of the scores
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
109
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
110 xminusxbars = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
111 for i in scores:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
112 xminusxbars += (i - average)**2
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
113 variance = xminusxbars/(num-1)
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
114
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
115 #Finds the standard deviation and standard error of the scores; then the average / variance / standard deviation / standard error are returned
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
116
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
117 sd = math.sqrt(variance)
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
118 se = sd / math.sqrt(num)
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
119 return (average, variance, sd, se)
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
120
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
121 #A subroutine that calculates the weighted average, variance, standard deviation (sd), and standard error (se) of a group of scores; the weights come from the number of reads each insertion location has
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
122 #For use when aggregating scores by gene later on, if the weighted argument is called
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
123
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
124 def weighted_average(scores,weights):
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
125 sum = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
126 weighted_average = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
127 weighted_variance = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
128 top = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
129 bottom = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
130 i = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
131
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
132 #Finds weighted average of the scores
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
133
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
134 while i < len(weights):
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
135 if not scores[i]:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
136 scores[i] = 0.0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
137 top += float(weights[i])*float(scores[i])
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
138 bottom += float(weights[i])
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
139 i += 1
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
140 if bottom == 0:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
141 return 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
142 weighted_average = top/bottom
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
143
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
144 #Finds weighted variance of the scores
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
145
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
146 top = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
147 bottom = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
148 i = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
149 while i < len(weights):
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
150 top += float(weights[i]) * (float(scores[i]) - weighted_average)**2
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
151 bottom += float(weights[i])
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
152 i += 1
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
153 weighted_variance = top/bottom
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
154
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
155 #Finds weighted standard deviation and standard error of the scores; then the weighted average / variance / standard deviation / standard error are returned
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
156
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
157 weighted_stdev = math.sqrt(weighted_variance)
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
158 weighted_stder = weighted_stdev/math.sqrt(len(scores))
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
159 return (weighted_average, weighted_variance, weighted_stdev, weighted_stder)
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
160
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
161
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
162
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
163
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
164
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
165
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
166
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
167
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
168
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
169
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
170 ##### AGGREGATION / CALCULATIONS #####
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
171
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
172 #Reads the genes which should be marked in the final aggregate file into an array
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
173
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
174 import os.path
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
175 if arguments.marked:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
176 with open(arguments.marked) as file:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
177 marked_set = file.read().splitlines()
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
178
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
179 #Creates a dictionary of dictionaries to contain a summary of all genes and their fitness values
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
180 #Each gene is its own dictionary with w and s as keys for the various fitnesses and weights of the insertion locations within those genes respectively
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
181 #The fitness values and weights match up, so that the weight of gene_summary[locus]["w"][2] would be gene_summary[locus]["s"][2]
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
182
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
183 import csv
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
184 gene_summary = {}
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
185 for eachfile in arguments.fitnessfiles:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
186 with open(eachfile) as csvfile:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
187 lines = csv.reader(csvfile)
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
188 for line in lines:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
189 locus = line[9]
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
190 w = line[12]
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
191 if w == 'nW':
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
192 continue
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
193 if not w:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
194 w == 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
195 c1 = float(line[2])
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
196 c2 = float(line[3])
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
197 avg = (c1+c2)/2
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
198 if avg < float(arguments.cutoff):
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
199 continue
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
200 if avg > float(arguments.weight_ceiling):
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
201 avg = arguments.weight_ceiling
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
202 if locus not in gene_summary:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
203 gene_summary[locus] = {"w" : [], "s": []}
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
204 gene_summary[locus]["w"].append(w)
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
205 gene_summary[locus]["s"].append(avg)
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
206
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
207 #If finding any missing gene loci is requested in the arguments, starts out by loading all the known features from a genbank file
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
208
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
209 from Bio import SeqIO
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
210 if (arguments.find_missing):
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
211 output = [["locus","mean","var","sd","se","gene","Total","Blank","Not Blank","Blank Removed","M\n"]]
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
212 handle = open(arguments.find_missing, "rU")
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
213 for record in SeqIO.parse(handle, "genbank"):
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
214 refname = record.id
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
215 features = record.features
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
216 handle.close()
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
217
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
218 #Goes through the features to find which are genes
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
219
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
220 for feature in features:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
221 gene = ""
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
222 if feature.type == "gene":
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
223 locus = "".join(feature.qualifiers["locus_tag"])
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
224 if "gene" in feature.qualifiers:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
225 gene = "".join(feature.qualifiers["gene"])
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
226 else:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
227 continue
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
228
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
229 #Goes through the fitness scores of insertions within each gene, and removes whatever % of blank fitness scores were requested along with their corresponding weights
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
230
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
231 sum = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
232 num = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
233 avgsum = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
234 blank_ws = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
235 i = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
236 if locus in gene_summary.keys():
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
237 for w in gene_summary[locus]["w"]:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
238 if float(w) == 0:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
239 blank_ws += 1
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
240 else:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
241 sum += float(w)
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
242 num += 1
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
243 count = num + blank_ws
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
244 removed = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
245 to_remove = int(float(arguments.blank_pc)*count)
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
246 if blank_ws > 0:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
247 i = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
248 while i < len(gene_summary[locus]["w"]):
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
249 if removed == to_remove:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
250 break
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
251 if not w:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
252 del gene_summary[locus]["w"][i]
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
253 del gene_summary[locus]["s"][i]
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
254 removed += 1
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
255 i -= 1
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
256 i += 1
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
257
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
258 #If all the fitness values within a gene are empty, sets mean/var to 0.10 and Xs out sd/se; marks the gene if that's requested
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
259
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
260 if num == 0:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
261 if (arguments.marked and locus in marked_set):
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
262 output.append([locus, "0.10", "0.10", "X", "X", gene, count, blank_ws, num, removed, "M", "\n"])
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
263 else:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
264 output.append([locus, "0.10", "0.10", "X", "X", gene, count, blank_ws, num, removed, "\n"])
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
265
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
266 #Otherwise calls average() or weighted_average() to find the aggregate w / count / standard deviation / standard error of the insertions within each gene; marks the gene if that's requested
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
267
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
268 else:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
269 if not arguments.weighted:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
270 (average, variance, stdev, stderr) = average(gene_summary[locus]["w"])
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
271 else:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
272 (average, variance, stdev, stderr) = weighted_average(gene_summary[locus]["w"],gene_summary[locus]["s"])
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
273 if (arguments.marked and locus in marked_set):
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
274 output.append([locus, average, variance, stdev, stderr, gene, count, blank_ws, num, removed, "M", "\n"])
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
275 else:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
276 output.append([locus, average, variance, stdev, stderr, gene, count, blank_ws, num, removed, "\n"])
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
277
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
278 #If a gene doesn't have any insertions, sets mean/var to 0.10 and Xs out sd/se, plus leaves count through removed blank because there were no reads.
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
279
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
280 else:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
281 if (arguments.marked and locus in marked_set):
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
282 output.append([locus, "0.10", "0.10", "X", "X", gene, "", "", "", "", "M", "\n"])
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
283 else:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
284 output.append([locus, "0.10", "0.10", "X", "X", gene, "", "", "", "", "\n"])
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
285
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
286 #Writes the aggregated fitness file
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
287
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
288 with open(arguments.summary, "wb") as csvfile:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
289 writer = csv.writer(csvfile)
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
290 writer.writerows(output)
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
291
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
292 #If finding missing genes is not requested, just finds the aggregate w / count / standard deviation / standard error of the insertions within each gene, and writes them to a file, plus marks the genes requested
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
293 #This is never called through Galaxy since finding missing genes is better than not finding them.
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
294
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
295 else:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
296 output = [["Locus","W","Count","SD","SE","M\n"]]
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
297 for gene in gene_summary.keys():
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
298 sum = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
299 num = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
300 average = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
301 if "w" not in gene_summary[gene]:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
302 continue
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
303 for i in gene_summary[gene]["w"]:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
304 sum += i
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
305 num += 1
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
306 average = sum/num
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
307 xminusxbars = 0
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
308 for i in w:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
309 xminusxbars += (i-average)**2
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
310 if num > 1:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
311 sd = math.sqrt(xminusxbars/(num-1))
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
312 se = sd / math.sqrt(num)
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
313 if (arguments.marked and locus in marked_set):
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
314 output.append([gene, average, num, sd, se, "M", "\n"])
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
315 else:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
316 output.append([gene, average, num, sd, se, "\n"])
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
317 with open(arguments.summary, "wb") as csvfile:
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
318 writer = csv.writer(csvfile)
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
319 writer.writerows(output)
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
320
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
321
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
322 #Test: python ../script/aggregate.py -m tigr4_normal.txt -w 1 -x 10 -l 50 -b 0 -c NC_003028b2.gbk -o aggregates/L3_2394eVI_GlucTEST.csv results/L3_2394eVI_Gluc.csv
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
323
9dd574c52765 Uploaded
kaymccoy
parents:
diff changeset
324 #Perl Test: perl ../script/aggregate.pl -m tigr4_normal.txt -w 1 -x 10 -l 50 -b 0 -c NC_003028b2.gbk -o aggregates/L3_2394eVI_Gluc.csv results/L3_2394eVI_Gluc.csv