comparison calc_fitness.py @ 4:9aeda6cd07dc draft

Uploaded
author kaymccoy
date Thu, 11 Aug 2016 18:34:33 -0400
1 # A translation of calc_fitness.pl into python! For analysis of Tn-Seq.
2 # This script requires BioPython, which in turn has a good number of dependencies (some optional but very helpful).
3 # How to install BioPython and a list of its dependencies can be found here: http://biopython.org/DIST/docs/install/Installation.html
4 # To see what future edits / tests are planned for this script, search for the phrase "in the future".
5
6
7
8
9
10
11
12
13
14
15 ##### ARGUMENTS #####
16
17 def print_usage():
18 print "\n" + "You are missing one or more required flags. A complete list of flags accepted by calc_fitness is as follows:" + "\n\n"
19 print "\033[1m" + "Required" + "\033[0m" + "\n"
20 print "-ref" + "\t\t" + "The name of the reference genome file, in GenBank format." + "\n"
21 print "-t1" + "\t\t" + "The name of the bowtie mapfile from time 1." + "\n"
22 print "-t2" + "\t\t" + "The name of the bowtie mapfile from time 2." + "\n"
23 print "-out" + "\t\t" + "Name of a file to enter the .csv output." + "\n"
24 print "\n"
25 print "\033[1m" + "Optional" + "\033[0m" + "\n"
26 print "-expansion" + "\t\t" + "Expansion factor (default: 250)" + "\n"
27 print "-d" + "\t\t" + "All reads being analyzed are downstream of the transposon" + "\n"
28 print "-reads1" + "\t\t" + "The number of reads to be used to calculate the correction factor for time 0." + "\n\t\t" + "(default counted from bowtie output)" + "\n"
29 print "-reads2" + "\t\t" + "The number of reads to be used to calculate the correction factor for time 6." + "\n\t\t" + "(default counted from bowtie output)" + "\n"
30 print "-cutoff" + "\t\t" + "Discard any positions where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\n"
31 print "-cutoff2" + "\t\t" + "Discard any positions within the normalization genes where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\n"
32 print "-strand" + "\t\t" + "Use only the specified strand (+ or -) when counting transcripts (default: both)" + "\n"
33 print "-normalize" + "\t" + "A file that contains a list of genes that should have a fitness of 1" + "\n"
34 print "-maxweight" + "\t" + "The maximum weight a transposon gene can have in normalization calculations" + "\n"
35 print "-multiply" + "\t" + "Multiply all fitness scores by a certain value (e.g., the fitness of a knockout). You should normalize the data." + "\n"
36 print "-ef" + "\t\t" + "Exclude insertions that occur in the first N amount (%) of gene--becuase may not affect gene function." + "\n"
37 print "-el" + "\t\t" + "Exclude insertions in the last N amount (%) of the gene--considering truncation may not affect gene function." + "\n"
38 print "-wig" + "\t\t" + "Create a wiggle file for viewing in a genome browser. Provide a filename." + "\n"
39 print "\n"
40
41 import argparse
42 parser = argparse.ArgumentParser()
43 parser.add_argument("-ref", action="store", dest="ref_genome")
44 parser.add_argument("-t1", action="store", dest="mapfile1")
45 parser.add_argument("-t2", action="store", dest="mapfile2")
46 parser.add_argument("-out", action="store", dest="outfile")
47 parser.add_argument("-out2", action="store", dest="outfile2")
48 parser.add_argument("-expansion", action="store", dest="expansion_factor")
49 parser.add_argument("-d", action="store", dest="downstream")
50 parser.add_argument("-reads1", action="store", dest="reads1")
51 parser.add_argument("-reads2", action="store", dest="reads2")
52 parser.add_argument("-cutoff", action="store", dest="cutoff")
53 parser.add_argument("-cutoff2", action="store", dest="cutoff2")
54 parser.add_argument("-strand", action="store", dest="usestrand")
55 parser.add_argument("-normalize", action="store", dest="normalize")
56 parser.add_argument("-maxweight", action="store", dest="max_weight")
57 parser.add_argument("-multiply", action="store", dest="multiply")
58 parser.add_argument("-ef", action="store", dest="exclude_first")
59 parser.add_argument("-el", action="store", dest="exclude_last")
60 parser.add_argument("-wig", action="store", dest="wig")
61 arguments = parser.parse_args()
62
63 # Checks that all the required arguments have actually been entered
64 # The reference genome is required to find which gene each insertion falls within, the mapfiles are required because the relative number of insertions at time 2 versus time 1 is used to calculate each insertion location's fitness, and the outfile is required so you... get an outfile.
65
66 if (not arguments.ref_genome or not arguments.mapfile1 or not arguments.mapfile2 or not arguments.outfile):
67 print_usage()
68 quit()
69
70 # Sets the default value of the expansion factor to 250, which is not a particularly meaningful number; the expansion factor just needs to have some value so that rough fitness calculations can be run.
71 # Measuring and inputting your own expansion factor is highly recommended, as without it the fitness calculations will not be nearly as accurate. See the equations used to calculate fitness if you're confused about that.
72
73 if (not arguments.expansion_factor):
74 arguments.expansion_factor = 250
75
76 # Sets the default maximum weight of a transposon gene; 75 is similarly not a particularly meaningful number.
77
78 if (not arguments.max_weight):
79 arguments.max_weight = 75
80
81 # Sets the default value of cutoff to 0; cutoff exists to discard positions with a low number of counted transcripts, because fitnesses calculated from them may not be very accurate - by the same reasoning that studies with small sample sizes are inaccurate.
82
83 if (not arguments.cutoff):
84 arguments.cutoff = 0
85
86 # Sets the default value of cutoff2 to 10; cutoff2 exists to discard positions within normalization genes with a low number of counted transcripts, because fitnesses calculated from them similarly may not be very accurate.
87 # This only has an effect if it's larger than cutoff, since the normalization step references a list of insertions already affected by cutoff.
88
89 if (not arguments.cutoff2):
90 arguments.cutoff2 = 10
91
92 # Sets the default value of strand to "both", indicating you'll use reads from both Watson & Crick strands.
93
94 if (not arguments.usestrand):
95 arguments.usestrand = "both"
96
97
98
99
100
101
102 ##### PARSING THE REFERENCE GENOME #####
103
104 def get_time():
105 import datetime
106 return datetime.datetime.now().time()
107 print "\n" + "Starting: " + str(get_time()) + "\n"
108
109 # Uses Biopython's SeqIO to parse the reference genome file for features; these come from its feature table, which lists all known features and their relevance - if they're a gene, a recognizable repeat, etc.
110 # This is the reason that all reference genome files must be in standard GenBank format!
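# For reference, an illustrative (made-up) GenBank gene feature of the kind the loop below picks up:
#      gene            190..1674
#                      /locus_tag="SP_0001"
# Biopython exposes this as feature.type == "gene", feature.qualifiers["locus_tag"] == ["SP_0001"],
# feature.location.start == 189 (Biopython starts are 0-based), feature.location.end == 1674, and
# feature.location.strand == 1.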
111
112 from Bio import SeqIO
113 import os.path
114 handle = open(arguments.ref_genome, "rU")
115 for record in SeqIO.parse(handle, "genbank"):
116 refname = record.id
117 features = record.features
118 handle.close()
119
120 # Makes a dictionary out of each feature that's a gene - with its gene name, start location, end location, and strand as keys to their values. Then makes a list out of all those dictionaries for ease of accessing later on.
121
122 feature_list = []
123 for feature in features:
124 if feature.type == "gene":
125 gene = feature.qualifiers["locus_tag"]
126 strand = feature.location.strand
127 start = float(feature.location.start)
128 end = float(feature.location.end)
129
130 # Exclude_first and exclude_last are used here to exclude whatever fraction of each gene you like from calculations; e.g. a value of 0.1 for exclude_last would exclude insertions in the last 10% of each gene!
131 # This can be useful because insertions at the very start or end of a gene often don't actually break its function.
132
133 if (arguments.exclude_first):
134 start += (end - start) * float(arguments.exclude_first)
135 if (arguments.exclude_last):
136 end -= (end - start) * float(arguments.exclude_last)
137 feature_dictionary = {"gene": gene, "start": start, "end": end, "strand": strand}
138 feature_list.append(feature_dictionary)
139
140 print "Done generating feature lookup: " + str(get_time()) + "\n"
141
142
143
144
145
146
147
148
149
150
151 ##### PARSING THE MAPFILES #####
152
153 with open(arguments.mapfile1) as file:
154 r1 = file.readlines()
155 with open(arguments.mapfile2) as file:
156 r2 = file.readlines()
157
158 # When called, goes through each line of the mapfile (each line is a compressed read) to find the strand, count, and position of the read. It may be helpful to look at how the mapfiles are formatted to understand how this code finds them.
159 # The strand of a read is + (Watson) or - (Crick).
160 # The count is the number of times a particular read was actually sequenced (counted by fastx_collapser before calc_fitness.py is even called).
161 # The position is the position of the sequence when aligned to the reference genome (found by bowtie, also before calc_fitness.py is called). This is the position of the FIRST nucleotide in the sequence, when mapped.
162 # Also records the total numbers of reads for the plus and minus strands, and the number of plus and minus sites from the mapfiles - not for fitness calculations, just for reference, as they're printed later.
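# An illustrative (made-up) mapfile line, to show which whitespace-separated fields are pulled out below:
#   1-248    +    NC_003028    10346    TACGTTGCAATTGGCAAC    IIIIIIIIIIIIIIIIII
# Field 0 ("1-248") is the fastx_collapser read name, whose part after the "-" is the count (248 here);
# field 1 is the strand, field 3 is the mapped position, and field 4 is the read sequence. Reads whose
# names contain no "-" are skipped.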
163
164 def read_mapfile(reads):
165 plus_total = 0
166 minus_total = 0
167 plus_counts = {"total": 0, "sites": 0}
168 minus_counts = {"total": 0, "sites": 0}
169 for read in reads:
170 if "-" in read.split()[0]:
171 strand = read.split()[1]
172 count = float(read.split()[0].split("-")[1])
173 position = float(read.split()[3])
174 else:
175 continue
176
177 # If for some reason you want to skip all reads from one of the strands - for example, if you wanted to compare the two strands - that's done here.
178
179 if arguments.usestrand != "both" and strand != arguments.usestrand:
180 continue
181
182 # Makes a dictionary for the + strand, with each insert position as a key and the number of insertions there as its corresponding value.
183 # Here we run into a tricky problem, in that the actual location of an insertion would be right before the position of the first nucleotide in a flanking sequence "following" the insertion, but right after the last nucleotide in a flanking sequence "preceding" the insertion.
184 # Thus the location of an insertion from a following flanking sequence would be the same as the mapped position of the sequence, but the location of an insertion from a preceding flanking sequence would be its mapped position + the length of the flanking sequence - 2.
185 # The -2 comes from a fake "TA" in the read, caused by MmeI cutting unevenly at an "AT" site so that an extra "TA" is created, artificially adding +2 to the flanking sequence length.
186 # However, it is currently impossible to distinguish between following and preceding flanking sequences, and so we split the difference by adding (the length of the sequence - 2) to only the + strand reads.
187 # In the future we will try not adding anything to either strand, or adding 1/2*(the sequence length - 2) to both the + and - strands, to see if that improves calculations.
188 # The 'if arguments.downstream' option is there for a different method than the standard, in which reads are only taken from downstream of the transposon.
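# A worked (illustrative) example of the adjustment below: a 20 nt flanking read that bowtie maps to
# position 1000 on the + strand is recorded at insertion position 1000 + (20 - 2) = 1018, while a
# - strand read mapping to position 1000 is recorded at 1000.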
189
190 if (strand == "+"):
191 sequence_length = len(read.split()[4])
192 if arguments.downstream:
193 position += 0
194 else:
195 position += (sequence_length - 2)
196 plus_counts["total"] += count
197 plus_counts["sites"] += 1
198 if position in plus_counts:
199 plus_counts[position] += count
200 else:
201 plus_counts[position] = count
202
203 # Makes a dictionary for the - strand, with each insert position as a key and the number of insertions there as its corresponding value.
204
205 else:
206 minus_counts["total"] += count
207 minus_counts["sites"] += 1
208 if position in minus_counts:
209 minus_counts[position] += count
210 else:
211 minus_counts[position] = count
212 return (plus_counts, minus_counts)
213
214 # Calls read_mapfile(reads) to parse r1 and r2 (the reads from your t1 and t2 mapfiles).
215
216 (plus_ref_1, minus_ref_1) = read_mapfile(r1)
217 print "Read first file: " + str(get_time()) + "\n"
218 (plus_ref_2, minus_ref_2) = read_mapfile(r2)
219 print "Read second file: " + str(get_time()) + "\n"
220
221 # Prints the number of + and - reads from reads1 and reads2, as well as the number of + and - sites found from the mapfiles, for reference when debugging etc.
222 # The number of sites reported here comes from the "sites" counters incremented in read_mapfile() - once per collapsed read processed - rather than from the length of the position dictionaries.
223
224 print "Reads:" + "\n"
225 print "1: + " + str(plus_ref_1["total"]) + " - " + str(minus_ref_1["total"]) + "\n"
226 print "2: + " + str(plus_ref_2["total"]) + " - " + str(minus_ref_2["total"]) + "\n"
227 print "Sites:" + "\n"
228 print "1: + " + str(plus_ref_1["sites"]) + " - " + str(minus_ref_1["sites"]) + "\n"
229 print "2: + " + str(plus_ref_2["sites"]) + " - " + str(minus_ref_2["sites"]) + "\n"
230
231
232
233
234
235
236
237
238
239
240 ##### FITNESS CALCULATIONS #####
241
242 # If reads1 and reads2 weren't specified in the command line, sets them as the total number of reads (found in read_mapfile())
243
244 if not arguments.reads1:
245 arguments.reads1 = plus_ref_1["total"] + minus_ref_1["total"]
246 if not arguments.reads2:
247 arguments.reads2 = plus_ref_2["total"] + minus_ref_2["total"]
248
249 # Calculates the correction factors for reads from t1 and t2; cfactor1 and cfactor2 are the number of reads from t1 and t2 respectively divided by total, which is the average number of reads between the two.
250 # For example, if there were more reads from t2 than t1 - in a 6:4 ratio - cfactor2 would be 1.2 and cfactor1 would be 0.8
251 # This is used later on to correct for pipetting errors, or any other error that would cause unequal amounts of DNA from t1 and t2 to be sequenced, producing an unequal number of reads.
252 # However, because slightly more or less DNA shouldn't change the % frequency of a mutation (it would only change the absolute number of counts) this may not be necessary; in the future we'll try removing this.
253
254 total = (float(arguments.reads1) + float(arguments.reads2))/2
255 cfactor1 = float(arguments.reads1)/total
256 cfactor2 = float(arguments.reads2)/total
257 print "Cfactor 1: " + str(cfactor1) + "\n"
258 print "Cfactor 2: " + str(cfactor2) + "\n"
259 import math
260 import csv
261 results = [["position", "strand", "count_1", "count_2", "ratio", "mt_freq_t1", "mt_freq_t2", "pop_freq_t1", "pop_freq_t2", "gene", "D", "W", "nW"]]
262 genic = 0
263 total_inserts = 0
264 with open(arguments.ref_genome, "r") as file:
265 firstline = file.readline()
266 genomelength = firstline.split()[2]
267 i = 0
268 while i < float(genomelength):
269
270 # At each possible location for an insertion in the genome, counts the number of actual insertions at t1 and which strand(s) the corresponding reads came from.
271
272 c1 = 0
273 if i in plus_ref_1:
274 c1 = float(plus_ref_1[i])
275 strand = "+/"
276 if i in minus_ref_1:
277 c1 += float(minus_ref_1[i])
278 strand = "b/"
279 elif i in minus_ref_1:
280 c1 = float(minus_ref_1[i])
281 strand = "-/"
282
283 # If there were no insertions at a certain location at t1 just continues to the next location; there can't be any comparison to make between t1 and t2 if there are no t1 insertions!
284
285 else:
286 i += 1
287 continue
288
289 # At each location where there was an insertion at t1, counts the number of insertions at t2 and which strand(s) the corresponding reads came from.
290
291 c2 = 0
292 if i in plus_ref_2:
293 c2 = float(plus_ref_2[i])
294 if i in minus_ref_2:
295 c2 += float(minus_ref_2[i])
296 strand += "b"
297 else:
298 strand += "+"
299 elif i in minus_ref_2:
300 c2 = float(minus_ref_2[i])
301 strand += "-"
302
303 # Corrects with cfactor1 and cfactor2; as noted above, this may not be necessary and we'll see what removing it does in the future. Possible alternate / non-corrected code is shown in the comments just below.
304
305 c1 /= cfactor1
306 if c2 != 0:
307 c2 /= cfactor2
308 ratio = c2/c1
309 else:
310 c2 = 0
311 ratio = 0
312
313 # if c2 != 0:
314 # ratio = c2/c1
315 # else:
316 # ratio = 0
317
318 # Passes by all insertions with a number of reads smaller than the cutoff, as they may lead to inaccurate fitness calculations.
319
320 if (c1 + c2)/2 < float(arguments.cutoff):
321 i+= 1
322 continue
323
324 # Calculates each insertion's frequency within the populations at t1 and t2. If cfactor correction were removed, "total" would have to be changed to "float(arguments.reads1)" and "float(arguments.reads2)" respectively, as shown in the comments just below.
325
326 mt_freq_t1 = c1/total
327 mt_freq_t2 = c2/total
328 # mt_freq_t1 = c1/float(arguments.reads1)
329 # mt_freq_t2 = c2/float(arguments.reads2)
330 pop_freq_t1 = 1 - mt_freq_t1
331 pop_freq_t2 = 1 - mt_freq_t2
332
333 # Calculates each insertion's fitness! This is from the fitness equation log((frequency of mutation @ time 2 / frequency of mutation @ time 1)*expansion factor)/log((frequency of population without the mutation @ time 2 / frequency of population without the mutation @ time 1)*expansion factor)
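# A worked (illustrative) example: with mt_freq_t1 = mt_freq_t2 = 1e-5 and the default expansion factor of 250,
# top_w = ln(250 * 1e-5/1e-5) = ln(250) and bot_w = ln(250 * 0.99999/0.99999) = ln(250), so w = 1 (neutral).
# If the mutant instead halves in frequency (mt_freq_t2 = 5e-6), top_w = ln(125) and w falls below 1.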
334
335 w = 0
336 if mt_freq_t2 != 0:
337 top_w = math.log(mt_freq_t2*(float(arguments.expansion_factor)/mt_freq_t1))
338 bot_w = math.log(pop_freq_t2*(float(arguments.expansion_factor)/pop_freq_t1))
339 w = top_w/bot_w
340
341 # Checks which gene locus the insertion falls within, and records that.
342
343 gene = ''
344 for feature_dictionary in feature_list:
345 if feature_dictionary["start"] <= i and feature_dictionary["end"] >= i:
346 gene = "".join(feature_dictionary["gene"])
347 genic += 1
348 break
349 total_inserts += 1
350
351 # Writes all relevant information on each insertion and its fitness to a csv file: the location of the insertion, its strand, c1, c2, etc. (the variable names are self-explanatory)
352 # w is written twice, because the second w will be normalized if normalization is called for, thus becoming nW.
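# An illustrative (made-up) output row, in the column order given in "results" above - assuming
# cfactor1 = cfactor2 = 1, total = 5,000,000 reads, and the default expansion factor of 250:
#   1018, "+/b", 250.0, 310.0, 1.24, 5e-05, 6.2e-05, 0.99995, 0.999938, "SP_0001", 250, 1.04, 1.04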
353
354 row = [i, strand, c1, c2, ratio, mt_freq_t1, mt_freq_t2, pop_freq_t1, pop_freq_t2, gene, arguments.expansion_factor, w, w]
355 results.append(row)
356 i += 1
357 with open(arguments.outfile, "wb") as csvfile:
358 writer = csv.writer(csvfile)
359 writer.writerows(results)
360
361 # Prints the time when done comparing mapfiles - that is, when done with all fitness calculations but normalization - as well as the number of genic inserts and number of total inserts.
362
363 print "Done comparing mapfiles " + str(get_time()) + "\n"
364 print "Genic: " + str(genic) + "\n"
365 print "Total: " + str(total_inserts) + "\n"
366
367
368
369
370
371
372
373
374
375
376 ##### NORMALIZATION #####
377
378 # If making a WIG file is requested in the arguments, starts a string to be added to and then written to the WIG file with a typical WIG file header.
379 # The header is just in a typical WIG file format; if you'd like to look into this more UCSC has notes on formatting WIG files on their site.
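# As an illustration (not real output), with "-wig example.wig" and a reference record named NC_003028.3,
# the header built below would read:
#   track type=wiggle_0 name=example.wig
#   variableStep chrom=NC_003028.3
# and each later line is just "<insertion position> <fitness>".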
380
381 if (arguments.wig):
382 wigstring = "track type=wiggle_0 name=" + arguments.wig + "\n" + "variableStep chrom=" + refname + "\n"
383
384 # If a file's given for normalization, starts normalization; this corrects for anything that would cause all the fitness values to be too high or too low.
385
386 if (arguments.normalize):
387
388 # Makes a list of the genes in the normalization file, which should all be transposon genes (these naturally ought to have a fitness value of exactly 1, because transposons are generally non-coding DNA)
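# The normalization file is expected to contain one locus tag per line, matching the locus_tag qualifiers
# parsed from the GenBank file above; a schematic (placeholder) example:
#   <locus_tag_of_transposon_gene_1>
#   <locus_tag_of_transposon_gene_2>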
389
390 with open(arguments.normalize) as file:
391 transposon_genes = file.read().splitlines()
392 print "Normalize genes loaded" + "\n"
393 blank_ws = 0
394 sum = 0
395 count = 0
396 weights = []
397 scores = []
398 for list in results:
399
400 # Finds all insertions within one of the normalization genes that also have a w value; gets their c1 and c2 values (the number of insertions at t1 and t2) and takes the average of that!
401 # The average is later used as the "weight" of an insertion location's fitness - if it's had more insertions, it should weigh proportionally more towards the average fitness of insertions within the normalization genes.
402
403 if list[9] != '' and list[9] in transposon_genes and list[11]:
404 c1 = list[2]
405 c2 = list[3]
406 score = list[11]
407 avg = (c1 + c2)/2
408
409 # Skips over those insertion locations with too few insertions - their fitness values are less accurate because they're based on such small insertion numbers.
410
411 if float(c1) >= float(arguments.cutoff2):
412 #if avg >= float(arguments.cutoff2):
413
414 # Sets a max weight, to prevent insertion location scores with huge weights from unbalancing the normalization.
415
416 if (avg >= float(arguments.max_weight)):
417 avg = float(arguments.max_weight)
418
419 # Tallies how many w values are 0 within the blank_ws value; you might get many transposon genes with a w value of 0 if a bottleneck occurs, which is especially common with in vivo experiments.
420 # For example, when studying a nasal infection in a mouse model, which bacteria "stick" and are able to survive and which bacteria are swallowed and killed or otherwise flushed out tends to be a matter
421 # of chance, not fitness; all mutants with an insertion in a specific transposon gene could be flushed out by chance!
422
423 if score == 0:
424 blank_ws += 1
425
426 # Adds the fitness values of the insertions within normalization genes together and increments count so their average fitness (sum/count) can be calculated later on
427
428 sum += score
429 count += 1
430
431 # Records the weights of the fitness values of the insertion locations in corresponding lists - for example, weights[2] would be the weight of the fitness value at scores[2]
432
433 weights.append(avg)
434 scores.append(score)
435
436 print str(list[9]) + " " + str(score) + " " + str(c1)
437
438 # Counts and removes all "blank" fitness values of normalization genes - those that = 0 - because they most likely don't really have a fitness value of 0, and you just happened to not get any reads from that location at t2.
439
440 blank_count = 0
441 original_count = len(scores)
442 i = 0
443 while i < len(scores):
444 w_value = scores[i]
445 if w_value == 0:
446 blank_count += 1
447 weights.pop(i)
448 scores.pop(i)
449 i -= 1
450 i += 1
451
452 # If no normalization genes can pass the cutoff, normalization cannot occur, so this ends the script and advises the user to try again with a lower cutoff and/or cutoff2.
453
454 if len(scores) == 0:
455 print 'ERROR: The normalization genes do not have enough reads to pass cutoff and/or cutoff2; please lower one or both of those arguments.' + "\n"
456 quit()
457
458 # Prints the number of blank fitness values found and removed, for reference. Writes that fraction (blanks / total normalization scores) to a file so it can be referenced for aggregate analysis.
459
460 pc_blank_normals = float(blank_count) / float(original_count)
461 print "# blank out of " + str(original_count) + ": " + str(pc_blank_normals) + "\n"
462 with open(arguments.outfile2, "w") as f:
463 f.write("blanks: " + str(pc_blank_normals) + "\n" + "total: " + str(total) + "\n" + "refname: " + refname)
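# The file written just above ends up looking like this (illustrative values):
#   blanks: 0.125
#   total: 5000000.0
#   refname: NC_003028.3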
464
465
466 # Finds "average" - the average fitness value for an insertion within the transposon genes - and "weighted_average" - the average fitness value for an insertion within the transposon genes weighted by how many insertions each had.
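# A small illustrative example of the difference: scores of [1.0, 0.9] with weights of [10, 50] give a regular
# average of 0.95 but a weighted average of (10*1.0 + 50*0.9)/(10 + 50) = 0.9167.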
467
468 average = sum / count
469 i = 0
470 weighted_sum = 0
471 weight_sum = 0
472 while i < len(weights):
473 weighted_sum += weights[i]*scores[i]
474 weight_sum += weights[i]
475 i += 1
476 weighted_average = weighted_sum/weight_sum
477
478 # Prints the regular average, weighted average, and total insertions for reference
479
480 print "Normalization step:" + "\n"
481 print "Regular average: " + str(average) + "\n"
482 print "Weighted Average: " + str(weighted_average) + "\n"
483 print "Total Insertions: " + str(count) + "\n"
484
485 # The actual normalization happens here; every fitness score is divided by the average fitness found for genes that should have a value of 1.
486 # For example, if the average fitness for genes was too low overall - let's say 0.97 within the normalization genes - every fitness would be proportionally raised.
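# (Illustrative: with weighted_average = 0.97, an insertion with w = 0.85 would be rescaled below to 0.85 / 0.97 ≈ 0.876.)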
487
488 old_ws = 0
489 new_ws = 0
490 wcount = 0
491 for list in results:
492 if list[11] == 'W':
493 continue
494 new_w = float(list[11])/weighted_average
495
496 # Sometimes you want to multiply all the fitness values by a constant; this does that.
497 # For example, you might multiply all the values by a constant for a genetic interaction screen - where Tn-Seq is performed as usual except there's one background knockout all the mutants share. This is
498 # because independent mutations should have a fitness value that's equal to their individual fitness values multiplied, but related mutations will deviate from that; to find those deviations you'd multiply
499 # all the fitness values from mutants from a normal library by the fitness of the background knockout and compare that to the fitness values found from the knockout library!
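# A worked (illustrative) example of that logic: if the background knockout alone has fitness 0.8 and a second
# mutation alone has fitness 0.9, independence predicts a double-mutant fitness of 0.8 * 0.9 = 0.72; a double
# mutant measuring well above or below 0.72 suggests a genetic interaction.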
500
501 if arguments.multiply:
502 new_w *= float(arguments.multiply)
503
504 # Records the old w score for reference, and adds it to a total sum of all w scores (so that the old w mean and new w mean can be printed later).
505
506 if float(list[11]) > 0:
507 old_ws += float(list[11])
508 new_ws += new_w
509 wcount += 1
510
511 # Writes the new w score into the results list of lists.
512
513 list[12] = new_w
514
515 # Adds a line to wigstring for each insertion position, with the insertion position and its new w value.
516
517 if (arguments.wig):
518 wigstring += str(list[0]) + " " + str(new_w) + "\n"
519
520 # Prints the old w mean and new w mean for reference.
521
522 old_w_mean = old_ws / wcount
523 new_w_mean = new_ws / wcount
524 print "Old W Average: " + str(old_w_mean) + "\n"
525 print "New W Average: " + str(new_w_mean) + "\n"
526
527 # Overwrites the old file with the normalized file.
528
529 with open(arguments.outfile, "wb") as csvfile:
530 writer = csv.writer(csvfile)
531 writer.writerows(results)
532
533 # If a WIG file was requested, actually creates the WIG file and writes wigstring to it.
534 # So what's written here is the WIG header plus each insertion position and its new w value if normalization were called for, or each insertion position and its unnormalized w value if normalization were not called for.
535
536 if (arguments.wig):
537 if (arguments.normalize):
538 with open(arguments.wig, "wb") as wigfile:
539 wigfile.write(wigstring)
540 else:
541 for list in results:
542 wigstring += str(list[0]) + " " + str(list[11]) + "\n"
543 with open(arguments.wig, "wb") as wigfile:
544 wigfile.write(wigstring)
545
546
547
548
549
550 #FITOPSpy="-ef .0 -el .10 -cutoff 0"
551 #
552 #python ../script/calc_fitness.py $FITOPSpy -wig gview/py_L1_2394eVI_Gluc.wig -t1 alignments/L1_2394eVI_Input.map -t2 alignments/L1_2394eVI_Gluc_T2.map -ref=NC_003028b2.gbk -out results/py_L1_2394eVI_Gluc.csv -out2 results/py_2_L1_2394eVI_Gluc.txt -expansion 675 -normalize tigr4_normal.txt
553 #python ../script/calc_fitness.py $FITOPSpy -wig gview/py_L3_2394eVI_Gluc.wig -t1 alignments/L3_2394eVI_Input.map -t2 alignments/L3_2394eVI_Gluc_T2.map -ref=NC_003028b2.gbk -out results/py_L3_2394eVI_Gluc.csv -out2 results/py_2_L3_2394eVI_Gluc.txt -expansion 244 -normalize tigr4_normal.txt