annotate calc_fitness.py @ 4:9aeda6cd07dc draft

Uploaded
author kaymccoy
date Thu, 11 Aug 2016 18:34:33 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
1 # A translation of calc_fitness.pl into python! For analysis of Tn-Seq.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
2 # This script requires BioPython, which in turn has a good number of dependencies (some optional but very helpful).
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
3 # How to install BioPython and a list of its dependencies can be found here: http://biopython.org/DIST/docs/install/Installation.html
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
4 # To see what future edits / tests are planned for this script, search for the phrase "in the future".
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
5
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
6
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
7
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
8
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
9
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
10
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
11
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
12
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
13
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
14
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
15 ##### ARGUMENTS #####
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
16
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
17 def print_usage():
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
18 print "\n" + "You are missing one or more required flags. A complete list of flags accepted by calc_fitness is as follows:" + "\n\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
19 print "\033[1m" + "Required" + "\033[0m" + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
20 print "-ref" + "\t\t" + "The name of the reference genome file, in GenBank format." + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
21 print "-t1" + "\t\t" + "The name of the bowtie mapfile from time 1." + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
22 print "-t2" + "\t\t" + "The name of the bowtie mapfile from time 2." + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
23 print "-out" + "\t\t" + "Name of a file to enter the .csv output." + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
24 print "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
25 print "\033[1m" + "Optional" + "\033[0m" + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
26 print "-expansion" + "\t\t" + "Expansion factor (default: 250)" + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
27 print "-d" + "\t\t" + "All reads being analyzed are downstream of the transposon" + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
28 print "-reads1" + "\t\t" + "The number of reads to be used to calculate the correction factor for time 0." + "\n\t\t" + "(default counted from bowtie output)" + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
29 print "-reads2" + "\t\t" + "The number of reads to be used to calculate the correction factor for time 6." + "\n\t\t" + "(default counted from bowtie output)" + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
30 print "-cutoff" + "\t\t" + "Discard any positions where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
31 print "-cutoff2" + "\t\t" + "Discard any positions within the normalization genes where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
32 print "-strand" + "\t\t" + "Use only the specified strand (+ or -) when counting transcripts (default: both)" + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
33 print "-normalize" + "\t" + "A file that contains a list of genes that should have a fitness of 1" + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
34 print "-maxweight" + "\t" + "The maximum weight a transposon gene can have in normalization calculations" + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
35 print "-multiply" + "\t" + "Multiply all fitness scores by a certain value (e.g., the fitness of a knockout). You should normalize the data." + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
36 print "-ef" + "\t\t" + "Exclude insertions that occur in the first N amount (%) of gene--becuase may not affect gene function." + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
37 print "-el" + "\t\t" + "Exclude insertions in the last N amount (%) of the gene--considering truncation may not affect gene function." + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
38 print "-wig" + "\t\t" + "Create a wiggle file for viewing in a genome browser. Provide a filename." + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
39 print "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
40
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
41 import argparse
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
42 parser = argparse.ArgumentParser()
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
43 parser.add_argument("-ref", action="store", dest="ref_genome")
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
44 parser.add_argument("-t1", action="store", dest="mapfile1")
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
45 parser.add_argument("-t2", action="store", dest="mapfile2")
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
46 parser.add_argument("-out", action="store", dest="outfile")
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
47 parser.add_argument("-out2", action="store", dest="outfile2")
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
48 parser.add_argument("-expansion", action="store", dest="expansion_factor")
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
49 parser.add_argument("-d", action="store", dest="downstream")
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
50 parser.add_argument("-reads1", action="store", dest="reads1")
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
51 parser.add_argument("-reads2", action="store", dest="reads2")
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
52 parser.add_argument("-cutoff", action="store", dest="cutoff")
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
53 parser.add_argument("-cutoff2", action="store", dest="cutoff2")
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
54 parser.add_argument("-strand", action="store", dest="usestrand")
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
55 parser.add_argument("-normalize", action="store", dest="normalize")
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
56 parser.add_argument("-maxweight", action="store", dest="max_weight")
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
57 parser.add_argument("-multiply", action="store", dest="multiply")
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
58 parser.add_argument("-ef", action="store", dest="exclude_first")
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
59 parser.add_argument("-el", action="store", dest="exclude_last")
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
60 parser.add_argument("-wig", action="store", dest="wig")
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
61 arguments = parser.parse_args()
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
62
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
63 # Checks that all the required arguments have actually been entered
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
64 # The reference genome is required to find what gene each insertion falls within, the mapfiles are required because the relative number of insertions between time 2 and time 1 are used to calculate each insertion location's fitness, and the outfile is required so you... get an outfile.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
65
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
66 if (not arguments.ref_genome or not arguments.mapfile1 or not arguments.mapfile2 or not arguments.outfile):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
67 print_usage()
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
68 quit()
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
69
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
70 # Sets the default value of the expansion factor to 250, which is not a particularly meaningful number; the expansion factor just needs to have some value so that rough fitness calculations can be run.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
71 # Measuring and inputting your own expansion factor is highly recommended, as without it the fitness calculations will not be nearly as accurate. See the equations used to calculate fitness if you're confused about that.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
72
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
73 if (not arguments.expansion_factor):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
74 arguments.expansion_factor = 250
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
75
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
76 # Sets the default maximum weight of a transposon gene; 75 is similarly not a particularly meaningful number.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
77
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
78 if (not arguments.max_weight):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
79 arguments.max_weight = 75
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
80
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
81 # Sets the default value of cutoff to 0; cutoff exists to discard positions with a low number of counted transcripts, because fitnesses calculated from them may not be very accurate, by the same reasoning that studies with low sample sizes are innacurate.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
82
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
83 if (not arguments.cutoff):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
84 arguments.cutoff = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
85
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
86 # Sets the default value of cutoff2 to 10; cutoff2 exists to discard positions within normalization genes with a low number of counted transcripts, because fitnesses calculated from them similarly may not be very accurate.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
87 # This only has an effect if it's larger than cutoff, since the normalization step references a list of insertions already affected by cutoff.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
88
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
89 if (not arguments.cutoff2):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
90 arguments.cutoff2 = 10
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
91
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
92 #Sets the default value of strand to "both", indicating you'll use reads from both Watson & Crick strands.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
93
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
94 if (not arguments.usestrand):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
95 arguments.usestrand = "both"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
96
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
97
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
98
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
99
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
100
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
101
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
102 ##### PARSING THE REFERENCE GENOME #####
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
103
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
104 def get_time():
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
105 import datetime
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
106 return datetime.datetime.now().time()
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
107 print "\n" + "Starting: " + str(get_time()) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
108
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
109 # Uses Biopython's SeqIO to parse the reference genome file for features; these come from its feature table, which lists all known features and their relevance - if they're a gene, a recognizable repeat, etc.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
110 # This is the reason that all reference genome files must be in standard GenBank format!
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
111
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
112 from Bio import SeqIO
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
113 import os.path
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
114 handle = open(arguments.ref_genome, "rU")
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
115 for record in SeqIO.parse(handle, "genbank"):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
116 refname = record.id
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
117 features = record.features
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
118 handle.close()
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
119
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
120 # Makes a dictionary out of each feature that's a gene - with its gene name, start location, end location, and strand as keys to their values. Then makes a list out of all those dictionaries for ease of accessing later on.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
121
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
122 feature_list = []
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
123 for feature in features:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
124 if feature.type == "gene":
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
125 gene = feature.qualifiers["locus_tag"]
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
126 strand = feature.location.strand
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
127 start = float(feature.location.start)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
128 end = float(feature.location.end)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
129
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
130 # Exclude_first and exclude_last are used here to exclude whatever percentage of the genes you like from calculations; e.g. a value of 0.1 for exclude_last would exclude the last 10% of all genes!
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
131 # This can be useful because insertions at the very start or end of genes often don't actually break its function.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
132
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
133 if (arguments.exclude_first):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
134 start += (end - start) * float(arguments.exclude_first)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
135 if (arguments.exclude_last):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
136 end -= (end - start) * float(arguments.exclude_last)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
137 feature_dictionary = {"gene": gene, "start": start, "end": end, "strand": strand}
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
138 feature_list.append(feature_dictionary)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
139
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
140 print "Done generating feature lookup: " + str(get_time()) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
141
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
142
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
143
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
144
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
145
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
146
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
147
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
148
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
149
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
150
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
151 ##### PARSING THE MAPFILES #####
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
152
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
153 with open(arguments.mapfile1) as file:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
154 r1 = file.readlines()
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
155 with open(arguments.mapfile2) as file:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
156 r2 = file.readlines()
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
157
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
158 # When called, goes through each line of the mapfile (each line is a compressed read) to find the strand, count, and position of the read. It may be helpful to look at how the mapfiles are formatted to understand how this code finds them.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
159 # The strand of a read is + (Watson) or - (Crick).
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
160 # The count is the number of times a particular read was actually sequenced (counted by fastx_collapser before calc_fitness.py is even called).
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
161 # The position is the position of the sequence when alligned to the reference genome (found by bowtie, also before calc_fitness.py is called). This is the position of the FIRST nucleotide in the sequence, when mapped.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
162 # Also records the total numbers of reads for the plus and minus strands, and the number of plus and minus sites from the mapfiles - not for fitness calculations, just for reference, as they're printed later.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
163
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
164 def read_mapfile(reads):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
165 plus_total = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
166 minus_total = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
167 plus_counts = {"total": 0, "sites": 0}
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
168 minus_counts = {"total": 0, "sites": 0}
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
169 for read in reads:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
170 if "-" in read.split()[0]:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
171 strand = read.split()[1]
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
172 count = float(read.split()[0].split("-")[1])
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
173 position = float(read.split()[3])
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
174 else:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
175 continue
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
176
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
177 # If for some reason you want to skip all reads from one of the strands - for example, if you wanted to compare the two strands - that's done here.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
178
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
179 if arguments.usestrand != "both" and strand != arguments.usestrand:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
180 continue
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
181
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
182 # Makes a dictionary for the + strand, with each insert position as a key and the number of insertions there as its corresponding value.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
183 # Here we run into a tricky problem, in that the actual location of an insertion would be right before the position of the first nucleotide in a flanking sequence "following" the insertion, but right after the last nucleotide in a flanking sequence "preceding" the insertion.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
184 # Thus the location of an insertion from a following flanking sequence would be the same as the mapped position of the sequence, but the location of an insertion from a preceding flanking sequence would be its mapped position + the length of the flanking sequence - 2.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
185 # The -2 comes from a fake "TA" in the read, caused by MmeI cutting unevenly at an "AT" site so that an extra "TA" is created, artifically adding +2 to the flanking sequence length.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
186 # However, it is currently impossible to distinguish between following and preceding flanking sequences, and so we split the difference by adding (the length of the sequence - 2) to only the + strand reads.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
187 # In the future we will try not adding anything to both, or adding 1/2(the sequence length -2) to both + an - strands, to see if that improves calculations.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
188 # The 'if arguments.downstream' option is there for a different method than the standard, in which reads are only taken from downstream of the transposon.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
189
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
190 if (strand == "+"):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
191 sequence_length = len(read.split()[4])
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
192 if arguments.downstream:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
193 position += 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
194 else:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
195 position += (sequence_length - 2)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
196 plus_counts["total"] += count
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
197 plus_counts["sites"] += 1
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
198 if position in plus_counts:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
199 plus_counts[position] += count
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
200 else:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
201 plus_counts[position] = count
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
202
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
203 # Makes a dictionary for the - strand, with each insert position as a key and the number of insertions there as its corresponding value.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
204
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
205 else:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
206 minus_counts["total"] += count
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
207 minus_counts["sites"] += 1
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
208 if position in minus_counts:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
209 minus_counts[position] += count
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
210 else:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
211 minus_counts[position] = count
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
212 return (plus_counts, minus_counts)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
213
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
214 # Calls read_mapfile(reads) to parse arguments.reads1 and arguments.reads2 (your reads from t1 and t2).
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
215
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
216 (plus_ref_1, minus_ref_1) = read_mapfile(r1)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
217 print "Read first file: " + str(get_time()) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
218 (plus_ref_2, minus_ref_2) = read_mapfile(r2)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
219 print "Read second file: " + str(get_time()) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
220
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
221 # Prints the number of + and - reads from reads1 and reads2, as well as the number of + and - sites found from the mapfiles, for reference when debugging etc.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
222 # The number of sites is the length of a given dictionary of sites - 1 because its last key, "total", isn't actually a site.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
223
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
224 print "Reads:" + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
225 print "1: + " + str(plus_ref_1["total"]) + " - " + str(minus_ref_1["total"]) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
226 print "2: + " + str(plus_ref_2["total"]) + " - " + str(minus_ref_2["total"]) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
227 print "Sites:" + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
228 print "1: + " + str(plus_ref_1["sites"]) + " - " + str(minus_ref_1["sites"]) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
229 print "2: + " + str(plus_ref_2["sites"]) + " - " + str(minus_ref_2["sites"]) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
230
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
231
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
232
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
233
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
234
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
235
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
236
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
237
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
238
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
239
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
240 ##### FITNESS CALCULATIONS #####
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
241
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
242 # If reads1 and reads2 weren't specified in the command line, sets them as the total number of reads (found in read_mapfile())
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
243
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
244 if not arguments.reads1:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
245 arguments.reads1 = plus_ref_1["total"] + minus_ref_1["total"]
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
246 if not arguments.reads2:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
247 arguments.reads2 = plus_ref_2["total"] + minus_ref_2["total"]
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
248
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
249 # Calculates the correction factors for reads from t1 and t2; cfactor1 and cfactor2 are the number of reads from t1 and t2 respectively divided by total, which is the average number of reads between the two.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
250 # For example, if there were more reads from t2 than t1 - in a 6:4 ratio - cfactor2 would be 1.2 and cfactor1 would be 0.8
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
251 # This is used later on to correct for pipetting errors, or any other error that would cause unequal amounts of DNA from t1 and t2 to be sequenced so that an unequal amount of reads is produced
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
252 # However, because slightly more or less DNA shouldn't change the % frequency of a mutation (it would only change the absolute number of counts) this may not be necessary; in the future we'll try removing this.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
253
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
254 total = (float(arguments.reads1) + float(arguments.reads2))/2
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
255 cfactor1 = float(arguments.reads1)/total
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
256 cfactor2 = float(arguments.reads2)/total
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
257 print "Cfactor 1: " + str(cfactor1) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
258 print "Cfactor 2: " + str(cfactor2) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
259 import math
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
260 import csv
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
261 results = [["position", "strand", "count_1", "count_2", "ratio", "mt_freq_t1", "mt_freq_t2", "pop_freq_t1", "pop_freq_t2", "gene", "D", "W", "nW"]]
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
262 genic = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
263 total_inserts = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
264 with open(arguments.ref_genome, "r") as file:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
265 firstline = file.readline()
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
266 genomelength = firstline.split()[2]
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
267 i = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
268 while i < float(genomelength):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
269
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
270 # At each possible location for an insertion in the genome, counts the number of actual insertions at t1 and which strand(s) the corresponding reads came from.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
271
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
272 c1 = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
273 if i in plus_ref_1:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
274 c1 = float(plus_ref_1[i])
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
275 strand = "+/"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
276 if i in minus_ref_1:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
277 c1 += float(minus_ref_1[i])
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
278 strand = "b/"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
279 elif i in minus_ref_1:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
280 c1 = float(minus_ref_1[i])
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
281 strand = "-/"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
282
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
283 # If there were no insertions at a certain location at t1 just continues to the next location; there can't be any comparison to make between t1 and t2 if there are no t1 insertions!
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
284
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
285 else:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
286 i += 1
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
287 continue
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
288
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
289 # At each location where there was an insertion at t1, counts the number of insertions at t2 and which strand(s) the corresponding reads came from.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
290
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
291 c2 = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
292 if i in plus_ref_2:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
293 c2 = float(plus_ref_2[i])
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
294 if i in minus_ref_2:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
295 c2 += float(minus_ref_2[i])
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
296 strand += "b"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
297 else:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
298 strand += "+"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
299 elif i in minus_ref_2:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
300 c2 = float(minus_ref_2[i])
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
301 strand += "-"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
302
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
303 # Corrects with cfactor1 and cfactor2; as noted above, this may not be necessary and we'll see what removing it does in the future. Possible alternate / non-corrected code is shown in the comments just below.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
304
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
305 c1 /= cfactor1
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
306 if c2 != 0:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
307 c2 /= cfactor2
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
308 ratio = c2/c1
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
309 else:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
310 c2 = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
311 ratio = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
312
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
313 # if c2 != 0:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
314 # ratio = c2/c1
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
315 # else:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
316 # ratio = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
317
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
318 # Passes by all insertions with a number of reads smaller than the cutoff, as they may lead to inaccurate fitness calculations.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
319
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
320 if (c1 + c2)/2 < float(arguments.cutoff):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
321 i+= 1
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
322 continue
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
323
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
324 # Calculates each insertion's frequency within the populations at t1 and t2. If cfactor correction were removed, "total" would have to be changed to "float(arguments.reads1)" and "float(arguments.reads2)" respectively, as shown in the comments just below.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
325
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
326 mt_freq_t1 = c1/total
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
327 mt_freq_t2 = c2/total
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
328 # mt_freq_t1 = c1/float(arguments.reads1)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
329 # mt_freq_t2 = c2/float(arguments.reads2)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
330 pop_freq_t1 = 1 - mt_freq_t1
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
331 pop_freq_t2 = 1 - mt_freq_t2
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
332
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
333 # Calculates each insertion's fitness! This is from the fitness equation log((frequency of mutation @ time 2 / frequency of mutation @ time 1)*expansion factor)/log((frequency of population without the mutation @ time 2 / frequency of population without the mutation @ time 1)*expansion factor)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
334
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
335 w = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
336 if mt_freq_t2 != 0:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
337 top_w = math.log(mt_freq_t2*(float(arguments.expansion_factor)/mt_freq_t1))
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
338 bot_w = math.log(pop_freq_t2*(float(arguments.expansion_factor)/pop_freq_t1))
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
339 w = top_w/bot_w
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
340
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
341 # Checks which gene locus the insertion falls within, and records that.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
342
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
343 gene = ''
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
344 for feature_dictionary in feature_list:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
345 if feature_dictionary["start"] <= i and feature_dictionary["end"] >= i:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
346 gene = "".join(feature_dictionary["gene"])
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
347 genic += 1
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
348 break
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
349 total_inserts += 1
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
350
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
351 # Writes all relevant information on each insertion and its fitness to a cvs file: the location of the insertion, its strand, c1, c2, etc. (the variable names are self-explanatiory)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
352 # w is written twice, because the second w will be normalized if normalization is called for, thus becoming nW.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
353
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
354 row = [i, strand, c1, c2, ratio, mt_freq_t1, mt_freq_t2, pop_freq_t1, pop_freq_t2, gene, arguments.expansion_factor, w, w]
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
355 results.append(row)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
356 i += 1
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
357 with open(arguments.outfile, "wb") as csvfile:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
358 writer = csv.writer(csvfile)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
359 writer.writerows(results)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
360
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
361 # Prints the time when done comparing mapfiles - that is, when done with all fitness calculations but normalization - as well as the number of genic inserts and number of total inserts.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
362
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
363 print "Done comparing mapfiles " + str(get_time()) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
364 print "Genic: " + str(genic) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
365 print "Total: " + str(total_inserts) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
366
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
367
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
368
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
369
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
370
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
371
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
372
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
373
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
374
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
375
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
376 ##### NORMALIZATION #####
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
377
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
378 # If making a WIG file is requested in the arguments, starts a string to be added to and then written to the WIG file with a typical WIG file header.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
379 # The header is just in a typical WIG file format; if you'd like to look into this more UCSC has notes on formatting WIG files on their site.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
380
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
381 if (arguments.wig):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
382 wigstring = "track type=wiggle_0 name=" + arguments.wig + "\n" + "variableStep chrom=" + refname + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
383
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
384 # If a file's given for normalization, starts normalization; this corrects for anything that would cause all the fitness values to be too high or too low.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
385
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
386 if (arguments.normalize):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
387
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
388 # Makes a list of the genes in the normalization file, which should all be transposon genes (these naturally ought to have a fitness value of exactly 1, because transposons are generally non-coding DNA)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
389
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
390 with open(arguments.normalize) as file:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
391 transposon_genes = file.read().splitlines()
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
392 print "Normalize genes loaded" + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
393 blank_ws = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
394 sum = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
395 count = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
396 weights = []
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
397 scores = []
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
398 for list in results:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
399
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
400 # Finds all insertions within one of the normalization genes that also have a w value; gets their c1 and c2 values (the number of insertions at t1 and t2) and takes the average of that!
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
401 # The average is later used as the "weight" of an insertion location's fitness - if it's had more insertions, it should weigh proportionally more towards the average fitness of insertions within the normalization genes.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
402
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
403 if list[9] != '' and list[9] in transposon_genes and list[11]:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
404 c1 = list[2]
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
405 c2 = list[3]
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
406 score = list[11]
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
407 avg = (c1 + c2)/2
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
408
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
409 # Skips over those insertion locations with too few insertions - their fitness values are less accurate because they're based on such small insertion numbers.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
410
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
411 if float(c1) >= float(arguments.cutoff2):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
412 #if avg >= float(arguments.cutoff2):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
413
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
414 # Sets a max weight, to prevent insertion location scores with huge weights from unbalancing the normalization.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
415
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
416 if (avg >= float(arguments.max_weight)):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
417 avg = float(arguments.max_weight)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
418
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
419 # Tallies how many w values are 0 within the blank_ws value; you might get many transposon genes with a w value of 0 if a bottleneck occurs, which is especially common with in vivo experiments.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
420 # For example, when studying a nasal infection in a mouse model, what bacteria "sticks" and is able to survive and what bacteria is swallowed and killed or otherwise flushed out tends to be a matter
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
421 # of chance not fitness; all mutants with an insertion in a specific transposon gene could be flushed out by chance!
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
422
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
423 if score == 0:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
424 blank_ws += 1
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
425
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
426 # Adds the fitness values of the insertions within normalization genes together and increments count so their average fitness (sum/count) can be calculated later on
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
427
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
428 sum += score
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
429 count += 1
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
430
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
431 # Records the weights of the fitness values of the insertion locations in corresponding lists - for example, weights[2] would be the weight of the fitness value at score[2]
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
432
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
433 weights.append(avg)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
434 scores.append(score)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
435
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
436 print str(list[9]) + " " + str(score) + " " + str(c1)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
437
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
438 # Counts and removes all "blank" fitness values of normalization genes - those that = 0 - because they most likely don't really have a fitness value of 0, and you just happened to not get any reads from that location at t2.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
439
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
440 blank_count = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
441 original_count = len(scores)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
442 i = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
443 while i < original_count:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
444 w_value = scores[i]
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
445 if w_value == 0:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
446 blank_count += 1
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
447 weights.pop[i]
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
448 scores.pop[i]
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
449 i-=1
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
450 i += 1
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
451
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
452 # If no normalization genes can pass the cutoff, normalization cannot occur, so this ends the script advises the user to try again and lower cutoff and/or cutoff2.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
453
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
454 if len(scores) == 0:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
455 print 'ERROR: The normalization genes do not have enough reads to pass cutoff and/or cutoff2; please lower one or both of those arguments.' + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
456 quit()
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
457
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
458 # Prints the number of of blank fitness values found and removed for reference. Writes the percentage to a file so it can be referenced for aggregate analysis.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
459
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
460 pc_blank_normals = float(blank_count) / float(original_count)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
461 print "# blank out of " + str(original_count) + ": " + str(pc_blank_normals) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
462 with open(arguments.outfile2, "w") as f:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
463 f.write("blanks: " + str(pc_blank_normals) + "\n" + "total: " + str(total) + "\n" + "refname: " + refname)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
464
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
465
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
466 # Finds "average" - the average fitness value for an insertion within the transposon genes - and "weighted_average" - the average fitness value for an insertion within the transposon genes weighted by how many insertions each had.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
467
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
468 average = sum / count
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
469 i = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
470 weighted_sum = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
471 weight_sum = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
472 while i < len(weights):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
473 weighted_sum += weights[i]*scores[i]
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
474 weight_sum += weights[i]
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
475 i += 1
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
476 weighted_average = weighted_sum/weight_sum
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
477
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
478 # Prints the regular average, weighted average, and total insertions for reference
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
479
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
480 print "Normalization step:" + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
481 print "Regular average: " + str(average) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
482 print "Weighted Average: " + str(weighted_average) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
483 print "Total Insertions: " + str(count) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
484
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
485 # The actual normalization happens here; every fitness score is divided by the average fitness found for genes that should have a value of 1.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
486 # For example, if the average fitness for genes was too low overall - let's say 0.97 within the normalization geness - every fitness would be proportionally raised.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
487
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
488 old_ws = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
489 new_ws = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
490 wcount = 0
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
491 for list in results:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
492 if list[11] == 'W':
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
493 continue
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
494 new_w = float(list[11])/weighted_average
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
495
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
496 # Sometimes you want to multiply all the fitness values by a constant; this does that.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
497 # For example you might multiply all the values by a constant for a genetic interaction screen - where Tn-Seq is performed as usual except there's one background knockout all the mutants share. This is
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
498 # because independent mutations should have a fitness value that's equal to their individual fitness values multipled, but related mutations will deviate from that; to find those deviations you'd multiply
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
499 # all the fitness values from mutants from a normal library by the fitness of the background knockout and compare that to the fitness values found from the knockout library!
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
500
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
501 if arguments.multiply:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
502 new_w *= float(arguments.multiply)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
503
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
504 # Records the old w score for reference, and adds it to a total sum of all w scores (so that the old w mean and new w mean can be printed later).
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
505
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
506 if float(list[11]) > 0:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
507 old_ws += float(list[11])
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
508 new_ws += new_w
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
509 wcount += 1
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
510
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
511 # Writes the new w score into the results list of lists.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
512
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
513 list[12] = new_w
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
514
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
515 # Adds a line to wiglist for each insertion position, with the insertion position and its new w value.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
516
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
517 if (arguments.wig):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
518 wigstring += str(list[0]) + " " + str(new_w) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
519
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
520 # Prints the old w mean and new w mean for reference.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
521
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
522 old_w_mean = old_ws / wcount
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
523 new_w_mean = new_ws / wcount
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
524 print "Old W Average: " + str(old_w_mean) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
525 print "New W Average: " + str(new_w_mean) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
526
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
527 # Overwrites the old file with the normalized file.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
528
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
529 with open(arguments.outfile, "wb") as csvfile:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
530 writer = csv.writer(csvfile)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
531 writer.writerows(results)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
532
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
533 # If a WIG file was requested, actually creates the WIG file and writes wiglist to it
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
534 # So what's written here is the WIG header plus each insertion position and it's new w value if normalization were called for, and each insertion position and its unnormalized w value if normalization were not called for.
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
535
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
536 if (arguments.wig):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
537 if (arguments.normalize):
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
538 with open(arguments.wig, "wb") as wigfile:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
539 wigfile.write(wigstring)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
540 else:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
541 for list in results:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
542 wigstring += str(list[0]) + " " + str(list[11]) + "\n"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
543 with open(arguments.wig, "wb") as wigfile:
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
544 wigfile.write(wigstring)
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
545
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
546
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
547
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
548
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
549
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
550 #FITOPSpy="-ef .0 -el .10 -cutoff 0"
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
551 #
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
552 #python ../script/calc_fitness.py $FITOPSpy -wig gview/py_L1_2394eVI_Gluc.wig -t1 alignments/L1_2394eVI_Input.map -t2 alignments/L1_2394eVI_Gluc_T2.map -ref=NC_003028b2.gbk -out results/py_L1_2394eVI_Gluc.csv -out2 results/py_2_L1_2394eVI_Gluc.txt -expansion 675 -normalize tigr4_normal.txt
9aeda6cd07dc Uploaded
kaymccoy
parents:
diff changeset
553 #python ../script/calc_fitness.py $FITOPSpy -wig gview/py_L3_2394eVI_Gluc.wig -t1 alignments/L3_2394eVI_Input.map -t2 alignments/L3_2394eVI_Gluc_T2.map -ref=NC_003028b2.gbk -out results/py_L3_2394eVI_Gluc.csv -out2 results/py_2_L3_2394eVI_Gluc.txt -expansion 244 -normalize tigr4_normal.txt