calculate_fitnesses: calc_fitness.py annotate

author	kaymccoy
date	Thu, 11 Aug 2016 18:34:33 -0400
parents
children

rev	line source
4 9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	1 # A translation of calc_fitness.pl into python! For analysis of Tn-Seq.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	2 # This script requires BioPython, which in turn has a good number of dependencies (some optional but very helpful).
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	3 # How to install BioPython and a list of its dependencies can be found here: http://biopython.org/DIST/docs/install/Installation.html
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	4 # To see what future edits / tests are planned for this script, search for the phrase "in the future".
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	5
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	6
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	7
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	8
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	9
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	10
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	11
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	12
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	13
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	14
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	15 ##### ARGUMENTS #####
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	16
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	17 def print_usage():
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	18 print "\n" + "You are missing one or more required flags. A complete list of flags accepted by calc_fitness is as follows:" + "\n\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	19 print "\033[1m" + "Required" + "\033[0m" + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	20 print "-ref" + "\t\t" + "The name of the reference genome file, in GenBank format." + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	21 print "-t1" + "\t\t" + "The name of the bowtie mapfile from time 1." + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	22 print "-t2" + "\t\t" + "The name of the bowtie mapfile from time 2." + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	23 print "-out" + "\t\t" + "Name of a file to enter the .csv output." + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	24 print "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	25 print "\033[1m" + "Optional" + "\033[0m" + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	26 print "-expansion" + "\t\t" + "Expansion factor (default: 250)" + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	27 print "-d" + "\t\t" + "All reads being analyzed are downstream of the transposon" + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	28 print "-reads1" + "\t\t" + "The number of reads to be used to calculate the correction factor for time 0." + "\n\t\t" + "(default counted from bowtie output)" + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	29 print "-reads2" + "\t\t" + "The number of reads to be used to calculate the correction factor for time 6." + "\n\t\t" + "(default counted from bowtie output)" + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	30 print "-cutoff" + "\t\t" + "Discard any positions where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	31 print "-cutoff2" + "\t\t" + "Discard any positions within the normalization genes where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	32 print "-strand" + "\t\t" + "Use only the specified strand (+ or -) when counting transcripts (default: both)" + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	33 print "-normalize" + "\t" + "A file that contains a list of genes that should have a fitness of 1" + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	34 print "-maxweight" + "\t" + "The maximum weight a transposon gene can have in normalization calculations" + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	35 print "-multiply" + "\t" + "Multiply all fitness scores by a certain value (e.g., the fitness of a knockout). You should normalize the data." + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	36 print "-ef" + "\t\t" + "Exclude insertions that occur in the first N amount (%) of gene--becuase may not affect gene function." + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	37 print "-el" + "\t\t" + "Exclude insertions in the last N amount (%) of the gene--considering truncation may not affect gene function." + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	38 print "-wig" + "\t\t" + "Create a wiggle file for viewing in a genome browser. Provide a filename." + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	39 print "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	40
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	41 import argparse
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	42 parser = argparse.ArgumentParser()
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	43 parser.add_argument("-ref", action="store", dest="ref_genome")
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	44 parser.add_argument("-t1", action="store", dest="mapfile1")
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	45 parser.add_argument("-t2", action="store", dest="mapfile2")
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	46 parser.add_argument("-out", action="store", dest="outfile")
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	47 parser.add_argument("-out2", action="store", dest="outfile2")
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	48 parser.add_argument("-expansion", action="store", dest="expansion_factor")
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	49 parser.add_argument("-d", action="store", dest="downstream")
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	50 parser.add_argument("-reads1", action="store", dest="reads1")
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	51 parser.add_argument("-reads2", action="store", dest="reads2")
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	52 parser.add_argument("-cutoff", action="store", dest="cutoff")
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	53 parser.add_argument("-cutoff2", action="store", dest="cutoff2")
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	54 parser.add_argument("-strand", action="store", dest="usestrand")
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	55 parser.add_argument("-normalize", action="store", dest="normalize")
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	56 parser.add_argument("-maxweight", action="store", dest="max_weight")
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	57 parser.add_argument("-multiply", action="store", dest="multiply")
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	58 parser.add_argument("-ef", action="store", dest="exclude_first")
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	59 parser.add_argument("-el", action="store", dest="exclude_last")
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	60 parser.add_argument("-wig", action="store", dest="wig")
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	61 arguments = parser.parse_args()
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	62
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	63 # Checks that all the required arguments have actually been entered
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	64 # The reference genome is required to find what gene each insertion falls within, the mapfiles are required because the relative number of insertions between time 2 and time 1 are used to calculate each insertion location's fitness, and the outfile is required so you... get an outfile.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	65
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	66 if (not arguments.ref_genome or not arguments.mapfile1 or not arguments.mapfile2 or not arguments.outfile):
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	67 print_usage()
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	68 quit()
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	69
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	70 # Sets the default value of the expansion factor to 250, which is not a particularly meaningful number; the expansion factor just needs to have some value so that rough fitness calculations can be run.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	71 # Measuring and inputting your own expansion factor is highly recommended, as without it the fitness calculations will not be nearly as accurate. See the equations used to calculate fitness if you're confused about that.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	72
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	73 if (not arguments.expansion_factor):
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	74 arguments.expansion_factor = 250
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	75
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	76 # Sets the default maximum weight of a transposon gene; 75 is similarly not a particularly meaningful number.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	77
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	78 if (not arguments.max_weight):
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	79 arguments.max_weight = 75
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	80
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	81 # Sets the default value of cutoff to 0; cutoff exists to discard positions with a low number of counted transcripts, because fitnesses calculated from them may not be very accurate, by the same reasoning that studies with low sample sizes are innacurate.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	82
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	83 if (not arguments.cutoff):
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	84 arguments.cutoff = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	85
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	86 # Sets the default value of cutoff2 to 10; cutoff2 exists to discard positions within normalization genes with a low number of counted transcripts, because fitnesses calculated from them similarly may not be very accurate.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	87 # This only has an effect if it's larger than cutoff, since the normalization step references a list of insertions already affected by cutoff.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	88
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	89 if (not arguments.cutoff2):
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	90 arguments.cutoff2 = 10
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	91
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	92 #Sets the default value of strand to "both", indicating you'll use reads from both Watson & Crick strands.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	93
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	94 if (not arguments.usestrand):
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	95 arguments.usestrand = "both"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	96
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	97
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	98
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	99
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	100
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	101
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	102 ##### PARSING THE REFERENCE GENOME #####
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	103
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	104 def get_time():
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	105 import datetime
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	106 return datetime.datetime.now().time()
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	107 print "\n" + "Starting: " + str(get_time()) + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	108
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	109 # Uses Biopython's SeqIO to parse the reference genome file for features; these come from its feature table, which lists all known features and their relevance - if they're a gene, a recognizable repeat, etc.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	110 # This is the reason that all reference genome files must be in standard GenBank format!
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	111
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	112 from Bio import SeqIO
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	113 import os.path
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	114 handle = open(arguments.ref_genome, "rU")
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	115 for record in SeqIO.parse(handle, "genbank"):
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	116 refname = record.id
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	117 features = record.features
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	118 handle.close()
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	119
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	120 # Makes a dictionary out of each feature that's a gene - with its gene name, start location, end location, and strand as keys to their values. Then makes a list out of all those dictionaries for ease of accessing later on.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	121
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	122 feature_list = []
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	123 for feature in features:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	124 if feature.type == "gene":
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	125 gene = feature.qualifiers["locus_tag"]
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	126 strand = feature.location.strand
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	127 start = float(feature.location.start)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	128 end = float(feature.location.end)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	129
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	130 # Exclude_first and exclude_last are used here to exclude whatever percentage of the genes you like from calculations; e.g. a value of 0.1 for exclude_last would exclude the last 10% of all genes!
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	131 # This can be useful because insertions at the very start or end of genes often don't actually break its function.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	132
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	133 if (arguments.exclude_first):
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	134 start += (end - start) * float(arguments.exclude_first)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	135 if (arguments.exclude_last):
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	136 end -= (end - start) * float(arguments.exclude_last)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	137 feature_dictionary = {"gene": gene, "start": start, "end": end, "strand": strand}
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	138 feature_list.append(feature_dictionary)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	139
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	140 print "Done generating feature lookup: " + str(get_time()) + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	141
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	142
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	143
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	144
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	145
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	146
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	147
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	148
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	149
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	150
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	151 ##### PARSING THE MAPFILES #####
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	152
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	153 with open(arguments.mapfile1) as file:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	154 r1 = file.readlines()
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	155 with open(arguments.mapfile2) as file:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	156 r2 = file.readlines()
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	157
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	158 # When called, goes through each line of the mapfile (each line is a compressed read) to find the strand, count, and position of the read. It may be helpful to look at how the mapfiles are formatted to understand how this code finds them.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	159 # The strand of a read is + (Watson) or - (Crick).
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	160 # The count is the number of times a particular read was actually sequenced (counted by fastx_collapser before calc_fitness.py is even called).
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	161 # The position is the position of the sequence when alligned to the reference genome (found by bowtie, also before calc_fitness.py is called). This is the position of the FIRST nucleotide in the sequence, when mapped.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	162 # Also records the total numbers of reads for the plus and minus strands, and the number of plus and minus sites from the mapfiles - not for fitness calculations, just for reference, as they're printed later.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	163
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	164 def read_mapfile(reads):
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	165 plus_total = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	166 minus_total = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	167 plus_counts = {"total": 0, "sites": 0}
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	168 minus_counts = {"total": 0, "sites": 0}
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	169 for read in reads:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	170 if "-" in read.split()[0]:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	171 strand = read.split()[1]
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	172 count = float(read.split()[0].split("-")[1])
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	173 position = float(read.split()[3])
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	174 else:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	175 continue
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	176
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	177 # If for some reason you want to skip all reads from one of the strands - for example, if you wanted to compare the two strands - that's done here.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	178
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	179 if arguments.usestrand != "both" and strand != arguments.usestrand:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	180 continue
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	181
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	182 # Makes a dictionary for the + strand, with each insert position as a key and the number of insertions there as its corresponding value.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	183 # Here we run into a tricky problem, in that the actual location of an insertion would be right before the position of the first nucleotide in a flanking sequence "following" the insertion, but right after the last nucleotide in a flanking sequence "preceding" the insertion.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	184 # Thus the location of an insertion from a following flanking sequence would be the same as the mapped position of the sequence, but the location of an insertion from a preceding flanking sequence would be its mapped position + the length of the flanking sequence - 2.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	185 # The -2 comes from a fake "TA" in the read, caused by MmeI cutting unevenly at an "AT" site so that an extra "TA" is created, artifically adding +2 to the flanking sequence length.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	186 # However, it is currently impossible to distinguish between following and preceding flanking sequences, and so we split the difference by adding (the length of the sequence - 2) to only the + strand reads.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	187 # In the future we will try not adding anything to both, or adding 1/2(the sequence length -2) to both + an - strands, to see if that improves calculations.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	188 # The 'if arguments.downstream' option is there for a different method than the standard, in which reads are only taken from downstream of the transposon.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	189
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	190 if (strand == "+"):
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	191 sequence_length = len(read.split()[4])
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	192 if arguments.downstream:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	193 position += 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	194 else:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	195 position += (sequence_length - 2)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	196 plus_counts["total"] += count
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	197 plus_counts["sites"] += 1
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	198 if position in plus_counts:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	199 plus_counts[position] += count
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	200 else:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	201 plus_counts[position] = count
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	202
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	203 # Makes a dictionary for the - strand, with each insert position as a key and the number of insertions there as its corresponding value.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	204
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	205 else:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	206 minus_counts["total"] += count
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	207 minus_counts["sites"] += 1
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	208 if position in minus_counts:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	209 minus_counts[position] += count
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	210 else:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	211 minus_counts[position] = count
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	212 return (plus_counts, minus_counts)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	213
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	214 # Calls read_mapfile(reads) to parse arguments.reads1 and arguments.reads2 (your reads from t1 and t2).
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	215
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	216 (plus_ref_1, minus_ref_1) = read_mapfile(r1)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	217 print "Read first file: " + str(get_time()) + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	218 (plus_ref_2, minus_ref_2) = read_mapfile(r2)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	219 print "Read second file: " + str(get_time()) + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	220
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	221 # Prints the number of + and - reads from reads1 and reads2, as well as the number of + and - sites found from the mapfiles, for reference when debugging etc.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	222 # The number of sites is the length of a given dictionary of sites - 1 because its last key, "total", isn't actually a site.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	223
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	224 print "Reads:" + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	225 print "1: + " + str(plus_ref_1["total"]) + " - " + str(minus_ref_1["total"]) + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	226 print "2: + " + str(plus_ref_2["total"]) + " - " + str(minus_ref_2["total"]) + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	227 print "Sites:" + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	228 print "1: + " + str(plus_ref_1["sites"]) + " - " + str(minus_ref_1["sites"]) + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	229 print "2: + " + str(plus_ref_2["sites"]) + " - " + str(minus_ref_2["sites"]) + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	230
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	231
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	232
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	233
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	234
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	235
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	236
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	237
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	238
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	239
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	240 ##### FITNESS CALCULATIONS #####
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	241
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	242 # If reads1 and reads2 weren't specified in the command line, sets them as the total number of reads (found in read_mapfile())
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	243
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	244 if not arguments.reads1:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	245 arguments.reads1 = plus_ref_1["total"] + minus_ref_1["total"]
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	246 if not arguments.reads2:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	247 arguments.reads2 = plus_ref_2["total"] + minus_ref_2["total"]
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	248
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	249 # Calculates the correction factors for reads from t1 and t2; cfactor1 and cfactor2 are the number of reads from t1 and t2 respectively divided by total, which is the average number of reads between the two.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	250 # For example, if there were more reads from t2 than t1 - in a 6:4 ratio - cfactor2 would be 1.2 and cfactor1 would be 0.8
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	251 # This is used later on to correct for pipetting errors, or any other error that would cause unequal amounts of DNA from t1 and t2 to be sequenced so that an unequal amount of reads is produced
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	252 # However, because slightly more or less DNA shouldn't change the % frequency of a mutation (it would only change the absolute number of counts) this may not be necessary; in the future we'll try removing this.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	253
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	254 total = (float(arguments.reads1) + float(arguments.reads2))/2
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	255 cfactor1 = float(arguments.reads1)/total
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	256 cfactor2 = float(arguments.reads2)/total
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	257 print "Cfactor 1: " + str(cfactor1) + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	258 print "Cfactor 2: " + str(cfactor2) + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	259 import math
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	260 import csv
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	261 results = [["position", "strand", "count_1", "count_2", "ratio", "mt_freq_t1", "mt_freq_t2", "pop_freq_t1", "pop_freq_t2", "gene", "D", "W", "nW"]]
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	262 genic = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	263 total_inserts = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	264 with open(arguments.ref_genome, "r") as file:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	265 firstline = file.readline()
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	266 genomelength = firstline.split()[2]
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	267 i = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	268 while i < float(genomelength):
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	269
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	270 # At each possible location for an insertion in the genome, counts the number of actual insertions at t1 and which strand(s) the corresponding reads came from.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	271
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	272 c1 = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	273 if i in plus_ref_1:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	274 c1 = float(plus_ref_1[i])
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	275 strand = "+/"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	276 if i in minus_ref_1:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	277 c1 += float(minus_ref_1[i])
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	278 strand = "b/"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	279 elif i in minus_ref_1:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	280 c1 = float(minus_ref_1[i])
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	281 strand = "-/"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	282
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	283 # If there were no insertions at a certain location at t1 just continues to the next location; there can't be any comparison to make between t1 and t2 if there are no t1 insertions!
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	284
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	285 else:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	286 i += 1
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	287 continue
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	288
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	289 # At each location where there was an insertion at t1, counts the number of insertions at t2 and which strand(s) the corresponding reads came from.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	290
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	291 c2 = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	292 if i in plus_ref_2:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	293 c2 = float(plus_ref_2[i])
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	294 if i in minus_ref_2:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	295 c2 += float(minus_ref_2[i])
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	296 strand += "b"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	297 else:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	298 strand += "+"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	299 elif i in minus_ref_2:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	300 c2 = float(minus_ref_2[i])
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	301 strand += "-"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	302
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	303 # Corrects with cfactor1 and cfactor2; as noted above, this may not be necessary and we'll see what removing it does in the future. Possible alternate / non-corrected code is shown in the comments just below.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	304
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	305 c1 /= cfactor1
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	306 if c2 != 0:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	307 c2 /= cfactor2
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	308 ratio = c2/c1
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	309 else:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	310 c2 = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	311 ratio = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	312
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	313 # if c2 != 0:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	314 # ratio = c2/c1
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	315 # else:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	316 # ratio = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	317
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	318 # Passes by all insertions with a number of reads smaller than the cutoff, as they may lead to inaccurate fitness calculations.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	319
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	320 if (c1 + c2)/2 < float(arguments.cutoff):
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	321 i+= 1
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	322 continue
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	323
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	324 # Calculates each insertion's frequency within the populations at t1 and t2. If cfactor correction were removed, "total" would have to be changed to "float(arguments.reads1)" and "float(arguments.reads2)" respectively, as shown in the comments just below.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	325
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	326 mt_freq_t1 = c1/total
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	327 mt_freq_t2 = c2/total
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	328 # mt_freq_t1 = c1/float(arguments.reads1)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	329 # mt_freq_t2 = c2/float(arguments.reads2)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	330 pop_freq_t1 = 1 - mt_freq_t1
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	331 pop_freq_t2 = 1 - mt_freq_t2
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	332
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	333 # Calculates each insertion's fitness! This is from the fitness equation log((frequency of mutation @ time 2 / frequency of mutation @ time 1)expansion factor)/log((frequency of population without the mutation @ time 2 / frequency of population without the mutation @ time 1)expansion factor)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	334
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	335 w = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	336 if mt_freq_t2 != 0:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	337 top_w = math.log(mt_freq_t2*(float(arguments.expansion_factor)/mt_freq_t1))
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	338 bot_w = math.log(pop_freq_t2*(float(arguments.expansion_factor)/pop_freq_t1))
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	339 w = top_w/bot_w
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	340
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	341 # Checks which gene locus the insertion falls within, and records that.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	342
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	343 gene = ''
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	344 for feature_dictionary in feature_list:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	345 if feature_dictionary["start"] <= i and feature_dictionary["end"] >= i:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	346 gene = "".join(feature_dictionary["gene"])
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	347 genic += 1
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	348 break
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	349 total_inserts += 1
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	350
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	351 # Writes all relevant information on each insertion and its fitness to a cvs file: the location of the insertion, its strand, c1, c2, etc. (the variable names are self-explanatiory)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	352 # w is written twice, because the second w will be normalized if normalization is called for, thus becoming nW.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	353
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	354 row = [i, strand, c1, c2, ratio, mt_freq_t1, mt_freq_t2, pop_freq_t1, pop_freq_t2, gene, arguments.expansion_factor, w, w]
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	355 results.append(row)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	356 i += 1
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	357 with open(arguments.outfile, "wb") as csvfile:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	358 writer = csv.writer(csvfile)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	359 writer.writerows(results)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	360
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	361 # Prints the time when done comparing mapfiles - that is, when done with all fitness calculations but normalization - as well as the number of genic inserts and number of total inserts.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	362
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	363 print "Done comparing mapfiles " + str(get_time()) + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	364 print "Genic: " + str(genic) + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	365 print "Total: " + str(total_inserts) + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	366
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	367
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	368
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	369
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	370
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	371
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	372
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	373
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	374
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	375
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	376 ##### NORMALIZATION #####
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	377
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	378 # If making a WIG file is requested in the arguments, starts a string to be added to and then written to the WIG file with a typical WIG file header.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	379 # The header is just in a typical WIG file format; if you'd like to look into this more UCSC has notes on formatting WIG files on their site.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	380
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	381 if (arguments.wig):
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	382 wigstring = "track type=wiggle_0 name=" + arguments.wig + "\n" + "variableStep chrom=" + refname + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	383
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	384 # If a file's given for normalization, starts normalization; this corrects for anything that would cause all the fitness values to be too high or too low.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	385
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	386 if (arguments.normalize):
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	387
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	388 # Makes a list of the genes in the normalization file, which should all be transposon genes (these naturally ought to have a fitness value of exactly 1, because transposons are generally non-coding DNA)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	389
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	390 with open(arguments.normalize) as file:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	391 transposon_genes = file.read().splitlines()
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	392 print "Normalize genes loaded" + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	393 blank_ws = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	394 sum = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	395 count = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	396 weights = []
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	397 scores = []
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	398 for list in results:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	399
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	400 # Finds all insertions within one of the normalization genes that also have a w value; gets their c1 and c2 values (the number of insertions at t1 and t2) and takes the average of that!
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	401 # The average is later used as the "weight" of an insertion location's fitness - if it's had more insertions, it should weigh proportionally more towards the average fitness of insertions within the normalization genes.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	402
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	403 if list[9] != '' and list[9] in transposon_genes and list[11]:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	404 c1 = list[2]
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	405 c2 = list[3]
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	406 score = list[11]
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	407 avg = (c1 + c2)/2
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	408
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	409 # Skips over those insertion locations with too few insertions - their fitness values are less accurate because they're based on such small insertion numbers.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	410
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	411 if float(c1) >= float(arguments.cutoff2):
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	412 #if avg >= float(arguments.cutoff2):
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	413
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	414 # Sets a max weight, to prevent insertion location scores with huge weights from unbalancing the normalization.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	415
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	416 if (avg >= float(arguments.max_weight)):
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	417 avg = float(arguments.max_weight)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	418
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	419 # Tallies how many w values are 0 within the blank_ws value; you might get many transposon genes with a w value of 0 if a bottleneck occurs, which is especially common with in vivo experiments.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	420 # For example, when studying a nasal infection in a mouse model, what bacteria "sticks" and is able to survive and what bacteria is swallowed and killed or otherwise flushed out tends to be a matter
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	421 # of chance not fitness; all mutants with an insertion in a specific transposon gene could be flushed out by chance!
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	422
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	423 if score == 0:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	424 blank_ws += 1
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	425
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	426 # Adds the fitness values of the insertions within normalization genes together and increments count so their average fitness (sum/count) can be calculated later on
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	427
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	428 sum += score
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	429 count += 1
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	430
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	431 # Records the weights of the fitness values of the insertion locations in corresponding lists - for example, weights[2] would be the weight of the fitness value at score[2]
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	432
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	433 weights.append(avg)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	434 scores.append(score)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	435
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	436 print str(list[9]) + " " + str(score) + " " + str(c1)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	437
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	438 # Counts and removes all "blank" fitness values of normalization genes - those that = 0 - because they most likely don't really have a fitness value of 0, and you just happened to not get any reads from that location at t2.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	439
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	440 blank_count = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	441 original_count = len(scores)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	442 i = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	443 while i < original_count:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	444 w_value = scores[i]
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	445 if w_value == 0:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	446 blank_count += 1
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	447 weights.pop[i]
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	448 scores.pop[i]
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	449 i-=1
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	450 i += 1
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	451
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	452 # If no normalization genes can pass the cutoff, normalization cannot occur, so this ends the script advises the user to try again and lower cutoff and/or cutoff2.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	453
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	454 if len(scores) == 0:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	455 print 'ERROR: The normalization genes do not have enough reads to pass cutoff and/or cutoff2; please lower one or both of those arguments.' + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	456 quit()
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	457
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	458 # Prints the number of of blank fitness values found and removed for reference. Writes the percentage to a file so it can be referenced for aggregate analysis.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	459
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	460 pc_blank_normals = float(blank_count) / float(original_count)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	461 print "# blank out of " + str(original_count) + ": " + str(pc_blank_normals) + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	462 with open(arguments.outfile2, "w") as f:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	463 f.write("blanks: " + str(pc_blank_normals) + "\n" + "total: " + str(total) + "\n" + "refname: " + refname)
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	464
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	465
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	466 # Finds "average" - the average fitness value for an insertion within the transposon genes - and "weighted_average" - the average fitness value for an insertion within the transposon genes weighted by how many insertions each had.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	467
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	468 average = sum / count
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	469 i = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	470 weighted_sum = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	471 weight_sum = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	472 while i < len(weights):
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	473 weighted_sum += weights[i]*scores[i]
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	474 weight_sum += weights[i]
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	475 i += 1
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	476 weighted_average = weighted_sum/weight_sum
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	477
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	478 # Prints the regular average, weighted average, and total insertions for reference
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	479
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	480 print "Normalization step:" + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	481 print "Regular average: " + str(average) + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	482 print "Weighted Average: " + str(weighted_average) + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	483 print "Total Insertions: " + str(count) + "\n"
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	484
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	485 # The actual normalization happens here; every fitness score is divided by the average fitness found for genes that should have a value of 1.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	486 # For example, if the average fitness for genes was too low overall - let's say 0.97 within the normalization geness - every fitness would be proportionally raised.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	487
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	488 old_ws = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	489 new_ws = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	490 wcount = 0
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	491 for list in results:
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	492 if list[11] == 'W':
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	493 continue
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	494 new_w = float(list[11])/weighted_average
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	495
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	496 # Sometimes you want to multiply all the fitness values by a constant; this does that.
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	497 # For example you might multiply all the values by a constant for a genetic interaction screen - where Tn-Seq is performed as usual except there's one background knockout all the mutants share. This is
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	498 # because independent mutations should have a fitness value that's equal to their individual fitness values multipled, but related mutations will deviate from that; to find those deviations you'd multiply
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	499 # all the fitness values from mutants from a normal library by the fitness of the background knockout and compare that to the fitness values found from the knockout library!
9aeda6cd07dc Uploaded kaymccoy parents: diff changeset	500

4

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

1 # A translation of calc_fitness.pl into python! For analysis of Tn-Seq.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

2 # This script requires BioPython, which in turn has a good number of dependencies (some optional but very helpful).

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

3 # How to install BioPython and a list of its dependencies can be found here: http://biopython.org/DIST/docs/install/Installation.html

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

4 # To see what future edits / tests are planned for this script, search for the phrase "in the future".

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

5

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

6

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

7

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

8

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

9

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

10

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

11

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

12

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

13

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

14

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

15 ##### ARGUMENTS #####

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

16

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

17 def print_usage():

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

18 print "\n" + "You are missing one or more required flags. A complete list of flags accepted by calc_fitness is as follows:" + "\n\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

19 print "\033[1m" + "Required" + "\033[0m" + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

20 print "-ref" + "\t\t" + "The name of the reference genome file, in GenBank format." + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

21 print "-t1" + "\t\t" + "The name of the bowtie mapfile from time 1." + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

22 print "-t2" + "\t\t" + "The name of the bowtie mapfile from time 2." + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

23 print "-out" + "\t\t" + "Name of a file to enter the .csv output." + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

24 print "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

25 print "\033[1m" + "Optional" + "\033[0m" + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

26 print "-expansion" + "\t\t" + "Expansion factor (default: 250)" + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

27 print "-d" + "\t\t" + "All reads being analyzed are downstream of the transposon" + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

28 print "-reads1" + "\t\t" + "The number of reads to be used to calculate the correction factor for time 0." + "\n\t\t" + "(default counted from bowtie output)" + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

29 print "-reads2" + "\t\t" + "The number of reads to be used to calculate the correction factor for time 6." + "\n\t\t" + "(default counted from bowtie output)" + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

30 print "-cutoff" + "\t\t" + "Discard any positions where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

31 print "-cutoff2" + "\t\t" + "Discard any positions within the normalization genes where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

32 print "-strand" + "\t\t" + "Use only the specified strand (+ or -) when counting transcripts (default: both)" + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

33 print "-normalize" + "\t" + "A file that contains a list of genes that should have a fitness of 1" + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

34 print "-maxweight" + "\t" + "The maximum weight a transposon gene can have in normalization calculations" + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

35 print "-multiply" + "\t" + "Multiply all fitness scores by a certain value (e.g., the fitness of a knockout). You should normalize the data." + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

36 print "-ef" + "\t\t" + "Exclude insertions that occur in the first N amount (%) of gene--becuase may not affect gene function." + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

37 print "-el" + "\t\t" + "Exclude insertions in the last N amount (%) of the gene--considering truncation may not affect gene function." + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

38 print "-wig" + "\t\t" + "Create a wiggle file for viewing in a genome browser. Provide a filename." + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

39 print "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

40

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

41 import argparse

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

42 parser = argparse.ArgumentParser()

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

43 parser.add_argument("-ref", action="store", dest="ref_genome")

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

44 parser.add_argument("-t1", action="store", dest="mapfile1")

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

45 parser.add_argument("-t2", action="store", dest="mapfile2")

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

46 parser.add_argument("-out", action="store", dest="outfile")

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

47 parser.add_argument("-out2", action="store", dest="outfile2")

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

48 parser.add_argument("-expansion", action="store", dest="expansion_factor")

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

49 parser.add_argument("-d", action="store", dest="downstream")

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

50 parser.add_argument("-reads1", action="store", dest="reads1")

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

51 parser.add_argument("-reads2", action="store", dest="reads2")

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

52 parser.add_argument("-cutoff", action="store", dest="cutoff")

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

53 parser.add_argument("-cutoff2", action="store", dest="cutoff2")

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

54 parser.add_argument("-strand", action="store", dest="usestrand")

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

55 parser.add_argument("-normalize", action="store", dest="normalize")

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

56 parser.add_argument("-maxweight", action="store", dest="max_weight")

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

57 parser.add_argument("-multiply", action="store", dest="multiply")

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

58 parser.add_argument("-ef", action="store", dest="exclude_first")

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

59 parser.add_argument("-el", action="store", dest="exclude_last")

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

60 parser.add_argument("-wig", action="store", dest="wig")

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

61 arguments = parser.parse_args()

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

62

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

63 # Checks that all the required arguments have actually been entered

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

64 # The reference genome is required to find what gene each insertion falls within, the mapfiles are required because the relative number of insertions between time 2 and time 1 are used to calculate each insertion location's fitness, and the outfile is required so you... get an outfile.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

65

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

66 if (not arguments.ref_genome or not arguments.mapfile1 or not arguments.mapfile2 or not arguments.outfile):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

67 print_usage()

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

68 quit()

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

69

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

70 # Sets the default value of the expansion factor to 250, which is not a particularly meaningful number; the expansion factor just needs to have some value so that rough fitness calculations can be run.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

71 # Measuring and inputting your own expansion factor is highly recommended, as without it the fitness calculations will not be nearly as accurate. See the equations used to calculate fitness if you're confused about that.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

72

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

73 if (not arguments.expansion_factor):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

74 arguments.expansion_factor = 250

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

75

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

76 # Sets the default maximum weight of a transposon gene; 75 is similarly not a particularly meaningful number.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

77

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

78 if (not arguments.max_weight):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

79 arguments.max_weight = 75

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

80

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

81 # Sets the default value of cutoff to 0; cutoff exists to discard positions with a low number of counted transcripts, because fitnesses calculated from them may not be very accurate, by the same reasoning that studies with low sample sizes are innacurate.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

82

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

83 if (not arguments.cutoff):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

84 arguments.cutoff = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

85

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

86 # Sets the default value of cutoff2 to 10; cutoff2 exists to discard positions within normalization genes with a low number of counted transcripts, because fitnesses calculated from them similarly may not be very accurate.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

87 # This only has an effect if it's larger than cutoff, since the normalization step references a list of insertions already affected by cutoff.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

88

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

89 if (not arguments.cutoff2):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

90 arguments.cutoff2 = 10

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

91

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

92 #Sets the default value of strand to "both", indicating you'll use reads from both Watson & Crick strands.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

93

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

94 if (not arguments.usestrand):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

95 arguments.usestrand = "both"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

96

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

97

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

98

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

99

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

100

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

101

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

102 ##### PARSING THE REFERENCE GENOME #####

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

103

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

104 def get_time():

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

105 import datetime

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

106 return datetime.datetime.now().time()

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

107 print "\n" + "Starting: " + str(get_time()) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

108

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

109 # Uses Biopython's SeqIO to parse the reference genome file for features; these come from its feature table, which lists all known features and their relevance - if they're a gene, a recognizable repeat, etc.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

110 # This is the reason that all reference genome files must be in standard GenBank format!

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

111

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

112 from Bio import SeqIO

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

113 import os.path

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

114 handle = open(arguments.ref_genome, "rU")

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

115 for record in SeqIO.parse(handle, "genbank"):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

116 refname = record.id

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

117 features = record.features

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

118 handle.close()

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

119

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

120 # Makes a dictionary out of each feature that's a gene - with its gene name, start location, end location, and strand as keys to their values. Then makes a list out of all those dictionaries for ease of accessing later on.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

121

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

122 feature_list = []

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

123 for feature in features:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

124 if feature.type == "gene":

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

125 gene = feature.qualifiers["locus_tag"]

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

126 strand = feature.location.strand

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

127 start = float(feature.location.start)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

128 end = float(feature.location.end)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

129

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

130 # Exclude_first and exclude_last are used here to exclude whatever percentage of the genes you like from calculations; e.g. a value of 0.1 for exclude_last would exclude the last 10% of all genes!

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

131 # This can be useful because insertions at the very start or end of genes often don't actually break its function.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

132

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

133 if (arguments.exclude_first):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

134 start += (end - start) * float(arguments.exclude_first)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

135 if (arguments.exclude_last):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

136 end -= (end - start) * float(arguments.exclude_last)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

137 feature_dictionary = {"gene": gene, "start": start, "end": end, "strand": strand}

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

138 feature_list.append(feature_dictionary)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

139

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

140 print "Done generating feature lookup: " + str(get_time()) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

141

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

142

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

143

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

144

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

145

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

146

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

147

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

148

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

149

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

150

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

151 ##### PARSING THE MAPFILES #####

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

152

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

153 with open(arguments.mapfile1) as file:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

154 r1 = file.readlines()

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

155 with open(arguments.mapfile2) as file:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

156 r2 = file.readlines()

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

157

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

158 # When called, goes through each line of the mapfile (each line is a compressed read) to find the strand, count, and position of the read. It may be helpful to look at how the mapfiles are formatted to understand how this code finds them.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

159 # The strand of a read is + (Watson) or - (Crick).

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

160 # The count is the number of times a particular read was actually sequenced (counted by fastx_collapser before calc_fitness.py is even called).

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

161 # The position is the position of the sequence when alligned to the reference genome (found by bowtie, also before calc_fitness.py is called). This is the position of the FIRST nucleotide in the sequence, when mapped.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

162 # Also records the total numbers of reads for the plus and minus strands, and the number of plus and minus sites from the mapfiles - not for fitness calculations, just for reference, as they're printed later.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

163

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

164 def read_mapfile(reads):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

165 plus_total = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

166 minus_total = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

167 plus_counts = {"total": 0, "sites": 0}

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

168 minus_counts = {"total": 0, "sites": 0}

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

169 for read in reads:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

170 if "-" in read.split()[0]:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

171 strand = read.split()[1]

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

172 count = float(read.split()[0].split("-")[1])

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

173 position = float(read.split()[3])

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

174 else:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

175 continue

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

176

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

177 # If for some reason you want to skip all reads from one of the strands - for example, if you wanted to compare the two strands - that's done here.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

178

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

179 if arguments.usestrand != "both" and strand != arguments.usestrand:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

180 continue

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

181

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

182 # Makes a dictionary for the + strand, with each insert position as a key and the number of insertions there as its corresponding value.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

183 # Here we run into a tricky problem, in that the actual location of an insertion would be right before the position of the first nucleotide in a flanking sequence "following" the insertion, but right after the last nucleotide in a flanking sequence "preceding" the insertion.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

184 # Thus the location of an insertion from a following flanking sequence would be the same as the mapped position of the sequence, but the location of an insertion from a preceding flanking sequence would be its mapped position + the length of the flanking sequence - 2.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

185 # The -2 comes from a fake "TA" in the read, caused by MmeI cutting unevenly at an "AT" site so that an extra "TA" is created, artifically adding +2 to the flanking sequence length.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

186 # However, it is currently impossible to distinguish between following and preceding flanking sequences, and so we split the difference by adding (the length of the sequence - 2) to only the + strand reads.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

187 # In the future we will try not adding anything to both, or adding 1/2(the sequence length -2) to both + an - strands, to see if that improves calculations.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

188 # The 'if arguments.downstream' option is there for a different method than the standard, in which reads are only taken from downstream of the transposon.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

189

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

190 if (strand == "+"):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

191 sequence_length = len(read.split()[4])

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

192 if arguments.downstream:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

193 position += 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

194 else:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

195 position += (sequence_length - 2)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

196 plus_counts["total"] += count

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

197 plus_counts["sites"] += 1

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

198 if position in plus_counts:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

199 plus_counts[position] += count

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

200 else:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

201 plus_counts[position] = count

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

202

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

203 # Makes a dictionary for the - strand, with each insert position as a key and the number of insertions there as its corresponding value.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

204

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

205 else:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

206 minus_counts["total"] += count

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

207 minus_counts["sites"] += 1

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

208 if position in minus_counts:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

209 minus_counts[position] += count

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

210 else:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

211 minus_counts[position] = count

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

212 return (plus_counts, minus_counts)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

213

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

214 # Calls read_mapfile(reads) to parse arguments.reads1 and arguments.reads2 (your reads from t1 and t2).

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

215

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

216 (plus_ref_1, minus_ref_1) = read_mapfile(r1)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

217 print "Read first file: " + str(get_time()) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

218 (plus_ref_2, minus_ref_2) = read_mapfile(r2)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

219 print "Read second file: " + str(get_time()) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

220

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

221 # Prints the number of + and - reads from reads1 and reads2, as well as the number of + and - sites found from the mapfiles, for reference when debugging etc.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

222 # The number of sites is the length of a given dictionary of sites - 1 because its last key, "total", isn't actually a site.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

223

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

224 print "Reads:" + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

225 print "1: + " + str(plus_ref_1["total"]) + " - " + str(minus_ref_1["total"]) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

226 print "2: + " + str(plus_ref_2["total"]) + " - " + str(minus_ref_2["total"]) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

227 print "Sites:" + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

228 print "1: + " + str(plus_ref_1["sites"]) + " - " + str(minus_ref_1["sites"]) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

229 print "2: + " + str(plus_ref_2["sites"]) + " - " + str(minus_ref_2["sites"]) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

230

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

231

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

232

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

233

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

234

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

235

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

236

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

237

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

238

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

239

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

240 ##### FITNESS CALCULATIONS #####

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

241

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

242 # If reads1 and reads2 weren't specified in the command line, sets them as the total number of reads (found in read_mapfile())

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

243

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

244 if not arguments.reads1:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

245 arguments.reads1 = plus_ref_1["total"] + minus_ref_1["total"]

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

246 if not arguments.reads2:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

247 arguments.reads2 = plus_ref_2["total"] + minus_ref_2["total"]

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

248

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

249 # Calculates the correction factors for reads from t1 and t2; cfactor1 and cfactor2 are the number of reads from t1 and t2 respectively divided by total, which is the average number of reads between the two.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

250 # For example, if there were more reads from t2 than t1 - in a 6:4 ratio - cfactor2 would be 1.2 and cfactor1 would be 0.8

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

251 # This is used later on to correct for pipetting errors, or any other error that would cause unequal amounts of DNA from t1 and t2 to be sequenced so that an unequal amount of reads is produced

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

252 # However, because slightly more or less DNA shouldn't change the % frequency of a mutation (it would only change the absolute number of counts) this may not be necessary; in the future we'll try removing this.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

253

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

254 total = (float(arguments.reads1) + float(arguments.reads2))/2

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

255 cfactor1 = float(arguments.reads1)/total

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

256 cfactor2 = float(arguments.reads2)/total

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

257 print "Cfactor 1: " + str(cfactor1) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

258 print "Cfactor 2: " + str(cfactor2) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

259 import math

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

260 import csv

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

261 results = [["position", "strand", "count_1", "count_2", "ratio", "mt_freq_t1", "mt_freq_t2", "pop_freq_t1", "pop_freq_t2", "gene", "D", "W", "nW"]]

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

262 genic = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

263 total_inserts = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

264 with open(arguments.ref_genome, "r") as file:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

265 firstline = file.readline()

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

266 genomelength = firstline.split()[2]

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

267 i = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

268 while i < float(genomelength):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

269

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

270 # At each possible location for an insertion in the genome, counts the number of actual insertions at t1 and which strand(s) the corresponding reads came from.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

271

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

272 c1 = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

273 if i in plus_ref_1:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

274 c1 = float(plus_ref_1[i])

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

275 strand = "+/"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

276 if i in minus_ref_1:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

277 c1 += float(minus_ref_1[i])

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

278 strand = "b/"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

279 elif i in minus_ref_1:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

280 c1 = float(minus_ref_1[i])

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

281 strand = "-/"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

282

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

283 # If there were no insertions at a certain location at t1 just continues to the next location; there can't be any comparison to make between t1 and t2 if there are no t1 insertions!

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

284

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

285 else:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

286 i += 1

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

287 continue

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

288

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

289 # At each location where there was an insertion at t1, counts the number of insertions at t2 and which strand(s) the corresponding reads came from.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

290

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

291 c2 = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

292 if i in plus_ref_2:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

293 c2 = float(plus_ref_2[i])

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

294 if i in minus_ref_2:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

295 c2 += float(minus_ref_2[i])

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

296 strand += "b"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

297 else:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

298 strand += "+"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

299 elif i in minus_ref_2:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

300 c2 = float(minus_ref_2[i])

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

301 strand += "-"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

302

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

303 # Corrects with cfactor1 and cfactor2; as noted above, this may not be necessary and we'll see what removing it does in the future. Possible alternate / non-corrected code is shown in the comments just below.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

304

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

305 c1 /= cfactor1

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

306 if c2 != 0:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

307 c2 /= cfactor2

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

308 ratio = c2/c1

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

309 else:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

310 c2 = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

311 ratio = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

312

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

313 # if c2 != 0:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

314 # ratio = c2/c1

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

315 # else:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

316 # ratio = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

317

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

318 # Passes by all insertions with a number of reads smaller than the cutoff, as they may lead to inaccurate fitness calculations.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

319

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

320 if (c1 + c2)/2 < float(arguments.cutoff):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

321 i+= 1

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

322 continue

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

323

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

324 # Calculates each insertion's frequency within the populations at t1 and t2. If cfactor correction were removed, "total" would have to be changed to "float(arguments.reads1)" and "float(arguments.reads2)" respectively, as shown in the comments just below.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

325

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

326 mt_freq_t1 = c1/total

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

327 mt_freq_t2 = c2/total

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

328 # mt_freq_t1 = c1/float(arguments.reads1)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

329 # mt_freq_t2 = c2/float(arguments.reads2)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

330 pop_freq_t1 = 1 - mt_freq_t1

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

331 pop_freq_t2 = 1 - mt_freq_t2

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

332

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

333 # Calculates each insertion's fitness! This is from the fitness equation log((frequency of mutation @ time 2 / frequency of mutation @ time 1)*expansion factor)/log((frequency of population without the mutation @ time 2 / frequency of population without the mutation @ time 1)*expansion factor)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

334

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

335 w = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

336 if mt_freq_t2 != 0:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

337 top_w = math.log(mt_freq_t2*(float(arguments.expansion_factor)/mt_freq_t1))

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

338 bot_w = math.log(pop_freq_t2*(float(arguments.expansion_factor)/pop_freq_t1))

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

339 w = top_w/bot_w

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

340

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

341 # Checks which gene locus the insertion falls within, and records that.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

342

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

343 gene = ''

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

344 for feature_dictionary in feature_list:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

345 if feature_dictionary["start"] <= i and feature_dictionary["end"] >= i:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

346 gene = "".join(feature_dictionary["gene"])

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

347 genic += 1

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

348 break

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

349 total_inserts += 1

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

350

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

351 # Writes all relevant information on each insertion and its fitness to a cvs file: the location of the insertion, its strand, c1, c2, etc. (the variable names are self-explanatiory)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

352 # w is written twice, because the second w will be normalized if normalization is called for, thus becoming nW.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

353

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

354 row = [i, strand, c1, c2, ratio, mt_freq_t1, mt_freq_t2, pop_freq_t1, pop_freq_t2, gene, arguments.expansion_factor, w, w]

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

355 results.append(row)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

356 i += 1

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

357 with open(arguments.outfile, "wb") as csvfile:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

358 writer = csv.writer(csvfile)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

359 writer.writerows(results)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

360

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

361 # Prints the time when done comparing mapfiles - that is, when done with all fitness calculations but normalization - as well as the number of genic inserts and number of total inserts.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

362

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

363 print "Done comparing mapfiles " + str(get_time()) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

364 print "Genic: " + str(genic) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

365 print "Total: " + str(total_inserts) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

366

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

367

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

368

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

369

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

370

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

371

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

372

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

373

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

374

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

375

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

376 ##### NORMALIZATION #####

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

377

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

378 # If making a WIG file is requested in the arguments, starts a string to be added to and then written to the WIG file with a typical WIG file header.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

379 # The header is just in a typical WIG file format; if you'd like to look into this more UCSC has notes on formatting WIG files on their site.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

380

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

381 if (arguments.wig):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

382 wigstring = "track type=wiggle_0 name=" + arguments.wig + "\n" + "variableStep chrom=" + refname + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

383

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

384 # If a file's given for normalization, starts normalization; this corrects for anything that would cause all the fitness values to be too high or too low.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

385

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

386 if (arguments.normalize):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

387

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

388 # Makes a list of the genes in the normalization file, which should all be transposon genes (these naturally ought to have a fitness value of exactly 1, because transposons are generally non-coding DNA)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

389

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

390 with open(arguments.normalize) as file:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

391 transposon_genes = file.read().splitlines()

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

392 print "Normalize genes loaded" + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

393 blank_ws = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

394 sum = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

395 count = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

396 weights = []

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

397 scores = []

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

398 for list in results:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

399

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

400 # Finds all insertions within one of the normalization genes that also have a w value; gets their c1 and c2 values (the number of insertions at t1 and t2) and takes the average of that!

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

401 # The average is later used as the "weight" of an insertion location's fitness - if it's had more insertions, it should weigh proportionally more towards the average fitness of insertions within the normalization genes.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

402

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

403 if list[9] != '' and list[9] in transposon_genes and list[11]:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

404 c1 = list[2]

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

405 c2 = list[3]

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

406 score = list[11]

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

407 avg = (c1 + c2)/2

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

408

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

409 # Skips over those insertion locations with too few insertions - their fitness values are less accurate because they're based on such small insertion numbers.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

410

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

411 if float(c1) >= float(arguments.cutoff2):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

412 #if avg >= float(arguments.cutoff2):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

413

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

414 # Sets a max weight, to prevent insertion location scores with huge weights from unbalancing the normalization.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

415

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

416 if (avg >= float(arguments.max_weight)):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

417 avg = float(arguments.max_weight)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

418

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

419 # Tallies how many w values are 0 within the blank_ws value; you might get many transposon genes with a w value of 0 if a bottleneck occurs, which is especially common with in vivo experiments.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

420 # For example, when studying a nasal infection in a mouse model, what bacteria "sticks" and is able to survive and what bacteria is swallowed and killed or otherwise flushed out tends to be a matter

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

421 # of chance not fitness; all mutants with an insertion in a specific transposon gene could be flushed out by chance!

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

422

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

423 if score == 0:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

424 blank_ws += 1

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

425

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

426 # Adds the fitness values of the insertions within normalization genes together and increments count so their average fitness (sum/count) can be calculated later on

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

427

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

428 sum += score

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

429 count += 1

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

430

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

431 # Records the weights of the fitness values of the insertion locations in corresponding lists - for example, weights[2] would be the weight of the fitness value at score[2]

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

432

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

433 weights.append(avg)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

434 scores.append(score)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

435

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

436 print str(list[9]) + " " + str(score) + " " + str(c1)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

437

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

438 # Counts and removes all "blank" fitness values of normalization genes - those that = 0 - because they most likely don't really have a fitness value of 0, and you just happened to not get any reads from that location at t2.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

439

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

440 blank_count = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

441 original_count = len(scores)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

442 i = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

443 while i < original_count:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

444 w_value = scores[i]

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

445 if w_value == 0:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

446 blank_count += 1

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

447 weights.pop[i]

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

448 scores.pop[i]

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

449 i-=1

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

450 i += 1

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

451

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

452 # If no normalization genes can pass the cutoff, normalization cannot occur, so this ends the script advises the user to try again and lower cutoff and/or cutoff2.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

453

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

454 if len(scores) == 0:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

455 print 'ERROR: The normalization genes do not have enough reads to pass cutoff and/or cutoff2; please lower one or both of those arguments.' + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

456 quit()

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

457

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

458 # Prints the number of of blank fitness values found and removed for reference. Writes the percentage to a file so it can be referenced for aggregate analysis.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

459

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

460 pc_blank_normals = float(blank_count) / float(original_count)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

461 print "# blank out of " + str(original_count) + ": " + str(pc_blank_normals) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

462 with open(arguments.outfile2, "w") as f:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

463 f.write("blanks: " + str(pc_blank_normals) + "\n" + "total: " + str(total) + "\n" + "refname: " + refname)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

464

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

465

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

466 # Finds "average" - the average fitness value for an insertion within the transposon genes - and "weighted_average" - the average fitness value for an insertion within the transposon genes weighted by how many insertions each had.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

467

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

468 average = sum / count

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

469 i = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

470 weighted_sum = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

471 weight_sum = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

472 while i < len(weights):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

473 weighted_sum += weights[i]*scores[i]

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

474 weight_sum += weights[i]

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

475 i += 1

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

476 weighted_average = weighted_sum/weight_sum

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

477

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

478 # Prints the regular average, weighted average, and total insertions for reference

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

479

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

480 print "Normalization step:" + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

481 print "Regular average: " + str(average) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

482 print "Weighted Average: " + str(weighted_average) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

483 print "Total Insertions: " + str(count) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

484

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

485 # The actual normalization happens here; every fitness score is divided by the average fitness found for genes that should have a value of 1.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

486 # For example, if the average fitness for genes was too low overall - let's say 0.97 within the normalization geness - every fitness would be proportionally raised.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

487

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

488 old_ws = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

489 new_ws = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

490 wcount = 0

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

491 for list in results:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

492 if list[11] == 'W':

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

493 continue

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

494 new_w = float(list[11])/weighted_average

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

495

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

496 # Sometimes you want to multiply all the fitness values by a constant; this does that.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

497 # For example you might multiply all the values by a constant for a genetic interaction screen - where Tn-Seq is performed as usual except there's one background knockout all the mutants share. This is

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

498 # because independent mutations should have a fitness value that's equal to their individual fitness values multipled, but related mutations will deviate from that; to find those deviations you'd multiply

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

499 # all the fitness values from mutants from a normal library by the fitness of the background knockout and compare that to the fitness values found from the knockout library!

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

500

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

501 if arguments.multiply:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

502 new_w *= float(arguments.multiply)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

503

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

504 # Records the old w score for reference, and adds it to a total sum of all w scores (so that the old w mean and new w mean can be printed later).

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

505

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

506 if float(list[11]) > 0:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

507 old_ws += float(list[11])

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

508 new_ws += new_w

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

509 wcount += 1

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

510

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

511 # Writes the new w score into the results list of lists.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

512

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

513 list[12] = new_w

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

514

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

515 # Adds a line to wiglist for each insertion position, with the insertion position and its new w value.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

516

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

517 if (arguments.wig):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

518 wigstring += str(list[0]) + " " + str(new_w) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

519

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

520 # Prints the old w mean and new w mean for reference.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

521

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

522 old_w_mean = old_ws / wcount

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

523 new_w_mean = new_ws / wcount

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

524 print "Old W Average: " + str(old_w_mean) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

525 print "New W Average: " + str(new_w_mean) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

526

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

527 # Overwrites the old file with the normalized file.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

528

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

529 with open(arguments.outfile, "wb") as csvfile:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

530 writer = csv.writer(csvfile)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

531 writer.writerows(results)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

532

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

533 # If a WIG file was requested, actually creates the WIG file and writes wiglist to it

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

534 # So what's written here is the WIG header plus each insertion position and it's new w value if normalization were called for, and each insertion position and its unnormalized w value if normalization were not called for.

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

535

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

536 if (arguments.wig):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

537 if (arguments.normalize):

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

538 with open(arguments.wig, "wb") as wigfile:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

539 wigfile.write(wigstring)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

540 else:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

541 for list in results:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

542 wigstring += str(list[0]) + " " + str(list[11]) + "\n"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

543 with open(arguments.wig, "wb") as wigfile:

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

544 wigfile.write(wigstring)

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

545

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

546

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

547

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

548

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

549

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

550 #FITOPSpy="-ef .0 -el .10 -cutoff 0"

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

551 #

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

552 #python ../script/calc_fitness.py $FITOPSpy -wig gview/py_L1_2394eVI_Gluc.wig -t1 alignments/L1_2394eVI_Input.map -t2 alignments/L1_2394eVI_Gluc_T2.map -ref=NC_003028b2.gbk -out results/py_L1_2394eVI_Gluc.csv -out2 results/py_2_L1_2394eVI_Gluc.txt -expansion 675 -normalize tigr4_normal.txt

9aeda6cd07dc Uploaded

kaymccoy

parents:

diff changeset

553 #python ../script/calc_fitness.py $FITOPSpy -wig gview/py_L3_2394eVI_Gluc.wig -t1 alignments/L3_2394eVI_Input.map -t2 alignments/L3_2394eVI_Gluc_T2.map -ref=NC_003028b2.gbk -out results/py_L3_2394eVI_Gluc.csv -out2 results/py_2_L3_2394eVI_Gluc.txt -expansion 244 -normalize tigr4_normal.txt

Mercurial > repos > kaymccoy > calculate_fitnesses

annotate calc_fitness.py @ 4:9aeda6cd07dc draft