Galaxy |

Changeset 4:9aeda6cd07dc (2016-08-11)

Previous changeset 3:6aef6d1c9316 (2016-08-11) Next changeset 5:aa156d61c38c (2016-08-11)

Commit message:
Uploaded

added:
calc_fitness.py

diff -r 6aef6d1c9316 -r 9aeda6cd07dc calc_fitness.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/calc_fitness.py Thu Aug 11 18:34:33 2016 -0400

[

b'@@ -0,0 +1,553 @@\n+# A translation of calc_fitness.pl into python! For analysis of Tn-Seq.\n+# This script requires BioPython, which in turn has a good number of dependencies (some optional but very helpful).\n+# How to install BioPython and a list of its dependencies can be found here: http://biopython.org/DIST/docs/install/Installation.html\n+# To see what future edits / tests are planned for this script, search for the phrase "in the future".\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+##### ARGUMENTS #####\n+\n+def print_usage():\n+\tprint "\\n" + "You are missing one or more required flags. A complete list of flags accepted by calc_fitness is as follows:" + "\\n\\n"\n+\tprint "\\033[1m" + "Required" + "\\033[0m" + "\\n"\n+\tprint "-ref" + "\\t\\t" + "The name of the reference genome file, in GenBank format." + "\\n"\n+\tprint "-t1" + "\\t\\t" + "The name of the bowtie mapfile from time 1." + "\\n"\n+\tprint "-t2" + "\\t\\t" + "The name of the bowtie mapfile from time 2." + "\\n"\n+\tprint "-out" + "\\t\\t" + "Name of a file to enter the .csv output." + "\\n"\n+\tprint "\\n"\n+\tprint "\\033[1m" + "Optional" + "\\033[0m" + "\\n"\n+\tprint "-expansion" + "\\t\\t" + "Expansion factor (default: 250)" + "\\n"\n+\tprint "-d" + "\\t\\t" + "All reads being analyzed are downstream of the transposon" + "\\n"\n+\tprint "-reads1" + "\\t\\t" + "The number of reads to be used to calculate the correction factor for time 0." + "\\n\\t\\t" + "(default counted from bowtie output)" + "\\n"\n+\tprint "-reads2" + "\\t\\t" + "The number of reads to be used to calculate the correction factor for time 6." + "\\n\\t\\t" + "(default counted from bowtie output)" + "\\n"\n+\tprint "-cutoff" + "\\t\\t" + "Discard any positions where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\\n"\n+\tprint "-cutoff2" + "\\t\\t" + "Discard any positions within the normalization genes where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\\n"\n+\tprint "-strand" + "\\t\\t" + "Use only the specified strand (+ or -) when counting transcripts (default: both)" + "\\n"\n+\tprint "-normalize" + "\\t" + "A file that contains a list of genes that should have a fitness of 1" + "\\n"\n+\tprint "-maxweight" + "\\t" + "The maximum weight a transposon gene can have in normalization calculations" + "\\n"\n+\tprint "-multiply" + "\\t" + "Multiply all fitness scores by a certain value (e.g., the fitness of a knockout). You should normalize the data." + "\\n"\n+\tprint "-ef" + "\\t\\t" + "Exclude insertions that occur in the first N amount (%) of gene--becuase may not affect gene function." + "\\n"\n+\tprint "-el" + "\\t\\t" + "Exclude insertions in the last N amount (%) of the gene--considering truncation may not affect gene function." + "\\n"\n+\tprint "-wig" + "\\t\\t" + "Create a wiggle file for viewing in a genome browser. Provide a filename." + "\\n"\n+\tprint "\\n"\n+\n+import argparse \n+parser = argparse.ArgumentParser()\n+parser.add_argument("-ref", action="store", dest="ref_genome")\n+parser.add_argument("-t1", action="store", dest="mapfile1")\n+parser.add_argument("-t2", action="store", dest="mapfile2")\n+parser.add_argument("-out", action="store", dest="outfile")\n+parser.add_argument("-out2", action="store", dest="outfile2")\n+parser.add_argument("-expansion", action="store", dest="expansion_factor")\n+parser.add_argument("-d", action="store", dest="downstream")\n+parser.add_argument("-reads1", action="store", dest="reads1")\n+parser.add_argument("-reads2", action="store", dest="reads2")\n+parser.add_argument("-cutoff", action="store", dest="cutoff")\n+parser.add_argument("-cutoff2", action="store", dest="cutoff2")\n+parser.add_argument("-strand", action="store", dest="usestrand")\n+parser.add_argument("-normalize", action="store", dest="normalize")\n+parser.add_argument("-maxweight", action="store", dest="max_weight")\n+parser.add_argument("-multiply", action="store", dest="multiply")\n+parser.add_argument("-ef", action="store", dest="exclude_first")\n+parser.add_argument("-el", action="store", dest="exclude_last")\n+parser.add_arg'..b'rtion within the transposon genes weighted by how many insertions each had.\n+\n+\taverage = sum / count\n+\ti = 0\n+\tweighted_sum = 0\n+\tweight_sum = 0\n+\twhile i < len(weights):\n+\t\tweighted_sum += weights[i]*scores[i]\n+\t\tweight_sum += weights[i]\n+\t\ti += 1\n+\tweighted_average = weighted_sum/weight_sum\n+ \n+# Prints the regular average, weighted average, and total insertions for reference\n+ \n+\tprint "Normalization step:" + "\\n"\n+\tprint "Regular average: " + str(average) + "\\n"\n+\tprint "Weighted Average: " + str(weighted_average) + "\\n"\n+\tprint "Total Insertions: " + str(count) + "\\n"\n+ \n+# The actual normalization happens here; every fitness score is divided by the average fitness found for genes that should have a value of 1. \n+# For example, if the average fitness for genes was too low overall - let\'s say 0.97 within the normalization geness - every fitness would be proportionally raised.\n+\n+\told_ws = 0\n+\tnew_ws = 0\n+\twcount = 0\n+\tfor list in results:\n+\t\tif list[11] == \'W\':\n+\t\t\tcontinue\n+\t\tnew_w = float(list[11])/weighted_average\n+\t\t\n+# Sometimes you want to multiply all the fitness values by a constant; this does that.\n+# For example you might multiply all the values by a constant for a genetic interaction screen - where Tn-Seq is performed as usual except there\'s one background knockout all the mutants share. This is\n+# because independent mutations should have a fitness value that\'s equal to their individual fitness values multipled, but related mutations will deviate from that; to find those deviations you\'d multiply\n+# all the fitness values from mutants from a normal library by the fitness of the background knockout and compare that to the fitness values found from the knockout library!\n+\t\t\n+\t\tif arguments.multiply:\n+\t\t\tnew_w *= float(arguments.multiply)\n+\t\t\n+# Records the old w score for reference, and adds it to a total sum of all w scores (so that the old w mean and new w mean can be printed later).\n+\t\t\n+\t\tif float(list[11]) > 0:\n+\t\t\told_ws += float(list[11])\n+\t\t\tnew_ws += new_w\n+\t\t\twcount += 1\n+\t\t\t\n+# Writes the new w score into the results list of lists.\n+\t\t\t\n+\t\tlist[12] = new_w\n+\t\t\n+# Adds a line to wiglist for each insertion position, with the insertion position and its new w value.\n+\t\t\n+\t\tif (arguments.wig):\n+\t\t\twigstring += str(list[0]) + " " + str(new_w) + "\\n"\n+ \n+# Prints the old w mean and new w mean for reference.\n+ \n+\told_w_mean = old_ws / wcount\n+\tnew_w_mean = new_ws / wcount\n+\tprint "Old W Average: " + str(old_w_mean) + "\\n"\n+\tprint "New W Average: " + str(new_w_mean) + "\\n"\n+\n+# Overwrites the old file with the normalized file.\n+\n+with open(arguments.outfile, "wb") as csvfile:\n+ writer = csv.writer(csvfile)\n+ writer.writerows(results)\n+ \n+# If a WIG file was requested, actually creates the WIG file and writes wiglist to it\n+# So what\'s written here is the WIG header plus each insertion position and it\'s new w value if normalization were called for, and each insertion position and its unnormalized w value if normalization were not called for.\n+\t\t\n+if (arguments.wig):\n+\tif (arguments.normalize):\n+\t\twith open(arguments.wig, "wb") as wigfile:\n+\t\t\twigfile.write(wigstring)\n+\telse:\n+\t\tfor list in results:\n+\t\t\twigstring += str(list[0]) + " " + str(list[11]) + "\\n"\n+\t\twith open(arguments.wig, "wb") as wigfile:\n+\t\t\t\twigfile.write(wigstring)\t\t\t\t\t\n+\n+\n+\n+\n+\n+#FITOPSpy="-ef .0 -el .10 -cutoff 0"\n+#\n+#python ../script/calc_fitness.py $FITOPSpy -wig gview/py_L1_2394eVI_Gluc.wig -t1 alignments/L1_2394eVI_Input.map -t2 alignments/L1_2394eVI_Gluc_T2.map -ref=NC_003028b2.gbk -out results/py_L1_2394eVI_Gluc.csv -out2 results/py_2_L1_2394eVI_Gluc.txt -expansion 675 -normalize tigr4_normal.txt\n+#python ../script/calc_fitness.py $FITOPSpy -wig gview/py_L3_2394eVI_Gluc.wig -t1 alignments/L3_2394eVI_Input.map -t2 alignments/L3_2394eVI_Gluc_T2.map -ref=NC_003028b2.gbk -out results/py_L3_2394eVI_Gluc.csv -out2 results/py_2_L3_2394eVI_Gluc.txt -expansion 244 -normalize tigr4_normal.txt\n\\ No newline at end of file\n'