Previous changeset 3:98ec522f4e95 (2016-08-11) Next changeset 5:deb5134655cb (2016-08-11) |
Commit message:
Uploaded |
added:
consol_fit.py |
b |
diff -r 98ec522f4e95 -r 7d2f2d1a23ee consol_fit.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/consol_fit.py Thu Aug 11 18:33:54 2016 -0400 |
[ |
b'@@ -0,0 +1,344 @@\n+# Consol_fit! It\'s a script & it\'ll consolidate your fitness values if you got them from a looping trimming pipeline instead of the standard split-by-transposon pipeline. That\'s it!\n+\n+# Test: python ../script/consol_fit.py -calctxt results/py_2_L3_2394eVI_Gluc.txt -wig gview/consol_L3_2394eVI_Gluc.wig -i results/py_L3_2394eVI_Gluc.csv -out results/consol_L3_2394eVI_Gluc.csv -out2 results/py_2_L3_2394eVI_Gluc.csv -normalize tigr4_normal.txt\n+\n+# Test: python ../script/consol_fit.py -calctxt results/py_2_L3_2394eVI_Gluc.txt -wig gview/consol_L3_2394eVI_Gluc.wig -i results/galaxy_test.csv -out results/consol_L3_2394eVI_Gluc.csv -out2 results/py_2_L3_2394eVI_Gluc.csv -normalize tigr4_normal.txt\n+\n+import math\n+import csv\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+##### ARGUMENTS #####\n+\n+def print_usage():\n+\tprint "\\n" + "You are missing one or more required flags. A complete list of flags accepted by calc_fitness is as follows:" + "\\n\\n"\n+\tprint "\\033[1m" + "Required" + "\\033[0m" + "\\n"\n+\tprint "-i" + "\\t\\t" + "The calc_fit file to be consolidated" + "\\n"\n+\tprint "-out" + "\\t\\t" + "Name of a file to enter the .csv output." + "\\n"\n+\tprint "-out2" + "\\t\\t" + "Name of a file to put the percent blank score in (used in aggregate)." + "\\n"\n+\tprint "-calctxt" + "\\t\\t" + "The txt file output from calc_fit" + "\\n"\n+\tprint "-normalize" + "\\t" + "A file that contains a list of genes that should have a fitness of 1" + "\\n"\n+\tprint "\\n"\n+\tprint "\\033[1m" + "Optional" + "\\033[0m" + "\\n"\n+\tprint "-cutoff" + "\\t\\t" + "Discard any positions where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\\n"\n+\tprint "-cutoff2" + "\\t\\t" + "Discard any positions within the normalization genes where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\\n"\n+\tprint "-wig" + "\\t\\t" + "Create a wiggle file for viewing in a genome browser. Provide a filename." + "\\n"\n+\tprint "-maxweight" + "\\t" + "The maximum weight a transposon gene can have in normalization calculations" + "\\n"\n+\tprint "-multiply" + "\\t" + "Multiply all fitness scores by a certain value (e.g., the fitness of a knockout). You should normalize the data." + "\\n"\n+\tprint "\\n"\n+\n+import argparse \n+parser = argparse.ArgumentParser()\n+parser.add_argument("-calctxt", action="store", dest="calctxt")\n+parser.add_argument("-normalize", action="store", dest="normalize")\n+parser.add_argument("-i", action="store", dest="input")\n+parser.add_argument("-out", action="store", dest="outfile")\n+parser.add_argument("-out2", action="store", dest="outfile2")\n+parser.add_argument("-cutoff", action="store", dest="cutoff")\n+parser.add_argument("-cutoff2", action="store", dest="cutoff2")\n+parser.add_argument("-wig", action="store", dest="wig")\n+parser.add_argument("-maxweight", action="store", dest="max_weight")\n+parser.add_argument("-multiply", action="store", dest="multiply")\n+arguments = parser.parse_args()\n+\n+# Checks that all the required arguments have actually been entered\n+\n+if (not arguments.input or not arguments.outfile or not arguments.calctxt):\n+\tprint_usage() \n+\tquit()\n+\n+#\n+\n+if (not arguments.max_weight):\n+\targuments.max_weight = 75\n+\n+#\n+\n+if (not arguments.cutoff):\n+\targuments.cutoff = 0\n+\t\n+# Sets the default value of cutoff2 to 10; cutoff2 exists to discard positions within normalization genes with a low number of counted transcripts, because fitnesses calculated from them similarly may not be very accurate.\n+# This only has an effect if it\'s larger than cutoff, since the normalization step references a list of insertions already affected by cutoff.\n+\t\n+if (not arguments.cutoff2):\n+\targuments.cutoff2 = 10\n+\n+#Gets total & refname from calc_fit outfile2\n+\n+with open(arguments.calctxt) as file:\n+\tcalctxt = file.readlines()\n+total = float(calctxt[1].split()[1])\n+refname = calctxt[2].split()[1]\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\t\n+##### CONSOLIDATING THE CALC_FIT FILE #####\n+\n+with open(arguments.input) as file:\n+\tinput = file.readlines()\n'..b'\n+\t\t\n+# Prints the number of of blank fitness values found and removed for reference. Writes the percentage to a file so it can be referenced for aggregate analysis.\n+\t\n+\tpc_blank_normals = float(blank_count) / float(original_count)\n+\tprint "# blank out of " + str(original_count) + ": " + str(pc_blank_normals) + "\\n"\n+\twith open(arguments.outfile2, "w") as f:\n+\t\tf.write("blanks: " + str(pc_blank_normals) + "\\n" + "total: " + str(total) + "\\n" + "refname: " + refname)\n+ \n+ \n+# Finds "average" - the average fitness value for an insertion within the transposon genes - and "weighted_average" - the average fitness value for an insertion within the transposon genes weighted by how many insertions each had.\n+\n+\taverage = sum / count\n+\ti = 0\n+\tweighted_sum = 0\n+\tweight_sum = 0\n+\twhile i < len(weights):\n+\t\tweighted_sum += weights[i]*scores[i]\n+\t\tweight_sum += weights[i]\n+\t\ti += 1\n+\tweighted_average = weighted_sum/weight_sum\n+ \n+# Prints the regular average, weighted average, and total insertions for reference\n+ \n+\tprint "Normalization step:" + "\\n"\n+\tprint "Regular average: " + str(average) + "\\n"\n+\tprint "Weighted Average: " + str(weighted_average) + "\\n"\n+\tprint "Total Insertions: " + str(count) + "\\n"\n+ \n+# The actual normalization happens here; every fitness score is divided by the average fitness found for genes that should have a value of 1. \n+# For example, if the average fitness for genes was too low overall - let\'s say 0.97 within the normalization geness - every fitness would be proportionally raised.\n+\n+\told_ws = 0\n+\tnew_ws = 0\n+\twcount = 0\n+\tfor list in results:\n+\t\tif list[11] == \'W\':\n+\t\t\tcontinue\n+\t\tnew_w = float(list[11])/weighted_average\n+\t\t\n+# Sometimes you want to multiply all the fitness values by a constant; this does that.\n+# For example you might multiply all the values by a constant for a genetic interaction screen - where Tn-Seq is performed as usual except there\'s one background knockout all the mutants share. This is\n+# because independent mutations should have a fitness value that\'s equal to their individual fitness values multipled, but related mutations will deviate from that; to find those deviations you\'d multiply\n+# all the fitness values from mutants from a normal library by the fitness of the background knockout and compare that to the fitness values found from the knockout library!\n+\t\t\n+\t\tif arguments.multiply:\n+\t\t\tnew_w *= float(arguments.multiply)\n+\t\t\n+# Records the old w score for reference, and adds it to a total sum of all w scores (so that the old w mean and new w mean can be printed later).\n+\t\t\n+\t\tif float(list[11]) > 0:\n+\t\t\told_ws += float(list[11])\n+\t\t\tnew_ws += new_w\n+\t\t\twcount += 1\n+\t\t\t\n+# Writes the new w score into the results list of lists.\n+\t\t\t\n+\t\tlist[12] = new_w\n+\t\t\n+# Adds a line to wiglist for each insertion position, with the insertion position and its new w value.\n+\t\t\n+\t\tif (arguments.wig):\n+\t\t\twigstring += str(list[0]) + " " + str(new_w) + "\\n"\n+ \n+# Prints the old w mean and new w mean for reference.\n+ \n+\told_w_mean = old_ws / wcount\n+\tnew_w_mean = new_ws / wcount\n+\tprint "Old W Average: " + str(old_w_mean) + "\\n"\n+\tprint "New W Average: " + str(new_w_mean) + "\\n"\n+\n+# Overwrites the old file with the normalized file.\n+\n+with open(arguments.outfile, "wb") as csvfile:\n+ writer = csv.writer(csvfile)\n+ writer.writerows(results)\n+ \n+# If a WIG file was requested, actually creates the WIG file and writes wiglist to it\n+# So what\'s written here is the WIG header plus each insertion position and it\'s new w value if normalization were called for, and each insertion position and its unnormalized w value if normalization were not called for.\n+\t\t\n+if (arguments.wig):\n+\tif (arguments.normalize):\n+\t\twith open(arguments.wig, "wb") as wigfile:\n+\t\t\twigfile.write(wigstring)\n+\telse:\n+\t\tfor list in results:\n+\t\t\twigstring += str(list[0]) + " " + str(list[11]) + "\\n"\n+\t\twith open(arguments.wig, "wb") as wigfile:\n+\t\t\t\twigfile.write(wigstring)\n\\ No newline at end of file\n' |