Next changeset 1:1d0612af4426 (2016-11-06) |
Commit message:
Uploaded |
added:
aggregate.py |
b |
diff -r 000000000000 -r 30a036e0dc6b aggregate.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/aggregate.py Sun Nov 06 21:04:55 2016 -0500 |
[ |
b'@@ -0,0 +1,297 @@\n+# A translation of aggregate.pl into python! For analysis of Tn-Seq.\n+# This script requires BioPython just like calc_fitness.py, so you need it installed along with its dependencies if you want to run these scripts on your own.\n+# How to install BioPython and a list of its dependencies can be found here: http://biopython.org/DIST/docs/install/Installation.html\n+# K. McCoy\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+##### ARGUMENTS #####\n+\n+def print_usage():\n+\tprint "Aggregate.py\'s usage is as follows:" + "\\n\\n"\n+\tprint "\\033[1m" + "Required" + "\\033[0m" + "\\n"\n+\tprint "-o" + "\\t\\t" + "Output file for aggregated data." + "\\n"\n+\tprint "\\n"\n+\tprint "\\033[1m" + "Optional" + "\\033[0m" + "\\n"\n+\tprint "-c" + "\\t\\t" + "Check for missing genes in the data set - provide a reference genome in genbank format. Missing genes will be sent to stdout." + "\\n"\n+\tprint "-m" + "\\t\\t" + "Place a mark in an extra column for this set of genes. Provide a file with a list of genes seperated by newlines." + "\\n"\n+\tprint "-x" + "\\t\\t" + "Cutoff: Don\'t include fitness scores with average counts (c1+c2)/2 < x (default: 0)" + "\\n"\n+\tprint "-b" + "\\t\\t" + "Blanks: Exclude -b % of blank fitness scores (scores where c2 = 0) (default: 0 = 0%)" + "\\n"\n+\tprint "-f" + "\\t\\t" + "An in-between file carrying information on the blank count found from calc_fitness or consol_fitness; one of two ways to pass a blank count to this script" + "\\n"\n+\tprint "-w" + "\\t\\t" + "Use weighted algorithm to calculate averages, variance, sd, se" + "\\n"\n+\tprint "-l" + "\\t\\t" + "Weight ceiling: maximum value to use as a weight (default: 999,999)" + "\\n"\n+\tprint "\\n"\n+\tprint "All remainder arguements will be treated as fitness files (those files created by calc_fitness.py)" + "\\n"\n+\tprint "\\n"\n+\t\n+import argparse \n+parser = argparse.ArgumentParser()\n+parser.add_argument("-o", action="store", dest="summary")\n+parser.add_argument("-c", action="store", dest="find_missing")\n+parser.add_argument("-m", action="store", dest="marked")\n+parser.add_argument("-x", action="store", dest="cutoff")\n+parser.add_argument("-b", action="store", dest="blank_pc")\n+parser.add_argument("-f", action="store", dest="blank_file")\n+parser.add_argument("-w", action="store", dest="weighted")\n+parser.add_argument("-l", action="store", dest="weight_ceiling")\n+parser.add_argument("fitnessfiles", nargs=argparse.REMAINDER)\n+\n+arguments = parser.parse_args()\n+\n+if not arguments.summary:\n+\tprint "\\n" + "You are missing a value for the -o flag. "\n+\tprint_usage() \n+\tquit()\n+\n+if not arguments.fitnessfiles:\n+\tprint "\\n" + "You are missing fitness file(s); these should be entered immediately after all the flags. "\n+\tprint_usage() \n+\tquit()\n+\t\n+# 999,999 is a trivial placeholder number\n+\t\n+if (not arguments.weight_ceiling):\n+\targuments.weight_ceiling = 999999\n+\t\n+# Cutoff exists to discard positions with a low number of counted transcripts, because their fitness may not be as accurate - for the same reasoning that studies with low sample sizes can be innacurate. \n+\t\n+if (not arguments.cutoff):\n+\targuments.cutoff = 0\n+\n+# Gets information from the txt output file of calc_fit / consol, if inputted\n+\n+if arguments.blank_file:\n+\twith open(arguments.blank_file) as file:\n+\t\tblank_pc = file.read().splitlines()\n+\t\targuments.blank_pc = float(blank_pc[0].split()[1])\n+\n+if (not arguments.blank_pc):\n+\targuments.blank_pc = 0\n+\n+\n+\n+\n+\n+##### SUBROUTINES #####\n+\n+# A subroutine that calculates the average, variance, standard deviation (sd), and standard error (se) of a group of scores; for use when aggregating scores by gene later on\n+\n+import math\n+def unweighted_average(scores):\n+\tsum = 0\n+\tnum = 0\n+\ti = 0\n+\twhile i < len(scores):\n+\t\tif not scores[i]:\n+\t\t\tscores[i] = 0.0\n+\t\tsum += float(scores[i])\n+\t\tnum += 1\n+\t\ti += 1\n+\taverage = sum/num\n+\txminusxbars = 0\n+\twhile i < len(scores):\n+\t\txminusxbars += (float(scores[i]) - average)**2\n+\tif num <= 1:\n+\t\tvariance = 0\n+\telse:\n+\t\tvariance = xminusxbars/(num-1)\n+\tsd = math.sqrt(variance)\n+\tse = sd / ma'..b'"]]\n+\thandle = open(arguments.find_missing, "rU")\n+\tfor record in SeqIO.parse(handle, "genbank"):\n+\t\trefname = record.id\n+\t\tfeatures = record.features\n+\thandle.close()\n+\t\n+#Goes through the features to find which are genes\n+\t\n+\tfor feature in features:\n+\t\tgene = ""\n+\t\tif feature.type == "gene":\n+\t\t\tlocus = "".join(feature.qualifiers["locus_tag"])\n+\t\t\tif "gene" in feature.qualifiers:\n+\t\t\t\tgene = "".join(feature.qualifiers["gene"])\n+\t\telse:\n+\t\t\tcontinue\n+\t\t\t\n+#Goes through the fitness scores of insertions within each gene, and removes whatever % of blank fitness scores were requested along with their corresponding weights\n+\n+\t\tsum = 0\n+\t\tnum = 0\n+\t\tavgsum = 0\n+\t\tblank_ws = 0\n+\t\ti = 0\n+\t\tif locus in gene_summary.keys():\n+\t\t\tfor w in gene_summary[locus]["w"]:\n+\t\t\t\tif float(w) == 0:\n+\t\t\t\t\tblank_ws += 1\n+\t\t\t\telse:\n+\t\t\t\t\tsum += float(w)\n+\t\t\t\t\tnum += 1\n+\t\t\tcount = num + blank_ws\t\t\n+\t\t\tremoved = 0\n+\t\t\tto_remove = int(float(arguments.blank_pc)*count)\n+\t\t\tif blank_ws > 0:\n+\t\t\t\ti = 0\n+\t\t\t\twhile i < len(gene_summary[locus]["w"]):\n+\t\t\t\t\tw = gene_summary[locus]["w"][i] \n+\t\t\t\t\tif removed == to_remove:\n+\t\t\t\t\t\tbreak\n+\t\t\t\t\tif float(w) == 0:\n+\t\t\t\t\t\tdel gene_summary[locus]["w"][i]\n+\t\t\t\t\t\tdel gene_summary[locus]["s"][i]\n+\t\t\t\t\t\tremoved += 1\n+\t\t\t\t\t\ti -= 1\n+\t\t\t\t\ti += 1\n+\t\t\t\n+#If all the fitness values within a gene are empty, sets mean/var to 0.10 and Xs out sd/se; marks the gene if that\'s requested\n+\n+\t\t\tif num == 0:\t\n+\t\t\t\tif (arguments.marked and locus in marked_set):\n+\t\t\t\t\toutput.append([locus, "0.10", "0.10", "X", "X", gene, count, blank_ws, num, removed, "M", "\\n"])\n+\t\t\t\telse:\n+\t\t\t\t\toutput.append([locus, "0.10", "0.10", "X", "X", gene, count, blank_ws, num, removed, "\\n"])\n+\n+#Otherwise calls average() or weighted_average() to find the aggregate w / count / standard deviation / standard error of the insertions within each gene; marks the gene if that\'s requested\n+\n+\t\t\telse:\n+\t\t\t\tif not arguments.weighted:\n+\t\t\t\t\t(average, variance, stdev, stderr) = unweighted_average(gene_summary[locus]["w"])\n+\t\t\t\telse:\n+\t\t\t\t\t(average, variance, stdev, stderr) = weighted_average(gene_summary[locus]["w"],gene_summary[locus]["s"])\n+\t\t\t\tif (arguments.marked and locus in marked_set):\n+\t\t\t\t\toutput.append([locus, average, variance, stdev, stderr, gene, count, blank_ws, num, removed, "M", "\\n"])\n+\t\t\t\telse:\n+\t\t\t\t\toutput.append([locus, average, variance, stdev, stderr, gene, count, blank_ws, num, removed, "\\n"])\n+\t\t\n+#If a gene doesn\'t have any insertions, sets mean/var to 0.10 and Xs out sd/se, plus leaves count through removed blank because there were no reads.\n+\n+\t\telse:\n+\t\t\tif (arguments.marked and locus in marked_set):\n+\t\t\t\toutput.append([locus, "0.10", "0.10", "X", "X", gene, "", "", "", "", "M", "\\n"])\n+\t\t\telse:\n+\t\t\t\toutput.append([locus, "0.10", "0.10", "X", "X", gene, "", "", "", "", "\\n"])\n+\n+#Writes the aggregated fitness file\n+\n+\twith open(arguments.summary, "wb") as csvfile:\n+\t\twriter = csv.writer(csvfile)\n+\t\twriter.writerows(output)\n+\n+#If finding missing genes is not requested, just finds the aggregate w / count / standard deviation / standard error of the insertions within each gene, and writes them to a file, plus marks the genes requested\n+#This is never called through Galaxy since finding missing genes is just better than not finding them.\n+\n+else:\n+\toutput = [["Locus","W","Count","SD","SE","M\\n"]]\n+\tfor gene in gene_summary.keys():\n+\t\tsum = 0\n+\t\tnum = 0\n+\t\taverage = 0\n+\t\tif "w" not in gene_summary[gene]:\n+\t\t\tcontinue\n+\t\tfor i in gene_summary[gene]["w"]:\n+\t\t\tsum += i\n+\t\t\tnum += 1\n+\t\taverage = sum/num\n+\t\txminusxbars = 0\n+\t\tfor i in w:\n+\t\t\txminusxbars += (i-average)**2\n+\t\tif num > 1:\n+\t\t\tsd = math.sqrt(xminusxbars/(num-1))\n+\t\t\tse = sd / math.sqrt(num)\n+\t\tif (arguments.marked and locus in marked_set):\n+\t\t\toutput.append([gene, average, num, sd, se, "M", "\\n"])\n+\t\telse:\n+\t\t\toutput.append([gene, average, num, sd, se, "\\n"])\n+\twith open(arguments.summary, "wb") as csvfile:\n+\t\twriter = csv.writer(csvfile)\n+\t\twriter.writerows(output)\n\\ No newline at end of file\n' |