# HG changeset patch # User kaymccoy # Date 1478482192 18000 # Node ID ed2ca82d332d0429865e0a3a81adb0ea57007b38 # Parent 28cc5b2a0f6df5a94db055472cbbab150b7aae60 Deleted selected files diff -r 28cc5b2a0f6d -r ed2ca82d332d aggregate.py --- a/aggregate.py Sat Aug 13 00:10:07 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,495 +0,0 @@ -# A translation of aggregate.pl into python! For analysis of Tn-Seq. -# This script requires BioPython just like calc_fitness.py, so you need it installed along with its dependencies if you want to run these scripts on your own. -# How to install BioPython and a list of its dependencies can be found here: http://biopython.org/DIST/docs/install/Installation.html -# K. McCoy - - - - - - - - - -##### ARGUMENTS ##### - -def print_usage(): - print "Aggregate.py's usage is as follows:" + "\n\n" - print "\033[1m" + "Required" + "\033[0m" + "\n" - print "-o" + "\t\t" + "Output file for aggregated data." + "\n" - print "\n" - print "\033[1m" + "Optional" + "\033[0m" + "\n" - print "-c" + "\t\t" + "Check for missing genes in the data set - provide a reference genome in genbank format. Missing genes will be sent to stdout." + "\n" - print "-m" + "\t\t" + "Place a mark in an extra column for this set of genes. Provide a file with a list of genes seperated by newlines." + "\n" - print "-x" + "\t\t" + "Cutoff: Don't include fitness scores with average counts (c1+c2)/2 < x (default: 0)" + "\n" - print "-b" + "\t\t" + "Blanks: Exclude -b % of blank fitness scores (scores where c2 = 0) (default: 0 = 0%)" + "\n" - print "-f" + "\t\t" + "An in-between file carrying information on the blank count found from calc_fitness or consol_fitness; one of two ways to pass a blank count to this script" + "\n" - print "-w" + "\t\t" + "Use weighted algorithm to calculate averages, variance, sd, se" + "\n" - print "-l" + "\t\t" + "Weight ceiling: maximum value to use as a weight (default: 999,999)" + "\n" - print "\n" - print "All remainder arguements will be treated as fitness files (those files created by calc_fitness.py)" + "\n" - print "\n" - -import argparse -parser = argparse.ArgumentParser() -parser.add_argument("-o", action="store", dest="summary") -parser.add_argument("-c", action="store", dest="find_missing") -parser.add_argument("-m", action="store", dest="marked") -parser.add_argument("-x", action="store", dest="cutoff") -parser.add_argument("-b", action="store", dest="blank_pc") -parser.add_argument("-f", action="store", dest="blank_file") -parser.add_argument("-w", action="store", dest="weighted") -parser.add_argument("-l", action="store", dest="weight_ceiling") -parser.add_argument("fitnessfiles", nargs=argparse.REMAINDER) - -arguments = parser.parse_args() - -if not arguments.summary: - print "\n" + "You are missing a value for the -o flag. " - print_usage() - quit() - -if not arguments.fitnessfiles: - print "\n" + "You are missing fitness file(s); these should be entered immediately after all the flags. " - print_usage() - quit() - -# 999,999 is a trivial placeholder number - -if (not arguments.weight_ceiling): - arguments.weight_ceiling = 999999 - -# Cutoff exists to discard positions with a low number of counted transcripts, because their fitness may not be as accurate - for the same reasoning that studies with low sample sizes can be innacurate. - -if (not arguments.cutoff): - arguments.cutoff = 0 - -# Gets information from the txt output file of calc_fit / consol, if inputted - -if arguments.blank_file: - with open(arguments.blank_file) as file: - blank_pc = file.read().splitlines() - arguments.blank_pc = float(blank_pc[0].split()[1]) - -if (not arguments.blank_pc): - arguments.blank_pc = 0 - - - - - -##### SUBROUTINES ##### - -# A subroutine that calculates the average, variance, standard deviation (sd), and standard error (se) of a group of scores; for use when aggregating scores by gene later on - -import math -def unweighted_average(scores): - sum = 0 - num = 0 - i = 0 - while i < len(scores): - if not scores[i]: - scores[i] = 0.0 - sum += float(scores[i]) - num += 1 - i += 1 - average = sum/num - xminusxbars = 0 - while i < len(scores): - xminusxbars += (float(scores[i]) - average)**2 - if num <= 1: - variance = 0 - else: - variance = xminusxbars/(num-1) - sd = math.sqrt(variance) - se = sd / math.sqrt(num) - return (average, variance, sd, se) - -# A subroutine that calculates the weighted average, variance, standard deviation (sd), and standard error (se) of a group of scores; the weights come from the number of reads each insertion location has -# For use when aggregating scores by gene later on, if the weighted argument is called - -def weighted_average(scores,weights): - sum = 0 - weighted_average = 0 - weighted_variance = 0 - top = 0 - bottom = 0 - i = 0 - while i < len(weights): - if not scores[i]: - scores[i] = 0.0 - top += float(weights[i])*float(scores[i]) - bottom += float(weights[i]) - i += 1 - if bottom == 0: - return 0 - weighted_average = top/bottom - top = 0 - bottom = 0 - i = 0 - while i < len(weights): - top += float(weights[i]) * (float(scores[i]) - weighted_average)**2 - bottom += float(weights[i]) - i += 1 - weighted_variance = top/bottom - weighted_stdev = math.sqrt(weighted_variance) - weighted_stder = weighted_stdev/math.sqrt(len(scores)) - return (weighted_average, weighted_variance, weighted_stdev, weighted_stder) - - - - - - - - - - -##### AGGREGATION / CALCULATIONS ##### - -#Reads the genes which should be marked in the final aggregate file into an array - -import os.path -if arguments.marked: - with open(arguments.marked) as file: - marked_set = file.read().splitlines() - -#Creates a dictionary of dictionaries to contain a summary of all genes and their fitness values -#The fitness values and weights match up, so that the weight of gene_summary[locus]["w"][2] would be gene_summary[locus]["s"][2] - -import csv -gene_summary = {} -for eachfile in arguments.fitnessfiles: - with open(eachfile) as csvfile: - lines = csv.reader(csvfile) - for line in lines: - locus = line[9] - w = line[12] - if w == 'nW': - continue - if not w: - w == 0 - c1 = float(line[2]) - c2 = float(line[3]) - avg = (c1+c2)/2 - if avg < float(arguments.cutoff): - continue - if avg > float(arguments.weight_ceiling): - avg = arguments.weight_ceiling - if locus not in gene_summary: - gene_summary[locus] = {"w" : [], "s": []} - gene_summary[locus]["w"].append(w) - gene_summary[locus]["s"].append(avg) - -#If finding any missing gene loci is requested in the arguments, starts out by loading all the known features from a genbank file - -from Bio import SeqIO -if (arguments.find_missing): - output = [["locus","mean","var","sd","se","gene","Total","Blank","Not Blank","Blank Removed","M\n"]] - handle = open(arguments.find_missing, "rU") - for record in SeqIO.parse(handle, "genbank"): - refname = record.id - features = record.features - handle.close() - -#Goes through the features to find which are genes - - for feature in features: - gene = "" - if feature.type == "gene": - locus = "".join(feature.qualifiers["locus_tag"]) - if "gene" in feature.qualifiers: - gene = "".join(feature.qualifiers["gene"]) - else: - continue - -#Goes through the fitness scores of insertions within each gene, and removes whatever % of blank fitness scores were requested along with their corresponding weights - - sum = 0 - num = 0 - avgsum = 0 - blank_ws = 0 - i = 0 - if locus in gene_summary.keys(): - for w in gene_summary[locus]["w"]: - if float(w) == 0: - blank_ws += 1 - else: - sum += float(w) - num += 1 - count = num + blank_ws - removed = 0 - to_remove = int(float(arguments.blank_pc)*count) - if blank_ws > 0: - i = 0 - while i < len(gene_summary[locus]["w"]): - w = gene_summary[locus]["w"][i] - if removed == to_remove: - break - if float(w) == 0: - del gene_summary[locus]["w"][i] - del gene_summary[locus]["s"][i] - removed += 1 - i -= 1 - i += 1 - -#If all the fitness values within a gene are empty, sets mean/var to 0.10 and Xs out sd/se; marks the gene if that's requested - - if num == 0: - if (arguments.marked and locus in marked_set): - output.append([locus, "0.10", "0.10", "X", "X", gene, count, blank_ws, num, removed, "M", "\n"]) - else: - output.append([locus, "0.10", "0.10", "X", "X", gene, count, blank_ws, num, removed, "\n"]) - -#Otherwise calls average() or weighted_average() to find the aggregate w / count / standard deviation / standard error of the insertions within each gene; marks the gene if that's requested - - else: - if not arguments.weighted: - (average, variance, stdev, stderr) = unweighted_average(gene_summary[locus]["w"]) - else: - (average, variance, stdev, stderr) = weighted_average(gene_summary[locus]["w"],gene_summary[locus]["s"]) - if (arguments.marked and locus in marked_set): - output.append([locus, average, variance, stdev, stderr, gene, count, blank_ws, num, removed, "M", "\n"]) - else: - output.append([locus, average, variance, stdev, stderr, gene, count, blank_ws, num, removed, "\n"]) - -#If a gene doesn't have any insertions, sets mean/var to 0.10 and Xs out sd/se, plus leaves count through removed blank because there were no reads. - - else: - if (arguments.marked and locus in marked_set): - output.append([locus, "0.10", "0.10", "X", "X", gene, "", "", "", "", "M", "\n"]) - else: - output.append([locus, "0.10", "0.10", "X", "X", gene, "", "", "", "", "\n"]) - -#Writes the aggregated fitness file - - with open(arguments.summary, "wb") as csvfile: - writer = csv.writer(csvfile) - writer.writerows(output) - -#If finding missing genes is not requested, just finds the aggregate w / count / standard deviation / standard error of the insertions within each gene, and writes them to a file, plus marks the genes requested -#This is never called through Galaxy since finding missing genes is just better than not finding them. - -else: - output = [["Locus","W","Count","SD","SE","M\n"]] - for gene in gene_summary.keys(): - sum = 0 - num = 0 - average = 0 - if "w" not in gene_summary[gene]: - continue - for i in gene_summary[gene]["w"]: - sum += i - num += 1 - average = sum/num - xminusxbars = 0 - for i in w: - xminusxbars += (i-average)**2 - if num > 1: - sd = math.sqrt(xminusxbars/(num-1)) - se = sd / math.sqrt(num) - if (arguments.marked and locus in marked_set): - output.append([gene, average, num, sd, se, "M", "\n"]) - else: - output.append([gene, average, num, sd, se, "\n"]) - with open(arguments.summary, "wb") as csvfile: - writer = csv.writer(csvfile) - writer.writerows(output) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# -# ~MMM=:DMMM?, +NMMO=,:~I8MMMMM8+, , ~I8MMMMMN87~?8NNMMN8: +NMND~ +MN= ,$MMMI ?M8, ,OM8, :MN+ =MM? ,MMDNMMD ,+DM8I, ,,:::~~~:::::::::: -# IMMMNMM8I ,I8MM87~::+$8NNMMMMOI+=~~:, ,,:~=?$DNMMMMMMDOZI7ZDMMMD8I , , $M8+?8MM8I , 7MI +MN= ZMN, 8MD MMN8MMM, :$ONM8I+=:, ,,,::::~~~~~=====~: -# , ,DMNN7: , ,OMMN7==~::~=?8NNMMMMMMNNMMMMMMMMMMMN8OO8ODNMMMMMD~ , IMMNMN~ ,OM+ ,NM$ ,NMO, :MM$ , ,:::,,::::,, $MNMMNM, ,,, :?ONMMNN8?~,, , ,,,,,,,::~~=+++??=~ -# ,,:=+????+, I$ :ZMMN8$~:,,, ,:=?7$O8DD8O$7+==+$O8DNMMMMMMMMM$ ?$, == ,~, ~NM= 8MD, ,OM8ZMMO , ,::::::~~~:,, ?MNMMZ ,,,,,, ,+7ONMNMD8O$+~, ,,,,,,::~====:: -# ,:=IONMMMMMMMMMMM8: ZN$: ,~DMMMND7=, ,,:~====:=$DNMMMMMMMN88MMMZ +N$ , ,7DN8= =MN, IMM =DMN7 ,,,,,,,,,, ,~?, ,,,,, , ~?$8MMMMNN8Z?~:,, ,:::,,, -#+ONMMMMMMNO7=:,, ,,+MMO, 7D$: ~OMNMNNMNNNNNNNNNNNMMMMMMMMMNMMMM?,~MMM8 ND, 7MM=, ?NN:, +MO, ?MM, ,,,, , :,:=$DMMMMMMN87=~, ,,,,, -#MMND8$: , 7NM7 , =?~,,, :?88DDDNNMNNNDNDD88Z?:, ZMM$ ,MMMO ,MM+ZNMM? ::ZNZ, +M$ ,MM?, , ,, ,,:=?Z8NMMMMMN8Z+=, -#: ~ZMM8~ ,,,, 8MM, +MMM, 7ZZI~=$OOZ$: ,:+???+, +MZ =MN= ,,, , ,,:~=?IIII$ZO88DDNNNNNNMMMMMMMMMMN~, -# ,:OMMM? ,,:,, 8MM ~NNM7 :OMMMMMMMMO: =M8, :NM~ = ,~?I, ,,,,,,,,,,,,,, ,~$DNMMMMMMMMMND8O888Z$II7777I??+++===:, -# ?DMM8?, OMN??NMM$ ~8MMMO?===7MMM8: ~NM= =NN: ,OM~ ,, +NMMN~ ,,,,, ,,,,,,,, ,?$O8NNMDDZ7?+=~:, , , , -# , ~$MMMD+ , , ~MMNMMN~ : +NMMZ, NMNM~ 7MI , ZNN,, IMN, :DMNM+ ~+NMMMD~, ,,,,,,,, ,, ,,,,,:, +OMNMMOI~, -# , $MMMM$, , ,=?ODNMNNMNMMMMNNND~ ,$D$, , ,8MM8 ,MMM7 ,ZMNNMM= DMMNMMMMMMMMMMMMNI: , ,,,,,,,,, ,,, ?NMO=, , -# ,~ZNMND7, ,,:~=+$DNNMMMMNDDDD888OZZZZ8NMMN IMMN: ,MMN~ +Z$+, ?NNNDO+:?O888OI, ,,,,,,, ,,, +MN+ -# =ONMMM8~ , ,:=IDMMMMMMMND8$+:, , ,INMNZ :MMM~, , +MMD , ,, ,,::,,,, ,,:::, ?M8: , -#8MMMNZ~ , =I$ONMMNDZ7?+, ,,=I8NMD7: DMMN DMN= ,:::, ,:~~=~~:,, :ZMMZI+: ,, ,,,,,,, -#MN7:,:=7ONMMMD$?=, , ~7ODNMM$+: ~MMM++7ZOZOO8O8D8$~ ,MM8 ,,,,::~~==~~:,, :+7DMMMMNNDD88OOZZZO88DNNNN8=, ,,,,,,, -#I,~+DMMNND7: , ,$MMMMMN7, $MMN :??+=~::,,,, NMD, ,:,,:::~=~~,,,, ,,=I$8DNMMMMMMMMMNMMMNZ: ,,,,,, -#DNMNN7:, ,+ONMMMMNI: NMM$ DMN= ,,,,:::::~~::,, ?DMMMMDZ=, ,,, -#N8$: ,,, ,:=?ONMMMD8Z+, ,,,, MMM= ZMMI ,=?$8NMMMMMMMMMMMN87=~~,,, :=ZMMMMD$~ -# , ,=ZNMMMMMNI:, ,~?Z88888$=, ,:~+??~, MMM, IMM$ ,=ZNMMMMMN8$+~=~=~~===7ODNMMMN8DNMMMN+, ,, -# , ~?$ODMMMMNZ?: :II+~, ,=7= :?77?=:====?O+ ,,:,,, MMM, ?MM$ ,,,, :?ONMM8II=, , =DMMMM87=, ,,,, -# , ,, ,~I8MMMMMMN87?~:=+?7$ZOO88DD888O$I+~:, ~ZZ: ,$7,~??, ,?+ ,+Z8$?==??= MMM =MM$ :?ODNNNNNNNMMO: ,:?NNMNO= ,,,IMMMNZ, ,,,:,,,,,,,,,,,,, -#, ,~7DNMMMMMMMMMMMNNNNMMMNND8Z7II7$$$$ZODNNNMND$, :O$: , ,IN$, I+ +ZZ=,, 7+ MMM =ODDDDNNNNNN8= :MM$ ?DDNN8?::,, ,,7NMM8, 7NMNZ~, :OMMM$, , ,,:::~~:,,,,,,,,,,,,,,,,, -#?8NMMMMMMMMMMMN8I:,,, ~$NMMM$ :87 +DM$ ID+~78I, O7 :=+~ MMM , , ?MM7 $NMN$, :I8MMMMMMNMDNNNNNNNNNDD88ZI=: ,ZMM7, +MMN~ ,,,:::,, ,,:,::::,,,,,, -#MMMD8DNMDZ7=: ,:=+7ZNNNDOZ~ =DI , :7MM7, IDDZI, =DN88ZI77$N? NMM: $MM= ~: =OMNZ+ ,=7DMMMMMMNDDOOOOZ$7IIIII77$ZOO8NNMMD$+~ :OND~, ,, MMN= ,,,,,,,,,,,,, -#: ,,, , ,:+ZNMNNMMNO?: +8? +NMN7 , , ON~ 8MM$ NMD, ,7MMMN, 7MMO, ,:ONMMMMN8I: ,~ZNMNN$, ~MM? , ZMNND?~: ,,, ,, ,,,,,, -# , , ,:~?7$ZZ8NNMMMNO7I=, ~D+ ZNO=, , :NM$: =MMM NMO ~NMNMMM :ZDN$, ,,=7DMD$?=, :?ZMD$: :DMNOZZZ$: ,,,,, ~IDMMMMMMMMMMMMMMMMMMMMM8~ -# , =DMNNNNNNNN8$= : , ,,~?ODNZ: :DM? =: ,MMM7 +MN~ ,MM8:NMN INMZ, ,=ONOI , +NMZ ,,+ZDMMMMMMMN+, ,,,,,,, ,~?$ODNNNNNNNMMMMMMMMD= , -# ,:::::~7$I?=: ,~78NMNMMMMM? :+Z+ M8 $MM, =, DMMD NMN DMO OMM77NMI, ?8$~ , ~ZDDDNNNNNMMMMMMMMNMNNNNNDD8Z+:, IMN: I8DNMMMMN7~: ,,,,,, +$O$, ~$DMMMMMMMD~ ,, , -# :I+ :ID? ~ZDNNNNMMNO?~,, ,:::::=7ONMNNNDMMMN$ ,=IONNMMMMZ:, ~MO ,7MMMI $MN, , :MMN ?8NDDNND$~ +MM~ ,MM~=MMMNMD, =ONNDDNMMMMMMMMMMMMNNND88DNNNNNMMMMMMMNDO7+~::,:,7MM+ =DMMMNNO? ~ZMMMMM7::~INMMMMMMMMMMMMMN8: -#:MMM+,INMMM: ,:~+78NMMMM8?==++++++???++??I$Z77$$$$$$$7II??I$ZZO8MMMMM8Z7~ ,IZI ~77+ ?MMMMD88MN8+~, +8MO$OMMMMMNMMMZ, ~=: OMM? ,MM? ZMM~MMMM8~ ,:+7$$$$ZZ7?==: :8MDOZ$ZZZODMMMMMM8+, ,:=?$ZZOOOOOOZ$: , , ,=8MNMMMMMMMMMMNDZ$7$MMMMMMMMMMMMMMMMMMNI+?I7ZDNMMMMMMMMMM$, -#=MMMZ$MMMMMM~ ,::::~==~~+I$8NMMMMMN$::::::::,,, ,,=ONDDO$7II?+~, , ,,$DD87: =NND= , ,+$$=:~, ,:, ,MMD NMI ,~?Z8DND88$?: 8MM$MMMZ:=~:,~~:::,,,=$DNMND$MM8, ~DMNMMMN?, :7$7?+==~=:,,,,,,,, ,,,,,,, ,,,,,,::,,,,,:::::::,,,::::::,, ,OM$,7MMMMMMNZ= , ~8MMMMMMMNDO7I7MMMMMMMMMMMMN8Z7+?NMMMD,, -#NMMMMMM? ,++, ~=I??= :7$ 7MM+ 7M7 ,:?ZZ$MMM8NMZ~+, :?INMMM? +$NMMMNZ: :+?I7$$Z$O88D88DDDDNNNNNNNNNNNNNNNNNNDDNDDDNNNNNMMNMMMMMMMNNNN$:??~ ,+II?=, , ,, ,?I??+=~, :MMMMMNM -#MMMMMD~ ,:=++++++++++=~,,, +MMN: ,MMD, :M$ ,INMNMMMMMMMMMM~~~?D, :OMM8: ,+$8Z$+,,$MMMMMD, :IODDDD -#N$~,,, ,~I$8NMMMMMMMMMNMMMMMMMMMMMMMMMMNNNN7: ZMMMM? NMMMNZ~:, +$: ,NMNI:::~?8MMM7I? IO ~I$ODNNNDND8OO$I?DMMMD$I8NMMMMMMMNMNMMMMMMM8=, , -# ,:~?ONMMMMNNNDDD8NMD+, ,~?Z8NMNM8=, , $MNNNMM? :DMMMNDD8DZ7$+, ,8NMMNMNMMMD$: =8, ,INMMMNZI?====+I$$8DDNMMMMMMNNNMND7, :~$DMMMNMO -# ,?DMMMMMNNNMMM+::~++ZMMMM8ZZZZZZ$II77II7???=~:, ,+DMN7 =NMM8, ,NMMNMNMN$?I7I77OZ~ ~8D$~, ,MMMMMMNM8: :DMMM, , DMO=ZMZ, -# ,?OMMN87=, IZNNNMMMMMMMMMMND7IIII??III$8DDNN8Z+, , ~MM ?OMMMM~ ,MMMMMZ+ ~I= IMMMO$$= ?NNM? MN~ +NM: -# IMM8~ =MD, ?N= :,, ,+77?=+, $8MM7::OMMMMMMZ+ I+, ?NI , :MMM8$ NNI, 7M$ -# ,MMN $MD 7M+ ~ODNZ~, :7MMN? , $MMMN= 7MNMN: +8+ ,D8~ =MMDD8Z= =NMD OM7, -# OMMD~ IMN: =8O:,~+$OO? :IZ+: :ZMMNM8= =DMMI ~8MM8= , ,, :8M?,$MM, ::, DM, =Z~ ,I8DDDN8$DMN~ MN ,=Z8DNDZZ= MD: , -# ,,,,,,,, ?8NNMM8$I????++=+8MMMD$77$ZDMMNMMMDNOZ~ :IZ8DOI~, =MMMNMNM8I:, ,7M8I +MND7 ~MNDDM8~ 7MI MM7 :8MDDM7 8N= ~ID$, ,:Z$?:,,,=ONNMNMD= ?M$ :: +MI ,,,,,,::~~:::,,,,,,, ,+OMO:, , -# ,,,::~===~~:, ,~+I$88DNNNNNNNNNDNDD8O$?~:, ,7DNMNDMMMN~,ZMND$8NNDZ=, $M? ?NNN7 =MO OMD$MM= MMMINMO $M7 DN: 8MMD: ?D~ :O8ZDMMMMD=, ,ZN$ ,INNMD= :NO ,:+8MMMMMMMMDI:,::,, -#=~~~::~~::, ~?78MMMDNMM? :+I$DD87?=, OMNDO$7$OI: 7N+ ?MNMN~ MMMNM$ 7M$ 7M$ +NMMO, ?N~ ~8+ ~7NMMNNOI= ~?ONZ+ ~ZNND,$M8 ~MO $MMMMMMMMMM7:::~~~~~~=+++===~~~:::,, -# ,~ONMMNO~, :?8NNMN8?~ZNMMMMMMMMMNMMNDOI=~NM? =MMM+ NMMMO 8M$ ,MM, $MMMN =D, ,?N~ , ?NMMMNNNDDO8DDD887, ,$ND= :O$NMZ IMO ,IMMMMN$, 7NO, ,,,,,:,,, -# ,~IDMNN8OI, :~+$DMMMMMMMN87+=~~~~?ZDNNMM, ?NM? :NMN :MM7 ~MN ~DMMMN O~ :DD, ,MMMMNNOI:,, , ,=ZODD8D? :MMM: NMZ +8NM8, ,, :~ ,,~:, -# ,INNMMNNO+ :=78NNZIIONMNNDMMN ,7D~ :MN~ MNMM?=MM $? OD, ,IMD ,:8MNMNNNNNNNNMMNM7: :8~ ZND OMN7, :NNMMMMN :8MMMMMNZ=: -# :O88NMMN$=~:, ?MMN88NMMMD =MN? =, :NM +7$NO, =N8 , OMI, ,, 7M: ZDDND? MMO ,, +MM8?OMN: ,:: -# :$NMMMNMM8I: ,8MD, +MMN= ,=?77=:ZMMM7 ?MN ~$+ 7M7 ,DZ :M~ IMM8: ,8MN, , ,,,, :MMD,=DMM+,?O8= , , -# ,=I77$ONNNZ?+ONMZ =$D ,+=::+$8NNMNDNMD7?, ?MN :~~: ~D7 ,NI ,M7 ~?ZNZ?, 8MM~ =ZMMN~$MMMMMMMNMMD$ONMMMMD: -# ~DMM= ~I7=, =DMMMD7MMM, :+?=?O$: ~N: ~MN88D$: NMM=, ~MMMMMMMMMMMMOZMMMMMM7::8MMI -# , :DNNI :~IDMMND :::=?II?==~::ZN7=+I$ZZZ8DZ+~~: IMMM~ ~MMMNMMMMMI~:7NMMMMD7,: +NM8, -# :NMM7, ~MM+ ,,,:~==~~~: , OMMN: ?NNMMMMM8, ~NMNO, =MMMN?,, -# ,,,,,,,, ,$8MMO= ~M8 +MNO: ,MM~ , :: ,~, ,,:::,,,, -# ,::,,,, ~ONND+ OD+ ?NMD, ?? ,::::::,, -# :=+??=:,,,,, ~?$D87I: ~Z? =$DM$: ,::,,,,,,,, -# ,:~==~, :+=, $NNNNZ~ ,,:~~: , :DMMMI -# ,~~~~~:,,, ,,, ,~IMMMN8O+, ,:~?7$Z7~, :ZDMNNDNM8ZI -# ,,~~:,, , :$DNNMMN8?, , ,~7ZOO?: ,:$NMMMM? :7NMMD?, -# ,:~~:,,, $NMMMMMNMNO?~ ~?ODDD$=, :?8MMMMMD?~7NMMMD$~ :ONMMN? -# ,,,,,,,,, ?$DMMMMMMMMMMD$?=~, ,~7ZZODN87????I$ODNMDOZ$7I: ~$ZDMD7==~ =$ZDND$++~, -# ~?++=~~, ~+?II7ZNMMMMMM8$$$?~, ,~?II7Z8DDOZ$77II+: ~?IZDM8O$I~, :??$8MMNZZ7+, -# , , ,+D7 :=IZ8NMMMNNMNNO$= ,:?ZDNNMMMNMND8Z7=, =ZNMMMMMD?,, :ONMNMMMN?, -# ,MMMM7 ,,::~IONNMMNNDDD8OZI=, ,::::=+I$ONMNNNNDDDNNMMMMMMMMMD87: ,:~=ZNNNDOI, ~7$ZO8DDNNNNDD8O+:, -#~, +MMMMDI?~ ,~+Z8DNDNNMMMMDD$+:,,, , ,~?7O8DNNNNNNMMMMMNOOZI??++?IZ88$: ,,~ZDMMMMMMMMMMMMMMMMMMNNNNNMMMNND8$=,, -# ,~: NMMMMOZMM8: ,7DZ~ ,,~IONMMMMMMMMMNDZ+~: , ,=I8DNMMMMMNMNMMMMMDNMMMMNMMMNNNDDDDD8O8Z$7II+++IZDDMMNMN$,, -# :~: MMMMMNMMM+ :, +?DOI~ ,:~?7$$ZODDNMNN8Z7??+=~:, ,~=+?7I?=:,,,,:=?I7$$$$$$$ZZZOO8DDNNMMNNNNNMMMMNMZ+ -# ~=, MMMMMMMM8 ,NNNO , ,~?ZDNMNNNNNNND8O7?:,,, ,~=7DMMMMMMMMMZ~ -# ,:+~ ,MMMMMMMMO ~N8: ,,:~, ,:~:,,, ,:~, ,:~~=+++?7$O8DNNNNNMMN8$=, ~+$ZO8DNNMMNNNND8OOOZ$+:, :=+I8MMMMDOI -# ,=+ MMMMMMMMMDMZ, ~?: ,,,,, ,:~~~, ,NN?$NMMMMMMMMMMMN87~:,,,=+=:,,,,,,:?NMMNMMMMMMNNDNDI8MMM+ , -# +=, $MMMN?$NMN, ,+?+:, ,,:~~:, :=~, 8M= , :7ONNNO?~, , :$NNMMMMMMMMMMMMMN, -# ,+? ,NNO,, , :ODNNNZ: :?777I= IMO ,,=$Z7+~ ,?NNMMNZ: :7NMMMMMNMMMM8 , ,IDND~ -# ,=~, 7I :+I+~::,, ~?I~ , ,~=~==: ZMZ ?8I: :I$DND$?~ :?$8MMN$?: ?ZDMMMMMMMM7 +$NMMMMMI -# ~~ , ,=+=: ,== ::, , ZMM: ?NN8: ,7NMMNI, ,$NMMMO: =NMMMNMM+ I8NMMMM, -# ~= ,~:,,~~, ,:, ,,,, , ,, +MN8 ,~OMM8Z, :+ZMNDOI =IMMN8= ,=NNMO ?NMMMD -# ::, ,,,:: ,:, ,, =MMO~ , ?NMMMI, ,INMMMDI, ~DMMN+, NMO ?DNO~ -# ,~, ,,,::, ,, , ,, ,INMND+: :MMM+ ~ZDDMMMD?~:$MMMMMMMMMMM? -# ,, :~~:, ,:, ,, :ZMMMMNNMMMN, ,=ONMMMMNM$,,,,,, ,, -#Z, :, ,, ,~=~, ,, ,=?7$I= , :~~~, ,,,, -#MD= :: ,,,, , ,, ,,,, -#MNMI :: ,,, ,,,, -#7MMM? , ,~: :~:, ,:, -# OMMMO: ~NMD, :=: :+=: ,,, -# ~MMMN8: NMMM~ +?: ,:=++~ ,:,, -# $MMMMMNI,?MMMMZ ,DM7 ,=I= ~+~~:, ,:,,, -# +MMO~OMMMMMMMMD MMM7 , , =$~ ?OZ+ ,:, -# ~NMN: :+DNN7MMMNMMMDODMMN+ :+?, +77+~: :::, -# IMM+ ,NMMMMMMMMN?NM7 ~7+, , :+$I, , +8? :+~ -# =NMD, NMMMMMM8~ ?NN, =ND? ,=??: ,8NMMMM= :++ -# +DO, ~ZZ$ODI :8M~ :=I8NMNMMD+, ,:~~=: , +8MO$DMD+ ~===~: :~~: -# :$~ 7NMD$, ZMM8I? ,~=: , ZNN7,:MM8~,OMNMD8DMM= ++: -# =ZNOI: ?DND= ,?I~ ,,,: ,$ND+ ?NMNNNNN7+, 7MN, ,=+~ -# :ZI: :, ,=7: ::,, $NN= 7NMMMM7: :DMMMN8NMMDDMN7 ,::, -# ~?= ,,, ,ZM= DMMD8 ?MMMMNN7, ,I+ :~~, -# =+, ,,, ,, =Z8: +$~ -# :~ ,,~~, -# :, ::, -# ::, ,:::, -# ,, ,~ -# ,, ,,, -# -# \ No newline at end of file diff -r 28cc5b2a0f6d -r ed2ca82d332d aggregate.xml --- a/aggregate.xml Sat Aug 13 00:10:07 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,102 +0,0 @@ - - fitness calculations by gene - - biopython - - - aggregate.py - #if $mark.certain == "yes": - -m $mark.genes - #end if - #if $weighted.algorithms == "yes": - -w 1 - #end if - -x $cutoff - -l $weightceiling - #if $blank.count == "yes": - -b $blank.custom_blanks - #end if - #if $blank.count == "no": - -f $blank.txt_blanks - #end if - -c $ref - -o $output - $input - #for $a in $additionalcsv - ${a.input2} - #end for - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool calculates the aggregate fitness values of mutations by gene. - -**The options explained** - -The csv fitness file(s): These are the csv (comma separated values) files containing the fitness values you want to aggregate by gene. Since they should have been produced by the "Calculate Fitness" tool, each line besides the header should represent the following information for an insertion location: position,strand,count_1,count_2,ratio,mt_freq_t1,mt_freq_t2,pop_freq_t1,pop_freq_t2,gene,D,W,nW - -GenBank reference genome: the reference genome of whatever model you're working with, which needs to be in standard genbank format. For more on that format see the genbank website. - -Marking certain genes: If you chose to mark certain genes, those genes will have an "M" under the M column of the resulting aggregate file. - -Using weighted algorithms: Recommended. If you chose to use weighted algorithms, scores will be weighted by the number of reads their insertion location has, as insertions with more reads tend to be more accurate. - -Weight ceiling: This value lets you set a weight ceiling for the weights of fitness values. It's only relevant if you're using weighted algorithms. - -Cutoff: This value lets you ignore the fitness scores of any insertion locations with an average count (the number of counts from t1 and t2 divided by 2) less than it. - -Blanks: This value lets you exclude a % of blank fitness scores (scores with a fitness of 0) from your calculations. It should be entered as a float (e.g. 0.10 would be 10%) if entered by hand, or you can use the blank % calculated from the normalization genes by calc_fit by entering its txt output file - -The name of your output file: self-explanatory. Remember to have it end in ".csv". - -**Additional notes** - -The output file should have each line (besides the header) represent the following information for a particular gene: locus,mean,var,sd,se,gene,Total,Blank,Not Blank,Blank Removed,M - - - \ No newline at end of file