Previous changeset 9:0ba0852ed5ad (2017-03-28) Next changeset 11:2655e8056329 (2017-03-28) |
Commit message:
Uploaded |
added:
regionFitness.pl |
b |
diff -r 0ba0852ed5ad -r abaafe9d801d regionFitness.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/regionFitness.pl Tue Mar 28 14:04:10 2017 -0400 |
[ |
b'@@ -0,0 +1,802 @@\n+#!/usr/bin/perl -w\n+\n+#Margaret Antonio 17.01.06\n+\n+# Sliding Window specifically for Galaxy implementation.\n+\n+use strict;\n+use Getopt::Long;\n+use warnings;\n+use Bio::SeqIO;\n+use Data::Random qw(:all);\n+use List::BinarySearch qw( :all );\n+use autodie;\n+no warnings;\n+\n+#AVAILABLE OPTIONS. WILL PRINT UPON ERROR\n+sub print_usage() {\n+\n+ print "\\n####################################################################\\n";\n+ print "USAGE:\\n";\n+ print "slidingWindow.pl -f <fasta file> -r <genbank file> <inputs>\\n";\n+ \n+ print "\\nREQUIRED:\\n";\n+ print " -d\\tDirectory containing all input files (output files from\\n";\n+ print " \\tcalcFitness tool)\\n";\n+ print " \\tOR\\n";\n+ print " \\tIn the command line (without a flag), input filename(s)\\n";\n+ print " -f\\tFilename for genome sequence, in fasta format\\n";\n+ print " -r\\tFilename for genome annotation, in GenBank format\\n";\n+ \n+ print "\\nOPTIONAL:\\n";\n+ print " -h\\tPrint usage\\n";\n+ print " -size\\tThe size of the sliding window. Default=500\\n";\n+ print " -step\\tThe window spacing. Default=10\\n";\n+ print " -c\\tExclude values with avg. counts less than x where (c1+c2)/2<x\\n";\n+ print " \\tDefault=15\\n";\n+ print " -log\\tSend all output to a log file instead of the terminal\\n";\n+ print " -max\\tExpected max number of TA sites in a window.\\n";\n+ print " \\tUsed for creating null distribution library. Default=100\\n";\n+ print " -w\\tDo weighted average for fitness per insertion\\n";\n+ print " -wc\\tInteger value for weight ceiling. Default=50\\n";\n+ \n+ print " \\n~~~~Always check that file paths are correctly specified~~~~\\n";\n+ print "\\n##################################################################\\n";\n+ \n+\n+}\n+\n+#ASSIGN INPUTS TO VARIABLES\n+our ($cutoff,$step, $size, $help,$fasta, $log, $ref,$weight_ceiling,$weight,$custom,$run,$max,$f1,$f2,$f3,$f4,$f5,$f6);\n+\n+GetOptions(\n+\'c:i\'=>\\$cutoff,\n+\'step:i\' => \\$step,\n+\'size:i\' => \\$size,\n+\'f:s\' => \\$fasta,\n+\'r:s\' => \\$ref,\n+\'log\' => \\$log,\n+\'h\' => \\$help,\n+\'m:i\'=>\\$max,\n+\'wc:i\'=>\\$weight_ceiling,\n+\'w\'=>\\$weight,\n+\'u:s\'=>\\$custom,\n+\'n:s\'=>\\$run,\n+\'f1:s\'=>\\$f1,\n+\'f2:s\'=>\\$f2,\n+\'f3:s\'=>\\$f3,\n+\'f4:s\'=>\\$f4,\n+\'f5:s\'=>\\$f5,\n+\'f6:s\'=>\\$f6,\n+\n+);\n+\n+sub get_time() {\n+ my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time);\n+ return "$hour:$min:$sec";\n+}\n+\n+sub uniq {\n+ my %seen;\n+ return grep { ! $seen{$_}++ } @_;\n+}\n+\n+# Just to test out the script opening\n+if ($help){\n+\tprint_usage();\n+\tprint "\\n";\n+ exit;\n+}\n+my $round=\'%.4f\';\n+\n+# every output file will have this prefix\n+#if (!$run){$run = "run1";}\n+$run = $run . "_";\n+\n+if (!$weight_ceiling){$weight_ceiling=50;}\n+if (!$cutoff){$cutoff=15;}\n+\n+\n+#open (STDOUT, ">>", $run . "log.txt");\n+\n+#CHECKING PARAMETERS: Check to make sure required option inputs are there and if not then assign default\n+if (!$size) { $size = 500 }; #set the default sliding window size to 500\n+if (!$step) { $step = 10 }; #set the default step size to 10\n+if (!$cutoff) { $cutoff = 10 };\n+if (!$max) { $max = 100 }; # most insertions expected in a given window region (for making null dist lib)\n+\n+print "Window size: $size\\n";\n+print "Step value: $step\\n";\n+print "Cutoff: $cutoff\\n";\n+\n+\n+# Returns mean, variance, sd, se\n+sub average {\n+ my $scoreref = shift @_;\n+ my @scores = @{$scoreref};\n+ \n+ my $sum=0;\n+ my $num=0;\n+ \n+ # Get the average.\n+ foreach my $w (@scores) {\n+ $sum += $w;\n+ $num++;\n+ }\n+ my $average= $sum/$num;\n+ my $xminusxbars = 0;\n+ \n+ # Get the variance.\n+ foreach my $w (@scores) {\n+ $xminusxbars += ($w-$average)**2;\n+ }\n+ my $variance;\n+ if ($num<=1){\n+ $variance=0.10;\n+ }\n+ else{\n+ $variance = sprintf($round,(1/($num-1)) * $xminusxbars);\n+ }\n+ my $sd = sprintf($round,sqrt($variance));\n+ my $se = sprintf($round,$sd / sqrt($num));\n+ \n+ return ($average, $variance, $s'..b' $meanFit=mean(@allFits);\n+print "Average fitness for all windows: ",$meanFit,"\\n";\n+my @expWindows=();\n+foreach (@newWindows){\n+ my @entry=@{$_};\n+ my $mean=$entry[7];\n+ my $absdev=sprintf("$round",$mean-$meanFit);\n+ push (@entry,$absdev);\n+ push @expWindows,\\@entry;\n+}\n+\n+@newWindows=@expWindows;\n+\n+#MAKE OUTPUT CSV FILE WITH ESSENTIAL WINDOW CALCULATIONS\n+\n+print "Start csv ouput file creation: ",get_time(),"\\n";\n+\n+open CSV, \'>\', $f3; # $run . "slidingWindows.csv" or die "Cannot open sliding window output file";\n+\t\n+my @csvHeader= ("start", "end","mutants","insertions","TA_sites","ratio","p-value","average", "variance","stdev","stderr","genes","fit-mean");\n+my $header=join(",",@csvHeader);\n+print CSV $header,"\\n";\n+foreach my $winLine(@newWindows){\n+ my $string=join(",",@{$winLine});\n+ print CSV $string, "\\n";\n+}\n+close CSV;\n+print "End csv ouput file creation: ",get_time(),"\\n\\n";\n+\n+my $in = Bio::SeqIO->new(-file=>$ref, -format => \'genbank\');\n+my $refseq = $in->next_seq;\n+my $refname = $refseq->id;\n+\n+#MAKE essentials WIG FILE---->later make BW--->IGV\n+sub printwig{\n+ print "Start wig file creation: ",get_time(),"\\n";\n+ my $in = Bio::SeqIO->new(-file=>$ref, -format => \'genbank\');\n+ my $refseq = $in->next_seq;\n+ my $refname = $refseq->id;\n+ my $ewig="essentialWindows.wig";\n+ my $FILE5 = $run . $ewig;\n+ open eWIG, ">", $f5; #$FILE5";\n+ printf eWIG "track type=wiggle_0 name=$ewig\\n";\n+ printf eWIG "variableStep chrom=$refname\\n";\n+ foreach my $wigLine(@newWindows){\n+ my @wigFields=@$wigLine;\n+ my $position=$wigFields[0];\n+ while ($position<=$wigFields[1]){\n+ printf eWIG $position, " ",$wigFields[7],"\\n"; #7 for pvalue, but 2 for fitness!!!!!!\n+ $position=$position+1;\n+ }\n+ }\n+ close eWIG;\n+ print "End wig file creation: ",get_time(),"\\n\\n";\n+ print "If this wig file needs to be converted to a Big Wig, then use USCS program wigToBigWig in terminal: \\n \\t./wigToBigWig gview/12G.wig organism.txt BigWig/output.bw \\n\\n";\n+}\n+\n+\n+#OUTPUT REGULAR SLIDING INFORMATION WITHOUT ESSENTIALS CALCULATIONS (P-VALUES)\n+\n+\n+#MAKE OUTPUT CSV FILE WITH FITNESS WINDOW CALCULATIONS\n+ print "Start csv ouput file creation: ",get_time(),"\\n";\n+ open FITFILE, ">", $f4; #$run . "fitWindows.csv" or die "Failed to open file";\n+ my @head = ("start", "end","fitness","mutant_count", "\\n");\n+ print FITFILE join(",",@head); #header\n+ foreach my $winLine(@allWindows){\n+ print FITFILE join(",", @$winLine),"\\n";\n+ }\n+ close FITFILE;\n+ print "End csv ouput file creation: ",get_time(),"\\n\\n";\n+\n+#MAKE WIG FILE---->later make BW--->IGV\n+ print "Start wig file creation: ",get_time(),"\\n";\n+ my $fin = Bio::SeqIO->new(-file=>$ref, -format => \'genbank\');\n+ $refseq = $fin->next_seq;\n+ $refname = $refseq->id;\n+ my $fwig="fitWindows.wig";\n+ open fWIG, ">", $f5; #$run . $fwig;\n+ print fWIG "track type=wiggle_0 name=$fwig\\n";\n+ print fWIG "variableStep chrom=$refname\\n";\n+ foreach my $wigLine(@allWindows){\n+ my @wigFields=@$wigLine;\n+ my $pos=$wigFields[0];\n+ my $fit=$wigFields[7];\n+ print fWIG $pos," ",$fit,"\\n";\n+ }\n+ close fWIG;\n+ print "End wig file creation: ",get_time(),"\\n\\n";\n+ print "If this wig file needs to be converted to a Big Wig,\\n";\n+ print " then use USCS program wigToBigWig in terminal: \\n";\n+ print "\\t./wigToBigWig gview/12G.wig organism.txt BigWig/output.bw \\n\\n";\n+\n+\n+#GOING TO MAKE A TEXT FILE FOR BED CONVERSION TO BIGBED, NEED CHROM # IN COLUMN 0\n+\n+my @cummulative;\n+\n+#MAKING A REGULAR TEXT FILE fields: [chrom,start,end,fitness,count]\n+\n+print "Start text file creation time: ",get_time(),"\\n";\n+open my $TXT, \'>\', $f6; #$run . "fitWindows.txt" or die $!;\n+print $TXT ("start","end","W","mutant_count","insertion_count\\n");\n+print $TXT (join("\\t",@$_),"\\n") for @allWindows;\n+close $TXT;\n+print "End text file creation: ",get_time(),"\\n\\n";\n+\n+\n' |