Mercurial > repos > antmarge > regionfitness
changeset 2:e8749de582fa draft
Uploaded
author | antmarge |
---|---|
date | Tue, 28 Mar 2017 10:51:53 -0400 |
parents | 7b3e44bd0ab0 |
children | 246f0ceb77bc |
files | regionFitness.pl |
diffstat | 1 files changed, 802 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/regionFitness.pl Tue Mar 28 10:51:53 2017 -0400 @@ -0,0 +1,802 @@ +#!/usr/bin/perl -w + +#Margaret Antonio 17.01.06 + +# Sliding Window specifically for Galaxy implementation. + +use strict; +use Getopt::Long; +use warnings; +use Bio::SeqIO; +use Data::Random qw(:all); +use List::BinarySearch qw( :all ); +use autodie; +no warnings; + +#AVAILABLE OPTIONS. WILL PRINT UPON ERROR +sub print_usage() { + + print "\n####################################################################\n"; + print "USAGE:\n"; + print "slidingWindow.pl -f <fasta file> -r <genbank file> <inputs>\n"; + + print "\nREQUIRED:\n"; + print " -d\tDirectory containing all input files (output files from\n"; + print " \tcalcFitness tool)\n"; + print " \tOR\n"; + print " \tIn the command line (without a flag), input filename(s)\n"; + print " -f\tFilename for genome sequence, in fasta format\n"; + print " -r\tFilename for genome annotation, in GenBank format\n"; + + print "\nOPTIONAL:\n"; + print " -h\tPrint usage\n"; + print " -size\tThe size of the sliding window. Default=500\n"; + print " -step\tThe window spacing. Default=10\n"; + print " -c\tExclude values with avg. counts less than x where (c1+c2)/2<x\n"; + print " \tDefault=15\n"; + print " -log\tSend all output to a log file instead of the terminal\n"; + print " -max\tExpected max number of TA sites in a window.\n"; + print " \tUsed for creating null distribution library. Default=100\n"; + print " -w\tDo weighted average for fitness per insertion\n"; + print " -wc\tInteger value for weight ceiling. Default=50\n"; + + print " \n~~~~Always check that file paths are correctly specified~~~~\n"; + print "\n##################################################################\n"; + + +} + +#ASSIGN INPUTS TO VARIABLES +our ($cutoff,$step, $size, $help,$fasta, $log, $ref,$weight_ceiling,$weight,$custom,$run,$max,$f1,$f2,$f3,$f4,$f5,$f6); + +GetOptions( +'c:i'=>\$cutoff, +'step:i' => \$step, +'size:i' => \$size, +'f:s' => \$fasta, +'r:s' => \$ref, +'log' => \$log, +'h' => \$help, +'m:i'=>\$max, +'wc:i'=>\$weight_ceiling, +'w'=>\$weight, +'u:s'=>\$custom, +'n:s'=>\$run, +'f1:s'=>\$f1, +'f2:s'=>\$f2, +'f3:s'=>\$f3, +'f4:s'=>\$f4, +'f5:s'=>\$f5, +'f6:s'=>\$f6, + +); + +sub get_time() { + my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time); + return "$hour:$min:$sec"; +} + +sub uniq { + my %seen; + return grep { ! $seen{$_}++ } @_; +} + +# Just to test out the script opening +if ($help){ + print_usage(); + print "\n"; + exit; +} +my $round='%.4f'; + +# every output file will have this prefix +#if (!$run){$run = "run1";} +$run = $run . "_"; + +if (!$weight_ceiling){$weight_ceiling=50;} +if (!$cutoff){$cutoff=15;} + + +#open (STDOUT, ">>", $run . "log.txt"); + +#CHECKING PARAMETERS: Check to make sure required option inputs are there and if not then assign default +if (!$size) { $size = 500 }; #set the default sliding window size to 500 +if (!$step) { $step = 10 }; #set the default step size to 10 +if (!$cutoff) { $cutoff = 10 }; +if (!$max) { $max = 100 }; # most insertions expected in a given window region (for making null dist lib) + +print "Window size: $size\n"; +print "Step value: $step\n"; +print "Cutoff: $cutoff\n"; + + +# Returns mean, variance, sd, se +sub average { + my $scoreref = shift @_; + my @scores = @{$scoreref}; + + my $sum=0; + my $num=0; + + # Get the average. + foreach my $w (@scores) { + $sum += $w; + $num++; + } + my $average= $sum/$num; + my $xminusxbars = 0; + + # Get the variance. + foreach my $w (@scores) { + $xminusxbars += ($w-$average)**2; + } + my $variance; + if ($num<=1){ + $variance=0.10; + } + else{ + $variance = sprintf($round,(1/($num-1)) * $xminusxbars); + } + my $sd = sprintf($round,sqrt($variance)); + my $se = sprintf($round,$sd / sqrt($num)); + + return ($average, $variance, $sd, $se); + +} + +# Function to clean unwanted characters from lines + +sub cleaner{ + my $line=$_[0]; + chomp($line); + $line =~ s/\x0d{0,1}\x0a{0,1}\Z//s; + return $line; +} + +# Takes two parameters, both hashrefs to lists. +# 1) hashref to list of scores +# 2) hashref to list of weights, equal in length to the scores. +sub weighted_average { + + my $scoreref = shift @_; + my $weightref = shift @_; + my @scores = @{$scoreref}; + my @weights = @{$weightref}; + + my $sum=0; + my ($weighted_average, $weighted_variance)=(0,0); + my $v2; + + # Go through once to get total, calculate V2. + for (my $i=0; $i<@weights; $i++) { + $sum += $weights[$i]; + $v2 += $weights[$i]**2; + } + if ($sum == 0) { return 0; } # No scores given? + + my $scor = join (' ', @scores); + my $wght = join (' ', @weights); + + # Now calculated weighted average. + my ($top, $bottom) = (0,0); + for (my $i=0; $i<@weights; $i++) { + $top += $weights[$i] * $scores[$i]; + $bottom += $weights[$i]; + } + $weighted_average = sprintf($round,$top/$bottom); + + ($top, $bottom) = (0,0); + # Now calculate weighted sample variance. + for (my $i=0; $i<@weights; $i++) { + $top += ( $weights[$i] * ($scores[$i] - $weighted_average)**2); + $bottom += $weights[$i]; + } + $weighted_variance = sprintf($round,$top/$bottom); + + my $weighted_stdev = sprintf($round,sqrt($weighted_variance)); + my $weighted_stder = sprintf($round,$weighted_stdev / sqrt(@scores)); # / length scores. + + return ($weighted_average, $weighted_variance, $weighted_stdev, $weighted_stder); +} + +#CREATE AN ARRAY OF DATA FROM INPUT CSV FILE(S) + +my $rowCount=-1; +my $last=0; +my @unsorted; +my @insertPos; #array to hold all positions of insertions. Going to use this later to match up with TA sites +my %wind_summary; + +my @files = @ARGV; + +my $num=(scalar @files); + +my $datestring = localtime(); +print "Local date and time $datestring\n"; + +print "\n---------Importing files--------\n"; +print "\tStart input array ",get_time(),"\n"; +print "\tNumber of csv files: ", $num,"\n"; + + +#Go through each file from the commandline (ARGV array) and read each line as an array +#into select array if values satisfy the cutoff +for (my $i=0; $i<$num; $i++){ + print "File #",$i+1,"\t"; + my $file=$files[$i]; + print $file,"\n"; + + open(DATA, '<', $file) or die "Could not open '$file' Make sure input .csv files are entered in the command line\n"; + my $dummy=<DATA>; #read and store column names in dummy variable + while (my $entry = <DATA>) { + chomp $entry; + my @line=split(",",$entry); + my $locus = $line[9]; #gene id (SP_0000) + my $w = $line[12]; #nW + if (!$w){ $w=0 } # For blanks + my $c1 = $line[2]; + my $c2 = $line[3]; + my $avg = ($c1+$c2)/2; + #Average counts must be greater than cutoff (minimum allowed) + if ($avg > $cutoff) { + my @select=($line[0],$w,$avg,$line[9]); + my $select=\@select; + push(@unsorted,$select); + push(@insertPos,$line[0]); #keep track of actual insertion site position + $last=$select[0]; + $rowCount++; + } + if ($avg >= $weight_ceiling) { $avg = $weight_ceiling; } # Maximum weight + } + close DATA; +} + +my @sorted = sort { $a->[0] <=> $b->[0] } @unsorted; +@insertPos = sort { $a <=> $b } @insertPos; +# Get unique +@insertPos= uniq(@insertPos); + +print "\n\tFinished input array ",get_time(),"\n"; + +############################################################################################### + +print "\n---------Creating sliding windows across the genome--------\n\n"; + +my $index=-1; +my $marker=0; +my $totalInsert=0; +my $totalWindows=0; + + +#SUBROUTINE FOR EACH WINDOW CALCULATION +sub OneWindow{ + my $Wstart=shift @_; + my $Wend=shift@_; + my $Wcount=0; + my $insertion=0; + my $Wsum=0; + my $lastPos=0; + my $i; + my @allgenes=(); + + for ($i=$marker;$i<$rowCount;$i++){ + my @fields=@{$sorted[$i]}; + my $pos=$fields[0]; + my $w=$fields[1]; #fitness value for single insertion + my $avgCount=$fields[2]; + my $gene=$fields[3]; + + if ($pos<$Wstart){ #if deleted, error shows up + next; + } + if ($pos<=$Wend){ + if ($pos<($Wstart+$step)){ + $marker++; + } + my @empty; + if (!$wind_summary{$Wstart}) { + $wind_summary{$Wstart}{w} = [@empty]; + $wind_summary{$Wstart}{s} = [@empty]; + $wind_summary{$Wstart}{g} = [@empty]; + } + # Hash of Fitness scores. + $wind_summary{$Wstart}{w} = [@{$wind_summary{$Wstart}{w}}, $w]; + # Hash of counts used to generate those fitness scores. + $wind_summary{$Wstart}{s} = [@{$wind_summary{$Wstart}{s}}, $avgCount]; + + + if (!($gene =~ /^ *$/)){ + $gene=~ s/"//g; + $gene=~ s/ //g; + $gene=~ s/'//g; + push @allgenes,$gene; + } + $Wsum+=$w; + $Wcount++; + if ($pos!=$lastPos){ + $insertion+=1; + $lastPos=$pos; + } + } + #if ($fields[0]>$Wend) #finished with that window, then: + else{ + if ($Wcount>0){ + $totalWindows++; + $totalInsert+=$insertion; + + my ($average, $variance, $stdev, $stderr); + + if ($num <=1 ) { + ($average, $variance, $stdev, $stderr)=(0.10,0.10,"NA","NA"); + } + else{ + if (!$weight) { + ($average, $variance, $stdev, $stderr) = &average($wind_summary{$Wstart}{w}); + } + else { + ($average, $variance, $stdev, $stderr)= &weighted_average($wind_summary{$Wstart}{w},$wind_summary{$Wstart}{s}); + } + } + + @allgenes= uniq(@allgenes); + my @sortedGenes = sort { lc($a) cmp lc($b) } @allgenes; + my $wind_genes=join(" ",@sortedGenes); + my @window=($Wstart,$Wend,$Wcount,$insertion,$wind_genes,$average,$variance,$stdev,$stderr); + return (\@window); + } + + else{ + #Even if there were no insertions, still want window in file for consistent start/end + my @window=($Wstart,$Wend,$Wcount,$insertion," ","NA","NA","NA","NA"); + return (\@window); + } #Because count=0 (i.e. there were no insertion mutants in that window) + } + } +} + +sub customWindow{ + + my $Wstart=shift @_; + my $Wend=shift@_; + my $Wavg=0; + my $Wcount=0; + my $insertion=0; + my $Wsum=0; + my $lastPos=0; + my @allgenes=(); + my $i; + + + for ($i=$marker;$i<$rowCount;$i++){ + my @fields=@{$sorted[$i]}; + my $pos=$fields[0]; + my $w=$fields[1]; #fitness value for single insertion + my $avgCount=$fields[2]; + my $gene=$fields[3]; + + + + if ($pos<$Wstart){ #if deleted, error shows up + next; + } + if ($pos<=$Wend){ + my @empty; + if (!$wind_summary{$Wstart}) { + $wind_summary{$Wstart}{w} = [@empty]; + $wind_summary{$Wstart}{s} = [@empty]; + $wind_summary{$Wstart}{g} = [@empty]; + } + # Hash of fitness scores + $wind_summary{$Wstart}{w} = [@{$wind_summary{$Wstart}{w}}, $w]; + # Hash of counts used to generate those fitness scores. + $wind_summary{$Wstart}{s} = [@{$wind_summary{$Wstart}{s}}, $avgCount]; + + if (!($gene =~ /^ *$/)){ + $gene=~ s/"//g; + $gene=~ s/ //g; + $gene=~ s/'//g; + push @allgenes,$gene; + } + $Wsum+=$w; + $Wcount++; + if ($pos!=$lastPos){ + $insertion+=1; + $lastPos=$pos; + } + } + + #if ($fields[0]>$Wend){ #if finished with that window, then: + else{ + if ($Wcount>0){ + $totalWindows++; + $totalInsert+=$insertion; + + my ($average, $variance, $stdev, $stderr); + + if ($num <=1 ) { + ($average, $variance, $stdev, $stderr)=(0.10,0.10,"NA","NA"); + } + else{ + if (!$weight) { + ($average, $variance, $stdev, $stderr) = &average($wind_summary{$Wstart}{w}); + } + else { + ($average, $variance, $stdev, $stderr)= &weighted_average($wind_summary{$Wstart}{w},$wind_summary{$Wstart}{s}); + } + } + + @allgenes= uniq(@allgenes); + my @sortedGenes = sort { lc($a) cmp lc($b) } @allgenes; + my $wind_genes=join(" ",@sortedGenes); + print TEST $wind_genes,"\n"; + my @window=($Wstart,$Wend,$Wcount,$insertion,$wind_genes,$average,$variance,$stdev,$stderr); + return (\@window); + } + + else{ #Even if there were no insertions, still want window in file for consistent start/end + my @window=($Wstart,$Wend,$Wcount,$insertion," ","NA","NA","NA","NA"); + return (\@window); + } #Because count=0 (i.e. there were no insertion mutants in that window) + } + } +} + +print "Start calculation: ",get_time(),"\n"; +my $start=1; +my $end=$size+1; +my $windowNum=0; +my @allWindows; #will be a 2D array containing all window info to be written into output file + + + +#IF A CUSTOM LIST OF START/END COORDINATES IS GIVEN THEN CREATE WINDOWS USING IT +if ($custom){ + open (CUST, '<', $custom); + while(my $line=<CUST>){ + $line=cleaner($line); + my @cwind=split(",",$line); + my $start=$cwind[0]; + my $end=$cwind[1]; + my $window=customWindow($start,$end); + push @allWindows,$window; + } + close CUST; +} + +else{ #if not custom list + #WHILE LOOP TO CALL THE ONE WINDOW SUBROUTINE FOR CALCULATIONS===INCREMENTS START AND END VALUES OF THE WINDOW + while ($end<=$last-$size){ #100,000bp-->9,950 windows--> only 8500 windows in csv because 0 + my($window)=OneWindow($start,$end); + push (@allWindows,$window); + $start=$start+$step; + $end=$end+$step; + } + print "End calculation: ",get_time(),"\n"; +} + + +#my $avgInsert=$totalInsert/$totalWindows; +#print "Average number of insertions for $size base pair windows: $avgInsert\n"; + + +#ESSENTIALS: Counting the number of TA sites in the genome and whether an insertion occurred there or not + +print "\n---------Assessing essentiality of genome region in each window--------\n\n"; + +my @sites; + +#First read fasta file into a string +my $seqio = Bio::SeqIO->new(-file => $fasta, '-format' => 'Fasta'); +my $prev; +my $total=0; +while(my $seq = $seqio->next_seq) { + $fasta = $seq->seq; +} +#Get number of "TA" sites, regardless of insertion---this is just the fasta file +my $x="TA"; +my @c = $fasta =~ /$x/g; +my $countTA = @c; + +#At what positions in the genome do TA sites occur? +my $pos=0; +my $countInsert=0; +my @selector; #use this array to hold all 1 and 0 of whether insertion happened or not. + +#Go through all TA sites identified above and see if an insertion occurs there. +#Push results onto two arrays a 2d array with the position and a 1D array with just the 0 or 1 + +my @unmatched; #hold all unmatched ta sites +my @allTAsites; #2d array to hold all occurences of TA sites in genome +my $unmatchedCount=0; +my $offset=0; +my $genPos = index($fasta,'TA',$offset); + +while (($genPos != -1) and ($pos<scalar @insertPos)) { #as long as the TA site is foun + my $res=0; + if ($genPos>$insertPos[$pos]){ + push @unmatched,$insertPos[$pos]; + $unmatchedCount++; + $pos++; + } + if ($genPos==$insertPos[$pos]){ + $res=1; + $countInsert++; + $pos++; + } + my @sites=($genPos,$res); + push @selector,$res; + #push the 0 or 1 onto the array @selector---going to use this to draw random sets for the null distribution + push (@allTAsites,\@sites); + $offset = $genPos + 1; + $genPos = index($fasta, 'TA', $offset); + $countTA++; + } + + #my $FILE1 = "$run" . "allTAsites.txt"; + my $FILE1 = $f1; + open (ALL_TA, ">", $FILE1); + foreach my $sit(@allTAsites){ + foreach (@$sit){ + print ALL_TA $_, "\t"; + } + printf ALL_TA "\n"; + } + close ALL_TA; + + my $FILE2 = $f2; #$run . "unmatched.txt"; + unless(open UNM, ">", $FILE2){ + die "\nUnable to create $FILE2:\n$!"; + } + foreach (@unmatched){ + print UNM $_, "\n"; + } + close UNM; + + print "\n\nTotal of unmatched insertions: $unmatchedCount\n"; + print "See unmatched.txt for genome indices of unmatched sites\n"; + print "See allTAsites.txt for details on all TA sites and insertions\n\n"; + +#print "\nTotal: $countInsert insertions in $countTA TA sites.\n"; + +#-------------------------------------------------------------------------------------------------- + +#Now, have an array for each TA site and if an insertion occurred there. + #So per site @sites=(position, 0 or 1 for insertion). +#Next step, create null distribution of 10,000 random sets with same number of + #TA sites as the window and then calculate p-value + +#SUBROUTINE FOR MAKING THE NULL DISTRIBUTION SPECIFIC TO THE WINDOW + +#DISTRIBUTION'S STANDARD DEVIATION +my $N=10000; + +sub mean { + if (scalar @_ ==0){ + return 0; + } + my $sum; + grep { $sum += $_ } @_; + return $sum/@_; +} +sub stdev{ + my @data = @{shift @_}; + my $average=shift @_; + my $sqtotal = 0; + foreach(@data) { + $sqtotal += ($average-$_) ** 2; + } + my $std = ($sqtotal / ($N-1)) ** 0.5; + return $std; +} + +#MAKE LIBRARY OF NULL DISTRIBUTIONS: +print "Making a library of null distributions.\n For information on distribution library, see nullDist.txt\n"; + +my @distLib; + +my $FILE3 = $f3; #$run . "nullDist.txt"; +unless(open DIST, ">", $FILE3){ + die "\nUnable to create $FILE3:\n$!"; +} +printf DIST "Sites\tSize(n)\tMin\tMax\tMean\tstdev\tvar\tMinPval\n"; + +#Loop once for each distribution in the library +#(i.e. distribution of 10,000 sites each with 35 TA sites, then a distribution of 10,000 sites each with 36 TA sites, etc) + +sub pvalue{ + + #takes in window count average (countAvg) and number of TAsites and makes a null distribution to calculate the pvalue, which it returns + my $mean=shift@_; + my $TAsites=shift@_; + my $N=10000; + my @specDist=@{$distLib[$TAsites-1]}; + my $rank= binsearch_pos { $a cmp $b } $mean,@specDist; + my $i=$rank; + while ($i<scalar(@specDist)-1 and $specDist[$i+1]==$specDist[$rank]){ + $i++; + } + $rank=$i; + my $pval=$rank/$N; #calculate pval as rank/N + return $pval; + +} + +for (my $sitez=1; $sitez<=$max;$sitez++){ + #print "In the first for loop to make a null distribution\n"; + my @unsorted; + my $count=0; + my $sum=0; + + for (my $i=1; $i<=$N; $i++){ + my @random_set = rand_set( set => \@selector, size => $sitez); + my $setMean=mean(@random_set); + push (@unsorted, $setMean); + #print "$i:\t", "$setMean\n"; + $count++; + $sum+=$setMean; + } + my @nullDist= sort { $a <=> $b } @unsorted; + my $nullMean=sprintf("$round",($sum/$count)); + my $standev =sprintf("$round",stdev(\@nullDist, $nullMean)); + my $variance=sprintf("$round",($standev*$standev)); + my $min=sprintf("$round",$nullDist[0]); + my $max=sprintf("$round",$nullDist[scalar @nullDist-1]); + my $setScalar=scalar @nullDist; + push (@distLib,\@nullDist); + my $minp=pvalue(0,$sitez); + printf DIST "$sitez\t$N\t$min\t$max\t$nullMean\t$standev\t$variance\t$minp\n"; +} +close DIST; + +#SUBROUTINE TO CALCULATE THE P-VALUE OF THE WINDOW INSERTIONS AGAINST THE NULL DISTRIBUTION +#--------------------------------------------------------------------------------------------- + + #Now we have an array called @allTAsites which contains every TAsite position + #with a 0 next to it for "no insertion". + #Now just need to replace 0 with 1 if there IS and insertion at that site + +my @newWindows=(); +my $printNum=0; + +#my $allWindows=\@allWindows; +for (my $i=0;$i<scalar @allWindows;$i++){ + my @win=@{$allWindows[$i]}; + my $starter=$win[0]; + my $ender=$win[1]; + my $insertions=$win[3]; + my $mutcount=$win[2]; + #print "num $printNum -->\tStart pos: $starter\tEnd pos: $ender\n"; + #How many TA sites are there from $genome[$start] to $genome[$end]? + + my $seq = substr($fasta,$starter-1,$size); #start-1 becase $start and $end are positions in genome starting at 1,2,3.... substr(string,start, length) needs indexes + my $ta="TA"; + my @c = $seq =~ /$ta/g; + my $TAsites = scalar @c; + my $avgInsert=sprintf("$round",$insertions/$TAsites); + my $pval=pvalue($avgInsert,$TAsites); + + #reorder so things make more sense: all fitness related calcs together and all sig together + my @new=($starter,$ender,$mutcount,$insertions,$TAsites,$avgInsert,$pval,$win[5],$win[6],$win[7],$win[8],$win[4]); + + push (@newWindows,\@new); + $printNum++; + } + + + print "End p-value calculation: ",get_time(),"\n\n"; + +#print "This is the TA count: $count\nTotal genome size is: $total\n\n"; + +#CALCULATE THE ABSOLUTE DIFFERENCE BETWEEN REGION'S MEAN FITNESS AND AVERAGE MEAN FITNESS +### For all windows, add a field that has the difference between that window's mean fitness and the +#average mean fitness for all of the windows + +my @allFits = map $_->[7], @newWindows; +my $meanFit=mean(@allFits); +print "Average fitness for all windows: ",$meanFit,"\n"; +my @expWindows=(); +foreach (@newWindows){ + my @entry=@{$_}; + my $mean=$entry[7]; + my $absdev=sprintf("$round",$mean-$meanFit); + push (@entry,$absdev); + push @expWindows,\@entry; +} + +@newWindows=@expWindows; + +#MAKE OUTPUT CSV FILE WITH ESSENTIAL WINDOW CALCULATIONS + +print "Start csv ouput file creation: ",get_time(),"\n"; + +open CSV, '>', $f3; # $run . "slidingWindows.csv" or die "Cannot open sliding window output file"; + +my @csvHeader= ("start", "end","mutants","insertions","TA_sites","ratio","p-value","average", "variance","stdev","stderr","genes","fit-mean"); +my $header=join(",",@csvHeader); +print CSV $header,"\n"; +foreach my $winLine(@newWindows){ + my $string=join(",",@{$winLine}); + print CSV $string, "\n"; +} +close CSV; +print "End csv ouput file creation: ",get_time(),"\n\n"; + +my $in = Bio::SeqIO->new(-file=>$ref, -format => 'genbank'); +my $refseq = $in->next_seq; +my $refname = $refseq->id; + +#MAKE essentials WIG FILE---->later make BW--->IGV +sub printwig{ + print "Start wig file creation: ",get_time(),"\n"; + my $in = Bio::SeqIO->new(-file=>$ref, -format => 'genbank'); + my $refseq = $in->next_seq; + my $refname = $refseq->id; + my $ewig="essentialWindows.wig"; + my $FILE5 = $run . $ewig; + open eWIG, ">", $f5; #$FILE5"; + printf eWIG "track type=wiggle_0 name=$ewig\n"; + printf eWIG "variableStep chrom=$refname\n"; + foreach my $wigLine(@newWindows){ + my @wigFields=@$wigLine; + my $position=$wigFields[0]; + while ($position<=$wigFields[1]){ + printf eWIG $position, " ",$wigFields[7],"\n"; #7 for pvalue, but 2 for fitness!!!!!! + $position=$position+1; + } + } + close eWIG; + print "End wig file creation: ",get_time(),"\n\n"; + print "If this wig file needs to be converted to a Big Wig, then use USCS program wigToBigWig in terminal: \n \t./wigToBigWig gview/12G.wig organism.txt BigWig/output.bw \n\n"; +} + + +#OUTPUT REGULAR SLIDING INFORMATION WITHOUT ESSENTIALS CALCULATIONS (P-VALUES) + + +#MAKE OUTPUT CSV FILE WITH FITNESS WINDOW CALCULATIONS + print "Start csv ouput file creation: ",get_time(),"\n"; + open FITFILE, ">", $f4; #$run . "fitWindows.csv" or die "Failed to open file"; + my @head = ("start", "end","fitness","mutant_count", "\n"); + print FITFILE join(",",@head); #header + foreach my $winLine(@allWindows){ + print FITFILE join(",", @$winLine),"\n"; + } + close FITFILE; + print "End csv ouput file creation: ",get_time(),"\n\n"; + +#MAKE WIG FILE---->later make BW--->IGV + print "Start wig file creation: ",get_time(),"\n"; + my $fin = Bio::SeqIO->new(-file=>$ref, -format => 'genbank'); + $refseq = $fin->next_seq; + $refname = $refseq->id; + my $fwig="fitWindows.wig"; + open fWIG, ">", $f5; #$run . $fwig; + print fWIG "track type=wiggle_0 name=$fwig\n"; + print fWIG "variableStep chrom=$refname\n"; + foreach my $wigLine(@allWindows){ + my @wigFields=@$wigLine; + my $pos=$wigFields[0]; + my $fit=$wigFields[7]; + print fWIG $pos," ",$fit,"\n"; + } + close fWIG; + print "End wig file creation: ",get_time(),"\n\n"; + print "If this wig file needs to be converted to a Big Wig,\n"; + print " then use USCS program wigToBigWig in terminal: \n"; + print "\t./wigToBigWig gview/12G.wig organism.txt BigWig/output.bw \n\n"; + + +#GOING TO MAKE A TEXT FILE FOR BED CONVERSION TO BIGBED, NEED CHROM # IN COLUMN 0 + +my @cummulative; + +#MAKING A REGULAR TEXT FILE fields: [chrom,start,end,fitness,count] + +print "Start text file creation time: ",get_time(),"\n"; +open my $TXT, '>', $f6; #$run . "fitWindows.txt" or die $!; +print $TXT ("start","end","W","mutant_count","insertion_count\n"); +print $TXT (join("\t",@$_),"\n") for @allWindows; +close $TXT; +print "End text file creation: ",get_time(),"\n\n"; + +