Mercurial > repos > antmarge > regionfitness
comparison regionFitness.pl @ 2:e8749de582fa draft
Uploaded
author | antmarge |
---|---|
date | Tue, 28 Mar 2017 10:51:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1:7b3e44bd0ab0 | 2:e8749de582fa |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 #Margaret Antonio 17.01.06 | |
4 | |
5 # Sliding Window specifically for Galaxy implementation. | |
6 | |
7 use strict; | |
8 use Getopt::Long; | |
9 use warnings; | |
10 use Bio::SeqIO; | |
11 use Data::Random qw(:all); | |
12 use List::BinarySearch qw( :all ); | |
13 use autodie; | |
14 no warnings; | |
15 | |
16 #AVAILABLE OPTIONS. WILL PRINT UPON ERROR | |
17 sub print_usage() { | |
18 | |
19 print "\n####################################################################\n"; | |
20 print "USAGE:\n"; | |
21 print "slidingWindow.pl -f <fasta file> -r <genbank file> <inputs>\n"; | |
22 | |
23 print "\nREQUIRED:\n"; | |
24 print " -d\tDirectory containing all input files (output files from\n"; | |
25 print " \tcalcFitness tool)\n"; | |
26 print " \tOR\n"; | |
27 print " \tIn the command line (without a flag), input filename(s)\n"; | |
28 print " -f\tFilename for genome sequence, in fasta format\n"; | |
29 print " -r\tFilename for genome annotation, in GenBank format\n"; | |
30 | |
31 print "\nOPTIONAL:\n"; | |
32 print " -h\tPrint usage\n"; | |
33 print " -size\tThe size of the sliding window. Default=500\n"; | |
34 print " -step\tThe window spacing. Default=10\n"; | |
35 print " -c\tExclude values with avg. counts less than x where (c1+c2)/2<x\n"; | |
36 print " \tDefault=15\n"; | |
37 print " -log\tSend all output to a log file instead of the terminal\n"; | |
38 print " -max\tExpected max number of TA sites in a window.\n"; | |
39 print " \tUsed for creating null distribution library. Default=100\n"; | |
40 print " -w\tDo weighted average for fitness per insertion\n"; | |
41 print " -wc\tInteger value for weight ceiling. Default=50\n"; | |
42 | |
43 print " \n~~~~Always check that file paths are correctly specified~~~~\n"; | |
44 print "\n##################################################################\n"; | |
45 | |
46 | |
47 } | |
48 | |
49 #ASSIGN INPUTS TO VARIABLES | |
50 our ($cutoff,$step, $size, $help,$fasta, $log, $ref,$weight_ceiling,$weight,$custom,$run,$max,$f1,$f2,$f3,$f4,$f5,$f6); | |
51 | |
52 GetOptions( | |
53 'c:i'=>\$cutoff, | |
54 'step:i' => \$step, | |
55 'size:i' => \$size, | |
56 'f:s' => \$fasta, | |
57 'r:s' => \$ref, | |
58 'log' => \$log, | |
59 'h' => \$help, | |
60 'm:i'=>\$max, | |
61 'wc:i'=>\$weight_ceiling, | |
62 'w'=>\$weight, | |
63 'u:s'=>\$custom, | |
64 'n:s'=>\$run, | |
65 'f1:s'=>\$f1, | |
66 'f2:s'=>\$f2, | |
67 'f3:s'=>\$f3, | |
68 'f4:s'=>\$f4, | |
69 'f5:s'=>\$f5, | |
70 'f6:s'=>\$f6, | |
71 | |
72 ); | |
73 | |
74 sub get_time() { | |
75 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time); | |
76 return "$hour:$min:$sec"; | |
77 } | |
78 | |
79 sub uniq { | |
80 my %seen; | |
81 return grep { ! $seen{$_}++ } @_; | |
82 } | |
83 | |
84 # Just to test out the script opening | |
85 if ($help){ | |
86 print_usage(); | |
87 print "\n"; | |
88 exit; | |
89 } | |
90 my $round='%.4f'; | |
91 | |
92 # every output file will have this prefix | |
93 #if (!$run){$run = "run1";} | |
94 $run = $run . "_"; | |
95 | |
96 if (!$weight_ceiling){$weight_ceiling=50;} | |
97 if (!$cutoff){$cutoff=15;} | |
98 | |
99 | |
100 #open (STDOUT, ">>", $run . "log.txt"); | |
101 | |
102 #CHECKING PARAMETERS: Check to make sure required option inputs are there and if not then assign default | |
103 if (!$size) { $size = 500 }; #set the default sliding window size to 500 | |
104 if (!$step) { $step = 10 }; #set the default step size to 10 | |
105 if (!$cutoff) { $cutoff = 10 }; | |
106 if (!$max) { $max = 100 }; # most insertions expected in a given window region (for making null dist lib) | |
107 | |
108 print "Window size: $size\n"; | |
109 print "Step value: $step\n"; | |
110 print "Cutoff: $cutoff\n"; | |
111 | |
112 | |
113 # Returns mean, variance, sd, se | |
114 sub average { | |
115 my $scoreref = shift @_; | |
116 my @scores = @{$scoreref}; | |
117 | |
118 my $sum=0; | |
119 my $num=0; | |
120 | |
121 # Get the average. | |
122 foreach my $w (@scores) { | |
123 $sum += $w; | |
124 $num++; | |
125 } | |
126 my $average= $sum/$num; | |
127 my $xminusxbars = 0; | |
128 | |
129 # Get the variance. | |
130 foreach my $w (@scores) { | |
131 $xminusxbars += ($w-$average)**2; | |
132 } | |
133 my $variance; | |
134 if ($num<=1){ | |
135 $variance=0.10; | |
136 } | |
137 else{ | |
138 $variance = sprintf($round,(1/($num-1)) * $xminusxbars); | |
139 } | |
140 my $sd = sprintf($round,sqrt($variance)); | |
141 my $se = sprintf($round,$sd / sqrt($num)); | |
142 | |
143 return ($average, $variance, $sd, $se); | |
144 | |
145 } | |
146 | |
147 # Function to clean unwanted characters from lines | |
148 | |
149 sub cleaner{ | |
150 my $line=$_[0]; | |
151 chomp($line); | |
152 $line =~ s/\x0d{0,1}\x0a{0,1}\Z//s; | |
153 return $line; | |
154 } | |
155 | |
156 # Takes two parameters, both hashrefs to lists. | |
157 # 1) hashref to list of scores | |
158 # 2) hashref to list of weights, equal in length to the scores. | |
159 sub weighted_average { | |
160 | |
161 my $scoreref = shift @_; | |
162 my $weightref = shift @_; | |
163 my @scores = @{$scoreref}; | |
164 my @weights = @{$weightref}; | |
165 | |
166 my $sum=0; | |
167 my ($weighted_average, $weighted_variance)=(0,0); | |
168 my $v2; | |
169 | |
170 # Go through once to get total, calculate V2. | |
171 for (my $i=0; $i<@weights; $i++) { | |
172 $sum += $weights[$i]; | |
173 $v2 += $weights[$i]**2; | |
174 } | |
175 if ($sum == 0) { return 0; } # No scores given? | |
176 | |
177 my $scor = join (' ', @scores); | |
178 my $wght = join (' ', @weights); | |
179 | |
180 # Now calculated weighted average. | |
181 my ($top, $bottom) = (0,0); | |
182 for (my $i=0; $i<@weights; $i++) { | |
183 $top += $weights[$i] * $scores[$i]; | |
184 $bottom += $weights[$i]; | |
185 } | |
186 $weighted_average = sprintf($round,$top/$bottom); | |
187 | |
188 ($top, $bottom) = (0,0); | |
189 # Now calculate weighted sample variance. | |
190 for (my $i=0; $i<@weights; $i++) { | |
191 $top += ( $weights[$i] * ($scores[$i] - $weighted_average)**2); | |
192 $bottom += $weights[$i]; | |
193 } | |
194 $weighted_variance = sprintf($round,$top/$bottom); | |
195 | |
196 my $weighted_stdev = sprintf($round,sqrt($weighted_variance)); | |
197 my $weighted_stder = sprintf($round,$weighted_stdev / sqrt(@scores)); # / length scores. | |
198 | |
199 return ($weighted_average, $weighted_variance, $weighted_stdev, $weighted_stder); | |
200 } | |
201 | |
202 #CREATE AN ARRAY OF DATA FROM INPUT CSV FILE(S) | |
203 | |
204 my $rowCount=-1; | |
205 my $last=0; | |
206 my @unsorted; | |
207 my @insertPos; #array to hold all positions of insertions. Going to use this later to match up with TA sites | |
208 my %wind_summary; | |
209 | |
210 my @files = @ARGV; | |
211 | |
212 my $num=(scalar @files); | |
213 | |
214 my $datestring = localtime(); | |
215 print "Local date and time $datestring\n"; | |
216 | |
217 print "\n---------Importing files--------\n"; | |
218 print "\tStart input array ",get_time(),"\n"; | |
219 print "\tNumber of csv files: ", $num,"\n"; | |
220 | |
221 | |
222 #Go through each file from the commandline (ARGV array) and read each line as an array | |
223 #into select array if values satisfy the cutoff | |
224 for (my $i=0; $i<$num; $i++){ | |
225 print "File #",$i+1,"\t"; | |
226 my $file=$files[$i]; | |
227 print $file,"\n"; | |
228 | |
229 open(DATA, '<', $file) or die "Could not open '$file' Make sure input .csv files are entered in the command line\n"; | |
230 my $dummy=<DATA>; #read and store column names in dummy variable | |
231 while (my $entry = <DATA>) { | |
232 chomp $entry; | |
233 my @line=split(",",$entry); | |
234 my $locus = $line[9]; #gene id (SP_0000) | |
235 my $w = $line[12]; #nW | |
236 if (!$w){ $w=0 } # For blanks | |
237 my $c1 = $line[2]; | |
238 my $c2 = $line[3]; | |
239 my $avg = ($c1+$c2)/2; | |
240 #Average counts must be greater than cutoff (minimum allowed) | |
241 if ($avg > $cutoff) { | |
242 my @select=($line[0],$w,$avg,$line[9]); | |
243 my $select=\@select; | |
244 push(@unsorted,$select); | |
245 push(@insertPos,$line[0]); #keep track of actual insertion site position | |
246 $last=$select[0]; | |
247 $rowCount++; | |
248 } | |
249 if ($avg >= $weight_ceiling) { $avg = $weight_ceiling; } # Maximum weight | |
250 } | |
251 close DATA; | |
252 } | |
253 | |
254 my @sorted = sort { $a->[0] <=> $b->[0] } @unsorted; | |
255 @insertPos = sort { $a <=> $b } @insertPos; | |
256 # Get unique | |
257 @insertPos= uniq(@insertPos); | |
258 | |
259 print "\n\tFinished input array ",get_time(),"\n"; | |
260 | |
261 ############################################################################################### | |
262 | |
263 print "\n---------Creating sliding windows across the genome--------\n\n"; | |
264 | |
265 my $index=-1; | |
266 my $marker=0; | |
267 my $totalInsert=0; | |
268 my $totalWindows=0; | |
269 | |
270 | |
271 #SUBROUTINE FOR EACH WINDOW CALCULATION | |
272 sub OneWindow{ | |
273 my $Wstart=shift @_; | |
274 my $Wend=shift@_; | |
275 my $Wcount=0; | |
276 my $insertion=0; | |
277 my $Wsum=0; | |
278 my $lastPos=0; | |
279 my $i; | |
280 my @allgenes=(); | |
281 | |
282 for ($i=$marker;$i<$rowCount;$i++){ | |
283 my @fields=@{$sorted[$i]}; | |
284 my $pos=$fields[0]; | |
285 my $w=$fields[1]; #fitness value for single insertion | |
286 my $avgCount=$fields[2]; | |
287 my $gene=$fields[3]; | |
288 | |
289 if ($pos<$Wstart){ #if deleted, error shows up | |
290 next; | |
291 } | |
292 if ($pos<=$Wend){ | |
293 if ($pos<($Wstart+$step)){ | |
294 $marker++; | |
295 } | |
296 my @empty; | |
297 if (!$wind_summary{$Wstart}) { | |
298 $wind_summary{$Wstart}{w} = [@empty]; | |
299 $wind_summary{$Wstart}{s} = [@empty]; | |
300 $wind_summary{$Wstart}{g} = [@empty]; | |
301 } | |
302 # Hash of Fitness scores. | |
303 $wind_summary{$Wstart}{w} = [@{$wind_summary{$Wstart}{w}}, $w]; | |
304 # Hash of counts used to generate those fitness scores. | |
305 $wind_summary{$Wstart}{s} = [@{$wind_summary{$Wstart}{s}}, $avgCount]; | |
306 | |
307 | |
308 if (!($gene =~ /^ *$/)){ | |
309 $gene=~ s/"//g; | |
310 $gene=~ s/ //g; | |
311 $gene=~ s/'//g; | |
312 push @allgenes,$gene; | |
313 } | |
314 $Wsum+=$w; | |
315 $Wcount++; | |
316 if ($pos!=$lastPos){ | |
317 $insertion+=1; | |
318 $lastPos=$pos; | |
319 } | |
320 } | |
321 #if ($fields[0]>$Wend) #finished with that window, then: | |
322 else{ | |
323 if ($Wcount>0){ | |
324 $totalWindows++; | |
325 $totalInsert+=$insertion; | |
326 | |
327 my ($average, $variance, $stdev, $stderr); | |
328 | |
329 if ($num <=1 ) { | |
330 ($average, $variance, $stdev, $stderr)=(0.10,0.10,"NA","NA"); | |
331 } | |
332 else{ | |
333 if (!$weight) { | |
334 ($average, $variance, $stdev, $stderr) = &average($wind_summary{$Wstart}{w}); | |
335 } | |
336 else { | |
337 ($average, $variance, $stdev, $stderr)= &weighted_average($wind_summary{$Wstart}{w},$wind_summary{$Wstart}{s}); | |
338 } | |
339 } | |
340 | |
341 @allgenes= uniq(@allgenes); | |
342 my @sortedGenes = sort { lc($a) cmp lc($b) } @allgenes; | |
343 my $wind_genes=join(" ",@sortedGenes); | |
344 my @window=($Wstart,$Wend,$Wcount,$insertion,$wind_genes,$average,$variance,$stdev,$stderr); | |
345 return (\@window); | |
346 } | |
347 | |
348 else{ | |
349 #Even if there were no insertions, still want window in file for consistent start/end | |
350 my @window=($Wstart,$Wend,$Wcount,$insertion," ","NA","NA","NA","NA"); | |
351 return (\@window); | |
352 } #Because count=0 (i.e. there were no insertion mutants in that window) | |
353 } | |
354 } | |
355 } | |
356 | |
357 sub customWindow{ | |
358 | |
359 my $Wstart=shift @_; | |
360 my $Wend=shift@_; | |
361 my $Wavg=0; | |
362 my $Wcount=0; | |
363 my $insertion=0; | |
364 my $Wsum=0; | |
365 my $lastPos=0; | |
366 my @allgenes=(); | |
367 my $i; | |
368 | |
369 | |
370 for ($i=$marker;$i<$rowCount;$i++){ | |
371 my @fields=@{$sorted[$i]}; | |
372 my $pos=$fields[0]; | |
373 my $w=$fields[1]; #fitness value for single insertion | |
374 my $avgCount=$fields[2]; | |
375 my $gene=$fields[3]; | |
376 | |
377 | |
378 | |
379 if ($pos<$Wstart){ #if deleted, error shows up | |
380 next; | |
381 } | |
382 if ($pos<=$Wend){ | |
383 my @empty; | |
384 if (!$wind_summary{$Wstart}) { | |
385 $wind_summary{$Wstart}{w} = [@empty]; | |
386 $wind_summary{$Wstart}{s} = [@empty]; | |
387 $wind_summary{$Wstart}{g} = [@empty]; | |
388 } | |
389 # Hash of fitness scores | |
390 $wind_summary{$Wstart}{w} = [@{$wind_summary{$Wstart}{w}}, $w]; | |
391 # Hash of counts used to generate those fitness scores. | |
392 $wind_summary{$Wstart}{s} = [@{$wind_summary{$Wstart}{s}}, $avgCount]; | |
393 | |
394 if (!($gene =~ /^ *$/)){ | |
395 $gene=~ s/"//g; | |
396 $gene=~ s/ //g; | |
397 $gene=~ s/'//g; | |
398 push @allgenes,$gene; | |
399 } | |
400 $Wsum+=$w; | |
401 $Wcount++; | |
402 if ($pos!=$lastPos){ | |
403 $insertion+=1; | |
404 $lastPos=$pos; | |
405 } | |
406 } | |
407 | |
408 #if ($fields[0]>$Wend){ #if finished with that window, then: | |
409 else{ | |
410 if ($Wcount>0){ | |
411 $totalWindows++; | |
412 $totalInsert+=$insertion; | |
413 | |
414 my ($average, $variance, $stdev, $stderr); | |
415 | |
416 if ($num <=1 ) { | |
417 ($average, $variance, $stdev, $stderr)=(0.10,0.10,"NA","NA"); | |
418 } | |
419 else{ | |
420 if (!$weight) { | |
421 ($average, $variance, $stdev, $stderr) = &average($wind_summary{$Wstart}{w}); | |
422 } | |
423 else { | |
424 ($average, $variance, $stdev, $stderr)= &weighted_average($wind_summary{$Wstart}{w},$wind_summary{$Wstart}{s}); | |
425 } | |
426 } | |
427 | |
428 @allgenes= uniq(@allgenes); | |
429 my @sortedGenes = sort { lc($a) cmp lc($b) } @allgenes; | |
430 my $wind_genes=join(" ",@sortedGenes); | |
431 print TEST $wind_genes,"\n"; | |
432 my @window=($Wstart,$Wend,$Wcount,$insertion,$wind_genes,$average,$variance,$stdev,$stderr); | |
433 return (\@window); | |
434 } | |
435 | |
436 else{ #Even if there were no insertions, still want window in file for consistent start/end | |
437 my @window=($Wstart,$Wend,$Wcount,$insertion," ","NA","NA","NA","NA"); | |
438 return (\@window); | |
439 } #Because count=0 (i.e. there were no insertion mutants in that window) | |
440 } | |
441 } | |
442 } | |
443 | |
444 print "Start calculation: ",get_time(),"\n"; | |
445 my $start=1; | |
446 my $end=$size+1; | |
447 my $windowNum=0; | |
448 my @allWindows; #will be a 2D array containing all window info to be written into output file | |
449 | |
450 | |
451 | |
452 #IF A CUSTOM LIST OF START/END COORDINATES IS GIVEN THEN CREATE WINDOWS USING IT | |
453 if ($custom){ | |
454 open (CUST, '<', $custom); | |
455 while(my $line=<CUST>){ | |
456 $line=cleaner($line); | |
457 my @cwind=split(",",$line); | |
458 my $start=$cwind[0]; | |
459 my $end=$cwind[1]; | |
460 my $window=customWindow($start,$end); | |
461 push @allWindows,$window; | |
462 } | |
463 close CUST; | |
464 } | |
465 | |
466 else{ #if not custom list | |
467 #WHILE LOOP TO CALL THE ONE WINDOW SUBROUTINE FOR CALCULATIONS===INCREMENTS START AND END VALUES OF THE WINDOW | |
468 while ($end<=$last-$size){ #100,000bp-->9,950 windows--> only 8500 windows in csv because 0 | |
469 my($window)=OneWindow($start,$end); | |
470 push (@allWindows,$window); | |
471 $start=$start+$step; | |
472 $end=$end+$step; | |
473 } | |
474 print "End calculation: ",get_time(),"\n"; | |
475 } | |
476 | |
477 | |
478 #my $avgInsert=$totalInsert/$totalWindows; | |
479 #print "Average number of insertions for $size base pair windows: $avgInsert\n"; | |
480 | |
481 | |
482 #ESSENTIALS: Counting the number of TA sites in the genome and whether an insertion occurred there or not | |
483 | |
484 print "\n---------Assessing essentiality of genome region in each window--------\n\n"; | |
485 | |
486 my @sites; | |
487 | |
488 #First read fasta file into a string | |
489 my $seqio = Bio::SeqIO->new(-file => $fasta, '-format' => 'Fasta'); | |
490 my $prev; | |
491 my $total=0; | |
492 while(my $seq = $seqio->next_seq) { | |
493 $fasta = $seq->seq; | |
494 } | |
495 #Get number of "TA" sites, regardless of insertion---this is just the fasta file | |
496 my $x="TA"; | |
497 my @c = $fasta =~ /$x/g; | |
498 my $countTA = @c; | |
499 | |
500 #At what positions in the genome do TA sites occur? | |
501 my $pos=0; | |
502 my $countInsert=0; | |
503 my @selector; #use this array to hold all 1 and 0 of whether insertion happened or not. | |
504 | |
505 #Go through all TA sites identified above and see if an insertion occurs there. | |
506 #Push results onto two arrays a 2d array with the position and a 1D array with just the 0 or 1 | |
507 | |
508 my @unmatched; #hold all unmatched ta sites | |
509 my @allTAsites; #2d array to hold all occurences of TA sites in genome | |
510 my $unmatchedCount=0; | |
511 my $offset=0; | |
512 my $genPos = index($fasta,'TA',$offset); | |
513 | |
514 while (($genPos != -1) and ($pos<scalar @insertPos)) { #as long as the TA site is foun | |
515 my $res=0; | |
516 if ($genPos>$insertPos[$pos]){ | |
517 push @unmatched,$insertPos[$pos]; | |
518 $unmatchedCount++; | |
519 $pos++; | |
520 } | |
521 if ($genPos==$insertPos[$pos]){ | |
522 $res=1; | |
523 $countInsert++; | |
524 $pos++; | |
525 } | |
526 my @sites=($genPos,$res); | |
527 push @selector,$res; | |
528 #push the 0 or 1 onto the array @selector---going to use this to draw random sets for the null distribution | |
529 push (@allTAsites,\@sites); | |
530 $offset = $genPos + 1; | |
531 $genPos = index($fasta, 'TA', $offset); | |
532 $countTA++; | |
533 } | |
534 | |
535 #my $FILE1 = "$run" . "allTAsites.txt"; | |
536 my $FILE1 = $f1; | |
537 open (ALL_TA, ">", $FILE1); | |
538 foreach my $sit(@allTAsites){ | |
539 foreach (@$sit){ | |
540 print ALL_TA $_, "\t"; | |
541 } | |
542 printf ALL_TA "\n"; | |
543 } | |
544 close ALL_TA; | |
545 | |
546 my $FILE2 = $f2; #$run . "unmatched.txt"; | |
547 unless(open UNM, ">", $FILE2){ | |
548 die "\nUnable to create $FILE2:\n$!"; | |
549 } | |
550 foreach (@unmatched){ | |
551 print UNM $_, "\n"; | |
552 } | |
553 close UNM; | |
554 | |
555 print "\n\nTotal of unmatched insertions: $unmatchedCount\n"; | |
556 print "See unmatched.txt for genome indices of unmatched sites\n"; | |
557 print "See allTAsites.txt for details on all TA sites and insertions\n\n"; | |
558 | |
559 #print "\nTotal: $countInsert insertions in $countTA TA sites.\n"; | |
560 | |
561 #-------------------------------------------------------------------------------------------------- | |
562 | |
563 #Now, have an array for each TA site and if an insertion occurred there. | |
564 #So per site @sites=(position, 0 or 1 for insertion). | |
565 #Next step, create null distribution of 10,000 random sets with same number of | |
566 #TA sites as the window and then calculate p-value | |
567 | |
568 #SUBROUTINE FOR MAKING THE NULL DISTRIBUTION SPECIFIC TO THE WINDOW | |
569 | |
570 #DISTRIBUTION'S STANDARD DEVIATION | |
571 my $N=10000; | |
572 | |
573 sub mean { | |
574 if (scalar @_ ==0){ | |
575 return 0; | |
576 } | |
577 my $sum; | |
578 grep { $sum += $_ } @_; | |
579 return $sum/@_; | |
580 } | |
581 sub stdev{ | |
582 my @data = @{shift @_}; | |
583 my $average=shift @_; | |
584 my $sqtotal = 0; | |
585 foreach(@data) { | |
586 $sqtotal += ($average-$_) ** 2; | |
587 } | |
588 my $std = ($sqtotal / ($N-1)) ** 0.5; | |
589 return $std; | |
590 } | |
591 | |
592 #MAKE LIBRARY OF NULL DISTRIBUTIONS: | |
593 print "Making a library of null distributions.\n For information on distribution library, see nullDist.txt\n"; | |
594 | |
595 my @distLib; | |
596 | |
597 my $FILE3 = $f3; #$run . "nullDist.txt"; | |
598 unless(open DIST, ">", $FILE3){ | |
599 die "\nUnable to create $FILE3:\n$!"; | |
600 } | |
601 printf DIST "Sites\tSize(n)\tMin\tMax\tMean\tstdev\tvar\tMinPval\n"; | |
602 | |
603 #Loop once for each distribution in the library | |
604 #(i.e. distribution of 10,000 sites each with 35 TA sites, then a distribution of 10,000 sites each with 36 TA sites, etc) | |
605 | |
606 sub pvalue{ | |
607 | |
608 #takes in window count average (countAvg) and number of TAsites and makes a null distribution to calculate the pvalue, which it returns | |
609 my $mean=shift@_; | |
610 my $TAsites=shift@_; | |
611 my $N=10000; | |
612 my @specDist=@{$distLib[$TAsites-1]}; | |
613 my $rank= binsearch_pos { $a cmp $b } $mean,@specDist; | |
614 my $i=$rank; | |
615 while ($i<scalar(@specDist)-1 and $specDist[$i+1]==$specDist[$rank]){ | |
616 $i++; | |
617 } | |
618 $rank=$i; | |
619 my $pval=$rank/$N; #calculate pval as rank/N | |
620 return $pval; | |
621 | |
622 } | |
623 | |
624 for (my $sitez=1; $sitez<=$max;$sitez++){ | |
625 #print "In the first for loop to make a null distribution\n"; | |
626 my @unsorted; | |
627 my $count=0; | |
628 my $sum=0; | |
629 | |
630 for (my $i=1; $i<=$N; $i++){ | |
631 my @random_set = rand_set( set => \@selector, size => $sitez); | |
632 my $setMean=mean(@random_set); | |
633 push (@unsorted, $setMean); | |
634 #print "$i:\t", "$setMean\n"; | |
635 $count++; | |
636 $sum+=$setMean; | |
637 } | |
638 my @nullDist= sort { $a <=> $b } @unsorted; | |
639 my $nullMean=sprintf("$round",($sum/$count)); | |
640 my $standev =sprintf("$round",stdev(\@nullDist, $nullMean)); | |
641 my $variance=sprintf("$round",($standev*$standev)); | |
642 my $min=sprintf("$round",$nullDist[0]); | |
643 my $max=sprintf("$round",$nullDist[scalar @nullDist-1]); | |
644 my $setScalar=scalar @nullDist; | |
645 push (@distLib,\@nullDist); | |
646 my $minp=pvalue(0,$sitez); | |
647 printf DIST "$sitez\t$N\t$min\t$max\t$nullMean\t$standev\t$variance\t$minp\n"; | |
648 } | |
649 close DIST; | |
650 | |
651 #SUBROUTINE TO CALCULATE THE P-VALUE OF THE WINDOW INSERTIONS AGAINST THE NULL DISTRIBUTION | |
652 #--------------------------------------------------------------------------------------------- | |
653 | |
654 #Now we have an array called @allTAsites which contains every TAsite position | |
655 #with a 0 next to it for "no insertion". | |
656 #Now just need to replace 0 with 1 if there IS and insertion at that site | |
657 | |
658 my @newWindows=(); | |
659 my $printNum=0; | |
660 | |
661 #my $allWindows=\@allWindows; | |
662 for (my $i=0;$i<scalar @allWindows;$i++){ | |
663 my @win=@{$allWindows[$i]}; | |
664 my $starter=$win[0]; | |
665 my $ender=$win[1]; | |
666 my $insertions=$win[3]; | |
667 my $mutcount=$win[2]; | |
668 #print "num $printNum -->\tStart pos: $starter\tEnd pos: $ender\n"; | |
669 #How many TA sites are there from $genome[$start] to $genome[$end]? | |
670 | |
671 my $seq = substr($fasta,$starter-1,$size); #start-1 becase $start and $end are positions in genome starting at 1,2,3.... substr(string,start, length) needs indexes | |
672 my $ta="TA"; | |
673 my @c = $seq =~ /$ta/g; | |
674 my $TAsites = scalar @c; | |
675 my $avgInsert=sprintf("$round",$insertions/$TAsites); | |
676 my $pval=pvalue($avgInsert,$TAsites); | |
677 | |
678 #reorder so things make more sense: all fitness related calcs together and all sig together | |
679 my @new=($starter,$ender,$mutcount,$insertions,$TAsites,$avgInsert,$pval,$win[5],$win[6],$win[7],$win[8],$win[4]); | |
680 | |
681 push (@newWindows,\@new); | |
682 $printNum++; | |
683 } | |
684 | |
685 | |
686 print "End p-value calculation: ",get_time(),"\n\n"; | |
687 | |
688 #print "This is the TA count: $count\nTotal genome size is: $total\n\n"; | |
689 | |
690 #CALCULATE THE ABSOLUTE DIFFERENCE BETWEEN REGION'S MEAN FITNESS AND AVERAGE MEAN FITNESS | |
691 ### For all windows, add a field that has the difference between that window's mean fitness and the | |
692 #average mean fitness for all of the windows | |
693 | |
694 my @allFits = map $_->[7], @newWindows; | |
695 my $meanFit=mean(@allFits); | |
696 print "Average fitness for all windows: ",$meanFit,"\n"; | |
697 my @expWindows=(); | |
698 foreach (@newWindows){ | |
699 my @entry=@{$_}; | |
700 my $mean=$entry[7]; | |
701 my $absdev=sprintf("$round",$mean-$meanFit); | |
702 push (@entry,$absdev); | |
703 push @expWindows,\@entry; | |
704 } | |
705 | |
706 @newWindows=@expWindows; | |
707 | |
708 #MAKE OUTPUT CSV FILE WITH ESSENTIAL WINDOW CALCULATIONS | |
709 | |
710 print "Start csv ouput file creation: ",get_time(),"\n"; | |
711 | |
712 open CSV, '>', $f3; # $run . "slidingWindows.csv" or die "Cannot open sliding window output file"; | |
713 | |
714 my @csvHeader= ("start", "end","mutants","insertions","TA_sites","ratio","p-value","average", "variance","stdev","stderr","genes","fit-mean"); | |
715 my $header=join(",",@csvHeader); | |
716 print CSV $header,"\n"; | |
717 foreach my $winLine(@newWindows){ | |
718 my $string=join(",",@{$winLine}); | |
719 print CSV $string, "\n"; | |
720 } | |
721 close CSV; | |
722 print "End csv ouput file creation: ",get_time(),"\n\n"; | |
723 | |
724 my $in = Bio::SeqIO->new(-file=>$ref, -format => 'genbank'); | |
725 my $refseq = $in->next_seq; | |
726 my $refname = $refseq->id; | |
727 | |
728 #MAKE essentials WIG FILE---->later make BW--->IGV | |
729 sub printwig{ | |
730 print "Start wig file creation: ",get_time(),"\n"; | |
731 my $in = Bio::SeqIO->new(-file=>$ref, -format => 'genbank'); | |
732 my $refseq = $in->next_seq; | |
733 my $refname = $refseq->id; | |
734 my $ewig="essentialWindows.wig"; | |
735 my $FILE5 = $run . $ewig; | |
736 open eWIG, ">", $f5; #$FILE5"; | |
737 printf eWIG "track type=wiggle_0 name=$ewig\n"; | |
738 printf eWIG "variableStep chrom=$refname\n"; | |
739 foreach my $wigLine(@newWindows){ | |
740 my @wigFields=@$wigLine; | |
741 my $position=$wigFields[0]; | |
742 while ($position<=$wigFields[1]){ | |
743 printf eWIG $position, " ",$wigFields[7],"\n"; #7 for pvalue, but 2 for fitness!!!!!! | |
744 $position=$position+1; | |
745 } | |
746 } | |
747 close eWIG; | |
748 print "End wig file creation: ",get_time(),"\n\n"; | |
749 print "If this wig file needs to be converted to a Big Wig, then use USCS program wigToBigWig in terminal: \n \t./wigToBigWig gview/12G.wig organism.txt BigWig/output.bw \n\n"; | |
750 } | |
751 | |
752 | |
753 #OUTPUT REGULAR SLIDING INFORMATION WITHOUT ESSENTIALS CALCULATIONS (P-VALUES) | |
754 | |
755 | |
756 #MAKE OUTPUT CSV FILE WITH FITNESS WINDOW CALCULATIONS | |
757 print "Start csv ouput file creation: ",get_time(),"\n"; | |
758 open FITFILE, ">", $f4; #$run . "fitWindows.csv" or die "Failed to open file"; | |
759 my @head = ("start", "end","fitness","mutant_count", "\n"); | |
760 print FITFILE join(",",@head); #header | |
761 foreach my $winLine(@allWindows){ | |
762 print FITFILE join(",", @$winLine),"\n"; | |
763 } | |
764 close FITFILE; | |
765 print "End csv ouput file creation: ",get_time(),"\n\n"; | |
766 | |
767 #MAKE WIG FILE---->later make BW--->IGV | |
768 print "Start wig file creation: ",get_time(),"\n"; | |
769 my $fin = Bio::SeqIO->new(-file=>$ref, -format => 'genbank'); | |
770 $refseq = $fin->next_seq; | |
771 $refname = $refseq->id; | |
772 my $fwig="fitWindows.wig"; | |
773 open fWIG, ">", $f5; #$run . $fwig; | |
774 print fWIG "track type=wiggle_0 name=$fwig\n"; | |
775 print fWIG "variableStep chrom=$refname\n"; | |
776 foreach my $wigLine(@allWindows){ | |
777 my @wigFields=@$wigLine; | |
778 my $pos=$wigFields[0]; | |
779 my $fit=$wigFields[7]; | |
780 print fWIG $pos," ",$fit,"\n"; | |
781 } | |
782 close fWIG; | |
783 print "End wig file creation: ",get_time(),"\n\n"; | |
784 print "If this wig file needs to be converted to a Big Wig,\n"; | |
785 print " then use USCS program wigToBigWig in terminal: \n"; | |
786 print "\t./wigToBigWig gview/12G.wig organism.txt BigWig/output.bw \n\n"; | |
787 | |
788 | |
789 #GOING TO MAKE A TEXT FILE FOR BED CONVERSION TO BIGBED, NEED CHROM # IN COLUMN 0 | |
790 | |
791 my @cummulative; | |
792 | |
793 #MAKING A REGULAR TEXT FILE fields: [chrom,start,end,fitness,count] | |
794 | |
795 print "Start text file creation time: ",get_time(),"\n"; | |
796 open my $TXT, '>', $f6; #$run . "fitWindows.txt" or die $!; | |
797 print $TXT ("start","end","W","mutant_count","insertion_count\n"); | |
798 print $TXT (join("\t",@$_),"\n") for @allWindows; | |
799 close $TXT; | |
800 print "End text file creation: ",get_time(),"\n\n"; | |
801 | |
802 |