2
|
1 #!/usr/bin/perl -w
|
|
2
|
|
3 #Margaret Antonio 17.01.06
|
|
4
|
|
5 # Sliding Window specifically for Galaxy implementation.
|
|
6
|
|
7 use strict;
|
|
8 use Getopt::Long;
|
|
9 use warnings;
|
|
10 use Bio::SeqIO;
|
|
11 use Data::Random qw(:all);
|
|
12 use List::BinarySearch qw( :all );
|
|
13 use autodie;
|
|
14 no warnings;
|
|
15
|
|
16 #AVAILABLE OPTIONS. WILL PRINT UPON ERROR
|
|
17 sub print_usage() {
|
|
18
|
|
19 print "\n####################################################################\n";
|
|
20 print "USAGE:\n";
|
|
21 print "slidingWindow.pl -f <fasta file> -r <genbank file> <inputs>\n";
|
|
22
|
|
23 print "\nREQUIRED:\n";
|
|
24 print " -d\tDirectory containing all input files (output files from\n";
|
|
25 print " \tcalcFitness tool)\n";
|
|
26 print " \tOR\n";
|
|
27 print " \tIn the command line (without a flag), input filename(s)\n";
|
|
28 print " -f\tFilename for genome sequence, in fasta format\n";
|
|
29 print " -r\tFilename for genome annotation, in GenBank format\n";
|
|
30
|
|
31 print "\nOPTIONAL:\n";
|
|
32 print " -h\tPrint usage\n";
|
|
33 print " -size\tThe size of the sliding window. Default=500\n";
|
|
34 print " -step\tThe window spacing. Default=10\n";
|
|
35 print " -c\tExclude values with avg. counts less than x where (c1+c2)/2<x\n";
|
|
36 print " \tDefault=15\n";
|
|
37 print " -log\tSend all output to a log file instead of the terminal\n";
|
|
38 print " -max\tExpected max number of TA sites in a window.\n";
|
|
39 print " \tUsed for creating null distribution library. Default=100\n";
|
|
40 print " -w\tDo weighted average for fitness per insertion\n";
|
|
41 print " -wc\tInteger value for weight ceiling. Default=50\n";
|
|
42
|
|
43 print " \n~~~~Always check that file paths are correctly specified~~~~\n";
|
|
44 print "\n##################################################################\n";
|
|
45
|
|
46
|
|
47 }
|
|
48
|
|
49 #ASSIGN INPUTS TO VARIABLES
|
|
50 our ($cutoff,$step, $size, $help,$fasta, $log, $ref,$weight_ceiling,$weight,$custom,$run,$max,$f1,$f2,$f3,$f4,$f5,$f6);
|
|
51
|
|
52 GetOptions(
|
|
53 'c:i'=>\$cutoff,
|
|
54 'step:i' => \$step,
|
|
55 'size:i' => \$size,
|
|
56 'f:s' => \$fasta,
|
|
57 'r:s' => \$ref,
|
|
58 'log' => \$log,
|
|
59 'h' => \$help,
|
|
60 'm:i'=>\$max,
|
|
61 'wc:i'=>\$weight_ceiling,
|
|
62 'w'=>\$weight,
|
|
63 'u:s'=>\$custom,
|
|
64 'n:s'=>\$run,
|
|
65 'f1:s'=>\$f1,
|
|
66 'f2:s'=>\$f2,
|
|
67 'f3:s'=>\$f3,
|
|
68 'f4:s'=>\$f4,
|
|
69 'f5:s'=>\$f5,
|
|
70 'f6:s'=>\$f6,
|
|
71
|
|
72 );
|
|
73
|
|
74 sub get_time() {
|
|
75 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time);
|
|
76 return "$hour:$min:$sec";
|
|
77 }
|
|
78
|
|
79 sub uniq {
|
|
80 my %seen;
|
|
81 return grep { ! $seen{$_}++ } @_;
|
|
82 }
|
|
83
|
|
84 # Just to test out the script opening
|
|
85 if ($help){
|
|
86 print_usage();
|
|
87 print "\n";
|
|
88 exit;
|
|
89 }
|
|
90 my $round='%.4f';
|
|
91
|
|
92 # every output file will have this prefix
|
|
93 #if (!$run){$run = "run1";}
|
|
94 $run = $run . "_";
|
|
95
|
|
96 if (!$weight_ceiling){$weight_ceiling=50;}
|
|
97 if (!$cutoff){$cutoff=15;}
|
|
98
|
|
99
|
|
100 #open (STDOUT, ">>", $run . "log.txt");
|
|
101
|
|
102 #CHECKING PARAMETERS: Check to make sure required option inputs are there and if not then assign default
|
|
103 if (!$size) { $size = 500 }; #set the default sliding window size to 500
|
|
104 if (!$step) { $step = 10 }; #set the default step size to 10
|
|
105 if (!$cutoff) { $cutoff = 10 };
|
|
106 if (!$max) { $max = 100 }; # most insertions expected in a given window region (for making null dist lib)
|
|
107
|
|
108 print "Window size: $size\n";
|
|
109 print "Step value: $step\n";
|
|
110 print "Cutoff: $cutoff\n";
|
|
111
|
|
112
|
|
113 # Returns mean, variance, sd, se
|
|
114 sub average {
|
|
115 my $scoreref = shift @_;
|
|
116 my @scores = @{$scoreref};
|
|
117
|
|
118 my $sum=0;
|
|
119 my $num=0;
|
|
120
|
|
121 # Get the average.
|
|
122 foreach my $w (@scores) {
|
|
123 $sum += $w;
|
|
124 $num++;
|
|
125 }
|
|
126 my $average= $sum/$num;
|
|
127 my $xminusxbars = 0;
|
|
128
|
|
129 # Get the variance.
|
|
130 foreach my $w (@scores) {
|
|
131 $xminusxbars += ($w-$average)**2;
|
|
132 }
|
|
133 my $variance;
|
|
134 if ($num<=1){
|
|
135 $variance=0.10;
|
|
136 }
|
|
137 else{
|
|
138 $variance = sprintf($round,(1/($num-1)) * $xminusxbars);
|
|
139 }
|
|
140 my $sd = sprintf($round,sqrt($variance));
|
|
141 my $se = sprintf($round,$sd / sqrt($num));
|
|
142
|
|
143 return ($average, $variance, $sd, $se);
|
|
144
|
|
145 }
|
|
146
|
|
147 # Function to clean unwanted characters from lines
|
|
148
|
|
149 sub cleaner{
|
|
150 my $line=$_[0];
|
|
151 chomp($line);
|
|
152 $line =~ s/\x0d{0,1}\x0a{0,1}\Z//s;
|
|
153 return $line;
|
|
154 }
|
|
155
|
|
156 # Takes two parameters, both hashrefs to lists.
|
|
157 # 1) hashref to list of scores
|
|
158 # 2) hashref to list of weights, equal in length to the scores.
|
|
159 sub weighted_average {
|
|
160
|
|
161 my $scoreref = shift @_;
|
|
162 my $weightref = shift @_;
|
|
163 my @scores = @{$scoreref};
|
|
164 my @weights = @{$weightref};
|
|
165
|
|
166 my $sum=0;
|
|
167 my ($weighted_average, $weighted_variance)=(0,0);
|
|
168 my $v2;
|
|
169
|
|
170 # Go through once to get total, calculate V2.
|
|
171 for (my $i=0; $i<@weights; $i++) {
|
|
172 $sum += $weights[$i];
|
|
173 $v2 += $weights[$i]**2;
|
|
174 }
|
|
175 if ($sum == 0) { return 0; } # No scores given?
|
|
176
|
|
177 my $scor = join (' ', @scores);
|
|
178 my $wght = join (' ', @weights);
|
|
179
|
|
180 # Now calculated weighted average.
|
|
181 my ($top, $bottom) = (0,0);
|
|
182 for (my $i=0; $i<@weights; $i++) {
|
|
183 $top += $weights[$i] * $scores[$i];
|
|
184 $bottom += $weights[$i];
|
|
185 }
|
|
186 $weighted_average = sprintf($round,$top/$bottom);
|
|
187
|
|
188 ($top, $bottom) = (0,0);
|
|
189 # Now calculate weighted sample variance.
|
|
190 for (my $i=0; $i<@weights; $i++) {
|
|
191 $top += ( $weights[$i] * ($scores[$i] - $weighted_average)**2);
|
|
192 $bottom += $weights[$i];
|
|
193 }
|
|
194 $weighted_variance = sprintf($round,$top/$bottom);
|
|
195
|
|
196 my $weighted_stdev = sprintf($round,sqrt($weighted_variance));
|
|
197 my $weighted_stder = sprintf($round,$weighted_stdev / sqrt(@scores)); # / length scores.
|
|
198
|
|
199 return ($weighted_average, $weighted_variance, $weighted_stdev, $weighted_stder);
|
|
200 }
|
|
201
|
|
202 #CREATE AN ARRAY OF DATA FROM INPUT CSV FILE(S)
|
|
203
|
|
204 my $rowCount=-1;
|
|
205 my $last=0;
|
|
206 my @unsorted;
|
|
207 my @insertPos; #array to hold all positions of insertions. Going to use this later to match up with TA sites
|
|
208 my %wind_summary;
|
|
209
|
|
210 my @files = @ARGV;
|
|
211
|
|
212 my $num=(scalar @files);
|
|
213
|
|
214 my $datestring = localtime();
|
|
215 print "Local date and time $datestring\n";
|
|
216
|
|
217 print "\n---------Importing files--------\n";
|
|
218 print "\tStart input array ",get_time(),"\n";
|
|
219 print "\tNumber of csv files: ", $num,"\n";
|
|
220
|
|
221
|
|
222 #Go through each file from the commandline (ARGV array) and read each line as an array
|
|
223 #into select array if values satisfy the cutoff
|
|
224 for (my $i=0; $i<$num; $i++){
|
|
225 print "File #",$i+1,"\t";
|
|
226 my $file=$files[$i];
|
|
227 print $file,"\n";
|
|
228
|
|
229 open(DATA, '<', $file) or die "Could not open '$file' Make sure input .csv files are entered in the command line\n";
|
|
230 my $dummy=<DATA>; #read and store column names in dummy variable
|
|
231 while (my $entry = <DATA>) {
|
|
232 chomp $entry;
|
|
233 my @line=split(",",$entry);
|
|
234 my $locus = $line[9]; #gene id (SP_0000)
|
|
235 my $w = $line[12]; #nW
|
|
236 if (!$w){ $w=0 } # For blanks
|
|
237 my $c1 = $line[2];
|
|
238 my $c2 = $line[3];
|
|
239 my $avg = ($c1+$c2)/2;
|
|
240 #Average counts must be greater than cutoff (minimum allowed)
|
|
241 if ($avg > $cutoff) {
|
|
242 my @select=($line[0],$w,$avg,$line[9]);
|
|
243 my $select=\@select;
|
|
244 push(@unsorted,$select);
|
|
245 push(@insertPos,$line[0]); #keep track of actual insertion site position
|
|
246 $last=$select[0];
|
|
247 $rowCount++;
|
|
248 }
|
|
249 if ($avg >= $weight_ceiling) { $avg = $weight_ceiling; } # Maximum weight
|
|
250 }
|
|
251 close DATA;
|
|
252 }
|
|
253
|
|
254 my @sorted = sort { $a->[0] <=> $b->[0] } @unsorted;
|
|
255 @insertPos = sort { $a <=> $b } @insertPos;
|
|
256 # Get unique
|
|
257 @insertPos= uniq(@insertPos);
|
|
258
|
|
259 print "\n\tFinished input array ",get_time(),"\n";
|
|
260
|
|
261 ###############################################################################################
|
|
262
|
|
263 print "\n---------Creating sliding windows across the genome--------\n\n";
|
|
264
|
|
265 my $index=-1;
|
|
266 my $marker=0;
|
|
267 my $totalInsert=0;
|
|
268 my $totalWindows=0;
|
|
269
|
|
270
|
|
271 #SUBROUTINE FOR EACH WINDOW CALCULATION
|
|
272 sub OneWindow{
|
|
273 my $Wstart=shift @_;
|
|
274 my $Wend=shift@_;
|
|
275 my $Wcount=0;
|
|
276 my $insertion=0;
|
|
277 my $Wsum=0;
|
|
278 my $lastPos=0;
|
|
279 my $i;
|
|
280 my @allgenes=();
|
|
281
|
|
282 for ($i=$marker;$i<$rowCount;$i++){
|
|
283 my @fields=@{$sorted[$i]};
|
|
284 my $pos=$fields[0];
|
|
285 my $w=$fields[1]; #fitness value for single insertion
|
|
286 my $avgCount=$fields[2];
|
|
287 my $gene=$fields[3];
|
|
288
|
|
289 if ($pos<$Wstart){ #if deleted, error shows up
|
|
290 next;
|
|
291 }
|
|
292 if ($pos<=$Wend){
|
|
293 if ($pos<($Wstart+$step)){
|
|
294 $marker++;
|
|
295 }
|
|
296 my @empty;
|
|
297 if (!$wind_summary{$Wstart}) {
|
|
298 $wind_summary{$Wstart}{w} = [@empty];
|
|
299 $wind_summary{$Wstart}{s} = [@empty];
|
|
300 $wind_summary{$Wstart}{g} = [@empty];
|
|
301 }
|
|
302 # Hash of Fitness scores.
|
|
303 $wind_summary{$Wstart}{w} = [@{$wind_summary{$Wstart}{w}}, $w];
|
|
304 # Hash of counts used to generate those fitness scores.
|
|
305 $wind_summary{$Wstart}{s} = [@{$wind_summary{$Wstart}{s}}, $avgCount];
|
|
306
|
|
307
|
|
308 if (!($gene =~ /^ *$/)){
|
|
309 $gene=~ s/"//g;
|
|
310 $gene=~ s/ //g;
|
|
311 $gene=~ s/'//g;
|
|
312 push @allgenes,$gene;
|
|
313 }
|
|
314 $Wsum+=$w;
|
|
315 $Wcount++;
|
|
316 if ($pos!=$lastPos){
|
|
317 $insertion+=1;
|
|
318 $lastPos=$pos;
|
|
319 }
|
|
320 }
|
|
321 #if ($fields[0]>$Wend) #finished with that window, then:
|
|
322 else{
|
|
323 if ($Wcount>0){
|
|
324 $totalWindows++;
|
|
325 $totalInsert+=$insertion;
|
|
326
|
|
327 my ($average, $variance, $stdev, $stderr);
|
|
328
|
|
329 if ($num <=1 ) {
|
|
330 ($average, $variance, $stdev, $stderr)=(0.10,0.10,"NA","NA");
|
|
331 }
|
|
332 else{
|
|
333 if (!$weight) {
|
|
334 ($average, $variance, $stdev, $stderr) = &average($wind_summary{$Wstart}{w});
|
|
335 }
|
|
336 else {
|
|
337 ($average, $variance, $stdev, $stderr)= &weighted_average($wind_summary{$Wstart}{w},$wind_summary{$Wstart}{s});
|
|
338 }
|
|
339 }
|
|
340
|
|
341 @allgenes= uniq(@allgenes);
|
|
342 my @sortedGenes = sort { lc($a) cmp lc($b) } @allgenes;
|
|
343 my $wind_genes=join(" ",@sortedGenes);
|
|
344 my @window=($Wstart,$Wend,$Wcount,$insertion,$wind_genes,$average,$variance,$stdev,$stderr);
|
|
345 return (\@window);
|
|
346 }
|
|
347
|
|
348 else{
|
|
349 #Even if there were no insertions, still want window in file for consistent start/end
|
|
350 my @window=($Wstart,$Wend,$Wcount,$insertion," ","NA","NA","NA","NA");
|
|
351 return (\@window);
|
|
352 } #Because count=0 (i.e. there were no insertion mutants in that window)
|
|
353 }
|
|
354 }
|
|
355 }
|
|
356
|
|
357 sub customWindow{
|
|
358
|
|
359 my $Wstart=shift @_;
|
|
360 my $Wend=shift@_;
|
|
361 my $Wavg=0;
|
|
362 my $Wcount=0;
|
|
363 my $insertion=0;
|
|
364 my $Wsum=0;
|
|
365 my $lastPos=0;
|
|
366 my @allgenes=();
|
|
367 my $i;
|
|
368
|
|
369
|
|
370 for ($i=$marker;$i<$rowCount;$i++){
|
|
371 my @fields=@{$sorted[$i]};
|
|
372 my $pos=$fields[0];
|
|
373 my $w=$fields[1]; #fitness value for single insertion
|
|
374 my $avgCount=$fields[2];
|
|
375 my $gene=$fields[3];
|
|
376
|
|
377
|
|
378
|
|
379 if ($pos<$Wstart){ #if deleted, error shows up
|
|
380 next;
|
|
381 }
|
|
382 if ($pos<=$Wend){
|
|
383 my @empty;
|
|
384 if (!$wind_summary{$Wstart}) {
|
|
385 $wind_summary{$Wstart}{w} = [@empty];
|
|
386 $wind_summary{$Wstart}{s} = [@empty];
|
|
387 $wind_summary{$Wstart}{g} = [@empty];
|
|
388 }
|
|
389 # Hash of fitness scores
|
|
390 $wind_summary{$Wstart}{w} = [@{$wind_summary{$Wstart}{w}}, $w];
|
|
391 # Hash of counts used to generate those fitness scores.
|
|
392 $wind_summary{$Wstart}{s} = [@{$wind_summary{$Wstart}{s}}, $avgCount];
|
|
393
|
|
394 if (!($gene =~ /^ *$/)){
|
|
395 $gene=~ s/"//g;
|
|
396 $gene=~ s/ //g;
|
|
397 $gene=~ s/'//g;
|
|
398 push @allgenes,$gene;
|
|
399 }
|
|
400 $Wsum+=$w;
|
|
401 $Wcount++;
|
|
402 if ($pos!=$lastPos){
|
|
403 $insertion+=1;
|
|
404 $lastPos=$pos;
|
|
405 }
|
|
406 }
|
|
407
|
|
408 #if ($fields[0]>$Wend){ #if finished with that window, then:
|
|
409 else{
|
|
410 if ($Wcount>0){
|
|
411 $totalWindows++;
|
|
412 $totalInsert+=$insertion;
|
|
413
|
|
414 my ($average, $variance, $stdev, $stderr);
|
|
415
|
|
416 if ($num <=1 ) {
|
|
417 ($average, $variance, $stdev, $stderr)=(0.10,0.10,"NA","NA");
|
|
418 }
|
|
419 else{
|
|
420 if (!$weight) {
|
|
421 ($average, $variance, $stdev, $stderr) = &average($wind_summary{$Wstart}{w});
|
|
422 }
|
|
423 else {
|
|
424 ($average, $variance, $stdev, $stderr)= &weighted_average($wind_summary{$Wstart}{w},$wind_summary{$Wstart}{s});
|
|
425 }
|
|
426 }
|
|
427
|
|
428 @allgenes= uniq(@allgenes);
|
|
429 my @sortedGenes = sort { lc($a) cmp lc($b) } @allgenes;
|
|
430 my $wind_genes=join(" ",@sortedGenes);
|
|
431 print TEST $wind_genes,"\n";
|
|
432 my @window=($Wstart,$Wend,$Wcount,$insertion,$wind_genes,$average,$variance,$stdev,$stderr);
|
|
433 return (\@window);
|
|
434 }
|
|
435
|
|
436 else{ #Even if there were no insertions, still want window in file for consistent start/end
|
|
437 my @window=($Wstart,$Wend,$Wcount,$insertion," ","NA","NA","NA","NA");
|
|
438 return (\@window);
|
|
439 } #Because count=0 (i.e. there were no insertion mutants in that window)
|
|
440 }
|
|
441 }
|
|
442 }
|
|
443
|
|
444 print "Start calculation: ",get_time(),"\n";
|
|
445 my $start=1;
|
|
446 my $end=$size+1;
|
|
447 my $windowNum=0;
|
|
448 my @allWindows; #will be a 2D array containing all window info to be written into output file
|
|
449
|
|
450
|
|
451
|
|
452 #IF A CUSTOM LIST OF START/END COORDINATES IS GIVEN THEN CREATE WINDOWS USING IT
|
|
453 if ($custom){
|
|
454 open (CUST, '<', $custom);
|
|
455 while(my $line=<CUST>){
|
|
456 $line=cleaner($line);
|
|
457 my @cwind=split(",",$line);
|
|
458 my $start=$cwind[0];
|
|
459 my $end=$cwind[1];
|
|
460 my $window=customWindow($start,$end);
|
|
461 push @allWindows,$window;
|
|
462 }
|
|
463 close CUST;
|
|
464 }
|
|
465
|
|
466 else{ #if not custom list
|
|
467 #WHILE LOOP TO CALL THE ONE WINDOW SUBROUTINE FOR CALCULATIONS===INCREMENTS START AND END VALUES OF THE WINDOW
|
|
468 while ($end<=$last-$size){ #100,000bp-->9,950 windows--> only 8500 windows in csv because 0
|
|
469 my($window)=OneWindow($start,$end);
|
|
470 push (@allWindows,$window);
|
|
471 $start=$start+$step;
|
|
472 $end=$end+$step;
|
|
473 }
|
|
474 print "End calculation: ",get_time(),"\n";
|
|
475 }
|
|
476
|
|
477
|
|
478 #my $avgInsert=$totalInsert/$totalWindows;
|
|
479 #print "Average number of insertions for $size base pair windows: $avgInsert\n";
|
|
480
|
|
481
|
|
482 #ESSENTIALS: Counting the number of TA sites in the genome and whether an insertion occurred there or not
|
|
483
|
|
484 print "\n---------Assessing essentiality of genome region in each window--------\n\n";
|
|
485
|
|
486 my @sites;
|
|
487
|
|
488 #First read fasta file into a string
|
|
489 my $seqio = Bio::SeqIO->new(-file => $fasta, '-format' => 'Fasta');
|
|
490 my $prev;
|
|
491 my $total=0;
|
|
492 while(my $seq = $seqio->next_seq) {
|
|
493 $fasta = $seq->seq;
|
|
494 }
|
|
495 #Get number of "TA" sites, regardless of insertion---this is just the fasta file
|
|
496 my $x="TA";
|
|
497 my @c = $fasta =~ /$x/g;
|
|
498 my $countTA = @c;
|
|
499
|
|
500 #At what positions in the genome do TA sites occur?
|
|
501 my $pos=0;
|
|
502 my $countInsert=0;
|
|
503 my @selector; #use this array to hold all 1 and 0 of whether insertion happened or not.
|
|
504
|
|
505 #Go through all TA sites identified above and see if an insertion occurs there.
|
|
506 #Push results onto two arrays a 2d array with the position and a 1D array with just the 0 or 1
|
|
507
|
|
508 my @unmatched; #hold all unmatched ta sites
|
|
509 my @allTAsites; #2d array to hold all occurences of TA sites in genome
|
|
510 my $unmatchedCount=0;
|
|
511 my $offset=0;
|
|
512 my $genPos = index($fasta,'TA',$offset);
|
|
513
|
|
514 while (($genPos != -1) and ($pos<scalar @insertPos)) { #as long as the TA site is foun
|
|
515 my $res=0;
|
|
516 if ($genPos>$insertPos[$pos]){
|
|
517 push @unmatched,$insertPos[$pos];
|
|
518 $unmatchedCount++;
|
|
519 $pos++;
|
|
520 }
|
|
521 if ($genPos==$insertPos[$pos]){
|
|
522 $res=1;
|
|
523 $countInsert++;
|
|
524 $pos++;
|
|
525 }
|
|
526 my @sites=($genPos,$res);
|
|
527 push @selector,$res;
|
|
528 #push the 0 or 1 onto the array @selector---going to use this to draw random sets for the null distribution
|
|
529 push (@allTAsites,\@sites);
|
|
530 $offset = $genPos + 1;
|
|
531 $genPos = index($fasta, 'TA', $offset);
|
|
532 $countTA++;
|
|
533 }
|
|
534
|
|
535 #my $FILE1 = "$run" . "allTAsites.txt";
|
|
536 my $FILE1 = $f1;
|
|
537 open (ALL_TA, ">", $FILE1);
|
|
538 foreach my $sit(@allTAsites){
|
|
539 foreach (@$sit){
|
|
540 print ALL_TA $_, "\t";
|
|
541 }
|
|
542 printf ALL_TA "\n";
|
|
543 }
|
|
544 close ALL_TA;
|
|
545
|
|
546 my $FILE2 = $f2; #$run . "unmatched.txt";
|
|
547 unless(open UNM, ">", $FILE2){
|
|
548 die "\nUnable to create $FILE2:\n$!";
|
|
549 }
|
|
550 foreach (@unmatched){
|
|
551 print UNM $_, "\n";
|
|
552 }
|
|
553 close UNM;
|
|
554
|
|
555 print "\n\nTotal of unmatched insertions: $unmatchedCount\n";
|
|
556 print "See unmatched.txt for genome indices of unmatched sites\n";
|
|
557 print "See allTAsites.txt for details on all TA sites and insertions\n\n";
|
|
558
|
|
559 #print "\nTotal: $countInsert insertions in $countTA TA sites.\n";
|
|
560
|
|
561 #--------------------------------------------------------------------------------------------------
|
|
562
|
|
563 #Now, have an array for each TA site and if an insertion occurred there.
|
|
564 #So per site @sites=(position, 0 or 1 for insertion).
|
|
565 #Next step, create null distribution of 10,000 random sets with same number of
|
|
566 #TA sites as the window and then calculate p-value
|
|
567
|
|
568 #SUBROUTINE FOR MAKING THE NULL DISTRIBUTION SPECIFIC TO THE WINDOW
|
|
569
|
|
570 #DISTRIBUTION'S STANDARD DEVIATION
|
|
571 my $N=10000;
|
|
572
|
|
573 sub mean {
|
|
574 if (scalar @_ ==0){
|
|
575 return 0;
|
|
576 }
|
|
577 my $sum;
|
|
578 grep { $sum += $_ } @_;
|
|
579 return $sum/@_;
|
|
580 }
|
|
581 sub stdev{
|
|
582 my @data = @{shift @_};
|
|
583 my $average=shift @_;
|
|
584 my $sqtotal = 0;
|
|
585 foreach(@data) {
|
|
586 $sqtotal += ($average-$_) ** 2;
|
|
587 }
|
|
588 my $std = ($sqtotal / ($N-1)) ** 0.5;
|
|
589 return $std;
|
|
590 }
|
|
591
|
|
592 #MAKE LIBRARY OF NULL DISTRIBUTIONS:
|
|
593 print "Making a library of null distributions.\n For information on distribution library, see nullDist.txt\n";
|
|
594
|
|
595 my @distLib;
|
|
596
|
|
597 my $FILE3 = $f3; #$run . "nullDist.txt";
|
|
598 unless(open DIST, ">", $FILE3){
|
|
599 die "\nUnable to create $FILE3:\n$!";
|
|
600 }
|
|
601 printf DIST "Sites\tSize(n)\tMin\tMax\tMean\tstdev\tvar\tMinPval\n";
|
|
602
|
|
603 #Loop once for each distribution in the library
|
|
604 #(i.e. distribution of 10,000 sites each with 35 TA sites, then a distribution of 10,000 sites each with 36 TA sites, etc)
|
|
605
|
|
606 sub pvalue{
|
|
607
|
|
608 #takes in window count average (countAvg) and number of TAsites and makes a null distribution to calculate the pvalue, which it returns
|
|
609 my $mean=shift@_;
|
|
610 my $TAsites=shift@_;
|
|
611 my $N=10000;
|
|
612 my @specDist=@{$distLib[$TAsites-1]};
|
|
613 my $rank= binsearch_pos { $a cmp $b } $mean,@specDist;
|
|
614 my $i=$rank;
|
|
615 while ($i<scalar(@specDist)-1 and $specDist[$i+1]==$specDist[$rank]){
|
|
616 $i++;
|
|
617 }
|
|
618 $rank=$i;
|
|
619 my $pval=$rank/$N; #calculate pval as rank/N
|
|
620 return $pval;
|
|
621
|
|
622 }
|
|
623
|
|
624 for (my $sitez=1; $sitez<=$max;$sitez++){
|
|
625 #print "In the first for loop to make a null distribution\n";
|
|
626 my @unsorted;
|
|
627 my $count=0;
|
|
628 my $sum=0;
|
|
629
|
|
630 for (my $i=1; $i<=$N; $i++){
|
|
631 my @random_set = rand_set( set => \@selector, size => $sitez);
|
|
632 my $setMean=mean(@random_set);
|
|
633 push (@unsorted, $setMean);
|
|
634 #print "$i:\t", "$setMean\n";
|
|
635 $count++;
|
|
636 $sum+=$setMean;
|
|
637 }
|
|
638 my @nullDist= sort { $a <=> $b } @unsorted;
|
|
639 my $nullMean=sprintf("$round",($sum/$count));
|
|
640 my $standev =sprintf("$round",stdev(\@nullDist, $nullMean));
|
|
641 my $variance=sprintf("$round",($standev*$standev));
|
|
642 my $min=sprintf("$round",$nullDist[0]);
|
|
643 my $max=sprintf("$round",$nullDist[scalar @nullDist-1]);
|
|
644 my $setScalar=scalar @nullDist;
|
|
645 push (@distLib,\@nullDist);
|
|
646 my $minp=pvalue(0,$sitez);
|
|
647 printf DIST "$sitez\t$N\t$min\t$max\t$nullMean\t$standev\t$variance\t$minp\n";
|
|
648 }
|
|
649 close DIST;
|
|
650
|
|
651 #SUBROUTINE TO CALCULATE THE P-VALUE OF THE WINDOW INSERTIONS AGAINST THE NULL DISTRIBUTION
|
|
652 #---------------------------------------------------------------------------------------------
|
|
653
|
|
654 #Now we have an array called @allTAsites which contains every TAsite position
|
|
655 #with a 0 next to it for "no insertion".
|
|
656 #Now just need to replace 0 with 1 if there IS and insertion at that site
|
|
657
|
|
658 my @newWindows=();
|
|
659 my $printNum=0;
|
|
660
|
|
661 #my $allWindows=\@allWindows;
|
|
662 for (my $i=0;$i<scalar @allWindows;$i++){
|
|
663 my @win=@{$allWindows[$i]};
|
|
664 my $starter=$win[0];
|
|
665 my $ender=$win[1];
|
|
666 my $insertions=$win[3];
|
|
667 my $mutcount=$win[2];
|
|
668 #print "num $printNum -->\tStart pos: $starter\tEnd pos: $ender\n";
|
|
669 #How many TA sites are there from $genome[$start] to $genome[$end]?
|
|
670
|
|
671 my $seq = substr($fasta,$starter-1,$size); #start-1 becase $start and $end are positions in genome starting at 1,2,3.... substr(string,start, length) needs indexes
|
|
672 my $ta="TA";
|
|
673 my @c = $seq =~ /$ta/g;
|
|
674 my $TAsites = scalar @c;
|
|
675 my $avgInsert=sprintf("$round",$insertions/$TAsites);
|
|
676 my $pval=pvalue($avgInsert,$TAsites);
|
|
677
|
|
678 #reorder so things make more sense: all fitness related calcs together and all sig together
|
|
679 my @new=($starter,$ender,$mutcount,$insertions,$TAsites,$avgInsert,$pval,$win[5],$win[6],$win[7],$win[8],$win[4]);
|
|
680
|
|
681 push (@newWindows,\@new);
|
|
682 $printNum++;
|
|
683 }
|
|
684
|
|
685
|
|
686 print "End p-value calculation: ",get_time(),"\n\n";
|
|
687
|
|
688 #print "This is the TA count: $count\nTotal genome size is: $total\n\n";
|
|
689
|
|
690 #CALCULATE THE ABSOLUTE DIFFERENCE BETWEEN REGION'S MEAN FITNESS AND AVERAGE MEAN FITNESS
|
|
691 ### For all windows, add a field that has the difference between that window's mean fitness and the
|
|
692 #average mean fitness for all of the windows
|
|
693
|
|
694 my @allFits = map $_->[7], @newWindows;
|
|
695 my $meanFit=mean(@allFits);
|
|
696 print "Average fitness for all windows: ",$meanFit,"\n";
|
|
697 my @expWindows=();
|
|
698 foreach (@newWindows){
|
|
699 my @entry=@{$_};
|
|
700 my $mean=$entry[7];
|
|
701 my $absdev=sprintf("$round",$mean-$meanFit);
|
|
702 push (@entry,$absdev);
|
|
703 push @expWindows,\@entry;
|
|
704 }
|
|
705
|
|
706 @newWindows=@expWindows;
|
|
707
|
|
708 #MAKE OUTPUT CSV FILE WITH ESSENTIAL WINDOW CALCULATIONS
|
|
709
|
|
710 print "Start csv ouput file creation: ",get_time(),"\n";
|
|
711
|
|
712 open CSV, '>', $f3; # $run . "slidingWindows.csv" or die "Cannot open sliding window output file";
|
|
713
|
|
714 my @csvHeader= ("start", "end","mutants","insertions","TA_sites","ratio","p-value","average", "variance","stdev","stderr","genes","fit-mean");
|
|
715 my $header=join(",",@csvHeader);
|
|
716 print CSV $header,"\n";
|
|
717 foreach my $winLine(@newWindows){
|
|
718 my $string=join(",",@{$winLine});
|
|
719 print CSV $string, "\n";
|
|
720 }
|
|
721 close CSV;
|
|
722 print "End csv ouput file creation: ",get_time(),"\n\n";
|
|
723
|
|
724 my $in = Bio::SeqIO->new(-file=>$ref, -format => 'genbank');
|
|
725 my $refseq = $in->next_seq;
|
|
726 my $refname = $refseq->id;
|
|
727
|
|
728 #MAKE essentials WIG FILE---->later make BW--->IGV
|
|
729 sub printwig{
|
|
730 print "Start wig file creation: ",get_time(),"\n";
|
|
731 my $in = Bio::SeqIO->new(-file=>$ref, -format => 'genbank');
|
|
732 my $refseq = $in->next_seq;
|
|
733 my $refname = $refseq->id;
|
|
734 my $ewig="essentialWindows.wig";
|
|
735 my $FILE5 = $run . $ewig;
|
|
736 open eWIG, ">", $f5; #$FILE5";
|
|
737 printf eWIG "track type=wiggle_0 name=$ewig\n";
|
|
738 printf eWIG "variableStep chrom=$refname\n";
|
|
739 foreach my $wigLine(@newWindows){
|
|
740 my @wigFields=@$wigLine;
|
|
741 my $position=$wigFields[0];
|
|
742 while ($position<=$wigFields[1]){
|
|
743 printf eWIG $position, " ",$wigFields[7],"\n"; #7 for pvalue, but 2 for fitness!!!!!!
|
|
744 $position=$position+1;
|
|
745 }
|
|
746 }
|
|
747 close eWIG;
|
|
748 print "End wig file creation: ",get_time(),"\n\n";
|
|
749 print "If this wig file needs to be converted to a Big Wig, then use USCS program wigToBigWig in terminal: \n \t./wigToBigWig gview/12G.wig organism.txt BigWig/output.bw \n\n";
|
|
750 }
|
|
751
|
|
752
|
|
753 #OUTPUT REGULAR SLIDING INFORMATION WITHOUT ESSENTIALS CALCULATIONS (P-VALUES)
|
|
754
|
|
755
|
|
756 #MAKE OUTPUT CSV FILE WITH FITNESS WINDOW CALCULATIONS
|
|
757 print "Start csv ouput file creation: ",get_time(),"\n";
|
|
758 open FITFILE, ">", $f4; #$run . "fitWindows.csv" or die "Failed to open file";
|
|
759 my @head = ("start", "end","fitness","mutant_count", "\n");
|
|
760 print FITFILE join(",",@head); #header
|
|
761 foreach my $winLine(@allWindows){
|
|
762 print FITFILE join(",", @$winLine),"\n";
|
|
763 }
|
|
764 close FITFILE;
|
|
765 print "End csv ouput file creation: ",get_time(),"\n\n";
|
|
766
|
|
767 #MAKE WIG FILE---->later make BW--->IGV
|
|
768 print "Start wig file creation: ",get_time(),"\n";
|
|
769 my $fin = Bio::SeqIO->new(-file=>$ref, -format => 'genbank');
|
|
770 $refseq = $fin->next_seq;
|
|
771 $refname = $refseq->id;
|
|
772 my $fwig="fitWindows.wig";
|
|
773 open fWIG, ">", $f5; #$run . $fwig;
|
|
774 print fWIG "track type=wiggle_0 name=$fwig\n";
|
|
775 print fWIG "variableStep chrom=$refname\n";
|
|
776 foreach my $wigLine(@allWindows){
|
|
777 my @wigFields=@$wigLine;
|
|
778 my $pos=$wigFields[0];
|
|
779 my $fit=$wigFields[7];
|
|
780 print fWIG $pos," ",$fit,"\n";
|
|
781 }
|
|
782 close fWIG;
|
|
783 print "End wig file creation: ",get_time(),"\n\n";
|
|
784 print "If this wig file needs to be converted to a Big Wig,\n";
|
|
785 print " then use USCS program wigToBigWig in terminal: \n";
|
|
786 print "\t./wigToBigWig gview/12G.wig organism.txt BigWig/output.bw \n\n";
|
|
787
|
|
788
|
|
789 #GOING TO MAKE A TEXT FILE FOR BED CONVERSION TO BIGBED, NEED CHROM # IN COLUMN 0
|
|
790
|
|
791 my @cummulative;
|
|
792
|
|
793 #MAKING A REGULAR TEXT FILE fields: [chrom,start,end,fitness,count]
|
|
794
|
|
795 print "Start text file creation time: ",get_time(),"\n";
|
|
796 open my $TXT, '>', $f6; #$run . "fitWindows.txt" or die $!;
|
|
797 print $TXT ("start","end","W","mutant_count","insertion_count\n");
|
|
798 print $TXT (join("\t",@$_),"\n") for @allWindows;
|
|
799 close $TXT;
|
|
800 print "End text file creation: ",get_time(),"\n\n";
|
|
801
|
|
802
|