comparison dataOverview.pl @ 3:80205e898861 draft default tip

New updates
author antmarge
date Tue, 02 May 2017 21:20:54 -0400
parents b66f4a551e25
children
comparison
equal deleted inserted replaced
2:3ed885628c9f 3:80205e898861
282 print OUT "$sat%\tSaturation of TA sites after cutoff filter (validInsertions/TAsites)\n"; 282 print OUT "$sat%\tSaturation of TA sites after cutoff filter (validInsertions/TAsites)\n";
283 print OUT "$inscov%\tGenome coverage by insertions (validInsertions/genomeSize)\n"; 283 print OUT "$inscov%\tGenome coverage by insertions (validInsertions/genomeSize)\n";
284 print OUT "$tacov%\tGenome coverage by TA sites (TAsites/genomeSize)\n"; 284 print OUT "$tacov%\tGenome coverage by TA sites (TAsites/genomeSize)\n";
285 print OUT "$lg_dist_ta\tLargest distance between TA sites\n"; 285 print OUT "$lg_dist_ta\tLargest distance between TA sites\n";
286 print OUT "$lg_dist_ins\tLargest distance between insertions\n"; 286 print OUT "$lg_dist_ins\tLargest distance between insertions\n";
287 print OUT "\n\nOpen Reading Frames\n\n";
288 287
289 #Store everything to be print OUTed in array 288 #Store everything to be print OUTed in array
290 my @table; 289 my @table;
291
292 #Find open reading frames from fasta file
293 local $_ = $fasta;
294 my @orfSize;
295 my @allc; #numbers of TAs in the ORFS here.
296 my $blank=0; #ORFS that don't have any TA sites.
297 my $orfCount=0; #keep track of the number of ORFs found.
298 my $minSize=0;
299 #Read somewhere that 99 is a good min but there is an annotated 86 bp gene for 19F
300 while ( /ATG/g ) {
301 my $start = pos() - 3;
302 if ( /T(?:AA|AG|GA)/g ) {
303 my $stop = pos;
304 my $size=$stop - $start;
305 if ($size>=$minSize){
306 push (@orfSize,$size);
307 my $seq=substr ($_, $start, $stop - $start);
308 my @ctemp = $seq =~ /$x/g;
309 my $countTA = @ctemp;
310 if ($countTA==0){$blank++}
311 push (@allc,$countTA);
312 $orfCount++;
313 }
314 }
315 }
316
317 print OUT "\nORFs based on Fasta sequence and start (ATG) and end (TAA,TAG,TGA) codons\n";
318 push (@table,["Set minimum size for an ORF",$minSize]);
319 print OUT "$orfCount\tTotal number of ORFs found\n";
320 my ($minORF, $maxORF) = minmax(@orfSize);
321 print OUT "$minORF\tSmallest ORF\n";
322 print OUT "$maxORF\tLargest ORF\n";
323 my ($mintaORF,$maxtaORF) = minmax(@allc);
324 print OUT "$mintaORF\tFewest # TA sites in an ORF\n";
325 print OUT "$maxtaORF\tGreatest # TA sites in an ORF\n";
326 print OUT "$blank\tNumber of ORFs that don't have any TA sites\n";
327
328 290
329 print OUT "\nGenes using the genbank annotation file\n\n"; 291 print OUT "\nGenes using the genbank annotation file\n\n";
330 ###Get genbank file. Find all start and stop for genes 292 ###Get genbank file. Find all start and stop for genes
331 #See how many insertions fall into genes vs intergenic regions 293 #See how many insertions fall into genes vs intergenic regions
332 #Get array of coordinates for all insertions then remove insertion if it is 294 #Get array of coordinates for all insertions then remove insertion if it is