Mercurial > repos > dereeper > pangenome_explorer
view PanExplorer_workflow/Perl/Naegleria/calculateFeatureDensitiesFromGFF.pl @ 1:032f6b3806a3 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:16:08 +0000 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl use strict; my $gff = $ARGV[0]; my $window_size = $ARGV[1]; my %counts; my %hash; open(F,$gff); while(<F>){ if (/^#/){next;} my @infos = split("\t",$_); my $chr = $infos[0]; #$chr =~s/chr//g; my $source = $infos[1]; my $feature = $infos[2]; my $start = $infos[3]; my $end = $infos[4]; my $window_num = int($start/$window_size)+1; my $window_num2 = int($end/$window_size)+1; if ($source eq "repeatmasker" && $feature eq "match"){ $feature = "repeats"; } if ($feature ne "gene" && $feature ne "CDS" && $feature ne "repeats" && $feature !~/UTR/ && $feature ne "exon"){next;} $counts{$chr}{$window_num}{$feature}++; if ($window_num == $window_num2){ for (my $i = $start; $i <= $end; $i++){ $hash{$chr}{$window_num}{$feature}{$i} = 1; } } else{ for (my $i = $start; $i <= ($window_num*$window_size); $i++){ $hash{$chr}{$window_num}{$feature}{$i} = 1; } for (my $j = ($window_num2-1)*$window_size; $j <= $end; $j++){ $hash{$chr}{$window_num2}{$feature}{$j} = 1; } if (($window_num2 - $window_num) > 1){ for (my $window_num_new = $window_num + 1; $window_num_new <= $window_num2 - 1; $window_num_new++){ for (my $j = (($window_num_new-1)*$window_size); $j <= ($window_num_new*$window_size); $j++){ $hash{$chr}{$window_num_new}{$feature}{$j} = 1; } } } } } close(F); #my $refhash2 = $hash{"Chr13"}{"1"}{"exon"}; #my %subhash2 = %$refhash2; #print scalar %subhash2; #exit; open(G2,">gene_counts.txt"); open(G,">gene_density.txt"); open(I,">exon_density.txt"); open(R,">repeat_density.txt"); foreach my $chr(sort {$a<=>$b} keys(%hash)){ #print G "$chr 0 1 0\n"; #print G "$chr 1 2 1\n"; #print I "$chr 0 1 0\n"; #print I "$chr 0 1 0\n"; my $refhash = $hash{$chr}; my %subhash = %$refhash; foreach my $window_num(sort {$a<=>$b} keys(%subhash)){ # genes my $proportion_gene = 0; if ($hash{$chr}{$window_num}{"gene"}){ my $refhash2 = $hash{$chr}{$window_num}{"gene"}; my %subhash2 = %$refhash2; $proportion_gene = scalar %subhash2 / $window_size; if ($proportion_gene > 1.1){ #print "$chr $window_num $proportion_gene ".scalar %subhash2."\n";exit; } } # exon my $proportion_exon = 0; if ($hash{$chr}{$window_num}{"CDS"}){ my $refhash2 = $hash{$chr}{$window_num}{"CDS"}; my %subhash2 = %$refhash2; $proportion_exon = scalar %subhash2 / $window_size; } # repeat my $proportion_repeat = 0; if ($hash{$chr}{$window_num}{"repeats"}){ my $refhash2 = $hash{$chr}{$window_num}{"repeats"}; my %subhash2 = %$refhash2; $proportion_repeat = scalar %subhash2 / $window_size; #$proportion_repeat = $proportion_repeat + $proportion_gene; } else{ #$proportion_repeat = $proportion_gene; $proportion_repeat = 0; } my $start = ($window_num-1)*$window_size; my $end = $window_num*$window_size; print I "$chr $start $end $proportion_exon\n"; print G "$chr $start $end $proportion_gene\n"; print R "$chr $start $end $proportion_repeat\n"; if ($counts{$chr}{$window_num}{"gene"}){ print G2 "$chr $start $end ".$counts{$chr}{$window_num}{"gene"}."\n"; } else{ print G2 "$chr $start $end 0\n"; } } } close(G); close(I); close(R); close(G2);