comparison PanExplorer_workflow/Perl/Naegleria/calculateFeatureDensitiesFromGFF.pl @ 1:032f6b3806a3 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:16:08 +0000
parents
children
comparison
equal deleted inserted replaced
0:3cbb01081cde 1:032f6b3806a3
1 #!/usr/bin/perl
2
3 use strict;
4
5 my $gff = $ARGV[0];
6 my $window_size = $ARGV[1];
7
8 my %counts;
9 my %hash;
10 open(F,$gff);
11 while(<F>){
12 if (/^#/){next;}
13 my @infos = split("\t",$_);
14 my $chr = $infos[0];
15 #$chr =~s/chr//g;
16 my $source = $infos[1];
17 my $feature = $infos[2];
18 my $start = $infos[3];
19 my $end = $infos[4];
20 my $window_num = int($start/$window_size)+1;
21 my $window_num2 = int($end/$window_size)+1;
22 if ($source eq "repeatmasker" && $feature eq "match"){
23 $feature = "repeats";
24 }
25 if ($feature ne "gene" && $feature ne "CDS" && $feature ne "repeats" && $feature !~/UTR/ && $feature ne "exon"){next;}
26 $counts{$chr}{$window_num}{$feature}++;
27 if ($window_num == $window_num2){
28 for (my $i = $start; $i <= $end; $i++){
29 $hash{$chr}{$window_num}{$feature}{$i} = 1;
30 }
31 }
32 else{
33 for (my $i = $start; $i <= ($window_num*$window_size); $i++){
34 $hash{$chr}{$window_num}{$feature}{$i} = 1;
35 }
36 for (my $j = ($window_num2-1)*$window_size; $j <= $end; $j++){
37 $hash{$chr}{$window_num2}{$feature}{$j} = 1;
38 }
39 if (($window_num2 - $window_num) > 1){
40 for (my $window_num_new = $window_num + 1; $window_num_new <= $window_num2 - 1; $window_num_new++){
41 for (my $j = (($window_num_new-1)*$window_size); $j <= ($window_num_new*$window_size); $j++){
42 $hash{$chr}{$window_num_new}{$feature}{$j} = 1;
43 }
44 }
45 }
46
47 }
48 }
49 close(F);
50
51 #my $refhash2 = $hash{"Chr13"}{"1"}{"exon"};
52 #my %subhash2 = %$refhash2;
53 #print scalar %subhash2;
54 #exit;
55
56 open(G2,">gene_counts.txt");
57 open(G,">gene_density.txt");
58 open(I,">exon_density.txt");
59 open(R,">repeat_density.txt");
60 foreach my $chr(sort {$a<=>$b} keys(%hash)){
61 #print G "$chr 0 1 0\n";
62 #print G "$chr 1 2 1\n";
63 #print I "$chr 0 1 0\n";
64 #print I "$chr 0 1 0\n";
65 my $refhash = $hash{$chr};
66 my %subhash = %$refhash;
67 foreach my $window_num(sort {$a<=>$b} keys(%subhash)){
68
69 # genes
70 my $proportion_gene = 0;
71 if ($hash{$chr}{$window_num}{"gene"}){
72 my $refhash2 = $hash{$chr}{$window_num}{"gene"};
73 my %subhash2 = %$refhash2;
74 $proportion_gene = scalar %subhash2 / $window_size;
75 if ($proportion_gene > 1.1){
76 #print "$chr $window_num $proportion_gene ".scalar %subhash2."\n";exit;
77 }
78 }
79
80 # exon
81 my $proportion_exon = 0;
82 if ($hash{$chr}{$window_num}{"CDS"}){
83 my $refhash2 = $hash{$chr}{$window_num}{"CDS"};
84 my %subhash2 = %$refhash2;
85 $proportion_exon = scalar %subhash2 / $window_size;
86 }
87
88 # repeat
89 my $proportion_repeat = 0;
90 if ($hash{$chr}{$window_num}{"repeats"}){
91 my $refhash2 = $hash{$chr}{$window_num}{"repeats"};
92 my %subhash2 = %$refhash2;
93 $proportion_repeat = scalar %subhash2 / $window_size;
94 #$proportion_repeat = $proportion_repeat + $proportion_gene;
95 }
96 else{
97 #$proportion_repeat = $proportion_gene;
98 $proportion_repeat = 0;
99 }
100 my $start = ($window_num-1)*$window_size;
101 my $end = $window_num*$window_size;
102 print I "$chr $start $end $proportion_exon\n";
103 print G "$chr $start $end $proportion_gene\n";
104 print R "$chr $start $end $proportion_repeat\n";
105 if ($counts{$chr}{$window_num}{"gene"}){
106 print G2 "$chr $start $end ".$counts{$chr}{$window_num}{"gene"}."\n";
107 }
108 else{
109 print G2 "$chr $start $end 0\n";
110 }
111 }
112 }
113 close(G);
114 close(I);
115 close(R);
116 close(G2);
117