annotate PanExplorer_workflow/Perl/Naegleria/calculateFeatureDensitiesFromGFF.pl @ 2:97e4e3e818b6 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:48:09 +0000
parents 032f6b3806a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
#!/usr/bin/perl

# calculateFeatureDensitiesFromGFF.pl
#
# Usage: calculateFeatureDensitiesFromGFF.pl <annotation.gff> <window_size>
#
# Cuts every sequence named in column 1 of a tab-separated GFF file into
# fixed-size windows and writes four files into the current directory. Each
# output line is "<seqid> <window_start> <window_end> <value>" (space
# separated):
#
#   gene_counts.txt     number of "gene" features whose start lies in the window
#   gene_density.txt    fraction of the window's bases covered by "gene" features
#   exon_density.txt    fraction of the window's bases covered by "CDS" features
#   repeat_density.txt  fraction of the window's bases covered by repeats
#                       (feature "repeats", or "match" from source "repeatmasker")
#
# Window w (numbered from 1) owns positions (w-1)*size .. w*size-1 and is
# reported as start=(w-1)*size, end=w*size. Only windows touched by at least
# one retained feature (gene, CDS, exon, *UTR*, repeats) are reported.
# Overlapping features of the same type are counted once per base.

use strict;
use warnings;
use Scalar::Util qw(looks_like_number);

# Sort comparator for sequence names: all-digit names first in numeric order,
# every other name afterwards in string order. A plain numeric comparison
# treats names such as "Chr13" as 0 and leaves their order to chance.
sub by_chromosome {
    my $a_is_num = ($a =~ /\A\d+\z/);
    my $b_is_num = ($b =~ /\A\d+\z/);
    return (($a <=> $b) || ($a cmp $b)) if $a_is_num && $b_is_num;
    return -1 if $a_is_num;
    return 1  if $b_is_num;
    return $a cmp $b;
}

# Returns the fraction of a window of $size bases present as keys in the
# hash reference $bases; 0 when no base of that feature type was recorded.
sub covered_fraction {
    my ($bases, $size) = @_;
    return 0 unless $bases;
    # Count the keys explicitly: a hash in scalar context is only a key count
    # from Perl 5.26 on (older versions return a bucket-usage string).
    return scalar(keys %$bases) / $size;
}

# Opens $path for writing and returns the handle; dies if that fails.
sub open_output {
    my ($path) = @_;
    open(my $fh, '>', $path) or die "Cannot write '$path': $!\n";
    return $fh;
}

my ($gff, $window_size) = @ARGV;
if (   !defined $gff
    || !defined $window_size
    || !looks_like_number($window_size)
    || $window_size < 1
    || $window_size != int($window_size))
{
    die "Usage: $0 <annotation.gff> <window_size (positive integer)>\n";
}
$window_size += 0;    # normalise forms such as "1e6" to a plain number

my %starts_in;    # {seqid}{window}{feature}           => features starting in the window
my %covered;      # {seqid}{window}{feature}{position} => 1 for every covered base

open(my $in, '<', $gff) or die "Cannot open GFF file '$gff': $!\n";
while (my $line = <$in>) {
    next if $line =~ /^#/;
    $line =~ s/\r?\n\z//;

    my ($chr, $source, $feature, $start, $end) = split(/\t/, $line);

    # Skip anything that is not a usable feature record: blank or short lines,
    # non-numeric coordinates, or an end lying before the start.
    next unless defined $end;
    next unless $start =~ /\A\d+\z/ && $end =~ /\A\d+\z/ && $start <= $end;

    # RepeatMasker "match" records are reported under the "repeats" type.
    if ($source eq "repeatmasker" && $feature eq "match") {
        $feature = "repeats";
    }
    next
        unless $feature eq "gene"
        || $feature eq "CDS"
        || $feature eq "repeats"
        || $feature eq "exon"
        || $feature =~ /UTR/;

    my $first_window = int($start / $window_size) + 1;
    my $last_window  = int($end / $window_size) + 1;

    # A feature is counted once, in the window that contains its start.
    $starts_in{$chr}{$first_window}{$feature}++;

    # Record each covered base in the window that owns it. The bounds are
    # clipped to (w-1)*size .. w*size-1 so that no base is attributed to two
    # windows and a fully covered window holds exactly $window_size bases.
    for my $window ($first_window .. $last_window) {
        my $window_from = ($window - 1) * $window_size;
        my $window_to   = $window * $window_size - 1;
        my $from = $start > $window_from ? $start : $window_from;
        my $to   = $end < $window_to     ? $end   : $window_to;
        for my $pos ($from .. $to) {
            $covered{$chr}{$window}{$feature}{$pos} = 1;
        }
    }
}
close($in);

my $gene_counts_fh    = open_output("gene_counts.txt");
my $gene_density_fh   = open_output("gene_density.txt");
my $exon_density_fh   = open_output("exon_density.txt");
my $repeat_density_fh = open_output("repeat_density.txt");

for my $chr (sort by_chromosome keys %covered) {
    for my $window (sort { $a <=> $b } keys %{ $covered{$chr} }) {
        my $features = $covered{$chr}{$window};

        my $proportion_gene   = covered_fraction($features->{"gene"},    $window_size);
        # The "exon" density file is computed from CDS features.
        my $proportion_exon   = covered_fraction($features->{"CDS"},     $window_size);
        my $proportion_repeat = covered_fraction($features->{"repeats"}, $window_size);

        my $start = ($window - 1) * $window_size;
        my $end   = $window * $window_size;
        my $gene_count = $starts_in{$chr}{$window}{"gene"} || 0;

        print {$exon_density_fh}   "$chr $start $end $proportion_exon\n";
        print {$gene_density_fh}   "$chr $start $end $proportion_gene\n";
        print {$repeat_density_fh} "$chr $start $end $proportion_repeat\n";
        print {$gene_counts_fh}    "$chr $start $end $gene_count\n";
    }
}

# Buffered write errors only surface at close, so check every output handle.
close($gene_density_fh)   or die "Error closing 'gene_density.txt': $!\n";
close($exon_density_fh)   or die "Error closing 'exon_density.txt': $!\n";
close($repeat_density_fh) or die "Error closing 'repeat_density.txt': $!\n";
close($gene_counts_fh)    or die "Error closing 'gene_counts.txt': $!\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
117