1
|
1 #!/usr/bin/perl
|
|
2
|
|
3 use strict;
|
|
4
|
|
5 my $gff = $ARGV[0];
|
|
6 my $window_size = $ARGV[1];
|
|
7
|
|
8 my %counts;
|
|
9 my %hash;
|
|
10 open(F,$gff);
|
|
11 while(<F>){
|
|
12 if (/^#/){next;}
|
|
13 my @infos = split("\t",$_);
|
|
14 my $chr = $infos[0];
|
|
15 #$chr =~s/chr//g;
|
|
16 my $source = $infos[1];
|
|
17 my $feature = $infos[2];
|
|
18 my $start = $infos[3];
|
|
19 my $end = $infos[4];
|
|
20 my $window_num = int($start/$window_size)+1;
|
|
21 my $window_num2 = int($end/$window_size)+1;
|
|
22 if ($source eq "repeatmasker" && $feature eq "match"){
|
|
23 $feature = "repeats";
|
|
24 }
|
|
25 if ($feature ne "gene" && $feature ne "CDS" && $feature ne "repeats" && $feature !~/UTR/ && $feature ne "exon"){next;}
|
|
26 $counts{$chr}{$window_num}{$feature}++;
|
|
27 if ($window_num == $window_num2){
|
|
28 for (my $i = $start; $i <= $end; $i++){
|
|
29 $hash{$chr}{$window_num}{$feature}{$i} = 1;
|
|
30 }
|
|
31 }
|
|
32 else{
|
|
33 for (my $i = $start; $i <= ($window_num*$window_size); $i++){
|
|
34 $hash{$chr}{$window_num}{$feature}{$i} = 1;
|
|
35 }
|
|
36 for (my $j = ($window_num2-1)*$window_size; $j <= $end; $j++){
|
|
37 $hash{$chr}{$window_num2}{$feature}{$j} = 1;
|
|
38 }
|
|
39 if (($window_num2 - $window_num) > 1){
|
|
40 for (my $window_num_new = $window_num + 1; $window_num_new <= $window_num2 - 1; $window_num_new++){
|
|
41 for (my $j = (($window_num_new-1)*$window_size); $j <= ($window_num_new*$window_size); $j++){
|
|
42 $hash{$chr}{$window_num_new}{$feature}{$j} = 1;
|
|
43 }
|
|
44 }
|
|
45 }
|
|
46
|
|
47 }
|
|
48 }
|
|
49 close(F);
|
|
50
|
|
51 #my $refhash2 = $hash{"Chr13"}{"1"}{"exon"};
|
|
52 #my %subhash2 = %$refhash2;
|
|
53 #print scalar %subhash2;
|
|
54 #exit;
|
|
55
|
|
56 open(G2,">gene_counts.txt");
|
|
57 open(G,">gene_density.txt");
|
|
58 open(I,">exon_density.txt");
|
|
59 open(R,">repeat_density.txt");
|
|
60 foreach my $chr(sort {$a<=>$b} keys(%hash)){
|
|
61 #print G "$chr 0 1 0\n";
|
|
62 #print G "$chr 1 2 1\n";
|
|
63 #print I "$chr 0 1 0\n";
|
|
64 #print I "$chr 0 1 0\n";
|
|
65 my $refhash = $hash{$chr};
|
|
66 my %subhash = %$refhash;
|
|
67 foreach my $window_num(sort {$a<=>$b} keys(%subhash)){
|
|
68
|
|
69 # genes
|
|
70 my $proportion_gene = 0;
|
|
71 if ($hash{$chr}{$window_num}{"gene"}){
|
|
72 my $refhash2 = $hash{$chr}{$window_num}{"gene"};
|
|
73 my %subhash2 = %$refhash2;
|
|
74 $proportion_gene = scalar %subhash2 / $window_size;
|
|
75 if ($proportion_gene > 1.1){
|
|
76 #print "$chr $window_num $proportion_gene ".scalar %subhash2."\n";exit;
|
|
77 }
|
|
78 }
|
|
79
|
|
80 # exon
|
|
81 my $proportion_exon = 0;
|
|
82 if ($hash{$chr}{$window_num}{"CDS"}){
|
|
83 my $refhash2 = $hash{$chr}{$window_num}{"CDS"};
|
|
84 my %subhash2 = %$refhash2;
|
|
85 $proportion_exon = scalar %subhash2 / $window_size;
|
|
86 }
|
|
87
|
|
88 # repeat
|
|
89 my $proportion_repeat = 0;
|
|
90 if ($hash{$chr}{$window_num}{"repeats"}){
|
|
91 my $refhash2 = $hash{$chr}{$window_num}{"repeats"};
|
|
92 my %subhash2 = %$refhash2;
|
|
93 $proportion_repeat = scalar %subhash2 / $window_size;
|
|
94 #$proportion_repeat = $proportion_repeat + $proportion_gene;
|
|
95 }
|
|
96 else{
|
|
97 #$proportion_repeat = $proportion_gene;
|
|
98 $proportion_repeat = 0;
|
|
99 }
|
|
100 my $start = ($window_num-1)*$window_size;
|
|
101 my $end = $window_num*$window_size;
|
|
102 print I "$chr $start $end $proportion_exon\n";
|
|
103 print G "$chr $start $end $proportion_gene\n";
|
|
104 print R "$chr $start $end $proportion_repeat\n";
|
|
105 if ($counts{$chr}{$window_num}{"gene"}){
|
|
106 print G2 "$chr $start $end ".$counts{$chr}{$window_num}{"gene"}."\n";
|
|
107 }
|
|
108 else{
|
|
109 print G2 "$chr $start $end 0\n";
|
|
110 }
|
|
111 }
|
|
112 }
|
|
113 close(G);
|
|
114 close(I);
|
|
115 close(R);
|
|
116 close(G2);
|
|
117
|