3
|
1 #!/usr/bin/perl
|
|
2
|
|
3 use strict;
|
|
4
|
|
5 my $pav_matrix = $ARGV[0];
|
|
6 my $cog_of_clusters = $ARGV[1];
|
|
7 my $output = $ARGV[2];
|
|
8
|
|
9 my %categories = ("Unknown"=>1);
|
|
10 my %cog_categories;
|
|
11 my %cogs_for_cluster;
|
|
12 open(COG,$cog_of_clusters);
|
|
13 while(<COG>){
|
|
14 my $line = $_;
|
|
15 $line =~s/\n//g;$line =~s/\r//g;
|
|
16 my ($cluster,$cog,$category) = split("\t",$line);
|
|
17 #if (!$cogs_for_cluster{$cluster}){
|
|
18 $cogs_for_cluster{$cluster} .= "$cog,";
|
|
19 #}
|
|
20 $cog_categories{$cog} = $category;
|
|
21 $categories{$category}=1;
|
|
22 }
|
|
23 close(COG);
|
|
24
|
|
25 my %hash;
|
|
26 my %hash2;
|
|
27 open(F,$pav_matrix);
|
|
28 my $first_line = <F>;
|
|
29 $first_line =~s/\n//g;$first_line =~s/\r//g;
|
|
30 my @samples = split(/\t/,$first_line);
|
|
31 my $nb_total = 0;
|
|
32 my $nb_core = 0;
|
|
33 my $nb_accessory = 0;
|
|
34 print C "Cluster\tFunction\tCOG\tCOG categories\n";
|
|
35 print D "Cluster\tFound in N strains\tFunction\tCOG\tCOG categories\n";
|
|
36 while(<F>){
|
|
37 my @i = split("\t",$_);
|
|
38 my $cluster_num = $i[0];
|
|
39 my $clnb = $i[0];
|
|
40 my $pos = $clnb;
|
|
41 my $pos_before = $pos - 1;
|
|
42 my $nb_found = 0;
|
|
43 my $concatenate_samples = "";
|
|
44 my $strain_found = 0;
|
|
45 my %functions_of_genes;
|
|
46 for (my $j = 1; $j <= $#i; $j++){
|
|
47 my $val = $i[$j];
|
|
48 if ($val =~/\w+/){
|
|
49 my @genes = split(",",$val);
|
|
50 $nb_found++;
|
|
51 }
|
|
52 }
|
|
53 # core-genes
|
|
54 my $type = "";
|
|
55 if ($nb_found == (scalar @samples - 1)){
|
|
56 $type = "core";
|
|
57 $nb_core++;
|
|
58 }
|
|
59 # dispensables
|
|
60 else{
|
|
61 $type = "accessory";
|
|
62 $nb_accessory++;
|
|
63 }
|
|
64 my @cogs = split(/,/,$cogs_for_cluster{$clnb});
|
|
65 my %cat;
|
|
66 foreach my $cog(@cogs){
|
|
67 if ($cog =~/\w+/){
|
|
68 my $category = $cog_categories{$cog};
|
|
69 $cat{$category}=1;
|
|
70 }
|
|
71 }
|
|
72 if (!$cogs_for_cluster{$clnb}){
|
|
73 $cat{"Unknown"}=1;
|
|
74 }
|
|
75
|
|
76 foreach my $category(keys(%categories)){
|
|
77 if ($cat{$category}){
|
|
78 $hash{$category}{$type}++;
|
|
79 }
|
|
80 else{
|
|
81 $hash{"not in $category"}{$type}++;
|
|
82 }
|
|
83 }
|
|
84 }
|
|
85 close(F);
|
|
86
|
|
87 open(OUT,">$output");
|
|
88 my %results;
|
|
89 foreach my $category(keys(%categories)){
|
|
90 my $n1 = 0;
|
|
91 if ($hash{$category}{"core"}){
|
|
92 $n1 = $hash{$category}{"core"};
|
|
93 }
|
|
94 my $n2 = 0;
|
|
95 if ($hash{$category}{"accessory"}){
|
|
96 $n2 = $hash{$category}{"accessory"};
|
|
97 }
|
|
98 my $n3 = 0;
|
|
99 if ($hash{"not in $category"}{"core"}){
|
|
100 $n3 = $hash{"not in $category"}{"core"};
|
|
101 }
|
|
102 my $n4 = 0;
|
|
103 if ($hash{"not in $category"}{"accessory"}){
|
|
104 $n4 = $hash{"not in $category"}{"accessory"};
|
|
105 }
|
|
106
|
|
107 # core
|
|
108 open(R,">$pav_matrix.$category.core.script.R");
|
|
109 print R "(tab <- matrix(c($n1, $n2, $n3, $n4), nrow=2))\n";
|
|
110 my $ratio1 = $n1/$n3;
|
|
111 my $ratio2 = $n2/$n4;
|
|
112 print OUT "core $category $n1 $n2 $n3 $n4 $ratio1 $ratio2\n";
|
|
113 print R "fisher.test(tab)\n";
|
|
114 close(R);
|
|
115 system("Rscript $pav_matrix.$category.core.script.R >$pav_matrix.$category.core.script.R.out");
|
|
116 open(O,"$pav_matrix.$category.core.script.R.out");
|
|
117 my $go_odds = 0;
|
|
118 while(<O>){
|
|
119 if (/p-value [=<] (.*)/){
|
|
120 my $pvalue = $1;
|
|
121 $results{$category}{"core"}{"pval"} = $pvalue;
|
|
122 }
|
|
123 if (/odds/){$go_odds = 1;}
|
|
124 if (/([\d\.]*)/ && $go_odds){
|
|
125 my $odds = $1;
|
|
126
|
|
127 $results{$category}{"core"}{"odds_ratio"} = $odds;
|
|
128 }
|
|
129 }
|
|
130 close(O);
|
|
131
|
|
132 if ($category ne 'Unknown'){
|
|
133 unlink("$pav_matrix.$category.core.script.R.out");
|
|
134 unlink("$pav_matrix.$category.core.script.R");
|
|
135 }
|
|
136
|
|
137 # dispensable
|
|
138 my $n1 = 0;
|
|
139 if ($hash{$category}{"accessory"}){
|
|
140 $n1 = $hash{$category}{"accessory"};
|
|
141 }
|
|
142 my $n2 = 0;
|
|
143 if ($hash{$category}{"core"}){
|
|
144 $n2= $hash{$category}{"core"};
|
|
145 }
|
|
146 my $n3 = 0;
|
|
147 if ($hash{"not in $category"}{"accessory"}){
|
|
148 $n3 = $hash{"not in $category"}{"accessory"};
|
|
149 }
|
|
150 my $n4 = 0;
|
|
151 if ($hash{"not in $category"}{"core"}){
|
|
152 $n4 = $hash{"not in $category"}{"core"};
|
|
153 }
|
|
154
|
|
155 open(R,">$pav_matrix.$category.accessory.script.R");
|
|
156 print R "(tab <- matrix(c($n1, $n3, $n2, $n4), nrow=2))\n";
|
|
157 print R "fisher.test(tab)\n";
|
|
158 close(R);
|
|
159 print "dispensable, $category: $n1, $n2, $n3, $n4\n";
|
|
160 system("Rscript $pav_matrix.$category.accessory.script.R >$pav_matrix.$category.accessory.script.R.out");
|
|
161 open(O,"$pav_matrix.$category.accessory.script.R.out");
|
|
162 my $go_odds = 0;
|
|
163 while(<O>){
|
|
164 if (/p-value [=<] (.*)/){
|
|
165 my $pvalue = $1;
|
|
166 $results{$category}{"accessory"}{"pval"} = $pvalue;
|
|
167 }
|
|
168 if (/odds/){$go_odds = 1;}
|
|
169 if (/([\d\.]*)/ && $go_odds){
|
|
170 my $odds = $1;
|
|
171 $results{$category}{"accessory"}{"odds_ratio"} = $odds;
|
|
172 }
|
|
173 }
|
|
174 close(O);
|
|
175 if ($category ne 'Unknown'){
|
|
176 unlink("$pav_matrix.$category.accessory.script.R");
|
|
177 unlink("$pav_matrix.$category.accessory.script.R.out");
|
|
178 }
|
|
179 }
|
|
180
|
|
181 foreach my $category(keys(%results)){
|
|
182 print "$category:";
|
|
183 print " ".$results{$category}{"core"}{"odds_ratio"};
|
|
184 print " ".$results{$category}{"core"}{"pval"};
|
|
185 print " ".$results{$category}{"accessory"}{"odds_ratio"};
|
|
186 print " ".$results{$category}{"accessory"}{"pval"};
|
|
187 print "\n";
|
|
188 }
|
|
189 close(OUT);
|