annotate Perl/CalculateCogEnrichment.pl @ 12:38c66e401040 draft

Uploaded
author dereeper
date Thu, 30 May 2024 19:59:16 +0000
parents e42d30da7a74
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
3 use strict;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
4
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
5 my $pav_matrix = $ARGV[0];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
6 my $cog_of_clusters = $ARGV[1];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
7 my $output = $ARGV[2];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
8
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
9 my %categories = ("Unknown"=>1);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
10 my %cog_categories;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
11 my %cogs_for_cluster;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
12 open(COG,$cog_of_clusters);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
13 while(<COG>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
14 my $line = $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
15 $line =~s/\n//g;$line =~s/\r//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
16 my ($cluster,$cog,$category) = split("\t",$line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
17 #if (!$cogs_for_cluster{$cluster}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
18 $cogs_for_cluster{$cluster} .= "$cog,";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
19 #}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
20 $cog_categories{$cog} = $category;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
21 $categories{$category}=1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
22 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
23 close(COG);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
24
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
25 my %hash;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
26 my %hash2;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
27 open(F,$pav_matrix);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
28 my $first_line = <F>;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
29 $first_line =~s/\n//g;$first_line =~s/\r//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
30 my @samples = split(/\t/,$first_line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
31 my $nb_total = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
32 my $nb_core = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
33 my $nb_accessory = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
34 print C "Cluster\tFunction\tCOG\tCOG categories\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
35 print D "Cluster\tFound in N strains\tFunction\tCOG\tCOG categories\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
36 while(<F>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
37 my @i = split("\t",$_);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
38 my $cluster_num = $i[0];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
39 my $clnb = $i[0];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
40 my $pos = $clnb;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
41 my $pos_before = $pos - 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
42 my $nb_found = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
43 my $concatenate_samples = "";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
44 my $strain_found = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
45 my %functions_of_genes;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
46 for (my $j = 1; $j <= $#i; $j++){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
47 my $val = $i[$j];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
48 if ($val =~/\w+/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
49 my @genes = split(",",$val);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
50 $nb_found++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
51 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
52 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
53 # core-genes
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
54 my $type = "";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
55 if ($nb_found == (scalar @samples - 1)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
56 $type = "core";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
57 $nb_core++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
58 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
59 # dispensables
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
60 else{
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
61 $type = "accessory";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
62 $nb_accessory++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
63 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
64 my @cogs = split(/,/,$cogs_for_cluster{$clnb});
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
65 my %cat;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
66 foreach my $cog(@cogs){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
67 if ($cog =~/\w+/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
68 my $category = $cog_categories{$cog};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
69 $cat{$category}=1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
70 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
71 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
72 if (!$cogs_for_cluster{$clnb}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
73 $cat{"Unknown"}=1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
74 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
75
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
76 foreach my $category(keys(%categories)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
77 if ($cat{$category}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
78 $hash{$category}{$type}++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
79 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
80 else{
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
81 $hash{"not in $category"}{$type}++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
82 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
83 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
84 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
85 close(F);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
86
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
87 open(OUT,">$output");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
88 my %results;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
89 foreach my $category(keys(%categories)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
90 my $n1 = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
91 if ($hash{$category}{"core"}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
92 $n1 = $hash{$category}{"core"};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
93 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
94 my $n2 = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
95 if ($hash{$category}{"accessory"}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
96 $n2 = $hash{$category}{"accessory"};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
97 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
98 my $n3 = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
99 if ($hash{"not in $category"}{"core"}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
100 $n3 = $hash{"not in $category"}{"core"};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
101 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
102 my $n4 = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
103 if ($hash{"not in $category"}{"accessory"}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
104 $n4 = $hash{"not in $category"}{"accessory"};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
105 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
106
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
107 # core
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
108 open(R,">$pav_matrix.$category.core.script.R");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
109 print R "(tab <- matrix(c($n1, $n2, $n3, $n4), nrow=2))\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
110 my $ratio1 = $n1/$n3;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
111 my $ratio2 = $n2/$n4;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
112 print OUT "core $category $n1 $n2 $n3 $n4 $ratio1 $ratio2\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
113 print R "fisher.test(tab)\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
114 close(R);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
115 system("Rscript $pav_matrix.$category.core.script.R >$pav_matrix.$category.core.script.R.out");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
116 open(O,"$pav_matrix.$category.core.script.R.out");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
117 my $go_odds = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
118 while(<O>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
119 if (/p-value [=<] (.*)/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
120 my $pvalue = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
121 $results{$category}{"core"}{"pval"} = $pvalue;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
122 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
123 if (/odds/){$go_odds = 1;}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
124 if (/([\d\.]*)/ && $go_odds){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
125 my $odds = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
126
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
127 $results{$category}{"core"}{"odds_ratio"} = $odds;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
128 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
129 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
130 close(O);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
131
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
132 if ($category ne 'Unknown'){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
133 unlink("$pav_matrix.$category.core.script.R.out");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
134 unlink("$pav_matrix.$category.core.script.R");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
135 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
136
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
137 # dispensable
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
138 my $n1 = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
139 if ($hash{$category}{"accessory"}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
140 $n1 = $hash{$category}{"accessory"};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
141 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
142 my $n2 = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
143 if ($hash{$category}{"core"}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
144 $n2= $hash{$category}{"core"};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
145 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
146 my $n3 = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
147 if ($hash{"not in $category"}{"accessory"}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
148 $n3 = $hash{"not in $category"}{"accessory"};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
149 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
150 my $n4 = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
151 if ($hash{"not in $category"}{"core"}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
152 $n4 = $hash{"not in $category"}{"core"};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
153 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
154
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
155 open(R,">$pav_matrix.$category.accessory.script.R");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
156 print R "(tab <- matrix(c($n1, $n3, $n2, $n4), nrow=2))\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
157 print R "fisher.test(tab)\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
158 close(R);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
159 print "dispensable, $category: $n1, $n2, $n3, $n4\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
160 system("Rscript $pav_matrix.$category.accessory.script.R >$pav_matrix.$category.accessory.script.R.out");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
161 open(O,"$pav_matrix.$category.accessory.script.R.out");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
162 my $go_odds = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
163 while(<O>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
164 if (/p-value [=<] (.*)/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
165 my $pvalue = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
166 $results{$category}{"accessory"}{"pval"} = $pvalue;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
167 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
168 if (/odds/){$go_odds = 1;}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
169 if (/([\d\.]*)/ && $go_odds){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
170 my $odds = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
171 $results{$category}{"accessory"}{"odds_ratio"} = $odds;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
172 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
173 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
174 close(O);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
175 if ($category ne 'Unknown'){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
176 unlink("$pav_matrix.$category.accessory.script.R");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
177 unlink("$pav_matrix.$category.accessory.script.R.out");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
178 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
179 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
180
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
181 foreach my $category(keys(%results)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
182 print "$category:";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
183 print " ".$results{$category}{"core"}{"odds_ratio"};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
184 print " ".$results{$category}{"core"}{"pval"};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
185 print " ".$results{$category}{"accessory"}{"odds_ratio"};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
186 print " ".$results{$category}{"accessory"}{"pval"};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
187 print "\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
188 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
189 close(OUT);