Mercurial > repos > dereeper > pangenome_explorer
comparison Perl/CalculateCogEnrichment.pl @ 3:e42d30da7a74 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:52:25 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:97e4e3e818b6 | 3:e42d30da7a74 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 use strict; | |
4 | |
5 my $pav_matrix = $ARGV[0]; | |
6 my $cog_of_clusters = $ARGV[1]; | |
7 my $output = $ARGV[2]; | |
8 | |
9 my %categories = ("Unknown"=>1); | |
10 my %cog_categories; | |
11 my %cogs_for_cluster; | |
12 open(COG,$cog_of_clusters); | |
13 while(<COG>){ | |
14 my $line = $_; | |
15 $line =~s/\n//g;$line =~s/\r//g; | |
16 my ($cluster,$cog,$category) = split("\t",$line); | |
17 #if (!$cogs_for_cluster{$cluster}){ | |
18 $cogs_for_cluster{$cluster} .= "$cog,"; | |
19 #} | |
20 $cog_categories{$cog} = $category; | |
21 $categories{$category}=1; | |
22 } | |
23 close(COG); | |
24 | |
25 my %hash; | |
26 my %hash2; | |
27 open(F,$pav_matrix); | |
28 my $first_line = <F>; | |
29 $first_line =~s/\n//g;$first_line =~s/\r//g; | |
30 my @samples = split(/\t/,$first_line); | |
31 my $nb_total = 0; | |
32 my $nb_core = 0; | |
33 my $nb_accessory = 0; | |
34 print C "Cluster\tFunction\tCOG\tCOG categories\n"; | |
35 print D "Cluster\tFound in N strains\tFunction\tCOG\tCOG categories\n"; | |
36 while(<F>){ | |
37 my @i = split("\t",$_); | |
38 my $cluster_num = $i[0]; | |
39 my $clnb = $i[0]; | |
40 my $pos = $clnb; | |
41 my $pos_before = $pos - 1; | |
42 my $nb_found = 0; | |
43 my $concatenate_samples = ""; | |
44 my $strain_found = 0; | |
45 my %functions_of_genes; | |
46 for (my $j = 1; $j <= $#i; $j++){ | |
47 my $val = $i[$j]; | |
48 if ($val =~/\w+/){ | |
49 my @genes = split(",",$val); | |
50 $nb_found++; | |
51 } | |
52 } | |
53 # core-genes | |
54 my $type = ""; | |
55 if ($nb_found == (scalar @samples - 1)){ | |
56 $type = "core"; | |
57 $nb_core++; | |
58 } | |
59 # dispensables | |
60 else{ | |
61 $type = "accessory"; | |
62 $nb_accessory++; | |
63 } | |
64 my @cogs = split(/,/,$cogs_for_cluster{$clnb}); | |
65 my %cat; | |
66 foreach my $cog(@cogs){ | |
67 if ($cog =~/\w+/){ | |
68 my $category = $cog_categories{$cog}; | |
69 $cat{$category}=1; | |
70 } | |
71 } | |
72 if (!$cogs_for_cluster{$clnb}){ | |
73 $cat{"Unknown"}=1; | |
74 } | |
75 | |
76 foreach my $category(keys(%categories)){ | |
77 if ($cat{$category}){ | |
78 $hash{$category}{$type}++; | |
79 } | |
80 else{ | |
81 $hash{"not in $category"}{$type}++; | |
82 } | |
83 } | |
84 } | |
85 close(F); | |
86 | |
87 open(OUT,">$output"); | |
88 my %results; | |
89 foreach my $category(keys(%categories)){ | |
90 my $n1 = 0; | |
91 if ($hash{$category}{"core"}){ | |
92 $n1 = $hash{$category}{"core"}; | |
93 } | |
94 my $n2 = 0; | |
95 if ($hash{$category}{"accessory"}){ | |
96 $n2 = $hash{$category}{"accessory"}; | |
97 } | |
98 my $n3 = 0; | |
99 if ($hash{"not in $category"}{"core"}){ | |
100 $n3 = $hash{"not in $category"}{"core"}; | |
101 } | |
102 my $n4 = 0; | |
103 if ($hash{"not in $category"}{"accessory"}){ | |
104 $n4 = $hash{"not in $category"}{"accessory"}; | |
105 } | |
106 | |
107 # core | |
108 open(R,">$pav_matrix.$category.core.script.R"); | |
109 print R "(tab <- matrix(c($n1, $n2, $n3, $n4), nrow=2))\n"; | |
110 my $ratio1 = $n1/$n3; | |
111 my $ratio2 = $n2/$n4; | |
112 print OUT "core $category $n1 $n2 $n3 $n4 $ratio1 $ratio2\n"; | |
113 print R "fisher.test(tab)\n"; | |
114 close(R); | |
115 system("Rscript $pav_matrix.$category.core.script.R >$pav_matrix.$category.core.script.R.out"); | |
116 open(O,"$pav_matrix.$category.core.script.R.out"); | |
117 my $go_odds = 0; | |
118 while(<O>){ | |
119 if (/p-value [=<] (.*)/){ | |
120 my $pvalue = $1; | |
121 $results{$category}{"core"}{"pval"} = $pvalue; | |
122 } | |
123 if (/odds/){$go_odds = 1;} | |
124 if (/([\d\.]*)/ && $go_odds){ | |
125 my $odds = $1; | |
126 | |
127 $results{$category}{"core"}{"odds_ratio"} = $odds; | |
128 } | |
129 } | |
130 close(O); | |
131 | |
132 if ($category ne 'Unknown'){ | |
133 unlink("$pav_matrix.$category.core.script.R.out"); | |
134 unlink("$pav_matrix.$category.core.script.R"); | |
135 } | |
136 | |
137 # dispensable | |
138 my $n1 = 0; | |
139 if ($hash{$category}{"accessory"}){ | |
140 $n1 = $hash{$category}{"accessory"}; | |
141 } | |
142 my $n2 = 0; | |
143 if ($hash{$category}{"core"}){ | |
144 $n2= $hash{$category}{"core"}; | |
145 } | |
146 my $n3 = 0; | |
147 if ($hash{"not in $category"}{"accessory"}){ | |
148 $n3 = $hash{"not in $category"}{"accessory"}; | |
149 } | |
150 my $n4 = 0; | |
151 if ($hash{"not in $category"}{"core"}){ | |
152 $n4 = $hash{"not in $category"}{"core"}; | |
153 } | |
154 | |
155 open(R,">$pav_matrix.$category.accessory.script.R"); | |
156 print R "(tab <- matrix(c($n1, $n3, $n2, $n4), nrow=2))\n"; | |
157 print R "fisher.test(tab)\n"; | |
158 close(R); | |
159 print "dispensable, $category: $n1, $n2, $n3, $n4\n"; | |
160 system("Rscript $pav_matrix.$category.accessory.script.R >$pav_matrix.$category.accessory.script.R.out"); | |
161 open(O,"$pav_matrix.$category.accessory.script.R.out"); | |
162 my $go_odds = 0; | |
163 while(<O>){ | |
164 if (/p-value [=<] (.*)/){ | |
165 my $pvalue = $1; | |
166 $results{$category}{"accessory"}{"pval"} = $pvalue; | |
167 } | |
168 if (/odds/){$go_odds = 1;} | |
169 if (/([\d\.]*)/ && $go_odds){ | |
170 my $odds = $1; | |
171 $results{$category}{"accessory"}{"odds_ratio"} = $odds; | |
172 } | |
173 } | |
174 close(O); | |
175 if ($category ne 'Unknown'){ | |
176 unlink("$pav_matrix.$category.accessory.script.R"); | |
177 unlink("$pav_matrix.$category.accessory.script.R.out"); | |
178 } | |
179 } | |
180 | |
181 foreach my $category(keys(%results)){ | |
182 print "$category:"; | |
183 print " ".$results{$category}{"core"}{"odds_ratio"}; | |
184 print " ".$results{$category}{"core"}{"pval"}; | |
185 print " ".$results{$category}{"accessory"}{"odds_ratio"}; | |
186 print " ".$results{$category}{"accessory"}{"pval"}; | |
187 print "\n"; | |
188 } | |
189 close(OUT); |