Mercurial > repos > dereeper > pangenome_explorer
comparison PanExplorer_workflow/Perl/GetCogOfCluster.pl @ 1:032f6b3806a3 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:16:08 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:3cbb01081cde | 1:032f6b3806a3 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 use strict; | |
4 use File::Basename; | |
5 my $dirname = dirname(__FILE__); | |
6 | |
7 my $pav_matrix = $ARGV[0]; | |
8 my $prot_dir = $ARGV[1]; | |
9 my $cog_outfile = $ARGV[2]; | |
10 my $cog_stats = $ARGV[3]; | |
11 my $cog_stats2 = $ARGV[4]; | |
12 my $cog_clusters = $ARGV[5]; | |
13 my $strain_info_file = $ARGV[6]; | |
14 | |
15 | |
16 | |
17 my %cogs_categories = ( | |
18 "J"=>"INFORMATION STORAGE AND PROCESSING", | |
19 "A"=>"INFORMATION STORAGE AND PROCESSING", | |
20 "K"=>"INFORMATION STORAGE AND PROCESSING", | |
21 "L"=>"INFORMATION STORAGE AND PROCESSING", | |
22 "B"=>"INFORMATION STORAGE AND PROCESSING", | |
23 "D"=>"CELLULAR PROCESSES AND SIGNALING", | |
24 "Y"=>"CELLULAR PROCESSES AND SIGNALING", | |
25 "V"=>"CELLULAR PROCESSES AND SIGNALING", | |
26 "T"=>"CELLULAR PROCESSES AND SIGNALING", | |
27 "M"=>"CELLULAR PROCESSES AND SIGNALING", | |
28 "N"=>"CELLULAR PROCESSES AND SIGNALING", | |
29 "Z"=>"CELLULAR PROCESSES AND SIGNALING", | |
30 "W"=>"CELLULAR PROCESSES AND SIGNALING", | |
31 "U"=>"CELLULAR PROCESSES AND SIGNALING", | |
32 "O"=>"CELLULAR PROCESSES AND SIGNALING", | |
33 "C"=>"METABOLISM", | |
34 "G"=>"METABOLISM", | |
35 "E"=>"METABOLISM", | |
36 "F"=>"METABOLISM", | |
37 "H"=>"METABOLISM", | |
38 "I"=>"METABOLISM", | |
39 "P"=>"METABOLISM", | |
40 "Q"=>"METABOLISM", | |
41 "R"=>"POORLY CHARACTERIZED", | |
42 "S"=>"POORLY CHARACTERIZED" | |
43 ); | |
44 | |
45 my %strain_names; | |
46 open(S,$strain_info_file); | |
47 while(<S>){ | |
48 my $line = $_; | |
49 $line =~s/\n//g;$line =~s/\r//g; | |
50 my ($id,$strain_name) = split(/\t/,$line); | |
51 $strain_names{$id} = $strain_name; | |
52 } | |
53 close(S); | |
54 | |
55 my %strain_of_prot; | |
56 my %proteins; | |
57 open( DIR, "ls $prot_dir/*pep |" ); | |
58 while(<DIR>) { | |
59 my $filename = $_; | |
60 my $strain; | |
61 my $id; | |
62 if ($filename =~/\/([^\/]*).pep/){ | |
63 $strain = $1; | |
64 } | |
65 #open(F,"zcat $filename|" ); | |
66 open(F,"$filename" ); | |
67 while(<F>){ | |
68 if (/>(.*)/){ | |
69 $id = $1; | |
70 $strain_of_prot{$id} = $strain; | |
71 } | |
72 else{ | |
73 $proteins{$id}.= $_; | |
74 } | |
75 } | |
76 close(F); | |
77 } | |
78 closedir(DIR); | |
79 | |
80 my %functions; | |
81 my %accessory_clusters; | |
82 my %genes_of_cluster; | |
83 my %cluster_of_gene; | |
84 open(S,">$pav_matrix.selection_prot.fa"); | |
85 open(O,$pav_matrix); | |
86 <O>; | |
87 while(<O>){ | |
88 my $line = $_; | |
89 $line =~s/\n//g;$line =~s/\r//g; | |
90 my @infos = split(/\t/,$line); | |
91 my $cluster = $infos[0]; | |
92 print S ">$cluster\n"; | |
93 for (my $i=1; $i <= $#infos; $i++){ | |
94 my @genenames = split(/,/,$infos[$i]); | |
95 foreach my $genename(@genenames){ | |
96 if ($genename=~/\w+/){ | |
97 $genes_of_cluster{$cluster} .= "$genename,"; | |
98 } | |
99 } | |
100 } | |
101 for (my $i=1; $i <= $#infos; $i++){ | |
102 if ($infos[$i] =~/-/){$accessory_clusters{$cluster} = 1;} | |
103 } | |
104 LINE: for (my $i=1; $i <= $#infos; $i++){ | |
105 my @genenames = split(/,/,$infos[$i]); | |
106 foreach my $genename(@genenames){ | |
107 if ($genename=~/\w+/){ | |
108 #$genename =~s/\|/_/g; | |
109 print S $proteins{$genename}; | |
110 $cluster_of_gene{$genename} = $cluster; | |
111 my $function = `grep '$genename' $prot_dir/*func`; | |
112 $function =~s/\n//g;$function =~s/\r//g; | |
113 my @t = split(/$genename/,$function); | |
114 if ($t[1] =~/-\s+(.*)/){ | |
115 $function = $1; | |
116 $function =~s/'//g; | |
117 } | |
118 $functions{$cluster} = $function; | |
119 last LINE; | |
120 } | |
121 } | |
122 } | |
123 } | |
124 close(O); | |
125 close(S); | |
126 | |
127 | |
128 ######################################################################################################################################### | |
129 # Test if COG are already provided in Genbank files. If provided, rpsblast is not launched, we get the COG information from Genbank files | |
130 ######################################################################################################################################### | |
131 my %cog_of_genes; | |
132 open(LS,"ls $prot_dir/*gb |"); | |
133 while(my $gb_file = <LS>){ | |
134 chomp($gb_file); | |
135 open(GB,$gb_file); | |
136 my $current_cog; | |
137 my $current_gene; | |
138 while(my $l=<GB>){ | |
139 if ($l=~/(COG\d+)/){ | |
140 $current_cog = $1; | |
141 } | |
142 if ($l =~/protein_id=\"(.*)\"/ && $current_cog){ | |
143 $current_gene = $1; | |
144 $cog_of_genes{$current_gene} .= ",$current_cog"; | |
145 } | |
146 if ($l =~/locus_tag=\"(.*)\"/ && $current_cog){ | |
147 $current_gene = $1; | |
148 $cog_of_genes{$current_gene} .= ",$current_cog"; | |
149 } | |
150 } | |
151 close(GB); | |
152 } | |
153 close(LS); | |
154 | |
155 my %count_letter; | |
156 if (scalar keys(%cog_of_genes) > 1){ | |
157 open(WHOG,"$dirname/../COG/whog"); | |
158 my %letters_of_cog; | |
159 while(my $l=<WHOG>){ | |
160 if ($l=~/\[(\w+)\] (COG\d+) /){ | |
161 my $letter = $1; | |
162 my $cogid = $2; | |
163 $letters_of_cog{$cogid} = $letter; | |
164 } | |
165 } | |
166 close(WHOG); | |
167 | |
168 my %cogs_of_cluster; | |
169 foreach my $gene(keys(%cog_of_genes)){ | |
170 my $cluster = $cluster_of_gene{$gene}; | |
171 my $cogs = $cog_of_genes{$gene}; | |
172 $cogs_of_cluster{$cluster} = $cogs; | |
173 } | |
174 open(COG_CLUST,">$cog_clusters"); | |
175 foreach my $cluster(sort {$a<=>$b} keys(%cogs_of_cluster)){ | |
176 my $cogids = $cogs_of_cluster{$cluster}; | |
177 my @ids = split(/,/,$cogids); | |
178 foreach my $id(@ids){ | |
179 if ($id =~/\w+/ && $cluster){ | |
180 my $letter = $letters_of_cog{$id}; | |
181 my @letters = split(//,$letter); | |
182 my $letters_string = join("\t",@letters); | |
183 #$letter =~s//\t/g; | |
184 print COG_CLUST "$cluster $id $letters_string\n"; | |
185 } | |
186 } | |
187 } | |
188 close(COG_CLUST); | |
189 } | |
190 else{ | |
191 | |
192 my $is_plus = `which rpsblast+`; | |
193 my $is_not_plus = `which rpsblast`; | |
194 my $software = "rpsblast"; | |
195 if ($is_plus){$software = "rpsblast+";} | |
196 system("$software -query $pav_matrix.selection_prot.fa -db $dirname/../COG/Cog -out $pav_matrix.selection.rps-blast.out -evalue 1e-2 -outfmt '7 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qcovs'"); | |
197 | |
198 system("perl $dirname/../COG/bac-genomics-scripts/cdd2cog/cdd2cog.pl -r $pav_matrix.selection.rps-blast.out -c $dirname/../COG/cddid.tbl -f $dirname/../COG/fun.txt -w $dirname/../COG/whog -a"); | |
199 system("cp -rf results/protein-id_cog.txt $cog_clusters"); | |
200 system("rm -rf results"); | |
201 } | |
202 | |
203 open(COG,">$cog_outfile"); | |
204 my %cogs; | |
205 open(C,$cog_clusters); | |
206 while(<C>){ | |
207 my $line = $_; | |
208 $line =~s/\n//g;$line =~s/\r//g; | |
209 my @infos = split(/\t/,$line); | |
210 my $coginfo = ""; | |
211 my $cluster = $infos[0]; | |
212 for (my $i = 1; $i <= $#infos; $i++){ | |
213 $coginfo .= "\t".$infos[$i]; | |
214 } | |
215 my $gene_list = $genes_of_cluster{$cluster}; | |
216 chop($gene_list); | |
217 my @genes = split(",",$gene_list); | |
218 foreach my $gene(@genes){ | |
219 my @letters = split(/\t/,$coginfo); | |
220 my $strain = $strain_of_prot{$gene}; | |
221 for (my $i = 2; $i <= $#letters; $i++){ | |
222 my $letter = $letters[$i]; | |
223 my $cat = $cogs_categories{$letter}; | |
224 $count_letter{$strain}{$letter}++; | |
225 $count_letter{$strain}{$cat}++; | |
226 } | |
227 print COG $gene.$coginfo."\n"; | |
228 } | |
229 } | |
230 close(C); | |
231 close(COG); | |
232 | |
233 | |
234 | |
235 my %cogs_of_clusters; | |
236 my %cogcats_of_clusters; | |
237 open(C,$cog_clusters); | |
238 while(<C>){ | |
239 my $line = $_; | |
240 $line =~s/\n//g;$line =~s/\r//g; | |
241 my @infos = split(/\t/,$line); | |
242 my $cluster = $infos[0]; | |
243 my $cog = $infos[1]; | |
244 my $cog_category = $infos[2]; | |
245 $cogs_of_clusters{$cluster}{$cog} = 1; | |
246 $cogcats_of_clusters{$cluster} = $cog_category; | |
247 } | |
248 close(C); | |
249 | |
250 open(CC,">$cog_clusters.2"); | |
251 foreach my $cluster(sort{$a<=>$b} keys(%genes_of_cluster)){ | |
252 my $cog_category = "Unknown"; | |
253 my $cog_list = "Unknown"; | |
254 if ($cogcats_of_clusters{$cluster}){ | |
255 $cog_category = $cogcats_of_clusters{$cluster}; | |
256 my $ref_cogs_of_clusters = $cogs_of_clusters{$cluster}; | |
257 $cog_list = join(",",keys(%$ref_cogs_of_clusters)); | |
258 } | |
259 my $function = "Unknown"; | |
260 if ($functions{$cluster}){ | |
261 $function = $functions{$cluster}; | |
262 } | |
263 if ($accessory_clusters{$cluster}){ | |
264 print CC "$cluster\t$cog_list\t$cog_category\t$function\n"; | |
265 } | |
266 } | |
267 close(CC); | |
268 | |
269 my @cat_of_cat = ("INFORMATION STORAGE AND PROCESSING","METABOLISM","CELLULAR PROCESSES AND SIGNALING","POORLY CHARACTERIZED"); | |
270 my @cog_categories = ("D","M","N","O","T","U","V","W","Y","Z","A","B","J","K","L","C","E","F","G","H","I","P","Q","R","S"); | |
271 open(COG_STAT,">$cog_stats"); | |
272 open(COG_STAT2,">$cog_stats2"); | |
273 print COG_STAT "COG\t".join("\t",@cog_categories)."\n"; | |
274 print COG_STAT2 "COG\t".join("\t",@cat_of_cat)."\n"; | |
275 foreach my $strain(keys(%count_letter)){ | |
276 my $strain_name = $strain_names{$strain}; | |
277 print COG_STAT "$strain_name"; | |
278 print COG_STAT2 "$strain_name"; | |
279 my $ref_subhash = $count_letter{$strain}; | |
280 my %subhash = %$ref_subhash; | |
281 foreach my $letter(@cog_categories){ | |
282 my $n = 0; | |
283 if ($count_letter{$strain}{$letter}){$n = $count_letter{$strain}{$letter};} | |
284 print COG_STAT "\t".$n; | |
285 } | |
286 print COG_STAT "\n"; | |
287 | |
288 foreach my $cat(@cat_of_cat){ | |
289 my $n = 0; | |
290 if ($count_letter{$strain}{$cat}){$n = $count_letter{$strain}{$cat};} | |
291 print COG_STAT2 "\t".$n; | |
292 } | |
293 print COG_STAT2 "\n"; | |
294 } | |
295 close(COG_STAT); | |
296 close(COG_STAT2); | |
297 |