comparison Perl/GetCogOfCluster.pl @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
comparison
equal deleted inserted replaced
2:97e4e3e818b6 3:e42d30da7a74
1 #!/usr/bin/perl
2
3 use strict;
4 use File::Basename;
5 my $dirname = dirname(__FILE__);
6
7 my $pav_matrix = $ARGV[0];
8 my $prot_dir = $ARGV[1];
9 my $cog_outfile = $ARGV[2];
10 my $cog_stats = $ARGV[3];
11 my $cog_stats2 = $ARGV[4];
12 my $cog_clusters = $ARGV[5];
13 my $strain_info_file = $ARGV[6];
14
15
16
17 my %cogs_categories = (
18 "J"=>"INFORMATION STORAGE AND PROCESSING",
19 "A"=>"INFORMATION STORAGE AND PROCESSING",
20 "K"=>"INFORMATION STORAGE AND PROCESSING",
21 "L"=>"INFORMATION STORAGE AND PROCESSING",
22 "B"=>"INFORMATION STORAGE AND PROCESSING",
23 "D"=>"CELLULAR PROCESSES AND SIGNALING",
24 "Y"=>"CELLULAR PROCESSES AND SIGNALING",
25 "V"=>"CELLULAR PROCESSES AND SIGNALING",
26 "T"=>"CELLULAR PROCESSES AND SIGNALING",
27 "M"=>"CELLULAR PROCESSES AND SIGNALING",
28 "N"=>"CELLULAR PROCESSES AND SIGNALING",
29 "Z"=>"CELLULAR PROCESSES AND SIGNALING",
30 "W"=>"CELLULAR PROCESSES AND SIGNALING",
31 "U"=>"CELLULAR PROCESSES AND SIGNALING",
32 "O"=>"CELLULAR PROCESSES AND SIGNALING",
33 "C"=>"METABOLISM",
34 "G"=>"METABOLISM",
35 "E"=>"METABOLISM",
36 "F"=>"METABOLISM",
37 "H"=>"METABOLISM",
38 "I"=>"METABOLISM",
39 "P"=>"METABOLISM",
40 "Q"=>"METABOLISM",
41 "R"=>"POORLY CHARACTERIZED",
42 "S"=>"POORLY CHARACTERIZED"
43 );
44
45 my %strain_names;
46 open(S,$strain_info_file);
47 while(<S>){
48 my $line = $_;
49 $line =~s/\n//g;$line =~s/\r//g;
50 my ($id,$strain_name) = split(/\t/,$line);
51 $strain_names{$id} = $strain_name;
52 }
53 close(S);
54
55 my %strain_of_prot;
56 my %proteins;
57 open( DIR, "ls $prot_dir/*pep |" );
58 while(<DIR>) {
59 my $filename = $_;
60 my $strain;
61 my $id;
62 if ($filename =~/\/([^\/]*).pep/){
63 $strain = $1;
64 }
65 #open(F,"zcat $filename|" );
66 open(F,"$filename" );
67 while(<F>){
68 if (/>(.*)/){
69 $id = $1;
70 $strain_of_prot{$id} = $strain;
71 }
72 else{
73 $proteins{$id}.= $_;
74 }
75 }
76 close(F);
77 }
78 closedir(DIR);
79
80 my %functions;
81 my %accessory_clusters;
82 my %genes_of_cluster;
83 my %cluster_of_gene;
84 open(S,">$pav_matrix.selection_prot.fa");
85 open(O,$pav_matrix);
86 <O>;
87 while(<O>){
88 my $line = $_;
89 $line =~s/\n//g;$line =~s/\r//g;
90 my @infos = split(/\t/,$line);
91 my $cluster = $infos[0];
92 print S ">$cluster\n";
93 for (my $i=1; $i <= $#infos; $i++){
94 my @genenames = split(/,/,$infos[$i]);
95 foreach my $genename(@genenames){
96 if ($genename=~/\w+/){
97 $genes_of_cluster{$cluster} .= "$genename,";
98 }
99 }
100 }
101 for (my $i=1; $i <= $#infos; $i++){
102 if ($infos[$i] =~/-/){$accessory_clusters{$cluster} = 1;}
103 }
104 LINE: for (my $i=1; $i <= $#infos; $i++){
105 my @genenames = split(/,/,$infos[$i]);
106 foreach my $genename(@genenames){
107 if ($genename=~/\w+/){
108 #$genename =~s/\|/_/g;
109 print S $proteins{$genename};
110 $cluster_of_gene{$genename} = $cluster;
111 my $function = `grep '$genename' $prot_dir/*func`;
112 $function =~s/\n//g;$function =~s/\r//g;
113 my @t = split(/$genename/,$function);
114 if ($t[1] =~/-\s+(.*)/){
115 $function = $1;
116 $function =~s/'//g;
117 }
118 $functions{$cluster} = $function;
119 last LINE;
120 }
121 }
122 }
123 }
124 close(O);
125 close(S);
126
127
128 #########################################################################################################################################
129 # Test if COG are already provided in Genbank files. If provided, rpsblast is not launched, we get the COG information from Genbank files
130 #########################################################################################################################################
131 my %cog_of_genes;
132 open(LS,"ls $prot_dir/*gb |");
133 while(my $gb_file = <LS>){
134 chomp($gb_file);
135 open(GB,$gb_file);
136 my $current_cog;
137 my $current_gene;
138 while(my $l=<GB>){
139 if ($l=~/(COG\d+)/){
140 $current_cog = $1;
141 }
142 if ($l =~/protein_id=\"(.*)\"/ && $current_cog){
143 $current_gene = $1;
144 $cog_of_genes{$current_gene} .= ",$current_cog";
145 }
146 if ($l =~/locus_tag=\"(.*)\"/ && $current_cog){
147 $current_gene = $1;
148 $cog_of_genes{$current_gene} .= ",$current_cog";
149 }
150 }
151 close(GB);
152 }
153 close(LS);
154
155 my %count_letter;
156 if (scalar keys(%cog_of_genes) > 1){
157 open(WHOG,"$dirname/../COG/whog");
158 my %letters_of_cog;
159 while(my $l=<WHOG>){
160 if ($l=~/\[(\w+)\] (COG\d+) /){
161 my $letter = $1;
162 my $cogid = $2;
163 $letters_of_cog{$cogid} = $letter;
164 }
165 }
166 close(WHOG);
167
168 my %cogs_of_cluster;
169 foreach my $gene(keys(%cog_of_genes)){
170 my $cluster = $cluster_of_gene{$gene};
171 my $cogs = $cog_of_genes{$gene};
172 $cogs_of_cluster{$cluster} = $cogs;
173 }
174 open(COG_CLUST,">$cog_clusters");
175 foreach my $cluster(sort {$a<=>$b} keys(%cogs_of_cluster)){
176 my $cogids = $cogs_of_cluster{$cluster};
177 my @ids = split(/,/,$cogids);
178 foreach my $id(@ids){
179 if ($id =~/\w+/ && $cluster){
180 my $letter = $letters_of_cog{$id};
181 my @letters = split(//,$letter);
182 my $letters_string = join("\t",@letters);
183 #$letter =~s//\t/g;
184 print COG_CLUST "$cluster $id $letters_string\n";
185 }
186 }
187 }
188 close(COG_CLUST);
189 }
190 else{
191
192 my $is_plus = `which rpsblast+`;
193 my $is_not_plus = `which rpsblast`;
194 my $software = "rpsblast";
195 if ($is_plus){$software = "rpsblast+";}
196 system("$software -query $pav_matrix.selection_prot.fa -db $dirname/../COG/Cog -out $pav_matrix.selection.rps-blast.out -evalue 1e-2 -outfmt '7 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qcovs'");
197
198 system("perl $dirname/../COG/bac-genomics-scripts/cdd2cog/cdd2cog.pl -r $pav_matrix.selection.rps-blast.out -c $dirname/../COG/cddid.tbl -f $dirname/../COG/fun.txt -w $dirname/../COG/whog -a");
199 system("cp -rf results/protein-id_cog.txt $cog_clusters");
200 system("rm -rf results");
201 }
202
203 open(COG,">$cog_outfile");
204 my %cogs;
205 open(C,$cog_clusters);
206 while(<C>){
207 my $line = $_;
208 $line =~s/\n//g;$line =~s/\r//g;
209 my @infos = split(/\t/,$line);
210 my $coginfo = "";
211 my $cluster = $infos[0];
212 for (my $i = 1; $i <= $#infos; $i++){
213 $coginfo .= "\t".$infos[$i];
214 }
215 my $gene_list = $genes_of_cluster{$cluster};
216 chop($gene_list);
217 my @genes = split(",",$gene_list);
218 foreach my $gene(@genes){
219 my @letters = split(/\t/,$coginfo);
220 my $strain = $strain_of_prot{$gene};
221 for (my $i = 2; $i <= $#letters; $i++){
222 my $letter = $letters[$i];
223 my $cat = $cogs_categories{$letter};
224 $count_letter{$strain}{$letter}++;
225 $count_letter{$strain}{$cat}++;
226 }
227 print COG $gene.$coginfo."\n";
228 }
229 }
230 close(C);
231 close(COG);
232
233
234
235 my %cogs_of_clusters;
236 my %cogcats_of_clusters;
237 open(C,$cog_clusters);
238 while(<C>){
239 my $line = $_;
240 $line =~s/\n//g;$line =~s/\r//g;
241 my @infos = split(/\t/,$line);
242 my $cluster = $infos[0];
243 my $cog = $infos[1];
244 my $cog_category = $infos[2];
245 $cogs_of_clusters{$cluster}{$cog} = 1;
246 $cogcats_of_clusters{$cluster} = $cog_category;
247 }
248 close(C);
249
250 open(CC,">$cog_clusters.2");
251 foreach my $cluster(sort{$a<=>$b} keys(%genes_of_cluster)){
252 my $cog_category = "Unknown";
253 my $cog_list = "Unknown";
254 if ($cogcats_of_clusters{$cluster}){
255 $cog_category = $cogcats_of_clusters{$cluster};
256 my $ref_cogs_of_clusters = $cogs_of_clusters{$cluster};
257 $cog_list = join(",",keys(%$ref_cogs_of_clusters));
258 }
259 my $function = "Unknown";
260 if ($functions{$cluster}){
261 $function = $functions{$cluster};
262 }
263 if ($accessory_clusters{$cluster}){
264 print CC "$cluster\t$cog_list\t$cog_category\t$function\n";
265 }
266 }
267 close(CC);
268
269 my @cat_of_cat = ("INFORMATION STORAGE AND PROCESSING","METABOLISM","CELLULAR PROCESSES AND SIGNALING","POORLY CHARACTERIZED");
270 my @cog_categories = ("D","M","N","O","T","U","V","W","Y","Z","A","B","J","K","L","C","E","F","G","H","I","P","Q","R","S");
271 open(COG_STAT,">$cog_stats");
272 open(COG_STAT2,">$cog_stats2");
273 print COG_STAT "COG\t".join("\t",@cog_categories)."\n";
274 print COG_STAT2 "COG\t".join("\t",@cat_of_cat)."\n";
275 foreach my $strain(keys(%count_letter)){
276 my $strain_name = $strain_names{$strain};
277 print COG_STAT "$strain_name";
278 print COG_STAT2 "$strain_name";
279 my $ref_subhash = $count_letter{$strain};
280 my %subhash = %$ref_subhash;
281 foreach my $letter(@cog_categories){
282 my $n = 0;
283 if ($count_letter{$strain}{$letter}){$n = $count_letter{$strain}{$letter};}
284 print COG_STAT "\t".$n;
285 }
286 print COG_STAT "\n";
287
288 foreach my $cat(@cat_of_cat){
289 my $n = 0;
290 if ($count_letter{$strain}{$cat}){$n = $count_letter{$strain}{$cat};}
291 print COG_STAT2 "\t".$n;
292 }
293 print COG_STAT2 "\n";
294 }
295 close(COG_STAT);
296 close(COG_STAT2);
297