1
|
1 #!/usr/bin/perl
|
|
2
|
|
3 use strict;
|
|
4 use File::Basename;
|
|
5 my $dirname = dirname(__FILE__);
|
|
6
|
|
7 my $pav_matrix = $ARGV[0];
|
|
8 my $prot_dir = $ARGV[1];
|
|
9 my $cog_outfile = $ARGV[2];
|
|
10 my $cog_stats = $ARGV[3];
|
|
11 my $cog_stats2 = $ARGV[4];
|
|
12 my $cog_clusters = $ARGV[5];
|
|
13 my $strain_info_file = $ARGV[6];
|
|
14
|
|
15
|
|
16
|
|
17 my %cogs_categories = (
|
|
18 "J"=>"INFORMATION STORAGE AND PROCESSING",
|
|
19 "A"=>"INFORMATION STORAGE AND PROCESSING",
|
|
20 "K"=>"INFORMATION STORAGE AND PROCESSING",
|
|
21 "L"=>"INFORMATION STORAGE AND PROCESSING",
|
|
22 "B"=>"INFORMATION STORAGE AND PROCESSING",
|
|
23 "D"=>"CELLULAR PROCESSES AND SIGNALING",
|
|
24 "Y"=>"CELLULAR PROCESSES AND SIGNALING",
|
|
25 "V"=>"CELLULAR PROCESSES AND SIGNALING",
|
|
26 "T"=>"CELLULAR PROCESSES AND SIGNALING",
|
|
27 "M"=>"CELLULAR PROCESSES AND SIGNALING",
|
|
28 "N"=>"CELLULAR PROCESSES AND SIGNALING",
|
|
29 "Z"=>"CELLULAR PROCESSES AND SIGNALING",
|
|
30 "W"=>"CELLULAR PROCESSES AND SIGNALING",
|
|
31 "U"=>"CELLULAR PROCESSES AND SIGNALING",
|
|
32 "O"=>"CELLULAR PROCESSES AND SIGNALING",
|
|
33 "C"=>"METABOLISM",
|
|
34 "G"=>"METABOLISM",
|
|
35 "E"=>"METABOLISM",
|
|
36 "F"=>"METABOLISM",
|
|
37 "H"=>"METABOLISM",
|
|
38 "I"=>"METABOLISM",
|
|
39 "P"=>"METABOLISM",
|
|
40 "Q"=>"METABOLISM",
|
|
41 "R"=>"POORLY CHARACTERIZED",
|
|
42 "S"=>"POORLY CHARACTERIZED"
|
|
43 );
|
|
44
|
|
45 my %strain_names;
|
|
46 open(S,$strain_info_file);
|
|
47 while(<S>){
|
|
48 my $line = $_;
|
|
49 $line =~s/\n//g;$line =~s/\r//g;
|
|
50 my ($id,$strain_name) = split(/\t/,$line);
|
|
51 $strain_names{$id} = $strain_name;
|
|
52 }
|
|
53 close(S);
|
|
54
|
|
55 my %strain_of_prot;
|
|
56 my %proteins;
|
|
57 open( DIR, "ls $prot_dir/*pep |" );
|
|
58 while(<DIR>) {
|
|
59 my $filename = $_;
|
|
60 my $strain;
|
|
61 my $id;
|
|
62 if ($filename =~/\/([^\/]*).pep/){
|
|
63 $strain = $1;
|
|
64 }
|
|
65 #open(F,"zcat $filename|" );
|
|
66 open(F,"$filename" );
|
|
67 while(<F>){
|
|
68 if (/>(.*)/){
|
|
69 $id = $1;
|
|
70 $strain_of_prot{$id} = $strain;
|
|
71 }
|
|
72 else{
|
|
73 $proteins{$id}.= $_;
|
|
74 }
|
|
75 }
|
|
76 close(F);
|
|
77 }
|
|
78 closedir(DIR);
|
|
79
|
|
80 my %functions;
|
|
81 my %accessory_clusters;
|
|
82 my %genes_of_cluster;
|
|
83 my %cluster_of_gene;
|
|
84 open(S,">$pav_matrix.selection_prot.fa");
|
|
85 open(O,$pav_matrix);
|
|
86 <O>;
|
|
87 while(<O>){
|
|
88 my $line = $_;
|
|
89 $line =~s/\n//g;$line =~s/\r//g;
|
|
90 my @infos = split(/\t/,$line);
|
|
91 my $cluster = $infos[0];
|
|
92 print S ">$cluster\n";
|
|
93 for (my $i=1; $i <= $#infos; $i++){
|
|
94 my @genenames = split(/,/,$infos[$i]);
|
|
95 foreach my $genename(@genenames){
|
|
96 if ($genename=~/\w+/){
|
|
97 $genes_of_cluster{$cluster} .= "$genename,";
|
|
98 }
|
|
99 }
|
|
100 }
|
|
101 for (my $i=1; $i <= $#infos; $i++){
|
|
102 if ($infos[$i] =~/-/){$accessory_clusters{$cluster} = 1;}
|
|
103 }
|
|
104 LINE: for (my $i=1; $i <= $#infos; $i++){
|
|
105 my @genenames = split(/,/,$infos[$i]);
|
|
106 foreach my $genename(@genenames){
|
|
107 if ($genename=~/\w+/){
|
|
108 #$genename =~s/\|/_/g;
|
|
109 print S $proteins{$genename};
|
|
110 $cluster_of_gene{$genename} = $cluster;
|
|
111 my $function = `grep '$genename' $prot_dir/*func`;
|
|
112 $function =~s/\n//g;$function =~s/\r//g;
|
|
113 my @t = split(/$genename/,$function);
|
|
114 if ($t[1] =~/-\s+(.*)/){
|
|
115 $function = $1;
|
|
116 $function =~s/'//g;
|
|
117 }
|
|
118 $functions{$cluster} = $function;
|
|
119 last LINE;
|
|
120 }
|
|
121 }
|
|
122 }
|
|
123 }
|
|
124 close(O);
|
|
125 close(S);
|
|
126
|
|
127
|
|
128 #########################################################################################################################################
|
|
129 # Test if COG are already provided in Genbank files. If provided, rpsblast is not launched, we get the COG information from Genbank files
|
|
130 #########################################################################################################################################
|
|
131 my %cog_of_genes;
|
|
132 open(LS,"ls $prot_dir/*gb |");
|
|
133 while(my $gb_file = <LS>){
|
|
134 chomp($gb_file);
|
|
135 open(GB,$gb_file);
|
|
136 my $current_cog;
|
|
137 my $current_gene;
|
|
138 while(my $l=<GB>){
|
|
139 if ($l=~/(COG\d+)/){
|
|
140 $current_cog = $1;
|
|
141 }
|
|
142 if ($l =~/protein_id=\"(.*)\"/ && $current_cog){
|
|
143 $current_gene = $1;
|
|
144 $cog_of_genes{$current_gene} .= ",$current_cog";
|
|
145 }
|
|
146 if ($l =~/locus_tag=\"(.*)\"/ && $current_cog){
|
|
147 $current_gene = $1;
|
|
148 $cog_of_genes{$current_gene} .= ",$current_cog";
|
|
149 }
|
|
150 }
|
|
151 close(GB);
|
|
152 }
|
|
153 close(LS);
|
|
154
|
|
155 my %count_letter;
|
|
156 if (scalar keys(%cog_of_genes) > 1){
|
|
157 open(WHOG,"$dirname/../COG/whog");
|
|
158 my %letters_of_cog;
|
|
159 while(my $l=<WHOG>){
|
|
160 if ($l=~/\[(\w+)\] (COG\d+) /){
|
|
161 my $letter = $1;
|
|
162 my $cogid = $2;
|
|
163 $letters_of_cog{$cogid} = $letter;
|
|
164 }
|
|
165 }
|
|
166 close(WHOG);
|
|
167
|
|
168 my %cogs_of_cluster;
|
|
169 foreach my $gene(keys(%cog_of_genes)){
|
|
170 my $cluster = $cluster_of_gene{$gene};
|
|
171 my $cogs = $cog_of_genes{$gene};
|
|
172 $cogs_of_cluster{$cluster} = $cogs;
|
|
173 }
|
|
174 open(COG_CLUST,">$cog_clusters");
|
|
175 foreach my $cluster(sort {$a<=>$b} keys(%cogs_of_cluster)){
|
|
176 my $cogids = $cogs_of_cluster{$cluster};
|
|
177 my @ids = split(/,/,$cogids);
|
|
178 foreach my $id(@ids){
|
|
179 if ($id =~/\w+/ && $cluster){
|
|
180 my $letter = $letters_of_cog{$id};
|
|
181 my @letters = split(//,$letter);
|
|
182 my $letters_string = join("\t",@letters);
|
|
183 #$letter =~s//\t/g;
|
|
184 print COG_CLUST "$cluster $id $letters_string\n";
|
|
185 }
|
|
186 }
|
|
187 }
|
|
188 close(COG_CLUST);
|
|
189 }
|
|
190 else{
|
|
191
|
|
192 my $is_plus = `which rpsblast+`;
|
|
193 my $is_not_plus = `which rpsblast`;
|
|
194 my $software = "rpsblast";
|
|
195 if ($is_plus){$software = "rpsblast+";}
|
|
196 system("$software -query $pav_matrix.selection_prot.fa -db $dirname/../COG/Cog -out $pav_matrix.selection.rps-blast.out -evalue 1e-2 -outfmt '7 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qcovs'");
|
|
197
|
|
198 system("perl $dirname/../COG/bac-genomics-scripts/cdd2cog/cdd2cog.pl -r $pav_matrix.selection.rps-blast.out -c $dirname/../COG/cddid.tbl -f $dirname/../COG/fun.txt -w $dirname/../COG/whog -a");
|
|
199 system("cp -rf results/protein-id_cog.txt $cog_clusters");
|
|
200 system("rm -rf results");
|
|
201 }
|
|
202
|
|
203 open(COG,">$cog_outfile");
|
|
204 my %cogs;
|
|
205 open(C,$cog_clusters);
|
|
206 while(<C>){
|
|
207 my $line = $_;
|
|
208 $line =~s/\n//g;$line =~s/\r//g;
|
|
209 my @infos = split(/\t/,$line);
|
|
210 my $coginfo = "";
|
|
211 my $cluster = $infos[0];
|
|
212 for (my $i = 1; $i <= $#infos; $i++){
|
|
213 $coginfo .= "\t".$infos[$i];
|
|
214 }
|
|
215 my $gene_list = $genes_of_cluster{$cluster};
|
|
216 chop($gene_list);
|
|
217 my @genes = split(",",$gene_list);
|
|
218 foreach my $gene(@genes){
|
|
219 my @letters = split(/\t/,$coginfo);
|
|
220 my $strain = $strain_of_prot{$gene};
|
|
221 for (my $i = 2; $i <= $#letters; $i++){
|
|
222 my $letter = $letters[$i];
|
|
223 my $cat = $cogs_categories{$letter};
|
|
224 $count_letter{$strain}{$letter}++;
|
|
225 $count_letter{$strain}{$cat}++;
|
|
226 }
|
|
227 print COG $gene.$coginfo."\n";
|
|
228 }
|
|
229 }
|
|
230 close(C);
|
|
231 close(COG);
|
|
232
|
|
233
|
|
234
|
|
235 my %cogs_of_clusters;
|
|
236 my %cogcats_of_clusters;
|
|
237 open(C,$cog_clusters);
|
|
238 while(<C>){
|
|
239 my $line = $_;
|
|
240 $line =~s/\n//g;$line =~s/\r//g;
|
|
241 my @infos = split(/\t/,$line);
|
|
242 my $cluster = $infos[0];
|
|
243 my $cog = $infos[1];
|
|
244 my $cog_category = $infos[2];
|
|
245 $cogs_of_clusters{$cluster}{$cog} = 1;
|
|
246 $cogcats_of_clusters{$cluster} = $cog_category;
|
|
247 }
|
|
248 close(C);
|
|
249
|
|
250 open(CC,">$cog_clusters.2");
|
|
251 foreach my $cluster(sort{$a<=>$b} keys(%genes_of_cluster)){
|
|
252 my $cog_category = "Unknown";
|
|
253 my $cog_list = "Unknown";
|
|
254 if ($cogcats_of_clusters{$cluster}){
|
|
255 $cog_category = $cogcats_of_clusters{$cluster};
|
|
256 my $ref_cogs_of_clusters = $cogs_of_clusters{$cluster};
|
|
257 $cog_list = join(",",keys(%$ref_cogs_of_clusters));
|
|
258 }
|
|
259 my $function = "Unknown";
|
|
260 if ($functions{$cluster}){
|
|
261 $function = $functions{$cluster};
|
|
262 }
|
|
263 if ($accessory_clusters{$cluster}){
|
|
264 print CC "$cluster\t$cog_list\t$cog_category\t$function\n";
|
|
265 }
|
|
266 }
|
|
267 close(CC);
|
|
268
|
|
269 my @cat_of_cat = ("INFORMATION STORAGE AND PROCESSING","METABOLISM","CELLULAR PROCESSES AND SIGNALING","POORLY CHARACTERIZED");
|
|
270 my @cog_categories = ("D","M","N","O","T","U","V","W","Y","Z","A","B","J","K","L","C","E","F","G","H","I","P","Q","R","S");
|
|
271 open(COG_STAT,">$cog_stats");
|
|
272 open(COG_STAT2,">$cog_stats2");
|
|
273 print COG_STAT "COG\t".join("\t",@cog_categories)."\n";
|
|
274 print COG_STAT2 "COG\t".join("\t",@cat_of_cat)."\n";
|
|
275 foreach my $strain(keys(%count_letter)){
|
|
276 my $strain_name = $strain_names{$strain};
|
|
277 print COG_STAT "$strain_name";
|
|
278 print COG_STAT2 "$strain_name";
|
|
279 my $ref_subhash = $count_letter{$strain};
|
|
280 my %subhash = %$ref_subhash;
|
|
281 foreach my $letter(@cog_categories){
|
|
282 my $n = 0;
|
|
283 if ($count_letter{$strain}{$letter}){$n = $count_letter{$strain}{$letter};}
|
|
284 print COG_STAT "\t".$n;
|
|
285 }
|
|
286 print COG_STAT "\n";
|
|
287
|
|
288 foreach my $cat(@cat_of_cat){
|
|
289 my $n = 0;
|
|
290 if ($count_letter{$strain}{$cat}){$n = $count_letter{$strain}{$cat};}
|
|
291 print COG_STAT2 "\t".$n;
|
|
292 }
|
|
293 print COG_STAT2 "\n";
|
|
294 }
|
|
295 close(COG_STAT);
|
|
296 close(COG_STAT2);
|
|
297
|