annotate PanExplorer_workflow/Perl/GetCogOfCluster.pl @ 2:97e4e3e818b6 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:48:09 +0000
parents 032f6b3806a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
2
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
3 use strict;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
4 use File::Basename;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
5 my $dirname = dirname(__FILE__);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
6
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
7 my $pav_matrix = $ARGV[0];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
8 my $prot_dir = $ARGV[1];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
9 my $cog_outfile = $ARGV[2];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
10 my $cog_stats = $ARGV[3];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
11 my $cog_stats2 = $ARGV[4];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
12 my $cog_clusters = $ARGV[5];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
13 my $strain_info_file = $ARGV[6];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
14
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
15
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
16
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
17 my %cogs_categories = (
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
18 "J"=>"INFORMATION STORAGE AND PROCESSING",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
19 "A"=>"INFORMATION STORAGE AND PROCESSING",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
20 "K"=>"INFORMATION STORAGE AND PROCESSING",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
21 "L"=>"INFORMATION STORAGE AND PROCESSING",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
22 "B"=>"INFORMATION STORAGE AND PROCESSING",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
23 "D"=>"CELLULAR PROCESSES AND SIGNALING",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
24 "Y"=>"CELLULAR PROCESSES AND SIGNALING",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
25 "V"=>"CELLULAR PROCESSES AND SIGNALING",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
26 "T"=>"CELLULAR PROCESSES AND SIGNALING",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
27 "M"=>"CELLULAR PROCESSES AND SIGNALING",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
28 "N"=>"CELLULAR PROCESSES AND SIGNALING",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
29 "Z"=>"CELLULAR PROCESSES AND SIGNALING",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
30 "W"=>"CELLULAR PROCESSES AND SIGNALING",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
31 "U"=>"CELLULAR PROCESSES AND SIGNALING",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
32 "O"=>"CELLULAR PROCESSES AND SIGNALING",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
33 "C"=>"METABOLISM",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
34 "G"=>"METABOLISM",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
35 "E"=>"METABOLISM",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
36 "F"=>"METABOLISM",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
37 "H"=>"METABOLISM",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
38 "I"=>"METABOLISM",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
39 "P"=>"METABOLISM",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
40 "Q"=>"METABOLISM",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
41 "R"=>"POORLY CHARACTERIZED",
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
42 "S"=>"POORLY CHARACTERIZED"
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
43 );
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
44
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
45 my %strain_names;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
46 open(S,$strain_info_file);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
47 while(<S>){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
48 my $line = $_;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
49 $line =~s/\n//g;$line =~s/\r//g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
50 my ($id,$strain_name) = split(/\t/,$line);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
51 $strain_names{$id} = $strain_name;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
52 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
53 close(S);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
54
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
55 my %strain_of_prot;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
56 my %proteins;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
57 open( DIR, "ls $prot_dir/*pep |" );
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
58 while(<DIR>) {
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
59 my $filename = $_;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
60 my $strain;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
61 my $id;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
62 if ($filename =~/\/([^\/]*).pep/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
63 $strain = $1;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
64 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
65 #open(F,"zcat $filename|" );
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
66 open(F,"$filename" );
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
67 while(<F>){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
68 if (/>(.*)/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
69 $id = $1;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
70 $strain_of_prot{$id} = $strain;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
71 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
72 else{
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
73 $proteins{$id}.= $_;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
74 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
75 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
76 close(F);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
77 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
78 closedir(DIR);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
79
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
80 my %functions;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
81 my %accessory_clusters;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
82 my %genes_of_cluster;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
83 my %cluster_of_gene;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
84 open(S,">$pav_matrix.selection_prot.fa");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
85 open(O,$pav_matrix);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
86 <O>;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
87 while(<O>){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
88 my $line = $_;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
89 $line =~s/\n//g;$line =~s/\r//g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
90 my @infos = split(/\t/,$line);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
91 my $cluster = $infos[0];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
92 print S ">$cluster\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
93 for (my $i=1; $i <= $#infos; $i++){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
94 my @genenames = split(/,/,$infos[$i]);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
95 foreach my $genename(@genenames){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
96 if ($genename=~/\w+/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
97 $genes_of_cluster{$cluster} .= "$genename,";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
98 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
99 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
100 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
101 for (my $i=1; $i <= $#infos; $i++){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
102 if ($infos[$i] =~/-/){$accessory_clusters{$cluster} = 1;}
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
103 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
104 LINE: for (my $i=1; $i <= $#infos; $i++){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
105 my @genenames = split(/,/,$infos[$i]);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
106 foreach my $genename(@genenames){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
107 if ($genename=~/\w+/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
108 #$genename =~s/\|/_/g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
109 print S $proteins{$genename};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
110 $cluster_of_gene{$genename} = $cluster;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
111 my $function = `grep '$genename' $prot_dir/*func`;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
112 $function =~s/\n//g;$function =~s/\r//g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
113 my @t = split(/$genename/,$function);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
114 if ($t[1] =~/-\s+(.*)/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
115 $function = $1;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
116 $function =~s/'//g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
117 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
118 $functions{$cluster} = $function;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
119 last LINE;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
120 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
121 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
122 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
123 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
124 close(O);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
125 close(S);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
126
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
127
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
128 #########################################################################################################################################
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
129 # Test if COG are already provided in Genbank files. If provided, rpsblast is not launched, we get the COG information from Genbank files
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
130 #########################################################################################################################################
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
131 my %cog_of_genes;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
132 open(LS,"ls $prot_dir/*gb |");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
133 while(my $gb_file = <LS>){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
134 chomp($gb_file);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
135 open(GB,$gb_file);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
136 my $current_cog;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
137 my $current_gene;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
138 while(my $l=<GB>){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
139 if ($l=~/(COG\d+)/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
140 $current_cog = $1;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
141 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
142 if ($l =~/protein_id=\"(.*)\"/ && $current_cog){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
143 $current_gene = $1;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
144 $cog_of_genes{$current_gene} .= ",$current_cog";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
145 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
146 if ($l =~/locus_tag=\"(.*)\"/ && $current_cog){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
147 $current_gene = $1;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
148 $cog_of_genes{$current_gene} .= ",$current_cog";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
149 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
150 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
151 close(GB);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
152 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
153 close(LS);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
154
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
155 my %count_letter;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
156 if (scalar keys(%cog_of_genes) > 1){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
157 open(WHOG,"$dirname/../COG/whog");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
158 my %letters_of_cog;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
159 while(my $l=<WHOG>){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
160 if ($l=~/\[(\w+)\] (COG\d+) /){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
161 my $letter = $1;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
162 my $cogid = $2;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
163 $letters_of_cog{$cogid} = $letter;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
164 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
165 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
166 close(WHOG);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
167
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
168 my %cogs_of_cluster;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
169 foreach my $gene(keys(%cog_of_genes)){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
170 my $cluster = $cluster_of_gene{$gene};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
171 my $cogs = $cog_of_genes{$gene};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
172 $cogs_of_cluster{$cluster} = $cogs;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
173 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
174 open(COG_CLUST,">$cog_clusters");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
175 foreach my $cluster(sort {$a<=>$b} keys(%cogs_of_cluster)){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
176 my $cogids = $cogs_of_cluster{$cluster};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
177 my @ids = split(/,/,$cogids);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
178 foreach my $id(@ids){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
179 if ($id =~/\w+/ && $cluster){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
180 my $letter = $letters_of_cog{$id};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
181 my @letters = split(//,$letter);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
182 my $letters_string = join("\t",@letters);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
183 #$letter =~s//\t/g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
184 print COG_CLUST "$cluster $id $letters_string\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
185 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
186 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
187 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
188 close(COG_CLUST);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
189 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
190 else{
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
191
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
192 my $is_plus = `which rpsblast+`;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
193 my $is_not_plus = `which rpsblast`;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
194 my $software = "rpsblast";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
195 if ($is_plus){$software = "rpsblast+";}
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
196 system("$software -query $pav_matrix.selection_prot.fa -db $dirname/../COG/Cog -out $pav_matrix.selection.rps-blast.out -evalue 1e-2 -outfmt '7 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qcovs'");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
197
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
198 system("perl $dirname/../COG/bac-genomics-scripts/cdd2cog/cdd2cog.pl -r $pav_matrix.selection.rps-blast.out -c $dirname/../COG/cddid.tbl -f $dirname/../COG/fun.txt -w $dirname/../COG/whog -a");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
199 system("cp -rf results/protein-id_cog.txt $cog_clusters");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
200 system("rm -rf results");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
201 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
202
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
203 open(COG,">$cog_outfile");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
204 my %cogs;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
205 open(C,$cog_clusters);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
206 while(<C>){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
207 my $line = $_;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
208 $line =~s/\n//g;$line =~s/\r//g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
209 my @infos = split(/\t/,$line);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
210 my $coginfo = "";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
211 my $cluster = $infos[0];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
212 for (my $i = 1; $i <= $#infos; $i++){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
213 $coginfo .= "\t".$infos[$i];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
214 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
215 my $gene_list = $genes_of_cluster{$cluster};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
216 chop($gene_list);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
217 my @genes = split(",",$gene_list);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
218 foreach my $gene(@genes){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
219 my @letters = split(/\t/,$coginfo);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
220 my $strain = $strain_of_prot{$gene};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
221 for (my $i = 2; $i <= $#letters; $i++){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
222 my $letter = $letters[$i];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
223 my $cat = $cogs_categories{$letter};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
224 $count_letter{$strain}{$letter}++;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
225 $count_letter{$strain}{$cat}++;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
226 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
227 print COG $gene.$coginfo."\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
228 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
229 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
230 close(C);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
231 close(COG);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
232
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
233
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
234
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
235 my %cogs_of_clusters;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
236 my %cogcats_of_clusters;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
237 open(C,$cog_clusters);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
238 while(<C>){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
239 my $line = $_;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
240 $line =~s/\n//g;$line =~s/\r//g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
241 my @infos = split(/\t/,$line);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
242 my $cluster = $infos[0];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
243 my $cog = $infos[1];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
244 my $cog_category = $infos[2];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
245 $cogs_of_clusters{$cluster}{$cog} = 1;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
246 $cogcats_of_clusters{$cluster} = $cog_category;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
247 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
248 close(C);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
249
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
250 open(CC,">$cog_clusters.2");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
251 foreach my $cluster(sort{$a<=>$b} keys(%genes_of_cluster)){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
252 my $cog_category = "Unknown";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
253 my $cog_list = "Unknown";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
254 if ($cogcats_of_clusters{$cluster}){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
255 $cog_category = $cogcats_of_clusters{$cluster};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
256 my $ref_cogs_of_clusters = $cogs_of_clusters{$cluster};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
257 $cog_list = join(",",keys(%$ref_cogs_of_clusters));
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
258 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
259 my $function = "Unknown";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
260 if ($functions{$cluster}){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
261 $function = $functions{$cluster};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
262 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
263 if ($accessory_clusters{$cluster}){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
264 print CC "$cluster\t$cog_list\t$cog_category\t$function\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
265 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
266 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
267 close(CC);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
268
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
269 my @cat_of_cat = ("INFORMATION STORAGE AND PROCESSING","METABOLISM","CELLULAR PROCESSES AND SIGNALING","POORLY CHARACTERIZED");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
270 my @cog_categories = ("D","M","N","O","T","U","V","W","Y","Z","A","B","J","K","L","C","E","F","G","H","I","P","Q","R","S");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
271 open(COG_STAT,">$cog_stats");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
272 open(COG_STAT2,">$cog_stats2");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
273 print COG_STAT "COG\t".join("\t",@cog_categories)."\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
274 print COG_STAT2 "COG\t".join("\t",@cat_of_cat)."\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
275 foreach my $strain(keys(%count_letter)){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
276 my $strain_name = $strain_names{$strain};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
277 print COG_STAT "$strain_name";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
278 print COG_STAT2 "$strain_name";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
279 my $ref_subhash = $count_letter{$strain};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
280 my %subhash = %$ref_subhash;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
281 foreach my $letter(@cog_categories){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
282 my $n = 0;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
283 if ($count_letter{$strain}{$letter}){$n = $count_letter{$strain}{$letter};}
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
284 print COG_STAT "\t".$n;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
285 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
286 print COG_STAT "\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
287
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
288 foreach my $cat(@cat_of_cat){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
289 my $n = 0;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
290 if ($count_letter{$strain}{$cat}){$n = $count_letter{$strain}{$cat};}
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
291 print COG_STAT2 "\t".$n;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
292 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
293 print COG_STAT2 "\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
294 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
295 close(COG_STAT);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
296 close(COG_STAT2);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
297