annotate Perl/GetCogOfCluster.pl @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
3 use strict;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
4 use File::Basename;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
5 my $dirname = dirname(__FILE__);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
6
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
7 my $pav_matrix = $ARGV[0];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
8 my $prot_dir = $ARGV[1];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
9 my $cog_outfile = $ARGV[2];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
10 my $cog_stats = $ARGV[3];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
11 my $cog_stats2 = $ARGV[4];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
12 my $cog_clusters = $ARGV[5];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
13 my $strain_info_file = $ARGV[6];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
14
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
15
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
16
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
17 my %cogs_categories = (
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
18 "J"=>"INFORMATION STORAGE AND PROCESSING",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
19 "A"=>"INFORMATION STORAGE AND PROCESSING",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
20 "K"=>"INFORMATION STORAGE AND PROCESSING",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
21 "L"=>"INFORMATION STORAGE AND PROCESSING",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
22 "B"=>"INFORMATION STORAGE AND PROCESSING",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
23 "D"=>"CELLULAR PROCESSES AND SIGNALING",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
24 "Y"=>"CELLULAR PROCESSES AND SIGNALING",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
25 "V"=>"CELLULAR PROCESSES AND SIGNALING",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
26 "T"=>"CELLULAR PROCESSES AND SIGNALING",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
27 "M"=>"CELLULAR PROCESSES AND SIGNALING",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
28 "N"=>"CELLULAR PROCESSES AND SIGNALING",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
29 "Z"=>"CELLULAR PROCESSES AND SIGNALING",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
30 "W"=>"CELLULAR PROCESSES AND SIGNALING",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
31 "U"=>"CELLULAR PROCESSES AND SIGNALING",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
32 "O"=>"CELLULAR PROCESSES AND SIGNALING",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
33 "C"=>"METABOLISM",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
34 "G"=>"METABOLISM",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
35 "E"=>"METABOLISM",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
36 "F"=>"METABOLISM",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
37 "H"=>"METABOLISM",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
38 "I"=>"METABOLISM",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
39 "P"=>"METABOLISM",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
40 "Q"=>"METABOLISM",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
41 "R"=>"POORLY CHARACTERIZED",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
42 "S"=>"POORLY CHARACTERIZED"
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
43 );
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
44
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
45 my %strain_names;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
46 open(S,$strain_info_file);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
47 while(<S>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
48 my $line = $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
49 $line =~s/\n//g;$line =~s/\r//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
50 my ($id,$strain_name) = split(/\t/,$line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
51 $strain_names{$id} = $strain_name;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
52 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
53 close(S);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
54
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
55 my %strain_of_prot;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
56 my %proteins;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
57 open( DIR, "ls $prot_dir/*pep |" );
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
58 while(<DIR>) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
59 my $filename = $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
60 my $strain;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
61 my $id;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
62 if ($filename =~/\/([^\/]*).pep/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
63 $strain = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
64 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
65 #open(F,"zcat $filename|" );
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
66 open(F,"$filename" );
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
67 while(<F>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
68 if (/>(.*)/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
69 $id = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
70 $strain_of_prot{$id} = $strain;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
71 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
72 else{
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
73 $proteins{$id}.= $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
74 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
75 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
76 close(F);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
77 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
78 closedir(DIR);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
79
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
80 my %functions;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
81 my %accessory_clusters;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
82 my %genes_of_cluster;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
83 my %cluster_of_gene;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
84 open(S,">$pav_matrix.selection_prot.fa");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
85 open(O,$pav_matrix);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
86 <O>;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
87 while(<O>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
88 my $line = $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
89 $line =~s/\n//g;$line =~s/\r//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
90 my @infos = split(/\t/,$line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
91 my $cluster = $infos[0];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
92 print S ">$cluster\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
93 for (my $i=1; $i <= $#infos; $i++){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
94 my @genenames = split(/,/,$infos[$i]);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
95 foreach my $genename(@genenames){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
96 if ($genename=~/\w+/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
97 $genes_of_cluster{$cluster} .= "$genename,";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
98 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
99 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
100 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
101 for (my $i=1; $i <= $#infos; $i++){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
102 if ($infos[$i] =~/-/){$accessory_clusters{$cluster} = 1;}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
103 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
104 LINE: for (my $i=1; $i <= $#infos; $i++){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
105 my @genenames = split(/,/,$infos[$i]);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
106 foreach my $genename(@genenames){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
107 if ($genename=~/\w+/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
108 #$genename =~s/\|/_/g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
109 print S $proteins{$genename};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
110 $cluster_of_gene{$genename} = $cluster;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
111 my $function = `grep '$genename' $prot_dir/*func`;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
112 $function =~s/\n//g;$function =~s/\r//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
113 my @t = split(/$genename/,$function);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
114 if ($t[1] =~/-\s+(.*)/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
115 $function = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
116 $function =~s/'//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
117 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
118 $functions{$cluster} = $function;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
119 last LINE;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
120 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
121 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
122 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
123 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
124 close(O);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
125 close(S);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
126
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
127
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
128 #########################################################################################################################################
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
129 # Test if COG are already provided in Genbank files. If provided, rpsblast is not launched, we get the COG information from Genbank files
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
130 #########################################################################################################################################
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
131 my %cog_of_genes;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
132 open(LS,"ls $prot_dir/*gb |");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
133 while(my $gb_file = <LS>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
134 chomp($gb_file);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
135 open(GB,$gb_file);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
136 my $current_cog;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
137 my $current_gene;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
138 while(my $l=<GB>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
139 if ($l=~/(COG\d+)/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
140 $current_cog = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
141 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
142 if ($l =~/protein_id=\"(.*)\"/ && $current_cog){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
143 $current_gene = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
144 $cog_of_genes{$current_gene} .= ",$current_cog";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
145 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
146 if ($l =~/locus_tag=\"(.*)\"/ && $current_cog){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
147 $current_gene = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
148 $cog_of_genes{$current_gene} .= ",$current_cog";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
149 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
150 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
151 close(GB);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
152 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
153 close(LS);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
154
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
155 my %count_letter;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
156 if (scalar keys(%cog_of_genes) > 1){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
157 open(WHOG,"$dirname/../COG/whog");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
158 my %letters_of_cog;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
159 while(my $l=<WHOG>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
160 if ($l=~/\[(\w+)\] (COG\d+) /){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
161 my $letter = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
162 my $cogid = $2;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
163 $letters_of_cog{$cogid} = $letter;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
164 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
165 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
166 close(WHOG);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
167
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
168 my %cogs_of_cluster;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
169 foreach my $gene(keys(%cog_of_genes)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
170 my $cluster = $cluster_of_gene{$gene};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
171 my $cogs = $cog_of_genes{$gene};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
172 $cogs_of_cluster{$cluster} = $cogs;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
173 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
174 open(COG_CLUST,">$cog_clusters");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
175 foreach my $cluster(sort {$a<=>$b} keys(%cogs_of_cluster)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
176 my $cogids = $cogs_of_cluster{$cluster};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
177 my @ids = split(/,/,$cogids);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
178 foreach my $id(@ids){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
179 if ($id =~/\w+/ && $cluster){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
180 my $letter = $letters_of_cog{$id};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
181 my @letters = split(//,$letter);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
182 my $letters_string = join("\t",@letters);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
183 #$letter =~s//\t/g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
184 print COG_CLUST "$cluster $id $letters_string\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
185 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
186 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
187 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
188 close(COG_CLUST);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
189 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
190 else{
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
191
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
192 my $is_plus = `which rpsblast+`;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
193 my $is_not_plus = `which rpsblast`;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
194 my $software = "rpsblast";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
195 if ($is_plus){$software = "rpsblast+";}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
196 system("$software -query $pav_matrix.selection_prot.fa -db $dirname/../COG/Cog -out $pav_matrix.selection.rps-blast.out -evalue 1e-2 -outfmt '7 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qcovs'");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
197
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
198 system("perl $dirname/../COG/bac-genomics-scripts/cdd2cog/cdd2cog.pl -r $pav_matrix.selection.rps-blast.out -c $dirname/../COG/cddid.tbl -f $dirname/../COG/fun.txt -w $dirname/../COG/whog -a");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
199 system("cp -rf results/protein-id_cog.txt $cog_clusters");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
200 system("rm -rf results");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
201 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
202
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
203 open(COG,">$cog_outfile");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
204 my %cogs;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
205 open(C,$cog_clusters);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
206 while(<C>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
207 my $line = $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
208 $line =~s/\n//g;$line =~s/\r//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
209 my @infos = split(/\t/,$line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
210 my $coginfo = "";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
211 my $cluster = $infos[0];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
212 for (my $i = 1; $i <= $#infos; $i++){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
213 $coginfo .= "\t".$infos[$i];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
214 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
215 my $gene_list = $genes_of_cluster{$cluster};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
216 chop($gene_list);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
217 my @genes = split(",",$gene_list);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
218 foreach my $gene(@genes){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
219 my @letters = split(/\t/,$coginfo);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
220 my $strain = $strain_of_prot{$gene};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
221 for (my $i = 2; $i <= $#letters; $i++){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
222 my $letter = $letters[$i];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
223 my $cat = $cogs_categories{$letter};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
224 $count_letter{$strain}{$letter}++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
225 $count_letter{$strain}{$cat}++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
226 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
227 print COG $gene.$coginfo."\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
228 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
229 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
230 close(C);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
231 close(COG);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
232
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
233
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
234
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
235 my %cogs_of_clusters;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
236 my %cogcats_of_clusters;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
237 open(C,$cog_clusters);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
238 while(<C>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
239 my $line = $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
240 $line =~s/\n//g;$line =~s/\r//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
241 my @infos = split(/\t/,$line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
242 my $cluster = $infos[0];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
243 my $cog = $infos[1];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
244 my $cog_category = $infos[2];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
245 $cogs_of_clusters{$cluster}{$cog} = 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
246 $cogcats_of_clusters{$cluster} = $cog_category;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
247 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
248 close(C);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
249
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
250 open(CC,">$cog_clusters.2");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
251 foreach my $cluster(sort{$a<=>$b} keys(%genes_of_cluster)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
252 my $cog_category = "Unknown";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
253 my $cog_list = "Unknown";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
254 if ($cogcats_of_clusters{$cluster}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
255 $cog_category = $cogcats_of_clusters{$cluster};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
256 my $ref_cogs_of_clusters = $cogs_of_clusters{$cluster};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
257 $cog_list = join(",",keys(%$ref_cogs_of_clusters));
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
258 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
259 my $function = "Unknown";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
260 if ($functions{$cluster}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
261 $function = $functions{$cluster};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
262 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
263 if ($accessory_clusters{$cluster}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
264 print CC "$cluster\t$cog_list\t$cog_category\t$function\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
265 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
266 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
267 close(CC);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
268
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
269 my @cat_of_cat = ("INFORMATION STORAGE AND PROCESSING","METABOLISM","CELLULAR PROCESSES AND SIGNALING","POORLY CHARACTERIZED");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
270 my @cog_categories = ("D","M","N","O","T","U","V","W","Y","Z","A","B","J","K","L","C","E","F","G","H","I","P","Q","R","S");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
271 open(COG_STAT,">$cog_stats");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
272 open(COG_STAT2,">$cog_stats2");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
273 print COG_STAT "COG\t".join("\t",@cog_categories)."\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
274 print COG_STAT2 "COG\t".join("\t",@cat_of_cat)."\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
275 foreach my $strain(keys(%count_letter)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
276 my $strain_name = $strain_names{$strain};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
277 print COG_STAT "$strain_name";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
278 print COG_STAT2 "$strain_name";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
279 my $ref_subhash = $count_letter{$strain};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
280 my %subhash = %$ref_subhash;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
281 foreach my $letter(@cog_categories){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
282 my $n = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
283 if ($count_letter{$strain}{$letter}){$n = $count_letter{$strain}{$letter};}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
284 print COG_STAT "\t".$n;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
285 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
286 print COG_STAT "\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
287
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
288 foreach my $cat(@cat_of_cat){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
289 my $n = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
290 if ($count_letter{$strain}{$cat}){$n = $count_letter{$strain}{$cat};}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
291 print COG_STAT2 "\t".$n;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
292 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
293 print COG_STAT2 "\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
294 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
295 close(COG_STAT);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
296 close(COG_STAT2);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
297