Mercurial > repos > dereeper > pangenome_explorer
view PanExplorer_workflow/Perl/GetCogOfCluster.pl @ 2:97e4e3e818b6 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:48:09 +0000 |
parents | 032f6b3806a3 |
children |
line wrap: on
line source
#!/usr/bin/perl use strict; use File::Basename; my $dirname = dirname(__FILE__); my $pav_matrix = $ARGV[0]; my $prot_dir = $ARGV[1]; my $cog_outfile = $ARGV[2]; my $cog_stats = $ARGV[3]; my $cog_stats2 = $ARGV[4]; my $cog_clusters = $ARGV[5]; my $strain_info_file = $ARGV[6]; my %cogs_categories = ( "J"=>"INFORMATION STORAGE AND PROCESSING", "A"=>"INFORMATION STORAGE AND PROCESSING", "K"=>"INFORMATION STORAGE AND PROCESSING", "L"=>"INFORMATION STORAGE AND PROCESSING", "B"=>"INFORMATION STORAGE AND PROCESSING", "D"=>"CELLULAR PROCESSES AND SIGNALING", "Y"=>"CELLULAR PROCESSES AND SIGNALING", "V"=>"CELLULAR PROCESSES AND SIGNALING", "T"=>"CELLULAR PROCESSES AND SIGNALING", "M"=>"CELLULAR PROCESSES AND SIGNALING", "N"=>"CELLULAR PROCESSES AND SIGNALING", "Z"=>"CELLULAR PROCESSES AND SIGNALING", "W"=>"CELLULAR PROCESSES AND SIGNALING", "U"=>"CELLULAR PROCESSES AND SIGNALING", "O"=>"CELLULAR PROCESSES AND SIGNALING", "C"=>"METABOLISM", "G"=>"METABOLISM", "E"=>"METABOLISM", "F"=>"METABOLISM", "H"=>"METABOLISM", "I"=>"METABOLISM", "P"=>"METABOLISM", "Q"=>"METABOLISM", "R"=>"POORLY CHARACTERIZED", "S"=>"POORLY CHARACTERIZED" ); my %strain_names; open(S,$strain_info_file); while(<S>){ my $line = $_; $line =~s/\n//g;$line =~s/\r//g; my ($id,$strain_name) = split(/\t/,$line); $strain_names{$id} = $strain_name; } close(S); my %strain_of_prot; my %proteins; open( DIR, "ls $prot_dir/*pep |" ); while(<DIR>) { my $filename = $_; my $strain; my $id; if ($filename =~/\/([^\/]*).pep/){ $strain = $1; } #open(F,"zcat $filename|" ); open(F,"$filename" ); while(<F>){ if (/>(.*)/){ $id = $1; $strain_of_prot{$id} = $strain; } else{ $proteins{$id}.= $_; } } close(F); } closedir(DIR); my %functions; my %accessory_clusters; my %genes_of_cluster; my %cluster_of_gene; open(S,">$pav_matrix.selection_prot.fa"); open(O,$pav_matrix); <O>; while(<O>){ my $line = $_; $line =~s/\n//g;$line =~s/\r//g; my @infos = split(/\t/,$line); my $cluster = $infos[0]; print S ">$cluster\n"; for (my $i=1; $i <= $#infos; $i++){ my @genenames = split(/,/,$infos[$i]); foreach my $genename(@genenames){ if ($genename=~/\w+/){ $genes_of_cluster{$cluster} .= "$genename,"; } } } for (my $i=1; $i <= $#infos; $i++){ if ($infos[$i] =~/-/){$accessory_clusters{$cluster} = 1;} } LINE: for (my $i=1; $i <= $#infos; $i++){ my @genenames = split(/,/,$infos[$i]); foreach my $genename(@genenames){ if ($genename=~/\w+/){ #$genename =~s/\|/_/g; print S $proteins{$genename}; $cluster_of_gene{$genename} = $cluster; my $function = `grep '$genename' $prot_dir/*func`; $function =~s/\n//g;$function =~s/\r//g; my @t = split(/$genename/,$function); if ($t[1] =~/-\s+(.*)/){ $function = $1; $function =~s/'//g; } $functions{$cluster} = $function; last LINE; } } } } close(O); close(S); ######################################################################################################################################### # Test if COG are already provided in Genbank files. If provided, rpsblast is not launched, we get the COG information from Genbank files ######################################################################################################################################### my %cog_of_genes; open(LS,"ls $prot_dir/*gb |"); while(my $gb_file = <LS>){ chomp($gb_file); open(GB,$gb_file); my $current_cog; my $current_gene; while(my $l=<GB>){ if ($l=~/(COG\d+)/){ $current_cog = $1; } if ($l =~/protein_id=\"(.*)\"/ && $current_cog){ $current_gene = $1; $cog_of_genes{$current_gene} .= ",$current_cog"; } if ($l =~/locus_tag=\"(.*)\"/ && $current_cog){ $current_gene = $1; $cog_of_genes{$current_gene} .= ",$current_cog"; } } close(GB); } close(LS); my %count_letter; if (scalar keys(%cog_of_genes) > 1){ open(WHOG,"$dirname/../COG/whog"); my %letters_of_cog; while(my $l=<WHOG>){ if ($l=~/\[(\w+)\] (COG\d+) /){ my $letter = $1; my $cogid = $2; $letters_of_cog{$cogid} = $letter; } } close(WHOG); my %cogs_of_cluster; foreach my $gene(keys(%cog_of_genes)){ my $cluster = $cluster_of_gene{$gene}; my $cogs = $cog_of_genes{$gene}; $cogs_of_cluster{$cluster} = $cogs; } open(COG_CLUST,">$cog_clusters"); foreach my $cluster(sort {$a<=>$b} keys(%cogs_of_cluster)){ my $cogids = $cogs_of_cluster{$cluster}; my @ids = split(/,/,$cogids); foreach my $id(@ids){ if ($id =~/\w+/ && $cluster){ my $letter = $letters_of_cog{$id}; my @letters = split(//,$letter); my $letters_string = join("\t",@letters); #$letter =~s//\t/g; print COG_CLUST "$cluster $id $letters_string\n"; } } } close(COG_CLUST); } else{ my $is_plus = `which rpsblast+`; my $is_not_plus = `which rpsblast`; my $software = "rpsblast"; if ($is_plus){$software = "rpsblast+";} system("$software -query $pav_matrix.selection_prot.fa -db $dirname/../COG/Cog -out $pav_matrix.selection.rps-blast.out -evalue 1e-2 -outfmt '7 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qcovs'"); system("perl $dirname/../COG/bac-genomics-scripts/cdd2cog/cdd2cog.pl -r $pav_matrix.selection.rps-blast.out -c $dirname/../COG/cddid.tbl -f $dirname/../COG/fun.txt -w $dirname/../COG/whog -a"); system("cp -rf results/protein-id_cog.txt $cog_clusters"); system("rm -rf results"); } open(COG,">$cog_outfile"); my %cogs; open(C,$cog_clusters); while(<C>){ my $line = $_; $line =~s/\n//g;$line =~s/\r//g; my @infos = split(/\t/,$line); my $coginfo = ""; my $cluster = $infos[0]; for (my $i = 1; $i <= $#infos; $i++){ $coginfo .= "\t".$infos[$i]; } my $gene_list = $genes_of_cluster{$cluster}; chop($gene_list); my @genes = split(",",$gene_list); foreach my $gene(@genes){ my @letters = split(/\t/,$coginfo); my $strain = $strain_of_prot{$gene}; for (my $i = 2; $i <= $#letters; $i++){ my $letter = $letters[$i]; my $cat = $cogs_categories{$letter}; $count_letter{$strain}{$letter}++; $count_letter{$strain}{$cat}++; } print COG $gene.$coginfo."\n"; } } close(C); close(COG); my %cogs_of_clusters; my %cogcats_of_clusters; open(C,$cog_clusters); while(<C>){ my $line = $_; $line =~s/\n//g;$line =~s/\r//g; my @infos = split(/\t/,$line); my $cluster = $infos[0]; my $cog = $infos[1]; my $cog_category = $infos[2]; $cogs_of_clusters{$cluster}{$cog} = 1; $cogcats_of_clusters{$cluster} = $cog_category; } close(C); open(CC,">$cog_clusters.2"); foreach my $cluster(sort{$a<=>$b} keys(%genes_of_cluster)){ my $cog_category = "Unknown"; my $cog_list = "Unknown"; if ($cogcats_of_clusters{$cluster}){ $cog_category = $cogcats_of_clusters{$cluster}; my $ref_cogs_of_clusters = $cogs_of_clusters{$cluster}; $cog_list = join(",",keys(%$ref_cogs_of_clusters)); } my $function = "Unknown"; if ($functions{$cluster}){ $function = $functions{$cluster}; } if ($accessory_clusters{$cluster}){ print CC "$cluster\t$cog_list\t$cog_category\t$function\n"; } } close(CC); my @cat_of_cat = ("INFORMATION STORAGE AND PROCESSING","METABOLISM","CELLULAR PROCESSES AND SIGNALING","POORLY CHARACTERIZED"); my @cog_categories = ("D","M","N","O","T","U","V","W","Y","Z","A","B","J","K","L","C","E","F","G","H","I","P","Q","R","S"); open(COG_STAT,">$cog_stats"); open(COG_STAT2,">$cog_stats2"); print COG_STAT "COG\t".join("\t",@cog_categories)."\n"; print COG_STAT2 "COG\t".join("\t",@cat_of_cat)."\n"; foreach my $strain(keys(%count_letter)){ my $strain_name = $strain_names{$strain}; print COG_STAT "$strain_name"; print COG_STAT2 "$strain_name"; my $ref_subhash = $count_letter{$strain}; my %subhash = %$ref_subhash; foreach my $letter(@cog_categories){ my $n = 0; if ($count_letter{$strain}{$letter}){$n = $count_letter{$strain}{$letter};} print COG_STAT "\t".$n; } print COG_STAT "\n"; foreach my $cat(@cat_of_cat){ my $n = 0; if ($count_letter{$strain}{$cat}){$n = $count_letter{$strain}{$cat};} print COG_STAT2 "\t".$n; } print COG_STAT2 "\n"; } close(COG_STAT); close(COG_STAT2);