Mercurial > repos > dereeper > pangenome_explorer
diff Perl/CalculateCogEnrichment.pl @ 3:e42d30da7a74 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:52:25 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Perl/CalculateCogEnrichment.pl Thu May 30 11:52:25 2024 +0000 @@ -0,0 +1,189 @@ +#!/usr/bin/perl + +use strict; + +my $pav_matrix = $ARGV[0]; +my $cog_of_clusters = $ARGV[1]; +my $output = $ARGV[2]; + +my %categories = ("Unknown"=>1); +my %cog_categories; +my %cogs_for_cluster; +open(COG,$cog_of_clusters); +while(<COG>){ + my $line = $_; + $line =~s/\n//g;$line =~s/\r//g; + my ($cluster,$cog,$category) = split("\t",$line); + #if (!$cogs_for_cluster{$cluster}){ + $cogs_for_cluster{$cluster} .= "$cog,"; + #} + $cog_categories{$cog} = $category; + $categories{$category}=1; +} +close(COG); + +my %hash; +my %hash2; +open(F,$pav_matrix); +my $first_line = <F>; +$first_line =~s/\n//g;$first_line =~s/\r//g; +my @samples = split(/\t/,$first_line); +my $nb_total = 0; +my $nb_core = 0; +my $nb_accessory = 0; +print C "Cluster\tFunction\tCOG\tCOG categories\n"; +print D "Cluster\tFound in N strains\tFunction\tCOG\tCOG categories\n"; +while(<F>){ + my @i = split("\t",$_); + my $cluster_num = $i[0]; + my $clnb = $i[0]; + my $pos = $clnb; + my $pos_before = $pos - 1; + my $nb_found = 0; + my $concatenate_samples = ""; + my $strain_found = 0; + my %functions_of_genes; + for (my $j = 1; $j <= $#i; $j++){ + my $val = $i[$j]; + if ($val =~/\w+/){ + my @genes = split(",",$val); + $nb_found++; + } + } + # core-genes + my $type = ""; + if ($nb_found == (scalar @samples - 1)){ + $type = "core"; + $nb_core++; + } + # dispensables + else{ + $type = "accessory"; + $nb_accessory++; + } + my @cogs = split(/,/,$cogs_for_cluster{$clnb}); + my %cat; + foreach my $cog(@cogs){ + if ($cog =~/\w+/){ + my $category = $cog_categories{$cog}; + $cat{$category}=1; + } + } + if (!$cogs_for_cluster{$clnb}){ + $cat{"Unknown"}=1; + } + + foreach my $category(keys(%categories)){ + if ($cat{$category}){ + $hash{$category}{$type}++; + } + else{ + $hash{"not in $category"}{$type}++; + } + } +} +close(F); + +open(OUT,">$output"); +my %results; +foreach my $category(keys(%categories)){ + my $n1 = 0; + if ($hash{$category}{"core"}){ + $n1 = $hash{$category}{"core"}; + } + my $n2 = 0; + if ($hash{$category}{"accessory"}){ + $n2 = $hash{$category}{"accessory"}; + } + my $n3 = 0; + if ($hash{"not in $category"}{"core"}){ + $n3 = $hash{"not in $category"}{"core"}; + } + my $n4 = 0; + if ($hash{"not in $category"}{"accessory"}){ + $n4 = $hash{"not in $category"}{"accessory"}; + } + + # core + open(R,">$pav_matrix.$category.core.script.R"); + print R "(tab <- matrix(c($n1, $n2, $n3, $n4), nrow=2))\n"; + my $ratio1 = $n1/$n3; + my $ratio2 = $n2/$n4; + print OUT "core $category $n1 $n2 $n3 $n4 $ratio1 $ratio2\n"; + print R "fisher.test(tab)\n"; + close(R); + system("Rscript $pav_matrix.$category.core.script.R >$pav_matrix.$category.core.script.R.out"); + open(O,"$pav_matrix.$category.core.script.R.out"); + my $go_odds = 0; + while(<O>){ + if (/p-value [=<] (.*)/){ + my $pvalue = $1; + $results{$category}{"core"}{"pval"} = $pvalue; + } + if (/odds/){$go_odds = 1;} + if (/([\d\.]*)/ && $go_odds){ + my $odds = $1; + + $results{$category}{"core"}{"odds_ratio"} = $odds; + } + } + close(O); + + if ($category ne 'Unknown'){ + unlink("$pav_matrix.$category.core.script.R.out"); + unlink("$pav_matrix.$category.core.script.R"); + } + + # dispensable + my $n1 = 0; + if ($hash{$category}{"accessory"}){ + $n1 = $hash{$category}{"accessory"}; + } + my $n2 = 0; + if ($hash{$category}{"core"}){ + $n2= $hash{$category}{"core"}; + } + my $n3 = 0; + if ($hash{"not in $category"}{"accessory"}){ + $n3 = $hash{"not in $category"}{"accessory"}; + } + my $n4 = 0; + if ($hash{"not in $category"}{"core"}){ + $n4 = $hash{"not in $category"}{"core"}; + } + + open(R,">$pav_matrix.$category.accessory.script.R"); + print R "(tab <- matrix(c($n1, $n3, $n2, $n4), nrow=2))\n"; + print R "fisher.test(tab)\n"; + close(R); + print "dispensable, $category: $n1, $n2, $n3, $n4\n"; + system("Rscript $pav_matrix.$category.accessory.script.R >$pav_matrix.$category.accessory.script.R.out"); + open(O,"$pav_matrix.$category.accessory.script.R.out"); + my $go_odds = 0; + while(<O>){ + if (/p-value [=<] (.*)/){ + my $pvalue = $1; + $results{$category}{"accessory"}{"pval"} = $pvalue; + } + if (/odds/){$go_odds = 1;} + if (/([\d\.]*)/ && $go_odds){ + my $odds = $1; + $results{$category}{"accessory"}{"odds_ratio"} = $odds; + } + } + close(O); + if ($category ne 'Unknown'){ + unlink("$pav_matrix.$category.accessory.script.R"); + unlink("$pav_matrix.$category.accessory.script.R.out"); + } +} + +foreach my $category(keys(%results)){ + print "$category:"; + print " ".$results{$category}{"core"}{"odds_ratio"}; + print " ".$results{$category}{"core"}{"pval"}; + print " ".$results{$category}{"accessory"}{"odds_ratio"}; + print " ".$results{$category}{"accessory"}{"pval"}; + print "\n"; +} +close(OUT);