Mercurial > repos > dereeper > pangenome_explorer
view PanExplorer_workflow/Perl/CalculateCogEnrichment.pl @ 2:97e4e3e818b6 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:48:09 +0000 |
parents | 032f6b3806a3 |
children |
line wrap: on
line source
#!/usr/bin/perl use strict; my $pav_matrix = $ARGV[0]; my $cog_of_clusters = $ARGV[1]; my $output = $ARGV[2]; my %categories = ("Unknown"=>1); my %cog_categories; my %cogs_for_cluster; open(COG,$cog_of_clusters); while(<COG>){ my $line = $_; $line =~s/\n//g;$line =~s/\r//g; my ($cluster,$cog,$category) = split("\t",$line); #if (!$cogs_for_cluster{$cluster}){ $cogs_for_cluster{$cluster} .= "$cog,"; #} $cog_categories{$cog} = $category; $categories{$category}=1; } close(COG); my %hash; my %hash2; open(F,$pav_matrix); my $first_line = <F>; $first_line =~s/\n//g;$first_line =~s/\r//g; my @samples = split(/\t/,$first_line); my $nb_total = 0; my $nb_core = 0; my $nb_accessory = 0; print C "Cluster\tFunction\tCOG\tCOG categories\n"; print D "Cluster\tFound in N strains\tFunction\tCOG\tCOG categories\n"; while(<F>){ my @i = split("\t",$_); my $cluster_num = $i[0]; my $clnb = $i[0]; my $pos = $clnb; my $pos_before = $pos - 1; my $nb_found = 0; my $concatenate_samples = ""; my $strain_found = 0; my %functions_of_genes; for (my $j = 1; $j <= $#i; $j++){ my $val = $i[$j]; if ($val =~/\w+/){ my @genes = split(",",$val); $nb_found++; } } # core-genes my $type = ""; if ($nb_found == (scalar @samples - 1)){ $type = "core"; $nb_core++; } # dispensables else{ $type = "accessory"; $nb_accessory++; } my @cogs = split(/,/,$cogs_for_cluster{$clnb}); my %cat; foreach my $cog(@cogs){ if ($cog =~/\w+/){ my $category = $cog_categories{$cog}; $cat{$category}=1; } } if (!$cogs_for_cluster{$clnb}){ $cat{"Unknown"}=1; } foreach my $category(keys(%categories)){ if ($cat{$category}){ $hash{$category}{$type}++; } else{ $hash{"not in $category"}{$type}++; } } } close(F); open(OUT,">$output"); my %results; foreach my $category(keys(%categories)){ my $n1 = 0; if ($hash{$category}{"core"}){ $n1 = $hash{$category}{"core"}; } my $n2 = 0; if ($hash{$category}{"accessory"}){ $n2 = $hash{$category}{"accessory"}; } my $n3 = 0; if ($hash{"not in $category"}{"core"}){ $n3 = $hash{"not in $category"}{"core"}; } my $n4 = 0; if ($hash{"not in $category"}{"accessory"}){ $n4 = $hash{"not in $category"}{"accessory"}; } # core open(R,">$pav_matrix.$category.core.script.R"); print R "(tab <- matrix(c($n1, $n2, $n3, $n4), nrow=2))\n"; my $ratio1 = $n1/$n3; my $ratio2 = $n2/$n4; print OUT "core $category $n1 $n2 $n3 $n4 $ratio1 $ratio2\n"; print R "fisher.test(tab)\n"; close(R); system("Rscript $pav_matrix.$category.core.script.R >$pav_matrix.$category.core.script.R.out"); open(O,"$pav_matrix.$category.core.script.R.out"); my $go_odds = 0; while(<O>){ if (/p-value [=<] (.*)/){ my $pvalue = $1; $results{$category}{"core"}{"pval"} = $pvalue; } if (/odds/){$go_odds = 1;} if (/([\d\.]*)/ && $go_odds){ my $odds = $1; $results{$category}{"core"}{"odds_ratio"} = $odds; } } close(O); if ($category ne 'Unknown'){ unlink("$pav_matrix.$category.core.script.R.out"); unlink("$pav_matrix.$category.core.script.R"); } # dispensable my $n1 = 0; if ($hash{$category}{"accessory"}){ $n1 = $hash{$category}{"accessory"}; } my $n2 = 0; if ($hash{$category}{"core"}){ $n2= $hash{$category}{"core"}; } my $n3 = 0; if ($hash{"not in $category"}{"accessory"}){ $n3 = $hash{"not in $category"}{"accessory"}; } my $n4 = 0; if ($hash{"not in $category"}{"core"}){ $n4 = $hash{"not in $category"}{"core"}; } open(R,">$pav_matrix.$category.accessory.script.R"); print R "(tab <- matrix(c($n1, $n3, $n2, $n4), nrow=2))\n"; print R "fisher.test(tab)\n"; close(R); print "dispensable, $category: $n1, $n2, $n3, $n4\n"; system("Rscript $pav_matrix.$category.accessory.script.R >$pav_matrix.$category.accessory.script.R.out"); open(O,"$pav_matrix.$category.accessory.script.R.out"); my $go_odds = 0; while(<O>){ if (/p-value [=<] (.*)/){ my $pvalue = $1; $results{$category}{"accessory"}{"pval"} = $pvalue; } if (/odds/){$go_odds = 1;} if (/([\d\.]*)/ && $go_odds){ my $odds = $1; $results{$category}{"accessory"}{"odds_ratio"} = $odds; } } close(O); if ($category ne 'Unknown'){ unlink("$pav_matrix.$category.accessory.script.R"); unlink("$pav_matrix.$category.accessory.script.R.out"); } } foreach my $category(keys(%results)){ print "$category:"; print " ".$results{$category}{"core"}{"odds_ratio"}; print " ".$results{$category}{"core"}{"pval"}; print " ".$results{$category}{"accessory"}{"odds_ratio"}; print " ".$results{$category}{"accessory"}{"pval"}; print "\n"; } close(OUT);