Mercurial > repos > cristian > rbgoa
view gomwu_b.pl @ 2:5acf9dfdfa27 draft default tip
planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
author | cristian |
---|---|
date | Wed, 09 Nov 2022 08:57:54 +0000 |
parents | f7287f82602f |
children |
line wrap: on
line source
#!/usr/bin/env perl use File::Basename; my $usage= " gomwu_b.pl (v. Feb 2015): This is the second script in the GO database slimming and reformatting procedure, called automatically by goStats.R See README_GO_MWU.txt file for details. Mikhail Matz, UT Austin; matz@utexas.edu "; my $gen2go=shift; my $measure=shift; ($mname,$mdir,$mext) = fileparse($measure,'\..*'); ($aname,$adir,$aext) = fileparse($gen2go,'\..*'); my $div=shift or die "$usage\nNot enough arguments for gomwu_b.pl\n"; my $clfile=$mdir."cl_dissim0_".$div."_".$aname.$aext; open CLF, $clfile or die "cannot locate primary clustering file $clfile\n"; my %clgo={}; my $go; my $cl; <CLF>; while(<CLF>){ ($go, $cl)=split(/,/,$_); push @{$clgo{$cl}},$go; } close CLF; unlink $clfile; unlink $mdir."dissim0_".$div."_".$aname.$aext; opendir THISDIR, $mdir; my @donealready=grep /$gen2go/, readdir THISDIR; my $dones=" "."@donealready"." "; #print "DONE: $dones\n"; my $inname2=$mdir.$mname.".".$div.".tmp"; open TAB, $inname2 or die "go_nrify: cannot open input table $inname2\n"; <TAB>; my %level={}; my %desc={}; my %value={}; my $des; my $go; my $l; my $gn; my $val; my @gos=(); my %genes={}; my @gcount=(); my %gci={}; my %goi={}; while (<TAB>){ chomp; ($des,$go,$l,$val,$gn)=split(/\t/,$_); $value{$gn}=$val; $desc{$go}=$des; push @{$genes{$go}},$gn; push @gcount, $gn unless ($gci{$gn}==1) ; $gci{$gn}=1; # push @genes,$gn; unless ($goi{$go}==1){ $desc{$go}=$des; $level{$go}=$l; push @gos, $go; $goi{$go}=1; } } close TAB; unlink $inname2; #-------------------- my @nrgos=(); my %nrlev={}; my @nrgos=(); my %nrgenes={}; my %nrdesc={}; my $gcount=0; my $cl; my $go; my $gene; foreach $cl (keys %clgo){ my $largest=${$clgo{$cl}}[0]; my $maxgenes=$#{$genes{$largest}}+1; my $maxlevel=$level{$largest}; $gcount+=$#{$genes{$largest}}+1; my @nrgens=@{$genes{$largest}}; if ($#{$clgo{$cl}}>0) { foreach $go (@{$clgo{$cl}}) { if ($maxgenes<($#{$genes{$go}}+1)) { $maxgenes=($#{$genes{$go}}+1); $largest=$go; } elsif ($maxgenes==($#{$genes{$go}}+1)) { if ($maxlevel<$level{$go}){ $maxlevel=$level{$go}; $largest=$go; } } foreach $gene (@{$genes{$go}}){ push @nrgens, $gene unless(" @nrgens "=~/ $gene /); } } } my $goos=join(";",@{$clgo{$cl}}); push @nrgos, $goos; $nrdesc{$goos}=$desc{$largest}; $nrlev{$goos}=$maxlevel; @{$nrgenes{$goos}}=@nrgens; $gcount+=$#nrgens+1; } print $#nrgos+1," non-redundant GO categories of good size\n-------------\n"; $outname=$mdir.$mname."_".$div.".tsv"; print("creating $outname"); open OUT, ">$outname" or die "gomwu_b: cannot create output $outname\n"; print {OUT} "name\tterm\tlev\tseq\tvalue\n"; foreach $go (@nrgos) { foreach $gene (@{$nrgenes{$go}}){ print {OUT} "$nrdesc{$go}\t$go\t$nrlev{$go}\t$gene\t$value{$gene}\n"; } } close OUT; my %level={}; my %desc={}; my %value={}; my $des; my $go; my $l; my $gn; my $val; my @gos=(); my %genes={}; my @gcount=(); my %nrlev={}; my @nrgos=(); my %nrgenes={}; my %nrdesc={}; my $gcount=0; my %dnds; #################### # building dissimilarity matrix my $inname4=$mdir."dissim_".$div."_".$mname."_".$aname.$aext; my $inname3=$mdir.$mname."_".$div.".tsv"; #if($dones!~/ $inname4 /) { use List::Util qw[min max]; open TAB, $inname3 or die "go_cluster: cannot open input table $inname3\n"; <TAB>; # my $des; # my $go; # my $l; # my $gn; # my $val; # my @gos=(); # my %genes={}; my %gosi={}; print"\nSecondary clustering:\ncalculating similarities....\n"; while (<TAB>){ chomp; ($des,$go,$l,$gn,$val)=split(/\t/,$_); push @{$genes{$go}},$gn; unless ($gosi{$go}==1 ){ push @gos, $go; $gosi{$go}=1; } } my %dissim={}; for ($g1=0;$g1<$#gos;$g1++){ my $go=@gos[$g1]; #if ($go eq "unknown") { print "unknown as go\n";} for ($g2=$g1+1;$g2<=$#gos;$g2++){ my $go2=@gos[$g2]; if ($go2 eq "unknown") { #print "$go against $go2\n"; $dissim{$go,$go2}=$dissim{$go2,$go}=1; next; } my %seen={}; my $count=0; foreach $g (@{$genes{$go}},@{$genes{$go2}}){ unless($seen{$g}==1 ){ $count++; $seen{$g}=1; } } my $shared=$#{$genes{$go}}+1+$#{$genes{$go2}}+1-$count; my $ref=min($#{$genes{$go}}+1,$#{$genes{$go2}}+1); $dissim{$go,$go2}=$dissim{$go2,$go}=sprintf("%.3f",1-$shared/$ref); } } open OUT, ">$inname4" or die "gomwu_b: cannot create output $inname4\n"; print {OUT} join("\t",@gos),"\n"; foreach $go (@gos) { $dissim{$go,$go}=0; foreach $go2 (@gos){ print {OUT} "$dissim{$go,$go2}"; print {OUT} "\t" unless ($go2 eq $gos[$#gos]); } print {OUT} "\n"; } #}