diff gomwu_b.pl @ 0:91261b42c07e draft

"planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
author cristian
date Thu, 14 Apr 2022 13:28:05 +0000
parents
children f7287f82602f
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gomwu_b.pl	Thu Apr 14 13:28:05 2022 +0000
@@ -0,0 +1,217 @@
+#!/usr/bin/env perl
+
+my $usage= "
+
+gomwu_b.pl  (v. Feb 2015):
+
+This is the second script in the GO database slimming and reformatting procedure,
+called automatically by goStats.R 
+
+See README_GO_MWU.txt file for details.
+
+Mikhail Matz, UT Austin; matz@utexas.edu
+
+";
+
+my $gen2go=shift;
+my $measure=shift;
+my $div=shift or die "$usage\nNot enough arguments for gomwu_b.pl\n";
+
+my $clfile="cl_dissim0_".$div."_".$gen2go; 
+open CLF, $clfile or die "cannot locate primary clustering file $clfile\n";
+my %clgo={};
+my $go;
+my $cl;
+
+<CLF>;
+while(<CLF>){
+	($go, $cl)=split(/,/,$_);
+	push @{$clgo{$cl}},$go;
+}
+close CLF;
+
+unlink $clfile;
+unlink "dissim0_".$div."_".$gen2go; 
+
+opendir THISDIR, ".";
+my @donealready=grep /$gen2go/, readdir THISDIR;
+my $dones=" "."@donealready"." ";
+#print "DONE: $dones\n";
+
+my $inname2=$measure.".".$div.".tmp";
+open TAB, $inname2 or die "go_nrify: cannot open input table $inname2\n";
+<TAB>;
+my %level={};
+my %desc={};
+my %value={};
+my $des;
+my $go; 
+my $l;
+my $gn;
+my $val;
+my @gos=();
+my %genes={};
+my @gcount=();
+my %gci={};
+my %goi={};
+
+while (<TAB>){
+	chomp;
+	($des,$go,$l,$val,$gn)=split(/\t/,$_);
+	$value{$gn}=$val;
+	$desc{$go}=$des;
+	push @{$genes{$go}},$gn;
+	push @gcount, $gn unless ($gci{$gn}==1) ;
+	$gci{$gn}=1;
+#	push @genes,$gn;
+	unless ($goi{$go}==1){
+		$desc{$go}=$des;
+		$level{$go}=$l;
+		push @gos, $go;
+		$goi{$go}=1;
+	}
+}
+close TAB;
+unlink $inname2;
+
+#--------------------
+
+
+my @nrgos=();
+my %nrlev={};
+my @nrgos=();
+my %nrgenes={};
+my %nrdesc={};
+my $gcount=0;
+my $cl;
+my $go;
+my $gene;
+
+foreach $cl (keys %clgo){
+	my $largest=${$clgo{$cl}}[0];	
+	my $maxgenes=$#{$genes{$largest}}+1;
+	my $maxlevel=$level{$largest};
+	$gcount+=$#{$genes{$largest}}+1;
+	my @nrgens=@{$genes{$largest}};
+	if ($#{$clgo{$cl}}>0) {
+		foreach $go (@{$clgo{$cl}}) {
+			if ($maxgenes<($#{$genes{$go}}+1)) {
+				$maxgenes=($#{$genes{$go}}+1);
+				$largest=$go;
+			}
+			elsif ($maxgenes==($#{$genes{$go}}+1)) {
+				if ($maxlevel<$level{$go}){
+					$maxlevel=$level{$go};
+					$largest=$go;
+				}
+			}
+			foreach $gene (@{$genes{$go}}){
+				push @nrgens, $gene unless(" @nrgens "=~/ $gene /);
+			}
+		}			
+	}
+	my $goos=join(";",@{$clgo{$cl}});
+	push @nrgos, $goos;
+	$nrdesc{$goos}=$desc{$largest};
+	$nrlev{$goos}=$maxlevel;
+	@{$nrgenes{$goos}}=@nrgens;	
+	$gcount+=$#nrgens+1;
+}
+
+print $#nrgos+1," non-redundant GO categories of good size\n-------------\n"; 
+$outname=$div."_".$measure;
+open OUT, ">$outname" or die "gomwu_b: cannot create output $outname\n";
+print {OUT} "name\tterm\tlev\tseq\tvalue\n";
+
+foreach $go (@nrgos) {
+	foreach $gene (@{$nrgenes{$go}}){
+		print {OUT} "$nrdesc{$go}\t$go\t$nrlev{$go}\t$gene\t$value{$gene}\n";
+	}
+}
+close OUT;
+my %level={};
+my %desc={};
+my %value={};
+my $des;
+my $go; 
+my $l;
+my $gn;
+my $val;
+my @gos=();
+my %genes={};
+my @gcount=();
+my %nrlev={};
+my @nrgos=();
+my %nrgenes={};
+my %nrdesc={};
+my $gcount=0;
+my %dnds;
+
+####################
+# building dissimilarity matrix
+
+my $inname4="dissim_".$div."_".$measure."_".$gen2go;
+my $inname3=$div."_".$measure;
+
+#if($dones!~/ $inname4 /) { 
+
+	use List::Util qw[min max];
+	open TAB, $inname3 or die "go_cluster: cannot open input table $inname3\n";
+	<TAB>;
+
+	my $des;
+	my $go; 
+	my $l;
+	my $gn;
+	my $val;
+	my @gos=();
+	my %genes={};
+	my %gosi={};
+
+	print"\nSecondary clustering:\ncalculating similarities....\n";
+	while (<TAB>){
+		chomp;
+		($des,$go,$l,$gn,$val)=split(/\t/,$_);
+		push @{$genes{$go}},$gn;
+		unless ($gosi{$go}==1 ){
+			push @gos, $go;
+			$gosi{$go}=1;
+		}
+	}
+
+	my %dissim={};
+	for ($g1=0;$g1<$#gos;$g1++){
+		my $go=@gos[$g1];
+	#if ($go eq "unknown") { print "unknown as go\n";}
+		for ($g2=$g1+1;$g2<=$#gos;$g2++){
+			my $go2=@gos[$g2];		
+			if ($go2 eq "unknown") {
+	#print "$go against $go2\n";
+				$dissim{$go,$go2}=$dissim{$go2,$go}=1;
+				next;
+			}
+			my %seen={}; my $count=0;
+			foreach $g (@{$genes{$go}},@{$genes{$go2}}){
+				unless($seen{$g}==1 ){
+					$count++;
+					$seen{$g}=1;
+				}
+			}
+			my $shared=$#{$genes{$go}}+1+$#{$genes{$go2}}+1-$count;
+			my $ref=min($#{$genes{$go}}+1,$#{$genes{$go2}}+1);
+			$dissim{$go,$go2}=$dissim{$go2,$go}=sprintf("%.3f",1-$shared/$ref);
+		}
+	}
+
+	open OUT, ">$inname4" or die "gomwu_b: cannot create output $inname4\n";
+	print {OUT} join("\t",@gos),"\n";
+
+	foreach $go (@gos) {
+		$dissim{$go,$go}=0;
+		foreach $go2 (@gos){
+			print {OUT} "$dissim{$go,$go2}";
+			print {OUT} "\t" unless ($go2 eq $gos[$#gos]);
+		}
+		print {OUT} "\n";
+	}
+#}