diff gomwu_a.pl @ 0:91261b42c07e draft

"planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
author cristian
date Thu, 14 Apr 2022 13:28:05 +0000
parents
children f7287f82602f
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gomwu_a.pl	Thu Apr 14 13:28:05 2022 +0000
@@ -0,0 +1,341 @@
+#!/usr/bin/env perl
+
+my $usage= "
+
+gomwu_a.pl  (v. Feb 2015):
+
+This is the fist script in the GO database slimming and reformatting procedure,
+called automatically by goStats.R 
+
+See README_GO_MWU.txt file for details.
+
+Mikhail Matz, UT Austin; matz@utexas.edu
+
+";
+
+print "@ARGV";
+
+my $onto=$ARGV[0] or die $usage;
+my $gen2go=$ARGV[1] or die $usage;
+my $measure=$ARGV[2] or die $usage;
+my $div=$ARGV[3] or die $usage;
+my $altern="t";
+if ("@ARGV"=~/alternative=(\w)/) { $altern=$1; }
+my $toomany=0.1;
+if ("@ARGV"=~/largest=(0\.\d+)/) { $toomany=$1; }
+my $mingenes=5;
+if ("@ARGV"=~/smallest=(\d+)/) { $mingenes=$1; }
+my $cutHeight=0.25;
+if ("@ARGV"=~/cutHeight=(\S+)/) { $cutHeight=$1; }
+my $padj="BH";
+if ("@ARGV"=~/p.adjust=(\w+)/) { $padj=$1; }
+
+
+print "
+
+Run parameters:
+
+largest GO category as fraction of all genes (largest)  : $toomany
+         smallest GO category as # of genes (smallest)  : $mingenes
+                clustering threshold (clusterCutHeight) : $cutHeight
+
+";
+
+my $division;
+if ($div eq "BP") { $division="biological_process";}
+elsif ($div eq "MF") { $division="molecular_function";}
+elsif ($div eq "CC") { $division="cellular_component";}
+else { die "unrecognized division: $div\n";}
+
+my $inname2=$measure.".".$div.".tmp";
+my $inname3=$div."_".$measure;
+my $inname31="dissim0_".$div."_".$gen2go;
+my $inname4="dissim_".$div."_".$measure."_".$gen2go;
+
+my @donealready=();
+
+open GO, $onto or die "cannot open ontology $onto\n";
+open CORAL, $gen2go or die "cannot open genes2go table $gen2go\n";
+open DNDS, $measure or die "cannot open measure table $measure\n";
+my $outname=$inname2;
+open VOOL, ">$outname" or die "cannot create output file $outname\n";
+
+#################################################
+# reading GO hierarchy, only the $divterm division
+print "-----------------\nretrieving GO hierarchy, reformatting data...\n\n";
+my %parents;
+my %name;
+my $term;
+my %goodterms;
+#my $divterm;
+#my $division="molecular_function";
+
+$bads=0;
+
+while (<GO>){
+	if ($_=~/^id:\s(GO\S+)/){
+		$term=$1;
+	}
+	elsif ($_=~/^namespace:\s(\S+)/){
+			if ($1 ne $division) {
+				$term="";
+			}
+			else {
+				$goodterms{$term}="OK" unless (!$term);
+			}
+	}
+
+	elsif ($_=~/^is_a:\s(GO\S+)\s/) {
+		push @{$parents{$term}}, $1 unless (!$term);
+		push @allparents, $1;
+	}
+	elsif ($_=~/^name:\s(.+)/) {
+		$name{$term}=$1;
+		chomp($name{$term}) unless (!$term);
+		if ($name{$term} eq $division) { $divterm=$term;}
+	}
+}
+$goodterms{$term}="OK" unless (!$term);
+
+# calculating hierarchy levels, putting all parents together
+
+my %golevel;
+$golevel{$divterm}=0;
+
+foreach $term (keys %parents) {
+	$extra=$#{$parents{$term}}+1;
+	for ($in=$#{$parents{$term}};$extra;$in+=$extra){
+		$golevel{$term}++;
+		$tstart=$in-$extra+1;
+		$extra=0;
+		for ($tt=$tstart; $tt<=$in;$tt++){ 
+			$t=${$parents{$term}}[$tt];
+			foreach $t0 (@{$parents{$t}}) {
+				next if ("@{$parents{$term}}"=~/$t0/ || !$goodterms{$t0});
+				push @{$parents{$term}}, $t0;
+				$extra++;
+			}
+		}	
+	}
+}
+###################################################
+
+# reading measures
+my $l=0;
+my %dnds;
+my $seq;
+
+while (<DNDS>){
+	if (!$l){
+		$l++;
+		next;
+	}
+	chomp;
+	($seq,$ns)=split(/,/, $_);
+	if ($seq=~/SEQ/) { $seq.="_s";}
+	$dnds{$seq}=$ns;
+}
+
+#reformatting
+
+$bads=0;
+$goods=0;
+
+my $seq;
+my $goline;
+my @terms;
+my @orphans;
+my @nolevel;
+
+print {VOOL} "id\tterm\tlev\tvalue\tseq\n";
+
+my $count;
+while (<CORAL>){
+	chomp;
+	($seq,$goline)=split(/\t/,$_);
+	if ($dnds{$seq}!~/\d/) {
+		push @orphans, $seq;
+		next;
+	}
+	if ($goline=~/unknown/){
+		print {VOOL} "\"$goline\"\t$goline\t5\t$dnds{$seq}\t$seq\n";
+		next;
+	}
+	@terms=split(/;/,$goline);
+	$nt0=$#terms+1;
+	$count++;
+#print "-------------------\n$count | $seq terms: $nt0\n";
+	my @collect;
+	foreach $term (@terms) {
+		if (!$goodterms{$term}) {
+				$bads++;
+#print "$term is not $division$1\n";
+				next;
+		}
+#print "$term\n";
+		$goods++;
+		if (!$golevel{$term}){
+			push @nolevel,$term;
+			$golevel{$term}=-1;
+		}
+		push @collect,($term,@{$parents{$term}});
+	}
+	
+$ncoll=$#collect+1;
+#print "collected terms: $ncoll\n";
+
+	my @nrcollect;
+	foreach $term (@collect) { 
+		push @nrcollect, $term unless ("@nrcollect"=~/$term/);
+	}
+
+$ncoll=$#nrcollect+1;
+#print "       nr terms: $ncoll\n";
+	
+	foreach $term (@nrcollect) { 
+		print {VOOL} "\"$name{$term}\"\t$term\t$golevel{$term}\t$dnds{$seq}\t$seq\n";
+	}
+}
+#print "terms matching division: $goods\nterms not matching division: $bads\n";
+$nor=$#orphans+1;
+$nnl=$#nolevel+1;
+print "-------------\ngo_reformat:\nGenes with GO annotations, but not listed in measure table: $nor\n";
+print "\nTerms without defined level (old ontology?..): $nnl\n-------------\n";
+my %parents;
+my %name;
+my $term;
+my %goodterms;
+close VOOL;
+#########################
+# selecting good sized categroies, collapsing redundant ones
+#if($dones!~/ $inname3 /) { 
+open TAB, $inname2 or die "go_nrify: cannot open input table $inname2\n";
+<TAB>;
+
+my %level={};
+my %desc={};
+my %value={};
+my $des;
+my $go; 
+my $l;
+my $gn;
+my $val;
+my @gos=();
+my %genes={};
+my @gcount=();
+my %gci={};
+my %goi={};
+
+while (<TAB>){
+	chomp;
+	($des,$go,$l,$val,$gn)=split(/\t/,$_);
+	$value{$gn}=$val;
+	$desc{$go}=$des;
+	push @{$genes{$go}},$gn;
+	push @gcount, $gn unless ($gci{$gn}==1) ;
+	$gci{$gn}=1;
+#	push @genes,$gn;
+	unless ($goi{$go}==1){
+		$desc{$go}=$des;
+		$level{$go}=$l;
+		push @gos, $go;
+		$goi{$go}=1;
+	}
+}
+
+close TAB;
+#unlink $inname2;
+
+$gc=$#gcount+1;
+$goc=$#gos+1;
+print "-------------\ngo_nrify:\n$goc categories, $gc genes; size range $mingenes-",$gc*$toomany,"\n";
+
+# excluding too large and  too small categories
+my %gonego={};
+my $toobroad=0;
+my $toosmall=0;
+for ($g1=0;$g1<=$#gos;$g1++){
+	$go=@gos[$g1];
+	my $golen=$#{$genes{$go}}+1;
+	if ($golen > $gc*$toomany) { 
+		unless ($go eq "unknown") {
+			$gonego{$go}=1;
+			$toobroad++;
+		}
+#warn "$go: too broad ($golen genes)\n";
+	}
+	elsif ($golen < $mingenes) { 
+		$gonego{$go}=1 ;
+		$toosmall++;
+#warn "\t\t$go: too narrow ($golen genes) @{$genes{$go}}\n";
+	}
+}
+print "\t$toobroad too broad\n\t$toosmall too small\n\t", $goc-$toobroad-$toosmall," remaining
+
+removing redundancy:
+
+calculating GO term similarities based on shared genes...\n";
+
+my @goodgo=();
+foreach $go (@gos) { push @goodgo, $go unless ($gonego{$go}==1); }
+@gos=@goodgo;
+
+################################
+
+	#warn "comparing categories...\n"; 
+#my $clfile="cl_".$inname31;
+#if($dones!~/ $clfile /) { 
+
+	use List::Util qw[min max];
+	for ($g1=0;$g1<=$#gos;$g1++){
+		my $go=@gos[$g1];
+		next if ($gonego{$go}==1);
+	#warn "----------------\n$go  $desc{$go}  level $level{$go}\n";
+		my $goos=$go;
+		my $lev=$level{$go};
+		my $dsc=$desc{$go};
+		for ($g2=$g1+1;$g2<=$#gos;$g2++){
+			my $go2=@gos[$g2];	
+			next if ($gonego{$go2}==1);
+			next if ($ggi{$go2}==1);
+			my %seen={}; 
+			my $count=0;
+			my @combo=();
+			if ($lump<=1) {
+				foreach $g (@{$genes{$go}},@{$genes{$go2}}){
+					unless($seen{$g}==1 ){
+						$count++;
+						$seen{$g}=1;
+						push @combo, $g;
+					}
+				}
+				my $shared=$#{$genes{$go}}+1+$#{$genes{$go2}}+1-$count;
+				$overlap{$go,$go2}=min($shared/($#{$genes{$go}}+1),$shared/($#{$genes{$go2}}+1));			
+				$overlap{$go2,$go}=min($shared/($#{$genes{$go}}+1),$shared/($#{$genes{$go2}}+1));			
+			}
+		}
+	}
+
+	open OUT, ">$inname31" or die "gomwu_b: cannot create output $inname31\n";
+	
+	print {OUT} join("\t",@gos),"\n";
+
+	foreach $go (@gos) {
+		$overlap{$go,$go}=1;
+		foreach $go2 (@gos){
+			print {OUT} sprintf("%.3f",1-$overlap{$go,$go2});;
+			print {OUT} "\t" unless ($go2 eq $gos[$#gos]);
+		}
+		print {OUT} "\n";
+	}
+	close OUT;
+#}
+
+#print "calling clusteringGOs.R script ....\n";
+#	my $err=`Rscript clusteringGOs.R $inname31 $cutHeight `;
+#	print $err;
+
+
+
+
+