view gomwu_a.pl @ 2:5acf9dfdfa27 draft default tip

planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
author cristian
date Wed, 09 Nov 2022 08:57:54 +0000
parents f7287f82602f
children
line wrap: on
line source

#!/usr/bin/env perl

my $usage= "

gomwu_a.pl  (v. Feb 2015):

This is the fist script in the GO database slimming and reformatting procedure,
called automatically by goStats.R 

See README_GO_MWU.txt file for details.

Mikhail Matz, UT Austin; matz@utexas.edu

";

use File::Basename;

print "@ARGV";

my $onto=$ARGV[0] or die $usage;
my $gen2go=$ARGV[1] or die $usage;
my $measure=$ARGV[2] or die $usage;
my $div=$ARGV[3] or die $usage;
my $altern="t";
if ("@ARGV"=~/alternative=(\w)/) { $altern=$1; }
my $toomany=0.1;
if ("@ARGV"=~/largest=(0\.\d+)/) { $toomany=$1; }
my $mingenes=5;
if ("@ARGV"=~/smallest=(\d+)/) { $mingenes=$1; }
my $cutHeight=0.25;
if ("@ARGV"=~/cutHeight=(\S+)/) { $cutHeight=$1; }
my $padj="BH";
if ("@ARGV"=~/p.adjust=(\w+)/) { $padj=$1; }


print "

Run parameters:

largest GO category as fraction of all genes (largest)  : $toomany
         smallest GO category as # of genes (smallest)  : $mingenes
                clustering threshold (clusterCutHeight) : $cutHeight

";

my $division;
if ($div eq "BP") { $division="biological_process";}
elsif ($div eq "MF") { $division="molecular_function";}
elsif ($div eq "CC") { $division="cellular_component";}
else { die "unrecognized division: $div\n";}

($mname,$mdir,$mext) = fileparse($measure,'\..*');
($aname,$adir,$aext) = fileparse($gen2go,'\..*');
print "$mname - $mdir - $mext\n";
my $inname2=$mdir.$mname.".".$div.".tmp";
my $inname3=$mdir.$mname."_".$div.".tsv";
my $inname31=$mdir."dissim0_".$div."_".$aname.$aext;
my $inname4=$mdir."dissim_".$div."_".$mname."_".$aname.$aext;

my @donealready=();

open GO, $onto or die "cannot open ontology $onto\n";
open CORAL, $gen2go or die "cannot open genes2go table $gen2go\n";
open DNDS, $measure or die "cannot open measure table $measure\n";
my $outname=$inname2;
open VOOL, ">$outname" or die "cannot create output file $outname\n";

#################################################
# reading GO hierarchy, only the $divterm division
print "-----------------\nretrieving GO hierarchy, reformatting data...\n\n";
my %parents;
my %name;
my $term;
my %goodterms;
#my $divterm;
#my $division="molecular_function";

$bads=0;

while (<GO>){
	if ($_=~/^id:\s(GO\S+)/){
		$term=$1;
	}
	elsif ($_=~/^namespace:\s(\S+)/){
			if ($1 ne $division) {
				$term="";
			}
			else {
				$goodterms{$term}="OK" unless (!$term);
			}
	}

	elsif ($_=~/^is_a:\s(GO\S+)\s/) {
		push @{$parents{$term}}, $1 unless (!$term);
		push @allparents, $1;
	}
	elsif ($_=~/^name:\s(.+)/) {
		$name{$term}=$1;
		chomp($name{$term}) unless (!$term);
		if ($name{$term} eq $division) { $divterm=$term;}
	}
}
$goodterms{$term}="OK" unless (!$term);

# calculating hierarchy levels, putting all parents together

my %golevel;
$golevel{$divterm}=0;

foreach $term (keys %parents) {
	$extra=$#{$parents{$term}}+1;
	for ($in=$#{$parents{$term}};$extra;$in+=$extra){
		$golevel{$term}++;
		$tstart=$in-$extra+1;
		$extra=0;
		for ($tt=$tstart; $tt<=$in;$tt++){ 
			$t=${$parents{$term}}[$tt];
			foreach $t0 (@{$parents{$t}}) {
				next if ("@{$parents{$term}}"=~/$t0/ || !$goodterms{$t0});
				push @{$parents{$term}}, $t0;
				$extra++;
			}
		}	
	}
}
###################################################

# reading measures
my $l=0;
my %dnds;
my $seq;

while (<DNDS>){
	if (!$l){
		$l++;
		next;
	}
	chomp;
	($seq,$ns)=split(/\t/, $_);
	if ($seq=~/SEQ/) { $seq.="_s";}
	$dnds{$seq}=$ns;
}

#reformatting

$bads=0;
$goods=0;

my $seq;
my $goline;
my @terms;
my @orphans;
my @nolevel;

print {VOOL} "id\tterm\tlev\tvalue\tseq\n";

my $count;
while (<CORAL>){
	chomp;
	($seq,$goline)=split(/\t/,$_);
	if ($dnds{$seq}!~/\d/) {
		push @orphans, $seq;
		next;
	}
	if ($goline=~/unknown/){
		print {VOOL} "\"$goline\"\t$goline\t5\t$dnds{$seq}\t$seq\n";
		next;
	}
	@terms=split(/;/,$goline);
	$nt0=$#terms+1;
	$count++;
#print "-------------------\n$count | $seq terms: $nt0\n";
	my @collect;
	foreach $term (@terms) {
		if (!$goodterms{$term}) {
				$bads++;
#print "$term is not $division$1\n";
				next;
		}
#print "$term\n";
		$goods++;
		if (!$golevel{$term}){
			push @nolevel,$term;
			$golevel{$term}=-1;
		}
		push @collect,($term,@{$parents{$term}});
	}
	
$ncoll=$#collect+1;
#print "collected terms: $ncoll\n";

	my @nrcollect;
	foreach $term (@collect) { 
		push @nrcollect, $term unless ("@nrcollect"=~/$term/);
	}

$ncoll=$#nrcollect+1;
#print "       nr terms: $ncoll\n";
	
	foreach $term (@nrcollect) { 
		print {VOOL} "\"$name{$term}\"\t$term\t$golevel{$term}\t$dnds{$seq}\t$seq\n";
	}
}
#print "terms matching division: $goods\nterms not matching division: $bads\n";
$nor=$#orphans+1;
$nnl=$#nolevel+1;
print "-------------\ngo_reformat:\nGenes with GO annotations, but not listed in measure table: $nor\n";
print "\nTerms without defined level (old ontology?..): $nnl\n-------------\n";
my %parents;
my %name;
my $term;
my %goodterms;
close VOOL;
#########################
# selecting good sized categroies, collapsing redundant ones
#if($dones!~/ $inname3 /) { 
open TAB, $inname2 or die "go_nrify: cannot open input table $inname2\n";
<TAB>;

my %level={};
my %desc={};
my %value={};
my $des;
my $go; 
my $ll;
my $gn;
my $val;
my @gos=();
my %genes={};
my @gcount=();
my %gci={};
my %goi={};

while (<TAB>){
	chomp;
	($des,$go,$ll,$val,$gn)=split(/\t/,$_);
	$value{$gn}=$val;
	$desc{$go}=$des;
	push @{$genes{$go}},$gn;
	push @gcount, $gn unless ($gci{$gn}==1) ;
	$gci{$gn}=1;
#	push @genes,$gn;
	unless ($goi{$go}==1){
		$desc{$go}=$des;
		$level{$go}=$l;
		push @gos, $go;
		$goi{$go}=1;
	}
}

close TAB;
#unlink $inname2;

$gc=$#gcount+1;
$goc=$#gos+1;
print "-------------\ngo_nrify:\n$goc categories, $gc genes; size range $mingenes-",$gc*$toomany,"\n";

# excluding too large and  too small categories
my %gonego={};
my $toobroad=0;
my $toosmall=0;
for ($g1=0;$g1<=$#gos;$g1++){
	$go=@gos[$g1];
	my $golen=$#{$genes{$go}}+1;
	if ($golen > $gc*$toomany) { 
		unless ($go eq "unknown") {
			$gonego{$go}=1;
			$toobroad++;
		}
#warn "$go: too broad ($golen genes)\n";
	}
	elsif ($golen < $mingenes) { 
		$gonego{$go}=1 ;
		$toosmall++;
#warn "\t\t$go: too narrow ($golen genes) @{$genes{$go}}\n";
	}
}
print "\t$toobroad too broad\n\t$toosmall too small\n\t", $goc-$toobroad-$toosmall," remaining

removing redundancy:

calculating GO term similarities based on shared genes...\n";

my @goodgo=();
foreach $go (@gos) { push @goodgo, $go unless ($gonego{$go}==1); }
@gos=@goodgo;

################################

#warn "comparing categories...\n"; 
#my $clfile="cl_".$inname31;
#if($dones!~/ $clfile /) { 

use List::Util qw[min max];
for ($g1=0;$g1<=$#gos;$g1++){
    my $go=@gos[$g1];
    next if ($gonego{$go}==1);
    #warn "----------------\n$go  $desc{$go}  level $level{$go}\n";
    my $goos=$go;
    my $lev=$level{$go};
    my $dsc=$desc{$go};
    for ($g2=$g1+1;$g2<=$#gos;$g2++){
        my $go2=@gos[$g2];	
        next if ($gonego{$go2}==1);
        next if ($ggi{$go2}==1);
        my %seen={}; 
        my $count=0;
        my @combo=();
        if ($lump<=1) {
            foreach $g (@{$genes{$go}},@{$genes{$go2}}){
                unless($seen{$g}==1 ){
                    $count++;
                    $seen{$g}=1;
                    push @combo, $g;
                }
            }
            my $shared=$#{$genes{$go}}+1+$#{$genes{$go2}}+1-$count;
            $overlap{$go,$go2}=min($shared/($#{$genes{$go}}+1),$shared/($#{$genes{$go2}}+1));			
            $overlap{$go2,$go}=min($shared/($#{$genes{$go}}+1),$shared/($#{$genes{$go2}}+1));			
        }
    }
}

open OUT, ">$inname31" or die "gomwu_a: cannot create output $inname31\n";

print {OUT} join("\t",@gos),"\n";

foreach $go (@gos) {
    $overlap{$go,$go}=1;
    foreach $go2 (@gos){
        print {OUT} sprintf("%.3f",1-$overlap{$go,$go2});;
        print {OUT} "\t" unless ($go2 eq $gos[$#gos]);
    }
    print {OUT} "\n";
}
close OUT;
#}

#print "calling clusteringGOs.R script ....\n";
#	my $err=`Rscript clusteringGOs.R $inname31 $cutHeight `;
#	print $err;