annotate gomwu_a.pl @ 2:5acf9dfdfa27 draft default tip

planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
author cristian
date Wed, 09 Nov 2022 08:57:54 +0000
parents f7287f82602f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
1 #!/usr/bin/env perl
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
2
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
3 my $usage= "
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
4
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
5 gomwu_a.pl (v. Feb 2015):
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
6
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
7 This is the fist script in the GO database slimming and reformatting procedure,
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
8 called automatically by goStats.R
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
9
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
10 See README_GO_MWU.txt file for details.
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
11
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
12 Mikhail Matz, UT Austin; matz@utexas.edu
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
13
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
14 ";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
15
1
f7287f82602f "planemo upload commit 486235d6560c9e95bd42152ad19bf7c3941cdc1b"
cristian
parents: 0
diff changeset
16 use File::Basename;
f7287f82602f "planemo upload commit 486235d6560c9e95bd42152ad19bf7c3941cdc1b"
cristian
parents: 0
diff changeset
17
0
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
18 print "@ARGV";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
19
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
20 my $onto=$ARGV[0] or die $usage;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
21 my $gen2go=$ARGV[1] or die $usage;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
22 my $measure=$ARGV[2] or die $usage;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
23 my $div=$ARGV[3] or die $usage;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
24 my $altern="t";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
25 if ("@ARGV"=~/alternative=(\w)/) { $altern=$1; }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
26 my $toomany=0.1;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
27 if ("@ARGV"=~/largest=(0\.\d+)/) { $toomany=$1; }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
28 my $mingenes=5;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
29 if ("@ARGV"=~/smallest=(\d+)/) { $mingenes=$1; }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
30 my $cutHeight=0.25;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
31 if ("@ARGV"=~/cutHeight=(\S+)/) { $cutHeight=$1; }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
32 my $padj="BH";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
33 if ("@ARGV"=~/p.adjust=(\w+)/) { $padj=$1; }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
34
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
35
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
36 print "
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
37
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
38 Run parameters:
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
39
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
40 largest GO category as fraction of all genes (largest) : $toomany
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
41 smallest GO category as # of genes (smallest) : $mingenes
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
42 clustering threshold (clusterCutHeight) : $cutHeight
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
43
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
44 ";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
45
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
46 my $division;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
47 if ($div eq "BP") { $division="biological_process";}
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
48 elsif ($div eq "MF") { $division="molecular_function";}
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
49 elsif ($div eq "CC") { $division="cellular_component";}
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
50 else { die "unrecognized division: $div\n";}
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
51
1
f7287f82602f "planemo upload commit 486235d6560c9e95bd42152ad19bf7c3941cdc1b"
cristian
parents: 0
diff changeset
52 ($mname,$mdir,$mext) = fileparse($measure,'\..*');
f7287f82602f "planemo upload commit 486235d6560c9e95bd42152ad19bf7c3941cdc1b"
cristian
parents: 0
diff changeset
53 ($aname,$adir,$aext) = fileparse($gen2go,'\..*');
f7287f82602f "planemo upload commit 486235d6560c9e95bd42152ad19bf7c3941cdc1b"
cristian
parents: 0
diff changeset
54 print "$mname - $mdir - $mext\n";
f7287f82602f "planemo upload commit 486235d6560c9e95bd42152ad19bf7c3941cdc1b"
cristian
parents: 0
diff changeset
55 my $inname2=$mdir.$mname.".".$div.".tmp";
f7287f82602f "planemo upload commit 486235d6560c9e95bd42152ad19bf7c3941cdc1b"
cristian
parents: 0
diff changeset
56 my $inname3=$mdir.$mname."_".$div.".tsv";
f7287f82602f "planemo upload commit 486235d6560c9e95bd42152ad19bf7c3941cdc1b"
cristian
parents: 0
diff changeset
57 my $inname31=$mdir."dissim0_".$div."_".$aname.$aext;
f7287f82602f "planemo upload commit 486235d6560c9e95bd42152ad19bf7c3941cdc1b"
cristian
parents: 0
diff changeset
58 my $inname4=$mdir."dissim_".$div."_".$mname."_".$aname.$aext;
0
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
59
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
60 my @donealready=();
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
61
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
62 open GO, $onto or die "cannot open ontology $onto\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
63 open CORAL, $gen2go or die "cannot open genes2go table $gen2go\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
64 open DNDS, $measure or die "cannot open measure table $measure\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
65 my $outname=$inname2;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
66 open VOOL, ">$outname" or die "cannot create output file $outname\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
67
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
68 #################################################
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
69 # reading GO hierarchy, only the $divterm division
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
70 print "-----------------\nretrieving GO hierarchy, reformatting data...\n\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
71 my %parents;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
72 my %name;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
73 my $term;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
74 my %goodterms;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
75 #my $divterm;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
76 #my $division="molecular_function";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
77
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
78 $bads=0;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
79
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
80 while (<GO>){
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
81 if ($_=~/^id:\s(GO\S+)/){
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
82 $term=$1;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
83 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
84 elsif ($_=~/^namespace:\s(\S+)/){
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
85 if ($1 ne $division) {
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
86 $term="";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
87 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
88 else {
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
89 $goodterms{$term}="OK" unless (!$term);
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
90 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
91 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
92
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
93 elsif ($_=~/^is_a:\s(GO\S+)\s/) {
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
94 push @{$parents{$term}}, $1 unless (!$term);
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
95 push @allparents, $1;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
96 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
97 elsif ($_=~/^name:\s(.+)/) {
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
98 $name{$term}=$1;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
99 chomp($name{$term}) unless (!$term);
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
100 if ($name{$term} eq $division) { $divterm=$term;}
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
101 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
102 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
103 $goodterms{$term}="OK" unless (!$term);
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
104
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
105 # calculating hierarchy levels, putting all parents together
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
106
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
107 my %golevel;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
108 $golevel{$divterm}=0;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
109
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
110 foreach $term (keys %parents) {
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
111 $extra=$#{$parents{$term}}+1;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
112 for ($in=$#{$parents{$term}};$extra;$in+=$extra){
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
113 $golevel{$term}++;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
114 $tstart=$in-$extra+1;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
115 $extra=0;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
116 for ($tt=$tstart; $tt<=$in;$tt++){
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
117 $t=${$parents{$term}}[$tt];
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
118 foreach $t0 (@{$parents{$t}}) {
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
119 next if ("@{$parents{$term}}"=~/$t0/ || !$goodterms{$t0});
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
120 push @{$parents{$term}}, $t0;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
121 $extra++;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
122 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
123 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
124 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
125 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
126 ###################################################
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
127
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
128 # reading measures
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
129 my $l=0;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
130 my %dnds;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
131 my $seq;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
132
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
133 while (<DNDS>){
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
134 if (!$l){
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
135 $l++;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
136 next;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
137 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
138 chomp;
1
f7287f82602f "planemo upload commit 486235d6560c9e95bd42152ad19bf7c3941cdc1b"
cristian
parents: 0
diff changeset
139 ($seq,$ns)=split(/\t/, $_);
0
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
140 if ($seq=~/SEQ/) { $seq.="_s";}
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
141 $dnds{$seq}=$ns;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
142 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
143
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
144 #reformatting
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
145
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
146 $bads=0;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
147 $goods=0;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
148
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
149 my $seq;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
150 my $goline;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
151 my @terms;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
152 my @orphans;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
153 my @nolevel;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
154
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
155 print {VOOL} "id\tterm\tlev\tvalue\tseq\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
156
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
157 my $count;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
158 while (<CORAL>){
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
159 chomp;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
160 ($seq,$goline)=split(/\t/,$_);
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
161 if ($dnds{$seq}!~/\d/) {
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
162 push @orphans, $seq;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
163 next;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
164 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
165 if ($goline=~/unknown/){
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
166 print {VOOL} "\"$goline\"\t$goline\t5\t$dnds{$seq}\t$seq\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
167 next;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
168 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
169 @terms=split(/;/,$goline);
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
170 $nt0=$#terms+1;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
171 $count++;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
172 #print "-------------------\n$count | $seq terms: $nt0\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
173 my @collect;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
174 foreach $term (@terms) {
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
175 if (!$goodterms{$term}) {
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
176 $bads++;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
177 #print "$term is not $division$1\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
178 next;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
179 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
180 #print "$term\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
181 $goods++;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
182 if (!$golevel{$term}){
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
183 push @nolevel,$term;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
184 $golevel{$term}=-1;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
185 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
186 push @collect,($term,@{$parents{$term}});
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
187 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
188
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
189 $ncoll=$#collect+1;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
190 #print "collected terms: $ncoll\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
191
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
192 my @nrcollect;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
193 foreach $term (@collect) {
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
194 push @nrcollect, $term unless ("@nrcollect"=~/$term/);
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
195 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
196
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
197 $ncoll=$#nrcollect+1;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
198 #print " nr terms: $ncoll\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
199
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
200 foreach $term (@nrcollect) {
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
201 print {VOOL} "\"$name{$term}\"\t$term\t$golevel{$term}\t$dnds{$seq}\t$seq\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
202 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
203 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
204 #print "terms matching division: $goods\nterms not matching division: $bads\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
205 $nor=$#orphans+1;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
206 $nnl=$#nolevel+1;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
207 print "-------------\ngo_reformat:\nGenes with GO annotations, but not listed in measure table: $nor\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
208 print "\nTerms without defined level (old ontology?..): $nnl\n-------------\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
209 my %parents;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
210 my %name;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
211 my $term;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
212 my %goodterms;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
213 close VOOL;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
214 #########################
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
215 # selecting good sized categroies, collapsing redundant ones
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
216 #if($dones!~/ $inname3 /) {
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
217 open TAB, $inname2 or die "go_nrify: cannot open input table $inname2\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
218 <TAB>;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
219
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
220 my %level={};
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
221 my %desc={};
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
222 my %value={};
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
223 my $des;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
224 my $go;
1
f7287f82602f "planemo upload commit 486235d6560c9e95bd42152ad19bf7c3941cdc1b"
cristian
parents: 0
diff changeset
225 my $ll;
0
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
226 my $gn;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
227 my $val;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
228 my @gos=();
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
229 my %genes={};
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
230 my @gcount=();
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
231 my %gci={};
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
232 my %goi={};
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
233
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
234 while (<TAB>){
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
235 chomp;
1
f7287f82602f "planemo upload commit 486235d6560c9e95bd42152ad19bf7c3941cdc1b"
cristian
parents: 0
diff changeset
236 ($des,$go,$ll,$val,$gn)=split(/\t/,$_);
0
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
237 $value{$gn}=$val;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
238 $desc{$go}=$des;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
239 push @{$genes{$go}},$gn;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
240 push @gcount, $gn unless ($gci{$gn}==1) ;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
241 $gci{$gn}=1;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
242 # push @genes,$gn;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
243 unless ($goi{$go}==1){
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
244 $desc{$go}=$des;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
245 $level{$go}=$l;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
246 push @gos, $go;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
247 $goi{$go}=1;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
248 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
249 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
250
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
251 close TAB;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
252 #unlink $inname2;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
253
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
254 $gc=$#gcount+1;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
255 $goc=$#gos+1;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
256 print "-------------\ngo_nrify:\n$goc categories, $gc genes; size range $mingenes-",$gc*$toomany,"\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
257
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
258 # excluding too large and too small categories
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
259 my %gonego={};
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
260 my $toobroad=0;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
261 my $toosmall=0;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
262 for ($g1=0;$g1<=$#gos;$g1++){
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
263 $go=@gos[$g1];
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
264 my $golen=$#{$genes{$go}}+1;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
265 if ($golen > $gc*$toomany) {
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
266 unless ($go eq "unknown") {
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
267 $gonego{$go}=1;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
268 $toobroad++;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
269 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
270 #warn "$go: too broad ($golen genes)\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
271 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
272 elsif ($golen < $mingenes) {
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
273 $gonego{$go}=1 ;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
274 $toosmall++;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
275 #warn "\t\t$go: too narrow ($golen genes) @{$genes{$go}}\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
276 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
277 }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
278 print "\t$toobroad too broad\n\t$toosmall too small\n\t", $goc-$toobroad-$toosmall," remaining
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
279
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
280 removing redundancy:
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
281
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
282 calculating GO term similarities based on shared genes...\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
283
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
284 my @goodgo=();
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
285 foreach $go (@gos) { push @goodgo, $go unless ($gonego{$go}==1); }
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
286 @gos=@goodgo;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
287
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
288 ################################
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
289
2
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
290 #warn "comparing categories...\n";
0
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
291 #my $clfile="cl_".$inname31;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
292 #if($dones!~/ $clfile /) {
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
293
2
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
294 use List::Util qw[min max];
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
295 for ($g1=0;$g1<=$#gos;$g1++){
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
296 my $go=@gos[$g1];
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
297 next if ($gonego{$go}==1);
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
298 #warn "----------------\n$go $desc{$go} level $level{$go}\n";
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
299 my $goos=$go;
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
300 my $lev=$level{$go};
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
301 my $dsc=$desc{$go};
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
302 for ($g2=$g1+1;$g2<=$#gos;$g2++){
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
303 my $go2=@gos[$g2];
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
304 next if ($gonego{$go2}==1);
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
305 next if ($ggi{$go2}==1);
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
306 my %seen={};
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
307 my $count=0;
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
308 my @combo=();
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
309 if ($lump<=1) {
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
310 foreach $g (@{$genes{$go}},@{$genes{$go2}}){
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
311 unless($seen{$g}==1 ){
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
312 $count++;
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
313 $seen{$g}=1;
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
314 push @combo, $g;
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
315 }
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
316 }
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
317 my $shared=$#{$genes{$go}}+1+$#{$genes{$go2}}+1-$count;
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
318 $overlap{$go,$go2}=min($shared/($#{$genes{$go}}+1),$shared/($#{$genes{$go2}}+1));
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
319 $overlap{$go2,$go}=min($shared/($#{$genes{$go}}+1),$shared/($#{$genes{$go2}}+1));
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
320 }
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
321 }
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
322 }
0
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
323
2
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
324 open OUT, ">$inname31" or die "gomwu_a: cannot create output $inname31\n";
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
325
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
326 print {OUT} join("\t",@gos),"\n";
0
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
327
2
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
328 foreach $go (@gos) {
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
329 $overlap{$go,$go}=1;
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
330 foreach $go2 (@gos){
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
331 print {OUT} sprintf("%.3f",1-$overlap{$go,$go2});;
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
332 print {OUT} "\t" unless ($go2 eq $gos[$#gos]);
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
333 }
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
334 print {OUT} "\n";
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
335 }
5acf9dfdfa27 planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
cristian
parents: 1
diff changeset
336 close OUT;
0
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
337 #}
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
338
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
339 #print "calling clusteringGOs.R script ....\n";
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
340 # my $err=`Rscript clusteringGOs.R $inname31 $cutHeight `;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
341 # print $err;
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
342
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
343
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
344
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
345
91261b42c07e "planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
cristian
parents:
diff changeset
346