comparison gomwu_a.pl @ 0:91261b42c07e draft

"planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
author cristian
date Thu, 14 Apr 2022 13:28:05 +0000
parents
children f7287f82602f
comparison
equal deleted inserted replaced
-1:000000000000 0:91261b42c07e
1 #!/usr/bin/env perl
2
3 my $usage= "
4
5 gomwu_a.pl (v. Feb 2015):
6
7 This is the fist script in the GO database slimming and reformatting procedure,
8 called automatically by goStats.R
9
10 See README_GO_MWU.txt file for details.
11
12 Mikhail Matz, UT Austin; matz@utexas.edu
13
14 ";
15
16 print "@ARGV";
17
18 my $onto=$ARGV[0] or die $usage;
19 my $gen2go=$ARGV[1] or die $usage;
20 my $measure=$ARGV[2] or die $usage;
21 my $div=$ARGV[3] or die $usage;
22 my $altern="t";
23 if ("@ARGV"=~/alternative=(\w)/) { $altern=$1; }
24 my $toomany=0.1;
25 if ("@ARGV"=~/largest=(0\.\d+)/) { $toomany=$1; }
26 my $mingenes=5;
27 if ("@ARGV"=~/smallest=(\d+)/) { $mingenes=$1; }
28 my $cutHeight=0.25;
29 if ("@ARGV"=~/cutHeight=(\S+)/) { $cutHeight=$1; }
30 my $padj="BH";
31 if ("@ARGV"=~/p.adjust=(\w+)/) { $padj=$1; }
32
33
34 print "
35
36 Run parameters:
37
38 largest GO category as fraction of all genes (largest) : $toomany
39 smallest GO category as # of genes (smallest) : $mingenes
40 clustering threshold (clusterCutHeight) : $cutHeight
41
42 ";
43
44 my $division;
45 if ($div eq "BP") { $division="biological_process";}
46 elsif ($div eq "MF") { $division="molecular_function";}
47 elsif ($div eq "CC") { $division="cellular_component";}
48 else { die "unrecognized division: $div\n";}
49
50 my $inname2=$measure.".".$div.".tmp";
51 my $inname3=$div."_".$measure;
52 my $inname31="dissim0_".$div."_".$gen2go;
53 my $inname4="dissim_".$div."_".$measure."_".$gen2go;
54
55 my @donealready=();
56
57 open GO, $onto or die "cannot open ontology $onto\n";
58 open CORAL, $gen2go or die "cannot open genes2go table $gen2go\n";
59 open DNDS, $measure or die "cannot open measure table $measure\n";
60 my $outname=$inname2;
61 open VOOL, ">$outname" or die "cannot create output file $outname\n";
62
63 #################################################
64 # reading GO hierarchy, only the $divterm division
65 print "-----------------\nretrieving GO hierarchy, reformatting data...\n\n";
66 my %parents;
67 my %name;
68 my $term;
69 my %goodterms;
70 #my $divterm;
71 #my $division="molecular_function";
72
73 $bads=0;
74
75 while (<GO>){
76 if ($_=~/^id:\s(GO\S+)/){
77 $term=$1;
78 }
79 elsif ($_=~/^namespace:\s(\S+)/){
80 if ($1 ne $division) {
81 $term="";
82 }
83 else {
84 $goodterms{$term}="OK" unless (!$term);
85 }
86 }
87
88 elsif ($_=~/^is_a:\s(GO\S+)\s/) {
89 push @{$parents{$term}}, $1 unless (!$term);
90 push @allparents, $1;
91 }
92 elsif ($_=~/^name:\s(.+)/) {
93 $name{$term}=$1;
94 chomp($name{$term}) unless (!$term);
95 if ($name{$term} eq $division) { $divterm=$term;}
96 }
97 }
98 $goodterms{$term}="OK" unless (!$term);
99
100 # calculating hierarchy levels, putting all parents together
101
102 my %golevel;
103 $golevel{$divterm}=0;
104
105 foreach $term (keys %parents) {
106 $extra=$#{$parents{$term}}+1;
107 for ($in=$#{$parents{$term}};$extra;$in+=$extra){
108 $golevel{$term}++;
109 $tstart=$in-$extra+1;
110 $extra=0;
111 for ($tt=$tstart; $tt<=$in;$tt++){
112 $t=${$parents{$term}}[$tt];
113 foreach $t0 (@{$parents{$t}}) {
114 next if ("@{$parents{$term}}"=~/$t0/ || !$goodterms{$t0});
115 push @{$parents{$term}}, $t0;
116 $extra++;
117 }
118 }
119 }
120 }
121 ###################################################
122
123 # reading measures
124 my $l=0;
125 my %dnds;
126 my $seq;
127
128 while (<DNDS>){
129 if (!$l){
130 $l++;
131 next;
132 }
133 chomp;
134 ($seq,$ns)=split(/,/, $_);
135 if ($seq=~/SEQ/) { $seq.="_s";}
136 $dnds{$seq}=$ns;
137 }
138
139 #reformatting
140
141 $bads=0;
142 $goods=0;
143
144 my $seq;
145 my $goline;
146 my @terms;
147 my @orphans;
148 my @nolevel;
149
150 print {VOOL} "id\tterm\tlev\tvalue\tseq\n";
151
152 my $count;
153 while (<CORAL>){
154 chomp;
155 ($seq,$goline)=split(/\t/,$_);
156 if ($dnds{$seq}!~/\d/) {
157 push @orphans, $seq;
158 next;
159 }
160 if ($goline=~/unknown/){
161 print {VOOL} "\"$goline\"\t$goline\t5\t$dnds{$seq}\t$seq\n";
162 next;
163 }
164 @terms=split(/;/,$goline);
165 $nt0=$#terms+1;
166 $count++;
167 #print "-------------------\n$count | $seq terms: $nt0\n";
168 my @collect;
169 foreach $term (@terms) {
170 if (!$goodterms{$term}) {
171 $bads++;
172 #print "$term is not $division$1\n";
173 next;
174 }
175 #print "$term\n";
176 $goods++;
177 if (!$golevel{$term}){
178 push @nolevel,$term;
179 $golevel{$term}=-1;
180 }
181 push @collect,($term,@{$parents{$term}});
182 }
183
184 $ncoll=$#collect+1;
185 #print "collected terms: $ncoll\n";
186
187 my @nrcollect;
188 foreach $term (@collect) {
189 push @nrcollect, $term unless ("@nrcollect"=~/$term/);
190 }
191
192 $ncoll=$#nrcollect+1;
193 #print " nr terms: $ncoll\n";
194
195 foreach $term (@nrcollect) {
196 print {VOOL} "\"$name{$term}\"\t$term\t$golevel{$term}\t$dnds{$seq}\t$seq\n";
197 }
198 }
199 #print "terms matching division: $goods\nterms not matching division: $bads\n";
200 $nor=$#orphans+1;
201 $nnl=$#nolevel+1;
202 print "-------------\ngo_reformat:\nGenes with GO annotations, but not listed in measure table: $nor\n";
203 print "\nTerms without defined level (old ontology?..): $nnl\n-------------\n";
204 my %parents;
205 my %name;
206 my $term;
207 my %goodterms;
208 close VOOL;
209 #########################
210 # selecting good sized categroies, collapsing redundant ones
211 #if($dones!~/ $inname3 /) {
212 open TAB, $inname2 or die "go_nrify: cannot open input table $inname2\n";
213 <TAB>;
214
215 my %level={};
216 my %desc={};
217 my %value={};
218 my $des;
219 my $go;
220 my $l;
221 my $gn;
222 my $val;
223 my @gos=();
224 my %genes={};
225 my @gcount=();
226 my %gci={};
227 my %goi={};
228
229 while (<TAB>){
230 chomp;
231 ($des,$go,$l,$val,$gn)=split(/\t/,$_);
232 $value{$gn}=$val;
233 $desc{$go}=$des;
234 push @{$genes{$go}},$gn;
235 push @gcount, $gn unless ($gci{$gn}==1) ;
236 $gci{$gn}=1;
237 # push @genes,$gn;
238 unless ($goi{$go}==1){
239 $desc{$go}=$des;
240 $level{$go}=$l;
241 push @gos, $go;
242 $goi{$go}=1;
243 }
244 }
245
246 close TAB;
247 #unlink $inname2;
248
249 $gc=$#gcount+1;
250 $goc=$#gos+1;
251 print "-------------\ngo_nrify:\n$goc categories, $gc genes; size range $mingenes-",$gc*$toomany,"\n";
252
253 # excluding too large and too small categories
254 my %gonego={};
255 my $toobroad=0;
256 my $toosmall=0;
257 for ($g1=0;$g1<=$#gos;$g1++){
258 $go=@gos[$g1];
259 my $golen=$#{$genes{$go}}+1;
260 if ($golen > $gc*$toomany) {
261 unless ($go eq "unknown") {
262 $gonego{$go}=1;
263 $toobroad++;
264 }
265 #warn "$go: too broad ($golen genes)\n";
266 }
267 elsif ($golen < $mingenes) {
268 $gonego{$go}=1 ;
269 $toosmall++;
270 #warn "\t\t$go: too narrow ($golen genes) @{$genes{$go}}\n";
271 }
272 }
273 print "\t$toobroad too broad\n\t$toosmall too small\n\t", $goc-$toobroad-$toosmall," remaining
274
275 removing redundancy:
276
277 calculating GO term similarities based on shared genes...\n";
278
279 my @goodgo=();
280 foreach $go (@gos) { push @goodgo, $go unless ($gonego{$go}==1); }
281 @gos=@goodgo;
282
283 ################################
284
285 #warn "comparing categories...\n";
286 #my $clfile="cl_".$inname31;
287 #if($dones!~/ $clfile /) {
288
289 use List::Util qw[min max];
290 for ($g1=0;$g1<=$#gos;$g1++){
291 my $go=@gos[$g1];
292 next if ($gonego{$go}==1);
293 #warn "----------------\n$go $desc{$go} level $level{$go}\n";
294 my $goos=$go;
295 my $lev=$level{$go};
296 my $dsc=$desc{$go};
297 for ($g2=$g1+1;$g2<=$#gos;$g2++){
298 my $go2=@gos[$g2];
299 next if ($gonego{$go2}==1);
300 next if ($ggi{$go2}==1);
301 my %seen={};
302 my $count=0;
303 my @combo=();
304 if ($lump<=1) {
305 foreach $g (@{$genes{$go}},@{$genes{$go2}}){
306 unless($seen{$g}==1 ){
307 $count++;
308 $seen{$g}=1;
309 push @combo, $g;
310 }
311 }
312 my $shared=$#{$genes{$go}}+1+$#{$genes{$go2}}+1-$count;
313 $overlap{$go,$go2}=min($shared/($#{$genes{$go}}+1),$shared/($#{$genes{$go2}}+1));
314 $overlap{$go2,$go}=min($shared/($#{$genes{$go}}+1),$shared/($#{$genes{$go2}}+1));
315 }
316 }
317 }
318
319 open OUT, ">$inname31" or die "gomwu_b: cannot create output $inname31\n";
320
321 print {OUT} join("\t",@gos),"\n";
322
323 foreach $go (@gos) {
324 $overlap{$go,$go}=1;
325 foreach $go2 (@gos){
326 print {OUT} sprintf("%.3f",1-$overlap{$go,$go2});;
327 print {OUT} "\t" unless ($go2 eq $gos[$#gos]);
328 }
329 print {OUT} "\n";
330 }
331 close OUT;
332 #}
333
334 #print "calling clusteringGOs.R script ....\n";
335 # my $err=`Rscript clusteringGOs.R $inname31 $cutHeight `;
336 # print $err;
337
338
339
340
341