Mercurial > repos > cristian > rbgoa
comparison gomwu_a.pl @ 0:91261b42c07e draft
"planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
author | cristian |
---|---|
date | Thu, 14 Apr 2022 13:28:05 +0000 |
parents | |
children | f7287f82602f |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:91261b42c07e |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 my $usage= " | |
4 | |
5 gomwu_a.pl (v. Feb 2015): | |
6 | |
7 This is the fist script in the GO database slimming and reformatting procedure, | |
8 called automatically by goStats.R | |
9 | |
10 See README_GO_MWU.txt file for details. | |
11 | |
12 Mikhail Matz, UT Austin; matz@utexas.edu | |
13 | |
14 "; | |
15 | |
16 print "@ARGV"; | |
17 | |
18 my $onto=$ARGV[0] or die $usage; | |
19 my $gen2go=$ARGV[1] or die $usage; | |
20 my $measure=$ARGV[2] or die $usage; | |
21 my $div=$ARGV[3] or die $usage; | |
22 my $altern="t"; | |
23 if ("@ARGV"=~/alternative=(\w)/) { $altern=$1; } | |
24 my $toomany=0.1; | |
25 if ("@ARGV"=~/largest=(0\.\d+)/) { $toomany=$1; } | |
26 my $mingenes=5; | |
27 if ("@ARGV"=~/smallest=(\d+)/) { $mingenes=$1; } | |
28 my $cutHeight=0.25; | |
29 if ("@ARGV"=~/cutHeight=(\S+)/) { $cutHeight=$1; } | |
30 my $padj="BH"; | |
31 if ("@ARGV"=~/p.adjust=(\w+)/) { $padj=$1; } | |
32 | |
33 | |
34 print " | |
35 | |
36 Run parameters: | |
37 | |
38 largest GO category as fraction of all genes (largest) : $toomany | |
39 smallest GO category as # of genes (smallest) : $mingenes | |
40 clustering threshold (clusterCutHeight) : $cutHeight | |
41 | |
42 "; | |
43 | |
44 my $division; | |
45 if ($div eq "BP") { $division="biological_process";} | |
46 elsif ($div eq "MF") { $division="molecular_function";} | |
47 elsif ($div eq "CC") { $division="cellular_component";} | |
48 else { die "unrecognized division: $div\n";} | |
49 | |
50 my $inname2=$measure.".".$div.".tmp"; | |
51 my $inname3=$div."_".$measure; | |
52 my $inname31="dissim0_".$div."_".$gen2go; | |
53 my $inname4="dissim_".$div."_".$measure."_".$gen2go; | |
54 | |
55 my @donealready=(); | |
56 | |
57 open GO, $onto or die "cannot open ontology $onto\n"; | |
58 open CORAL, $gen2go or die "cannot open genes2go table $gen2go\n"; | |
59 open DNDS, $measure or die "cannot open measure table $measure\n"; | |
60 my $outname=$inname2; | |
61 open VOOL, ">$outname" or die "cannot create output file $outname\n"; | |
62 | |
63 ################################################# | |
64 # reading GO hierarchy, only the $divterm division | |
65 print "-----------------\nretrieving GO hierarchy, reformatting data...\n\n"; | |
66 my %parents; | |
67 my %name; | |
68 my $term; | |
69 my %goodterms; | |
70 #my $divterm; | |
71 #my $division="molecular_function"; | |
72 | |
73 $bads=0; | |
74 | |
75 while (<GO>){ | |
76 if ($_=~/^id:\s(GO\S+)/){ | |
77 $term=$1; | |
78 } | |
79 elsif ($_=~/^namespace:\s(\S+)/){ | |
80 if ($1 ne $division) { | |
81 $term=""; | |
82 } | |
83 else { | |
84 $goodterms{$term}="OK" unless (!$term); | |
85 } | |
86 } | |
87 | |
88 elsif ($_=~/^is_a:\s(GO\S+)\s/) { | |
89 push @{$parents{$term}}, $1 unless (!$term); | |
90 push @allparents, $1; | |
91 } | |
92 elsif ($_=~/^name:\s(.+)/) { | |
93 $name{$term}=$1; | |
94 chomp($name{$term}) unless (!$term); | |
95 if ($name{$term} eq $division) { $divterm=$term;} | |
96 } | |
97 } | |
98 $goodterms{$term}="OK" unless (!$term); | |
99 | |
100 # calculating hierarchy levels, putting all parents together | |
101 | |
102 my %golevel; | |
103 $golevel{$divterm}=0; | |
104 | |
105 foreach $term (keys %parents) { | |
106 $extra=$#{$parents{$term}}+1; | |
107 for ($in=$#{$parents{$term}};$extra;$in+=$extra){ | |
108 $golevel{$term}++; | |
109 $tstart=$in-$extra+1; | |
110 $extra=0; | |
111 for ($tt=$tstart; $tt<=$in;$tt++){ | |
112 $t=${$parents{$term}}[$tt]; | |
113 foreach $t0 (@{$parents{$t}}) { | |
114 next if ("@{$parents{$term}}"=~/$t0/ || !$goodterms{$t0}); | |
115 push @{$parents{$term}}, $t0; | |
116 $extra++; | |
117 } | |
118 } | |
119 } | |
120 } | |
121 ################################################### | |
122 | |
123 # reading measures | |
124 my $l=0; | |
125 my %dnds; | |
126 my $seq; | |
127 | |
128 while (<DNDS>){ | |
129 if (!$l){ | |
130 $l++; | |
131 next; | |
132 } | |
133 chomp; | |
134 ($seq,$ns)=split(/,/, $_); | |
135 if ($seq=~/SEQ/) { $seq.="_s";} | |
136 $dnds{$seq}=$ns; | |
137 } | |
138 | |
139 #reformatting | |
140 | |
141 $bads=0; | |
142 $goods=0; | |
143 | |
144 my $seq; | |
145 my $goline; | |
146 my @terms; | |
147 my @orphans; | |
148 my @nolevel; | |
149 | |
150 print {VOOL} "id\tterm\tlev\tvalue\tseq\n"; | |
151 | |
152 my $count; | |
153 while (<CORAL>){ | |
154 chomp; | |
155 ($seq,$goline)=split(/\t/,$_); | |
156 if ($dnds{$seq}!~/\d/) { | |
157 push @orphans, $seq; | |
158 next; | |
159 } | |
160 if ($goline=~/unknown/){ | |
161 print {VOOL} "\"$goline\"\t$goline\t5\t$dnds{$seq}\t$seq\n"; | |
162 next; | |
163 } | |
164 @terms=split(/;/,$goline); | |
165 $nt0=$#terms+1; | |
166 $count++; | |
167 #print "-------------------\n$count | $seq terms: $nt0\n"; | |
168 my @collect; | |
169 foreach $term (@terms) { | |
170 if (!$goodterms{$term}) { | |
171 $bads++; | |
172 #print "$term is not $division$1\n"; | |
173 next; | |
174 } | |
175 #print "$term\n"; | |
176 $goods++; | |
177 if (!$golevel{$term}){ | |
178 push @nolevel,$term; | |
179 $golevel{$term}=-1; | |
180 } | |
181 push @collect,($term,@{$parents{$term}}); | |
182 } | |
183 | |
184 $ncoll=$#collect+1; | |
185 #print "collected terms: $ncoll\n"; | |
186 | |
187 my @nrcollect; | |
188 foreach $term (@collect) { | |
189 push @nrcollect, $term unless ("@nrcollect"=~/$term/); | |
190 } | |
191 | |
192 $ncoll=$#nrcollect+1; | |
193 #print " nr terms: $ncoll\n"; | |
194 | |
195 foreach $term (@nrcollect) { | |
196 print {VOOL} "\"$name{$term}\"\t$term\t$golevel{$term}\t$dnds{$seq}\t$seq\n"; | |
197 } | |
198 } | |
199 #print "terms matching division: $goods\nterms not matching division: $bads\n"; | |
200 $nor=$#orphans+1; | |
201 $nnl=$#nolevel+1; | |
202 print "-------------\ngo_reformat:\nGenes with GO annotations, but not listed in measure table: $nor\n"; | |
203 print "\nTerms without defined level (old ontology?..): $nnl\n-------------\n"; | |
204 my %parents; | |
205 my %name; | |
206 my $term; | |
207 my %goodterms; | |
208 close VOOL; | |
209 ######################### | |
210 # selecting good sized categroies, collapsing redundant ones | |
211 #if($dones!~/ $inname3 /) { | |
212 open TAB, $inname2 or die "go_nrify: cannot open input table $inname2\n"; | |
213 <TAB>; | |
214 | |
215 my %level={}; | |
216 my %desc={}; | |
217 my %value={}; | |
218 my $des; | |
219 my $go; | |
220 my $l; | |
221 my $gn; | |
222 my $val; | |
223 my @gos=(); | |
224 my %genes={}; | |
225 my @gcount=(); | |
226 my %gci={}; | |
227 my %goi={}; | |
228 | |
229 while (<TAB>){ | |
230 chomp; | |
231 ($des,$go,$l,$val,$gn)=split(/\t/,$_); | |
232 $value{$gn}=$val; | |
233 $desc{$go}=$des; | |
234 push @{$genes{$go}},$gn; | |
235 push @gcount, $gn unless ($gci{$gn}==1) ; | |
236 $gci{$gn}=1; | |
237 # push @genes,$gn; | |
238 unless ($goi{$go}==1){ | |
239 $desc{$go}=$des; | |
240 $level{$go}=$l; | |
241 push @gos, $go; | |
242 $goi{$go}=1; | |
243 } | |
244 } | |
245 | |
246 close TAB; | |
247 #unlink $inname2; | |
248 | |
249 $gc=$#gcount+1; | |
250 $goc=$#gos+1; | |
251 print "-------------\ngo_nrify:\n$goc categories, $gc genes; size range $mingenes-",$gc*$toomany,"\n"; | |
252 | |
253 # excluding too large and too small categories | |
254 my %gonego={}; | |
255 my $toobroad=0; | |
256 my $toosmall=0; | |
257 for ($g1=0;$g1<=$#gos;$g1++){ | |
258 $go=@gos[$g1]; | |
259 my $golen=$#{$genes{$go}}+1; | |
260 if ($golen > $gc*$toomany) { | |
261 unless ($go eq "unknown") { | |
262 $gonego{$go}=1; | |
263 $toobroad++; | |
264 } | |
265 #warn "$go: too broad ($golen genes)\n"; | |
266 } | |
267 elsif ($golen < $mingenes) { | |
268 $gonego{$go}=1 ; | |
269 $toosmall++; | |
270 #warn "\t\t$go: too narrow ($golen genes) @{$genes{$go}}\n"; | |
271 } | |
272 } | |
273 print "\t$toobroad too broad\n\t$toosmall too small\n\t", $goc-$toobroad-$toosmall," remaining | |
274 | |
275 removing redundancy: | |
276 | |
277 calculating GO term similarities based on shared genes...\n"; | |
278 | |
279 my @goodgo=(); | |
280 foreach $go (@gos) { push @goodgo, $go unless ($gonego{$go}==1); } | |
281 @gos=@goodgo; | |
282 | |
283 ################################ | |
284 | |
285 #warn "comparing categories...\n"; | |
286 #my $clfile="cl_".$inname31; | |
287 #if($dones!~/ $clfile /) { | |
288 | |
289 use List::Util qw[min max]; | |
290 for ($g1=0;$g1<=$#gos;$g1++){ | |
291 my $go=@gos[$g1]; | |
292 next if ($gonego{$go}==1); | |
293 #warn "----------------\n$go $desc{$go} level $level{$go}\n"; | |
294 my $goos=$go; | |
295 my $lev=$level{$go}; | |
296 my $dsc=$desc{$go}; | |
297 for ($g2=$g1+1;$g2<=$#gos;$g2++){ | |
298 my $go2=@gos[$g2]; | |
299 next if ($gonego{$go2}==1); | |
300 next if ($ggi{$go2}==1); | |
301 my %seen={}; | |
302 my $count=0; | |
303 my @combo=(); | |
304 if ($lump<=1) { | |
305 foreach $g (@{$genes{$go}},@{$genes{$go2}}){ | |
306 unless($seen{$g}==1 ){ | |
307 $count++; | |
308 $seen{$g}=1; | |
309 push @combo, $g; | |
310 } | |
311 } | |
312 my $shared=$#{$genes{$go}}+1+$#{$genes{$go2}}+1-$count; | |
313 $overlap{$go,$go2}=min($shared/($#{$genes{$go}}+1),$shared/($#{$genes{$go2}}+1)); | |
314 $overlap{$go2,$go}=min($shared/($#{$genes{$go}}+1),$shared/($#{$genes{$go2}}+1)); | |
315 } | |
316 } | |
317 } | |
318 | |
319 open OUT, ">$inname31" or die "gomwu_b: cannot create output $inname31\n"; | |
320 | |
321 print {OUT} join("\t",@gos),"\n"; | |
322 | |
323 foreach $go (@gos) { | |
324 $overlap{$go,$go}=1; | |
325 foreach $go2 (@gos){ | |
326 print {OUT} sprintf("%.3f",1-$overlap{$go,$go2});; | |
327 print {OUT} "\t" unless ($go2 eq $gos[$#gos]); | |
328 } | |
329 print {OUT} "\n"; | |
330 } | |
331 close OUT; | |
332 #} | |
333 | |
334 #print "calling clusteringGOs.R script ....\n"; | |
335 # my $err=`Rscript clusteringGOs.R $inname31 $cutHeight `; | |
336 # print $err; | |
337 | |
338 | |
339 | |
340 | |
341 |