Mercurial > repos > cristian > rbgoa
comparison gomwu_b.pl @ 0:91261b42c07e draft
"planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
author | cristian |
---|---|
date | Thu, 14 Apr 2022 13:28:05 +0000 |
parents | |
children | f7287f82602f |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:91261b42c07e |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 my $usage= " | |
4 | |
5 gomwu_b.pl (v. Feb 2015): | |
6 | |
7 This is the second script in the GO database slimming and reformatting procedure, | |
8 called automatically by goStats.R | |
9 | |
10 See README_GO_MWU.txt file for details. | |
11 | |
12 Mikhail Matz, UT Austin; matz@utexas.edu | |
13 | |
14 "; | |
15 | |
16 my $gen2go=shift; | |
17 my $measure=shift; | |
18 my $div=shift or die "$usage\nNot enough arguments for gomwu_b.pl\n"; | |
19 | |
20 my $clfile="cl_dissim0_".$div."_".$gen2go; | |
21 open CLF, $clfile or die "cannot locate primary clustering file $clfile\n"; | |
22 my %clgo={}; | |
23 my $go; | |
24 my $cl; | |
25 | |
26 <CLF>; | |
27 while(<CLF>){ | |
28 ($go, $cl)=split(/,/,$_); | |
29 push @{$clgo{$cl}},$go; | |
30 } | |
31 close CLF; | |
32 | |
33 unlink $clfile; | |
34 unlink "dissim0_".$div."_".$gen2go; | |
35 | |
36 opendir THISDIR, "."; | |
37 my @donealready=grep /$gen2go/, readdir THISDIR; | |
38 my $dones=" "."@donealready"." "; | |
39 #print "DONE: $dones\n"; | |
40 | |
41 my $inname2=$measure.".".$div.".tmp"; | |
42 open TAB, $inname2 or die "go_nrify: cannot open input table $inname2\n"; | |
43 <TAB>; | |
44 my %level={}; | |
45 my %desc={}; | |
46 my %value={}; | |
47 my $des; | |
48 my $go; | |
49 my $l; | |
50 my $gn; | |
51 my $val; | |
52 my @gos=(); | |
53 my %genes={}; | |
54 my @gcount=(); | |
55 my %gci={}; | |
56 my %goi={}; | |
57 | |
58 while (<TAB>){ | |
59 chomp; | |
60 ($des,$go,$l,$val,$gn)=split(/\t/,$_); | |
61 $value{$gn}=$val; | |
62 $desc{$go}=$des; | |
63 push @{$genes{$go}},$gn; | |
64 push @gcount, $gn unless ($gci{$gn}==1) ; | |
65 $gci{$gn}=1; | |
66 # push @genes,$gn; | |
67 unless ($goi{$go}==1){ | |
68 $desc{$go}=$des; | |
69 $level{$go}=$l; | |
70 push @gos, $go; | |
71 $goi{$go}=1; | |
72 } | |
73 } | |
74 close TAB; | |
75 unlink $inname2; | |
76 | |
77 #-------------------- | |
78 | |
79 | |
80 my @nrgos=(); | |
81 my %nrlev={}; | |
82 my @nrgos=(); | |
83 my %nrgenes={}; | |
84 my %nrdesc={}; | |
85 my $gcount=0; | |
86 my $cl; | |
87 my $go; | |
88 my $gene; | |
89 | |
90 foreach $cl (keys %clgo){ | |
91 my $largest=${$clgo{$cl}}[0]; | |
92 my $maxgenes=$#{$genes{$largest}}+1; | |
93 my $maxlevel=$level{$largest}; | |
94 $gcount+=$#{$genes{$largest}}+1; | |
95 my @nrgens=@{$genes{$largest}}; | |
96 if ($#{$clgo{$cl}}>0) { | |
97 foreach $go (@{$clgo{$cl}}) { | |
98 if ($maxgenes<($#{$genes{$go}}+1)) { | |
99 $maxgenes=($#{$genes{$go}}+1); | |
100 $largest=$go; | |
101 } | |
102 elsif ($maxgenes==($#{$genes{$go}}+1)) { | |
103 if ($maxlevel<$level{$go}){ | |
104 $maxlevel=$level{$go}; | |
105 $largest=$go; | |
106 } | |
107 } | |
108 foreach $gene (@{$genes{$go}}){ | |
109 push @nrgens, $gene unless(" @nrgens "=~/ $gene /); | |
110 } | |
111 } | |
112 } | |
113 my $goos=join(";",@{$clgo{$cl}}); | |
114 push @nrgos, $goos; | |
115 $nrdesc{$goos}=$desc{$largest}; | |
116 $nrlev{$goos}=$maxlevel; | |
117 @{$nrgenes{$goos}}=@nrgens; | |
118 $gcount+=$#nrgens+1; | |
119 } | |
120 | |
121 print $#nrgos+1," non-redundant GO categories of good size\n-------------\n"; | |
122 $outname=$div."_".$measure; | |
123 open OUT, ">$outname" or die "gomwu_b: cannot create output $outname\n"; | |
124 print {OUT} "name\tterm\tlev\tseq\tvalue\n"; | |
125 | |
126 foreach $go (@nrgos) { | |
127 foreach $gene (@{$nrgenes{$go}}){ | |
128 print {OUT} "$nrdesc{$go}\t$go\t$nrlev{$go}\t$gene\t$value{$gene}\n"; | |
129 } | |
130 } | |
131 close OUT; | |
132 my %level={}; | |
133 my %desc={}; | |
134 my %value={}; | |
135 my $des; | |
136 my $go; | |
137 my $l; | |
138 my $gn; | |
139 my $val; | |
140 my @gos=(); | |
141 my %genes={}; | |
142 my @gcount=(); | |
143 my %nrlev={}; | |
144 my @nrgos=(); | |
145 my %nrgenes={}; | |
146 my %nrdesc={}; | |
147 my $gcount=0; | |
148 my %dnds; | |
149 | |
150 #################### | |
151 # building dissimilarity matrix | |
152 | |
153 my $inname4="dissim_".$div."_".$measure."_".$gen2go; | |
154 my $inname3=$div."_".$measure; | |
155 | |
156 #if($dones!~/ $inname4 /) { | |
157 | |
158 use List::Util qw[min max]; | |
159 open TAB, $inname3 or die "go_cluster: cannot open input table $inname3\n"; | |
160 <TAB>; | |
161 | |
162 my $des; | |
163 my $go; | |
164 my $l; | |
165 my $gn; | |
166 my $val; | |
167 my @gos=(); | |
168 my %genes={}; | |
169 my %gosi={}; | |
170 | |
171 print"\nSecondary clustering:\ncalculating similarities....\n"; | |
172 while (<TAB>){ | |
173 chomp; | |
174 ($des,$go,$l,$gn,$val)=split(/\t/,$_); | |
175 push @{$genes{$go}},$gn; | |
176 unless ($gosi{$go}==1 ){ | |
177 push @gos, $go; | |
178 $gosi{$go}=1; | |
179 } | |
180 } | |
181 | |
182 my %dissim={}; | |
183 for ($g1=0;$g1<$#gos;$g1++){ | |
184 my $go=@gos[$g1]; | |
185 #if ($go eq "unknown") { print "unknown as go\n";} | |
186 for ($g2=$g1+1;$g2<=$#gos;$g2++){ | |
187 my $go2=@gos[$g2]; | |
188 if ($go2 eq "unknown") { | |
189 #print "$go against $go2\n"; | |
190 $dissim{$go,$go2}=$dissim{$go2,$go}=1; | |
191 next; | |
192 } | |
193 my %seen={}; my $count=0; | |
194 foreach $g (@{$genes{$go}},@{$genes{$go2}}){ | |
195 unless($seen{$g}==1 ){ | |
196 $count++; | |
197 $seen{$g}=1; | |
198 } | |
199 } | |
200 my $shared=$#{$genes{$go}}+1+$#{$genes{$go2}}+1-$count; | |
201 my $ref=min($#{$genes{$go}}+1,$#{$genes{$go2}}+1); | |
202 $dissim{$go,$go2}=$dissim{$go2,$go}=sprintf("%.3f",1-$shared/$ref); | |
203 } | |
204 } | |
205 | |
206 open OUT, ">$inname4" or die "gomwu_b: cannot create output $inname4\n"; | |
207 print {OUT} join("\t",@gos),"\n"; | |
208 | |
209 foreach $go (@gos) { | |
210 $dissim{$go,$go}=0; | |
211 foreach $go2 (@gos){ | |
212 print {OUT} "$dissim{$go,$go2}"; | |
213 print {OUT} "\t" unless ($go2 eq $gos[$#gos]); | |
214 } | |
215 print {OUT} "\n"; | |
216 } | |
217 #} |