comparison gomwu_b.pl @ 0:91261b42c07e draft

"planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
author cristian
date Thu, 14 Apr 2022 13:28:05 +0000
parents
children f7287f82602f
comparison
equal deleted inserted replaced
-1:000000000000 0:91261b42c07e
1 #!/usr/bin/env perl
2
3 my $usage= "
4
5 gomwu_b.pl (v. Feb 2015):
6
7 This is the second script in the GO database slimming and reformatting procedure,
8 called automatically by goStats.R
9
10 See README_GO_MWU.txt file for details.
11
12 Mikhail Matz, UT Austin; matz@utexas.edu
13
14 ";
15
16 my $gen2go=shift;
17 my $measure=shift;
18 my $div=shift or die "$usage\nNot enough arguments for gomwu_b.pl\n";
19
20 my $clfile="cl_dissim0_".$div."_".$gen2go;
21 open CLF, $clfile or die "cannot locate primary clustering file $clfile\n";
22 my %clgo={};
23 my $go;
24 my $cl;
25
26 <CLF>;
27 while(<CLF>){
28 ($go, $cl)=split(/,/,$_);
29 push @{$clgo{$cl}},$go;
30 }
31 close CLF;
32
33 unlink $clfile;
34 unlink "dissim0_".$div."_".$gen2go;
35
36 opendir THISDIR, ".";
37 my @donealready=grep /$gen2go/, readdir THISDIR;
38 my $dones=" "."@donealready"." ";
39 #print "DONE: $dones\n";
40
41 my $inname2=$measure.".".$div.".tmp";
42 open TAB, $inname2 or die "go_nrify: cannot open input table $inname2\n";
43 <TAB>;
44 my %level={};
45 my %desc={};
46 my %value={};
47 my $des;
48 my $go;
49 my $l;
50 my $gn;
51 my $val;
52 my @gos=();
53 my %genes={};
54 my @gcount=();
55 my %gci={};
56 my %goi={};
57
58 while (<TAB>){
59 chomp;
60 ($des,$go,$l,$val,$gn)=split(/\t/,$_);
61 $value{$gn}=$val;
62 $desc{$go}=$des;
63 push @{$genes{$go}},$gn;
64 push @gcount, $gn unless ($gci{$gn}==1) ;
65 $gci{$gn}=1;
66 # push @genes,$gn;
67 unless ($goi{$go}==1){
68 $desc{$go}=$des;
69 $level{$go}=$l;
70 push @gos, $go;
71 $goi{$go}=1;
72 }
73 }
74 close TAB;
75 unlink $inname2;
76
77 #--------------------
78
79
80 my @nrgos=();
81 my %nrlev={};
82 my @nrgos=();
83 my %nrgenes={};
84 my %nrdesc={};
85 my $gcount=0;
86 my $cl;
87 my $go;
88 my $gene;
89
90 foreach $cl (keys %clgo){
91 my $largest=${$clgo{$cl}}[0];
92 my $maxgenes=$#{$genes{$largest}}+1;
93 my $maxlevel=$level{$largest};
94 $gcount+=$#{$genes{$largest}}+1;
95 my @nrgens=@{$genes{$largest}};
96 if ($#{$clgo{$cl}}>0) {
97 foreach $go (@{$clgo{$cl}}) {
98 if ($maxgenes<($#{$genes{$go}}+1)) {
99 $maxgenes=($#{$genes{$go}}+1);
100 $largest=$go;
101 }
102 elsif ($maxgenes==($#{$genes{$go}}+1)) {
103 if ($maxlevel<$level{$go}){
104 $maxlevel=$level{$go};
105 $largest=$go;
106 }
107 }
108 foreach $gene (@{$genes{$go}}){
109 push @nrgens, $gene unless(" @nrgens "=~/ $gene /);
110 }
111 }
112 }
113 my $goos=join(";",@{$clgo{$cl}});
114 push @nrgos, $goos;
115 $nrdesc{$goos}=$desc{$largest};
116 $nrlev{$goos}=$maxlevel;
117 @{$nrgenes{$goos}}=@nrgens;
118 $gcount+=$#nrgens+1;
119 }
120
121 print $#nrgos+1," non-redundant GO categories of good size\n-------------\n";
122 $outname=$div."_".$measure;
123 open OUT, ">$outname" or die "gomwu_b: cannot create output $outname\n";
124 print {OUT} "name\tterm\tlev\tseq\tvalue\n";
125
126 foreach $go (@nrgos) {
127 foreach $gene (@{$nrgenes{$go}}){
128 print {OUT} "$nrdesc{$go}\t$go\t$nrlev{$go}\t$gene\t$value{$gene}\n";
129 }
130 }
131 close OUT;
132 my %level={};
133 my %desc={};
134 my %value={};
135 my $des;
136 my $go;
137 my $l;
138 my $gn;
139 my $val;
140 my @gos=();
141 my %genes={};
142 my @gcount=();
143 my %nrlev={};
144 my @nrgos=();
145 my %nrgenes={};
146 my %nrdesc={};
147 my $gcount=0;
148 my %dnds;
149
150 ####################
151 # building dissimilarity matrix
152
153 my $inname4="dissim_".$div."_".$measure."_".$gen2go;
154 my $inname3=$div."_".$measure;
155
156 #if($dones!~/ $inname4 /) {
157
158 use List::Util qw[min max];
159 open TAB, $inname3 or die "go_cluster: cannot open input table $inname3\n";
160 <TAB>;
161
162 my $des;
163 my $go;
164 my $l;
165 my $gn;
166 my $val;
167 my @gos=();
168 my %genes={};
169 my %gosi={};
170
171 print"\nSecondary clustering:\ncalculating similarities....\n";
172 while (<TAB>){
173 chomp;
174 ($des,$go,$l,$gn,$val)=split(/\t/,$_);
175 push @{$genes{$go}},$gn;
176 unless ($gosi{$go}==1 ){
177 push @gos, $go;
178 $gosi{$go}=1;
179 }
180 }
181
182 my %dissim={};
183 for ($g1=0;$g1<$#gos;$g1++){
184 my $go=@gos[$g1];
185 #if ($go eq "unknown") { print "unknown as go\n";}
186 for ($g2=$g1+1;$g2<=$#gos;$g2++){
187 my $go2=@gos[$g2];
188 if ($go2 eq "unknown") {
189 #print "$go against $go2\n";
190 $dissim{$go,$go2}=$dissim{$go2,$go}=1;
191 next;
192 }
193 my %seen={}; my $count=0;
194 foreach $g (@{$genes{$go}},@{$genes{$go2}}){
195 unless($seen{$g}==1 ){
196 $count++;
197 $seen{$g}=1;
198 }
199 }
200 my $shared=$#{$genes{$go}}+1+$#{$genes{$go2}}+1-$count;
201 my $ref=min($#{$genes{$go}}+1,$#{$genes{$go2}}+1);
202 $dissim{$go,$go2}=$dissim{$go2,$go}=sprintf("%.3f",1-$shared/$ref);
203 }
204 }
205
206 open OUT, ">$inname4" or die "gomwu_b: cannot create output $inname4\n";
207 print {OUT} join("\t",@gos),"\n";
208
209 foreach $go (@gos) {
210 $dissim{$go,$go}=0;
211 foreach $go2 (@gos){
212 print {OUT} "$dissim{$go,$go2}";
213 print {OUT} "\t" unless ($go2 eq $gos[$#gos]);
214 }
215 print {OUT} "\n";
216 }
217 #}