1
|
1 #argv0: input transcript bed file
|
|
2 #argv1: output filename, will be in AS format
|
|
3
|
|
4 use strict;
|
|
5 my $AnnoFileName = $ARGV[0];
|
|
6 my $outputFileName = $ARGV[1];
|
|
7
|
|
8 if($AnnoFileName eq "" or $outputFileName eq "")
|
|
9 {
|
|
10 print "TXDBGEN: Please specify your input files\n";
|
|
11 exit;
|
|
12 }
|
|
13
|
|
14 my $cachefolder = `basename $outputFileName`;
|
|
15 chomp($cachefolder);
|
|
16 my $cachefolder = $cachefolder.".cache";
|
|
17 my $AnnoFileBase = `basename $AnnoFileName`;
|
|
18 chomp($AnnoFileBase);
|
|
19
|
|
20 if(! -e $cachefolder)
|
|
21 {
|
|
22 mkdir $cachefolder or die "CHECK: cannot mkdir $cachefolder\n";
|
|
23 print "TXDBGEN: mkdir $cachefolder\n";
|
|
24 }
|
|
25
|
|
26 my $CacheAnnoFileName = $cachefolder."/".$AnnoFileBase.".sort";
|
|
27 #sort the annotation file
|
|
28
|
|
29 print "TXDBGEN: sort $AnnoFileName \n";
|
|
30
|
|
31 system("sort -k6,6 -k1,1 -k2,2n -k3,3n $AnnoFileName >$CacheAnnoFileName");
|
|
32
|
|
33 $AnnoFileName = $CacheAnnoFileName;
|
|
34
|
|
35 #read the annotations into hashes
|
|
36
|
|
37 open(AnnoFile, $CacheAnnoFileName) or die "can not open",$CacheAnnoFileName;
|
|
38 #split the genes into contigs
|
|
39
|
|
40 my $contigid = 0;
|
|
41 my $end_tmp = 0;
|
|
42 my $chr_tmp = "chr";
|
|
43 my $strand_tmp="NA";
|
|
44
|
|
45 my $TXnumtmp=0;
|
|
46
|
|
47 my %eventlist =();
|
|
48 #my %eventlist_af = ();
|
|
49 #my %eventlist_al = ();
|
|
50 my %evidences = ();
|
|
51
|
|
52 open(my $output,">$outputFileName");
|
|
53
|
|
54 open(my $output2, ">$outputFileName.evi");
|
|
55 while(my $line =<AnnoFile>)
|
|
56 {
|
|
57 chomp($line);
|
|
58 my @a = split("\t",$line);
|
|
59 my $chr = $a[0];
|
|
60 my $start = $a[1];
|
|
61 my $end = $a[2];
|
|
62 my $name = $a[3];
|
|
63 my @sizes = split(",",$a[10]);
|
|
64 my $strand = $a[5];
|
|
65 my @start_shifts = split(",",$a[11]);
|
|
66 #my $chrstr = $chr.$strand;
|
|
67 my @starts;
|
|
68 my @ends;
|
|
69 for(my $i=0;$i<@start_shifts;$i++)
|
|
70 {
|
|
71 $starts[$i]=$start_shifts[$i]+$start;
|
|
72 $ends[$i] = $starts[$i]+$sizes[$i];
|
|
73 }
|
|
74 if($start >$end_tmp or $chr ne $chr_tmp or $strand_tmp ne $strand)
|
|
75 {
|
|
76 $contigid++;
|
|
77 #$ctgmultisonum++ if $TXnumtmp>1;
|
|
78 $TXnumtmp=0;
|
|
79 my $annos = scanevents(\%eventlist, "inner") ;
|
|
80 #my $annos_af = scanevents(\%eventlist_af, "af");
|
|
81 #my $annos_al = scanevents(\%eventlist_al, "al");
|
|
82
|
|
83 my $stdout = select ($output);
|
|
84 my $eventids = printanno($annos,$chr_tmp,$strand_tmp) if(scalar(%$annos)>0);
|
|
85 #printanno($annos_af,$chr_tmp,$strand_tmp) if(scalar(%$annos_af)>0);
|
|
86 #printanno($annos_al,$chr_tmp,$strand_tmp) if(scalar(%$annos_al)>0);
|
|
87 select($stdout);
|
|
88 $stdout = select ($output2);
|
|
89 #print cross information
|
|
90 foreach my $connect_str (keys %$eventids)
|
|
91 {
|
|
92 print $eventids->{$connect_str},"\t";
|
|
93 foreach my $transcriptid (keys %{$evidences{$connect_str}})
|
|
94 {
|
|
95 print $transcriptid,",";
|
|
96 }
|
|
97 print "\n";
|
|
98 }
|
|
99 select($stdout);
|
|
100 print "TXDBGEN: Contig ID $contigid at $chr $strand...\n" if ( $contigid%1000 == 0);
|
|
101 %eventlist = ();
|
|
102 #%eventlist_al = ();
|
|
103 #%eventlist_af = ();
|
|
104 #print CacheContigFile "#ctg$contigid\n";
|
|
105
|
|
106 }
|
|
107 $TXnumtmp++;
|
|
108 $end_tmp = $end if ($end > $end_tmp or $chr ne $chr_tmp or $strand_tmp ne $strand);
|
|
109 $chr_tmp = $chr;
|
|
110
|
|
111 $strand_tmp = $strand;
|
|
112
|
|
113 #scan connections, 2 and 3 exons for CA/CS/AF
|
|
114 # if(scalar(@starts)>2)
|
|
115 # {
|
|
116 # my $connectionstr_af =$starts[0]."-".
|
|
117 # $ends[0].",".
|
|
118 # $starts[1]."-".
|
|
119 # $ends[1]."," ;
|
|
120 #
|
|
121 # $eventlist_af{$connectionstr_af} = $starts[0];
|
|
122 # my $connectionstr_al = $starts[scalar(@starts)-2]."-".
|
|
123 # $ends[scalar(@starts)-2].",".
|
|
124 # $starts[scalar(@starts)-1]."-".
|
|
125 # $ends[scalar(@starts)-1]."," ;
|
|
126 # $eventlist_al{$connectionstr_al} = $starts[scalar(@starts)-2];
|
|
127 # }
|
|
128 #didn't consider direction yet
|
|
129 #add 1 exon for IR
|
|
130 for(my $n=1;$n<4;$n++)
|
|
131 {
|
|
132 for(my $i=0;$i<scalar(@starts)-$n+1;$i++)
|
|
133 {
|
|
134 my $connectionstr = "";
|
|
135 for(my $j=$i;$j<$i+$n;$j++)
|
|
136 {
|
|
137 $connectionstr = $connectionstr .
|
|
138 $starts[$j]."-".
|
|
139 $ends[$j].",";
|
|
140 }
|
|
141 # print $connectionstr,"\n";
|
|
142 $eventlist{$connectionstr} = $n;
|
|
143 $evidences{$connectionstr}{$name} = 1;
|
|
144 }
|
|
145
|
|
146 }
|
|
147
|
|
148 }
|
|
149
|
|
150 my $annos = scanevents(\%eventlist, "inner");
|
|
151 #my $annos_af = scanevents(\%eventlist_af, "af");
|
|
152 #my $annos_al = scanevents(\%eventlist_al, "al");
|
|
153
|
|
154 my $stdout = select ($output);
|
|
155 my $eventids = printanno($annos,$chr_tmp,$strand_tmp) if(scalar(%$annos)>0);
|
|
156 select ($stdout);
|
|
157 $stdout = select ($output2);
|
|
158 #print cross information
|
|
159 foreach my $connect_str (keys %$eventids)
|
|
160 {
|
|
161 print $eventids->{$connect_str},"\t";
|
|
162 foreach my $transcriptid (keys %{$evidences{$connect_str}})
|
|
163 {
|
|
164 print $transcriptid,",";
|
|
165 }
|
|
166 print "\n";
|
|
167 }
|
|
168
|
|
169 select($stdout);
|
|
170
|
|
171 close($output);
|
|
172 close($output2);
|
|
173
|
|
174 #close(CacheContigFile);
|
|
175 #define sub scanevents
|
|
176 sub scanevents
|
|
177 {
|
|
178 my ($eventlist,$mode)=@_;
|
|
179 my %anno=();
|
|
180 if($mode eq "inner")
|
|
181 {
|
|
182 foreach my $key (keys %$eventlist)
|
|
183 {
|
|
184 #print $key,"\t",$eventlist->{$key},"\n";
|
|
185 if($eventlist->{$key} == 3)
|
|
186 {
|
|
187 my @a=split(",",$key);
|
|
188 my $connectstr_ca=$a[0].",".$a[2].",";
|
|
189 if (exists $eventlist->{$connectstr_ca})
|
|
190 {
|
|
191 $anno{$key}="ca";
|
|
192 }
|
|
193 else
|
|
194 {
|
|
195 $anno{$key}="cs";
|
|
196 }
|
|
197 }
|
|
198 if($eventlist->{$key} == 2)
|
|
199 {
|
|
200 my @a=split(/[,-]/,$key);
|
|
201
|
|
202 my $connectstr_ir=$a[0]."-".$a[3].",";
|
|
203 if (exists $eventlist->{$connectstr_ir})
|
|
204 {
|
|
205 $anno{$key}="ir";
|
|
206 }
|
|
207 foreach my $search (keys %$eventlist)
|
|
208 {
|
|
209 next if($eventlist->{$search}!=2);
|
|
210 my $connectstr_ss="";
|
|
211 my @b=split(/[,-]/,$search);
|
|
212 if($a[0]==$b[0] && $a[3]==$b[3])
|
|
213 {
|
|
214 if($a[1]==$b[1])
|
|
215 {
|
|
216 next if ($b[2] == $a[2]);
|
|
217 $connectstr_ss .= $a[0]."-".$a[1].",";
|
|
218 if($b[2]>$a[2])
|
|
219 {
|
|
220 $connectstr_ss .= $a[2]."-".$b[2].",".$b[2];
|
|
221 }
|
|
222 #elsif($b[2]<$a[2])
|
|
223 else
|
|
224 {
|
|
225 $connectstr_ss .= $b[2]."-".$a[2].",".$a[2];
|
|
226 }
|
|
227 $connectstr_ss .= "-".$a[3].",";
|
|
228 $anno{$connectstr_ss}="ss";
|
|
229 #print $connectstr_ss,"\tss\n";
|
|
230 }
|
|
231 if($a[2]==$b[2])
|
|
232 {
|
|
233 $connectstr_ss .= $a[0]."-";
|
|
234 if($b[1]>$a[1])
|
|
235 {
|
|
236 $connectstr_ss .= $a[1].",".$a[1]."-".$b[1];
|
|
237 }
|
|
238 #elsif($b[2]<$a[2])
|
|
239 else
|
|
240 {
|
|
241 $connectstr_ss .= $b[1].",".$b[1]."-".$a[1];
|
|
242 }
|
|
243 $connectstr_ss .= ",".$a[2]."-".$a[3].",";
|
|
244 $anno{$connectstr_ss}="ss";
|
|
245 #print $connectstr_ss,"\tss\n";
|
|
246 }
|
|
247
|
|
248
|
|
249 }
|
|
250
|
|
251 }
|
|
252 }
|
|
253 }
|
|
254 }
|
|
255 if($mode eq "af" or $mode eq "al")#alternative first/last exon
|
|
256 {
|
|
257 foreach my $key (keys %$eventlist)
|
|
258 {
|
|
259 my @a=split(/,/,$key);
|
|
260 foreach my $search (keys %$eventlist)
|
|
261 {
|
|
262 next if $search eq $key;
|
|
263 my $connectstr = "";
|
|
264 if($eventlist->{$search} < $eventlist->{$key} )
|
|
265 {
|
|
266 $connectstr = $search."|".$key;
|
|
267 }
|
|
268 else
|
|
269 {
|
|
270 $connectstr = $key."|".$search;
|
|
271 }
|
|
272 next if exists $anno{$connectstr};
|
|
273 my @b=split(/,/,$search);
|
|
274 if($mode eq "af")
|
|
275 {
|
|
276 if($a[0] ne $b[0] and $a[1] eq $b[1])
|
|
277 {
|
|
278 $anno{$connectstr} = $mode;
|
|
279 }
|
|
280 }
|
|
281 else
|
|
282 {
|
|
283 if($a[0] eq $b[0] and $a[1] ne $b[1])
|
|
284 {
|
|
285 $anno{$connectstr} = $mode;
|
|
286 }
|
|
287 }
|
|
288
|
|
289 }
|
|
290 }
|
|
291 }
|
|
292 return(\%anno);
|
|
293 }
|
|
294
|
|
295
|
|
296 sub printanno
|
|
297 {
|
|
298 # print the annotations
|
|
299 # and return the connectstr and event id to be used to print evidences
|
|
300 my %eventids=();
|
|
301 my ($annos,$chr,$strand)=@_;
|
|
302 my %nums_per_isoform;
|
|
303 foreach my $key (keys %$annos)
|
|
304 {
|
|
305 if($annos->{$key} eq "af")
|
|
306 {
|
|
307 my @a=split(/[\,\-\|]+/,$key);
|
|
308 my $chrid=substr($chr,3,length($chr)-3);
|
|
309 my $id_anno="AF";
|
|
310 #consider direction later
|
|
311 my $id=join("-",$id_anno,$id_anno, $a[2],$a[3]);
|
|
312 my $num=0;
|
|
313 $num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id};
|
|
314 $nums_per_isoform{$id}++;
|
|
315
|
|
316 print $chr,"\t",$a[0],"\t",$a[3],"\t";
|
|
317 print $id.".".$num,"[L]\t";
|
|
318 print "0\t";
|
|
319 print $strand,"\t";
|
|
320 print $a[0],"\t",$a[3],"\t";
|
|
321 print "255,0,0\t";
|
|
322 print "2\t";
|
|
323 print $a[1]-$a[0],",";
|
|
324 print $a[3]-$a[2],"\t";
|
|
325 print "0,";
|
|
326 print $a[2]-$a[0],"\n";
|
|
327
|
|
328 print $chr,"\t",$a[4],"\t",$a[7],"\t";
|
|
329 print $id.".".$num,"[R]\t";
|
|
330 print "0\t";
|
|
331 print $strand,"\t";
|
|
332 print $a[4],"\t",$a[7],"\t";
|
|
333 print "255,0,0\t";
|
|
334 print "2\t";
|
|
335 print $a[5]-$a[4],",";
|
|
336 print $a[7]-$a[6],"\t";
|
|
337 print "0,";
|
|
338 print $a[6]-$a[4],"\n";
|
|
339
|
|
340 #print "$key,af\n";
|
|
341 }
|
|
342 if($annos->{$key} eq "ca" or $annos->{$key} eq "cs")
|
|
343 {
|
|
344 my @a=split(/[,-]/,$key);
|
|
345
|
|
346 my $chrid=substr($chr,3,length($chr)-3);
|
|
347 my $id_anno="CS";
|
|
348 $id_anno="CA" if $annos->{$key} eq "ca";
|
|
349 my $id="CA-$id_anno-$chrid"."-".$a[2]."-".$a[3];
|
|
350 my $num=0;
|
|
351 $num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id};
|
|
352 $nums_per_isoform{$id}++;
|
|
353
|
|
354 print $chr,"\t",$a[0],"\t",$a[5],"\t";
|
|
355 print $id.".".$num,"[L]\t";
|
|
356 $eventids {$key} = $id.".".$num."[L]";
|
|
357 print "0\t";
|
|
358 print $strand,"\t";
|
|
359 print $a[0],"\t",$a[5],"\t";
|
|
360 print "255,0,0\t";
|
|
361 print "3\t";
|
|
362 print $a[1]-$a[0],",";
|
|
363 print $a[3]-$a[2],",";
|
|
364 print $a[5]-$a[4],"\t";
|
|
365 print "0,";
|
|
366 print $a[2]-$a[0],",";
|
|
367 print $a[4]-$a[0],"\n";
|
|
368
|
|
369 print $chr,"\t",$a[0],"\t",$a[5],"\t";
|
|
370 print $id.".".$num,"[S]\t";
|
|
371 my $connectstr_ca= $a[0]."-".$a[1].",".$a[4]."-".$a[5].",";
|
|
372 $eventids{$connectstr_ca} = $id.".".$num."[S]";
|
|
373 print "0\t";
|
|
374 print $strand,"\t";
|
|
375 print $a[0],"\t",$a[5],"\t";
|
|
376 print "255,0,0\t";
|
|
377 print "2\t";
|
|
378 print $a[1]-$a[0],",";
|
|
379 #print $a[3]-$a[2],",";
|
|
380 print $a[5]-$a[4],"\t";
|
|
381 print "0,";
|
|
382 #print $a[2]-$a[0],",";
|
|
383 print $a[4]-$a[0],"\n";
|
|
384
|
|
385 }
|
|
386 if($annos->{$key} eq "ir")
|
|
387 {
|
|
388 my @a=split(/[,-]/,$key);
|
|
389
|
|
390 my $chrid=substr($chr,3,length($chr)-3);
|
|
391 my $id="IR-IR-$chrid"."-".$a[1]."-".$a[2];
|
|
392 my $num=0;
|
|
393 $num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id};
|
|
394 $nums_per_isoform{$id}++;
|
|
395
|
|
396 print $chr,"\t",$a[0],"\t",$a[3],"\t";
|
|
397 print $id.".".$num,"[L]\t";
|
|
398 print "0\t";
|
|
399 print $strand,"\t";
|
|
400 print $a[0],"\t",$a[3],"\t";
|
|
401 $eventids{$a[0]."-".$a[3].","} = $id.".".$num."[L]";
|
|
402 print "255,0,0\t";
|
|
403 print "3\t";
|
|
404 print $a[1]-$a[0],",";
|
|
405 print $a[2]-$a[1],",";
|
|
406 print $a[3]-$a[2],"\t";
|
|
407 print "0,";
|
|
408 print $a[1]-$a[0],",";
|
|
409 print $a[2]-$a[0],"\n";
|
|
410
|
|
411 print $chr,"\t",$a[0],"\t",$a[3],"\t";
|
|
412 print $id.".".$num,"[S]\t";
|
|
413 $eventids{$key} = $id.".".$num."[S]";
|
|
414 print "0\t";
|
|
415 print $strand,"\t";
|
|
416 print $a[0],"\t",$a[3],"\t";
|
|
417 print "255,0,0\t";
|
|
418 print "2\t";
|
|
419 print $a[1]-$a[0],",";
|
|
420 #print $a[3]-$a[2],",";
|
|
421 print $a[3]-$a[2],"\t";
|
|
422 print "0,";
|
|
423 #print $a[2]-$a[0],",";
|
|
424 print $a[2]-$a[0],"\n";
|
|
425
|
|
426 }
|
|
427 if($annos->{$key} eq "ss")
|
|
428 {
|
|
429 my @a=split(/[,-]/,$key);
|
|
430 my $chrid=substr($chr,3,length($chr)-3);
|
|
431 my $type="AA";
|
|
432 if( ($strand eq "+" && $a[1]==$a[2]) or ($strand eq "-" && $a[3]==$a[4]))
|
|
433 {
|
|
434 $type="AD";
|
|
435 }
|
|
436 my $id="$type-$type-$chrid"."-".$a[2]."-".$a[3];
|
|
437 my $connect_str_L = "";
|
|
438 my $connect_str_S = "";
|
|
439 if($a[1]==$a[2])
|
|
440 {
|
|
441 $connect_str_L = $a[0]."-".$a[3].",".$a[4]."-".$a[5].",";
|
|
442 $connect_str_S = $a[0]."-".$a[1].",".$a[4]."-".$a[5].",";
|
|
443 }
|
|
444 else
|
|
445 {
|
|
446 $connect_str_L = $a[0]."-".$a[1].",".$a[2]."-".$a[5].",";
|
|
447 $connect_str_S = $a[0]."-".$a[1].",".$a[3]."-".$a[5].",";
|
|
448
|
|
449 }
|
|
450 my $num=0;
|
|
451 $num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id};
|
|
452 $nums_per_isoform{$id}++;
|
|
453 print $chr,"\t",$a[0],"\t",$a[5],"\t";
|
|
454 print $id.".".$num,"[L]\t";
|
|
455 $eventids {$connect_str_L} = $id.".".$num."[L]";
|
|
456 print "0\t";
|
|
457 print $strand,"\t";
|
|
458 print $a[0],"\t",$a[5],"\t";
|
|
459 print "255,0,0\t";
|
|
460 print "3\t";
|
|
461 print $a[1]-$a[0],",";
|
|
462 print $a[3]-$a[2],",";
|
|
463 print $a[5]-$a[4],"\t";
|
|
464 print "0,";
|
|
465 print $a[2]-$a[0],",";
|
|
466 print $a[4]-$a[0],"\n";
|
|
467
|
|
468 print $chr,"\t",$a[0],"\t",$a[5],"\t";
|
|
469 print $id.".".$num,"[S]\t";
|
|
470 $eventids {$connect_str_S} = $id.".".$num."[S]";
|
|
471 print "0\t";
|
|
472 print $strand,"\t";
|
|
473 print $a[0],"\t",$a[5],"\t";
|
|
474 print "255,0,0\t";
|
|
475 print "2\t";
|
|
476 print $a[1]-$a[0],",";
|
|
477 #print $a[3]-$a[2],",";
|
|
478 print $a[5]-$a[4],"\t";
|
|
479 print "0,";
|
|
480 #print $a[2]-$a[0],",";
|
|
481 print $a[4]-$a[0],"\n";
|
|
482
|
|
483
|
|
484 }
|
|
485
|
|
486 }
|
|
487 return(\%eventids);
|
|
488
|
|
489 }
|
|
490
|