comparison bin/scanbed2txdb.pl @ 1:adc0f7765d85 draft

planemo upload
author bioitcore
date Thu, 07 Sep 2017 15:06:58 -0400
parents
children
comparison
equal deleted inserted replaced
0:d4ca551ca300 1:adc0f7765d85
1 #argv0: input transcript bed file
2 #argv1: output filename, will be in AS format
3
4 use strict;
5 my $AnnoFileName = $ARGV[0];
6 my $outputFileName = $ARGV[1];
7
8 if($AnnoFileName eq "" or $outputFileName eq "")
9 {
10 print "TXDBGEN: Please specify your input files\n";
11 exit;
12 }
13
14 my $cachefolder = `basename $outputFileName`;
15 chomp($cachefolder);
16 my $cachefolder = $cachefolder.".cache";
17 my $AnnoFileBase = `basename $AnnoFileName`;
18 chomp($AnnoFileBase);
19
20 if(! -e $cachefolder)
21 {
22 mkdir $cachefolder or die "CHECK: cannot mkdir $cachefolder\n";
23 print "TXDBGEN: mkdir $cachefolder\n";
24 }
25
26 my $CacheAnnoFileName = $cachefolder."/".$AnnoFileBase.".sort";
27 #sort the annotation file
28
29 print "TXDBGEN: sort $AnnoFileName \n";
30
31 system("sort -k6,6 -k1,1 -k2,2n -k3,3n $AnnoFileName >$CacheAnnoFileName");
32
33 $AnnoFileName = $CacheAnnoFileName;
34
35 #read the annotations into hashes
36
37 open(AnnoFile, $CacheAnnoFileName) or die "can not open",$CacheAnnoFileName;
38 #split the genes into contigs
39
40 my $contigid = 0;
41 my $end_tmp = 0;
42 my $chr_tmp = "chr";
43 my $strand_tmp="NA";
44
45 my $TXnumtmp=0;
46
47 my %eventlist =();
48 #my %eventlist_af = ();
49 #my %eventlist_al = ();
50 my %evidences = ();
51
52 open(my $output,">$outputFileName");
53
54 open(my $output2, ">$outputFileName.evi");
55 while(my $line =<AnnoFile>)
56 {
57 chomp($line);
58 my @a = split("\t",$line);
59 my $chr = $a[0];
60 my $start = $a[1];
61 my $end = $a[2];
62 my $name = $a[3];
63 my @sizes = split(",",$a[10]);
64 my $strand = $a[5];
65 my @start_shifts = split(",",$a[11]);
66 #my $chrstr = $chr.$strand;
67 my @starts;
68 my @ends;
69 for(my $i=0;$i<@start_shifts;$i++)
70 {
71 $starts[$i]=$start_shifts[$i]+$start;
72 $ends[$i] = $starts[$i]+$sizes[$i];
73 }
74 if($start >$end_tmp or $chr ne $chr_tmp or $strand_tmp ne $strand)
75 {
76 $contigid++;
77 #$ctgmultisonum++ if $TXnumtmp>1;
78 $TXnumtmp=0;
79 my $annos = scanevents(\%eventlist, "inner") ;
80 #my $annos_af = scanevents(\%eventlist_af, "af");
81 #my $annos_al = scanevents(\%eventlist_al, "al");
82
83 my $stdout = select ($output);
84 my $eventids = printanno($annos,$chr_tmp,$strand_tmp) if(scalar(%$annos)>0);
85 #printanno($annos_af,$chr_tmp,$strand_tmp) if(scalar(%$annos_af)>0);
86 #printanno($annos_al,$chr_tmp,$strand_tmp) if(scalar(%$annos_al)>0);
87 select($stdout);
88 $stdout = select ($output2);
89 #print cross information
90 foreach my $connect_str (keys %$eventids)
91 {
92 print $eventids->{$connect_str},"\t";
93 foreach my $transcriptid (keys %{$evidences{$connect_str}})
94 {
95 print $transcriptid,",";
96 }
97 print "\n";
98 }
99 select($stdout);
100 print "TXDBGEN: Contig ID $contigid at $chr $strand...\n" if ( $contigid%1000 == 0);
101 %eventlist = ();
102 #%eventlist_al = ();
103 #%eventlist_af = ();
104 #print CacheContigFile "#ctg$contigid\n";
105
106 }
107 $TXnumtmp++;
108 $end_tmp = $end if ($end > $end_tmp or $chr ne $chr_tmp or $strand_tmp ne $strand);
109 $chr_tmp = $chr;
110
111 $strand_tmp = $strand;
112
113 #scan connections, 2 and 3 exons for CA/CS/AF
114 # if(scalar(@starts)>2)
115 # {
116 # my $connectionstr_af =$starts[0]."-".
117 # $ends[0].",".
118 # $starts[1]."-".
119 # $ends[1]."," ;
120 #
121 # $eventlist_af{$connectionstr_af} = $starts[0];
122 # my $connectionstr_al = $starts[scalar(@starts)-2]."-".
123 # $ends[scalar(@starts)-2].",".
124 # $starts[scalar(@starts)-1]."-".
125 # $ends[scalar(@starts)-1]."," ;
126 # $eventlist_al{$connectionstr_al} = $starts[scalar(@starts)-2];
127 # }
128 #didn't consider direction yet
129 #add 1 exon for IR
130 for(my $n=1;$n<4;$n++)
131 {
132 for(my $i=0;$i<scalar(@starts)-$n+1;$i++)
133 {
134 my $connectionstr = "";
135 for(my $j=$i;$j<$i+$n;$j++)
136 {
137 $connectionstr = $connectionstr .
138 $starts[$j]."-".
139 $ends[$j].",";
140 }
141 # print $connectionstr,"\n";
142 $eventlist{$connectionstr} = $n;
143 $evidences{$connectionstr}{$name} = 1;
144 }
145
146 }
147
148 }
149
150 my $annos = scanevents(\%eventlist, "inner");
151 #my $annos_af = scanevents(\%eventlist_af, "af");
152 #my $annos_al = scanevents(\%eventlist_al, "al");
153
154 my $stdout = select ($output);
155 my $eventids = printanno($annos,$chr_tmp,$strand_tmp) if(scalar(%$annos)>0);
156 select ($stdout);
157 $stdout = select ($output2);
158 #print cross information
159 foreach my $connect_str (keys %$eventids)
160 {
161 print $eventids->{$connect_str},"\t";
162 foreach my $transcriptid (keys %{$evidences{$connect_str}})
163 {
164 print $transcriptid,",";
165 }
166 print "\n";
167 }
168
169 select($stdout);
170
171 close($output);
172 close($output2);
173
174 #close(CacheContigFile);
175 #define sub scanevents
176 sub scanevents
177 {
178 my ($eventlist,$mode)=@_;
179 my %anno=();
180 if($mode eq "inner")
181 {
182 foreach my $key (keys %$eventlist)
183 {
184 #print $key,"\t",$eventlist->{$key},"\n";
185 if($eventlist->{$key} == 3)
186 {
187 my @a=split(",",$key);
188 my $connectstr_ca=$a[0].",".$a[2].",";
189 if (exists $eventlist->{$connectstr_ca})
190 {
191 $anno{$key}="ca";
192 }
193 else
194 {
195 $anno{$key}="cs";
196 }
197 }
198 if($eventlist->{$key} == 2)
199 {
200 my @a=split(/[,-]/,$key);
201
202 my $connectstr_ir=$a[0]."-".$a[3].",";
203 if (exists $eventlist->{$connectstr_ir})
204 {
205 $anno{$key}="ir";
206 }
207 foreach my $search (keys %$eventlist)
208 {
209 next if($eventlist->{$search}!=2);
210 my $connectstr_ss="";
211 my @b=split(/[,-]/,$search);
212 if($a[0]==$b[0] && $a[3]==$b[3])
213 {
214 if($a[1]==$b[1])
215 {
216 next if ($b[2] == $a[2]);
217 $connectstr_ss .= $a[0]."-".$a[1].",";
218 if($b[2]>$a[2])
219 {
220 $connectstr_ss .= $a[2]."-".$b[2].",".$b[2];
221 }
222 #elsif($b[2]<$a[2])
223 else
224 {
225 $connectstr_ss .= $b[2]."-".$a[2].",".$a[2];
226 }
227 $connectstr_ss .= "-".$a[3].",";
228 $anno{$connectstr_ss}="ss";
229 #print $connectstr_ss,"\tss\n";
230 }
231 if($a[2]==$b[2])
232 {
233 $connectstr_ss .= $a[0]."-";
234 if($b[1]>$a[1])
235 {
236 $connectstr_ss .= $a[1].",".$a[1]."-".$b[1];
237 }
238 #elsif($b[2]<$a[2])
239 else
240 {
241 $connectstr_ss .= $b[1].",".$b[1]."-".$a[1];
242 }
243 $connectstr_ss .= ",".$a[2]."-".$a[3].",";
244 $anno{$connectstr_ss}="ss";
245 #print $connectstr_ss,"\tss\n";
246 }
247
248
249 }
250
251 }
252 }
253 }
254 }
255 if($mode eq "af" or $mode eq "al")#alternative first/last exon
256 {
257 foreach my $key (keys %$eventlist)
258 {
259 my @a=split(/,/,$key);
260 foreach my $search (keys %$eventlist)
261 {
262 next if $search eq $key;
263 my $connectstr = "";
264 if($eventlist->{$search} < $eventlist->{$key} )
265 {
266 $connectstr = $search."|".$key;
267 }
268 else
269 {
270 $connectstr = $key."|".$search;
271 }
272 next if exists $anno{$connectstr};
273 my @b=split(/,/,$search);
274 if($mode eq "af")
275 {
276 if($a[0] ne $b[0] and $a[1] eq $b[1])
277 {
278 $anno{$connectstr} = $mode;
279 }
280 }
281 else
282 {
283 if($a[0] eq $b[0] and $a[1] ne $b[1])
284 {
285 $anno{$connectstr} = $mode;
286 }
287 }
288
289 }
290 }
291 }
292 return(\%anno);
293 }
294
295
296 sub printanno
297 {
298 # print the annotations
299 # and return the connectstr and event id to be used to print evidences
300 my %eventids=();
301 my ($annos,$chr,$strand)=@_;
302 my %nums_per_isoform;
303 foreach my $key (keys %$annos)
304 {
305 if($annos->{$key} eq "af")
306 {
307 my @a=split(/[\,\-\|]+/,$key);
308 my $chrid=substr($chr,3,length($chr)-3);
309 my $id_anno="AF";
310 #consider direction later
311 my $id=join("-",$id_anno,$id_anno, $a[2],$a[3]);
312 my $num=0;
313 $num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id};
314 $nums_per_isoform{$id}++;
315
316 print $chr,"\t",$a[0],"\t",$a[3],"\t";
317 print $id.".".$num,"[L]\t";
318 print "0\t";
319 print $strand,"\t";
320 print $a[0],"\t",$a[3],"\t";
321 print "255,0,0\t";
322 print "2\t";
323 print $a[1]-$a[0],",";
324 print $a[3]-$a[2],"\t";
325 print "0,";
326 print $a[2]-$a[0],"\n";
327
328 print $chr,"\t",$a[4],"\t",$a[7],"\t";
329 print $id.".".$num,"[R]\t";
330 print "0\t";
331 print $strand,"\t";
332 print $a[4],"\t",$a[7],"\t";
333 print "255,0,0\t";
334 print "2\t";
335 print $a[5]-$a[4],",";
336 print $a[7]-$a[6],"\t";
337 print "0,";
338 print $a[6]-$a[4],"\n";
339
340 #print "$key,af\n";
341 }
342 if($annos->{$key} eq "ca" or $annos->{$key} eq "cs")
343 {
344 my @a=split(/[,-]/,$key);
345
346 my $chrid=substr($chr,3,length($chr)-3);
347 my $id_anno="CS";
348 $id_anno="CA" if $annos->{$key} eq "ca";
349 my $id="CA-$id_anno-$chrid"."-".$a[2]."-".$a[3];
350 my $num=0;
351 $num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id};
352 $nums_per_isoform{$id}++;
353
354 print $chr,"\t",$a[0],"\t",$a[5],"\t";
355 print $id.".".$num,"[L]\t";
356 $eventids {$key} = $id.".".$num."[L]";
357 print "0\t";
358 print $strand,"\t";
359 print $a[0],"\t",$a[5],"\t";
360 print "255,0,0\t";
361 print "3\t";
362 print $a[1]-$a[0],",";
363 print $a[3]-$a[2],",";
364 print $a[5]-$a[4],"\t";
365 print "0,";
366 print $a[2]-$a[0],",";
367 print $a[4]-$a[0],"\n";
368
369 print $chr,"\t",$a[0],"\t",$a[5],"\t";
370 print $id.".".$num,"[S]\t";
371 my $connectstr_ca= $a[0]."-".$a[1].",".$a[4]."-".$a[5].",";
372 $eventids{$connectstr_ca} = $id.".".$num."[S]";
373 print "0\t";
374 print $strand,"\t";
375 print $a[0],"\t",$a[5],"\t";
376 print "255,0,0\t";
377 print "2\t";
378 print $a[1]-$a[0],",";
379 #print $a[3]-$a[2],",";
380 print $a[5]-$a[4],"\t";
381 print "0,";
382 #print $a[2]-$a[0],",";
383 print $a[4]-$a[0],"\n";
384
385 }
386 if($annos->{$key} eq "ir")
387 {
388 my @a=split(/[,-]/,$key);
389
390 my $chrid=substr($chr,3,length($chr)-3);
391 my $id="IR-IR-$chrid"."-".$a[1]."-".$a[2];
392 my $num=0;
393 $num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id};
394 $nums_per_isoform{$id}++;
395
396 print $chr,"\t",$a[0],"\t",$a[3],"\t";
397 print $id.".".$num,"[L]\t";
398 print "0\t";
399 print $strand,"\t";
400 print $a[0],"\t",$a[3],"\t";
401 $eventids{$a[0]."-".$a[3].","} = $id.".".$num."[L]";
402 print "255,0,0\t";
403 print "3\t";
404 print $a[1]-$a[0],",";
405 print $a[2]-$a[1],",";
406 print $a[3]-$a[2],"\t";
407 print "0,";
408 print $a[1]-$a[0],",";
409 print $a[2]-$a[0],"\n";
410
411 print $chr,"\t",$a[0],"\t",$a[3],"\t";
412 print $id.".".$num,"[S]\t";
413 $eventids{$key} = $id.".".$num."[S]";
414 print "0\t";
415 print $strand,"\t";
416 print $a[0],"\t",$a[3],"\t";
417 print "255,0,0\t";
418 print "2\t";
419 print $a[1]-$a[0],",";
420 #print $a[3]-$a[2],",";
421 print $a[3]-$a[2],"\t";
422 print "0,";
423 #print $a[2]-$a[0],",";
424 print $a[2]-$a[0],"\n";
425
426 }
427 if($annos->{$key} eq "ss")
428 {
429 my @a=split(/[,-]/,$key);
430 my $chrid=substr($chr,3,length($chr)-3);
431 my $type="AA";
432 if( ($strand eq "+" && $a[1]==$a[2]) or ($strand eq "-" && $a[3]==$a[4]))
433 {
434 $type="AD";
435 }
436 my $id="$type-$type-$chrid"."-".$a[2]."-".$a[3];
437 my $connect_str_L = "";
438 my $connect_str_S = "";
439 if($a[1]==$a[2])
440 {
441 $connect_str_L = $a[0]."-".$a[3].",".$a[4]."-".$a[5].",";
442 $connect_str_S = $a[0]."-".$a[1].",".$a[4]."-".$a[5].",";
443 }
444 else
445 {
446 $connect_str_L = $a[0]."-".$a[1].",".$a[2]."-".$a[5].",";
447 $connect_str_S = $a[0]."-".$a[1].",".$a[3]."-".$a[5].",";
448
449 }
450 my $num=0;
451 $num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id};
452 $nums_per_isoform{$id}++;
453 print $chr,"\t",$a[0],"\t",$a[5],"\t";
454 print $id.".".$num,"[L]\t";
455 $eventids {$connect_str_L} = $id.".".$num."[L]";
456 print "0\t";
457 print $strand,"\t";
458 print $a[0],"\t",$a[5],"\t";
459 print "255,0,0\t";
460 print "3\t";
461 print $a[1]-$a[0],",";
462 print $a[3]-$a[2],",";
463 print $a[5]-$a[4],"\t";
464 print "0,";
465 print $a[2]-$a[0],",";
466 print $a[4]-$a[0],"\n";
467
468 print $chr,"\t",$a[0],"\t",$a[5],"\t";
469 print $id.".".$num,"[S]\t";
470 $eventids {$connect_str_S} = $id.".".$num."[S]";
471 print "0\t";
472 print $strand,"\t";
473 print $a[0],"\t",$a[5],"\t";
474 print "255,0,0\t";
475 print "2\t";
476 print $a[1]-$a[0],",";
477 #print $a[3]-$a[2],",";
478 print $a[5]-$a[4],"\t";
479 print "0,";
480 #print $a[2]-$a[0],",";
481 print $a[4]-$a[0],"\n";
482
483
484 }
485
486 }
487 return(\%eventids);
488
489 }
490