Mercurial > repos > bioitcore > splicetrap
comparison bin/scanbed2txdb.pl @ 1:adc0f7765d85 draft
planemo upload
| author | bioitcore |
|---|---|
| date | Thu, 07 Sep 2017 15:06:58 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:d4ca551ca300 | 1:adc0f7765d85 |
|---|---|
| 1 #argv0: input transcript bed file | |
| 2 #argv1: output filename, will be in AS format | |
| 3 | |
| 4 use strict; | |
| 5 my $AnnoFileName = $ARGV[0]; | |
| 6 my $outputFileName = $ARGV[1]; | |
| 7 | |
| 8 if($AnnoFileName eq "" or $outputFileName eq "") | |
| 9 { | |
| 10 print "TXDBGEN: Please specify your input files\n"; | |
| 11 exit; | |
| 12 } | |
| 13 | |
| 14 my $cachefolder = `basename $outputFileName`; | |
| 15 chomp($cachefolder); | |
| 16 my $cachefolder = $cachefolder.".cache"; | |
| 17 my $AnnoFileBase = `basename $AnnoFileName`; | |
| 18 chomp($AnnoFileBase); | |
| 19 | |
| 20 if(! -e $cachefolder) | |
| 21 { | |
| 22 mkdir $cachefolder or die "CHECK: cannot mkdir $cachefolder\n"; | |
| 23 print "TXDBGEN: mkdir $cachefolder\n"; | |
| 24 } | |
| 25 | |
| 26 my $CacheAnnoFileName = $cachefolder."/".$AnnoFileBase.".sort"; | |
| 27 #sort the annotation file | |
| 28 | |
| 29 print "TXDBGEN: sort $AnnoFileName \n"; | |
| 30 | |
| 31 system("sort -k6,6 -k1,1 -k2,2n -k3,3n $AnnoFileName >$CacheAnnoFileName"); | |
| 32 | |
| 33 $AnnoFileName = $CacheAnnoFileName; | |
| 34 | |
| 35 #read the annotations into hashes | |
| 36 | |
| 37 open(AnnoFile, $CacheAnnoFileName) or die "can not open",$CacheAnnoFileName; | |
| 38 #split the genes into contigs | |
| 39 | |
| 40 my $contigid = 0; | |
| 41 my $end_tmp = 0; | |
| 42 my $chr_tmp = "chr"; | |
| 43 my $strand_tmp="NA"; | |
| 44 | |
| 45 my $TXnumtmp=0; | |
| 46 | |
| 47 my %eventlist =(); | |
| 48 #my %eventlist_af = (); | |
| 49 #my %eventlist_al = (); | |
| 50 my %evidences = (); | |
| 51 | |
| 52 open(my $output,">$outputFileName"); | |
| 53 | |
| 54 open(my $output2, ">$outputFileName.evi"); | |
| 55 while(my $line =<AnnoFile>) | |
| 56 { | |
| 57 chomp($line); | |
| 58 my @a = split("\t",$line); | |
| 59 my $chr = $a[0]; | |
| 60 my $start = $a[1]; | |
| 61 my $end = $a[2]; | |
| 62 my $name = $a[3]; | |
| 63 my @sizes = split(",",$a[10]); | |
| 64 my $strand = $a[5]; | |
| 65 my @start_shifts = split(",",$a[11]); | |
| 66 #my $chrstr = $chr.$strand; | |
| 67 my @starts; | |
| 68 my @ends; | |
| 69 for(my $i=0;$i<@start_shifts;$i++) | |
| 70 { | |
| 71 $starts[$i]=$start_shifts[$i]+$start; | |
| 72 $ends[$i] = $starts[$i]+$sizes[$i]; | |
| 73 } | |
| 74 if($start >$end_tmp or $chr ne $chr_tmp or $strand_tmp ne $strand) | |
| 75 { | |
| 76 $contigid++; | |
| 77 #$ctgmultisonum++ if $TXnumtmp>1; | |
| 78 $TXnumtmp=0; | |
| 79 my $annos = scanevents(\%eventlist, "inner") ; | |
| 80 #my $annos_af = scanevents(\%eventlist_af, "af"); | |
| 81 #my $annos_al = scanevents(\%eventlist_al, "al"); | |
| 82 | |
| 83 my $stdout = select ($output); | |
| 84 my $eventids = printanno($annos,$chr_tmp,$strand_tmp) if(scalar(%$annos)>0); | |
| 85 #printanno($annos_af,$chr_tmp,$strand_tmp) if(scalar(%$annos_af)>0); | |
| 86 #printanno($annos_al,$chr_tmp,$strand_tmp) if(scalar(%$annos_al)>0); | |
| 87 select($stdout); | |
| 88 $stdout = select ($output2); | |
| 89 #print cross information | |
| 90 foreach my $connect_str (keys %$eventids) | |
| 91 { | |
| 92 print $eventids->{$connect_str},"\t"; | |
| 93 foreach my $transcriptid (keys %{$evidences{$connect_str}}) | |
| 94 { | |
| 95 print $transcriptid,","; | |
| 96 } | |
| 97 print "\n"; | |
| 98 } | |
| 99 select($stdout); | |
| 100 print "TXDBGEN: Contig ID $contigid at $chr $strand...\n" if ( $contigid%1000 == 0); | |
| 101 %eventlist = (); | |
| 102 #%eventlist_al = (); | |
| 103 #%eventlist_af = (); | |
| 104 #print CacheContigFile "#ctg$contigid\n"; | |
| 105 | |
| 106 } | |
| 107 $TXnumtmp++; | |
| 108 $end_tmp = $end if ($end > $end_tmp or $chr ne $chr_tmp or $strand_tmp ne $strand); | |
| 109 $chr_tmp = $chr; | |
| 110 | |
| 111 $strand_tmp = $strand; | |
| 112 | |
| 113 #scan connections, 2 and 3 exons for CA/CS/AF | |
| 114 # if(scalar(@starts)>2) | |
| 115 # { | |
| 116 # my $connectionstr_af =$starts[0]."-". | |
| 117 # $ends[0].",". | |
| 118 # $starts[1]."-". | |
| 119 # $ends[1]."," ; | |
| 120 # | |
| 121 # $eventlist_af{$connectionstr_af} = $starts[0]; | |
| 122 # my $connectionstr_al = $starts[scalar(@starts)-2]."-". | |
| 123 # $ends[scalar(@starts)-2].",". | |
| 124 # $starts[scalar(@starts)-1]."-". | |
| 125 # $ends[scalar(@starts)-1]."," ; | |
| 126 # $eventlist_al{$connectionstr_al} = $starts[scalar(@starts)-2]; | |
| 127 # } | |
| 128 #didn't consider direction yet | |
| 129 #add 1 exon for IR | |
| 130 for(my $n=1;$n<4;$n++) | |
| 131 { | |
| 132 for(my $i=0;$i<scalar(@starts)-$n+1;$i++) | |
| 133 { | |
| 134 my $connectionstr = ""; | |
| 135 for(my $j=$i;$j<$i+$n;$j++) | |
| 136 { | |
| 137 $connectionstr = $connectionstr . | |
| 138 $starts[$j]."-". | |
| 139 $ends[$j].","; | |
| 140 } | |
| 141 # print $connectionstr,"\n"; | |
| 142 $eventlist{$connectionstr} = $n; | |
| 143 $evidences{$connectionstr}{$name} = 1; | |
| 144 } | |
| 145 | |
| 146 } | |
| 147 | |
| 148 } | |
| 149 | |
| 150 my $annos = scanevents(\%eventlist, "inner"); | |
| 151 #my $annos_af = scanevents(\%eventlist_af, "af"); | |
| 152 #my $annos_al = scanevents(\%eventlist_al, "al"); | |
| 153 | |
| 154 my $stdout = select ($output); | |
| 155 my $eventids = printanno($annos,$chr_tmp,$strand_tmp) if(scalar(%$annos)>0); | |
| 156 select ($stdout); | |
| 157 $stdout = select ($output2); | |
| 158 #print cross information | |
| 159 foreach my $connect_str (keys %$eventids) | |
| 160 { | |
| 161 print $eventids->{$connect_str},"\t"; | |
| 162 foreach my $transcriptid (keys %{$evidences{$connect_str}}) | |
| 163 { | |
| 164 print $transcriptid,","; | |
| 165 } | |
| 166 print "\n"; | |
| 167 } | |
| 168 | |
| 169 select($stdout); | |
| 170 | |
| 171 close($output); | |
| 172 close($output2); | |
| 173 | |
| 174 #close(CacheContigFile); | |
| 175 #define sub scanevents | |
| 176 sub scanevents | |
| 177 { | |
| 178 my ($eventlist,$mode)=@_; | |
| 179 my %anno=(); | |
| 180 if($mode eq "inner") | |
| 181 { | |
| 182 foreach my $key (keys %$eventlist) | |
| 183 { | |
| 184 #print $key,"\t",$eventlist->{$key},"\n"; | |
| 185 if($eventlist->{$key} == 3) | |
| 186 { | |
| 187 my @a=split(",",$key); | |
| 188 my $connectstr_ca=$a[0].",".$a[2].","; | |
| 189 if (exists $eventlist->{$connectstr_ca}) | |
| 190 { | |
| 191 $anno{$key}="ca"; | |
| 192 } | |
| 193 else | |
| 194 { | |
| 195 $anno{$key}="cs"; | |
| 196 } | |
| 197 } | |
| 198 if($eventlist->{$key} == 2) | |
| 199 { | |
| 200 my @a=split(/[,-]/,$key); | |
| 201 | |
| 202 my $connectstr_ir=$a[0]."-".$a[3].","; | |
| 203 if (exists $eventlist->{$connectstr_ir}) | |
| 204 { | |
| 205 $anno{$key}="ir"; | |
| 206 } | |
| 207 foreach my $search (keys %$eventlist) | |
| 208 { | |
| 209 next if($eventlist->{$search}!=2); | |
| 210 my $connectstr_ss=""; | |
| 211 my @b=split(/[,-]/,$search); | |
| 212 if($a[0]==$b[0] && $a[3]==$b[3]) | |
| 213 { | |
| 214 if($a[1]==$b[1]) | |
| 215 { | |
| 216 next if ($b[2] == $a[2]); | |
| 217 $connectstr_ss .= $a[0]."-".$a[1].","; | |
| 218 if($b[2]>$a[2]) | |
| 219 { | |
| 220 $connectstr_ss .= $a[2]."-".$b[2].",".$b[2]; | |
| 221 } | |
| 222 #elsif($b[2]<$a[2]) | |
| 223 else | |
| 224 { | |
| 225 $connectstr_ss .= $b[2]."-".$a[2].",".$a[2]; | |
| 226 } | |
| 227 $connectstr_ss .= "-".$a[3].","; | |
| 228 $anno{$connectstr_ss}="ss"; | |
| 229 #print $connectstr_ss,"\tss\n"; | |
| 230 } | |
| 231 if($a[2]==$b[2]) | |
| 232 { | |
| 233 $connectstr_ss .= $a[0]."-"; | |
| 234 if($b[1]>$a[1]) | |
| 235 { | |
| 236 $connectstr_ss .= $a[1].",".$a[1]."-".$b[1]; | |
| 237 } | |
| 238 #elsif($b[2]<$a[2]) | |
| 239 else | |
| 240 { | |
| 241 $connectstr_ss .= $b[1].",".$b[1]."-".$a[1]; | |
| 242 } | |
| 243 $connectstr_ss .= ",".$a[2]."-".$a[3].","; | |
| 244 $anno{$connectstr_ss}="ss"; | |
| 245 #print $connectstr_ss,"\tss\n"; | |
| 246 } | |
| 247 | |
| 248 | |
| 249 } | |
| 250 | |
| 251 } | |
| 252 } | |
| 253 } | |
| 254 } | |
| 255 if($mode eq "af" or $mode eq "al")#alternative first/last exon | |
| 256 { | |
| 257 foreach my $key (keys %$eventlist) | |
| 258 { | |
| 259 my @a=split(/,/,$key); | |
| 260 foreach my $search (keys %$eventlist) | |
| 261 { | |
| 262 next if $search eq $key; | |
| 263 my $connectstr = ""; | |
| 264 if($eventlist->{$search} < $eventlist->{$key} ) | |
| 265 { | |
| 266 $connectstr = $search."|".$key; | |
| 267 } | |
| 268 else | |
| 269 { | |
| 270 $connectstr = $key."|".$search; | |
| 271 } | |
| 272 next if exists $anno{$connectstr}; | |
| 273 my @b=split(/,/,$search); | |
| 274 if($mode eq "af") | |
| 275 { | |
| 276 if($a[0] ne $b[0] and $a[1] eq $b[1]) | |
| 277 { | |
| 278 $anno{$connectstr} = $mode; | |
| 279 } | |
| 280 } | |
| 281 else | |
| 282 { | |
| 283 if($a[0] eq $b[0] and $a[1] ne $b[1]) | |
| 284 { | |
| 285 $anno{$connectstr} = $mode; | |
| 286 } | |
| 287 } | |
| 288 | |
| 289 } | |
| 290 } | |
| 291 } | |
| 292 return(\%anno); | |
| 293 } | |
| 294 | |
| 295 | |
| 296 sub printanno | |
| 297 { | |
| 298 # print the annotations | |
| 299 # and return the connectstr and event id to be used to print evidences | |
| 300 my %eventids=(); | |
| 301 my ($annos,$chr,$strand)=@_; | |
| 302 my %nums_per_isoform; | |
| 303 foreach my $key (keys %$annos) | |
| 304 { | |
| 305 if($annos->{$key} eq "af") | |
| 306 { | |
| 307 my @a=split(/[\,\-\|]+/,$key); | |
| 308 my $chrid=substr($chr,3,length($chr)-3); | |
| 309 my $id_anno="AF"; | |
| 310 #consider direction later | |
| 311 my $id=join("-",$id_anno,$id_anno, $a[2],$a[3]); | |
| 312 my $num=0; | |
| 313 $num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id}; | |
| 314 $nums_per_isoform{$id}++; | |
| 315 | |
| 316 print $chr,"\t",$a[0],"\t",$a[3],"\t"; | |
| 317 print $id.".".$num,"[L]\t"; | |
| 318 print "0\t"; | |
| 319 print $strand,"\t"; | |
| 320 print $a[0],"\t",$a[3],"\t"; | |
| 321 print "255,0,0\t"; | |
| 322 print "2\t"; | |
| 323 print $a[1]-$a[0],","; | |
| 324 print $a[3]-$a[2],"\t"; | |
| 325 print "0,"; | |
| 326 print $a[2]-$a[0],"\n"; | |
| 327 | |
| 328 print $chr,"\t",$a[4],"\t",$a[7],"\t"; | |
| 329 print $id.".".$num,"[R]\t"; | |
| 330 print "0\t"; | |
| 331 print $strand,"\t"; | |
| 332 print $a[4],"\t",$a[7],"\t"; | |
| 333 print "255,0,0\t"; | |
| 334 print "2\t"; | |
| 335 print $a[5]-$a[4],","; | |
| 336 print $a[7]-$a[6],"\t"; | |
| 337 print "0,"; | |
| 338 print $a[6]-$a[4],"\n"; | |
| 339 | |
| 340 #print "$key,af\n"; | |
| 341 } | |
| 342 if($annos->{$key} eq "ca" or $annos->{$key} eq "cs") | |
| 343 { | |
| 344 my @a=split(/[,-]/,$key); | |
| 345 | |
| 346 my $chrid=substr($chr,3,length($chr)-3); | |
| 347 my $id_anno="CS"; | |
| 348 $id_anno="CA" if $annos->{$key} eq "ca"; | |
| 349 my $id="CA-$id_anno-$chrid"."-".$a[2]."-".$a[3]; | |
| 350 my $num=0; | |
| 351 $num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id}; | |
| 352 $nums_per_isoform{$id}++; | |
| 353 | |
| 354 print $chr,"\t",$a[0],"\t",$a[5],"\t"; | |
| 355 print $id.".".$num,"[L]\t"; | |
| 356 $eventids {$key} = $id.".".$num."[L]"; | |
| 357 print "0\t"; | |
| 358 print $strand,"\t"; | |
| 359 print $a[0],"\t",$a[5],"\t"; | |
| 360 print "255,0,0\t"; | |
| 361 print "3\t"; | |
| 362 print $a[1]-$a[0],","; | |
| 363 print $a[3]-$a[2],","; | |
| 364 print $a[5]-$a[4],"\t"; | |
| 365 print "0,"; | |
| 366 print $a[2]-$a[0],","; | |
| 367 print $a[4]-$a[0],"\n"; | |
| 368 | |
| 369 print $chr,"\t",$a[0],"\t",$a[5],"\t"; | |
| 370 print $id.".".$num,"[S]\t"; | |
| 371 my $connectstr_ca= $a[0]."-".$a[1].",".$a[4]."-".$a[5].","; | |
| 372 $eventids{$connectstr_ca} = $id.".".$num."[S]"; | |
| 373 print "0\t"; | |
| 374 print $strand,"\t"; | |
| 375 print $a[0],"\t",$a[5],"\t"; | |
| 376 print "255,0,0\t"; | |
| 377 print "2\t"; | |
| 378 print $a[1]-$a[0],","; | |
| 379 #print $a[3]-$a[2],","; | |
| 380 print $a[5]-$a[4],"\t"; | |
| 381 print "0,"; | |
| 382 #print $a[2]-$a[0],","; | |
| 383 print $a[4]-$a[0],"\n"; | |
| 384 | |
| 385 } | |
| 386 if($annos->{$key} eq "ir") | |
| 387 { | |
| 388 my @a=split(/[,-]/,$key); | |
| 389 | |
| 390 my $chrid=substr($chr,3,length($chr)-3); | |
| 391 my $id="IR-IR-$chrid"."-".$a[1]."-".$a[2]; | |
| 392 my $num=0; | |
| 393 $num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id}; | |
| 394 $nums_per_isoform{$id}++; | |
| 395 | |
| 396 print $chr,"\t",$a[0],"\t",$a[3],"\t"; | |
| 397 print $id.".".$num,"[L]\t"; | |
| 398 print "0\t"; | |
| 399 print $strand,"\t"; | |
| 400 print $a[0],"\t",$a[3],"\t"; | |
| 401 $eventids{$a[0]."-".$a[3].","} = $id.".".$num."[L]"; | |
| 402 print "255,0,0\t"; | |
| 403 print "3\t"; | |
| 404 print $a[1]-$a[0],","; | |
| 405 print $a[2]-$a[1],","; | |
| 406 print $a[3]-$a[2],"\t"; | |
| 407 print "0,"; | |
| 408 print $a[1]-$a[0],","; | |
| 409 print $a[2]-$a[0],"\n"; | |
| 410 | |
| 411 print $chr,"\t",$a[0],"\t",$a[3],"\t"; | |
| 412 print $id.".".$num,"[S]\t"; | |
| 413 $eventids{$key} = $id.".".$num."[S]"; | |
| 414 print "0\t"; | |
| 415 print $strand,"\t"; | |
| 416 print $a[0],"\t",$a[3],"\t"; | |
| 417 print "255,0,0\t"; | |
| 418 print "2\t"; | |
| 419 print $a[1]-$a[0],","; | |
| 420 #print $a[3]-$a[2],","; | |
| 421 print $a[3]-$a[2],"\t"; | |
| 422 print "0,"; | |
| 423 #print $a[2]-$a[0],","; | |
| 424 print $a[2]-$a[0],"\n"; | |
| 425 | |
| 426 } | |
| 427 if($annos->{$key} eq "ss") | |
| 428 { | |
| 429 my @a=split(/[,-]/,$key); | |
| 430 my $chrid=substr($chr,3,length($chr)-3); | |
| 431 my $type="AA"; | |
| 432 if( ($strand eq "+" && $a[1]==$a[2]) or ($strand eq "-" && $a[3]==$a[4])) | |
| 433 { | |
| 434 $type="AD"; | |
| 435 } | |
| 436 my $id="$type-$type-$chrid"."-".$a[2]."-".$a[3]; | |
| 437 my $connect_str_L = ""; | |
| 438 my $connect_str_S = ""; | |
| 439 if($a[1]==$a[2]) | |
| 440 { | |
| 441 $connect_str_L = $a[0]."-".$a[3].",".$a[4]."-".$a[5].","; | |
| 442 $connect_str_S = $a[0]."-".$a[1].",".$a[4]."-".$a[5].","; | |
| 443 } | |
| 444 else | |
| 445 { | |
| 446 $connect_str_L = $a[0]."-".$a[1].",".$a[2]."-".$a[5].","; | |
| 447 $connect_str_S = $a[0]."-".$a[1].",".$a[3]."-".$a[5].","; | |
| 448 | |
| 449 } | |
| 450 my $num=0; | |
| 451 $num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id}; | |
| 452 $nums_per_isoform{$id}++; | |
| 453 print $chr,"\t",$a[0],"\t",$a[5],"\t"; | |
| 454 print $id.".".$num,"[L]\t"; | |
| 455 $eventids {$connect_str_L} = $id.".".$num."[L]"; | |
| 456 print "0\t"; | |
| 457 print $strand,"\t"; | |
| 458 print $a[0],"\t",$a[5],"\t"; | |
| 459 print "255,0,0\t"; | |
| 460 print "3\t"; | |
| 461 print $a[1]-$a[0],","; | |
| 462 print $a[3]-$a[2],","; | |
| 463 print $a[5]-$a[4],"\t"; | |
| 464 print "0,"; | |
| 465 print $a[2]-$a[0],","; | |
| 466 print $a[4]-$a[0],"\n"; | |
| 467 | |
| 468 print $chr,"\t",$a[0],"\t",$a[5],"\t"; | |
| 469 print $id.".".$num,"[S]\t"; | |
| 470 $eventids {$connect_str_S} = $id.".".$num."[S]"; | |
| 471 print "0\t"; | |
| 472 print $strand,"\t"; | |
| 473 print $a[0],"\t",$a[5],"\t"; | |
| 474 print "255,0,0\t"; | |
| 475 print "2\t"; | |
| 476 print $a[1]-$a[0],","; | |
| 477 #print $a[3]-$a[2],","; | |
| 478 print $a[5]-$a[4],"\t"; | |
| 479 print "0,"; | |
| 480 #print $a[2]-$a[0],","; | |
| 481 print $a[4]-$a[0],"\n"; | |
| 482 | |
| 483 | |
| 484 } | |
| 485 | |
| 486 } | |
| 487 return(\%eventids); | |
| 488 | |
| 489 } | |
| 490 |
