Mercurial > repos > bioitcore > splicetrap
view bin/scanbed2txdb.pl @ 5:2ebca9da5e42 draft default tip
planemo upload
author | bioitcore |
---|---|
date | Thu, 07 Sep 2017 17:39:24 -0400 |
parents | adc0f7765d85 |
children |
line wrap: on
line source
#argv0: input transcript bed file #argv1: output filename, will be in AS format use strict; my $AnnoFileName = $ARGV[0]; my $outputFileName = $ARGV[1]; if($AnnoFileName eq "" or $outputFileName eq "") { print "TXDBGEN: Please specify your input files\n"; exit; } my $cachefolder = `basename $outputFileName`; chomp($cachefolder); my $cachefolder = $cachefolder.".cache"; my $AnnoFileBase = `basename $AnnoFileName`; chomp($AnnoFileBase); if(! -e $cachefolder) { mkdir $cachefolder or die "CHECK: cannot mkdir $cachefolder\n"; print "TXDBGEN: mkdir $cachefolder\n"; } my $CacheAnnoFileName = $cachefolder."/".$AnnoFileBase.".sort"; #sort the annotation file print "TXDBGEN: sort $AnnoFileName \n"; system("sort -k6,6 -k1,1 -k2,2n -k3,3n $AnnoFileName >$CacheAnnoFileName"); $AnnoFileName = $CacheAnnoFileName; #read the annotations into hashes open(AnnoFile, $CacheAnnoFileName) or die "can not open",$CacheAnnoFileName; #split the genes into contigs my $contigid = 0; my $end_tmp = 0; my $chr_tmp = "chr"; my $strand_tmp="NA"; my $TXnumtmp=0; my %eventlist =(); #my %eventlist_af = (); #my %eventlist_al = (); my %evidences = (); open(my $output,">$outputFileName"); open(my $output2, ">$outputFileName.evi"); while(my $line =<AnnoFile>) { chomp($line); my @a = split("\t",$line); my $chr = $a[0]; my $start = $a[1]; my $end = $a[2]; my $name = $a[3]; my @sizes = split(",",$a[10]); my $strand = $a[5]; my @start_shifts = split(",",$a[11]); #my $chrstr = $chr.$strand; my @starts; my @ends; for(my $i=0;$i<@start_shifts;$i++) { $starts[$i]=$start_shifts[$i]+$start; $ends[$i] = $starts[$i]+$sizes[$i]; } if($start >$end_tmp or $chr ne $chr_tmp or $strand_tmp ne $strand) { $contigid++; #$ctgmultisonum++ if $TXnumtmp>1; $TXnumtmp=0; my $annos = scanevents(\%eventlist, "inner") ; #my $annos_af = scanevents(\%eventlist_af, "af"); #my $annos_al = scanevents(\%eventlist_al, "al"); my $stdout = select ($output); my $eventids = printanno($annos,$chr_tmp,$strand_tmp) if(scalar(%$annos)>0); #printanno($annos_af,$chr_tmp,$strand_tmp) if(scalar(%$annos_af)>0); #printanno($annos_al,$chr_tmp,$strand_tmp) if(scalar(%$annos_al)>0); select($stdout); $stdout = select ($output2); #print cross information foreach my $connect_str (keys %$eventids) { print $eventids->{$connect_str},"\t"; foreach my $transcriptid (keys %{$evidences{$connect_str}}) { print $transcriptid,","; } print "\n"; } select($stdout); print "TXDBGEN: Contig ID $contigid at $chr $strand...\n" if ( $contigid%1000 == 0); %eventlist = (); #%eventlist_al = (); #%eventlist_af = (); #print CacheContigFile "#ctg$contigid\n"; } $TXnumtmp++; $end_tmp = $end if ($end > $end_tmp or $chr ne $chr_tmp or $strand_tmp ne $strand); $chr_tmp = $chr; $strand_tmp = $strand; #scan connections, 2 and 3 exons for CA/CS/AF # if(scalar(@starts)>2) # { # my $connectionstr_af =$starts[0]."-". # $ends[0].",". # $starts[1]."-". # $ends[1]."," ; # # $eventlist_af{$connectionstr_af} = $starts[0]; # my $connectionstr_al = $starts[scalar(@starts)-2]."-". # $ends[scalar(@starts)-2].",". # $starts[scalar(@starts)-1]."-". # $ends[scalar(@starts)-1]."," ; # $eventlist_al{$connectionstr_al} = $starts[scalar(@starts)-2]; # } #didn't consider direction yet #add 1 exon for IR for(my $n=1;$n<4;$n++) { for(my $i=0;$i<scalar(@starts)-$n+1;$i++) { my $connectionstr = ""; for(my $j=$i;$j<$i+$n;$j++) { $connectionstr = $connectionstr . $starts[$j]."-". $ends[$j].","; } # print $connectionstr,"\n"; $eventlist{$connectionstr} = $n; $evidences{$connectionstr}{$name} = 1; } } } my $annos = scanevents(\%eventlist, "inner"); #my $annos_af = scanevents(\%eventlist_af, "af"); #my $annos_al = scanevents(\%eventlist_al, "al"); my $stdout = select ($output); my $eventids = printanno($annos,$chr_tmp,$strand_tmp) if(scalar(%$annos)>0); select ($stdout); $stdout = select ($output2); #print cross information foreach my $connect_str (keys %$eventids) { print $eventids->{$connect_str},"\t"; foreach my $transcriptid (keys %{$evidences{$connect_str}}) { print $transcriptid,","; } print "\n"; } select($stdout); close($output); close($output2); #close(CacheContigFile); #define sub scanevents sub scanevents { my ($eventlist,$mode)=@_; my %anno=(); if($mode eq "inner") { foreach my $key (keys %$eventlist) { #print $key,"\t",$eventlist->{$key},"\n"; if($eventlist->{$key} == 3) { my @a=split(",",$key); my $connectstr_ca=$a[0].",".$a[2].","; if (exists $eventlist->{$connectstr_ca}) { $anno{$key}="ca"; } else { $anno{$key}="cs"; } } if($eventlist->{$key} == 2) { my @a=split(/[,-]/,$key); my $connectstr_ir=$a[0]."-".$a[3].","; if (exists $eventlist->{$connectstr_ir}) { $anno{$key}="ir"; } foreach my $search (keys %$eventlist) { next if($eventlist->{$search}!=2); my $connectstr_ss=""; my @b=split(/[,-]/,$search); if($a[0]==$b[0] && $a[3]==$b[3]) { if($a[1]==$b[1]) { next if ($b[2] == $a[2]); $connectstr_ss .= $a[0]."-".$a[1].","; if($b[2]>$a[2]) { $connectstr_ss .= $a[2]."-".$b[2].",".$b[2]; } #elsif($b[2]<$a[2]) else { $connectstr_ss .= $b[2]."-".$a[2].",".$a[2]; } $connectstr_ss .= "-".$a[3].","; $anno{$connectstr_ss}="ss"; #print $connectstr_ss,"\tss\n"; } if($a[2]==$b[2]) { $connectstr_ss .= $a[0]."-"; if($b[1]>$a[1]) { $connectstr_ss .= $a[1].",".$a[1]."-".$b[1]; } #elsif($b[2]<$a[2]) else { $connectstr_ss .= $b[1].",".$b[1]."-".$a[1]; } $connectstr_ss .= ",".$a[2]."-".$a[3].","; $anno{$connectstr_ss}="ss"; #print $connectstr_ss,"\tss\n"; } } } } } } if($mode eq "af" or $mode eq "al")#alternative first/last exon { foreach my $key (keys %$eventlist) { my @a=split(/,/,$key); foreach my $search (keys %$eventlist) { next if $search eq $key; my $connectstr = ""; if($eventlist->{$search} < $eventlist->{$key} ) { $connectstr = $search."|".$key; } else { $connectstr = $key."|".$search; } next if exists $anno{$connectstr}; my @b=split(/,/,$search); if($mode eq "af") { if($a[0] ne $b[0] and $a[1] eq $b[1]) { $anno{$connectstr} = $mode; } } else { if($a[0] eq $b[0] and $a[1] ne $b[1]) { $anno{$connectstr} = $mode; } } } } } return(\%anno); } sub printanno { # print the annotations # and return the connectstr and event id to be used to print evidences my %eventids=(); my ($annos,$chr,$strand)=@_; my %nums_per_isoform; foreach my $key (keys %$annos) { if($annos->{$key} eq "af") { my @a=split(/[\,\-\|]+/,$key); my $chrid=substr($chr,3,length($chr)-3); my $id_anno="AF"; #consider direction later my $id=join("-",$id_anno,$id_anno, $a[2],$a[3]); my $num=0; $num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id}; $nums_per_isoform{$id}++; print $chr,"\t",$a[0],"\t",$a[3],"\t"; print $id.".".$num,"[L]\t"; print "0\t"; print $strand,"\t"; print $a[0],"\t",$a[3],"\t"; print "255,0,0\t"; print "2\t"; print $a[1]-$a[0],","; print $a[3]-$a[2],"\t"; print "0,"; print $a[2]-$a[0],"\n"; print $chr,"\t",$a[4],"\t",$a[7],"\t"; print $id.".".$num,"[R]\t"; print "0\t"; print $strand,"\t"; print $a[4],"\t",$a[7],"\t"; print "255,0,0\t"; print "2\t"; print $a[5]-$a[4],","; print $a[7]-$a[6],"\t"; print "0,"; print $a[6]-$a[4],"\n"; #print "$key,af\n"; } if($annos->{$key} eq "ca" or $annos->{$key} eq "cs") { my @a=split(/[,-]/,$key); my $chrid=substr($chr,3,length($chr)-3); my $id_anno="CS"; $id_anno="CA" if $annos->{$key} eq "ca"; my $id="CA-$id_anno-$chrid"."-".$a[2]."-".$a[3]; my $num=0; $num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id}; $nums_per_isoform{$id}++; print $chr,"\t",$a[0],"\t",$a[5],"\t"; print $id.".".$num,"[L]\t"; $eventids {$key} = $id.".".$num."[L]"; print "0\t"; print $strand,"\t"; print $a[0],"\t",$a[5],"\t"; print "255,0,0\t"; print "3\t"; print $a[1]-$a[0],","; print $a[3]-$a[2],","; print $a[5]-$a[4],"\t"; print "0,"; print $a[2]-$a[0],","; print $a[4]-$a[0],"\n"; print $chr,"\t",$a[0],"\t",$a[5],"\t"; print $id.".".$num,"[S]\t"; my $connectstr_ca= $a[0]."-".$a[1].",".$a[4]."-".$a[5].","; $eventids{$connectstr_ca} = $id.".".$num."[S]"; print "0\t"; print $strand,"\t"; print $a[0],"\t",$a[5],"\t"; print "255,0,0\t"; print "2\t"; print $a[1]-$a[0],","; #print $a[3]-$a[2],","; print $a[5]-$a[4],"\t"; print "0,"; #print $a[2]-$a[0],","; print $a[4]-$a[0],"\n"; } if($annos->{$key} eq "ir") { my @a=split(/[,-]/,$key); my $chrid=substr($chr,3,length($chr)-3); my $id="IR-IR-$chrid"."-".$a[1]."-".$a[2]; my $num=0; $num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id}; $nums_per_isoform{$id}++; print $chr,"\t",$a[0],"\t",$a[3],"\t"; print $id.".".$num,"[L]\t"; print "0\t"; print $strand,"\t"; print $a[0],"\t",$a[3],"\t"; $eventids{$a[0]."-".$a[3].","} = $id.".".$num."[L]"; print "255,0,0\t"; print "3\t"; print $a[1]-$a[0],","; print $a[2]-$a[1],","; print $a[3]-$a[2],"\t"; print "0,"; print $a[1]-$a[0],","; print $a[2]-$a[0],"\n"; print $chr,"\t",$a[0],"\t",$a[3],"\t"; print $id.".".$num,"[S]\t"; $eventids{$key} = $id.".".$num."[S]"; print "0\t"; print $strand,"\t"; print $a[0],"\t",$a[3],"\t"; print "255,0,0\t"; print "2\t"; print $a[1]-$a[0],","; #print $a[3]-$a[2],","; print $a[3]-$a[2],"\t"; print "0,"; #print $a[2]-$a[0],","; print $a[2]-$a[0],"\n"; } if($annos->{$key} eq "ss") { my @a=split(/[,-]/,$key); my $chrid=substr($chr,3,length($chr)-3); my $type="AA"; if( ($strand eq "+" && $a[1]==$a[2]) or ($strand eq "-" && $a[3]==$a[4])) { $type="AD"; } my $id="$type-$type-$chrid"."-".$a[2]."-".$a[3]; my $connect_str_L = ""; my $connect_str_S = ""; if($a[1]==$a[2]) { $connect_str_L = $a[0]."-".$a[3].",".$a[4]."-".$a[5].","; $connect_str_S = $a[0]."-".$a[1].",".$a[4]."-".$a[5].","; } else { $connect_str_L = $a[0]."-".$a[1].",".$a[2]."-".$a[5].","; $connect_str_S = $a[0]."-".$a[1].",".$a[3]."-".$a[5].","; } my $num=0; $num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id}; $nums_per_isoform{$id}++; print $chr,"\t",$a[0],"\t",$a[5],"\t"; print $id.".".$num,"[L]\t"; $eventids {$connect_str_L} = $id.".".$num."[L]"; print "0\t"; print $strand,"\t"; print $a[0],"\t",$a[5],"\t"; print "255,0,0\t"; print "3\t"; print $a[1]-$a[0],","; print $a[3]-$a[2],","; print $a[5]-$a[4],"\t"; print "0,"; print $a[2]-$a[0],","; print $a[4]-$a[0],"\n"; print $chr,"\t",$a[0],"\t",$a[5],"\t"; print $id.".".$num,"[S]\t"; $eventids {$connect_str_S} = $id.".".$num."[S]"; print "0\t"; print $strand,"\t"; print $a[0],"\t",$a[5],"\t"; print "255,0,0\t"; print "2\t"; print $a[1]-$a[0],","; #print $a[3]-$a[2],","; print $a[5]-$a[4],"\t"; print "0,"; #print $a[2]-$a[0],","; print $a[4]-$a[0],"\n"; } } return(\%eventids); }