Mercurial > repos > bioitcore > splicetrap
diff bin/scanbed2txdb.pl @ 1:adc0f7765d85 draft
planemo upload
author | bioitcore |
---|---|
date | Thu, 07 Sep 2017 15:06:58 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/scanbed2txdb.pl Thu Sep 07 15:06:58 2017 -0400 @@ -0,0 +1,490 @@ +#argv0: input transcript bed file +#argv1: output filename, will be in AS format + +use strict; +my $AnnoFileName = $ARGV[0]; +my $outputFileName = $ARGV[1]; + +if($AnnoFileName eq "" or $outputFileName eq "") +{ + print "TXDBGEN: Please specify your input files\n"; + exit; +} + +my $cachefolder = `basename $outputFileName`; +chomp($cachefolder); +my $cachefolder = $cachefolder.".cache"; +my $AnnoFileBase = `basename $AnnoFileName`; +chomp($AnnoFileBase); + +if(! -e $cachefolder) +{ + mkdir $cachefolder or die "CHECK: cannot mkdir $cachefolder\n"; + print "TXDBGEN: mkdir $cachefolder\n"; +} + +my $CacheAnnoFileName = $cachefolder."/".$AnnoFileBase.".sort"; +#sort the annotation file + +print "TXDBGEN: sort $AnnoFileName \n"; + +system("sort -k6,6 -k1,1 -k2,2n -k3,3n $AnnoFileName >$CacheAnnoFileName"); + +$AnnoFileName = $CacheAnnoFileName; + +#read the annotations into hashes + +open(AnnoFile, $CacheAnnoFileName) or die "can not open",$CacheAnnoFileName; +#split the genes into contigs + +my $contigid = 0; +my $end_tmp = 0; +my $chr_tmp = "chr"; +my $strand_tmp="NA"; + +my $TXnumtmp=0; + +my %eventlist =(); +#my %eventlist_af = (); +#my %eventlist_al = (); +my %evidences = (); + +open(my $output,">$outputFileName"); + +open(my $output2, ">$outputFileName.evi"); +while(my $line =<AnnoFile>) +{ + chomp($line); + my @a = split("\t",$line); + my $chr = $a[0]; + my $start = $a[1]; + my $end = $a[2]; + my $name = $a[3]; + my @sizes = split(",",$a[10]); + my $strand = $a[5]; + my @start_shifts = split(",",$a[11]); + #my $chrstr = $chr.$strand; + my @starts; + my @ends; + for(my $i=0;$i<@start_shifts;$i++) + { + $starts[$i]=$start_shifts[$i]+$start; + $ends[$i] = $starts[$i]+$sizes[$i]; + } + if($start >$end_tmp or $chr ne $chr_tmp or $strand_tmp ne $strand) + { + $contigid++; + #$ctgmultisonum++ if $TXnumtmp>1; + $TXnumtmp=0; + my $annos = scanevents(\%eventlist, "inner") ; + #my $annos_af = scanevents(\%eventlist_af, "af"); + #my $annos_al = scanevents(\%eventlist_al, "al"); + + my $stdout = select ($output); + my $eventids = printanno($annos,$chr_tmp,$strand_tmp) if(scalar(%$annos)>0); + #printanno($annos_af,$chr_tmp,$strand_tmp) if(scalar(%$annos_af)>0); + #printanno($annos_al,$chr_tmp,$strand_tmp) if(scalar(%$annos_al)>0); + select($stdout); + $stdout = select ($output2); + #print cross information + foreach my $connect_str (keys %$eventids) + { + print $eventids->{$connect_str},"\t"; + foreach my $transcriptid (keys %{$evidences{$connect_str}}) + { + print $transcriptid,","; + } + print "\n"; + } + select($stdout); + print "TXDBGEN: Contig ID $contigid at $chr $strand...\n" if ( $contigid%1000 == 0); + %eventlist = (); + #%eventlist_al = (); + #%eventlist_af = (); + #print CacheContigFile "#ctg$contigid\n"; + + } + $TXnumtmp++; + $end_tmp = $end if ($end > $end_tmp or $chr ne $chr_tmp or $strand_tmp ne $strand); + $chr_tmp = $chr; + + $strand_tmp = $strand; + + #scan connections, 2 and 3 exons for CA/CS/AF +# if(scalar(@starts)>2) +# { +# my $connectionstr_af =$starts[0]."-". +# $ends[0].",". +# $starts[1]."-". +# $ends[1]."," ; +# +# $eventlist_af{$connectionstr_af} = $starts[0]; +# my $connectionstr_al = $starts[scalar(@starts)-2]."-". +# $ends[scalar(@starts)-2].",". +# $starts[scalar(@starts)-1]."-". +# $ends[scalar(@starts)-1]."," ; +# $eventlist_al{$connectionstr_al} = $starts[scalar(@starts)-2]; +# } + #didn't consider direction yet + #add 1 exon for IR + for(my $n=1;$n<4;$n++) + { + for(my $i=0;$i<scalar(@starts)-$n+1;$i++) + { + my $connectionstr = ""; + for(my $j=$i;$j<$i+$n;$j++) + { + $connectionstr = $connectionstr . + $starts[$j]."-". + $ends[$j].","; + } + # print $connectionstr,"\n"; + $eventlist{$connectionstr} = $n; + $evidences{$connectionstr}{$name} = 1; + } + + } + +} + +my $annos = scanevents(\%eventlist, "inner"); +#my $annos_af = scanevents(\%eventlist_af, "af"); +#my $annos_al = scanevents(\%eventlist_al, "al"); + +my $stdout = select ($output); +my $eventids = printanno($annos,$chr_tmp,$strand_tmp) if(scalar(%$annos)>0); +select ($stdout); +$stdout = select ($output2); +#print cross information +foreach my $connect_str (keys %$eventids) +{ + print $eventids->{$connect_str},"\t"; + foreach my $transcriptid (keys %{$evidences{$connect_str}}) + { + print $transcriptid,","; + } + print "\n"; +} + +select($stdout); + +close($output); +close($output2); + +#close(CacheContigFile); +#define sub scanevents +sub scanevents +{ + my ($eventlist,$mode)=@_; + my %anno=(); + if($mode eq "inner") + { + foreach my $key (keys %$eventlist) + { + #print $key,"\t",$eventlist->{$key},"\n"; + if($eventlist->{$key} == 3) + { + my @a=split(",",$key); + my $connectstr_ca=$a[0].",".$a[2].","; + if (exists $eventlist->{$connectstr_ca}) + { + $anno{$key}="ca"; + } + else + { + $anno{$key}="cs"; + } + } + if($eventlist->{$key} == 2) + { + my @a=split(/[,-]/,$key); + + my $connectstr_ir=$a[0]."-".$a[3].","; + if (exists $eventlist->{$connectstr_ir}) + { + $anno{$key}="ir"; + } + foreach my $search (keys %$eventlist) + { + next if($eventlist->{$search}!=2); + my $connectstr_ss=""; + my @b=split(/[,-]/,$search); + if($a[0]==$b[0] && $a[3]==$b[3]) + { + if($a[1]==$b[1]) + { + next if ($b[2] == $a[2]); + $connectstr_ss .= $a[0]."-".$a[1].","; + if($b[2]>$a[2]) + { + $connectstr_ss .= $a[2]."-".$b[2].",".$b[2]; + } + #elsif($b[2]<$a[2]) + else + { + $connectstr_ss .= $b[2]."-".$a[2].",".$a[2]; + } + $connectstr_ss .= "-".$a[3].","; + $anno{$connectstr_ss}="ss"; + #print $connectstr_ss,"\tss\n"; + } + if($a[2]==$b[2]) + { + $connectstr_ss .= $a[0]."-"; + if($b[1]>$a[1]) + { + $connectstr_ss .= $a[1].",".$a[1]."-".$b[1]; + } + #elsif($b[2]<$a[2]) + else + { + $connectstr_ss .= $b[1].",".$b[1]."-".$a[1]; + } + $connectstr_ss .= ",".$a[2]."-".$a[3].","; + $anno{$connectstr_ss}="ss"; + #print $connectstr_ss,"\tss\n"; + } + + + } + + } + } + } + } + if($mode eq "af" or $mode eq "al")#alternative first/last exon + { + foreach my $key (keys %$eventlist) + { + my @a=split(/,/,$key); + foreach my $search (keys %$eventlist) + { + next if $search eq $key; + my $connectstr = ""; + if($eventlist->{$search} < $eventlist->{$key} ) + { + $connectstr = $search."|".$key; + } + else + { + $connectstr = $key."|".$search; + } + next if exists $anno{$connectstr}; + my @b=split(/,/,$search); + if($mode eq "af") + { + if($a[0] ne $b[0] and $a[1] eq $b[1]) + { + $anno{$connectstr} = $mode; + } + } + else + { + if($a[0] eq $b[0] and $a[1] ne $b[1]) + { + $anno{$connectstr} = $mode; + } + } + + } + } + } + return(\%anno); +} + + +sub printanno +{ +# print the annotations +# and return the connectstr and event id to be used to print evidences + my %eventids=(); + my ($annos,$chr,$strand)=@_; + my %nums_per_isoform; + foreach my $key (keys %$annos) + { + if($annos->{$key} eq "af") + { + my @a=split(/[\,\-\|]+/,$key); + my $chrid=substr($chr,3,length($chr)-3); + my $id_anno="AF"; + #consider direction later + my $id=join("-",$id_anno,$id_anno, $a[2],$a[3]); + my $num=0; + $num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id}; + $nums_per_isoform{$id}++; + + print $chr,"\t",$a[0],"\t",$a[3],"\t"; + print $id.".".$num,"[L]\t"; + print "0\t"; + print $strand,"\t"; + print $a[0],"\t",$a[3],"\t"; + print "255,0,0\t"; + print "2\t"; + print $a[1]-$a[0],","; + print $a[3]-$a[2],"\t"; + print "0,"; + print $a[2]-$a[0],"\n"; + + print $chr,"\t",$a[4],"\t",$a[7],"\t"; + print $id.".".$num,"[R]\t"; + print "0\t"; + print $strand,"\t"; + print $a[4],"\t",$a[7],"\t"; + print "255,0,0\t"; + print "2\t"; + print $a[5]-$a[4],","; + print $a[7]-$a[6],"\t"; + print "0,"; + print $a[6]-$a[4],"\n"; + + #print "$key,af\n"; + } + if($annos->{$key} eq "ca" or $annos->{$key} eq "cs") + { + my @a=split(/[,-]/,$key); + + my $chrid=substr($chr,3,length($chr)-3); + my $id_anno="CS"; + $id_anno="CA" if $annos->{$key} eq "ca"; + my $id="CA-$id_anno-$chrid"."-".$a[2]."-".$a[3]; + my $num=0; + $num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id}; + $nums_per_isoform{$id}++; + + print $chr,"\t",$a[0],"\t",$a[5],"\t"; + print $id.".".$num,"[L]\t"; + $eventids {$key} = $id.".".$num."[L]"; + print "0\t"; + print $strand,"\t"; + print $a[0],"\t",$a[5],"\t"; + print "255,0,0\t"; + print "3\t"; + print $a[1]-$a[0],","; + print $a[3]-$a[2],","; + print $a[5]-$a[4],"\t"; + print "0,"; + print $a[2]-$a[0],","; + print $a[4]-$a[0],"\n"; + + print $chr,"\t",$a[0],"\t",$a[5],"\t"; + print $id.".".$num,"[S]\t"; + my $connectstr_ca= $a[0]."-".$a[1].",".$a[4]."-".$a[5].","; + $eventids{$connectstr_ca} = $id.".".$num."[S]"; + print "0\t"; + print $strand,"\t"; + print $a[0],"\t",$a[5],"\t"; + print "255,0,0\t"; + print "2\t"; + print $a[1]-$a[0],","; + #print $a[3]-$a[2],","; + print $a[5]-$a[4],"\t"; + print "0,"; + #print $a[2]-$a[0],","; + print $a[4]-$a[0],"\n"; + + } + if($annos->{$key} eq "ir") + { + my @a=split(/[,-]/,$key); + + my $chrid=substr($chr,3,length($chr)-3); + my $id="IR-IR-$chrid"."-".$a[1]."-".$a[2]; + my $num=0; + $num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id}; + $nums_per_isoform{$id}++; + + print $chr,"\t",$a[0],"\t",$a[3],"\t"; + print $id.".".$num,"[L]\t"; + print "0\t"; + print $strand,"\t"; + print $a[0],"\t",$a[3],"\t"; + $eventids{$a[0]."-".$a[3].","} = $id.".".$num."[L]"; + print "255,0,0\t"; + print "3\t"; + print $a[1]-$a[0],","; + print $a[2]-$a[1],","; + print $a[3]-$a[2],"\t"; + print "0,"; + print $a[1]-$a[0],","; + print $a[2]-$a[0],"\n"; + + print $chr,"\t",$a[0],"\t",$a[3],"\t"; + print $id.".".$num,"[S]\t"; + $eventids{$key} = $id.".".$num."[S]"; + print "0\t"; + print $strand,"\t"; + print $a[0],"\t",$a[3],"\t"; + print "255,0,0\t"; + print "2\t"; + print $a[1]-$a[0],","; + #print $a[3]-$a[2],","; + print $a[3]-$a[2],"\t"; + print "0,"; + #print $a[2]-$a[0],","; + print $a[2]-$a[0],"\n"; + + } + if($annos->{$key} eq "ss") + { + my @a=split(/[,-]/,$key); + my $chrid=substr($chr,3,length($chr)-3); + my $type="AA"; + if( ($strand eq "+" && $a[1]==$a[2]) or ($strand eq "-" && $a[3]==$a[4])) + { + $type="AD"; + } + my $id="$type-$type-$chrid"."-".$a[2]."-".$a[3]; + my $connect_str_L = ""; + my $connect_str_S = ""; + if($a[1]==$a[2]) + { + $connect_str_L = $a[0]."-".$a[3].",".$a[4]."-".$a[5].","; + $connect_str_S = $a[0]."-".$a[1].",".$a[4]."-".$a[5].","; + } + else + { + $connect_str_L = $a[0]."-".$a[1].",".$a[2]."-".$a[5].","; + $connect_str_S = $a[0]."-".$a[1].",".$a[3]."-".$a[5].","; + + } + my $num=0; + $num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id}; + $nums_per_isoform{$id}++; + print $chr,"\t",$a[0],"\t",$a[5],"\t"; + print $id.".".$num,"[L]\t"; + $eventids {$connect_str_L} = $id.".".$num."[L]"; + print "0\t"; + print $strand,"\t"; + print $a[0],"\t",$a[5],"\t"; + print "255,0,0\t"; + print "3\t"; + print $a[1]-$a[0],","; + print $a[3]-$a[2],","; + print $a[5]-$a[4],"\t"; + print "0,"; + print $a[2]-$a[0],","; + print $a[4]-$a[0],"\n"; + + print $chr,"\t",$a[0],"\t",$a[5],"\t"; + print $id.".".$num,"[S]\t"; + $eventids {$connect_str_S} = $id.".".$num."[S]"; + print "0\t"; + print $strand,"\t"; + print $a[0],"\t",$a[5],"\t"; + print "255,0,0\t"; + print "2\t"; + print $a[1]-$a[0],","; + #print $a[3]-$a[2],","; + print $a[5]-$a[4],"\t"; + print "0,"; + #print $a[2]-$a[0],","; + print $a[4]-$a[0],"\n"; + + + } + + } + return(\%eventids); + +} +