diff bin/scanbed2txdb.pl @ 1:adc0f7765d85 draft

planemo upload
author bioitcore
date Thu, 07 Sep 2017 15:06:58 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/scanbed2txdb.pl	Thu Sep 07 15:06:58 2017 -0400
@@ -0,0 +1,490 @@
+#argv0: input transcript bed file
+#argv1: output filename, will be in AS format
+
+use strict;
+my $AnnoFileName = $ARGV[0];
+my $outputFileName = $ARGV[1];
+
+if($AnnoFileName eq "" or $outputFileName eq "")
+{
+	print "TXDBGEN: Please specify your input files\n";
+	exit;
+}
+
+my $cachefolder = `basename $outputFileName`;
+chomp($cachefolder);
+my $cachefolder = $cachefolder.".cache";
+my $AnnoFileBase = `basename $AnnoFileName`; 
+chomp($AnnoFileBase);
+
+if(! -e $cachefolder)
+{
+	mkdir $cachefolder or die "CHECK: cannot mkdir $cachefolder\n";
+	print "TXDBGEN: mkdir $cachefolder\n";
+}
+
+my $CacheAnnoFileName = $cachefolder."/".$AnnoFileBase.".sort";
+#sort the annotation file
+
+print "TXDBGEN: sort $AnnoFileName \n";
+
+system("sort -k6,6 -k1,1 -k2,2n -k3,3n $AnnoFileName >$CacheAnnoFileName");
+
+$AnnoFileName = $CacheAnnoFileName;
+
+#read the annotations into hashes
+
+open(AnnoFile, $CacheAnnoFileName) or die "can not open",$CacheAnnoFileName;
+#split the genes into contigs
+
+my $contigid = 0;
+my $end_tmp = 0;
+my $chr_tmp = "chr";
+my $strand_tmp="NA";
+
+my $TXnumtmp=0;
+
+my %eventlist =();
+#my %eventlist_af = ();
+#my %eventlist_al = ();
+my %evidences = ();
+
+open(my $output,">$outputFileName");
+
+open(my $output2, ">$outputFileName.evi");
+while(my $line =<AnnoFile>)
+{
+	chomp($line);
+	my @a = split("\t",$line);
+	my $chr = $a[0];
+	my $start = $a[1];
+	my $end = $a[2];
+	my $name = $a[3];
+	my @sizes = split(",",$a[10]);
+	my $strand = $a[5];
+	my @start_shifts = split(",",$a[11]);
+	#my $chrstr = $chr.$strand;
+	my @starts;
+	my @ends;
+	for(my $i=0;$i<@start_shifts;$i++)
+        {
+                $starts[$i]=$start_shifts[$i]+$start;
+                $ends[$i] = $starts[$i]+$sizes[$i];
+        }
+	if($start >$end_tmp or $chr ne $chr_tmp or $strand_tmp ne $strand)
+        {
+                $contigid++;
+                #$ctgmultisonum++ if $TXnumtmp>1;
+                $TXnumtmp=0;
+		my $annos = scanevents(\%eventlist, "inner") ;
+		#my $annos_af = scanevents(\%eventlist_af, "af");
+		#my $annos_al = scanevents(\%eventlist_al, "al");
+		
+		my $stdout = select ($output);
+		my $eventids = printanno($annos,$chr_tmp,$strand_tmp) if(scalar(%$annos)>0);
+		#printanno($annos_af,$chr_tmp,$strand_tmp) if(scalar(%$annos_af)>0);
+		#printanno($annos_al,$chr_tmp,$strand_tmp) if(scalar(%$annos_al)>0);
+		select($stdout);
+		$stdout = select ($output2);
+		#print cross information
+		foreach my $connect_str  (keys %$eventids) 
+		{
+			print $eventids->{$connect_str},"\t";
+			foreach my $transcriptid  (keys %{$evidences{$connect_str}}) 
+			{
+				print $transcriptid,",";
+			}
+			print "\n";
+		}
+		select($stdout);
+		print "TXDBGEN: Contig ID $contigid at $chr $strand...\n" if ( $contigid%1000 == 0);
+		%eventlist = ();
+		#%eventlist_al = ();
+		#%eventlist_af = ();
+                #print CacheContigFile "#ctg$contigid\n";
+
+        }
+	$TXnumtmp++;
+        $end_tmp = $end if ($end > $end_tmp or $chr ne $chr_tmp or $strand_tmp ne $strand);
+        $chr_tmp = $chr;
+	
+	$strand_tmp = $strand;
+	
+	#scan connections, 2 and 3 exons for CA/CS/AF
+#	if(scalar(@starts)>2)
+#	{
+#		my $connectionstr_af =$starts[0]."-".
+#				$ends[0].",".
+#				$starts[1]."-".
+#				$ends[1]."," ;
+#	
+#		$eventlist_af{$connectionstr_af} = $starts[0];
+#		my $connectionstr_al = $starts[scalar(@starts)-2]."-".
+#				$ends[scalar(@starts)-2].",".
+#				$starts[scalar(@starts)-1]."-".
+#				$ends[scalar(@starts)-1]."," ;
+#		$eventlist_al{$connectionstr_al} = $starts[scalar(@starts)-2];
+#	}
+		#didn't consider direction yet
+        #add 1 exon for IR
+	for(my $n=1;$n<4;$n++)
+        {
+		for(my $i=0;$i<scalar(@starts)-$n+1;$i++)
+                {
+			my $connectionstr = "";
+			for(my $j=$i;$j<$i+$n;$j++)
+                        {
+                                $connectionstr = $connectionstr .
+                                        $starts[$j]."-".
+                                        $ends[$j].",";
+                        }
+		#	print $connectionstr,"\n";
+			$eventlist{$connectionstr} = $n;
+			$evidences{$connectionstr}{$name} = 1;
+		}
+
+	}
+									
+}
+
+my $annos = scanevents(\%eventlist, "inner");
+#my $annos_af = scanevents(\%eventlist_af, "af");
+#my $annos_al = scanevents(\%eventlist_al, "al");
+		
+my $stdout = select ($output);
+my $eventids = printanno($annos,$chr_tmp,$strand_tmp) if(scalar(%$annos)>0);
+select ($stdout);
+$stdout = select ($output2);
+#print cross information
+foreach my $connect_str (keys %$eventids)
+{
+	print $eventids->{$connect_str},"\t";
+	foreach my $transcriptid (keys %{$evidences{$connect_str}}) 
+	{
+		print $transcriptid,",";
+	}
+	print "\n";
+}
+
+select($stdout);	
+
+close($output);
+close($output2);
+
+#close(CacheContigFile);
+#define sub scanevents
+sub scanevents
+{
+	my ($eventlist,$mode)=@_;
+	my %anno=();
+	if($mode eq "inner")
+	{
+		foreach my $key (keys %$eventlist)
+		{
+			#print $key,"\t",$eventlist->{$key},"\n";
+			if($eventlist->{$key} == 3)
+			{
+				my @a=split(",",$key);
+				my $connectstr_ca=$a[0].",".$a[2].",";
+				if (exists $eventlist->{$connectstr_ca})
+				{
+					$anno{$key}="ca";
+				}
+				else
+				{
+					$anno{$key}="cs";
+				}
+			}
+			if($eventlist->{$key} == 2)
+			{
+				my @a=split(/[,-]/,$key);
+
+				my $connectstr_ir=$a[0]."-".$a[3].",";
+				if (exists $eventlist->{$connectstr_ir})
+				{
+					$anno{$key}="ir";
+				}
+				foreach my $search (keys %$eventlist)
+				{
+					next if($eventlist->{$search}!=2);
+					my $connectstr_ss="";
+					my @b=split(/[,-]/,$search);
+					if($a[0]==$b[0] && $a[3]==$b[3])
+					{
+						if($a[1]==$b[1])
+						{
+							next if ($b[2] == $a[2]);
+							$connectstr_ss .= $a[0]."-".$a[1].",";
+							if($b[2]>$a[2])
+							{
+								$connectstr_ss .= $a[2]."-".$b[2].",".$b[2];
+							}
+							#elsif($b[2]<$a[2])
+							else
+							{
+								$connectstr_ss .= $b[2]."-".$a[2].",".$a[2];
+							}
+							$connectstr_ss .= "-".$a[3].",";
+							$anno{$connectstr_ss}="ss";
+							#print $connectstr_ss,"\tss\n";
+						}
+						if($a[2]==$b[2])
+						{
+							$connectstr_ss .= $a[0]."-";
+							if($b[1]>$a[1])
+							{
+								$connectstr_ss .= $a[1].",".$a[1]."-".$b[1];
+							}
+							#elsif($b[2]<$a[2])
+							else
+							{
+								$connectstr_ss .= $b[1].",".$b[1]."-".$a[1];
+							}
+							$connectstr_ss .= ",".$a[2]."-".$a[3].",";
+							$anno{$connectstr_ss}="ss";
+							#print $connectstr_ss,"\tss\n";
+						}
+
+
+					}
+
+				}
+			}
+		}
+	}
+	if($mode eq "af" or $mode eq "al")#alternative first/last exon
+	{
+		foreach my $key (keys %$eventlist)
+		{
+			my @a=split(/,/,$key);
+			foreach my $search (keys %$eventlist)
+			{
+				next if $search eq $key;
+				my $connectstr = "";
+				if($eventlist->{$search} < $eventlist->{$key} )
+				{
+					$connectstr = $search."|".$key;
+				}
+				else
+				{
+					$connectstr = $key."|".$search;
+				}
+				next if exists $anno{$connectstr};
+				my @b=split(/,/,$search);
+				if($mode eq "af")
+				{
+					if($a[0] ne $b[0] and $a[1] eq $b[1])
+					{
+						$anno{$connectstr} = $mode;
+					}
+				}
+				else
+				{
+					if($a[0] eq $b[0] and $a[1] ne $b[1])
+					{
+						$anno{$connectstr} = $mode;
+					}
+				}
+				
+			}
+		}
+	}
+	return(\%anno);
+}
+
+
+sub printanno
+{
+# print the annotations
+# and return the connectstr and event id to be used to print evidences
+	my %eventids=();
+	my ($annos,$chr,$strand)=@_;
+	my %nums_per_isoform;
+	foreach my $key (keys %$annos)
+	{
+		if($annos->{$key} eq "af")
+		{
+			my @a=split(/[\,\-\|]+/,$key);
+			my $chrid=substr($chr,3,length($chr)-3);
+			my $id_anno="AF";
+			#consider direction later
+			my $id=join("-",$id_anno,$id_anno, $a[2],$a[3]);
+			my $num=0;
+			$num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id};
+			$nums_per_isoform{$id}++;
+
+			print $chr,"\t",$a[0],"\t",$a[3],"\t";
+			print $id.".".$num,"[L]\t";
+			print "0\t";
+			print $strand,"\t";
+			print $a[0],"\t",$a[3],"\t";
+			print "255,0,0\t";
+			print "2\t";
+			print $a[1]-$a[0],",";
+			print $a[3]-$a[2],"\t";
+			print "0,";
+			print $a[2]-$a[0],"\n";
+			
+			print $chr,"\t",$a[4],"\t",$a[7],"\t";
+			print $id.".".$num,"[R]\t";
+			print "0\t";
+			print $strand,"\t";
+			print $a[4],"\t",$a[7],"\t";
+			print "255,0,0\t";
+			print "2\t";
+			print $a[5]-$a[4],",";
+			print $a[7]-$a[6],"\t";
+			print "0,";
+			print $a[6]-$a[4],"\n";
+				
+			#print "$key,af\n";
+		}
+		if($annos->{$key} eq "ca" or $annos->{$key} eq "cs")
+		{
+			my @a=split(/[,-]/,$key);
+
+			my $chrid=substr($chr,3,length($chr)-3);
+			my $id_anno="CS";
+			$id_anno="CA" if $annos->{$key} eq "ca";
+			my $id="CA-$id_anno-$chrid"."-".$a[2]."-".$a[3];
+			my $num=0;
+			$num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id};
+			$nums_per_isoform{$id}++;
+
+			print $chr,"\t",$a[0],"\t",$a[5],"\t";
+			print $id.".".$num,"[L]\t";
+			$eventids {$key} = $id.".".$num."[L]";
+			print "0\t";
+			print $strand,"\t";
+			print $a[0],"\t",$a[5],"\t";
+			print "255,0,0\t";
+			print "3\t";
+			print $a[1]-$a[0],",";
+			print $a[3]-$a[2],",";
+			print $a[5]-$a[4],"\t";
+			print "0,";
+			print $a[2]-$a[0],",";
+			print $a[4]-$a[0],"\n";
+			
+			print $chr,"\t",$a[0],"\t",$a[5],"\t";
+                        print $id.".".$num,"[S]\t";
+			my $connectstr_ca= $a[0]."-".$a[1].",".$a[4]."-".$a[5].",";
+			$eventids{$connectstr_ca} = $id.".".$num."[S]";
+                        print "0\t";
+                        print $strand,"\t";
+                        print $a[0],"\t",$a[5],"\t";
+                        print "255,0,0\t";
+                        print "2\t";    
+                        print $a[1]-$a[0],",";
+                        #print $a[3]-$a[2],",";
+                        print $a[5]-$a[4],"\t";
+                        print "0,";
+                        #print $a[2]-$a[0],",";
+                        print $a[4]-$a[0],"\n";
+			
+		}
+		if($annos->{$key} eq "ir")
+		{
+			my @a=split(/[,-]/,$key);
+
+			my $chrid=substr($chr,3,length($chr)-3);
+			my $id="IR-IR-$chrid"."-".$a[1]."-".$a[2];
+			my $num=0;
+			$num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id};
+			$nums_per_isoform{$id}++;
+
+			print $chr,"\t",$a[0],"\t",$a[3],"\t";
+			print $id.".".$num,"[L]\t";
+			print "0\t";
+			print $strand,"\t";
+			print $a[0],"\t",$a[3],"\t";
+			$eventids{$a[0]."-".$a[3].","} = $id.".".$num."[L]";
+			print "255,0,0\t";
+			print "3\t";
+			print $a[1]-$a[0],",";
+			print $a[2]-$a[1],",";
+			print $a[3]-$a[2],"\t";
+			print "0,";
+			print $a[1]-$a[0],",";
+			print $a[2]-$a[0],"\n";
+			
+			print $chr,"\t",$a[0],"\t",$a[3],"\t";
+                        print $id.".".$num,"[S]\t";
+			$eventids{$key} = $id.".".$num."[S]";
+                        print "0\t";
+                        print $strand,"\t";
+                        print $a[0],"\t",$a[3],"\t";
+                        print "255,0,0\t";
+                        print "2\t";    
+                        print $a[1]-$a[0],",";
+                        #print $a[3]-$a[2],",";
+                        print $a[3]-$a[2],"\t";
+                        print "0,";
+                        #print $a[2]-$a[0],",";
+                        print $a[2]-$a[0],"\n";
+		
+		}
+		if($annos->{$key} eq "ss")
+		{
+			my @a=split(/[,-]/,$key);
+			my $chrid=substr($chr,3,length($chr)-3);
+			my $type="AA";
+			if( ($strand eq "+" && $a[1]==$a[2]) or ($strand eq "-" && $a[3]==$a[4]))
+			{
+				$type="AD";
+			}
+			my $id="$type-$type-$chrid"."-".$a[2]."-".$a[3];
+			my $connect_str_L = "";
+			my $connect_str_S = "";
+			if($a[1]==$a[2])
+			{
+				$connect_str_L = $a[0]."-".$a[3].",".$a[4]."-".$a[5].",";
+				$connect_str_S = $a[0]."-".$a[1].",".$a[4]."-".$a[5].",";
+			}
+			else
+			{
+				$connect_str_L = $a[0]."-".$a[1].",".$a[2]."-".$a[5].",";
+				$connect_str_S = $a[0]."-".$a[1].",".$a[3]."-".$a[5].",";
+				
+			}
+			my $num=0;
+			$num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id};
+			$nums_per_isoform{$id}++;
+			print $chr,"\t",$a[0],"\t",$a[5],"\t";
+			print $id.".".$num,"[L]\t";
+			$eventids {$connect_str_L} = $id.".".$num."[L]";
+			print "0\t";
+			print $strand,"\t";
+			print $a[0],"\t",$a[5],"\t";
+			print "255,0,0\t";
+			print "3\t";
+			print $a[1]-$a[0],",";
+			print $a[3]-$a[2],",";
+			print $a[5]-$a[4],"\t";
+			print "0,";
+			print $a[2]-$a[0],",";
+			print $a[4]-$a[0],"\n";
+			
+			print $chr,"\t",$a[0],"\t",$a[5],"\t";
+                        print $id.".".$num,"[S]\t";
+			$eventids {$connect_str_S} = $id.".".$num."[S]";
+                        print "0\t";
+                        print $strand,"\t";
+                        print $a[0],"\t",$a[5],"\t";
+                        print "255,0,0\t";
+                        print "2\t";    
+                        print $a[1]-$a[0],",";
+                        #print $a[3]-$a[2],",";
+                        print $a[5]-$a[4],"\t";
+                        print "0,";
+                        #print $a[2]-$a[0],",";
+                        print $a[4]-$a[0],"\n";
+	
+	
+		}
+
+	}
+	return(\%eventids);
+	
+}
+