view bin/scanbed2txdb.pl @ 5:2ebca9da5e42 draft default tip

planemo upload
author bioitcore
date Thu, 07 Sep 2017 17:39:24 -0400
parents adc0f7765d85
children
line wrap: on
line source

#argv0: input transcript bed file
#argv1: output filename, will be in AS format

use strict;
my $AnnoFileName = $ARGV[0];
my $outputFileName = $ARGV[1];

if($AnnoFileName eq "" or $outputFileName eq "")
{
	print "TXDBGEN: Please specify your input files\n";
	exit;
}

my $cachefolder = `basename $outputFileName`;
chomp($cachefolder);
my $cachefolder = $cachefolder.".cache";
my $AnnoFileBase = `basename $AnnoFileName`; 
chomp($AnnoFileBase);

if(! -e $cachefolder)
{
	mkdir $cachefolder or die "CHECK: cannot mkdir $cachefolder\n";
	print "TXDBGEN: mkdir $cachefolder\n";
}

my $CacheAnnoFileName = $cachefolder."/".$AnnoFileBase.".sort";
#sort the annotation file

print "TXDBGEN: sort $AnnoFileName \n";

system("sort -k6,6 -k1,1 -k2,2n -k3,3n $AnnoFileName >$CacheAnnoFileName");

$AnnoFileName = $CacheAnnoFileName;

#read the annotations into hashes

open(AnnoFile, $CacheAnnoFileName) or die "can not open",$CacheAnnoFileName;
#split the genes into contigs

my $contigid = 0;
my $end_tmp = 0;
my $chr_tmp = "chr";
my $strand_tmp="NA";

my $TXnumtmp=0;

my %eventlist =();
#my %eventlist_af = ();
#my %eventlist_al = ();
my %evidences = ();

open(my $output,">$outputFileName");

open(my $output2, ">$outputFileName.evi");
while(my $line =<AnnoFile>)
{
	chomp($line);
	my @a = split("\t",$line);
	my $chr = $a[0];
	my $start = $a[1];
	my $end = $a[2];
	my $name = $a[3];
	my @sizes = split(",",$a[10]);
	my $strand = $a[5];
	my @start_shifts = split(",",$a[11]);
	#my $chrstr = $chr.$strand;
	my @starts;
	my @ends;
	for(my $i=0;$i<@start_shifts;$i++)
        {
                $starts[$i]=$start_shifts[$i]+$start;
                $ends[$i] = $starts[$i]+$sizes[$i];
        }
	if($start >$end_tmp or $chr ne $chr_tmp or $strand_tmp ne $strand)
        {
                $contigid++;
                #$ctgmultisonum++ if $TXnumtmp>1;
                $TXnumtmp=0;
		my $annos = scanevents(\%eventlist, "inner") ;
		#my $annos_af = scanevents(\%eventlist_af, "af");
		#my $annos_al = scanevents(\%eventlist_al, "al");
		
		my $stdout = select ($output);
		my $eventids = printanno($annos,$chr_tmp,$strand_tmp) if(scalar(%$annos)>0);
		#printanno($annos_af,$chr_tmp,$strand_tmp) if(scalar(%$annos_af)>0);
		#printanno($annos_al,$chr_tmp,$strand_tmp) if(scalar(%$annos_al)>0);
		select($stdout);
		$stdout = select ($output2);
		#print cross information
		foreach my $connect_str  (keys %$eventids) 
		{
			print $eventids->{$connect_str},"\t";
			foreach my $transcriptid  (keys %{$evidences{$connect_str}}) 
			{
				print $transcriptid,",";
			}
			print "\n";
		}
		select($stdout);
		print "TXDBGEN: Contig ID $contigid at $chr $strand...\n" if ( $contigid%1000 == 0);
		%eventlist = ();
		#%eventlist_al = ();
		#%eventlist_af = ();
                #print CacheContigFile "#ctg$contigid\n";

        }
	$TXnumtmp++;
        $end_tmp = $end if ($end > $end_tmp or $chr ne $chr_tmp or $strand_tmp ne $strand);
        $chr_tmp = $chr;
	
	$strand_tmp = $strand;
	
	#scan connections, 2 and 3 exons for CA/CS/AF
#	if(scalar(@starts)>2)
#	{
#		my $connectionstr_af =$starts[0]."-".
#				$ends[0].",".
#				$starts[1]."-".
#				$ends[1]."," ;
#	
#		$eventlist_af{$connectionstr_af} = $starts[0];
#		my $connectionstr_al = $starts[scalar(@starts)-2]."-".
#				$ends[scalar(@starts)-2].",".
#				$starts[scalar(@starts)-1]."-".
#				$ends[scalar(@starts)-1]."," ;
#		$eventlist_al{$connectionstr_al} = $starts[scalar(@starts)-2];
#	}
		#didn't consider direction yet
        #add 1 exon for IR
	for(my $n=1;$n<4;$n++)
        {
		for(my $i=0;$i<scalar(@starts)-$n+1;$i++)
                {
			my $connectionstr = "";
			for(my $j=$i;$j<$i+$n;$j++)
                        {
                                $connectionstr = $connectionstr .
                                        $starts[$j]."-".
                                        $ends[$j].",";
                        }
		#	print $connectionstr,"\n";
			$eventlist{$connectionstr} = $n;
			$evidences{$connectionstr}{$name} = 1;
		}

	}
									
}

my $annos = scanevents(\%eventlist, "inner");
#my $annos_af = scanevents(\%eventlist_af, "af");
#my $annos_al = scanevents(\%eventlist_al, "al");
		
my $stdout = select ($output);
my $eventids = printanno($annos,$chr_tmp,$strand_tmp) if(scalar(%$annos)>0);
select ($stdout);
$stdout = select ($output2);
#print cross information
foreach my $connect_str (keys %$eventids)
{
	print $eventids->{$connect_str},"\t";
	foreach my $transcriptid (keys %{$evidences{$connect_str}}) 
	{
		print $transcriptid,",";
	}
	print "\n";
}

select($stdout);	

close($output);
close($output2);

#close(CacheContigFile);
#define sub scanevents
sub scanevents
{
	my ($eventlist,$mode)=@_;
	my %anno=();
	if($mode eq "inner")
	{
		foreach my $key (keys %$eventlist)
		{
			#print $key,"\t",$eventlist->{$key},"\n";
			if($eventlist->{$key} == 3)
			{
				my @a=split(",",$key);
				my $connectstr_ca=$a[0].",".$a[2].",";
				if (exists $eventlist->{$connectstr_ca})
				{
					$anno{$key}="ca";
				}
				else
				{
					$anno{$key}="cs";
				}
			}
			if($eventlist->{$key} == 2)
			{
				my @a=split(/[,-]/,$key);

				my $connectstr_ir=$a[0]."-".$a[3].",";
				if (exists $eventlist->{$connectstr_ir})
				{
					$anno{$key}="ir";
				}
				foreach my $search (keys %$eventlist)
				{
					next if($eventlist->{$search}!=2);
					my $connectstr_ss="";
					my @b=split(/[,-]/,$search);
					if($a[0]==$b[0] && $a[3]==$b[3])
					{
						if($a[1]==$b[1])
						{
							next if ($b[2] == $a[2]);
							$connectstr_ss .= $a[0]."-".$a[1].",";
							if($b[2]>$a[2])
							{
								$connectstr_ss .= $a[2]."-".$b[2].",".$b[2];
							}
							#elsif($b[2]<$a[2])
							else
							{
								$connectstr_ss .= $b[2]."-".$a[2].",".$a[2];
							}
							$connectstr_ss .= "-".$a[3].",";
							$anno{$connectstr_ss}="ss";
							#print $connectstr_ss,"\tss\n";
						}
						if($a[2]==$b[2])
						{
							$connectstr_ss .= $a[0]."-";
							if($b[1]>$a[1])
							{
								$connectstr_ss .= $a[1].",".$a[1]."-".$b[1];
							}
							#elsif($b[2]<$a[2])
							else
							{
								$connectstr_ss .= $b[1].",".$b[1]."-".$a[1];
							}
							$connectstr_ss .= ",".$a[2]."-".$a[3].",";
							$anno{$connectstr_ss}="ss";
							#print $connectstr_ss,"\tss\n";
						}


					}

				}
			}
		}
	}
	if($mode eq "af" or $mode eq "al")#alternative first/last exon
	{
		foreach my $key (keys %$eventlist)
		{
			my @a=split(/,/,$key);
			foreach my $search (keys %$eventlist)
			{
				next if $search eq $key;
				my $connectstr = "";
				if($eventlist->{$search} < $eventlist->{$key} )
				{
					$connectstr = $search."|".$key;
				}
				else
				{
					$connectstr = $key."|".$search;
				}
				next if exists $anno{$connectstr};
				my @b=split(/,/,$search);
				if($mode eq "af")
				{
					if($a[0] ne $b[0] and $a[1] eq $b[1])
					{
						$anno{$connectstr} = $mode;
					}
				}
				else
				{
					if($a[0] eq $b[0] and $a[1] ne $b[1])
					{
						$anno{$connectstr} = $mode;
					}
				}
				
			}
		}
	}
	return(\%anno);
}


sub printanno
{
# print the annotations
# and return the connectstr and event id to be used to print evidences
	my %eventids=();
	my ($annos,$chr,$strand)=@_;
	my %nums_per_isoform;
	foreach my $key (keys %$annos)
	{
		if($annos->{$key} eq "af")
		{
			my @a=split(/[\,\-\|]+/,$key);
			my $chrid=substr($chr,3,length($chr)-3);
			my $id_anno="AF";
			#consider direction later
			my $id=join("-",$id_anno,$id_anno, $a[2],$a[3]);
			my $num=0;
			$num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id};
			$nums_per_isoform{$id}++;

			print $chr,"\t",$a[0],"\t",$a[3],"\t";
			print $id.".".$num,"[L]\t";
			print "0\t";
			print $strand,"\t";
			print $a[0],"\t",$a[3],"\t";
			print "255,0,0\t";
			print "2\t";
			print $a[1]-$a[0],",";
			print $a[3]-$a[2],"\t";
			print "0,";
			print $a[2]-$a[0],"\n";
			
			print $chr,"\t",$a[4],"\t",$a[7],"\t";
			print $id.".".$num,"[R]\t";
			print "0\t";
			print $strand,"\t";
			print $a[4],"\t",$a[7],"\t";
			print "255,0,0\t";
			print "2\t";
			print $a[5]-$a[4],",";
			print $a[7]-$a[6],"\t";
			print "0,";
			print $a[6]-$a[4],"\n";
				
			#print "$key,af\n";
		}
		if($annos->{$key} eq "ca" or $annos->{$key} eq "cs")
		{
			my @a=split(/[,-]/,$key);

			my $chrid=substr($chr,3,length($chr)-3);
			my $id_anno="CS";
			$id_anno="CA" if $annos->{$key} eq "ca";
			my $id="CA-$id_anno-$chrid"."-".$a[2]."-".$a[3];
			my $num=0;
			$num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id};
			$nums_per_isoform{$id}++;

			print $chr,"\t",$a[0],"\t",$a[5],"\t";
			print $id.".".$num,"[L]\t";
			$eventids {$key} = $id.".".$num."[L]";
			print "0\t";
			print $strand,"\t";
			print $a[0],"\t",$a[5],"\t";
			print "255,0,0\t";
			print "3\t";
			print $a[1]-$a[0],",";
			print $a[3]-$a[2],",";
			print $a[5]-$a[4],"\t";
			print "0,";
			print $a[2]-$a[0],",";
			print $a[4]-$a[0],"\n";
			
			print $chr,"\t",$a[0],"\t",$a[5],"\t";
                        print $id.".".$num,"[S]\t";
			my $connectstr_ca= $a[0]."-".$a[1].",".$a[4]."-".$a[5].",";
			$eventids{$connectstr_ca} = $id.".".$num."[S]";
                        print "0\t";
                        print $strand,"\t";
                        print $a[0],"\t",$a[5],"\t";
                        print "255,0,0\t";
                        print "2\t";    
                        print $a[1]-$a[0],",";
                        #print $a[3]-$a[2],",";
                        print $a[5]-$a[4],"\t";
                        print "0,";
                        #print $a[2]-$a[0],",";
                        print $a[4]-$a[0],"\n";
			
		}
		if($annos->{$key} eq "ir")
		{
			my @a=split(/[,-]/,$key);

			my $chrid=substr($chr,3,length($chr)-3);
			my $id="IR-IR-$chrid"."-".$a[1]."-".$a[2];
			my $num=0;
			$num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id};
			$nums_per_isoform{$id}++;

			print $chr,"\t",$a[0],"\t",$a[3],"\t";
			print $id.".".$num,"[L]\t";
			print "0\t";
			print $strand,"\t";
			print $a[0],"\t",$a[3],"\t";
			$eventids{$a[0]."-".$a[3].","} = $id.".".$num."[L]";
			print "255,0,0\t";
			print "3\t";
			print $a[1]-$a[0],",";
			print $a[2]-$a[1],",";
			print $a[3]-$a[2],"\t";
			print "0,";
			print $a[1]-$a[0],",";
			print $a[2]-$a[0],"\n";
			
			print $chr,"\t",$a[0],"\t",$a[3],"\t";
                        print $id.".".$num,"[S]\t";
			$eventids{$key} = $id.".".$num."[S]";
                        print "0\t";
                        print $strand,"\t";
                        print $a[0],"\t",$a[3],"\t";
                        print "255,0,0\t";
                        print "2\t";    
                        print $a[1]-$a[0],",";
                        #print $a[3]-$a[2],",";
                        print $a[3]-$a[2],"\t";
                        print "0,";
                        #print $a[2]-$a[0],",";
                        print $a[2]-$a[0],"\n";
		
		}
		if($annos->{$key} eq "ss")
		{
			my @a=split(/[,-]/,$key);
			my $chrid=substr($chr,3,length($chr)-3);
			my $type="AA";
			if( ($strand eq "+" && $a[1]==$a[2]) or ($strand eq "-" && $a[3]==$a[4]))
			{
				$type="AD";
			}
			my $id="$type-$type-$chrid"."-".$a[2]."-".$a[3];
			my $connect_str_L = "";
			my $connect_str_S = "";
			if($a[1]==$a[2])
			{
				$connect_str_L = $a[0]."-".$a[3].",".$a[4]."-".$a[5].",";
				$connect_str_S = $a[0]."-".$a[1].",".$a[4]."-".$a[5].",";
			}
			else
			{
				$connect_str_L = $a[0]."-".$a[1].",".$a[2]."-".$a[5].",";
				$connect_str_S = $a[0]."-".$a[1].",".$a[3]."-".$a[5].",";
				
			}
			my $num=0;
			$num = $nums_per_isoform{$id} if exists $nums_per_isoform{$id};
			$nums_per_isoform{$id}++;
			print $chr,"\t",$a[0],"\t",$a[5],"\t";
			print $id.".".$num,"[L]\t";
			$eventids {$connect_str_L} = $id.".".$num."[L]";
			print "0\t";
			print $strand,"\t";
			print $a[0],"\t",$a[5],"\t";
			print "255,0,0\t";
			print "3\t";
			print $a[1]-$a[0],",";
			print $a[3]-$a[2],",";
			print $a[5]-$a[4],"\t";
			print "0,";
			print $a[2]-$a[0],",";
			print $a[4]-$a[0],"\n";
			
			print $chr,"\t",$a[0],"\t",$a[5],"\t";
                        print $id.".".$num,"[S]\t";
			$eventids {$connect_str_S} = $id.".".$num."[S]";
                        print "0\t";
                        print $strand,"\t";
                        print $a[0],"\t",$a[5],"\t";
                        print "255,0,0\t";
                        print "2\t";    
                        print $a[1]-$a[0],",";
                        #print $a[3]-$a[2],",";
                        print $a[5]-$a[4],"\t";
                        print "0,";
                        #print $a[2]-$a[0],",";
                        print $a[4]-$a[0],"\n";
	
	
		}

	}
	return(\%eventids);
	
}