view bin/gtf2bed.pl @ 4:cd336e593a92 draft

planemo upload
author bioitcore
date Thu, 07 Sep 2017 16:53:12 -0400
parents adc0f7765d85
children
line wrap: on
line source

# Rewritten on Sep 7th, 2022

#part of package SpliceTrap

#Jie Wu
use strict;
use warnings;

# Convert the "exon" records of a GTF file into BED12 lines, one line per
# transcript, written to STDOUT.
#
# Usage: gtf2bed.pl <input.gtf> > output.bed
#
# Input requirements:
#   - every "exon" line must carry a transcript_id "..." attribute
#     (the script dies otherwise);
#   - a transcript_id should not be ambiguous: exon lines whose chromosome
#     or strand disagrees with what was already recorded for that
#     transcript are skipped with a warning.
# Only the "exon" lines are used; every other line is ignored.

my $inputfilename = $ARGV[0];
die ("Usage: $0 <input.gtf>\n") unless defined $inputfilename;

# Consecutive exon starts further apart than this many bases trigger a warning.
my $large_gap_warn = 1000000;

my %chr_hash;     # $chr_hash{$txid}         = chromosome (GTF column 1)
my %strand_hash;  # $strand_hash{$txid}      = strand     (GTF column 7)
my %tx_exons;     # $tx_exons{$txid}{$start} = exon size; a later exon with
                  # the same start replaces the earlier one

my $linenum = 0;

# Three-argument open on a lexical handle; fail loudly instead of silently
# producing an empty BED file when the input cannot be read.
open(my $gtf_fh, '<', $inputfilename)
	or die ("Cannot open $inputfilename: $!\n");

while (my $line = <$gtf_fh>)
{
	$linenum++;
	chomp $line;
	my @fields = split("\t", $line);

	# Comment, blank and truncated lines have no feature column: ignore them.
	next unless defined $fields[2] and $fields[2] eq "exon";

	my ($chr, $start, $end, $strand, $attributes) = @fields[0, 3, 4, 6, 8];

	my $txid;
	if (defined $attributes and $attributes =~ /transcript_id "(\S*?)"/)
	{
		$txid = $1;
	}
	else
	{
		die ("$inputfilename format error! No transcript_id in line $linenum \n");
	}

	if (exists $chr_hash{$txid} and $chr_hash{$txid} ne $chr)
	{
		warn ("$inputfilename: ambiguous transcript_id in line $linenum: $txid Skipped \n");
		next;
	}
	if (exists $strand_hash{$txid} and $strand_hash{$txid} ne $strand)
	{
		warn ("$inputfilename: ambiguous transcript_id in line $linenum: $txid Skipped\n");
		# Actually skip the line, as the message says; otherwise the
		# conflicting exon is recorded and the stored strand overwritten.
		next;
	}

	$chr_hash{$txid}         = $chr;
	$strand_hash{$txid}      = $strand;
	$tx_exons{$txid}{$start} = $end - $start + 1;
}

close($gtf_fh);

# Emit one BED12 line per transcript. Keys are sorted so that the output
# order is reproducible between runs.
foreach my $txid (sort keys %chr_hash)
{
	my @starts   = sort { $a <=> $b } keys %{ $tx_exons{$txid} };
	my @sizes    = map { $tx_exons{$txid}{$_} } @starts;
	my $exon_num = scalar(@starts);

	# The reported value is the distance between consecutive exon starts.
	for my $i (1 .. $exon_num - 1)
	{
		my $gap = $starts[$i] - $starts[$i-1];
		warn "$txid, intron size..".$gap."\n" if ($gap > $large_gap_warn);
	}

	# Block starts are relative to the first exon start; the list keeps its
	# trailing comma, exactly as before.
	my $starts_str = join(",", map { $_ - $starts[0] } @starts) . ",";
	my $sizes_str  = join(",", @sizes);

	# GTF starts are 1-based, BED starts are 0-based: subtract one from the start.
	my $bed_start = $starts[0] - 1;
	# End of the exon with the largest start coordinate.
	my $bed_end   = $starts[-1] + $sizes[-1] - 1;

	print join("\t", $chr_hash{$txid}, $bed_start, $bed_end, $txid, "0", $strand_hash{$txid}, $bed_start, $bed_end, "255,0,0", $exon_num, $sizes_str, $starts_str), "\n";
}