diff bin/gtf2bed.pl @ 1:adc0f7765d85 draft

planemo upload
author bioitcore
date Thu, 07 Sep 2017 15:06:58 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/gtf2bed.pl	Thu Sep 07 15:06:58 2017 -0400
@@ -0,0 +1,94 @@
+# rewrite on Sep 7th,2022
+
+#part of package SpliceTrap
+
+#Jie Wu
+use strict;
+use warnings;
+
+# Usage: gtf2bed.pl <input.gtf>
+#
+# Reads the "exon" records of a GTF file and prints one 12-column BED line
+# per transcript_id to STDOUT. Warnings and errors go to STDERR.
+
+my $inputfilename = $ARGV[0];
+die ("Usage: $0 <input.gtf>\n") unless defined $inputfilename;
+
+# input file is a gtf file,
+# "transcript_id" is required for each line and should not be ambiguous.
+# only the "exon" lines are used
+
+my %chr_hash;    # $chr_hash{$txid} = chromosome name (GTF column 1)
+my %strand_hash; # $strand_hash{$txid} = strand (GTF column 7)
+my %tx_exons;    # $tx_exons{$txid}{$start} = $size;
+
+my $linenum = 0;
+
+# Three-argument open with a lexical handle; fail loudly instead of silently
+# producing empty output when the file cannot be read.
+open(my $input, '<', $inputfilename)
+	or die ("Cannot open $inputfilename: $!\n");
+
+while(my $line=<$input>)
+{
+	$linenum++;
+	my @fields = split("\t",$line);
+	# Comment, blank and other short lines have no feature column; skip them.
+	next unless (defined $fields[2] and $fields[2] eq "exon");
+
+	my $txid;
+	if(defined $fields[8] and $fields[8]=~/transcript_id "(\S*?)"/)
+	{
+		$txid = $1;
+	}
+	else
+	{
+		die ("$inputfilename format error! No transcript_id in line $linenum \n");
+	}
+
+	# A transcript_id seen on another chromosome or strand is ambiguous:
+	# warn and skip this record, keeping the exons collected so far.
+	if( exists $chr_hash{$txid} and $chr_hash{$txid} ne $fields[0])
+	{
+		warn ("$inputfilename: ambiguous transcript_id in line $linenum: $txid Skipped \n");
+		next;
+	}
+	if( exists $strand_hash{$txid} and $strand_hash{$txid} ne $fields[6])
+	{
+		warn ("$inputfilename: ambiguous transcript_id in line $linenum: $txid Skipped\n");
+		next;
+	}
+	$chr_hash{$txid} = $fields[0];
+	$strand_hash{$txid} = $fields[6];
+	# Keyed by start, so a later exon with the same start replaces the earlier one.
+	$tx_exons{$txid}{$fields[3]} = $fields[4] - $fields[3] +1;
+}
+
+close($input);
+
+# Transcripts are emitted in sorted transcript_id order so that the output
+# is reproducible between runs (hash key order is not).
+foreach my $txid (sort keys %chr_hash)
+{
+	# Exon starts in ascending order, with the matching sizes alongside.
+	my @starts = sort {$a<=>$b} (keys %{$tx_exons{$txid}});
+	my @sizes  = map { $tx_exons{$txid}{$_} } @starts;
+	my $exon_num = scalar(@sizes);
+
+	for(my $i = 1; $i < $exon_num; $i++)
+	{
+		# NOTE: this is the distance between consecutive exon starts, so it
+		# includes the length of the upstream exon as well as the intron.
+		my $gap = $starts[$i]-$starts[$i-1];
+		warn "$txid, intron size..".$gap."\n" if ($gap>1000000);
+	}
+
+	# Block starts are relative to the first exon; the list deliberately
+	# ends with a trailing comma.
+	my $starts_str = join(",", map { $_ - $starts[0] } @starts).",";
+	my $sizes_str  = join(",",@sizes);
+	# GTF coordinates are 1-based inclusive; BED starts are 0-based, hence -1.
+	my $end = $starts[$exon_num-1] + $sizes[$exon_num-1] -1;
+	print join("\t",$chr_hash{$txid}, $starts[0]-1, $end, $txid,"0",$strand_hash{$txid},$starts[0]-1, $end, "255,0,0",$exon_num,$sizes_str, $starts_str);
+	print "\n";
+}