Mercurial > repos > bioitcore > splicetrap
diff bin/gtf2bed.pl @ 1:adc0f7765d85 draft
planemo upload
author | bioitcore |
---|---|
date | Thu, 07 Sep 2017 15:06:58 -0400 |
parents | |
children |
line wrap: on
line diff
# rewrite on Sep 7th,2022
#part of package SpliceTrap
#Jie Wu
#
# usage: gtf2bed.pl <input.gtf>
#
# Reads a GTF file and prints one 12-column, tab-separated BED line per
# transcript_id to STDOUT, built from that transcript's "exon" records.
#
# input file is a gtf file,
# "transcript_id" is required for each exon line and should not be ambiguous
# (one chromosome and one strand per transcript_id).
# only the "exon" lines are used
use strict;
use warnings;

# read_gtf_exons($fh, $inputfilename)
#
# Collects the exon records read from the open filehandle $fh.
# $inputfilename is only used in diagnostics.
#
# Returns a list of four references:
#   \@tx_order    transcript ids in order of first appearance
#   \%chr_hash    $chr_hash{$txid}         = column 1 of the exon lines
#   \%strand_hash $strand_hash{$txid}      = column 7 of the exon lines
#   \%tx_exons    $tx_exons{$txid}{$start} = $size (end - start + 1)
#
# Dies when an exon line carries no transcript_id attribute.  Warns and
# skips an exon line whose chromosome or strand disagrees with what was
# already recorded for the same transcript_id.
sub read_gtf_exons {
    my ($fh, $inputfilename) = @_;

    my %chr_hash;
    my %strand_hash;
    my %tx_exons;    # $tx_exons{$txid}{$start} = $size;
    my @tx_order;

    my $linenum = 0;

    LINE:
    while (my $line = <$fh>) {
        $linenum++;
        $line =~ s/\r?\n\z//;    # drop LF or CRLF line terminator
        my @fields = split /\t/, $line;

        # Comment lines, blank lines and non-exon features are ignored.
        next LINE if @fields < 3 or $fields[2] ne "exon";

        my $txid;
        if (defined $fields[8] and $fields[8] =~ /transcript_id "(\S*?)"/) {
            $txid = $1;
        }
        else {
            die ("$inputfilename format error! No transcript_id in line $linenum \n");
        }

        if (exists $chr_hash{$txid} and $chr_hash{$txid} ne $fields[0]) {
            warn ("$inputfilename: ambiguous transcript_id in line $linenum: $txid Skipped \n");
            next LINE;
        }
        if (exists $strand_hash{$txid} and $strand_hash{$txid} ne $fields[6]) {
            warn ("$inputfilename: ambiguous transcript_id in line $linenum: $txid Skipped\n");
            # Really skip the line, as the message says; otherwise the
            # conflicting exon would be recorded and the strand overwritten.
            next LINE;
        }

        push @tx_order, $txid unless exists $chr_hash{$txid};
        $chr_hash{$txid}              = $fields[0];
        $strand_hash{$txid}           = $fields[6];
        $tx_exons{$txid}{$fields[3]} = $fields[4] - $fields[3] + 1;
    }

    return (\@tx_order, \%chr_hash, \%strand_hash, \%tx_exons);
}

# bed_line_for($txid, $chr, $strand, \%size_of_start)
#
# Builds the BED line (without trailing newline) for one transcript.
# %size_of_start maps each exon start to its size and must not be empty.
# Emits a warning when two consecutive exon starts are more than
# 1000000 apart.
sub bed_line_for {
    my ($txid, $chr, $strand, $size_of_start) = @_;

    my @starts = sort { $a <=> $b } keys %{$size_of_start};
    my @sizes  = @{$size_of_start}{@starts};

    for my $i (1 .. $#starts) {
        # Note: this is the distance between exon starts, reported under
        # the historical "intron size" label.
        my $distance = $starts[$i] - $starts[$i - 1];
        warn "$txid, intron size..".$distance."\n" if ($distance > 1000000);
    }

    # Block starts are relative to the first exon start.  Every entry,
    # including the last, is followed by a comma; the sizes column has no
    # trailing comma.  Both are kept exactly as the script always wrote them.
    my $starts_str = join("", map { ($_ - $starts[0]) . "," } @starts);
    my $sizes_str  = join(",", @sizes);

    my $bed_start = $starts[0] - 1;
    my $end       = $starts[-1] + $sizes[-1] - 1;

    return join("\t",
        $chr, $bed_start, $end, $txid, "0", $strand,
        $bed_start, $end, "255,0,0", scalar(@starts),
        $sizes_str, $starts_str);
}

# main($inputfilename)
#
# Converts the named GTF file and prints the BED lines to the currently
# selected output handle (STDOUT by default), one transcript per line, in
# the order the transcripts first appear in the input.
# Dies if the file cannot be opened.  Without an argument only a usage
# message is written to STDERR.
sub main {
    my ($inputfilename) = @_;

    if (!defined $inputfilename) {
        warn "usage: gtf2bed.pl <input.gtf>\n";
        return;
    }

    open(my $in, '<', $inputfilename)
        or die "Cannot open $inputfilename: $!\n";
    my ($tx_order, $chr_hash, $strand_hash, $tx_exons) =
        read_gtf_exons($in, $inputfilename);
    close($in);

    foreach my $txid (@{$tx_order}) {
        print bed_line_for($txid, $chr_hash->{$txid}, $strand_hash->{$txid},
            $tx_exons->{$txid});
        print "\n";
    }
    return;
}

# Run the conversion only when executed as a script, not when loaded
# with require/do.
main(@ARGV) unless caller;

1;