Mercurial > repos > bioitcore > splicetrap
view bin/gtf2bed.pl @ 5:2ebca9da5e42 draft default tip
planemo upload
author | bioitcore |
---|---|
date | Thu, 07 Sep 2017 17:39:24 -0400 |
parents | adc0f7765d85 |
children |
line wrap: on
line source
# rewrite on Sep 7th,2022
# part of package SpliceTrap
# Jie Wu
#
# Convert the "exon" records of a GTF file into BED12 lines on STDOUT,
# one line per transcript_id.
#
# Usage: gtf2bed.pl <input.gtf>
#
# The input file is a GTF file (tab-separated). "transcript_id" is required
# on every exon line and should not be ambiguous: all exons sharing a
# transcript_id must lie on the same chromosome and strand. Only the "exon"
# lines are used; every other line (including comments) is ignored.
#
# Diagnostics (STDERR):
#   * dies if the file cannot be opened or an exon line has no transcript_id;
#   * warns and skips an exon whose chromosome or strand conflicts with the
#     one already recorded for its transcript_id;
#   * warns when two consecutive exon starts are more than 1,000,000 apart.
use strict;
use warnings;

my $inputfilename = $ARGV[0];
die "Usage: $0 <input.gtf>\n" unless defined $inputfilename;

my %chr_hash;       # $chr_hash{$txid}         = chromosome (GTF column 1)
my %strand_hash;    # $strand_hash{$txid}      = strand     (GTF column 7)
my %tx_exons;       # $tx_exons{$txid}{$start} = $size
my @tx_order;       # transcript ids in order of first appearance, so the
                    # output order is reproducible (hash order is random)

my $linenum = 0;
open(my $in, '<', $inputfilename)
    or die "Cannot open $inputfilename: $!\n";
while (my $line = <$in>) {
    $linenum++;
    $line =~ s/\r?\n\z//;    # drop the line terminator (LF or CRLF)
    my @fields = split /\t/, $line;

    # Comment lines, blank lines and non-exon features are not used.
    next unless defined $fields[2] and $fields[2] eq "exon";

    my $txid;
    if (defined $fields[8] and $fields[8] =~ /transcript_id "(\S*?)"/) {
        $txid = $1;
    }
    else {
        die ("$inputfilename format error! No transcript_id in line $linenum \n");
    }

    if (exists $chr_hash{$txid} and $chr_hash{$txid} ne $fields[0]) {
        warn ("$inputfilename: ambiguous transcript_id in line $linenum: $txid Skipped \n");
        next;
    }
    if (exists $strand_hash{$txid} and $strand_hash{$txid} ne $fields[6]) {
        warn ("$inputfilename: ambiguous transcript_id in line $linenum: $txid Skipped\n");
        # Actually skip the exon, as the message says; otherwise it would be
        # merged into the transcript and overwrite its recorded strand.
        next;
    }

    push @tx_order, $txid unless exists $chr_hash{$txid};
    $chr_hash{$txid}    = $fields[0];
    $strand_hash{$txid} = $fields[6];
    # GTF coordinates are 1-based and inclusive on both ends.
    $tx_exons{$txid}{$fields[3]} = $fields[4] - $fields[3] + 1;
}
close($in);

foreach my $txid (@tx_order) {
    # Exons ordered by start coordinate, with sizes in the same order.
    my @starts   = sort { $a <=> $b } keys %{ $tx_exons{$txid} };
    my @sizes    = @{ $tx_exons{$txid} }{@starts};
    my $exon_num = scalar(@sizes);

    # Flag suspiciously distant neighbouring exons. The reported value is the
    # distance between consecutive exon starts.
    for my $i (1 .. $#starts) {
        my $gap = $starts[$i] - $starts[$i - 1];
        warn "$txid, intron size..".$gap."\n" if $gap > 1000000;
    }

    # Block starts are relative to the first exon. The trailing comma is kept
    # so the output format stays identical to what this script always wrote.
    my $starts_str = join(",", map { $_ - $starts[0] } @starts) . ",";
    my $sizes_str  = join(",", @sizes);

    # 1-based inclusive end of the last exon == 0-based exclusive BED end.
    my $end = $starts[-1] + $sizes[-1] - 1;

    print join("\t",
        $chr_hash{$txid}, $starts[0] - 1, $end,
        $txid, "0", $strand_hash{$txid},
        $starts[0] - 1, $end,
        "255,0,0", $exon_num, $sizes_str, $starts_str);
    print "\n";
}