comparison bin/gtf2bed.pl @ 1:adc0f7765d85 draft

planemo upload
author bioitcore
date Thu, 07 Sep 2017 15:06:58 -0400
parents
children
comparison
equal deleted inserted replaced
0:d4ca551ca300 1:adc0f7765d85
# rewrite on Sep 7th,2022

#part of package SpliceTrap

#Jie Wu
use strict;
use warnings;

# Convert the "exon" records of a GTF file into one BED12 line per transcript.
#
# Usage: gtf2bed.pl <input.gtf> > output.bed
#
# Output goes to STDOUT; diagnostics (ambiguous transcript ids, very distant
# exons) go to STDERR. The script dies if the input cannot be opened or if an
# "exon" line carries no transcript_id attribute.

my $inputfilename = $ARGV[0];
die "Usage: $0 <input.gtf>\n" unless defined $inputfilename;

# input file is a gtf file,
# "transcript_id" is required for each line and should not be ambiguous.
# only the "exon" lines are used

my %chr_hash;       # $chr_hash{$txid}    = chromosome (GTF column 1)
my %strand_hash;    # $strand_hash{$txid} = strand (GTF column 7)
my %tx_exons;       # $tx_exons{$txid}{$start} = $size;

my $linenum = 0;

# Three-argument open on a lexical handle: the file name can never be parsed
# as a mode or a pipe, and a missing/unreadable file is reported instead of
# silently producing empty output.
open(my $input, '<', $inputfilename)
    or die "Cannot open $inputfilename: $!\n";

while (my $line = <$input>)
{
    $linenum++;
    chomp $line;
    my @fields = split("\t", $line);

    # Comment lines, blank lines and anything without a feature column are
    # not exon records.
    next unless defined $fields[2] and $fields[2] eq "exon";

    # A truncated exon line has no attribute column; treat it as empty so it
    # fails with the format error below rather than an undef warning.
    my $attributes = defined $fields[8] ? $fields[8] : "";

    my $txid;
    if ($attributes =~ /transcript_id "(\S*?)"/)
    {
        $txid = $1;
    }
    else
    {
        die ("$inputfilename format error! No transcript_id in line $linenum \n");
    }

    my ($chr, $start, $end, $strand) = @fields[0, 3, 4, 6];

    # The same transcript id on a different chromosome or strand is
    # ambiguous: keep what was seen first and drop the conflicting exon.
    if (exists $chr_hash{$txid} and $chr_hash{$txid} ne $chr)
    {
        warn ("$inputfilename: ambiguous transcript_id in line $linenum: $txid Skipped \n");
        next;
    }
    if (exists $strand_hash{$txid} and $strand_hash{$txid} ne $strand)
    {
        warn ("$inputfilename: ambiguous transcript_id in line $linenum: $txid Skipped\n");
        # The message says "Skipped", so really skip the line instead of
        # overwriting the recorded strand and adding the exon.
        next;
    }

    $chr_hash{$txid}       = $chr;
    $strand_hash{$txid}    = $strand;
    $tx_exons{$txid}{$start} = $end - $start + 1;
}

close($input);

# Emit transcripts in sorted id order so the output is reproducible from run
# to run (hash key order is not stable).
foreach my $txid (sort keys %chr_hash)
{
    # Exons ordered by their 1-based GTF start coordinate.
    my @starts   = sort { $a <=> $b } keys %{ $tx_exons{$txid} };
    my @sizes    = map { $tx_exons{$txid}{$_} } @starts;
    my $exon_num = scalar(@sizes);

    # NOTE(review): the value reported as "intron size" is the distance
    # between consecutive exon starts, not the gap between exons; kept as in
    # the original.
    for my $i (1 .. $#starts)
    {
        my $distance = $starts[$i] - $starts[$i-1];
        warn "$txid, intron size..".$distance."\n" if ($distance > 1000000);
    }

    # Block starts are relative to the first exon; the trailing comma is part
    # of the established output format.
    my $starts_str = join(",", map { $_ - $starts[0] } @starts) . ",";
    my $sizes_str  = join(",", @sizes);

    # BED is 0-based, half-open: shift the start down by one, keep the
    # 1-based inclusive end of the last exon as the exclusive end.
    my $bed_start = $starts[0] - 1;
    my $bed_end   = $starts[-1] + $sizes[-1] - 1;

    print join("\t", $chr_hash{$txid}, $bed_start, $bed_end, $txid, "0", $strand_hash{$txid}, $bed_start, $bed_end, "255,0,0", $exon_num, $sizes_str, $starts_str);
    print "\n";
}