# rewrite on Sep 7th,2022
#
# part of package SpliceTrap
#
# Jie Wu
#
# Converts the "exon" records of a GTF file into BED12, one line per
# transcript, written to STDOUT. Diagnostics go to STDERR.
#
# Usage: perl SCRIPT INPUT.gtf
#
# Input is a GTF file:
#   * transcript_id "..." is required on every "exon" line (the script dies
#     otherwise) and should not be ambiguous: all exons of a transcript must
#     lie on the same chromosome and strand. Lines that disagree with the
#     first exon seen for that transcript are skipped with a warning.
#   * only the "exon" lines are used; every other line is ignored.

use strict;
use warnings;

# Gaps between consecutive exons larger than this are reported on STDERR.
my $MAX_INTRON_SIZE = 1000000;

my $inputfilename = $ARGV[0];
die "Usage: $0 INPUT.gtf\n" unless defined $inputfilename;

my %chr_hash;       # $chr_hash{$txid}         = chromosome (GTF column 1)
my %strand_hash;    # $strand_hash{$txid}      = strand (GTF column 7)
my %tx_exons;       # $tx_exons{$txid}{$start} = exon size in bases

my $linenum = 0;

# Three-argument open with a lexical handle; fail loudly instead of silently
# producing empty output when the file is missing or unreadable.
open(my $input, '<', $inputfilename)
    or die "Cannot open $inputfilename: $!\n";

while (defined(my $line = readline($input))) {
    $linenum++;
    chomp $line;

    my @fields = split(/\t/, $line);

    # Comment lines, blank lines and truncated records have no feature column.
    next unless defined $fields[2] and $fields[2] eq "exon";

    my ($chr, $start, $end, $strand) = @fields[0, 3, 4, 6];

    my $txid;
    if (defined $fields[8] and $fields[8] =~ /transcript_id "(\S*?)"/) {
        $txid = $1;
    }
    else {
        die("$inputfilename format error! No transcript_id in line $linenum \n");
    }

    # The first exon seen for a transcript fixes its chromosome and strand;
    # later exons that disagree are dropped.
    if (exists $chr_hash{$txid} and $chr_hash{$txid} ne $chr) {
        warn("$inputfilename: ambiguous transcript_id in line $linenum: $txid Skipped \n");
        next;
    }
    if (exists $strand_hash{$txid} and $strand_hash{$txid} ne $strand) {
        warn("$inputfilename: ambiguous transcript_id in line $linenum: $txid Skipped\n");
        # Actually skip the line, as the message says; without this the
        # stored strand was overwritten and the exon was still recorded.
        next;
    }

    $chr_hash{$txid}       = $chr;
    $strand_hash{$txid}    = $strand;
    $tx_exons{$txid}{$start} = $end - $start + 1;    # GTF coordinates are 1-based, inclusive
}

close($input);

# Emit one BED12 line per transcript. Keys are sorted so that the output
# order is reproducible between runs (hash order is not).
foreach my $txid (sort keys %chr_hash) {
    my @starts   = sort { $a <=> $b } keys %{ $tx_exons{$txid} };
    my @sizes    = map { $tx_exons{$txid}{$_} } @starts;
    my $exon_num = scalar(@starts);

    # Report suspiciously long introns: the gap between the end of the
    # previous exon and the start of the current one.
    for my $i (1 .. $#starts) {
        my $intron_size = $starts[$i] - ($starts[$i - 1] + $sizes[$i - 1]);
        warn "$txid, intron size..$intron_size\n" if $intron_size > $MAX_INTRON_SIZE;
    }

    # blockStarts are offsets from the first exon start; each entry keeps its
    # trailing comma, exactly as in the previous output format.
    my $starts_str = join("", map { ($_ - $starts[0]) . "," } @starts);
    my $sizes_str  = join(",", @sizes);

    # BED is 0-based, half-open: start shifts down by one, end stays as the
    # 1-based inclusive end of the last exon.
    my $bed_start = $starts[0] - 1;
    my $bed_end   = $starts[-1] + $sizes[-1] - 1;

    print join("\t",
        $chr_hash{$txid}, $bed_start, $bed_end, $txid, "0", $strand_hash{$txid},
        $bed_start, $bed_end, "255,0,0", $exon_num, $sizes_str, $starts_str);
    print "\n";
}