annotate bin/gtf2bed.pl @ 5:2ebca9da5e42 draft default tip

planemo upload
author bioitcore
date Thu, 07 Sep 2017 17:39:24 -0400
parents adc0f7765d85
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
1 # rewrite on Sep 7th,2022
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
2
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
3 #part of package SpliceTrap
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
4
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
5 #Jie Wu
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
6 use strict;
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
7
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
8 my $inputfilename = $ARGV[0];
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
9
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
10 # input file is a gtf file,
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
11 # "transcript_id" is required for each line and should not be ambiguous.
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
12 # only the "exon" lines are used
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
13
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
14 my %chr_hash;
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
15 my %strand_hash;
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
16 my %tx_exons; #tx_exons{$tx_id){$start} = $size;
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
17
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
18 my $linenum = 0;
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
19
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
20 open(input, $inputfilename);
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
21
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
22 while(my $line=<input>)
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
23 {
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
24 $linenum++;
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
25 my @a = split("\t",$line);
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
26 if ($a[2] eq "exon")
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
27 {
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
28 my $txid;
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
29 if($a[8]=~/transcript_id "(\S*?)"/)
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
30 {
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
31 $txid = $1;
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
32 }
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
33 else
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
34 {
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
35 die ("$inputfilename format error! No transcript_id in line $linenum \n");
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
36 }
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
37
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
38 if( exists $chr_hash{$txid} and $chr_hash{$txid} ne $a[0])
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
39 {
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
40 warn ("$inputfilename: ambiguous transcript_id in line $linenum: $txid Skipped \n");
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
41 next;
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
42 }
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
43 if( exists $strand_hash{$txid} and $strand_hash{$txid} ne $a[6])
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
44 {
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
45 warn ("$inputfilename: ambiguous transcript_id in line $linenum: $txid Skipped\n");
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
46 }
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
47 $chr_hash{$txid} = $a[0];
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
48 $strand_hash{$txid} = $a[6];
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
49 $tx_exons{$txid}{$a[3]} = $a[4] - $a[3] +1;
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
50
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
51 }
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
52
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
53 }
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
54
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
55 foreach my $txid (keys %chr_hash)
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
56 {
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
57 my @starts;
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
58 my @sizes;
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
59 foreach my $start (sort {$a<=>$b} (keys %{$tx_exons{$txid}} ) )
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
60 {
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
61 push (@starts, $start);
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
62 push (@sizes, $tx_exons{$txid}{$start});
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
63 }
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
64 my $exon_num = scalar(@sizes);
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
65 my $starts_str = "";
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
66 for(my $i = 0; $i < $exon_num; $i++)
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
67 {
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
68 $starts_str = $starts_str.($starts[$i] - $starts[0]).",";
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
69 if($i>0)
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
70 {
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
71 warn "$txid, intron size..".($starts[$i]-$starts[$i-1])."\n" if ($starts[$i]-$starts[$i-1]>1000000);
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
72 }
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
73 }
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
74 my $sizes_str = join(",",@sizes);
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
75 my $end = $starts[$exon_num-1] + $sizes[$exon_num-1] -1;
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
76 print join("\t",$chr_hash{$txid}, $starts[0]-1, $end, $txid,"0",$strand_hash{$txid},$starts[0]-1, $end, "255,0,0",$exon_num,$sizes_str, $starts_str);
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
77 print "\n";
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
78 }
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
79
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
80
adc0f7765d85 planemo upload
bioitcore
parents:
diff changeset
81 close(input);