0
|
1 #!/usr/bin/perl
|
|
2 use strict;
|
|
3 use warnings;
|
|
4
|
|
5 ##################################
|
|
6 # make_intragenic_junctions.pl
|
|
7 #
|
|
8 # This is an extension of MFB's perl code make_exon_junctions.pl
|
|
9 # Generates "abnormal" junctions between exons of a gene
|
|
10 # In the code GeneA, GeneB should be input as same gene
|
|
11 #
|
|
12 # RV last modified 4/9/2013
|
|
13 ##################################
|
|
14
|
|
15 if ($#ARGV!=5) {die "usage is 'perl make_exon_junctions.pl GeneA GeneB ref.annotation ref.map ref.fasta junL > output'\n";}
|
|
16 my $geneA = shift;
|
|
17 my $geneB = shift;
|
|
18 my $annotations = shift;
|
|
19 my $map = shift;
|
|
20 my $fasta = shift;
|
|
21 my $overlap = shift;
|
|
22
|
|
23 # Get transcript Ensembl IDs and orientations for GeneA and GeneB
|
|
24
|
|
25 my @ensembA;
|
|
26 my @ensembB;
|
|
27 my @orientationA;
|
|
28 my @orientationB;
|
|
29 my @seqidA;
|
|
30 my @seqidB;
|
|
31
|
|
32 my $testerA=0; my $testerB=0;
|
|
33 open (ANN, "<$annotations") or die "can't open Ensembl annotations\n";
|
|
34 while (my $text = <ANN>) {
|
|
35 chomp $text;
|
|
36 my @line = split " ", $text;
|
|
37 if ($line[3] eq $geneA) {push @ensembA, $line[1]; push @orientationA, $line[4]; push @seqidA, $line[0]; $testerA++;}
|
|
38 if ($line[3] eq $geneB) {push @ensembB, $line[1]; push @orientationB, $line[4]; push @seqidB, $line[0]; $testerB++;}
|
|
39 }
|
|
40 close (ANN);
|
|
41 if ($testerA==0 || $testerB==0) {die "couldn't find one of the genes\n";}
|
|
42
|
|
43 # Get exon lengths for each transcript for GeneA and GeneB
|
|
44
|
|
45 my @exon_length_A;
|
|
46 my @exon_length_B;
|
|
47
|
|
48 my $chr_A;
|
|
49 my $chr_B;
|
|
50
|
|
51 my @exon_end_A;
|
|
52 my @exon_start_B;
|
|
53
|
|
54 open (MAP, "<$map") or die "can't open Ensembl map\n";
|
|
55 while (my $text = <MAP>) {
|
|
56 chomp $text;
|
|
57 my @line = split " ", $text;
|
|
58 my $ensID = pop (@line);
|
|
59 pop @line;
|
|
60 for (my $i=0; $i<=$#ensembA; $i++) {
|
|
61 if ($ensembA[$i] eq $ensID) {
|
|
62 $chr_A = $line[0];
|
|
63 my $num_exons = ($#line+1)/3;
|
|
64 for (my $exonA=0; $exonA<$num_exons; $exonA++) {
|
|
65 my $length = $line[3*$exonA + 2] - $line[3*$exonA + 1] + 1;
|
|
66 if ($orientationA[$i] == 1) {
|
|
67 push @{$exon_length_A[$i]}, $length;
|
|
68 push @{$exon_end_A[$i]}, $line[3*$exonA + 2];
|
|
69 }
|
|
70 else {
|
|
71 unshift @{$exon_length_A[$i]}, $length;
|
|
72 unshift @{$exon_end_A[$i]}, $line[3*$exonA + 1];
|
|
73 }
|
|
74 }
|
|
75 }
|
|
76 }
|
|
77 for (my $i=0; $i<=$#ensembB; $i++) {
|
|
78 if ($ensembB[$i] eq $ensID) {
|
|
79 $chr_B = $line[0];
|
|
80 my $num_exons = ($#line+1)/3;
|
|
81 for (my $exonB=0; $exonB<$num_exons; $exonB++) {
|
|
82 my $length = $line[3*$exonB + 2] - $line[3*$exonB + 1] + 1;
|
|
83 if ($orientationB[$i] == 1) {
|
|
84 push @{$exon_length_B[$i]}, $length;
|
|
85 push @{$exon_start_B[$i]}, $line[3*$exonB + 1];
|
|
86 }
|
|
87 else {
|
|
88 unshift @{$exon_length_B[$i]}, $length;
|
|
89 unshift @{$exon_start_B[$i]}, $line[3*$exonB + 2];
|
|
90 }
|
|
91 }
|
|
92 }
|
|
93 }
|
|
94 }
|
|
95 close (MAP);
|
|
96
|
|
97 # Get sequence for each transcript (take reverse complement when necessary)
|
|
98
|
|
99 my @sequenceA;
|
|
100 my @sequenceB;
|
|
101
|
|
102 open (FHSEQ, "<$fasta") or die "can't open Ensembl map\n";
|
|
103 my $top = <FHSEQ>;
|
|
104 my $readID = 0;
|
|
105 while (my $text = <FHSEQ>) {
|
|
106 chomp $text;
|
|
107 if ($text =~ ">") { #WTG changed to update for specific ENST ID number which might not be in the same order in the fasta as in the annotation
|
|
108 $text=~ s/>//;
|
|
109 $readID=$text;
|
|
110 }
|
|
111 else {
|
|
112 for (my $i=0; $i<=$#seqidA; $i++) {
|
|
113 if ($seqidA[$i] == $readID) {
|
|
114 if ($sequenceA[$i]) {
|
|
115 $sequenceA[$i] = $sequenceA[$i].$text;
|
|
116 }
|
|
117 else {$sequenceA[$i] = $text;}
|
|
118 }
|
|
119 }
|
|
120 for (my $i=0; $i<=$#seqidB; $i++) {
|
|
121 if ($seqidB[$i] == $readID) {
|
|
122 if ($sequenceB[$i]) {
|
|
123 $sequenceB[$i] = $sequenceB[$i].$text;
|
|
124 }
|
|
125 else {$sequenceB[$i] = $text;}
|
|
126 }
|
|
127 }
|
|
128 }
|
|
129 }
|
|
130 close (FHSEQ);
|
|
131
|
|
132 for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) {
|
|
133 if ($orientationA[$txtA]==-1) {
|
|
134 $sequenceA[$txtA] = rc($sequenceA[$txtA]);
|
|
135 }
|
|
136 }
|
|
137 for (my $txtB=0; $txtB<=$#sequenceB; $txtB++) {
|
|
138 if ($orientationB[$txtB]==-1) {
|
|
139 $sequenceB[$txtB] = rc($sequenceB[$txtB]);
|
|
140 }
|
|
141 }
|
|
142
|
|
143
|
|
144 # Print sequences for each hypothetical exon junction (for each pair of transcripts)
|
|
145 my %junctions;
|
|
146 my %junctions_known;
|
|
147 for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) {
|
|
148 for (my $txtB=0; $txtB<=$#sequenceB; $txtB++) {
|
|
149
|
|
150 my $num_exon_A = $#{$exon_length_A[$txtA]}+1;
|
|
151 my $num_exon_B = $#{$exon_length_B[$txtB]}+1;
|
|
152
|
|
153 my $running_pos_A=0;
|
|
154 for (my $exonA=0; $exonA<$num_exon_A; $exonA++) {
|
|
155 $running_pos_A += $exon_length_A[$txtA][$exonA];
|
|
156 my $junction_start = $exon_end_A[$txtA][$exonA];
|
|
157 my $start = $running_pos_A - $overlap;
|
|
158 my $seqA;
|
|
159 if ($start >= 0) {
|
|
160 $seqA = substr($sequenceA[$txtA], $start, $overlap);
|
|
161 }
|
|
162 else {
|
|
163 $start=0;
|
|
164 my $tmp_length = $running_pos_A;
|
|
165 $seqA = substr($sequenceA[$txtA], $start, $tmp_length);
|
|
166 }
|
|
167 my $running_pos_B=0;
|
|
168 for (my $exonB=0; $exonB<$num_exon_B; $exonB++) {
|
|
169 my $junction_end = $exon_start_B[$txtB][$exonB];
|
|
170 my $start = $running_pos_B;
|
|
171 my $seqB = substr($sequenceB[$txtB], $start, $overlap);
|
|
172 $running_pos_B += $exon_length_B[$txtB][$exonB];
|
|
173 my $key= ">chr$chr_A\.$junction_start\.chr$chr_B\.$junction_end\n";
|
|
174 my $junction = $seqA.$seqB;
|
|
175 #print ">chr$chr_A\.$junction_start\.chr$chr_B\.$junction_end\n";
|
|
176 #print "$junction\n";
|
|
177 $junctions{$key}=$junction;
|
|
178 }
|
|
179 }
|
|
180 }
|
|
181
|
|
182 }
|
|
183
|
|
184 for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) {
|
|
185 my $num_exon_A = $#{$exon_length_A[$txtA]};
|
|
186
|
|
187 my $running_pos_A=0;
|
|
188 for (my $exonA=0; $exonA<$num_exon_A; $exonA++) {
|
|
189 $running_pos_A += $exon_length_A[$txtA][$exonA];
|
|
190 my $junction_start = $exon_end_A[$txtA][$exonA];
|
|
191 my $start = $running_pos_A - $overlap;
|
|
192 my $seqA;
|
|
193 if ($start >= 0) {
|
|
194 $seqA = substr($sequenceA[$txtA], $start, $overlap);
|
|
195 }
|
|
196 else {
|
|
197 $start=0;
|
|
198 my $tmp_length = $running_pos_A;
|
|
199 $seqA = substr($sequenceA[$txtA], $start, $tmp_length);
|
|
200 }
|
|
201 my $running_pos_B=0;
|
|
202 my $junction_end = $exon_start_B[$txtA][$exonA+1];
|
|
203 my $start1 = $running_pos_B;
|
|
204 my $seqB = substr($sequenceA[$txtA], $start1, $overlap);
|
|
205 #$running_pos_B += $exon_length_A[$txtA][$exonA+1];
|
|
206 my $key= ">chr$chr_A\.$junction_start\.chr$chr_A\.$junction_end\n";
|
|
207 my $junction = $seqA.$seqB;
|
|
208 #print "$junction\n";
|
|
209 $junctions_known{$key}=$junction;
|
|
210
|
|
211 }
|
|
212 }
|
|
213
|
|
214 my @disc;
|
|
215 my %count = ();
|
|
216 foreach my $element (keys (%junctions_known), keys (%junctions)) { $count{$element}++ }
|
|
217 foreach my $element (keys %count) {
|
|
218 if($count{$element} == 1){
|
|
219 #print $element;
|
|
220 push @disc , $element;
|
|
221 }
|
|
222 }
|
|
223
|
|
224 foreach my $junc(@disc){
|
|
225 if($junctions_known{$junc}){print $junc,$junctions_known{$junc},"\n";
|
|
226 }
|
|
227 else {print $junc,$junctions{$junc},"\n";
|
|
228 }
|
|
229
|
|
230 }
|
|
231
|
|
232
|
|
233 ##################
|
|
234 ##################
|
|
235 ### SUBROUTINES
|
|
236 ##################
|
|
237 ##################
|
|
238
|
|
239 sub rc{
|
|
240 my $dna = shift;
|
|
241 my $revcom = reverse($dna);
|
|
242 $revcom =~ tr/ACGTacgt/TGCAtgca/;
|
|
243 return $revcom;
|
|
244 }
|