annotate pyPRADA_1.2/make_exon_junctions.pl @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
1 #!/usr/bin/perl
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
2 use strict;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
3 use warnings;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
4
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
5 ##################################
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
6 # make_exon_junctions.pl
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
7 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
8 # Take as input Gene A and Gene B
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
9 # in fusion. Using Ensembl52
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
10 # mapping information and
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
11 # transcript sequences, outputs
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
12 # sequences for all possible
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
13 # exon junctions with the 3' end
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
14 # of Gene A fused to the 5' end
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
15 # of Gene B
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
16 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
17 # M. Berger, January 27, 2009
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
18 # WTG modified 7/10/2012
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
19 # RV last modified 4/9/2013
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
20 ##################################
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
21
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
22 if ($#ARGV!=5) {die "usage is 'perl make_exon_junctions.pl GeneA GeneB ref.annotation ref.map ref.fasta junL > output'\n";}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
23 my $geneA = shift;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
24 my $geneB = shift;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
25 my $annotations = shift;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
26 my $map = shift;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
27 my $fasta = shift;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
28 my $overlap = shift;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
29
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
30 # Get transcript Ensembl IDs and orientations for GeneA and GeneB
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
31
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
32 my @ensembA;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
33 my @ensembB;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
34 my @orientationA;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
35 my @orientationB;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
36 my @seqidA;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
37 my @seqidB;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
38
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
39 my $testerA=0; my $testerB=0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
40 open (ANN, "<$annotations") or die "can't open Ensembl annotations\n";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
41 while (my $text = <ANN>) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
42 chomp $text;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
43 my @line = split " ", $text;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
44 if ($line[3] eq $geneA) {push @ensembA, $line[1]; push @orientationA, $line[4]; push @seqidA, $line[0]; $testerA++;}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
45 if ($line[3] eq $geneB) {push @ensembB, $line[1]; push @orientationB, $line[4]; push @seqidB, $line[0]; $testerB++;}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
46 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
47 close (ANN);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
48 if ($testerA==0 || $testerB==0) {die "couldn't find one of the genes\n";}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
49
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
50 # Get exon lengths for each transcript for GeneA and GeneB
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
51
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
52 my @exon_length_A;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
53 my @exon_length_B;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
54
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
55 my $chr_A;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
56 my $chr_B;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
57
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
58 my @exon_end_A;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
59 my @exon_start_B;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
60
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
61 open (MAP, "<$map") or die "can't open Ensembl map\n";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
62 while (my $text = <MAP>) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
63 chomp $text;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
64 my @line = split " ", $text;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
65 my $ensID = pop (@line);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
66 pop @line;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
67 for (my $i=0; $i<=$#ensembA; $i++) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
68 if ($ensembA[$i] eq $ensID) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
69 $chr_A = $line[0];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
70 my $num_exons = ($#line+1)/3;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
71 for (my $exonA=0; $exonA<$num_exons; $exonA++) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
72 my $length = $line[3*$exonA + 2] - $line[3*$exonA + 1] + 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
73 if ($orientationA[$i] == 1) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
74 push @{$exon_length_A[$i]}, $length;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
75 push @{$exon_end_A[$i]}, $line[3*$exonA + 2];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
76 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
77 else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
78 unshift @{$exon_length_A[$i]}, $length;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
79 unshift @{$exon_end_A[$i]}, $line[3*$exonA + 1];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
80 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
81 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
82 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
83 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
84 for (my $i=0; $i<=$#ensembB; $i++) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
85 if ($ensembB[$i] eq $ensID) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
86 $chr_B = $line[0];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
87 my $num_exons = ($#line+1)/3;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
88 for (my $exonB=0; $exonB<$num_exons; $exonB++) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
89 my $length = $line[3*$exonB + 2] - $line[3*$exonB + 1] + 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
90 if ($orientationB[$i] == 1) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
91 push @{$exon_length_B[$i]}, $length;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
92 push @{$exon_start_B[$i]}, $line[3*$exonB + 1];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
93 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
94 else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
95 unshift @{$exon_length_B[$i]}, $length;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
96 unshift @{$exon_start_B[$i]}, $line[3*$exonB + 2];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
97 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
98 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
99 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
100 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
101 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
102 close (MAP);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
103
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
104 # Get sequence for each transcript (take reverse complement when necessary)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
105
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
106 my @sequenceA;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
107 my @sequenceB;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
108
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
109 open (FHSEQ, "<$fasta") or die "can't open Ensembl map\n";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
110 my $top = <FHSEQ>;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
111 my $readID = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
112 while (my $text = <FHSEQ>) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
113 chomp $text;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
114 if ($text =~ ">") { #WTG changed to update for specific ENST ID number which might not be in the same order in the fasta as in the annotation
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
115 $text=~ s/>//;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
116 $readID=$text;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
117 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
118 else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
119 for (my $i=0; $i<=$#seqidA; $i++) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
120 if ($seqidA[$i] == $readID) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
121 if ($sequenceA[$i]) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
122 $sequenceA[$i] = $sequenceA[$i].$text;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
123 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
124 else {$sequenceA[$i] = $text;}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
125 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
126 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
127 for (my $i=0; $i<=$#seqidB; $i++) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
128 if ($seqidB[$i] == $readID) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
129 if ($sequenceB[$i]) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
130 $sequenceB[$i] = $sequenceB[$i].$text;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
131 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
132 else {$sequenceB[$i] = $text;}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
133 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
134 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
135 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
136 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
137 close (FHSEQ);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
138
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
139 for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
140 if ($orientationA[$txtA]==-1) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
141 $sequenceA[$txtA] = rc($sequenceA[$txtA]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
142 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
143 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
144 for (my $txtB=0; $txtB<=$#sequenceB; $txtB++) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
145 if ($orientationB[$txtB]==-1) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
146 $sequenceB[$txtB] = rc($sequenceB[$txtB]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
147 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
148 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
149
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
150
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
151 # Print sequences for each hypothetical exon junction (for each pair of transcripts)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
152
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
153
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
154 for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
155 for (my $txtB=0; $txtB<=$#sequenceB; $txtB++) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
156
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
157 my $num_exon_A = $#{$exon_length_A[$txtA]}+1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
158 my $num_exon_B = $#{$exon_length_B[$txtB]}+1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
159
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
160 my $running_pos_A=0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
161 for (my $exonA=0; $exonA<$num_exon_A; $exonA++) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
162 $running_pos_A += $exon_length_A[$txtA][$exonA];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
163 my $junction_start = $exon_end_A[$txtA][$exonA];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
164 my $start = $running_pos_A - $overlap;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
165 my $seqA;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
166 if ($start >= 0) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
167 $seqA = substr($sequenceA[$txtA], $start, $overlap);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
168 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
169 else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
170 $start=0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
171 my $tmp_length = $running_pos_A;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
172 $seqA = substr($sequenceA[$txtA], $start, $tmp_length);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
173 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
174 my $running_pos_B=0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
175 for (my $exonB=0; $exonB<$num_exon_B; $exonB++) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
176 my $junction_end = $exon_start_B[$txtB][$exonB];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
177 my $start = $running_pos_B;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
178 my $seqB = substr($sequenceB[$txtB], $start, $overlap);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
179 $running_pos_B += $exon_length_B[$txtB][$exonB];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
180 print ">chr$chr_A\.$junction_start\.chr$chr_B\.$junction_end\n";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
181 my $junction = $seqA.$seqB;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
182 print "$junction\n";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
183 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
184 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
185 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
186 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
187
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
188
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
189
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
190 ##################
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
191 ##################
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
192 ### SUBROUTINES
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
193 ##################
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
194 ##################
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
195
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
196 sub rc{
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
197 my $dna = shift;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
198 my $revcom = reverse($dna);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
199 $revcom =~ tr/ACGTacgt/TGCAtgca/;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
200 return $revcom;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
201 }