Mercurial > repos > siyuan > prada
view pyPRADA_1.2/make_intragenic_junctions.pl @ 3:f17965495ec9 draft default tip
Uploaded
author | siyuan |
---|---|
date | Tue, 11 Mar 2014 12:14:01 -0400 |
parents | acc2ca1a3ba4 |
children |
line wrap: on
line source
#!/usr/bin/perl use strict; use warnings; ################################## # make_intragenic_junctions.pl # # This is an extension of MFB's perl code make_exon_junctions.pl # Generates "abnormal" junctions between exons of a gene # In the code GeneA, GeneB should be input as same gene # # RV last modified 4/9/2013 ################################## if ($#ARGV!=5) {die "usage is 'perl make_exon_junctions.pl GeneA GeneB ref.annotation ref.map ref.fasta junL > output'\n";} my $geneA = shift; my $geneB = shift; my $annotations = shift; my $map = shift; my $fasta = shift; my $overlap = shift; # Get transcript Ensembl IDs and orientations for GeneA and GeneB my @ensembA; my @ensembB; my @orientationA; my @orientationB; my @seqidA; my @seqidB; my $testerA=0; my $testerB=0; open (ANN, "<$annotations") or die "can't open Ensembl annotations\n"; while (my $text = <ANN>) { chomp $text; my @line = split " ", $text; if ($line[3] eq $geneA) {push @ensembA, $line[1]; push @orientationA, $line[4]; push @seqidA, $line[0]; $testerA++;} if ($line[3] eq $geneB) {push @ensembB, $line[1]; push @orientationB, $line[4]; push @seqidB, $line[0]; $testerB++;} } close (ANN); if ($testerA==0 || $testerB==0) {die "couldn't find one of the genes\n";} # Get exon lengths for each transcript for GeneA and GeneB my @exon_length_A; my @exon_length_B; my $chr_A; my $chr_B; my @exon_end_A; my @exon_start_B; open (MAP, "<$map") or die "can't open Ensembl map\n"; while (my $text = <MAP>) { chomp $text; my @line = split " ", $text; my $ensID = pop (@line); pop @line; for (my $i=0; $i<=$#ensembA; $i++) { if ($ensembA[$i] eq $ensID) { $chr_A = $line[0]; my $num_exons = ($#line+1)/3; for (my $exonA=0; $exonA<$num_exons; $exonA++) { my $length = $line[3*$exonA + 2] - $line[3*$exonA + 1] + 1; if ($orientationA[$i] == 1) { push @{$exon_length_A[$i]}, $length; push @{$exon_end_A[$i]}, $line[3*$exonA + 2]; } else { unshift @{$exon_length_A[$i]}, $length; unshift @{$exon_end_A[$i]}, $line[3*$exonA + 1]; } } } } for (my $i=0; $i<=$#ensembB; $i++) { if ($ensembB[$i] eq $ensID) { $chr_B = $line[0]; my $num_exons = ($#line+1)/3; for (my $exonB=0; $exonB<$num_exons; $exonB++) { my $length = $line[3*$exonB + 2] - $line[3*$exonB + 1] + 1; if ($orientationB[$i] == 1) { push @{$exon_length_B[$i]}, $length; push @{$exon_start_B[$i]}, $line[3*$exonB + 1]; } else { unshift @{$exon_length_B[$i]}, $length; unshift @{$exon_start_B[$i]}, $line[3*$exonB + 2]; } } } } } close (MAP); # Get sequence for each transcript (take reverse complement when necessary) my @sequenceA; my @sequenceB; open (FHSEQ, "<$fasta") or die "can't open Ensembl map\n"; my $top = <FHSEQ>; my $readID = 0; while (my $text = <FHSEQ>) { chomp $text; if ($text =~ ">") { #WTG changed to update for specific ENST ID number which might not be in the same order in the fasta as in the annotation $text=~ s/>//; $readID=$text; } else { for (my $i=0; $i<=$#seqidA; $i++) { if ($seqidA[$i] == $readID) { if ($sequenceA[$i]) { $sequenceA[$i] = $sequenceA[$i].$text; } else {$sequenceA[$i] = $text;} } } for (my $i=0; $i<=$#seqidB; $i++) { if ($seqidB[$i] == $readID) { if ($sequenceB[$i]) { $sequenceB[$i] = $sequenceB[$i].$text; } else {$sequenceB[$i] = $text;} } } } } close (FHSEQ); for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) { if ($orientationA[$txtA]==-1) { $sequenceA[$txtA] = rc($sequenceA[$txtA]); } } for (my $txtB=0; $txtB<=$#sequenceB; $txtB++) { if ($orientationB[$txtB]==-1) { $sequenceB[$txtB] = rc($sequenceB[$txtB]); } } # Print sequences for each hypothetical exon junction (for each pair of transcripts) my %junctions; my %junctions_known; for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) { for (my $txtB=0; $txtB<=$#sequenceB; $txtB++) { my $num_exon_A = $#{$exon_length_A[$txtA]}+1; my $num_exon_B = $#{$exon_length_B[$txtB]}+1; my $running_pos_A=0; for (my $exonA=0; $exonA<$num_exon_A; $exonA++) { $running_pos_A += $exon_length_A[$txtA][$exonA]; my $junction_start = $exon_end_A[$txtA][$exonA]; my $start = $running_pos_A - $overlap; my $seqA; if ($start >= 0) { $seqA = substr($sequenceA[$txtA], $start, $overlap); } else { $start=0; my $tmp_length = $running_pos_A; $seqA = substr($sequenceA[$txtA], $start, $tmp_length); } my $running_pos_B=0; for (my $exonB=0; $exonB<$num_exon_B; $exonB++) { my $junction_end = $exon_start_B[$txtB][$exonB]; my $start = $running_pos_B; my $seqB = substr($sequenceB[$txtB], $start, $overlap); $running_pos_B += $exon_length_B[$txtB][$exonB]; my $key= ">chr$chr_A\.$junction_start\.chr$chr_B\.$junction_end\n"; my $junction = $seqA.$seqB; #print ">chr$chr_A\.$junction_start\.chr$chr_B\.$junction_end\n"; #print "$junction\n"; $junctions{$key}=$junction; } } } } for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) { my $num_exon_A = $#{$exon_length_A[$txtA]}; my $running_pos_A=0; for (my $exonA=0; $exonA<$num_exon_A; $exonA++) { $running_pos_A += $exon_length_A[$txtA][$exonA]; my $junction_start = $exon_end_A[$txtA][$exonA]; my $start = $running_pos_A - $overlap; my $seqA; if ($start >= 0) { $seqA = substr($sequenceA[$txtA], $start, $overlap); } else { $start=0; my $tmp_length = $running_pos_A; $seqA = substr($sequenceA[$txtA], $start, $tmp_length); } my $running_pos_B=0; my $junction_end = $exon_start_B[$txtA][$exonA+1]; my $start1 = $running_pos_B; my $seqB = substr($sequenceA[$txtA], $start1, $overlap); #$running_pos_B += $exon_length_A[$txtA][$exonA+1]; my $key= ">chr$chr_A\.$junction_start\.chr$chr_A\.$junction_end\n"; my $junction = $seqA.$seqB; #print "$junction\n"; $junctions_known{$key}=$junction; } } my @disc; my %count = (); foreach my $element (keys (%junctions_known), keys (%junctions)) { $count{$element}++ } foreach my $element (keys %count) { if($count{$element} == 1){ #print $element; push @disc , $element; } } foreach my $junc(@disc){ if($junctions_known{$junc}){print $junc,$junctions_known{$junc},"\n"; } else {print $junc,$junctions{$junc},"\n"; } } ################## ################## ### SUBROUTINES ################## ################## sub rc{ my $dna = shift; my $revcom = reverse($dna); $revcom =~ tr/ACGTacgt/TGCAtgca/; return $revcom; }