Mercurial > repos > alermine > nebula
view [APliBio]Nebula tools suite/Nebula/AnnotatePeaks/annotatePeaks.pl @ 3:1c699789d6d3 draft
Uploaded
author | alermine |
---|---|
date | Wed, 14 Nov 2012 06:02:48 -0500 |
parents | 2ec3ba0e9e70 |
children |
line wrap: on
line source
#:t:::::::::::::::::g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ #:t::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ #:::::::::::::z;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ #::::::::::::i@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ #::::::::::::@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@$@@@@ #:::::::::::3@@@@@@@@@@@@@@@@@@@@@@@@@B@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ #::::::::::3@@@@@@@@@@@@@@@@@@@@@BEEESSE5EEEEBBM@@@@@@@@@@@@@@@@@@@@@@@@@@ #::::::::::3@@@@@@@@@@@@@@@@@@@@BEEEEEE35EE55E2355E5SBMB@@@@@@@@@@@@@@@@@$ #::::::::::@@@@@@@@@@@@@@@@@@@EEEE55533t3tttt::::::!!!!7755E755SBBMMM@@@MM #::::::::::3@@@@@@@@@@@@@@@@@@EEEE2t3ttttt:::::::::::::::::::::::!7?5225EE #::::::::::3@@@@@@@@@@@@@@@@@@EEEEE31t::::::::::::::::::::::::::::::::3E5@ #::::::::::3@@@@@@@@@@@@@@@@@@EEEEEEtt:::::::::::::::::::::::::::::::::353 #::::::::::3@@@@@@@@@@@@@@@@@@EEEEEE1ttz::::::::::::::::::::::::::::::::35 #:::::::::::@@@@@@@@@@@@@@@@@@EEEEEEEtz1::::::::::::::::::::::::::::::::t: #:::::::::!3@@@@@@@@@@@@@@@@@@@EEEEEttt::::::::::::::::::::::::::::::::;zz #::::::::::@@@@@@@@@@@@@@@@@@@@EEEEEttt:::::z;z:::::::::::::::::::::::::13 #::::::::::3B@@@@@@@@@@@@@@@@@@EEEEEEE3tt:czzztti;:::::::::::::::::::::::3 #::::ttt::::3@@@@@@@@@@@@@@@@EEEEE5EE25Ezt1EEEz5Etzzz;;;;::::::::::::::::: #:::::::::::I9@@@@@@@@@@@@@@@@@@@@@@@@@@EEEEEE@@@@@@@@@@@@@@Ez;::::::::::: #:::::::::::::E@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@Ez:::::: #::::::::::::::E@@@@@@@@@@@@@@@@@@@@@@@@@@@@@BE5EBB@@@@@@@@@@@@@@@EEE::::: #:::::::::::::::@@@@@@@@@@@@@@@@@@@@@@@@@@@@E1::35@@@@@@@@@@ME3MMME2:::::: #:::::::::::::::?@@@@@@@@@@@@@@@@@@M@@@@@@@EE:::::3SB@@BBESEEt:::::::::::: #::::::::::::::::J$@@@@@@@B@@@@@@@@@@@@@@@@EE:::::::!35E33t::::::::::::::: #:::::::::::::::::3@E@@@EE5EESE5EESE@@@@@@@Et::::::::::::tz::::::::::::::: #:::::::::::::::::J@E$@EEE5133555SE@@@@@@@@Et::::::::::::::::::::::::::::: #::::::::::::::::::E@E@EEEEtt3523EEE@@@@@@@E:::::::::::::::::::::::::::::: #:t::::::::::::::::JEE3@@@EEEEEEEEEE@@@@@@@E:::::::::t;::::::::::::::::::: #:t:::::::::::::::::!5ES@EEEEEEEEES@@@@@@@@@E;:::;;;:3Ez:::::::::::::::::: #:t::::::::::::::::::::JE@@EEEEEEE@@@@@@@@@@@@@@@@ME!:::;::::::::::::::::: #:tz::::::::::::::::::::JE@@@EEEE@@@@@@@@@@@@@@EE!:::::::t:::::::::::::::: #:t::::::::::::::::::::::3@@@@@@@@@@@@@@@@@@ESBE:::::::::::::::::::::::::: #:::::::::::::::::::::::::Q@@@@@@@@@@@@@@@@EE3EE;:::::zzzz:::::::::::::::: #:::::::::::::::::::::::::3@@@@@@@@@@@@@@@@@@@@@@NN@@@@@@Ez::::::::::::::: #:zt:::::::::::::::::::::::3@@@@EE@@@@@@@@@@EEEEt::;z113E5t::::::::::::::: #::tt:::::::::::::::::::::::3@@@E@@@@@@@@@@@@@@@@BEt::::::::::::::::t::::: #:tt:t:::::::::::::::::::::::?S@@@@@@@@@@@BBEEE51!::::::::::::::zzzEt::::: #::::::::::::::::::::::::::::::3Q@@@@@@@BEEEEEt:::::::::::::;zz@@@EE:::::: #::::::::::::::::::::::::::::::::75B@@@@@EEEtt;:::::::::;zz@@@@BEEEtz::::: #::::::::::::::::::::::::::::::::::::?9@@@@@@@@@@@E2Ezg@@@@@B@@@EEEE1t:::: #:::::::::::::::::::::::::::::::::::::::3@@@@@@@@@@@@@@@@@@@E@EEEEEEEzzz:: #::::::::::::::::::::::::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@EEEEEEE5ttttt #:::::::::::::::::::::::::::::::;g@@@@@@@@@@@@@@@@@@@@@@@@@@EEEEEEEEEEEtzt #::::::::::::::::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@E@@EEEEEEEEEEEE@@@ #::::::::::::::::::::::::::g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@EEEE3EEEE@@@@@@@ #:::::::::::::::::::::;;g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@EEEt33@@@@@@@@@@ #:::::::::::::::::;g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@E@@@@@@EEEtg@@@@@@@@@@@@ #::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@EEEE@@@@@@@@@@@@@@@@@@@@@@@@ #:::::::::::::@@@@@@@@@@@@@@@@@$@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ #::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ # # Copyleft ↄ⃝ 2012 Institut Curie # Author(s): Valentina Boeva, Alban Lermine (Institut Curie) 2012 # Contact: valentina.boeva@curie.fr, alban.lermine@curie.fr # This software is distributed under the terms of the GNU General # Public License, either Version 2, June 1991 or Version 3, June 2007. #!/usr/bin/perl -w #for a complete list of genes and a list of sites, sorts genes close to sites #printDistance #read only gene files with refSeqGenes #counts only once overlapping transcripts or genes #calculates some stats about locations of peaks #uses hierarchie : promoter, imUpstream, intragenic,enh, 5kbdownstream, intergenic + for genes: fExon,exon,fIntron,intron,junction+-50kb ONLY FOR "findClosestGene" #outputs fold change of expression is available #outputname - corrected #all isoforms are considered, even those that start and end at the same coordinates #uses fluorescence values #only one location per gene in mode "all" (but promoter+intragenic if both) #read directly Noresp/down/up genes #prints distance to TE use strict; use POSIX; my $SitesFilename ; my $GenesFilename ; my $SelectedGenesFilename = ""; my $MirFilename = ""; my $minScore = 0; my $BIGNUMBER = 10000000000; my $enhLeft = -30000; my $enhRight = -1500; my $promLeft = $enhRight ; my $immediateDownstream = 2000; my $maxScore = $BIGNUMBER ; my $verbose = 0; my $maxDistance = $BIGNUMBER ; my $downStreamDist = 5000; my $jonctionSize = 50; my $all = 0; my $fluoFile = ""; my $regType = ""; my $GCislands = ""; my $ResFilename = ""; my $usage = qq{ $0 ----------------------------- mandatory parameters: -g filename file with all genes -tf filename file with sites of TF ----------------------------- optional parameters: -v for verbose -mir filename file with positions of miRNA -maxDist value maximal Distance to genes (def. Inf) -minScore value minimal Score (def. 0) -maxScore value maximal Score (def. Inf) -selG filename file with selected genes and their expression levels -all to output all genes intersecting with the peak -fluo filename file with fluorescence -regType valeu down-regulated, up-regulated, no-response -gc filename file with gc-islands -lp valeu upstream of TSS region to define promoter -rp valeu downstream of TSS region to define immediate downstream -enh valeu upstream of TSS region to define enhancer -dg valeu downstream of transcription end region to define gene downstream -o filename output filename }; if(scalar(@ARGV) == 0){ print $usage; exit(0); } while(scalar(@ARGV) > 0){ my $this_arg = shift @ARGV; if ( $this_arg eq '-h') {print "$usage\n"; exit; } elsif ( $this_arg eq '-g') {$GenesFilename = shift @ARGV;} elsif ( $this_arg eq '-tf') {$SitesFilename = shift @ARGV;} elsif ( $this_arg eq '-mir') {$MirFilename = shift @ARGV;} elsif ( $this_arg eq '-selG') {$SelectedGenesFilename = shift @ARGV;} elsif ( $this_arg eq '-maxDist') {$maxDistance = 1;} elsif ( $this_arg eq '-maxScore') {$maxScore = shift @ARGV;} elsif ( $this_arg eq '-minScore') {$minScore = shift @ARGV;} elsif ( $this_arg eq '-v') {$verbose = 1;} elsif ( $this_arg eq '-all') {$all = 1;} elsif ( $this_arg eq '-regType') {$regType = shift @ARGV;} elsif ( $this_arg eq '-fluo') {$fluoFile = shift @ARGV;} elsif ( $this_arg eq '-gc') {$GCislands = shift @ARGV;} elsif ( $this_arg eq '-o') {$ResFilename = shift @ARGV;} elsif ( $this_arg eq '-lp') {$enhRight = shift @ARGV;$promLeft = $enhRight;} elsif ( $this_arg eq '-rp') {$immediateDownstream = shift @ARGV;} elsif ( $this_arg eq '-enh') {$enhLeft = shift @ARGV;} elsif ( $this_arg eq '-dg') {$downStreamDist = shift @ARGV;} elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";} } if ($maxDistance<0) { die "\tThe maximal distance must be positive!\n"; } my $tmpn = $SitesFilename; $tmpn =~ s/.*[\/\\]//; if ($ResFilename eq "") { $ResFilename = $SelectedGenesFilename.$tmpn."_".$minScore."_dists.txt"; if ($maxScore != $BIGNUMBER) { $ResFilename = $SelectedGenesFilename.$SitesFilename."_".$minScore."_$maxScore"."_dists.txt"; } if ($all == 1) { $ResFilename =~ s/_dists/_all_dists/; } } #-----------read selected genes---------------- my %selectedGenes; my %selectedGenesFoldChange; if ( $SelectedGenesFilename ne "") { open (FILE, "<$SelectedGenesFilename ") or die "Cannot open file $SelectedGenesFilename !!!!: $!"; while (<FILE>) { chomp; my @a = split/\t/; $selectedGenes{$a[1]} = $a[3]; $selectedGenesFoldChange{$a[1]} = $a[2]; #print "gene:$a[1],reg:$selectedGenes{$a[1]},FC:$selectedGenesFoldChange{$a[1]}\n"; } close FILE; print "\t\t$SelectedGenesFilename is read!\n" if ($verbose); } #-----------read genes with fluorescence--------- my %fluoGenes; if ( $fluoFile ne "") { open (FILE, "<$fluoFile ") or die "Cannot open file $fluoFile !!!!: $!"; my $gene = ""; my $med = 0; my %h; while (<FILE>) { chomp; my @a = split/\t/; next if (scalar(@a)<5); next if ($a[0] eq "probesets"); next unless ($a[0] =~m/\S/); next unless ($a[4] =~m/\S/); if ($gene ne "" && $gene ne $a[2]) { #calcMed $med = med(keys %h); $fluoGenes{$gene} = $med; $med=0; delete @h{keys %h}; } else { #$h{$a[4]} = 1; } $gene = $a[2]; $h{$a[4]} = 1; #print "keys ", scalar(keys %h),"\t",keys %h,"\n"; } if ($gene ne "") { $med = med(keys %h); $fluoGenes{$gene} = $med; } close FILE; print "\t\t$fluoFile is read!\n" if ($verbose); } #-----------read GC-islands---------------- my %GCislands; if ($GCislands ne "") { open (FILE, "<$GCislands ") or die "Cannot open file $GCislands !!!!: $!"; while (<FILE>) { chomp; my @a = split/\t/; #bin chrom chromStart chromEnd name length cpgNum gcNum perCpg perGc obsExp #107 chr1 36568608 36569851 CpG: 128 1243 128 766 20.6 61.6 1.09 my $chr = $a[1]; my $start = $a[2]; my $end = $a[3]; $GCislands{$chr}->{$start}=$end; } close FILE; } elsif ($verbose) { print "you did not specify a file with GC-islands\n"; } #-----------read genes---------------- my %genes; my $count = 0; open (GENES, "<$GenesFilename") or die "Cannot open file $GenesFilename!!!!: $!"; <GENES>; while (<GENES>) { chomp; if (/(chr.*)\s([+-])\s(\d+)\s(\d+)\s(\d+)\s(\d+)\s(\d+)\s(\S+)\s(\S+)\s\S+\s(\S+)/){ my $name = $10; my $chr = $1; my $strand = $2; if ($strand eq '+') { $strand = 1; } else { $strand = -1; } my $leftPos = $3; my $rightPos = $4; my $cdsStart= $5; my $cdsEnd= $6; my $exonCount= $7; my $exonStarts= $8; my $exonEnds= $9; my $ID = "$name\t$chr:$leftPos-$rightPos;$exonStarts-$exonEnds"; my $foldChange = 1; my $reg = "NA"; my $fluo = "NA"; if ( $SelectedGenesFilename ne "") { #print "$name\t"; if (exists($selectedGenes{$name})) { $reg = $selectedGenes{$name}; $foldChange = $selectedGenesFoldChange{$name}; } } unless (exists($genes{$chr})) { my %h; $genes{$chr} = \%h; } unless (exists($genes{$chr}->{$ID})) { my %h1; $genes{$chr}->{$ID} = \%h1; $count++; } if ( $fluoFile ne "") { if (exists($fluoGenes{$name})) { $fluo = $fluoGenes{$name}; } } $genes{$chr}->{$ID}{'name'} = $name ; $genes{$chr}->{$ID}{'left'} = $leftPos ; $genes{$chr}->{$ID}{'right'} = $rightPos ; $genes{$chr}->{$ID}{'cdsStart'} = $cdsStart; $genes{$chr}->{$ID}{'cdsEnd'} = $cdsEnd; $genes{$chr}->{$ID}{'strand'} = $strand; $genes{$chr}->{$ID}{'exonCount'} = $exonCount; $genes{$chr}->{$ID}{'exonStarts'} = $exonStarts; $genes{$chr}->{$ID}{'exonEnds'} = $exonEnds; $genes{$chr}->{$ID}{'TSS'} = ($strand == 1) ? $leftPos :$rightPos ; $genes{$chr}->{$ID}{'TE'} = ($strand == -1) ? $leftPos :$rightPos ; $genes{$chr}->{$ID}{'reg'} = $reg; $genes{$chr}->{$ID}{'foldChange'} = $foldChange; $genes{$chr}->{$ID}{'length'} = abs ($leftPos-$rightPos); ($genes{$chr}->{$ID}{'firstIntronStart'},$genes{$chr}->{$ID}{'firstIntronEnd'}) = getFirstIntron ($exonCount,$exonStarts,$exonEnds,$strand); ($genes{$chr}->{$ID}{'firstExonStart'},$genes{$chr}->{$ID}{'firstExonEnd'}) = getFirstExon ($exonCount,$exonStarts,$exonEnds,$strand); $genes{$chr}->{$ID}{'fluo'} = $fluo; $genes{$chr}->{$ID}{'GCisland'} = 0; if ($GCislands ne "") { $genes{$chr}->{$ID}{'GCisland'} = checkIfGC ($genes{$chr}->{$ID}{'TSS'},$strand,2000,$GCislands{$chr}); } } } print "Total genes : $count\n"; close GENES; print "\t\t$GenesFilename is read!\n" if ($verbose); #for my $gName (sort keys %{$genes{'chr18'}}) { # print "$gName\t$genes{'chr18'}->{$gName}{'TSS'}\n"; #} #-----------read file with sites miRNA, store as genes----- if ( $MirFilename eq ""){ print "you did not specify file with miRNA\n" if ($verbose);; } else { open (MIR, "<$MirFilename ") or die "Cannot open file $MirFilename !!!!: $!"; #chr1 20669090 20669163 mmu-mir-206 960 + while (<MIR>) { chomp; my ($name, $chr, $leftPos, $rightPos, $strand ); my $fluo = "NA"; #1 . miRNA 20669091 20669163 . + . ACC="MI0000249"; ID="mmu-mir-206"; if (/([0-9XYM]+)\s.\smiRNA\s(\d+)\s(\d+)\s.\s([+-])\s.\sACC=.*ID=\"(.*)\"/) { $name = $5; $chr = $1; $leftPos = $2; $rightPos = $3; $strand = $4; } elsif (/(.*)\s(\d+)\s(\d+)\s(.*)\s(.*)\s(.*)/){ $name = $4; $chr = $1; $leftPos = $2; $rightPos = $3; $strand = $6; } else { next; } unless ($chr =~ m/chr/) { $chr = "chr".$chr; } my $ID = "$name\t$chr:$leftPos-$rightPos"; if ($strand eq '+') { $strand = 1; } else { $strand = -1; } unless (exists($genes{$chr})) { my %h; $genes{$chr} = \%h; } unless (exists($genes{$chr}->{$ID})) { my %h1; $genes{$chr}->{$ID} = \%h1; $count++; } $genes{$chr}->{$ID}{'name'} = $name ; $genes{$chr}->{$ID}{'left'} = $leftPos ; $genes{$chr}->{$ID}{'right'} = $rightPos ; $genes{$chr}->{$ID}{'cdsStart'} = $leftPos ; $genes{$chr}->{$ID}{'cdsEnd'} = $rightPos ; $genes{$chr}->{$ID}{'strand'} = $strand; $genes{$chr}->{$ID}{'length'} = abs ($leftPos-$rightPos); $genes{$chr}->{$ID}{'exonCount'} = 1; $genes{$chr}->{$ID}{'exonStarts'} = $leftPos ; $genes{$chr}->{$ID}{'exonEnds'} = $rightPos ; $genes{$chr}->{$ID}{'TSS'} = ($strand == 1) ? $leftPos :$rightPos ; $genes{$chr}->{$ID}{'TE'} = ($strand == -1) ? $leftPos :$rightPos ; $genes{$chr}->{$ID}{'reg'} = "miRNA"; $genes{$chr}->{$ID}{'foldChange'} = 1; ($genes{$chr}->{$ID}{'firstIntronStart'},$genes{$chr}->{$ID}{'firstIntronEnd'}) = (0,0); ($genes{$chr}->{$ID}{'firstExonStart'},$genes{$chr}->{$ID}{'firstExonEnd'}) = (0,0); $genes{$chr}->{$ID}{'fluo'} = $fluo; $genes{$chr}->{$ID}{'GCisland'} = 0; } close MIR; print "\t\t$MirFilename is read!\n" if ($verbose) ; } #-----------read file with sites and find N closest genes----- my $numberOfAllSites = 0; open (FILE, "<$SitesFilename") or die "Cannot open file $SitesFilename!!!!: $!"; open (OUT , ">$ResFilename") or die "Cannot open file $ResFilename!!!!: $!"; print OUT "Chromosome\tStart\tEnd\tMax\tScore\tDistTSS\tType\tTypeIntra\tReg\tFoldChange\tDistTE\tGeneName\tGeneCoordinates\n" ; my $header = <FILE>; #read header my @a = split (/\t/,$header ); my $correction = 0; if ($a[1] =~m/chr/) { $correction=1; } unless ($header=~m/Chromosome/ || $header=~m/track/|| $header=~m/Start/i) { close FILE; open (FILE, "<$SitesFilename") or die "Cannot open file $SitesFilename!!!!: $!"; } $count = 0; my %typehash; my %typehashIntra; $typehashIntra{"f_exon"} = 0; $typehashIntra{"f_intron"} = 0; $typehashIntra{"jonction"} = 0; $typehashIntra{"exon"} = 0; $typehashIntra{"intron"} = 0; while (<FILE>) { chomp; $numberOfAllSites++; my @a = split /\t/; my $chro = $a[0+$correction]; my $fpos = $a[1+$correction]; my $lpos = $a[2+$correction]; my $maxpos = $a[3+$correction]; if ($maxpos =~ m/\D/) { $maxpos = int(($lpos+$fpos)/2); } my $score = $a[4+$correction]; next if ($score < $minScore); next if ($score >= $maxScore); $score = $score +1 -1; #print "$score" ; #my $site = "$chro:$fpos-$lpos\t$maxpos\t$score"; #print $site ; #my $motifNumber = $a[3]; #my $geneSet = &search_genes($N,$chro,($fpos+$lpos)/2,\%genes); my @b; unless ($all) { @b = &findClosestGene($fpos,$lpos,$score,$chro,$maxpos,\%genes); #findAllClosestGene for all genes overlaping a peak } else { @b = &findAllClosestGeneOneLocation ($fpos,$lpos,$score,$chro,$maxpos,\%genes); #findAllClosestGene } #print "$distTSS\t$distTE\n" unless ($distTSS == $BIGNUMBER); for my $type (@b) { $typehash{$type} = 0 unless (exists($typehash{$type})); $typehash{$type}++; } $count ++; } close FILE; close OUT ; print "Total Sites: $count\n"; my $nEntries = 0; for my $type (sort keys %typehash) { $nEntries += $typehash{$type}; } print "Type\tCount\tFrequency\n"; for my $type (sort keys %typehash) { print $type,"\t",$typehash{$type},"\t",$typehash{$type}/$nEntries,"\n"; } for my $type (sort keys %typehashIntra) { print $type,"\t",$typehashIntra{$type},"\t",$typehashIntra{$type}/$nEntries,"\n"; } #my @arr = @{$genes{'chr'}}; sub findClosestGene { my ($fpos,$lpos,$score,$chro,$pos,$genes) = @_; my ($distTSS,$distTE,$type,$reg,$geneName,$foldChange) = ($BIGNUMBER,$BIGNUMBER,"intergenic",0,0,0); my $locDistTSS = 0; my $locDistTE = 0; my %hashForOverlapingGenes; my @array2return; my $typeIntra = "NA"; for my $gName (keys %{$genes->{$chro}}) { my $TSS = $genes->{$chro}{$gName}{'TSS'}; my $strand = $genes->{$chro}{$gName}{'strand'}; my $TE = $genes->{$chro}{$gName}{'TE'}; $locDistTE = ($pos-$TE)*$strand; $locDistTSS = ($pos-$TSS)*$strand; #print "$pos-$genes->{$chro}{$gName}{'TSS'})*$genes->{$chro}{$gName}{'strand'} = $locDistTSS\n"; #my $locDistTE = ($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'}; #print "$gName $maxDistance $pos-$genes->{$chro}{$gName}{'TSS'} $genes->{$chro}{$gName}{'strand'}\n"; #print "$locDistTSS $enhLeft $enhRight\n"; if (($locDistTSS>= $enhLeft)&&($locDistTSS<$enhRight)) { $type = "enhancer"; } elsif (($locDistTSS>= $enhRight)&&($locDistTSS<=0)) { $type = "promoter"; } elsif (($locDistTSS> 0)&&($locDistTSS<=$immediateDownstream)&&($locDistTE<=0)) { $type = "immediateDownstream"; } elsif (($locDistTSS >= $immediateDownstream)&&($locDistTE<=0)) { $type = "intragenic"; } elsif (($locDistTE >= 0)&&($locDistTE<=$downStreamDist)) { $type = "5kbDownstream"; } elsif (abs($locDistTSS)<abs($distTSS)) { $distTSS = $locDistTSS; $distTE = $locDistTE; ($reg,$geneName,$foldChange) = ($genes->{$chro}{$gName}{'reg'},$gName,$genes->{$chro}{$gName}{'foldChange'}); } if (($type ne "intergenic")&&($type ne "NA")) { unless (exists ($hashForOverlapingGenes{$type})) { my %h; $hashForOverlapingGenes{$type} = \%h; } $hashForOverlapingGenes{$type}->{$locDistTSS}=$gName; #print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$genes->{$chro}{$gName}{'reg'}\t$gName\n" ; $type = "NA"; #unless ($gName =~ m/$TSS/) { # print "$gName\t$TSS\n"; #} } } if ($type eq "intergenic") { print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$typeIntra\t$reg\t$foldChange\t$distTE\t$geneName\n" ; #print "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$reg\t$geneName\n" ; push(@array2return,$type); } else { my $selectedType; if (exists ($hashForOverlapingGenes{"promoter"})) { $selectedType = "promoter"; } elsif (exists ($hashForOverlapingGenes{"immediateDownstream"})) { $selectedType = "immediateDownstream"; } elsif (exists ($hashForOverlapingGenes{"intragenic"})) { $selectedType = "intragenic"; } elsif (exists ($hashForOverlapingGenes{"enhancer"})) { $selectedType = "enhancer"; } elsif (exists ($hashForOverlapingGenes{"5kbDownstream"})) { $selectedType = "5kbDownstream"; } my $bestGene = printBest($hashForOverlapingGenes{$selectedType},$chro,$fpos,$lpos,$pos,$score,$genes,$selectedType); #print "$type\n"; push(@array2return,$selectedType); my %otherGenes; for my $type ("promoter","immediateDownstream","intragenic","enhancer","5kbDownstream") { #"firstIntron","exon","intron", for my $locDistTSS (sort {$a<=>$b} keys %{$hashForOverlapingGenes{$type}}) { my $otherGene = $hashForOverlapingGenes{$type}->{$locDistTSS}; next if ($genes->{$chro}{$bestGene}{'name'} eq $genes->{$chro}{$otherGene}{'name'}); next if (exists ($otherGenes{$genes->{$chro}{$otherGene}{'name'}})); if (overlapGenes($otherGene,$bestGene,$chro,$genes)) { if (($type eq "intragenic")||($type eq "immediateDownstream")) { $typeIntra = &getTypeIntra($genes->{$chro}{$otherGene},$pos); $typehashIntra{$typeIntra}++; }else { $typeIntra = "NA"; } print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$otherGene}{'reg'}\t$genes->{$chro}{$otherGene}{'foldChange'}\t",($pos-$genes->{$chro}{$otherGene}{'TE'})*$genes->{$chro}{$otherGene}{'strand'},"\t$otherGene\n" ; push(@array2return,$type); #print "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$genes->{$chro}{$otherGene}{'reg'}\t$otherGene\n" ; #print $type,"\n"; $otherGenes{$genes->{$chro}{$otherGene}{'name'}} = 1; } } } } return @array2return; } sub findAllClosestGeneOneLocation { my ($fpos,$lpos,$score,$chro,$pos,$genes) = @_; my ($distTSS,$type,$reg,$geneName,$foldChange) = ($BIGNUMBER,"intergenic",0,0,0); my $locDistTSS = 0; my $locDistTE = 0; my %hashForOverlapingGenes; my @array2return; my $typeIntra = "NA"; for my $gName (keys %{$genes->{$chro}}) { my $TSS = $genes->{$chro}{$gName}{'TSS'}; my $strand = $genes->{$chro}{$gName}{'strand'}; my $TE = $genes->{$chro}{$gName}{'TE'}; $locDistTE = ($pos-$TE)*$strand; $locDistTSS = ($pos-$TSS)*$strand; #print "$pos-$genes->{$chro}{$gName}{'TSS'})*$genes->{$chro}{$gName}{'strand'} = $locDistTSS\n"; #my $locDistTE = ($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'}; #print "$gName $maxDistance $pos-$genes->{$chro}{$gName}{'TSS'} $genes->{$chro}{$gName}{'strand'}\n"; #print "$locDistTSS $enhLeft $enhRight\n"; if (($locDistTSS>= $enhLeft)&&($locDistTSS<$enhRight)) { $type = "enhancer"; } elsif (($locDistTSS>= $enhRight)&&($locDistTSS<=0)) { $type = "promoter"; } elsif (($locDistTSS> 0)&&($locDistTSS<=$immediateDownstream)&&($locDistTE<=0)) { $type = "immediateDownstream"; } elsif (($locDistTSS >= $immediateDownstream)&&($locDistTE<=0)) { $type = "intragenic"; } elsif (($locDistTE >= 0)&&($locDistTE<=$downStreamDist)) { $type = "5kbDownstream"; } elsif (abs($locDistTSS)<abs($distTSS)) { $distTSS = $locDistTSS; ($reg,$geneName,$foldChange) = ($genes->{$chro}{$gName}{'reg'},$gName,$genes->{$chro}{$gName}{'foldChange'}); } if (($type ne "intergenic")&&($type ne "NA")) { unless (exists ($hashForOverlapingGenes{$type})) { my %h; $hashForOverlapingGenes{$type} = \%h; $hashForOverlapingGenes{$type}->{$gName}=$locDistTSS; if (($type eq "intragenic")||($type eq "immediateDownstream")) { $typeIntra = &getTypeIntra($genes->{$chro}{$gName},$pos); $typehashIntra{$typeIntra}++; } print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$gName}{'reg'}\t$genes->{$chro}{$gName}{'fluo'}\t$genes->{$chro}{$gName}{'foldChange'}\t$genes{$chro}->{$gName}{'GCisland'}\t",($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'},"\t$gName\n" ; push(@array2return,$type); } $typeIntra = "NA"; $type = "NA"; #unless ($gName =~ m/$TSS/) { # print "$gName\t$TSS\n"; #} } } if ($type eq "intergenic") { print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$geneName}{'reg'}\t$genes->{$chro}{$geneName}{'fluo'}\t$genes->{$chro}{$geneName}{'foldChange'}\t$genes{$chro}->{$geneName}{'GCisland'}\t",($pos-$genes->{$chro}{$geneName}{'TE'})*$genes->{$chro}{$geneName}{'strand'},"\t$geneName\n" ; #print "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$reg\t$geneName\n" ; push(@array2return,$type); } return @array2return; } sub findAllClosestGene { my ($fpos,$lpos,$score,$chro,$pos,$genes) = @_; my ($distTSS,$type,$reg,$geneName,$foldChange) = ($BIGNUMBER,"intergenic",0,0,0); my $locDistTSS = 0; my $locDistTE = 0; my %hashForOverlapingGenes; my @array2return; my $typeIntra = "NA"; for my $gName (keys %{$genes->{$chro}}) { my $TSS = $genes->{$chro}{$gName}{'TSS'}; my $strand = $genes->{$chro}{$gName}{'strand'}; my $TE = $genes->{$chro}{$gName}{'TE'}; $locDistTE = ($pos-$TE)*$strand; $locDistTSS = ($pos-$TSS)*$strand; #print "$pos-$genes->{$chro}{$gName}{'TSS'})*$genes->{$chro}{$gName}{'strand'} = $locDistTSS\n"; #my $locDistTE = ($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'}; #print "$gName $maxDistance $pos-$genes->{$chro}{$gName}{'TSS'} $genes->{$chro}{$gName}{'strand'}\n"; #print "$locDistTSS $enhLeft $enhRight\n"; if (($locDistTSS>= $enhLeft)&&($locDistTSS<$enhRight)) { $type = "enhancer"; } elsif (($locDistTSS>= $enhRight)&&($locDistTSS<=0)) { $type = "promoter"; } elsif (($locDistTSS> 0)&&($locDistTSS<=$immediateDownstream)&&($locDistTE<=0)) { $type = "immediateDownstream"; } elsif (($locDistTSS >= $immediateDownstream)&&($locDistTE<=0)) { $type = "intragenic"; } elsif (($locDistTE >= 0)&&($locDistTE<=$downStreamDist)) { $type = "5kbDownstream"; } elsif (abs($locDistTSS)<abs($distTSS)) { $distTSS = $locDistTSS; ($reg,$geneName,$foldChange) = ($genes->{$chro}{$gName}{'reg'},$gName,$genes->{$chro}{$gName}{'foldChange'}); } if (($type ne "intergenic")&&($type ne "NA")) { unless (exists ($hashForOverlapingGenes{$type})) { my %h; $hashForOverlapingGenes{$type} = \%h; } #$hashForOverlapingGenes{$type}->{$gName}=$locDistTSS; $typeIntra = "NA"; if (($type eq "intragenic")||($type eq "immediateDownstream")) { $typeIntra = &getTypeIntra($genes->{$chro}{$gName},$pos); $typehashIntra{$typeIntra}++; } print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$gName}{'reg'}\t$genes->{$chro}{$gName}{'fluo'}\t$genes->{$chro}{$gName}{'foldChange'}\t$genes{$chro}->{$gName}{'GCisland'}\t",($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'},"\t$gName\n" ; push(@array2return,$type); $type = "NA"; #unless ($gName =~ m/$TSS/) { # print "$gName\t$TSS\n"; #} } } if ($type eq "intergenic") { print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$geneName}{'reg'}\t$genes->{$chro}{$geneName}{'fluo'}\t$genes->{$chro}{$geneName}{'foldChange'}\t$genes{$chro}->{$geneName}{'GCisland'}\t",($pos-$genes->{$chro}{$geneName}{'TE'})*$genes->{$chro}{$geneName}{'strand'},"\t$geneName\n" ; #print "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$reg\t$geneName\n" ; push(@array2return,$type); } return @array2return; } sub getTypeIntra { my ($geneEntry, $pos) = @_; my $type; if (($pos >= $geneEntry->{'firstIntronStart'})&&($pos <=$geneEntry->{'firstIntronEnd'})) { return "f_intron"; } if (($pos >= $geneEntry->{'firstExonStart'})&&($pos <=$geneEntry->{'firstExonEnd'})) { return "f_exon"; } $type = getIntronExon ($pos, $geneEntry->{'exonCount'},$geneEntry->{'exonStarts'},$geneEntry->{'exonEnds'},$geneEntry->{'strand'}); return $type; } sub overlapGenes { my ($otherGene,$bestGene,$chro,$genes)= @_; my $a1 = $genes->{$chro}{$otherGene}{'left'}; my $a2 = $genes->{$chro}{$bestGene}{'left'}; my $e1 = $genes->{$chro}{$otherGene}{'right'}; my $e2 = $genes->{$chro}{$bestGene}{'right'}; if (($a1 >= $a2)&&($a1 <= $e2)) { return 1; } if (($a2 >= $a1)&&($a2 <= $e1)) { return 1; } return 0; } sub printBest { my ($hash,$chro,$fpos,$lpos,$pos,$score,$genes,$type) = @_; for my $locDistTSS (sort {$a<=>$b} keys %{$hash}) { my $gName = $hash->{$locDistTSS}; my $typeIntra = "NA"; if (($type eq "intragenic")||($type eq "immediateDownstream")) { $typeIntra = &getTypeIntra($genes->{$chro}{$gName},$pos); $typehashIntra{$typeIntra}++; } print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$gName}{'reg'}\t$genes->{$chro}{$gName}{'foldChange'}\t",($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'},"\t$gName\n" ; return $gName; } } #sub findAllClosestGene { # my ($fpos,$lpos,$score,$chro,$pos,$genes) = @_; # my ($distTSS,$type,$reg,$geneName,$foldChange) = ($BIGNUMBER,"intergenic",0,0,0); # my $locDistTSS = 0; # my $locDistTE = 0; # my @array2return; # for my $gName (keys %{$genes->{$chro}}) { # my $TSS = $genes->{$chro}{$gName}{'TSS'}; # my $strand = $genes->{$chro}{$gName}{'strand'}; # my $TE = $genes->{$chro}{$gName}{'TE'}; # $locDistTSS = ($pos-$TSS)*$strand; # $locDistTE = ($pos-$TE)*$strand; # #print "$pos-$genes->{$chro}{$gName}{'TSS'})*$genes->{$chro}{$gName}{'strand'} = $locDistTSS\n"; # #my $locDistTE = ($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'}; # # #print "$gName $maxDistance $pos-$genes->{$chro}{$gName}{'TSS'} $genes->{$chro}{$gName}{'strand'}\n"; # #print "$locDistTSS $enhLeft $enhRight\n"; # # if (($locDistTSS>= $enhLeft)&&($locDistTSS<$enhRight)) { # $type = "enhancer"; # } elsif (($locDistTSS>= $enhRight)&&($locDistTSS<=0)) { # $type = "promoter"; # } elsif (($locDistTSS> 0)&&($locDistTSS<=$immediateDownstream)) { # $type = "immediateDownstream"; # } elsif (($pos >= $genes{$chro}->{$gName}{'firstIntronStart'})&&($pos <= $genes{$chro}->{$gName}{'firstIntronEnd'})) { # $type = "firstIntron"; # } elsif (($locDistTSS> $immediateDownstream)&&($locDistTSS<=$genes{$chro}->{$gName}{'length'})) { # #$type = "intragenic"; # $type = &getIntronExon ($pos, $genes{$chro}->{$gName}{'exonCount'},$genes{$chro}->{$gName}{'exonStarts'},$genes{$chro}->{$gName}{'exonEnds'},$genes{$chro}->{$gName}{'strand'}); # } elsif (($locDistTE >= 0)&&($locDistTE<=$downStreamDist)) { # $type = "5kbDownstream"; # } elsif (abs($locDistTSS)<abs($distTSS)) { # $distTSS = $locDistTSS; # ($reg,$geneName,$foldChange) = ($genes->{$chro}{$gName}{'reg'},$gName,$genes->{$chro}{$gName}{'foldChange'}); # } # if (($type ne "intergenic")&&($type ne "NA")) { # print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$genes->{$chro}{$gName}{'reg'}\t$genes->{$chro}{$gName}{'foldChange'}\t$gName\n" ; # push(@array2return,$type); # $type = "NA"; # #unless ($gName =~ m/$TSS/) { # # print "$gName\t$TSS\n"; # #} # } # # } # if ($type eq "intergenic") { # print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$reg\t$foldChange\t$geneName\n" ; # push(@array2return,$type); # #print "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$reg\t$geneName\n" ; # # } # return @array2return; #} sub getFirstIntron { my ($exonCount,$exonStarts,$exonEnds,$strand) = @_; my ($left,$right); if ($exonCount == 1) { return (0,0); } if ($strand == 1) { $left = (split ",", $exonEnds)[0]+$jonctionSize; $right = (split (",", $exonStarts))[1]-$jonctionSize; } else { $left = (split (",", $exonEnds))[$exonCount-2]+$jonctionSize; $right = (split (",", $exonStarts))[$exonCount-1]-$jonctionSize; } ($left,$right); } sub getFirstExon { my ($exonCount,$exonStarts,$exonEnds,$strand) = @_; my ($left,$right); if ($exonCount == 1) { return (0,0); } if ($strand == 1) { $left = (split ",", $exonStarts)[0]; $right = (split (",", $exonEnds))[0]-$jonctionSize; } else { $left = (split (",", $exonStarts))[$exonCount-1]+$jonctionSize; $right = (split (",", $exonEnds))[$exonCount-1]; } ($left,$right); } sub getIntronExon { my ($pos,$exonCount,$exonStarts,$exonEnds,$strand) = @_; my (@left,@right); @left = (split ",", $exonStarts); @right = (split (",", $exonEnds)); for (my $i = 0; $i<$exonCount;$i++) { #print "$left[$i] <= $pos ? $pos <= $right[$i]\n"; if (($left[$i]+$jonctionSize < $pos) && ($pos < $right[$i]-$jonctionSize)) { #print "URA!\n"; return "exon"; } elsif (($i+1<$exonCount)&&($right[$i]+$jonctionSize < $pos) && ($pos < $left[$i+1]-$jonctionSize)) { return "intron"; } } return "jonction"; } sub min { return $_[0] if ($_[0]<$_[1]); $_[1]; } sub med { my @arr = @_; my $med = 0; @arr = sort {$a <=> $b} @arr; if ((scalar(@arr)/2) =~ m/[\.\,]5/) { return $arr[floor(scalar(@arr)/2)]; } else { return ($arr[scalar(@arr)/2]+$arr[scalar(@arr)/2-1])/2; } $med; } sub checkIfGC { my ($TSS,$strand,$dist,$GCislandsChr)=@_; my $ifGC = 0; my $leftProm=$TSS-$dist; my $rightProm = $TSS; if ($strand== -1) { my $leftProm=$TSS; my $rightProm = $TSS+$dist; } for my $leftGC (keys %{$GCislandsChr}) { my $rightGC = $GCislandsChr->{$leftGC}; if ($leftGC>=$leftProm&&$leftGC<=$rightProm || $rightGC>=$leftProm&&$rightGC<=$rightProm) { return "GC-island"; } } return $ifGC ; }