Mercurial > repos > alermine > nebula
comparison [APliBio]Nebula tools suite/Nebula/AnnotatePeaks/annotatePeaks.pl @ 0:2ec3ba0e9e70 draft
Uploaded
author | alermine |
---|---|
date | Thu, 25 Oct 2012 08:18:25 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2ec3ba0e9e70 |
---|---|
1 #:t:::::::::::::::::g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
2 #:t::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
3 #:::::::::::::z;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
4 #::::::::::::i@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
5 #::::::::::::@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@$@@@@ | |
6 #:::::::::::3@@@@@@@@@@@@@@@@@@@@@@@@@B@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
7 #::::::::::3@@@@@@@@@@@@@@@@@@@@@BEEESSE5EEEEBBM@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
8 #::::::::::3@@@@@@@@@@@@@@@@@@@@BEEEEEE35EE55E2355E5SBMB@@@@@@@@@@@@@@@@@$ | |
9 #::::::::::@@@@@@@@@@@@@@@@@@@EEEE55533t3tttt::::::!!!!7755E755SBBMMM@@@MM | |
10 #::::::::::3@@@@@@@@@@@@@@@@@@EEEE2t3ttttt:::::::::::::::::::::::!7?5225EE | |
11 #::::::::::3@@@@@@@@@@@@@@@@@@EEEEE31t::::::::::::::::::::::::::::::::3E5@ | |
12 #::::::::::3@@@@@@@@@@@@@@@@@@EEEEEEtt:::::::::::::::::::::::::::::::::353 | |
13 #::::::::::3@@@@@@@@@@@@@@@@@@EEEEEE1ttz::::::::::::::::::::::::::::::::35 | |
14 #:::::::::::@@@@@@@@@@@@@@@@@@EEEEEEEtz1::::::::::::::::::::::::::::::::t: | |
15 #:::::::::!3@@@@@@@@@@@@@@@@@@@EEEEEttt::::::::::::::::::::::::::::::::;zz | |
16 #::::::::::@@@@@@@@@@@@@@@@@@@@EEEEEttt:::::z;z:::::::::::::::::::::::::13 | |
17 #::::::::::3B@@@@@@@@@@@@@@@@@@EEEEEEE3tt:czzztti;:::::::::::::::::::::::3 | |
18 #::::ttt::::3@@@@@@@@@@@@@@@@EEEEE5EE25Ezt1EEEz5Etzzz;;;;::::::::::::::::: | |
19 #:::::::::::I9@@@@@@@@@@@@@@@@@@@@@@@@@@EEEEEE@@@@@@@@@@@@@@Ez;::::::::::: | |
20 #:::::::::::::E@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@Ez:::::: | |
21 #::::::::::::::E@@@@@@@@@@@@@@@@@@@@@@@@@@@@@BE5EBB@@@@@@@@@@@@@@@EEE::::: | |
22 #:::::::::::::::@@@@@@@@@@@@@@@@@@@@@@@@@@@@E1::35@@@@@@@@@@ME3MMME2:::::: | |
23 #:::::::::::::::?@@@@@@@@@@@@@@@@@@M@@@@@@@EE:::::3SB@@BBESEEt:::::::::::: | |
24 #::::::::::::::::J$@@@@@@@B@@@@@@@@@@@@@@@@EE:::::::!35E33t::::::::::::::: | |
25 #:::::::::::::::::3@E@@@EE5EESE5EESE@@@@@@@Et::::::::::::tz::::::::::::::: | |
26 #:::::::::::::::::J@E$@EEE5133555SE@@@@@@@@Et::::::::::::::::::::::::::::: | |
27 #::::::::::::::::::E@E@EEEEtt3523EEE@@@@@@@E:::::::::::::::::::::::::::::: | |
28 #:t::::::::::::::::JEE3@@@EEEEEEEEEE@@@@@@@E:::::::::t;::::::::::::::::::: | |
29 #:t:::::::::::::::::!5ES@EEEEEEEEES@@@@@@@@@E;:::;;;:3Ez:::::::::::::::::: | |
30 #:t::::::::::::::::::::JE@@EEEEEEE@@@@@@@@@@@@@@@@ME!:::;::::::::::::::::: | |
31 #:tz::::::::::::::::::::JE@@@EEEE@@@@@@@@@@@@@@EE!:::::::t:::::::::::::::: | |
32 #:t::::::::::::::::::::::3@@@@@@@@@@@@@@@@@@ESBE:::::::::::::::::::::::::: | |
33 #:::::::::::::::::::::::::Q@@@@@@@@@@@@@@@@EE3EE;:::::zzzz:::::::::::::::: | |
34 #:::::::::::::::::::::::::3@@@@@@@@@@@@@@@@@@@@@@NN@@@@@@Ez::::::::::::::: | |
35 #:zt:::::::::::::::::::::::3@@@@EE@@@@@@@@@@EEEEt::;z113E5t::::::::::::::: | |
36 #::tt:::::::::::::::::::::::3@@@E@@@@@@@@@@@@@@@@BEt::::::::::::::::t::::: | |
37 #:tt:t:::::::::::::::::::::::?S@@@@@@@@@@@BBEEE51!::::::::::::::zzzEt::::: | |
38 #::::::::::::::::::::::::::::::3Q@@@@@@@BEEEEEt:::::::::::::;zz@@@EE:::::: | |
39 #::::::::::::::::::::::::::::::::75B@@@@@EEEtt;:::::::::;zz@@@@BEEEtz::::: | |
40 #::::::::::::::::::::::::::::::::::::?9@@@@@@@@@@@E2Ezg@@@@@B@@@EEEE1t:::: | |
41 #:::::::::::::::::::::::::::::::::::::::3@@@@@@@@@@@@@@@@@@@E@EEEEEEEzzz:: | |
42 #::::::::::::::::::::::::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@EEEEEEE5ttttt | |
43 #:::::::::::::::::::::::::::::::;g@@@@@@@@@@@@@@@@@@@@@@@@@@EEEEEEEEEEEtzt | |
44 #::::::::::::::::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@E@@EEEEEEEEEEEE@@@ | |
45 #::::::::::::::::::::::::::g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@EEEE3EEEE@@@@@@@ | |
46 #:::::::::::::::::::::;;g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@EEEt33@@@@@@@@@@ | |
47 #:::::::::::::::::;g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@E@@@@@@EEEtg@@@@@@@@@@@@ | |
48 #::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@EEEE@@@@@@@@@@@@@@@@@@@@@@@@ | |
49 #:::::::::::::@@@@@@@@@@@@@@@@@$@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
50 #::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
51 # | |
52 # Copyleft ↄ⃝ 2012 Institut Curie | |
53 # Author(s): Valentina Boeva, Alban Lermine (Institut Curie) 2012 | |
54 # Contact: valentina.boeva@curie.fr, alban.lermine@curie.fr | |
55 # This software is distributed under the terms of the GNU General | |
56 # Public License, either Version 2, June 1991 or Version 3, June 2007. | |
57 | |
58 #!/usr/bin/perl -w | |
59 | |
60 #for a complete list of genes and a list of sites, sorts genes close to sites | |
61 #printDistance | |
62 | |
63 #read only gene files with refSeqGenes | |
64 #counts only once overlapping transcripts or genes | |
65 | |
66 #calculates some stats about locations of peaks | |
67 | |
68 #uses hierarchie : promoter, imUpstream, intragenic,enh, 5kbdownstream, intergenic + for genes: fExon,exon,fIntron,intron,junction+-50kb ONLY FOR "findClosestGene" | |
69 #outputs fold change of expression is available | |
70 | |
71 #outputname - corrected | |
72 #all isoforms are considered, even those that start and end at the same coordinates | |
73 #uses fluorescence values | |
74 | |
75 #only one location per gene in mode "all" (but promoter+intragenic if both) | |
76 | |
77 #read directly Noresp/down/up genes | |
78 | |
79 #prints distance to TE | |
80 | |
81 use strict; | |
82 use POSIX; | |
83 | |
84 my $SitesFilename ; | |
85 my $GenesFilename ; | |
86 my $SelectedGenesFilename = ""; | |
87 my $MirFilename = ""; | |
88 my $minScore = 0; | |
89 my $BIGNUMBER = 10000000000; | |
90 my $enhLeft = -30000; | |
91 my $enhRight = -1500; | |
92 my $promLeft = $enhRight ; | |
93 my $immediateDownstream = 2000; | |
94 my $maxScore = $BIGNUMBER ; | |
95 my $verbose = 0; | |
96 my $maxDistance = $BIGNUMBER ; | |
97 my $downStreamDist = 5000; | |
98 my $jonctionSize = 50; | |
99 my $all = 0; | |
100 my $fluoFile = ""; | |
101 my $regType = ""; | |
102 my $GCislands = ""; | |
103 my $ResFilename = ""; | |
104 | |
105 my $usage = qq{ | |
106 $0 | |
107 | |
108 ----------------------------- | |
109 mandatory parameters: | |
110 | |
111 -g filename file with all genes | |
112 -tf filename file with sites of TF | |
113 | |
114 ----------------------------- | |
115 optional parameters: | |
116 -v for verbose | |
117 -mir filename file with positions of miRNA | |
118 -maxDist value maximal Distance to genes (def. Inf) | |
119 -minScore value minimal Score (def. 0) | |
120 -maxScore value maximal Score (def. Inf) | |
121 -selG filename file with selected genes and their expression levels | |
122 -all to output all genes intersecting with the peak | |
123 -fluo filename file with fluorescence | |
124 -regType valeu down-regulated, up-regulated, no-response | |
125 -gc filename file with gc-islands | |
126 | |
127 -lp valeu upstream of TSS region to define promoter | |
128 -rp valeu downstream of TSS region to define immediate downstream | |
129 -enh valeu upstream of TSS region to define enhancer | |
130 -dg valeu downstream of transcription end region to define gene downstream | |
131 | |
132 -o filename output filename | |
133 | |
134 }; | |
135 | |
136 if(scalar(@ARGV) == 0){ | |
137 print $usage; | |
138 exit(0); | |
139 } | |
140 | |
141 | |
142 while(scalar(@ARGV) > 0){ | |
143 my $this_arg = shift @ARGV; | |
144 if ( $this_arg eq '-h') {print "$usage\n"; exit; } | |
145 | |
146 elsif ( $this_arg eq '-g') {$GenesFilename = shift @ARGV;} | |
147 elsif ( $this_arg eq '-tf') {$SitesFilename = shift @ARGV;} | |
148 elsif ( $this_arg eq '-mir') {$MirFilename = shift @ARGV;} | |
149 elsif ( $this_arg eq '-selG') {$SelectedGenesFilename = shift @ARGV;} | |
150 elsif ( $this_arg eq '-maxDist') {$maxDistance = 1;} | |
151 elsif ( $this_arg eq '-maxScore') {$maxScore = shift @ARGV;} | |
152 elsif ( $this_arg eq '-minScore') {$minScore = shift @ARGV;} | |
153 elsif ( $this_arg eq '-v') {$verbose = 1;} | |
154 elsif ( $this_arg eq '-all') {$all = 1;} | |
155 elsif ( $this_arg eq '-regType') {$regType = shift @ARGV;} | |
156 | |
157 elsif ( $this_arg eq '-fluo') {$fluoFile = shift @ARGV;} | |
158 elsif ( $this_arg eq '-gc') {$GCislands = shift @ARGV;} | |
159 elsif ( $this_arg eq '-o') {$ResFilename = shift @ARGV;} | |
160 | |
161 elsif ( $this_arg eq '-lp') {$enhRight = shift @ARGV;$promLeft = $enhRight;} | |
162 elsif ( $this_arg eq '-rp') {$immediateDownstream = shift @ARGV;} | |
163 elsif ( $this_arg eq '-enh') {$enhLeft = shift @ARGV;} | |
164 elsif ( $this_arg eq '-dg') {$downStreamDist = shift @ARGV;} | |
165 | |
166 elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";} | |
167 } | |
168 | |
169 if ($maxDistance<0) { | |
170 die "\tThe maximal distance must be positive!\n"; | |
171 } | |
172 my $tmpn = $SitesFilename; | |
173 $tmpn =~ s/.*[\/\\]//; | |
174 if ($ResFilename eq "") { | |
175 $ResFilename = $SelectedGenesFilename.$tmpn."_".$minScore."_dists.txt"; | |
176 | |
177 if ($maxScore != $BIGNUMBER) { | |
178 $ResFilename = $SelectedGenesFilename.$SitesFilename."_".$minScore."_$maxScore"."_dists.txt"; | |
179 } | |
180 | |
181 if ($all == 1) { | |
182 $ResFilename =~ s/_dists/_all_dists/; | |
183 } | |
184 } | |
185 | |
186 | |
187 #-----------read selected genes---------------- | |
188 my %selectedGenes; | |
189 my %selectedGenesFoldChange; | |
190 if ( $SelectedGenesFilename ne "") { | |
191 open (FILE, "<$SelectedGenesFilename ") or die "Cannot open file $SelectedGenesFilename !!!!: $!"; | |
192 while (<FILE>) { | |
193 chomp; | |
194 my @a = split/\t/; | |
195 $selectedGenes{$a[1]} = $a[3]; | |
196 $selectedGenesFoldChange{$a[1]} = $a[2]; | |
197 #print "gene:$a[1],reg:$selectedGenes{$a[1]},FC:$selectedGenesFoldChange{$a[1]}\n"; | |
198 } | |
199 | |
200 close FILE; | |
201 print "\t\t$SelectedGenesFilename is read!\n" if ($verbose); | |
202 } | |
203 | |
204 #-----------read genes with fluorescence--------- | |
205 my %fluoGenes; | |
206 if ( $fluoFile ne "") { | |
207 open (FILE, "<$fluoFile ") or die "Cannot open file $fluoFile !!!!: $!"; | |
208 my $gene = ""; | |
209 my $med = 0; | |
210 my %h; | |
211 while (<FILE>) { | |
212 chomp; | |
213 my @a = split/\t/; | |
214 | |
215 next if (scalar(@a)<5); | |
216 next if ($a[0] eq "probesets"); | |
217 next unless ($a[0] =~m/\S/); | |
218 next unless ($a[4] =~m/\S/); | |
219 if ($gene ne "" && $gene ne $a[2]) { | |
220 #calcMed | |
221 $med = med(keys %h); | |
222 $fluoGenes{$gene} = $med; | |
223 $med=0; | |
224 delete @h{keys %h}; | |
225 } else { | |
226 #$h{$a[4]} = 1; | |
227 } | |
228 $gene = $a[2]; | |
229 $h{$a[4]} = 1; | |
230 #print "keys ", scalar(keys %h),"\t",keys %h,"\n"; | |
231 } | |
232 if ($gene ne "") { | |
233 $med = med(keys %h); | |
234 $fluoGenes{$gene} = $med; | |
235 } | |
236 close FILE; | |
237 print "\t\t$fluoFile is read!\n" if ($verbose); | |
238 } | |
239 #-----------read GC-islands---------------- | |
240 my %GCislands; | |
241 if ($GCislands ne "") { | |
242 open (FILE, "<$GCislands ") or die "Cannot open file $GCislands !!!!: $!"; | |
243 | |
244 while (<FILE>) { | |
245 chomp; | |
246 my @a = split/\t/; | |
247 #bin chrom chromStart chromEnd name length cpgNum gcNum perCpg perGc obsExp | |
248 #107 chr1 36568608 36569851 CpG: 128 1243 128 766 20.6 61.6 1.09 | |
249 my $chr = $a[1]; | |
250 my $start = $a[2]; | |
251 my $end = $a[3]; | |
252 $GCislands{$chr}->{$start}=$end; | |
253 } | |
254 close FILE; | |
255 } elsif ($verbose) { | |
256 print "you did not specify a file with GC-islands\n"; | |
257 } | |
258 | |
259 #-----------read genes---------------- | |
260 | |
261 my %genes; | |
262 | |
263 my $count = 0; | |
264 | |
265 open (GENES, "<$GenesFilename") or die "Cannot open file $GenesFilename!!!!: $!"; | |
266 <GENES>; | |
267 while (<GENES>) { | |
268 chomp; | |
269 if (/(chr.*)\s([+-])\s(\d+)\s(\d+)\s(\d+)\s(\d+)\s(\d+)\s(\S+)\s(\S+)\s\S+\s(\S+)/){ | |
270 my $name = $10; | |
271 my $chr = $1; | |
272 my $strand = $2; | |
273 if ($strand eq '+') { | |
274 $strand = 1; | |
275 } | |
276 else { | |
277 $strand = -1; | |
278 } | |
279 my $leftPos = $3; | |
280 my $rightPos = $4; | |
281 my $cdsStart= $5; | |
282 my $cdsEnd= $6; | |
283 my $exonCount= $7; | |
284 my $exonStarts= $8; | |
285 my $exonEnds= $9; | |
286 my $ID = "$name\t$chr:$leftPos-$rightPos;$exonStarts-$exonEnds"; | |
287 my $foldChange = 1; | |
288 my $reg = "NA"; | |
289 my $fluo = "NA"; | |
290 if ( $SelectedGenesFilename ne "") { | |
291 #print "$name\t"; | |
292 if (exists($selectedGenes{$name})) { | |
293 $reg = $selectedGenes{$name}; | |
294 $foldChange = $selectedGenesFoldChange{$name}; | |
295 } | |
296 } | |
297 unless (exists($genes{$chr})) { | |
298 my %h; | |
299 $genes{$chr} = \%h; | |
300 } | |
301 unless (exists($genes{$chr}->{$ID})) { | |
302 my %h1; | |
303 $genes{$chr}->{$ID} = \%h1; | |
304 $count++; | |
305 } | |
306 if ( $fluoFile ne "") { | |
307 if (exists($fluoGenes{$name})) { | |
308 $fluo = $fluoGenes{$name}; | |
309 } | |
310 } | |
311 $genes{$chr}->{$ID}{'name'} = $name ; | |
312 $genes{$chr}->{$ID}{'left'} = $leftPos ; | |
313 $genes{$chr}->{$ID}{'right'} = $rightPos ; | |
314 $genes{$chr}->{$ID}{'cdsStart'} = $cdsStart; | |
315 $genes{$chr}->{$ID}{'cdsEnd'} = $cdsEnd; | |
316 $genes{$chr}->{$ID}{'strand'} = $strand; | |
317 $genes{$chr}->{$ID}{'exonCount'} = $exonCount; | |
318 $genes{$chr}->{$ID}{'exonStarts'} = $exonStarts; | |
319 $genes{$chr}->{$ID}{'exonEnds'} = $exonEnds; | |
320 $genes{$chr}->{$ID}{'TSS'} = ($strand == 1) ? $leftPos :$rightPos ; | |
321 $genes{$chr}->{$ID}{'TE'} = ($strand == -1) ? $leftPos :$rightPos ; | |
322 $genes{$chr}->{$ID}{'reg'} = $reg; | |
323 $genes{$chr}->{$ID}{'foldChange'} = $foldChange; | |
324 $genes{$chr}->{$ID}{'length'} = abs ($leftPos-$rightPos); | |
325 ($genes{$chr}->{$ID}{'firstIntronStart'},$genes{$chr}->{$ID}{'firstIntronEnd'}) = getFirstIntron ($exonCount,$exonStarts,$exonEnds,$strand); | |
326 ($genes{$chr}->{$ID}{'firstExonStart'},$genes{$chr}->{$ID}{'firstExonEnd'}) = getFirstExon ($exonCount,$exonStarts,$exonEnds,$strand); | |
327 | |
328 $genes{$chr}->{$ID}{'fluo'} = $fluo; | |
329 | |
330 $genes{$chr}->{$ID}{'GCisland'} = 0; | |
331 if ($GCislands ne "") { | |
332 $genes{$chr}->{$ID}{'GCisland'} = checkIfGC ($genes{$chr}->{$ID}{'TSS'},$strand,2000,$GCislands{$chr}); | |
333 } | |
334 | |
335 } | |
336 } | |
337 print "Total genes : $count\n"; | |
338 close GENES; | |
339 print "\t\t$GenesFilename is read!\n" if ($verbose); | |
340 #for my $gName (sort keys %{$genes{'chr18'}}) { | |
341 | |
342 # print "$gName\t$genes{'chr18'}->{$gName}{'TSS'}\n"; | |
343 #} | |
344 | |
345 #-----------read file with sites miRNA, store as genes----- | |
346 | |
347 if ( $MirFilename eq ""){ | |
348 print "you did not specify file with miRNA\n" if ($verbose);; | |
349 } | |
350 else { | |
351 | |
352 | |
353 open (MIR, "<$MirFilename ") or die "Cannot open file $MirFilename !!!!: $!"; | |
354 #chr1 20669090 20669163 mmu-mir-206 960 + | |
355 while (<MIR>) { | |
356 chomp; | |
357 my ($name, $chr, $leftPos, $rightPos, $strand ); | |
358 my $fluo = "NA"; | |
359 | |
360 #1 . miRNA 20669091 20669163 . + . ACC="MI0000249"; ID="mmu-mir-206"; | |
361 if (/([0-9XYM]+)\s.\smiRNA\s(\d+)\s(\d+)\s.\s([+-])\s.\sACC=.*ID=\"(.*)\"/) { | |
362 $name = $5; | |
363 $chr = $1; | |
364 $leftPos = $2; | |
365 $rightPos = $3; | |
366 $strand = $4; | |
367 } | |
368 elsif (/(.*)\s(\d+)\s(\d+)\s(.*)\s(.*)\s(.*)/){ | |
369 $name = $4; | |
370 $chr = $1; | |
371 $leftPos = $2; | |
372 $rightPos = $3; | |
373 $strand = $6; | |
374 } else { | |
375 next; | |
376 } | |
377 | |
378 unless ($chr =~ m/chr/) { | |
379 $chr = "chr".$chr; | |
380 } | |
381 my $ID = "$name\t$chr:$leftPos-$rightPos"; | |
382 | |
383 if ($strand eq '+') { | |
384 $strand = 1; | |
385 } | |
386 else { | |
387 $strand = -1; | |
388 } | |
389 | |
390 unless (exists($genes{$chr})) { | |
391 my %h; | |
392 $genes{$chr} = \%h; | |
393 } | |
394 unless (exists($genes{$chr}->{$ID})) { | |
395 my %h1; | |
396 $genes{$chr}->{$ID} = \%h1; | |
397 $count++; | |
398 } | |
399 $genes{$chr}->{$ID}{'name'} = $name ; | |
400 $genes{$chr}->{$ID}{'left'} = $leftPos ; | |
401 $genes{$chr}->{$ID}{'right'} = $rightPos ; | |
402 $genes{$chr}->{$ID}{'cdsStart'} = $leftPos ; | |
403 $genes{$chr}->{$ID}{'cdsEnd'} = $rightPos ; | |
404 $genes{$chr}->{$ID}{'strand'} = $strand; | |
405 $genes{$chr}->{$ID}{'length'} = abs ($leftPos-$rightPos); | |
406 $genes{$chr}->{$ID}{'exonCount'} = 1; | |
407 $genes{$chr}->{$ID}{'exonStarts'} = $leftPos ; | |
408 $genes{$chr}->{$ID}{'exonEnds'} = $rightPos ; | |
409 $genes{$chr}->{$ID}{'TSS'} = ($strand == 1) ? $leftPos :$rightPos ; | |
410 $genes{$chr}->{$ID}{'TE'} = ($strand == -1) ? $leftPos :$rightPos ; | |
411 $genes{$chr}->{$ID}{'reg'} = "miRNA"; | |
412 $genes{$chr}->{$ID}{'foldChange'} = 1; | |
413 ($genes{$chr}->{$ID}{'firstIntronStart'},$genes{$chr}->{$ID}{'firstIntronEnd'}) = (0,0); | |
414 ($genes{$chr}->{$ID}{'firstExonStart'},$genes{$chr}->{$ID}{'firstExonEnd'}) = (0,0); | |
415 | |
416 $genes{$chr}->{$ID}{'fluo'} = $fluo; | |
417 $genes{$chr}->{$ID}{'GCisland'} = 0; | |
418 | |
419 | |
420 | |
421 } | |
422 | |
423 | |
424 close MIR; | |
425 print "\t\t$MirFilename is read!\n" if ($verbose) ; | |
426 } | |
427 #-----------read file with sites and find N closest genes----- | |
428 my $numberOfAllSites = 0; | |
429 | |
430 open (FILE, "<$SitesFilename") or die "Cannot open file $SitesFilename!!!!: $!"; | |
431 open (OUT , ">$ResFilename") or die "Cannot open file $ResFilename!!!!: $!"; | |
432 print OUT "Chromosome\tStart\tEnd\tMax\tScore\tDistTSS\tType\tTypeIntra\tReg\tFoldChange\tDistTE\tGeneName\tGeneCoordinates\n" ; | |
433 my $header = <FILE>; #read header | |
434 my @a = split (/\t/,$header ); | |
435 my $correction = 0; | |
436 if ($a[1] =~m/chr/) { | |
437 $correction=1; | |
438 } | |
439 | |
440 unless ($header=~m/Chromosome/ || $header=~m/track/|| $header=~m/Start/i) { | |
441 close FILE; | |
442 open (FILE, "<$SitesFilename") or die "Cannot open file $SitesFilename!!!!: $!"; | |
443 } | |
444 | |
445 $count = 0; | |
446 | |
447 my %typehash; | |
448 my %typehashIntra; | |
449 | |
450 $typehashIntra{"f_exon"} = 0; | |
451 $typehashIntra{"f_intron"} = 0; | |
452 $typehashIntra{"jonction"} = 0; | |
453 $typehashIntra{"exon"} = 0; | |
454 $typehashIntra{"intron"} = 0; | |
455 | |
456 | |
457 while (<FILE>) { | |
458 chomp; | |
459 $numberOfAllSites++; | |
460 my @a = split /\t/; | |
461 my $chro = $a[0+$correction]; | |
462 my $fpos = $a[1+$correction]; | |
463 my $lpos = $a[2+$correction]; | |
464 my $maxpos = $a[3+$correction]; | |
465 if ($maxpos =~ m/\D/) { | |
466 $maxpos = int(($lpos+$fpos)/2); | |
467 } | |
468 my $score = $a[4+$correction]; | |
469 next if ($score < $minScore); | |
470 next if ($score >= $maxScore); | |
471 $score = $score +1 -1; | |
472 #print "$score" ; | |
473 | |
474 #my $site = "$chro:$fpos-$lpos\t$maxpos\t$score"; | |
475 #print $site ; | |
476 #my $motifNumber = $a[3]; | |
477 | |
478 #my $geneSet = &search_genes($N,$chro,($fpos+$lpos)/2,\%genes); | |
479 my @b; | |
480 unless ($all) { | |
481 @b = &findClosestGene($fpos,$lpos,$score,$chro,$maxpos,\%genes); #findAllClosestGene for all genes overlaping a peak | |
482 } else { | |
483 @b = &findAllClosestGeneOneLocation ($fpos,$lpos,$score,$chro,$maxpos,\%genes); #findAllClosestGene | |
484 } | |
485 #print "$distTSS\t$distTE\n" unless ($distTSS == $BIGNUMBER); | |
486 for my $type (@b) { | |
487 $typehash{$type} = 0 unless (exists($typehash{$type})); | |
488 $typehash{$type}++; | |
489 } | |
490 $count ++; | |
491 | |
492 } | |
493 close FILE; | |
494 close OUT ; | |
495 | |
496 print "Total Sites: $count\n"; | |
497 my $nEntries = 0; | |
498 for my $type (sort keys %typehash) { | |
499 $nEntries += $typehash{$type}; | |
500 } | |
501 print "Type\tCount\tFrequency\n"; | |
502 for my $type (sort keys %typehash) { | |
503 print $type,"\t",$typehash{$type},"\t",$typehash{$type}/$nEntries,"\n"; | |
504 } | |
505 for my $type (sort keys %typehashIntra) { | |
506 print $type,"\t",$typehashIntra{$type},"\t",$typehashIntra{$type}/$nEntries,"\n"; | |
507 } | |
508 | |
509 | |
510 #my @arr = @{$genes{'chr'}}; | |
511 | |
512 sub findClosestGene { | |
513 my ($fpos,$lpos,$score,$chro,$pos,$genes) = @_; | |
514 my ($distTSS,$distTE,$type,$reg,$geneName,$foldChange) = ($BIGNUMBER,$BIGNUMBER,"intergenic",0,0,0); | |
515 my $locDistTSS = 0; | |
516 my $locDistTE = 0; | |
517 my %hashForOverlapingGenes; | |
518 my @array2return; | |
519 my $typeIntra = "NA"; | |
520 for my $gName (keys %{$genes->{$chro}}) { | |
521 my $TSS = $genes->{$chro}{$gName}{'TSS'}; | |
522 my $strand = $genes->{$chro}{$gName}{'strand'}; | |
523 my $TE = $genes->{$chro}{$gName}{'TE'}; | |
524 | |
525 $locDistTE = ($pos-$TE)*$strand; | |
526 $locDistTSS = ($pos-$TSS)*$strand; | |
527 #print "$pos-$genes->{$chro}{$gName}{'TSS'})*$genes->{$chro}{$gName}{'strand'} = $locDistTSS\n"; | |
528 #my $locDistTE = ($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'}; | |
529 | |
530 #print "$gName $maxDistance $pos-$genes->{$chro}{$gName}{'TSS'} $genes->{$chro}{$gName}{'strand'}\n"; | |
531 #print "$locDistTSS $enhLeft $enhRight\n"; | |
532 | |
533 if (($locDistTSS>= $enhLeft)&&($locDistTSS<$enhRight)) { | |
534 $type = "enhancer"; | |
535 } elsif (($locDistTSS>= $enhRight)&&($locDistTSS<=0)) { | |
536 $type = "promoter"; | |
537 } elsif (($locDistTSS> 0)&&($locDistTSS<=$immediateDownstream)&&($locDistTE<=0)) { | |
538 $type = "immediateDownstream"; | |
539 } elsif (($locDistTSS >= $immediateDownstream)&&($locDistTE<=0)) { | |
540 $type = "intragenic"; | |
541 } elsif (($locDistTE >= 0)&&($locDistTE<=$downStreamDist)) { | |
542 $type = "5kbDownstream"; | |
543 } elsif (abs($locDistTSS)<abs($distTSS)) { | |
544 $distTSS = $locDistTSS; | |
545 $distTE = $locDistTE; | |
546 ($reg,$geneName,$foldChange) = ($genes->{$chro}{$gName}{'reg'},$gName,$genes->{$chro}{$gName}{'foldChange'}); | |
547 } | |
548 | |
549 if (($type ne "intergenic")&&($type ne "NA")) { | |
550 unless (exists ($hashForOverlapingGenes{$type})) { | |
551 my %h; | |
552 $hashForOverlapingGenes{$type} = \%h; | |
553 } | |
554 $hashForOverlapingGenes{$type}->{$locDistTSS}=$gName; | |
555 | |
556 #print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$genes->{$chro}{$gName}{'reg'}\t$gName\n" ; | |
557 $type = "NA"; | |
558 #unless ($gName =~ m/$TSS/) { | |
559 # print "$gName\t$TSS\n"; | |
560 #} | |
561 } | |
562 | |
563 } | |
564 if ($type eq "intergenic") { | |
565 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$typeIntra\t$reg\t$foldChange\t$distTE\t$geneName\n" ; | |
566 #print "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$reg\t$geneName\n" ; | |
567 push(@array2return,$type); | |
568 | |
569 } else { | |
570 my $selectedType; | |
571 if (exists ($hashForOverlapingGenes{"promoter"})) { | |
572 $selectedType = "promoter"; | |
573 } elsif (exists ($hashForOverlapingGenes{"immediateDownstream"})) { | |
574 $selectedType = "immediateDownstream"; | |
575 } elsif (exists ($hashForOverlapingGenes{"intragenic"})) { | |
576 $selectedType = "intragenic"; | |
577 } elsif (exists ($hashForOverlapingGenes{"enhancer"})) { | |
578 $selectedType = "enhancer"; | |
579 } elsif (exists ($hashForOverlapingGenes{"5kbDownstream"})) { | |
580 $selectedType = "5kbDownstream"; | |
581 } | |
582 my $bestGene = printBest($hashForOverlapingGenes{$selectedType},$chro,$fpos,$lpos,$pos,$score,$genes,$selectedType); #print "$type\n"; | |
583 push(@array2return,$selectedType); | |
584 my %otherGenes; | |
585 for my $type ("promoter","immediateDownstream","intragenic","enhancer","5kbDownstream") { #"firstIntron","exon","intron", | |
586 for my $locDistTSS (sort {$a<=>$b} keys %{$hashForOverlapingGenes{$type}}) { | |
587 my $otherGene = $hashForOverlapingGenes{$type}->{$locDistTSS}; | |
588 next if ($genes->{$chro}{$bestGene}{'name'} eq $genes->{$chro}{$otherGene}{'name'}); | |
589 next if (exists ($otherGenes{$genes->{$chro}{$otherGene}{'name'}})); | |
590 if (overlapGenes($otherGene,$bestGene,$chro,$genes)) { | |
591 | |
592 if (($type eq "intragenic")||($type eq "immediateDownstream")) { | |
593 $typeIntra = &getTypeIntra($genes->{$chro}{$otherGene},$pos); | |
594 $typehashIntra{$typeIntra}++; | |
595 }else { | |
596 $typeIntra = "NA"; } | |
597 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$otherGene}{'reg'}\t$genes->{$chro}{$otherGene}{'foldChange'}\t",($pos-$genes->{$chro}{$otherGene}{'TE'})*$genes->{$chro}{$otherGene}{'strand'},"\t$otherGene\n" ; | |
598 push(@array2return,$type); | |
599 | |
600 #print "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$genes->{$chro}{$otherGene}{'reg'}\t$otherGene\n" ; | |
601 #print $type,"\n"; | |
602 $otherGenes{$genes->{$chro}{$otherGene}{'name'}} = 1; | |
603 } | |
604 } | |
605 } | |
606 | |
607 } | |
608 return @array2return; | |
609 } | |
610 | |
611 sub findAllClosestGeneOneLocation { | |
612 my ($fpos,$lpos,$score,$chro,$pos,$genes) = @_; | |
613 my ($distTSS,$type,$reg,$geneName,$foldChange) = ($BIGNUMBER,"intergenic",0,0,0); | |
614 my $locDistTSS = 0; | |
615 my $locDistTE = 0; | |
616 my %hashForOverlapingGenes; | |
617 my @array2return; | |
618 my $typeIntra = "NA"; | |
619 for my $gName (keys %{$genes->{$chro}}) { | |
620 my $TSS = $genes->{$chro}{$gName}{'TSS'}; | |
621 my $strand = $genes->{$chro}{$gName}{'strand'}; | |
622 my $TE = $genes->{$chro}{$gName}{'TE'}; | |
623 | |
624 $locDistTE = ($pos-$TE)*$strand; | |
625 $locDistTSS = ($pos-$TSS)*$strand; | |
626 #print "$pos-$genes->{$chro}{$gName}{'TSS'})*$genes->{$chro}{$gName}{'strand'} = $locDistTSS\n"; | |
627 #my $locDistTE = ($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'}; | |
628 | |
629 #print "$gName $maxDistance $pos-$genes->{$chro}{$gName}{'TSS'} $genes->{$chro}{$gName}{'strand'}\n"; | |
630 #print "$locDistTSS $enhLeft $enhRight\n"; | |
631 | |
632 if (($locDistTSS>= $enhLeft)&&($locDistTSS<$enhRight)) { | |
633 $type = "enhancer"; | |
634 } elsif (($locDistTSS>= $enhRight)&&($locDistTSS<=0)) { | |
635 $type = "promoter"; | |
636 } elsif (($locDistTSS> 0)&&($locDistTSS<=$immediateDownstream)&&($locDistTE<=0)) { | |
637 $type = "immediateDownstream"; | |
638 } elsif (($locDistTSS >= $immediateDownstream)&&($locDistTE<=0)) { | |
639 $type = "intragenic"; | |
640 } elsif (($locDistTE >= 0)&&($locDistTE<=$downStreamDist)) { | |
641 $type = "5kbDownstream"; | |
642 } elsif (abs($locDistTSS)<abs($distTSS)) { | |
643 $distTSS = $locDistTSS; | |
644 ($reg,$geneName,$foldChange) = ($genes->{$chro}{$gName}{'reg'},$gName,$genes->{$chro}{$gName}{'foldChange'}); | |
645 } | |
646 | |
647 if (($type ne "intergenic")&&($type ne "NA")) { | |
648 unless (exists ($hashForOverlapingGenes{$type})) { | |
649 my %h; | |
650 $hashForOverlapingGenes{$type} = \%h; | |
651 $hashForOverlapingGenes{$type}->{$gName}=$locDistTSS; | |
652 | |
653 if (($type eq "intragenic")||($type eq "immediateDownstream")) { | |
654 $typeIntra = &getTypeIntra($genes->{$chro}{$gName},$pos); | |
655 $typehashIntra{$typeIntra}++; | |
656 } | |
657 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$gName}{'reg'}\t$genes->{$chro}{$gName}{'fluo'}\t$genes->{$chro}{$gName}{'foldChange'}\t$genes{$chro}->{$gName}{'GCisland'}\t",($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'},"\t$gName\n" ; | |
658 push(@array2return,$type); | |
659 } | |
660 $typeIntra = "NA"; | |
661 $type = "NA"; | |
662 #unless ($gName =~ m/$TSS/) { | |
663 # print "$gName\t$TSS\n"; | |
664 #} | |
665 } | |
666 | |
667 } | |
668 if ($type eq "intergenic") { | |
669 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$geneName}{'reg'}\t$genes->{$chro}{$geneName}{'fluo'}\t$genes->{$chro}{$geneName}{'foldChange'}\t$genes{$chro}->{$geneName}{'GCisland'}\t",($pos-$genes->{$chro}{$geneName}{'TE'})*$genes->{$chro}{$geneName}{'strand'},"\t$geneName\n" ; | |
670 | |
671 #print "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$reg\t$geneName\n" ; | |
672 push(@array2return,$type); | |
673 | |
674 } | |
675 return @array2return; | |
676 } | |
677 | |
678 sub findAllClosestGene { | |
679 my ($fpos,$lpos,$score,$chro,$pos,$genes) = @_; | |
680 my ($distTSS,$type,$reg,$geneName,$foldChange) = ($BIGNUMBER,"intergenic",0,0,0); | |
681 my $locDistTSS = 0; | |
682 my $locDistTE = 0; | |
683 my %hashForOverlapingGenes; | |
684 my @array2return; | |
685 my $typeIntra = "NA"; | |
686 for my $gName (keys %{$genes->{$chro}}) { | |
687 my $TSS = $genes->{$chro}{$gName}{'TSS'}; | |
688 my $strand = $genes->{$chro}{$gName}{'strand'}; | |
689 my $TE = $genes->{$chro}{$gName}{'TE'}; | |
690 | |
691 $locDistTE = ($pos-$TE)*$strand; | |
692 $locDistTSS = ($pos-$TSS)*$strand; | |
693 #print "$pos-$genes->{$chro}{$gName}{'TSS'})*$genes->{$chro}{$gName}{'strand'} = $locDistTSS\n"; | |
694 #my $locDistTE = ($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'}; | |
695 | |
696 #print "$gName $maxDistance $pos-$genes->{$chro}{$gName}{'TSS'} $genes->{$chro}{$gName}{'strand'}\n"; | |
697 #print "$locDistTSS $enhLeft $enhRight\n"; | |
698 | |
699 if (($locDistTSS>= $enhLeft)&&($locDistTSS<$enhRight)) { | |
700 $type = "enhancer"; | |
701 } elsif (($locDistTSS>= $enhRight)&&($locDistTSS<=0)) { | |
702 $type = "promoter"; | |
703 } elsif (($locDistTSS> 0)&&($locDistTSS<=$immediateDownstream)&&($locDistTE<=0)) { | |
704 $type = "immediateDownstream"; | |
705 } elsif (($locDistTSS >= $immediateDownstream)&&($locDistTE<=0)) { | |
706 $type = "intragenic"; | |
707 } elsif (($locDistTE >= 0)&&($locDistTE<=$downStreamDist)) { | |
708 $type = "5kbDownstream"; | |
709 } elsif (abs($locDistTSS)<abs($distTSS)) { | |
710 $distTSS = $locDistTSS; | |
711 ($reg,$geneName,$foldChange) = ($genes->{$chro}{$gName}{'reg'},$gName,$genes->{$chro}{$gName}{'foldChange'}); | |
712 } | |
713 | |
714 if (($type ne "intergenic")&&($type ne "NA")) { | |
715 unless (exists ($hashForOverlapingGenes{$type})) { | |
716 my %h; | |
717 $hashForOverlapingGenes{$type} = \%h; | |
718 } | |
719 #$hashForOverlapingGenes{$type}->{$gName}=$locDistTSS; | |
720 | |
721 $typeIntra = "NA"; | |
722 if (($type eq "intragenic")||($type eq "immediateDownstream")) { | |
723 $typeIntra = &getTypeIntra($genes->{$chro}{$gName},$pos); | |
724 $typehashIntra{$typeIntra}++; | |
725 } | |
726 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$gName}{'reg'}\t$genes->{$chro}{$gName}{'fluo'}\t$genes->{$chro}{$gName}{'foldChange'}\t$genes{$chro}->{$gName}{'GCisland'}\t",($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'},"\t$gName\n" ; | |
727 push(@array2return,$type); | |
728 | |
729 $type = "NA"; | |
730 #unless ($gName =~ m/$TSS/) { | |
731 # print "$gName\t$TSS\n"; | |
732 #} | |
733 } | |
734 | |
735 } | |
736 if ($type eq "intergenic") { | |
737 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$geneName}{'reg'}\t$genes->{$chro}{$geneName}{'fluo'}\t$genes->{$chro}{$geneName}{'foldChange'}\t$genes{$chro}->{$geneName}{'GCisland'}\t",($pos-$genes->{$chro}{$geneName}{'TE'})*$genes->{$chro}{$geneName}{'strand'},"\t$geneName\n" ; | |
738 | |
739 #print "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$reg\t$geneName\n" ; | |
740 push(@array2return,$type); | |
741 | |
742 } | |
743 return @array2return; | |
744 } | |
745 | |
746 sub getTypeIntra { | |
747 | |
748 my ($geneEntry, $pos) = @_; | |
749 my $type; | |
750 | |
751 if (($pos >= $geneEntry->{'firstIntronStart'})&&($pos <=$geneEntry->{'firstIntronEnd'})) { | |
752 return "f_intron"; | |
753 } | |
754 if (($pos >= $geneEntry->{'firstExonStart'})&&($pos <=$geneEntry->{'firstExonEnd'})) { | |
755 return "f_exon"; | |
756 } | |
757 $type = getIntronExon ($pos, $geneEntry->{'exonCount'},$geneEntry->{'exonStarts'},$geneEntry->{'exonEnds'},$geneEntry->{'strand'}); | |
758 return $type; | |
759 } | |
760 | |
761 sub overlapGenes { | |
762 my ($otherGene,$bestGene,$chro,$genes)= @_; | |
763 my $a1 = $genes->{$chro}{$otherGene}{'left'}; | |
764 my $a2 = $genes->{$chro}{$bestGene}{'left'}; | |
765 my $e1 = $genes->{$chro}{$otherGene}{'right'}; | |
766 my $e2 = $genes->{$chro}{$bestGene}{'right'}; | |
767 if (($a1 >= $a2)&&($a1 <= $e2)) { | |
768 return 1; | |
769 } | |
770 if (($a2 >= $a1)&&($a2 <= $e1)) { | |
771 return 1; | |
772 } | |
773 return 0; | |
774 } | |
775 | |
776 sub printBest { | |
777 my ($hash,$chro,$fpos,$lpos,$pos,$score,$genes,$type) = @_; | |
778 for my $locDistTSS (sort {$a<=>$b} keys %{$hash}) { | |
779 my $gName = $hash->{$locDistTSS}; | |
780 my $typeIntra = "NA"; | |
781 if (($type eq "intragenic")||($type eq "immediateDownstream")) { | |
782 $typeIntra = &getTypeIntra($genes->{$chro}{$gName},$pos); | |
783 $typehashIntra{$typeIntra}++; | |
784 } | |
785 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$gName}{'reg'}\t$genes->{$chro}{$gName}{'foldChange'}\t",($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'},"\t$gName\n" ; | |
786 return $gName; | |
787 } | |
788 } | |
789 | |
790 #sub findAllClosestGene { | |
791 # my ($fpos,$lpos,$score,$chro,$pos,$genes) = @_; | |
792 # my ($distTSS,$type,$reg,$geneName,$foldChange) = ($BIGNUMBER,"intergenic",0,0,0); | |
793 # my $locDistTSS = 0; | |
794 # my $locDistTE = 0; | |
795 # my @array2return; | |
796 # for my $gName (keys %{$genes->{$chro}}) { | |
797 # my $TSS = $genes->{$chro}{$gName}{'TSS'}; | |
798 # my $strand = $genes->{$chro}{$gName}{'strand'}; | |
799 # my $TE = $genes->{$chro}{$gName}{'TE'}; | |
800 # $locDistTSS = ($pos-$TSS)*$strand; | |
801 # $locDistTE = ($pos-$TE)*$strand; | |
802 # #print "$pos-$genes->{$chro}{$gName}{'TSS'})*$genes->{$chro}{$gName}{'strand'} = $locDistTSS\n"; | |
803 # #my $locDistTE = ($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'}; | |
804 # | |
805 # #print "$gName $maxDistance $pos-$genes->{$chro}{$gName}{'TSS'} $genes->{$chro}{$gName}{'strand'}\n"; | |
806 # #print "$locDistTSS $enhLeft $enhRight\n"; | |
807 # | |
808 # if (($locDistTSS>= $enhLeft)&&($locDistTSS<$enhRight)) { | |
809 # $type = "enhancer"; | |
810 # } elsif (($locDistTSS>= $enhRight)&&($locDistTSS<=0)) { | |
811 # $type = "promoter"; | |
812 # } elsif (($locDistTSS> 0)&&($locDistTSS<=$immediateDownstream)) { | |
813 # $type = "immediateDownstream"; | |
814 # } elsif (($pos >= $genes{$chro}->{$gName}{'firstIntronStart'})&&($pos <= $genes{$chro}->{$gName}{'firstIntronEnd'})) { | |
815 # $type = "firstIntron"; | |
816 # } elsif (($locDistTSS> $immediateDownstream)&&($locDistTSS<=$genes{$chro}->{$gName}{'length'})) { | |
817 # #$type = "intragenic"; | |
818 # $type = &getIntronExon ($pos, $genes{$chro}->{$gName}{'exonCount'},$genes{$chro}->{$gName}{'exonStarts'},$genes{$chro}->{$gName}{'exonEnds'},$genes{$chro}->{$gName}{'strand'}); | |
819 # } elsif (($locDistTE >= 0)&&($locDistTE<=$downStreamDist)) { | |
820 # $type = "5kbDownstream"; | |
821 # } elsif (abs($locDistTSS)<abs($distTSS)) { | |
822 # $distTSS = $locDistTSS; | |
823 # ($reg,$geneName,$foldChange) = ($genes->{$chro}{$gName}{'reg'},$gName,$genes->{$chro}{$gName}{'foldChange'}); | |
824 # } | |
825 # if (($type ne "intergenic")&&($type ne "NA")) { | |
826 # print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$genes->{$chro}{$gName}{'reg'}\t$genes->{$chro}{$gName}{'foldChange'}\t$gName\n" ; | |
827 # push(@array2return,$type); | |
828 # $type = "NA"; | |
829 # #unless ($gName =~ m/$TSS/) { | |
830 # # print "$gName\t$TSS\n"; | |
831 # #} | |
832 # } | |
833 # | |
834 # } | |
835 # if ($type eq "intergenic") { | |
836 # print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$reg\t$foldChange\t$geneName\n" ; | |
837 # push(@array2return,$type); | |
838 # #print "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$reg\t$geneName\n" ; | |
839 # | |
840 # } | |
841 # return @array2return; | |
842 #} | |
843 | |
844 sub getFirstIntron { | |
845 my ($exonCount,$exonStarts,$exonEnds,$strand) = @_; | |
846 my ($left,$right); | |
847 if ($exonCount == 1) { | |
848 return (0,0); | |
849 } | |
850 if ($strand == 1) { | |
851 $left = (split ",", $exonEnds)[0]+$jonctionSize; | |
852 $right = (split (",", $exonStarts))[1]-$jonctionSize; | |
853 } else { | |
854 $left = (split (",", $exonEnds))[$exonCount-2]+$jonctionSize; | |
855 $right = (split (",", $exonStarts))[$exonCount-1]-$jonctionSize; | |
856 } | |
857 ($left,$right); | |
858 } | |
859 | |
860 sub getFirstExon { | |
861 my ($exonCount,$exonStarts,$exonEnds,$strand) = @_; | |
862 my ($left,$right); | |
863 if ($exonCount == 1) { | |
864 return (0,0); | |
865 } | |
866 if ($strand == 1) { | |
867 $left = (split ",", $exonStarts)[0]; | |
868 $right = (split (",", $exonEnds))[0]-$jonctionSize; | |
869 } else { | |
870 $left = (split (",", $exonStarts))[$exonCount-1]+$jonctionSize; | |
871 $right = (split (",", $exonEnds))[$exonCount-1]; | |
872 } | |
873 ($left,$right); | |
874 } | |
875 | |
876 sub getIntronExon { | |
877 my ($pos,$exonCount,$exonStarts,$exonEnds,$strand) = @_; | |
878 my (@left,@right); | |
879 @left = (split ",", $exonStarts); | |
880 @right = (split (",", $exonEnds)); | |
881 | |
882 for (my $i = 0; $i<$exonCount;$i++) { | |
883 #print "$left[$i] <= $pos ? $pos <= $right[$i]\n"; | |
884 if (($left[$i]+$jonctionSize < $pos) && ($pos < $right[$i]-$jonctionSize)) { | |
885 #print "URA!\n"; | |
886 return "exon"; | |
887 } elsif (($i+1<$exonCount)&&($right[$i]+$jonctionSize < $pos) && ($pos < $left[$i+1]-$jonctionSize)) { | |
888 return "intron"; | |
889 } | |
890 } | |
891 return "jonction"; | |
892 } | |
893 | |
894 sub min { | |
895 return $_[0] if ($_[0]<$_[1]); | |
896 $_[1]; | |
897 } | |
898 sub med { | |
899 my @arr = @_; | |
900 my $med = 0; | |
901 @arr = sort {$a <=> $b} @arr; | |
902 if ((scalar(@arr)/2) =~ m/[\.\,]5/) { | |
903 return $arr[floor(scalar(@arr)/2)]; | |
904 } else { | |
905 return ($arr[scalar(@arr)/2]+$arr[scalar(@arr)/2-1])/2; | |
906 } | |
907 $med; | |
908 } | |
909 | |
910 sub checkIfGC { | |
911 my ($TSS,$strand,$dist,$GCislandsChr)=@_; | |
912 my $ifGC = 0; | |
913 my $leftProm=$TSS-$dist; | |
914 my $rightProm = $TSS; | |
915 if ($strand== -1) { | |
916 my $leftProm=$TSS; | |
917 my $rightProm = $TSS+$dist; | |
918 } | |
919 for my $leftGC (keys %{$GCislandsChr}) { | |
920 my $rightGC = $GCislandsChr->{$leftGC}; | |
921 if ($leftGC>=$leftProm&&$leftGC<=$rightProm || $rightGC>=$leftProm&&$rightGC<=$rightProm) { | |
922 return "GC-island"; | |
923 } | |
924 } | |
925 return $ifGC ; | |
926 } |