0
|
1 #:t:::::::::::::::::g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
|
|
2 #:t::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
|
|
3 #:::::::::::::z;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
|
|
4 #::::::::::::i@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
|
|
5 #::::::::::::@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@$@@@@
|
|
6 #:::::::::::3@@@@@@@@@@@@@@@@@@@@@@@@@B@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
|
|
7 #::::::::::3@@@@@@@@@@@@@@@@@@@@@BEEESSE5EEEEBBM@@@@@@@@@@@@@@@@@@@@@@@@@@
|
|
8 #::::::::::3@@@@@@@@@@@@@@@@@@@@BEEEEEE35EE55E2355E5SBMB@@@@@@@@@@@@@@@@@$
|
|
9 #::::::::::@@@@@@@@@@@@@@@@@@@EEEE55533t3tttt::::::!!!!7755E755SBBMMM@@@MM
|
|
10 #::::::::::3@@@@@@@@@@@@@@@@@@EEEE2t3ttttt:::::::::::::::::::::::!7?5225EE
|
|
11 #::::::::::3@@@@@@@@@@@@@@@@@@EEEEE31t::::::::::::::::::::::::::::::::3E5@
|
|
12 #::::::::::3@@@@@@@@@@@@@@@@@@EEEEEEtt:::::::::::::::::::::::::::::::::353
|
|
13 #::::::::::3@@@@@@@@@@@@@@@@@@EEEEEE1ttz::::::::::::::::::::::::::::::::35
|
|
14 #:::::::::::@@@@@@@@@@@@@@@@@@EEEEEEEtz1::::::::::::::::::::::::::::::::t:
|
|
15 #:::::::::!3@@@@@@@@@@@@@@@@@@@EEEEEttt::::::::::::::::::::::::::::::::;zz
|
|
16 #::::::::::@@@@@@@@@@@@@@@@@@@@EEEEEttt:::::z;z:::::::::::::::::::::::::13
|
|
17 #::::::::::3B@@@@@@@@@@@@@@@@@@EEEEEEE3tt:czzztti;:::::::::::::::::::::::3
|
|
18 #::::ttt::::3@@@@@@@@@@@@@@@@EEEEE5EE25Ezt1EEEz5Etzzz;;;;:::::::::::::::::
|
|
19 #:::::::::::I9@@@@@@@@@@@@@@@@@@@@@@@@@@EEEEEE@@@@@@@@@@@@@@Ez;:::::::::::
|
|
20 #:::::::::::::E@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@Ez::::::
|
|
21 #::::::::::::::E@@@@@@@@@@@@@@@@@@@@@@@@@@@@@BE5EBB@@@@@@@@@@@@@@@EEE:::::
|
|
22 #:::::::::::::::@@@@@@@@@@@@@@@@@@@@@@@@@@@@E1::35@@@@@@@@@@ME3MMME2::::::
|
|
23 #:::::::::::::::?@@@@@@@@@@@@@@@@@@M@@@@@@@EE:::::3SB@@BBESEEt::::::::::::
|
|
24 #::::::::::::::::J$@@@@@@@B@@@@@@@@@@@@@@@@EE:::::::!35E33t:::::::::::::::
|
|
25 #:::::::::::::::::3@E@@@EE5EESE5EESE@@@@@@@Et::::::::::::tz:::::::::::::::
|
|
26 #:::::::::::::::::J@E$@EEE5133555SE@@@@@@@@Et:::::::::::::::::::::::::::::
|
|
27 #::::::::::::::::::E@E@EEEEtt3523EEE@@@@@@@E::::::::::::::::::::::::::::::
|
|
28 #:t::::::::::::::::JEE3@@@EEEEEEEEEE@@@@@@@E:::::::::t;:::::::::::::::::::
|
|
29 #:t:::::::::::::::::!5ES@EEEEEEEEES@@@@@@@@@E;:::;;;:3Ez::::::::::::::::::
|
|
30 #:t::::::::::::::::::::JE@@EEEEEEE@@@@@@@@@@@@@@@@ME!:::;:::::::::::::::::
|
|
31 #:tz::::::::::::::::::::JE@@@EEEE@@@@@@@@@@@@@@EE!:::::::t::::::::::::::::
|
|
32 #:t::::::::::::::::::::::3@@@@@@@@@@@@@@@@@@ESBE::::::::::::::::::::::::::
|
|
33 #:::::::::::::::::::::::::Q@@@@@@@@@@@@@@@@EE3EE;:::::zzzz::::::::::::::::
|
|
34 #:::::::::::::::::::::::::3@@@@@@@@@@@@@@@@@@@@@@NN@@@@@@Ez:::::::::::::::
|
|
35 #:zt:::::::::::::::::::::::3@@@@EE@@@@@@@@@@EEEEt::;z113E5t:::::::::::::::
|
|
36 #::tt:::::::::::::::::::::::3@@@E@@@@@@@@@@@@@@@@BEt::::::::::::::::t:::::
|
|
37 #:tt:t:::::::::::::::::::::::?S@@@@@@@@@@@BBEEE51!::::::::::::::zzzEt:::::
|
|
38 #::::::::::::::::::::::::::::::3Q@@@@@@@BEEEEEt:::::::::::::;zz@@@EE::::::
|
|
39 #::::::::::::::::::::::::::::::::75B@@@@@EEEtt;:::::::::;zz@@@@BEEEtz:::::
|
|
40 #::::::::::::::::::::::::::::::::::::?9@@@@@@@@@@@E2Ezg@@@@@B@@@EEEE1t::::
|
|
41 #:::::::::::::::::::::::::::::::::::::::3@@@@@@@@@@@@@@@@@@@E@EEEEEEEzzz::
|
|
42 #::::::::::::::::::::::::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@EEEEEEE5ttttt
|
|
43 #:::::::::::::::::::::::::::::::;g@@@@@@@@@@@@@@@@@@@@@@@@@@EEEEEEEEEEEtzt
|
|
44 #::::::::::::::::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@E@@EEEEEEEEEEEE@@@
|
|
45 #::::::::::::::::::::::::::g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@EEEE3EEEE@@@@@@@
|
|
46 #:::::::::::::::::::::;;g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@EEEt33@@@@@@@@@@
|
|
47 #:::::::::::::::::;g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@E@@@@@@EEEtg@@@@@@@@@@@@
|
|
48 #::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@EEEE@@@@@@@@@@@@@@@@@@@@@@@@
|
|
49 #:::::::::::::@@@@@@@@@@@@@@@@@$@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
|
|
50 #::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
|
|
51 #
|
|
52 # Copyleft ↄ⃝ 2012 Institut Curie
|
|
53 # Author(s): Valentina Boeva, Alban Lermine (Institut Curie) 2012
|
|
54 # Contact: valentina.boeva@curie.fr, alban.lermine@curie.fr
|
|
55 # This software is distributed under the terms of the GNU General
|
|
56 # Public License, either Version 2, June 1991 or Version 3, June 2007.
|
|
57
|
|
58 #!/usr/bin/perl -w
|
|
59
|
|
60 #for a complete list of genes and a list of sites, sorts genes close to sites
|
|
61 #printDistance
|
|
62
|
|
63 #read only gene files with refSeqGenes
|
|
64 #counts only once overlapping transcripts or genes
|
|
65
|
|
66 #calculates some stats about locations of peaks
|
|
67
|
|
68 #uses hierarchie : promoter, imUpstream, intragenic,enh, 5kbdownstream, intergenic + for genes: fExon,exon,fIntron,intron,junction+-50kb ONLY FOR "findClosestGene"
|
|
69 #outputs fold change of expression is available
|
|
70
|
|
71 #outputname - corrected
|
|
72 #all isoforms are considered, even those that start and end at the same coordinates
|
|
73 #uses fluorescence values
|
|
74
|
|
75 #only one location per gene in mode "all" (but promoter+intragenic if both)
|
|
76
|
|
77 #read directly Noresp/down/up genes
|
|
78
|
|
79 #prints distance to TE
|
|
80
|
|
81 use strict;
|
|
82 use POSIX;
|
|
83
|
|
84 my $SitesFilename ;
|
|
85 my $GenesFilename ;
|
|
86 my $SelectedGenesFilename = "";
|
|
87 my $MirFilename = "";
|
|
88 my $minScore = 0;
|
|
89 my $BIGNUMBER = 10000000000;
|
|
90 my $enhLeft = -30000;
|
|
91 my $enhRight = -1500;
|
|
92 my $promLeft = $enhRight ;
|
|
93 my $immediateDownstream = 2000;
|
|
94 my $maxScore = $BIGNUMBER ;
|
|
95 my $verbose = 0;
|
|
96 my $maxDistance = $BIGNUMBER ;
|
|
97 my $downStreamDist = 5000;
|
|
98 my $jonctionSize = 50;
|
|
99 my $all = 0;
|
|
100 my $fluoFile = "";
|
|
101 my $regType = "";
|
|
102 my $GCislands = "";
|
|
103 my $ResFilename = "";
|
|
104
|
|
105 my $usage = qq{
|
|
106 $0
|
|
107
|
|
108 -----------------------------
|
|
109 mandatory parameters:
|
|
110
|
|
111 -g filename file with all genes
|
|
112 -tf filename file with sites of TF
|
|
113
|
|
114 -----------------------------
|
|
115 optional parameters:
|
|
116 -v for verbose
|
|
117 -mir filename file with positions of miRNA
|
|
118 -maxDist value maximal Distance to genes (def. Inf)
|
|
119 -minScore value minimal Score (def. 0)
|
|
120 -maxScore value maximal Score (def. Inf)
|
|
121 -selG filename file with selected genes and their expression levels
|
|
122 -all to output all genes intersecting with the peak
|
|
123 -fluo filename file with fluorescence
|
|
124 -regType valeu down-regulated, up-regulated, no-response
|
|
125 -gc filename file with gc-islands
|
|
126
|
|
127 -lp valeu upstream of TSS region to define promoter
|
|
128 -rp valeu downstream of TSS region to define immediate downstream
|
|
129 -enh valeu upstream of TSS region to define enhancer
|
|
130 -dg valeu downstream of transcription end region to define gene downstream
|
|
131
|
|
132 -o filename output filename
|
|
133
|
|
134 };
|
|
135
|
|
136 if(scalar(@ARGV) == 0){
|
|
137 print $usage;
|
|
138 exit(0);
|
|
139 }
|
|
140
|
|
141
|
|
142 while(scalar(@ARGV) > 0){
|
|
143 my $this_arg = shift @ARGV;
|
|
144 if ( $this_arg eq '-h') {print "$usage\n"; exit; }
|
|
145
|
|
146 elsif ( $this_arg eq '-g') {$GenesFilename = shift @ARGV;}
|
|
147 elsif ( $this_arg eq '-tf') {$SitesFilename = shift @ARGV;}
|
|
148 elsif ( $this_arg eq '-mir') {$MirFilename = shift @ARGV;}
|
|
149 elsif ( $this_arg eq '-selG') {$SelectedGenesFilename = shift @ARGV;}
|
|
150 elsif ( $this_arg eq '-maxDist') {$maxDistance = 1;}
|
|
151 elsif ( $this_arg eq '-maxScore') {$maxScore = shift @ARGV;}
|
|
152 elsif ( $this_arg eq '-minScore') {$minScore = shift @ARGV;}
|
|
153 elsif ( $this_arg eq '-v') {$verbose = 1;}
|
|
154 elsif ( $this_arg eq '-all') {$all = 1;}
|
|
155 elsif ( $this_arg eq '-regType') {$regType = shift @ARGV;}
|
|
156
|
|
157 elsif ( $this_arg eq '-fluo') {$fluoFile = shift @ARGV;}
|
|
158 elsif ( $this_arg eq '-gc') {$GCislands = shift @ARGV;}
|
|
159 elsif ( $this_arg eq '-o') {$ResFilename = shift @ARGV;}
|
|
160
|
|
161 elsif ( $this_arg eq '-lp') {$enhRight = shift @ARGV;$promLeft = $enhRight;}
|
|
162 elsif ( $this_arg eq '-rp') {$immediateDownstream = shift @ARGV;}
|
|
163 elsif ( $this_arg eq '-enh') {$enhLeft = shift @ARGV;}
|
|
164 elsif ( $this_arg eq '-dg') {$downStreamDist = shift @ARGV;}
|
|
165
|
|
166 elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";}
|
|
167 }
|
|
168
|
|
169 if ($maxDistance<0) {
|
|
170 die "\tThe maximal distance must be positive!\n";
|
|
171 }
|
|
172 my $tmpn = $SitesFilename;
|
|
173 $tmpn =~ s/.*[\/\\]//;
|
|
174 if ($ResFilename eq "") {
|
|
175 $ResFilename = $SelectedGenesFilename.$tmpn."_".$minScore."_dists.txt";
|
|
176
|
|
177 if ($maxScore != $BIGNUMBER) {
|
|
178 $ResFilename = $SelectedGenesFilename.$SitesFilename."_".$minScore."_$maxScore"."_dists.txt";
|
|
179 }
|
|
180
|
|
181 if ($all == 1) {
|
|
182 $ResFilename =~ s/_dists/_all_dists/;
|
|
183 }
|
|
184 }
|
|
185
|
|
186
|
|
187 #-----------read selected genes----------------
|
|
188 my %selectedGenes;
|
|
189 my %selectedGenesFoldChange;
|
|
190 if ( $SelectedGenesFilename ne "") {
|
|
191 open (FILE, "<$SelectedGenesFilename ") or die "Cannot open file $SelectedGenesFilename !!!!: $!";
|
|
192 while (<FILE>) {
|
|
193 chomp;
|
|
194 my @a = split/\t/;
|
|
195 $selectedGenes{$a[1]} = $a[3];
|
|
196 $selectedGenesFoldChange{$a[1]} = $a[2];
|
|
197 #print "gene:$a[1],reg:$selectedGenes{$a[1]},FC:$selectedGenesFoldChange{$a[1]}\n";
|
|
198 }
|
|
199
|
|
200 close FILE;
|
|
201 print "\t\t$SelectedGenesFilename is read!\n" if ($verbose);
|
|
202 }
|
|
203
|
|
204 #-----------read genes with fluorescence---------
|
|
205 my %fluoGenes;
|
|
206 if ( $fluoFile ne "") {
|
|
207 open (FILE, "<$fluoFile ") or die "Cannot open file $fluoFile !!!!: $!";
|
|
208 my $gene = "";
|
|
209 my $med = 0;
|
|
210 my %h;
|
|
211 while (<FILE>) {
|
|
212 chomp;
|
|
213 my @a = split/\t/;
|
|
214
|
|
215 next if (scalar(@a)<5);
|
|
216 next if ($a[0] eq "probesets");
|
|
217 next unless ($a[0] =~m/\S/);
|
|
218 next unless ($a[4] =~m/\S/);
|
|
219 if ($gene ne "" && $gene ne $a[2]) {
|
|
220 #calcMed
|
|
221 $med = med(keys %h);
|
|
222 $fluoGenes{$gene} = $med;
|
|
223 $med=0;
|
|
224 delete @h{keys %h};
|
|
225 } else {
|
|
226 #$h{$a[4]} = 1;
|
|
227 }
|
|
228 $gene = $a[2];
|
|
229 $h{$a[4]} = 1;
|
|
230 #print "keys ", scalar(keys %h),"\t",keys %h,"\n";
|
|
231 }
|
|
232 if ($gene ne "") {
|
|
233 $med = med(keys %h);
|
|
234 $fluoGenes{$gene} = $med;
|
|
235 }
|
|
236 close FILE;
|
|
237 print "\t\t$fluoFile is read!\n" if ($verbose);
|
|
238 }
|
|
239 #-----------read GC-islands----------------
|
|
240 my %GCislands;
|
|
241 if ($GCislands ne "") {
|
|
242 open (FILE, "<$GCislands ") or die "Cannot open file $GCislands !!!!: $!";
|
|
243
|
|
244 while (<FILE>) {
|
|
245 chomp;
|
|
246 my @a = split/\t/;
|
|
247 #bin chrom chromStart chromEnd name length cpgNum gcNum perCpg perGc obsExp
|
|
248 #107 chr1 36568608 36569851 CpG: 128 1243 128 766 20.6 61.6 1.09
|
|
249 my $chr = $a[1];
|
|
250 my $start = $a[2];
|
|
251 my $end = $a[3];
|
|
252 $GCislands{$chr}->{$start}=$end;
|
|
253 }
|
|
254 close FILE;
|
|
255 } elsif ($verbose) {
|
|
256 print "you did not specify a file with GC-islands\n";
|
|
257 }
|
|
258
|
|
259 #-----------read genes----------------
|
|
260
|
|
261 my %genes;
|
|
262
|
|
263 my $count = 0;
|
|
264
|
|
265 open (GENES, "<$GenesFilename") or die "Cannot open file $GenesFilename!!!!: $!";
|
|
266 <GENES>;
|
|
267 while (<GENES>) {
|
|
268 chomp;
|
|
269 if (/(chr.*)\s([+-])\s(\d+)\s(\d+)\s(\d+)\s(\d+)\s(\d+)\s(\S+)\s(\S+)\s\S+\s(\S+)/){
|
|
270 my $name = $10;
|
|
271 my $chr = $1;
|
|
272 my $strand = $2;
|
|
273 if ($strand eq '+') {
|
|
274 $strand = 1;
|
|
275 }
|
|
276 else {
|
|
277 $strand = -1;
|
|
278 }
|
|
279 my $leftPos = $3;
|
|
280 my $rightPos = $4;
|
|
281 my $cdsStart= $5;
|
|
282 my $cdsEnd= $6;
|
|
283 my $exonCount= $7;
|
|
284 my $exonStarts= $8;
|
|
285 my $exonEnds= $9;
|
|
286 my $ID = "$name\t$chr:$leftPos-$rightPos;$exonStarts-$exonEnds";
|
|
287 my $foldChange = 1;
|
|
288 my $reg = "NA";
|
|
289 my $fluo = "NA";
|
|
290 if ( $SelectedGenesFilename ne "") {
|
|
291 #print "$name\t";
|
|
292 if (exists($selectedGenes{$name})) {
|
|
293 $reg = $selectedGenes{$name};
|
|
294 $foldChange = $selectedGenesFoldChange{$name};
|
|
295 }
|
|
296 }
|
|
297 unless (exists($genes{$chr})) {
|
|
298 my %h;
|
|
299 $genes{$chr} = \%h;
|
|
300 }
|
|
301 unless (exists($genes{$chr}->{$ID})) {
|
|
302 my %h1;
|
|
303 $genes{$chr}->{$ID} = \%h1;
|
|
304 $count++;
|
|
305 }
|
|
306 if ( $fluoFile ne "") {
|
|
307 if (exists($fluoGenes{$name})) {
|
|
308 $fluo = $fluoGenes{$name};
|
|
309 }
|
|
310 }
|
|
311 $genes{$chr}->{$ID}{'name'} = $name ;
|
|
312 $genes{$chr}->{$ID}{'left'} = $leftPos ;
|
|
313 $genes{$chr}->{$ID}{'right'} = $rightPos ;
|
|
314 $genes{$chr}->{$ID}{'cdsStart'} = $cdsStart;
|
|
315 $genes{$chr}->{$ID}{'cdsEnd'} = $cdsEnd;
|
|
316 $genes{$chr}->{$ID}{'strand'} = $strand;
|
|
317 $genes{$chr}->{$ID}{'exonCount'} = $exonCount;
|
|
318 $genes{$chr}->{$ID}{'exonStarts'} = $exonStarts;
|
|
319 $genes{$chr}->{$ID}{'exonEnds'} = $exonEnds;
|
|
320 $genes{$chr}->{$ID}{'TSS'} = ($strand == 1) ? $leftPos :$rightPos ;
|
|
321 $genes{$chr}->{$ID}{'TE'} = ($strand == -1) ? $leftPos :$rightPos ;
|
|
322 $genes{$chr}->{$ID}{'reg'} = $reg;
|
|
323 $genes{$chr}->{$ID}{'foldChange'} = $foldChange;
|
|
324 $genes{$chr}->{$ID}{'length'} = abs ($leftPos-$rightPos);
|
|
325 ($genes{$chr}->{$ID}{'firstIntronStart'},$genes{$chr}->{$ID}{'firstIntronEnd'}) = getFirstIntron ($exonCount,$exonStarts,$exonEnds,$strand);
|
|
326 ($genes{$chr}->{$ID}{'firstExonStart'},$genes{$chr}->{$ID}{'firstExonEnd'}) = getFirstExon ($exonCount,$exonStarts,$exonEnds,$strand);
|
|
327
|
|
328 $genes{$chr}->{$ID}{'fluo'} = $fluo;
|
|
329
|
|
330 $genes{$chr}->{$ID}{'GCisland'} = 0;
|
|
331 if ($GCislands ne "") {
|
|
332 $genes{$chr}->{$ID}{'GCisland'} = checkIfGC ($genes{$chr}->{$ID}{'TSS'},$strand,2000,$GCislands{$chr});
|
|
333 }
|
|
334
|
|
335 }
|
|
336 }
|
|
337 print "Total genes : $count\n";
|
|
338 close GENES;
|
|
339 print "\t\t$GenesFilename is read!\n" if ($verbose);
|
|
340 #for my $gName (sort keys %{$genes{'chr18'}}) {
|
|
341
|
|
342 # print "$gName\t$genes{'chr18'}->{$gName}{'TSS'}\n";
|
|
343 #}
|
|
344
|
|
345 #-----------read file with sites miRNA, store as genes-----
|
|
346
|
|
347 if ( $MirFilename eq ""){
|
|
348 print "you did not specify file with miRNA\n" if ($verbose);;
|
|
349 }
|
|
350 else {
|
|
351
|
|
352
|
|
353 open (MIR, "<$MirFilename ") or die "Cannot open file $MirFilename !!!!: $!";
|
|
354 #chr1 20669090 20669163 mmu-mir-206 960 +
|
|
355 while (<MIR>) {
|
|
356 chomp;
|
|
357 my ($name, $chr, $leftPos, $rightPos, $strand );
|
|
358 my $fluo = "NA";
|
|
359
|
|
360 #1 . miRNA 20669091 20669163 . + . ACC="MI0000249"; ID="mmu-mir-206";
|
|
361 if (/([0-9XYM]+)\s.\smiRNA\s(\d+)\s(\d+)\s.\s([+-])\s.\sACC=.*ID=\"(.*)\"/) {
|
|
362 $name = $5;
|
|
363 $chr = $1;
|
|
364 $leftPos = $2;
|
|
365 $rightPos = $3;
|
|
366 $strand = $4;
|
|
367 }
|
|
368 elsif (/(.*)\s(\d+)\s(\d+)\s(.*)\s(.*)\s(.*)/){
|
|
369 $name = $4;
|
|
370 $chr = $1;
|
|
371 $leftPos = $2;
|
|
372 $rightPos = $3;
|
|
373 $strand = $6;
|
|
374 } else {
|
|
375 next;
|
|
376 }
|
|
377
|
|
378 unless ($chr =~ m/chr/) {
|
|
379 $chr = "chr".$chr;
|
|
380 }
|
|
381 my $ID = "$name\t$chr:$leftPos-$rightPos";
|
|
382
|
|
383 if ($strand eq '+') {
|
|
384 $strand = 1;
|
|
385 }
|
|
386 else {
|
|
387 $strand = -1;
|
|
388 }
|
|
389
|
|
390 unless (exists($genes{$chr})) {
|
|
391 my %h;
|
|
392 $genes{$chr} = \%h;
|
|
393 }
|
|
394 unless (exists($genes{$chr}->{$ID})) {
|
|
395 my %h1;
|
|
396 $genes{$chr}->{$ID} = \%h1;
|
|
397 $count++;
|
|
398 }
|
|
399 $genes{$chr}->{$ID}{'name'} = $name ;
|
|
400 $genes{$chr}->{$ID}{'left'} = $leftPos ;
|
|
401 $genes{$chr}->{$ID}{'right'} = $rightPos ;
|
|
402 $genes{$chr}->{$ID}{'cdsStart'} = $leftPos ;
|
|
403 $genes{$chr}->{$ID}{'cdsEnd'} = $rightPos ;
|
|
404 $genes{$chr}->{$ID}{'strand'} = $strand;
|
|
405 $genes{$chr}->{$ID}{'length'} = abs ($leftPos-$rightPos);
|
|
406 $genes{$chr}->{$ID}{'exonCount'} = 1;
|
|
407 $genes{$chr}->{$ID}{'exonStarts'} = $leftPos ;
|
|
408 $genes{$chr}->{$ID}{'exonEnds'} = $rightPos ;
|
|
409 $genes{$chr}->{$ID}{'TSS'} = ($strand == 1) ? $leftPos :$rightPos ;
|
|
410 $genes{$chr}->{$ID}{'TE'} = ($strand == -1) ? $leftPos :$rightPos ;
|
|
411 $genes{$chr}->{$ID}{'reg'} = "miRNA";
|
|
412 $genes{$chr}->{$ID}{'foldChange'} = 1;
|
|
413 ($genes{$chr}->{$ID}{'firstIntronStart'},$genes{$chr}->{$ID}{'firstIntronEnd'}) = (0,0);
|
|
414 ($genes{$chr}->{$ID}{'firstExonStart'},$genes{$chr}->{$ID}{'firstExonEnd'}) = (0,0);
|
|
415
|
|
416 $genes{$chr}->{$ID}{'fluo'} = $fluo;
|
|
417 $genes{$chr}->{$ID}{'GCisland'} = 0;
|
|
418
|
|
419
|
|
420
|
|
421 }
|
|
422
|
|
423
|
|
424 close MIR;
|
|
425 print "\t\t$MirFilename is read!\n" if ($verbose) ;
|
|
426 }
|
|
427 #-----------read file with sites and find N closest genes-----
|
|
428 my $numberOfAllSites = 0;
|
|
429
|
|
430 open (FILE, "<$SitesFilename") or die "Cannot open file $SitesFilename!!!!: $!";
|
|
431 open (OUT , ">$ResFilename") or die "Cannot open file $ResFilename!!!!: $!";
|
|
432 print OUT "Chromosome\tStart\tEnd\tMax\tScore\tDistTSS\tType\tTypeIntra\tReg\tFoldChange\tDistTE\tGeneName\tGeneCoordinates\n" ;
|
|
433 my $header = <FILE>; #read header
|
|
434 my @a = split (/\t/,$header );
|
|
435 my $correction = 0;
|
|
436 if ($a[1] =~m/chr/) {
|
|
437 $correction=1;
|
|
438 }
|
|
439
|
|
440 unless ($header=~m/Chromosome/ || $header=~m/track/|| $header=~m/Start/i) {
|
|
441 close FILE;
|
|
442 open (FILE, "<$SitesFilename") or die "Cannot open file $SitesFilename!!!!: $!";
|
|
443 }
|
|
444
|
|
445 $count = 0;
|
|
446
|
|
447 my %typehash;
|
|
448 my %typehashIntra;
|
|
449
|
|
450 $typehashIntra{"f_exon"} = 0;
|
|
451 $typehashIntra{"f_intron"} = 0;
|
|
452 $typehashIntra{"jonction"} = 0;
|
|
453 $typehashIntra{"exon"} = 0;
|
|
454 $typehashIntra{"intron"} = 0;
|
|
455
|
|
456
|
|
457 while (<FILE>) {
|
|
458 chomp;
|
|
459 $numberOfAllSites++;
|
|
460 my @a = split /\t/;
|
|
461 my $chro = $a[0+$correction];
|
|
462 my $fpos = $a[1+$correction];
|
|
463 my $lpos = $a[2+$correction];
|
|
464 my $maxpos = $a[3+$correction];
|
|
465 if ($maxpos =~ m/\D/) {
|
|
466 $maxpos = int(($lpos+$fpos)/2);
|
|
467 }
|
|
468 my $score = $a[4+$correction];
|
|
469 next if ($score < $minScore);
|
|
470 next if ($score >= $maxScore);
|
|
471 $score = $score +1 -1;
|
|
472 #print "$score" ;
|
|
473
|
|
474 #my $site = "$chro:$fpos-$lpos\t$maxpos\t$score";
|
|
475 #print $site ;
|
|
476 #my $motifNumber = $a[3];
|
|
477
|
|
478 #my $geneSet = &search_genes($N,$chro,($fpos+$lpos)/2,\%genes);
|
|
479 my @b;
|
|
480 unless ($all) {
|
|
481 @b = &findClosestGene($fpos,$lpos,$score,$chro,$maxpos,\%genes); #findAllClosestGene for all genes overlaping a peak
|
|
482 } else {
|
|
483 @b = &findAllClosestGeneOneLocation ($fpos,$lpos,$score,$chro,$maxpos,\%genes); #findAllClosestGene
|
|
484 }
|
|
485 #print "$distTSS\t$distTE\n" unless ($distTSS == $BIGNUMBER);
|
|
486 for my $type (@b) {
|
|
487 $typehash{$type} = 0 unless (exists($typehash{$type}));
|
|
488 $typehash{$type}++;
|
|
489 }
|
|
490 $count ++;
|
|
491
|
|
492 }
|
|
493 close FILE;
|
|
494 close OUT ;
|
|
495
|
|
496 print "Total Sites: $count\n";
|
|
497 my $nEntries = 0;
|
|
498 for my $type (sort keys %typehash) {
|
|
499 $nEntries += $typehash{$type};
|
|
500 }
|
|
501 print "Type\tCount\tFrequency\n";
|
|
502 for my $type (sort keys %typehash) {
|
|
503 print $type,"\t",$typehash{$type},"\t",$typehash{$type}/$nEntries,"\n";
|
|
504 }
|
|
505 for my $type (sort keys %typehashIntra) {
|
|
506 print $type,"\t",$typehashIntra{$type},"\t",$typehashIntra{$type}/$nEntries,"\n";
|
|
507 }
|
|
508
|
|
509
|
|
510 #my @arr = @{$genes{'chr'}};
|
|
511
|
|
512 sub findClosestGene {
|
|
513 my ($fpos,$lpos,$score,$chro,$pos,$genes) = @_;
|
|
514 my ($distTSS,$distTE,$type,$reg,$geneName,$foldChange) = ($BIGNUMBER,$BIGNUMBER,"intergenic",0,0,0);
|
|
515 my $locDistTSS = 0;
|
|
516 my $locDistTE = 0;
|
|
517 my %hashForOverlapingGenes;
|
|
518 my @array2return;
|
|
519 my $typeIntra = "NA";
|
|
520 for my $gName (keys %{$genes->{$chro}}) {
|
|
521 my $TSS = $genes->{$chro}{$gName}{'TSS'};
|
|
522 my $strand = $genes->{$chro}{$gName}{'strand'};
|
|
523 my $TE = $genes->{$chro}{$gName}{'TE'};
|
|
524
|
|
525 $locDistTE = ($pos-$TE)*$strand;
|
|
526 $locDistTSS = ($pos-$TSS)*$strand;
|
|
527 #print "$pos-$genes->{$chro}{$gName}{'TSS'})*$genes->{$chro}{$gName}{'strand'} = $locDistTSS\n";
|
|
528 #my $locDistTE = ($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'};
|
|
529
|
|
530 #print "$gName $maxDistance $pos-$genes->{$chro}{$gName}{'TSS'} $genes->{$chro}{$gName}{'strand'}\n";
|
|
531 #print "$locDistTSS $enhLeft $enhRight\n";
|
|
532
|
|
533 if (($locDistTSS>= $enhLeft)&&($locDistTSS<$enhRight)) {
|
|
534 $type = "enhancer";
|
|
535 } elsif (($locDistTSS>= $enhRight)&&($locDistTSS<=0)) {
|
|
536 $type = "promoter";
|
|
537 } elsif (($locDistTSS> 0)&&($locDistTSS<=$immediateDownstream)&&($locDistTE<=0)) {
|
|
538 $type = "immediateDownstream";
|
|
539 } elsif (($locDistTSS >= $immediateDownstream)&&($locDistTE<=0)) {
|
|
540 $type = "intragenic";
|
|
541 } elsif (($locDistTE >= 0)&&($locDistTE<=$downStreamDist)) {
|
|
542 $type = "5kbDownstream";
|
|
543 } elsif (abs($locDistTSS)<abs($distTSS)) {
|
|
544 $distTSS = $locDistTSS;
|
|
545 $distTE = $locDistTE;
|
|
546 ($reg,$geneName,$foldChange) = ($genes->{$chro}{$gName}{'reg'},$gName,$genes->{$chro}{$gName}{'foldChange'});
|
|
547 }
|
|
548
|
|
549 if (($type ne "intergenic")&&($type ne "NA")) {
|
|
550 unless (exists ($hashForOverlapingGenes{$type})) {
|
|
551 my %h;
|
|
552 $hashForOverlapingGenes{$type} = \%h;
|
|
553 }
|
|
554 $hashForOverlapingGenes{$type}->{$locDistTSS}=$gName;
|
|
555
|
|
556 #print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$genes->{$chro}{$gName}{'reg'}\t$gName\n" ;
|
|
557 $type = "NA";
|
|
558 #unless ($gName =~ m/$TSS/) {
|
|
559 # print "$gName\t$TSS\n";
|
|
560 #}
|
|
561 }
|
|
562
|
|
563 }
|
|
564 if ($type eq "intergenic") {
|
|
565 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$typeIntra\t$reg\t$foldChange\t$distTE\t$geneName\n" ;
|
|
566 #print "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$reg\t$geneName\n" ;
|
|
567 push(@array2return,$type);
|
|
568
|
|
569 } else {
|
|
570 my $selectedType;
|
|
571 if (exists ($hashForOverlapingGenes{"promoter"})) {
|
|
572 $selectedType = "promoter";
|
|
573 } elsif (exists ($hashForOverlapingGenes{"immediateDownstream"})) {
|
|
574 $selectedType = "immediateDownstream";
|
|
575 } elsif (exists ($hashForOverlapingGenes{"intragenic"})) {
|
|
576 $selectedType = "intragenic";
|
|
577 } elsif (exists ($hashForOverlapingGenes{"enhancer"})) {
|
|
578 $selectedType = "enhancer";
|
|
579 } elsif (exists ($hashForOverlapingGenes{"5kbDownstream"})) {
|
|
580 $selectedType = "5kbDownstream";
|
|
581 }
|
|
582 my $bestGene = printBest($hashForOverlapingGenes{$selectedType},$chro,$fpos,$lpos,$pos,$score,$genes,$selectedType); #print "$type\n";
|
|
583 push(@array2return,$selectedType);
|
|
584 my %otherGenes;
|
|
585 for my $type ("promoter","immediateDownstream","intragenic","enhancer","5kbDownstream") { #"firstIntron","exon","intron",
|
|
586 for my $locDistTSS (sort {$a<=>$b} keys %{$hashForOverlapingGenes{$type}}) {
|
|
587 my $otherGene = $hashForOverlapingGenes{$type}->{$locDistTSS};
|
|
588 next if ($genes->{$chro}{$bestGene}{'name'} eq $genes->{$chro}{$otherGene}{'name'});
|
|
589 next if (exists ($otherGenes{$genes->{$chro}{$otherGene}{'name'}}));
|
|
590 if (overlapGenes($otherGene,$bestGene,$chro,$genes)) {
|
|
591
|
|
592 if (($type eq "intragenic")||($type eq "immediateDownstream")) {
|
|
593 $typeIntra = &getTypeIntra($genes->{$chro}{$otherGene},$pos);
|
|
594 $typehashIntra{$typeIntra}++;
|
|
595 }else {
|
|
596 $typeIntra = "NA"; }
|
|
597 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$otherGene}{'reg'}\t$genes->{$chro}{$otherGene}{'foldChange'}\t",($pos-$genes->{$chro}{$otherGene}{'TE'})*$genes->{$chro}{$otherGene}{'strand'},"\t$otherGene\n" ;
|
|
598 push(@array2return,$type);
|
|
599
|
|
600 #print "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$genes->{$chro}{$otherGene}{'reg'}\t$otherGene\n" ;
|
|
601 #print $type,"\n";
|
|
602 $otherGenes{$genes->{$chro}{$otherGene}{'name'}} = 1;
|
|
603 }
|
|
604 }
|
|
605 }
|
|
606
|
|
607 }
|
|
608 return @array2return;
|
|
609 }
|
|
610
|
|
611 sub findAllClosestGeneOneLocation {
|
|
612 my ($fpos,$lpos,$score,$chro,$pos,$genes) = @_;
|
|
613 my ($distTSS,$type,$reg,$geneName,$foldChange) = ($BIGNUMBER,"intergenic",0,0,0);
|
|
614 my $locDistTSS = 0;
|
|
615 my $locDistTE = 0;
|
|
616 my %hashForOverlapingGenes;
|
|
617 my @array2return;
|
|
618 my $typeIntra = "NA";
|
|
619 for my $gName (keys %{$genes->{$chro}}) {
|
|
620 my $TSS = $genes->{$chro}{$gName}{'TSS'};
|
|
621 my $strand = $genes->{$chro}{$gName}{'strand'};
|
|
622 my $TE = $genes->{$chro}{$gName}{'TE'};
|
|
623
|
|
624 $locDistTE = ($pos-$TE)*$strand;
|
|
625 $locDistTSS = ($pos-$TSS)*$strand;
|
|
626 #print "$pos-$genes->{$chro}{$gName}{'TSS'})*$genes->{$chro}{$gName}{'strand'} = $locDistTSS\n";
|
|
627 #my $locDistTE = ($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'};
|
|
628
|
|
629 #print "$gName $maxDistance $pos-$genes->{$chro}{$gName}{'TSS'} $genes->{$chro}{$gName}{'strand'}\n";
|
|
630 #print "$locDistTSS $enhLeft $enhRight\n";
|
|
631
|
|
632 if (($locDistTSS>= $enhLeft)&&($locDistTSS<$enhRight)) {
|
|
633 $type = "enhancer";
|
|
634 } elsif (($locDistTSS>= $enhRight)&&($locDistTSS<=0)) {
|
|
635 $type = "promoter";
|
|
636 } elsif (($locDistTSS> 0)&&($locDistTSS<=$immediateDownstream)&&($locDistTE<=0)) {
|
|
637 $type = "immediateDownstream";
|
|
638 } elsif (($locDistTSS >= $immediateDownstream)&&($locDistTE<=0)) {
|
|
639 $type = "intragenic";
|
|
640 } elsif (($locDistTE >= 0)&&($locDistTE<=$downStreamDist)) {
|
|
641 $type = "5kbDownstream";
|
|
642 } elsif (abs($locDistTSS)<abs($distTSS)) {
|
|
643 $distTSS = $locDistTSS;
|
|
644 ($reg,$geneName,$foldChange) = ($genes->{$chro}{$gName}{'reg'},$gName,$genes->{$chro}{$gName}{'foldChange'});
|
|
645 }
|
|
646
|
|
647 if (($type ne "intergenic")&&($type ne "NA")) {
|
|
648 unless (exists ($hashForOverlapingGenes{$type})) {
|
|
649 my %h;
|
|
650 $hashForOverlapingGenes{$type} = \%h;
|
|
651 $hashForOverlapingGenes{$type}->{$gName}=$locDistTSS;
|
|
652
|
|
653 if (($type eq "intragenic")||($type eq "immediateDownstream")) {
|
|
654 $typeIntra = &getTypeIntra($genes->{$chro}{$gName},$pos);
|
|
655 $typehashIntra{$typeIntra}++;
|
|
656 }
|
|
657 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$gName}{'reg'}\t$genes->{$chro}{$gName}{'fluo'}\t$genes->{$chro}{$gName}{'foldChange'}\t$genes{$chro}->{$gName}{'GCisland'}\t",($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'},"\t$gName\n" ;
|
|
658 push(@array2return,$type);
|
|
659 }
|
|
660 $typeIntra = "NA";
|
|
661 $type = "NA";
|
|
662 #unless ($gName =~ m/$TSS/) {
|
|
663 # print "$gName\t$TSS\n";
|
|
664 #}
|
|
665 }
|
|
666
|
|
667 }
|
|
668 if ($type eq "intergenic") {
|
|
669 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$geneName}{'reg'}\t$genes->{$chro}{$geneName}{'fluo'}\t$genes->{$chro}{$geneName}{'foldChange'}\t$genes{$chro}->{$geneName}{'GCisland'}\t",($pos-$genes->{$chro}{$geneName}{'TE'})*$genes->{$chro}{$geneName}{'strand'},"\t$geneName\n" ;
|
|
670
|
|
671 #print "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$reg\t$geneName\n" ;
|
|
672 push(@array2return,$type);
|
|
673
|
|
674 }
|
|
675 return @array2return;
|
|
676 }
|
|
677
|
|
678 sub findAllClosestGene {
|
|
679 my ($fpos,$lpos,$score,$chro,$pos,$genes) = @_;
|
|
680 my ($distTSS,$type,$reg,$geneName,$foldChange) = ($BIGNUMBER,"intergenic",0,0,0);
|
|
681 my $locDistTSS = 0;
|
|
682 my $locDistTE = 0;
|
|
683 my %hashForOverlapingGenes;
|
|
684 my @array2return;
|
|
685 my $typeIntra = "NA";
|
|
686 for my $gName (keys %{$genes->{$chro}}) {
|
|
687 my $TSS = $genes->{$chro}{$gName}{'TSS'};
|
|
688 my $strand = $genes->{$chro}{$gName}{'strand'};
|
|
689 my $TE = $genes->{$chro}{$gName}{'TE'};
|
|
690
|
|
691 $locDistTE = ($pos-$TE)*$strand;
|
|
692 $locDistTSS = ($pos-$TSS)*$strand;
|
|
693 #print "$pos-$genes->{$chro}{$gName}{'TSS'})*$genes->{$chro}{$gName}{'strand'} = $locDistTSS\n";
|
|
694 #my $locDistTE = ($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'};
|
|
695
|
|
696 #print "$gName $maxDistance $pos-$genes->{$chro}{$gName}{'TSS'} $genes->{$chro}{$gName}{'strand'}\n";
|
|
697 #print "$locDistTSS $enhLeft $enhRight\n";
|
|
698
|
|
699 if (($locDistTSS>= $enhLeft)&&($locDistTSS<$enhRight)) {
|
|
700 $type = "enhancer";
|
|
701 } elsif (($locDistTSS>= $enhRight)&&($locDistTSS<=0)) {
|
|
702 $type = "promoter";
|
|
703 } elsif (($locDistTSS> 0)&&($locDistTSS<=$immediateDownstream)&&($locDistTE<=0)) {
|
|
704 $type = "immediateDownstream";
|
|
705 } elsif (($locDistTSS >= $immediateDownstream)&&($locDistTE<=0)) {
|
|
706 $type = "intragenic";
|
|
707 } elsif (($locDistTE >= 0)&&($locDistTE<=$downStreamDist)) {
|
|
708 $type = "5kbDownstream";
|
|
709 } elsif (abs($locDistTSS)<abs($distTSS)) {
|
|
710 $distTSS = $locDistTSS;
|
|
711 ($reg,$geneName,$foldChange) = ($genes->{$chro}{$gName}{'reg'},$gName,$genes->{$chro}{$gName}{'foldChange'});
|
|
712 }
|
|
713
|
|
714 if (($type ne "intergenic")&&($type ne "NA")) {
|
|
715 unless (exists ($hashForOverlapingGenes{$type})) {
|
|
716 my %h;
|
|
717 $hashForOverlapingGenes{$type} = \%h;
|
|
718 }
|
|
719 #$hashForOverlapingGenes{$type}->{$gName}=$locDistTSS;
|
|
720
|
|
721 $typeIntra = "NA";
|
|
722 if (($type eq "intragenic")||($type eq "immediateDownstream")) {
|
|
723 $typeIntra = &getTypeIntra($genes->{$chro}{$gName},$pos);
|
|
724 $typehashIntra{$typeIntra}++;
|
|
725 }
|
|
726 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$gName}{'reg'}\t$genes->{$chro}{$gName}{'fluo'}\t$genes->{$chro}{$gName}{'foldChange'}\t$genes{$chro}->{$gName}{'GCisland'}\t",($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'},"\t$gName\n" ;
|
|
727 push(@array2return,$type);
|
|
728
|
|
729 $type = "NA";
|
|
730 #unless ($gName =~ m/$TSS/) {
|
|
731 # print "$gName\t$TSS\n";
|
|
732 #}
|
|
733 }
|
|
734
|
|
735 }
|
|
736 if ($type eq "intergenic") {
|
|
737 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$geneName}{'reg'}\t$genes->{$chro}{$geneName}{'fluo'}\t$genes->{$chro}{$geneName}{'foldChange'}\t$genes{$chro}->{$geneName}{'GCisland'}\t",($pos-$genes->{$chro}{$geneName}{'TE'})*$genes->{$chro}{$geneName}{'strand'},"\t$geneName\n" ;
|
|
738
|
|
739 #print "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$reg\t$geneName\n" ;
|
|
740 push(@array2return,$type);
|
|
741
|
|
742 }
|
|
743 return @array2return;
|
|
744 }
|
|
745
|
|
746 sub getTypeIntra {
|
|
747
|
|
748 my ($geneEntry, $pos) = @_;
|
|
749 my $type;
|
|
750
|
|
751 if (($pos >= $geneEntry->{'firstIntronStart'})&&($pos <=$geneEntry->{'firstIntronEnd'})) {
|
|
752 return "f_intron";
|
|
753 }
|
|
754 if (($pos >= $geneEntry->{'firstExonStart'})&&($pos <=$geneEntry->{'firstExonEnd'})) {
|
|
755 return "f_exon";
|
|
756 }
|
|
757 $type = getIntronExon ($pos, $geneEntry->{'exonCount'},$geneEntry->{'exonStarts'},$geneEntry->{'exonEnds'},$geneEntry->{'strand'});
|
|
758 return $type;
|
|
759 }
|
|
760
|
|
761 sub overlapGenes {
|
|
762 my ($otherGene,$bestGene,$chro,$genes)= @_;
|
|
763 my $a1 = $genes->{$chro}{$otherGene}{'left'};
|
|
764 my $a2 = $genes->{$chro}{$bestGene}{'left'};
|
|
765 my $e1 = $genes->{$chro}{$otherGene}{'right'};
|
|
766 my $e2 = $genes->{$chro}{$bestGene}{'right'};
|
|
767 if (($a1 >= $a2)&&($a1 <= $e2)) {
|
|
768 return 1;
|
|
769 }
|
|
770 if (($a2 >= $a1)&&($a2 <= $e1)) {
|
|
771 return 1;
|
|
772 }
|
|
773 return 0;
|
|
774 }
|
|
775
|
|
776 sub printBest {
|
|
777 my ($hash,$chro,$fpos,$lpos,$pos,$score,$genes,$type) = @_;
|
|
778 for my $locDistTSS (sort {$a<=>$b} keys %{$hash}) {
|
|
779 my $gName = $hash->{$locDistTSS};
|
|
780 my $typeIntra = "NA";
|
|
781 if (($type eq "intragenic")||($type eq "immediateDownstream")) {
|
|
782 $typeIntra = &getTypeIntra($genes->{$chro}{$gName},$pos);
|
|
783 $typehashIntra{$typeIntra}++;
|
|
784 }
|
|
785 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$gName}{'reg'}\t$genes->{$chro}{$gName}{'foldChange'}\t",($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'},"\t$gName\n" ;
|
|
786 return $gName;
|
|
787 }
|
|
788 }
|
|
789
|
|
790 #sub findAllClosestGene {
|
|
791 # my ($fpos,$lpos,$score,$chro,$pos,$genes) = @_;
|
|
792 # my ($distTSS,$type,$reg,$geneName,$foldChange) = ($BIGNUMBER,"intergenic",0,0,0);
|
|
793 # my $locDistTSS = 0;
|
|
794 # my $locDistTE = 0;
|
|
795 # my @array2return;
|
|
796 # for my $gName (keys %{$genes->{$chro}}) {
|
|
797 # my $TSS = $genes->{$chro}{$gName}{'TSS'};
|
|
798 # my $strand = $genes->{$chro}{$gName}{'strand'};
|
|
799 # my $TE = $genes->{$chro}{$gName}{'TE'};
|
|
800 # $locDistTSS = ($pos-$TSS)*$strand;
|
|
801 # $locDistTE = ($pos-$TE)*$strand;
|
|
802 # #print "$pos-$genes->{$chro}{$gName}{'TSS'})*$genes->{$chro}{$gName}{'strand'} = $locDistTSS\n";
|
|
803 # #my $locDistTE = ($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'};
|
|
804 #
|
|
805 # #print "$gName $maxDistance $pos-$genes->{$chro}{$gName}{'TSS'} $genes->{$chro}{$gName}{'strand'}\n";
|
|
806 # #print "$locDistTSS $enhLeft $enhRight\n";
|
|
807 #
|
|
808 # if (($locDistTSS>= $enhLeft)&&($locDistTSS<$enhRight)) {
|
|
809 # $type = "enhancer";
|
|
810 # } elsif (($locDistTSS>= $enhRight)&&($locDistTSS<=0)) {
|
|
811 # $type = "promoter";
|
|
812 # } elsif (($locDistTSS> 0)&&($locDistTSS<=$immediateDownstream)) {
|
|
813 # $type = "immediateDownstream";
|
|
814 # } elsif (($pos >= $genes{$chro}->{$gName}{'firstIntronStart'})&&($pos <= $genes{$chro}->{$gName}{'firstIntronEnd'})) {
|
|
815 # $type = "firstIntron";
|
|
816 # } elsif (($locDistTSS> $immediateDownstream)&&($locDistTSS<=$genes{$chro}->{$gName}{'length'})) {
|
|
817 # #$type = "intragenic";
|
|
818 # $type = &getIntronExon ($pos, $genes{$chro}->{$gName}{'exonCount'},$genes{$chro}->{$gName}{'exonStarts'},$genes{$chro}->{$gName}{'exonEnds'},$genes{$chro}->{$gName}{'strand'});
|
|
819 # } elsif (($locDistTE >= 0)&&($locDistTE<=$downStreamDist)) {
|
|
820 # $type = "5kbDownstream";
|
|
821 # } elsif (abs($locDistTSS)<abs($distTSS)) {
|
|
822 # $distTSS = $locDistTSS;
|
|
823 # ($reg,$geneName,$foldChange) = ($genes->{$chro}{$gName}{'reg'},$gName,$genes->{$chro}{$gName}{'foldChange'});
|
|
824 # }
|
|
825 # if (($type ne "intergenic")&&($type ne "NA")) {
|
|
826 # print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$genes->{$chro}{$gName}{'reg'}\t$genes->{$chro}{$gName}{'foldChange'}\t$gName\n" ;
|
|
827 # push(@array2return,$type);
|
|
828 # $type = "NA";
|
|
829 # #unless ($gName =~ m/$TSS/) {
|
|
830 # # print "$gName\t$TSS\n";
|
|
831 # #}
|
|
832 # }
|
|
833 #
|
|
834 # }
|
|
835 # if ($type eq "intergenic") {
|
|
836 # print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$reg\t$foldChange\t$geneName\n" ;
|
|
837 # push(@array2return,$type);
|
|
838 # #print "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$reg\t$geneName\n" ;
|
|
839 #
|
|
840 # }
|
|
841 # return @array2return;
|
|
842 #}
|
|
843
|
|
844 sub getFirstIntron {
|
|
845 my ($exonCount,$exonStarts,$exonEnds,$strand) = @_;
|
|
846 my ($left,$right);
|
|
847 if ($exonCount == 1) {
|
|
848 return (0,0);
|
|
849 }
|
|
850 if ($strand == 1) {
|
|
851 $left = (split ",", $exonEnds)[0]+$jonctionSize;
|
|
852 $right = (split (",", $exonStarts))[1]-$jonctionSize;
|
|
853 } else {
|
|
854 $left = (split (",", $exonEnds))[$exonCount-2]+$jonctionSize;
|
|
855 $right = (split (",", $exonStarts))[$exonCount-1]-$jonctionSize;
|
|
856 }
|
|
857 ($left,$right);
|
|
858 }
|
|
859
|
|
860 sub getFirstExon {
|
|
861 my ($exonCount,$exonStarts,$exonEnds,$strand) = @_;
|
|
862 my ($left,$right);
|
|
863 if ($exonCount == 1) {
|
|
864 return (0,0);
|
|
865 }
|
|
866 if ($strand == 1) {
|
|
867 $left = (split ",", $exonStarts)[0];
|
|
868 $right = (split (",", $exonEnds))[0]-$jonctionSize;
|
|
869 } else {
|
|
870 $left = (split (",", $exonStarts))[$exonCount-1]+$jonctionSize;
|
|
871 $right = (split (",", $exonEnds))[$exonCount-1];
|
|
872 }
|
|
873 ($left,$right);
|
|
874 }
|
|
875
|
|
876 sub getIntronExon {
|
|
877 my ($pos,$exonCount,$exonStarts,$exonEnds,$strand) = @_;
|
|
878 my (@left,@right);
|
|
879 @left = (split ",", $exonStarts);
|
|
880 @right = (split (",", $exonEnds));
|
|
881
|
|
882 for (my $i = 0; $i<$exonCount;$i++) {
|
|
883 #print "$left[$i] <= $pos ? $pos <= $right[$i]\n";
|
|
884 if (($left[$i]+$jonctionSize < $pos) && ($pos < $right[$i]-$jonctionSize)) {
|
|
885 #print "URA!\n";
|
|
886 return "exon";
|
|
887 } elsif (($i+1<$exonCount)&&($right[$i]+$jonctionSize < $pos) && ($pos < $left[$i+1]-$jonctionSize)) {
|
|
888 return "intron";
|
|
889 }
|
|
890 }
|
|
891 return "jonction";
|
|
892 }
|
|
893
|
|
894 sub min {
|
|
895 return $_[0] if ($_[0]<$_[1]);
|
|
896 $_[1];
|
|
897 }
|
|
898 sub med {
|
|
899 my @arr = @_;
|
|
900 my $med = 0;
|
|
901 @arr = sort {$a <=> $b} @arr;
|
|
902 if ((scalar(@arr)/2) =~ m/[\.\,]5/) {
|
|
903 return $arr[floor(scalar(@arr)/2)];
|
|
904 } else {
|
|
905 return ($arr[scalar(@arr)/2]+$arr[scalar(@arr)/2-1])/2;
|
|
906 }
|
|
907 $med;
|
|
908 }
|
|
909
|
|
910 sub checkIfGC {
|
|
911 my ($TSS,$strand,$dist,$GCislandsChr)=@_;
|
|
912 my $ifGC = 0;
|
|
913 my $leftProm=$TSS-$dist;
|
|
914 my $rightProm = $TSS;
|
|
915 if ($strand== -1) {
|
|
916 my $leftProm=$TSS;
|
|
917 my $rightProm = $TSS+$dist;
|
|
918 }
|
|
919 for my $leftGC (keys %{$GCislandsChr}) {
|
|
920 my $rightGC = $GCislandsChr->{$leftGC};
|
|
921 if ($leftGC>=$leftProm&&$leftGC<=$rightProm || $rightGC>=$leftProm&&$rightGC<=$rightProm) {
|
|
922 return "GC-island";
|
|
923 }
|
|
924 }
|
|
925 return $ifGC ;
|
|
926 }
|