comparison [APliBio]Nebula tools suite/Nebula/AnnotateGenes/geneAnnotation.pl @ 0:2ec3ba0e9e70 draft

Uploaded
author alermine
date Thu, 25 Oct 2012 08:18:25 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2ec3ba0e9e70
1 #:::::::::::::::::::::::::g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@j::::
2 #::::::::::::::::::::::;g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@E:::
3 #:ttt:::::::::::::::::@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@p@;
4 #:t:::::::::::::::::g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
5 #:t::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
6 #:::::::::::::z;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
7 #::::::::::::i@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
8 #::::::::::::@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@$@@@@
9 #:::::::::::3@@@@@@@@@@@@@@@@@@@@@@@@@B@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
10 #::::::::::3@@@@@@@@@@@@@@@@@@@@@BEEESSE5EEEEBBM@@@@@@@@@@@@@@@@@@@@@@@@@@
11 #::::::::::3@@@@@@@@@@@@@@@@@@@@BEEEEEE35EE55E2355E5SBMB@@@@@@@@@@@@@@@@@$
12 #::::::::::@@@@@@@@@@@@@@@@@@@EEEE55533t3tttt::::::!!!!7755E755SBBMMM@@@MM
13 #::::::::::3@@@@@@@@@@@@@@@@@@EEEE2t3ttttt:::::::::::::::::::::::!7?5225EE
14 #::::::::::3@@@@@@@@@@@@@@@@@@EEEEE31t::::::::::::::::::::::::::::::::3E5@
15 #::::::::::3@@@@@@@@@@@@@@@@@@EEEEEEtt:::::::::::::::::::::::::::::::::353
16 #::::::::::3@@@@@@@@@@@@@@@@@@EEEEEE1ttz::::::::::::::::::::::::::::::::35
17 #:::::::::::@@@@@@@@@@@@@@@@@@EEEEEEEtz1::::::::::::::::::::::::::::::::t:
18 #:::::::::!3@@@@@@@@@@@@@@@@@@@EEEEEttt::::::::::::::::::::::::::::::::;zz
19 #::::::::::@@@@@@@@@@@@@@@@@@@@EEEEEttt:::::z;z:::::::::::::::::::::::::13
20 #::::::::::3B@@@@@@@@@@@@@@@@@@EEEEEEE3tt:czzztti;:::::::::::::::::::::::3
21 #::::ttt::::3@@@@@@@@@@@@@@@@EEEEE5EE25Ezt1EEEz5Etzzz;;;;:::::::::::::::::
22 #:::::::::::I9@@@@@@@@@@@@@@@@@@@@@@@@@@EEEEEE@@@@@@@@@@@@@@Ez;:::::::::::
23 #:::::::::::::E@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@Ez::::::
24 #::::::::::::::E@@@@@@@@@@@@@@@@@@@@@@@@@@@@@BE5EBB@@@@@@@@@@@@@@@EEE:::::
25 #:::::::::::::::@@@@@@@@@@@@@@@@@@@@@@@@@@@@E1::35@@@@@@@@@@ME3MMME2::::::
26 #:::::::::::::::?@@@@@@@@@@@@@@@@@@M@@@@@@@EE:::::3SB@@BBESEEt::::::::::::
27 #::::::::::::::::J$@@@@@@@B@@@@@@@@@@@@@@@@EE:::::::!35E33t:::::::::::::::
28 #:::::::::::::::::3@E@@@EE5EESE5EESE@@@@@@@Et::::::::::::tz:::::::::::::::
29 #:::::::::::::::::J@E$@EEE5133555SE@@@@@@@@Et:::::::::::::::::::::::::::::
30 #::::::::::::::::::E@E@EEEEtt3523EEE@@@@@@@E::::::::::::::::::::::::::::::
31 #:t::::::::::::::::JEE3@@@EEEEEEEEEE@@@@@@@E:::::::::t;:::::::::::::::::::
32 #:t:::::::::::::::::!5ES@EEEEEEEEES@@@@@@@@@E;:::;;;:3Ez::::::::::::::::::
33 #:t::::::::::::::::::::JE@@EEEEEEE@@@@@@@@@@@@@@@@ME!:::;:::::::::::::::::
34 #:tz::::::::::::::::::::JE@@@EEEE@@@@@@@@@@@@@@EE!:::::::t::::::::::::::::
35 #:t::::::::::::::::::::::3@@@@@@@@@@@@@@@@@@ESBE::::::::::::::::::::::::::
36 #:::::::::::::::::::::::::Q@@@@@@@@@@@@@@@@EE3EE;:::::zzzz::::::::::::::::
37 #:::::::::::::::::::::::::3@@@@@@@@@@@@@@@@@@@@@@NN@@@@@@Ez:::::::::::::::
38 #:zt:::::::::::::::::::::::3@@@@EE@@@@@@@@@@EEEEt::;z113E5t:::::::::::::::
39 #::tt:::::::::::::::::::::::3@@@E@@@@@@@@@@@@@@@@BEt::::::::::::::::t:::::
40 #:tt:t:::::::::::::::::::::::?S@@@@@@@@@@@BBEEE51!::::::::::::::zzzEt:::::
41 #::::::::::::::::::::::::::::::3Q@@@@@@@BEEEEEt:::::::::::::;zz@@@EE::::::
42 #::::::::::::::::::::::::::::::::75B@@@@@EEEtt;:::::::::;zz@@@@BEEEtz:::::
43 #::::::::::::::::::::::::::::::::::::?9@@@@@@@@@@@E2Ezg@@@@@B@@@EEEE1t::::
44 #:::::::::::::::::::::::::::::::::::::::3@@@@@@@@@@@@@@@@@@@E@EEEEEEEzzz::
45 #::::::::::::::::::::::::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@EEEEEEE5ttttt
46 #:::::::::::::::::::::::::::::::;g@@@@@@@@@@@@@@@@@@@@@@@@@@EEEEEEEEEEEtzt
47 #::::::::::::::::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@E@@EEEEEEEEEEEE@@@
48 #::::::::::::::::::::::::::g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@EEEE3EEEE@@@@@@@
49 #:::::::::::::::::::::;;g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@EEEt33@@@@@@@@@@
50 #:::::::::::::::::;g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@E@@@@@@EEEtg@@@@@@@@@@@@
51 #::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@EEEE@@@@@@@@@@@@@@@@@@@@@@@@
52 #:::::::::::::@@@@@@@@@@@@@@@@@$@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
53 #::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
54 #
55 # Copyleft ↄ⃝ 2012 Institut Curie
56 # Author(s): Valentina Boeva, Alban Lermine (Institut Curie) 2012
57 # Contact: valentina.boeva@curie.fr, alban.lermine@curie.fr
58 # This software is distributed under the terms of the GNU General
59 # Public License, either Version 2, June 1991 or Version 3, June 2007.
60
61 #!/usr/bin/perl -w
62 #outputs statistics for all genes in the list
63
64 #different boundaries
65 #no motif p-value for binding sites
66 #read directly No-resp/Up/down category
67
68 #all isoforms from the file with genes
69
70 #RNApolII sites on junctions
71
72 use strict;
73 use POSIX;
74
75 my $usage = qq{
76 $0
77
78 -----------------------------
79 mandatory parameters:
80
81 -g filename file with all genes
82 -rp filename file with sites of RNApolII
83 -k36 filename file with sites of K36me3
84 -tf filename file with sites of TF1
85
86 -i filename file with a table where to add colomnes
87 -add values which colomns to add
88
89 -----------------------------
90 optional parameters:
91
92 -o filename output filename (defaut "genes.output.txt")
93 -v verbose mode
94 -mir filename file with positions of miRNA
95 -k9 filename ile with sites of K9me3
96
97 -c_rp value cutoff for -rp
98 -c_k9 value cutoff for -k9
99 -c_k36 value cutoff for -k36
100
101 -selG filename selected genes (up-down-regulated)
102 -fluo filename file with fluorescence
103 -gc filename file with gc-islands
104
105 -long for each gene take the longest isoform
106
107
108 };
109
110 if(scalar(@ARGV) == 0){
111 print $usage;
112 exit(0);
113 }
114
115 ## mandatory arguments
116
117 my $RNApolFilename = "";
118 my $H3K36Me3polFilename = "";
119 my $H3K9Me3polFilename = "";
120 my $TF1Filename = "";
121 my $TF2Filename = "";
122 my $GenesFilename = "";
123 my $MirFilename = "";
124 my $TF1FilenameALL = "";
125 my $TF2FilenameALL = "";
126 my $SelectedGenesFilename = "";
127 my $fluoFile = "";
128 my $initialTable = "";
129 my $colomnesToAdd = "";
130
131 my $enhLeft = -30000;
132 my $longEnhLeft = -60000;
133 my $enhRight = -1500;
134 my $immediateDownstream = 2000;
135 my $K9dist = 5000;
136 my $kb5 = 5000;
137 my $INFINITY = 10000000000;
138 my $jonctionSize = 50;
139 ## optional arguments
140 my $outname = "genes.output.txt";
141 my $verbose = 0;
142 my $GCislands = "";
143
144 my $longest = 0;
145
146 #my $cutoff_tf1 = 0;
147 #my $cutoff_tf2 = 0;
148 my $cutoff_tf1All = 0;
149 my $cutoff_tf2All = 0;
150 my $cutoff_rp = 0;
151 my $cutoff_k9 = 0;
152 my $cutoff_k36 = 0;
153 my $ifTFcoord = 0;
154
155 ## parse command line arguments
156
157 while(scalar(@ARGV) > 0){
158 my $this_arg = shift @ARGV;
159 if ( $this_arg eq '-h') {print "$usage\n"; exit; }
160
161 elsif ( $this_arg eq '-selG') {$SelectedGenesFilename = shift @ARGV;}
162
163 elsif ( $this_arg eq '-g') {$GenesFilename = shift @ARGV;}
164 elsif ( $this_arg eq '-rp') {$RNApolFilename = shift @ARGV;}
165 elsif ( $this_arg eq '-k36') {$H3K36Me3polFilename = shift @ARGV;}
166 elsif ( $this_arg eq '-k9') {$H3K9Me3polFilename = shift @ARGV;}
167
168 elsif ( $this_arg eq '-tf') {$TF1Filename = shift @ARGV;}
169
170 elsif ( $this_arg eq '-v') {$verbose = 1;}
171
172 elsif ( $this_arg eq '-long') {$longest = 1;}
173
174
175 elsif ( $this_arg eq '-o') {$outname = shift @ARGV;}
176 elsif ( $this_arg eq '-mir') {$MirFilename = shift @ARGV;}
177
178 elsif ( $this_arg eq '-c_rp') {$cutoff_rp = shift @ARGV;}
179 elsif ( $this_arg eq '-c_k9') {$cutoff_k9 = shift @ARGV;}
180 elsif ( $this_arg eq '-c_k36') {$cutoff_k36 = shift @ARGV;}
181
182 elsif ( $this_arg eq '-fluo') {$fluoFile = shift @ARGV;}
183 elsif ( $this_arg eq '-gc') {$GCislands = shift @ARGV;}
184
185 elsif ( $this_arg eq '-i') {$initialTable = shift @ARGV;}
186 elsif ( $this_arg eq '-add') {$colomnesToAdd = shift @ARGV;}
187
188 elsif ( $this_arg eq '-lp') {$enhRight = shift @ARGV;}
189 elsif ( $this_arg eq '-rightp') {$immediateDownstream = shift @ARGV;}
190 elsif ( $this_arg eq '-enh') {$enhLeft = shift @ARGV;}
191 elsif ( $this_arg eq '-dg') {$kb5 = shift @ARGV;}
192
193 elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";}
194 }
195
196
197 if ( $GenesFilename eq "" ){
198 die "you should specify a file with genes \n";
199 }
200 if(( $RNApolFilename eq "")&&($H3K36Me3polFilename eq "")&&($TF1Filename eq "")&&($H3K9Me3polFilename eq "")){
201 die "you should specify at least one file with peaks\n";
202 }
203
204
205 #-----------read selected genes----------------
206 my %selectedGenes;
207 my %selectedGenesFoldChange;
208 if ( $SelectedGenesFilename ne "") {
209 open (FILE, "<$SelectedGenesFilename ") or die "Cannot open file $SelectedGenesFilename !!!!: $!";
210 while (<FILE>) {
211 chomp;
212 my @a = split/\t/;
213 $selectedGenes{$a[1]} = $a[3];
214 $selectedGenesFoldChange{$a[1]} = $a[2];
215 #print "gene:$a[1],reg:$selectedGenes{$a[1]},FC:$selectedGenesFoldChange{$a[1]}\n";
216 }
217
218 close FILE;
219 print "\t\t$SelectedGenesFilename is read!\n" if ($verbose);
220 }
221
222 #-----------read genes with fluorescence---------
223 my %fluoGenes;
224 if ( $fluoFile ne "") {
225 open (FILE, "<$fluoFile ") or die "Cannot open file $fluoFile !!!!: $!";
226 my $gene = "";
227 my $med = 0;
228 my %h;
229 while (<FILE>) {
230 chomp;
231 my @a = split/\t/;
232
233 next if (scalar(@a)<5);
234 next if ($a[0] eq "probesets");
235 next unless ($a[0] =~m/\S/);
236 next unless ($a[4] =~m/\S/);
237 if ($gene ne "" && $gene ne $a[2]) {
238 #calcMed
239 $med = med(keys %h);
240 $fluoGenes{$gene} = $med;
241 $med=0;
242 delete @h{keys %h};
243 } else {
244 #$h{$a[4]} = 1;
245 }
246 $gene = $a[2];
247 $h{$a[4]} = 1;
248 #print "keys ", scalar(keys %h),"\t",keys %h,"\n";
249 }
250 if ($gene ne "") {
251 $med = med(keys %h);
252 $fluoGenes{$gene} = $med;
253 }
254 close FILE;
255 print "\t\t$fluoFile is read!\n" if ($verbose);;
256 }
257 #-----------read GC-islands----------------
258 my %GCislands;
259 if ($GCislands ne "") {
260 open (FILE, "<$GCislands ") or die "Cannot open file $GCislands !!!!: $!";
261
262 while (<FILE>) {
263 chomp;
264 my @a = split/\t/;
265 #bin chrom chromStart chromEnd name length cpgNum gcNum perCpg perGc obsExp
266 #107 chr1 36568608 36569851 CpG: 128 1243 128 766 20.6 61.6 1.09
267 my $chr = $a[1];
268 my $start = $a[2];
269 my $end = $a[3];
270 $GCislands{$chr}->{$start}=$end;
271 }
272 close FILE;
273 if ($verbose) {
274 print "$GCislands is read\n";
275 }
276 } elsif ($verbose) {
277 print "you did not specify a file with GC-islands\n";
278 }
279
280 #-----------read genes----------------
281
282 my %genes;
283
284 my $count = 0;
285
286 open (GENES, "<$GenesFilename") or die "Cannot open file $GenesFilename!!!!: $!";
287 <GENES>;
288 while (<GENES>) {
289 chomp;
290 if (/(chr.*)\s([+-])\s(\d+)\s(\d+)\s(\d+)\s(\d+)\s(\d+)\s(\S+)\s(\S+)\s\S+\s(\S+)/){
291 my $name = $10;
292 my $chr = $1;
293 my $strand = $2;
294 if ($strand eq '+') {
295 $strand = 1;
296 }
297 else {
298 $strand = -1;
299 }
300 my $leftPos = $3;
301 my $rightPos = $4;
302 my $cdsStart= $5;
303 my $cdsEnd= $6;
304 my $exonCount= $7;
305 my $exonStarts= $8;
306 my $exonEnds= $9;
307 my $ID = "$name\t$chr:$leftPos-$rightPos\t$count";
308 my $foldChange = 1;
309 my $reg = "NA";
310 my $fluo = "NA";
311 if ( $SelectedGenesFilename ne "") {
312 #print "$name\t";
313 if (exists($selectedGenes{$name})) {
314 $reg = $selectedGenes{$name};
315 $foldChange = $selectedGenesFoldChange{$name};
316 }
317 }
318 if ( $fluoFile ne "") {
319 if (exists($fluoGenes{$name})) {
320 $fluo = $fluoGenes{$name};
321 }
322 }
323 unless (exists($genes{$chr})) {
324 my %h;
325 $genes{$chr} = \%h;
326 }
327
328 my $RNAlength = 0;
329 my $skip = 0;
330
331 #print "$ID\n";
332 if($longest) {
333 $RNAlength = getRNAlength($exonStarts,$exonEnds);
334 for my $IDgene (keys %{$genes{$chr}}) {
335 my $nameGene= (split('\t', $IDgene))[0];
336 if ($nameGene eq $name && $RNAlength > $genes{$chr}->{$IDgene}{'RNAlength'}) {
337 #print "found longer isofome: $ID longer than $IDgene\n";
338 # print "$RNAlength > ".$genes{$chr}->{$IDgene}{'RNAlength'}."\n";
339
340 $ID=$IDgene;
341 } elsif ($nameGene eq $name && $RNAlength <= $genes{$chr}->{$IDgene}{'RNAlength'}) {
342 #print "found shorter isofome: $ID shorted than $IDgene\nwill skip it\n";
343 #print "$RNAlength <= ".$genes{$chr}->{$IDgene}{'RNAlength'}."\n";
344 $skip = 1;
345 }
346 }
347 }
348
349
350 unless ($skip) {
351
352 unless (exists($genes{$chr}->{$ID})) {
353 my %h1;
354 $genes{$chr}->{$ID} = \%h1;
355 $count++;
356 }
357
358 $genes{$chr}->{$ID}{'name'} = $name ;
359 $genes{$chr}->{$ID}{'left'} = $leftPos ;
360 $genes{$chr}->{$ID}{'right'} = $rightPos ;
361 $genes{$chr}->{$ID}{'cdsStart'} = $cdsStart;
362 $genes{$chr}->{$ID}{'cdsEnd'} = $cdsEnd;
363 $genes{$chr}->{$ID}{'strand'} = $strand;
364 $genes{$chr}->{$ID}{'exonCount'} = $exonCount;
365 $genes{$chr}->{$ID}{'exonStarts'} = $exonStarts;
366 $genes{$chr}->{$ID}{'exonEnds'} = $exonEnds;
367 $genes{$chr}->{$ID}{'TSS'} = ($strand == 1) ? $leftPos :$rightPos ;
368 $genes{$chr}->{$ID}{'TE'} = ($strand == -1) ? $leftPos :$rightPos ;
369 $genes{$chr}->{$ID}{'reg'} = $reg;
370 $genes{$chr}->{$ID}{'foldChange'} = $foldChange;
371 $genes{$chr}->{$ID}{'length'} = abs ($leftPos-$rightPos);
372 $genes{$chr}->{$ID}{'RNAlength'} = $RNAlength ;
373
374 $genes{$chr}->{$ID}{'fluo'} = $fluo;
375
376 $genes{$chr}->{$ID}{'RNApolScore'} = 0;
377 $genes{$chr}->{$ID}{'RNApolDist'} = $INFINITY;
378 $genes{$chr}->{$ID}{'RNApol_junctionScore'} = 0;
379 $genes{$chr}->{$ID}{'RNApol_junctionDist'} = $INFINITY;
380
381 $genes{$chr}->{$ID}{'K36score'} = 0;
382 $genes{$chr}->{$ID}{'K9promScore'} = 0;
383 $genes{$chr}->{$ID}{'K9promDist'} = $INFINITY;
384 $genes{$chr}->{$ID}{'K9largeScore'} = 0;
385 $genes{$chr}->{$ID}{'K9largeDist'} = $INFINITY;
386 $genes{$chr}->{$ID}{'TFpromScore'} = 0;
387 $genes{$chr}->{$ID}{'TFpromDist'} = $INFINITY;
388 $genes{$chr}->{$ID}{'TFenhScore'} = 0;
389 $genes{$chr}->{$ID}{'TFenhDist'} = $INFINITY;
390 $genes{$chr}->{$ID}{'TFintraScore'} = 0;
391 $genes{$chr}->{$ID}{'TFintraDist'} = $INFINITY;
392 $genes{$chr}->{$ID}{'TFallScore'} = 0;
393 $genes{$chr}->{$ID}{'TFallDist'} = $INFINITY;
394
395
396 $genes{$chr}->{$ID}{'TFFirstIntronAndIntraScore'} = 0;
397 $genes{$chr}->{$ID}{'TFFirstIntronAndIntraDist'} = $INFINITY;
398 $genes{$chr}->{$ID}{'TFFirstIntronScore'} = 0;
399 $genes{$chr}->{$ID}{'TFFirstIntronDist'} = $INFINITY;
400 $genes{$chr}->{$ID}{'TFintraMinusFirstIntronScore'} = 0;
401 $genes{$chr}->{$ID}{'TFintraMinusFirstIntronDist'} = $INFINITY;
402 $genes{$chr}->{$ID}{'TFImmDownScore'} = 0;
403 $genes{$chr}->{$ID}{'TFImmDownDist'} = $INFINITY;
404 $genes{$chr}->{$ID}{'TFpromSimpleScore'} = 0;
405 $genes{$chr}->{$ID}{'TFpromSimpleDist'} = $INFINITY;
406 $genes{$chr}->{$ID}{'TFenh60kbScore'} = 0;
407 $genes{$chr}->{$ID}{'TFenh60kbDist'} = $INFINITY;
408
409 $genes{$chr}->{$ID}{'TF_FirstExonScore'} = 0;
410 $genes{$chr}->{$ID}{'TF_FirstExonDist'} = $INFINITY;
411 $genes{$chr}->{$ID}{'TF_junctionScore'} = 0;
412 $genes{$chr}->{$ID}{'TF_junctionDist'} = $INFINITY;
413
414 $genes{$chr}->{$ID}{'TF_junctionAndIntraScore'} = 0;
415 $genes{$chr}->{$ID}{'TF_junctionAndIntraDist'} = $INFINITY;
416 $genes{$chr}->{$ID}{'TF_FirstExonAndIntraScore'} = 0;
417 $genes{$chr}->{$ID}{'TF_FirstExonAndIntraDist'} = $INFINITY;
418
419 $genes{$chr}->{$ID}{'TF_OtherExonsScore'} = 0;
420 $genes{$chr}->{$ID}{'TF_OtherExonsDist'} = $INFINITY;
421 $genes{$chr}->{$ID}{'TF_OtherExonsAndIntraScore'} = 0;
422 $genes{$chr}->{$ID}{'TF_OtherExonsAndIntraDist'} = $INFINITY;
423
424
425 $genes{$chr}->{$ID}{'TF_OtherIntronsScore'} = 0;
426 $genes{$chr}->{$ID}{'TF_OtherIntronsDist'} = $INFINITY;
427 $genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraScore'} = 0;
428 $genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraDist'} = $INFINITY;
429
430 $genes{$chr}->{$ID}{'TF5kbDownScore'} = 0;
431 $genes{$chr}->{$ID}{'TF5kbDownDist'} = $INFINITY;
432 $genes{$chr}->{$ID}{'K9enhScore'} = 0;
433 $genes{$chr}->{$ID}{'K9enhDist'} = $INFINITY;
434
435 ($genes{$chr}->{$ID}{'firstIntronStart'},$genes{$chr}->{$ID}{'firstIntronEnd'}) = getFirstIntron ($exonCount,$exonStarts,$exonEnds,$strand);
436 ($genes{$chr}->{$ID}{'firstExonStart'},$genes{$chr}->{$ID}{'firstExonEnd'}) = getFirstExon ($exonCount,$exonStarts,$exonEnds,$strand);
437
438 $genes{$chr}->{$ID}{'exonCount'} = $exonCount;
439 $genes{$chr}->{$ID}{'exonStarts'} = $exonStarts;
440 $genes{$chr}->{$ID}{'exonEnds'} = $exonEnds;
441
442
443 $genes{$chr}->{$ID}{'GCisland'} = 0;
444 if ($GCislands ne "") {
445 $genes{$chr}->{$ID}{'GCisland'} = checkIfGC ($genes{$chr}->{$ID}{'TSS'},$strand,2000,$GCislands{$chr});
446 }
447 }
448 }
449 }
450
451 print "Total genes (including isoforms) : $count\n" ;
452 close GENES;
453 print "\t\t$GenesFilename is read!\n" if ($verbose);;
454 #for my $gName (sort keys %{$genes{'chr18'}}) {
455
456 # print "$gName\t$genes{'chr18'}->{$gName}{'TSS'}\n";
457 #}
458
459 #-----------read file with sites miRNA, store as genes-----
460
461 if ( $MirFilename eq ""){
462 print "you did not specify file with miRNA\n" if ($verbose);;
463 }
464 else {
465 $count = 0;
466 open (MIR, "<$MirFilename ") or die "Cannot open file $MirFilename !!!!: $!";
467 #chr1 20669090 20669163 mmu-mir-206 960 +
468 while (<MIR>) {
469 chomp;
470 my ($name, $chr, $leftPos, $rightPos, $strand );
471 #1 . miRNA 20669091 20669163 . + . ACC="MI0000249"; ID="mmu-mir-206";
472 if (/([0-9XYM]+)\s.\smiRNA\s(\d+)\s(\d+)\s.\s([+-])\s.\sACC=.*ID=\"(.*)\"/) {
473 $name = $5;
474 $chr = $1;
475 $leftPos = $2;
476 $rightPos = $3;
477 $strand = $4;
478 }
479 elsif (/(.*)\s(\d+)\s(\d+)\s(.*)\s(.*)\s(.*)/){
480 $name = $4;
481 $chr = $1;
482 $leftPos = $2;
483 $rightPos = $3;
484 $strand = $6;
485 } else {
486 next;
487 }
488
489 unless ($chr =~ m/chr/) {
490 $chr = "chr".$chr;
491 }
492 my $ID = "$name\t$chr:$leftPos-$rightPos\t$count";
493
494 if ($strand eq '+') {
495 $strand = 1;
496 }
497 else {
498 $strand = -1;
499 }
500
501 unless (exists($genes{$chr})) {
502 my %h;
503 $genes{$chr} = \%h;
504 }
505 unless (exists($genes{$chr}->{$ID})) {
506 my %h1;
507 $genes{$chr}->{$ID} = \%h1;
508 $count++;
509 }
510 $genes{$chr}->{$ID}{'name'} = $name ;
511 $genes{$chr}->{$ID}{'left'} = $leftPos ;
512 $genes{$chr}->{$ID}{'right'} = $rightPos ;
513 $genes{$chr}->{$ID}{'cdsStart'} = $leftPos ;
514 $genes{$chr}->{$ID}{'cdsEnd'} = $rightPos ;
515 $genes{$chr}->{$ID}{'strand'} = $strand;
516 $genes{$chr}->{$ID}{'length'} = abs ($leftPos-$rightPos);
517 $genes{$chr}->{$ID}{'exonCount'} = 1;
518 $genes{$chr}->{$ID}{'exonStarts'} = $leftPos ;
519 $genes{$chr}->{$ID}{'exonEnds'} = $rightPos ;
520 $genes{$chr}->{$ID}{'TSS'} = ($strand == 1) ? $leftPos :$rightPos ;
521 $genes{$chr}->{$ID}{'TE'} = ($strand == -1) ? $leftPos :$rightPos ;
522 $genes{$chr}->{$ID}{'reg'} = "miRNA";
523 $genes{$chr}->{$ID}{'foldChange'} = 1;
524
525 $genes{$chr}->{$ID}{'fluo'} = "N/A";
526
527
528 $genes{$chr}->{$ID}{'RNApolScore'} = 0;
529 $genes{$chr}->{$ID}{'RNApolDist'} = $INFINITY;
530 $genes{$chr}->{$ID}{'RNApol_junctionScore'} = 0;
531 $genes{$chr}->{$ID}{'RNApol_junctionDist'} = $INFINITY;
532 $genes{$chr}->{$ID}{'K36score'} = 0;
533 $genes{$chr}->{$ID}{'K9promScore'} = 0;
534 $genes{$chr}->{$ID}{'K9promDist'} = $INFINITY;
535 $genes{$chr}->{$ID}{'K9largeScore'} = 0;
536 $genes{$chr}->{$ID}{'K9largeDist'} = $INFINITY;
537 $genes{$chr}->{$ID}{'TFpromScore'} = 0;
538 $genes{$chr}->{$ID}{'TFpromDist'} = $INFINITY;
539 $genes{$chr}->{$ID}{'TFenhScore'} = 0;
540 $genes{$chr}->{$ID}{'TFenhDist'} = $INFINITY;
541 $genes{$chr}->{$ID}{'TFintraScore'} = 0;
542 $genes{$chr}->{$ID}{'TFintraDist'} = $INFINITY;
543 $genes{$chr}->{$ID}{'TFallScore'} = 0;
544 $genes{$chr}->{$ID}{'TFallDist'} = $INFINITY;
545
546 $genes{$chr}->{$ID}{'TFFirstIntronAndIntraScore'} = 0;
547 $genes{$chr}->{$ID}{'TFFirstIntronAndIntraDist'} = $INFINITY;
548 $genes{$chr}->{$ID}{'TFFirstIntronScore'} = 0;
549 $genes{$chr}->{$ID}{'TFFirstIntronDist'} = $INFINITY;
550 $genes{$chr}->{$ID}{'TFintraMinusFirstIntronScore'} = 0;
551 $genes{$chr}->{$ID}{'TFintraMinusFirstIntronDist'} = $INFINITY;
552 $genes{$chr}->{$ID}{'TFImmDownScore'} = 0;
553 $genes{$chr}->{$ID}{'TFImmDownDist'} = $INFINITY;
554 $genes{$chr}->{$ID}{'TFpromSimpleScore'} = 0;
555 $genes{$chr}->{$ID}{'TFpromSimpleDist'} = $INFINITY;
556
557 $genes{$chr}->{$ID}{'TF_FirstExonScore'} = 0;
558 $genes{$chr}->{$ID}{'TF_FirstExonDist'} = $INFINITY;
559 $genes{$chr}->{$ID}{'TF_OtherExonsScore'} = 0;
560 $genes{$chr}->{$ID}{'TF_OtherExonsDist'} = $INFINITY;
561 $genes{$chr}->{$ID}{'TF_OtherExonsAndIntraScore'} = 0;
562 $genes{$chr}->{$ID}{'TF_OtherExonsAndIntraDist'} = $INFINITY;
563 $genes{$chr}->{$ID}{'TF_junctionScore'} = 0;
564 $genes{$chr}->{$ID}{'TF_junctionDist'} = $INFINITY;
565 $genes{$chr}->{$ID}{'TF_OtherIntronsScore'} = 0;
566 $genes{$chr}->{$ID}{'TF_OtherIntronsDist'} = $INFINITY;
567 $genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraScore'} = 0;
568 $genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraDist'} = $INFINITY;
569
570 $genes{$chr}->{$ID}{'TF_junctionAndIntraScore'} = 0;
571 $genes{$chr}->{$ID}{'TF_junctionAndIntraDist'} = $INFINITY;
572 $genes{$chr}->{$ID}{'TF_FirstExonAndIntraScore'} = 0;
573 $genes{$chr}->{$ID}{'TF_FirstExonAndIntraDist'} = $INFINITY;
574
575 $genes{$chr}->{$ID}{'TFenh60kbScore'} = 0;
576 $genes{$chr}->{$ID}{'TFenh60kbDist'} = $INFINITY;
577
578 $genes{$chr}->{$ID}{'TF5kbDownScore'} = 0;
579 $genes{$chr}->{$ID}{'TF5kbDownDist'} = $INFINITY;
580 $genes{$chr}->{$ID}{'K9enhScore'} = 0;
581 $genes{$chr}->{$ID}{'K9enhDist'} = $INFINITY;
582
583 ($genes{$chr}->{$ID}{'firstIntronStart'},$genes{$chr}->{$ID}{'firstIntronEnd'}) = (0,0);
584 ($genes{$chr}->{$ID}{'firstExonStart'},$genes{$chr}->{$ID}{'firstExonEnd'}) = (0,0);
585
586 $genes{$chr}->{$ID}{'GCisland'} = 0;
587
588 $genes{$chr}->{$ID}{'exonCount'} = 1;
589 $genes{$chr}->{$ID}{'exonStarts'} = $leftPos ;
590 $genes{$chr}->{$ID}{'exonEnds'} = $rightPos ;
591
592
593 }
594
595
596 close MIR;
597 print "\t\t$MirFilename is read!\n" if ($verbose) ;
598 print "$count miRNA\n" if ($verbose);;
599 }
600
601 #-----------read file with sites of TF1-----
602 my $numberOfAllSites = 0;
603
604 if ($TF1Filename eq "") {
605 print "No file with peaks of TF1!\n" if ($verbose) ;
606 } else {
607 open (FILE, "<$TF1Filename ") or die "Cannot open file $TF1Filename !!!!: $!";
608 $_ = <FILE>;
609 my $correction = 0;
610 my @a = split /\t/;
611 if ( $a[1] =~ m/chr/ ) {
612 $correction = 1;
613 }
614
615 while (<FILE>) {
616 chomp;
617
618 my @a = split /\t/;
619
620 my $chr = $a[0+$correction];
621 my $firstPos = $a[1+$correction];
622 my $LastPos = $a[2+$correction];
623 my $maxPos = $a[3+$correction];
624 if ($maxPos=~/\D/) {
625 $maxPos = int(($firstPos+$LastPos)/2);
626 }
627 my $score = $a[4+$correction];
628
629 for my $ID (keys %{$genes{$chr}}) {
630
631 my $distTSS = ($maxPos - $genes{$chr}->{$ID}{'TSS'})*$genes{$chr}->{$ID}{'strand'};
632 my $distTE = ($maxPos - $genes{$chr}->{$ID}{'TE'})*$genes{$chr}->{$ID}{'strand'};
633
634 if (($distTSS>= $enhLeft)&&($distTSS<$enhRight)) {
635 if ($genes{$chr}->{$ID}{'TFenhScore'}<$score) {
636 $genes{$chr}->{$ID}{'TFenhScore'}=$score;
637 $genes{$chr}->{$ID}{'TFenhDist'} = $distTSS;
638 }
639 } elsif (($distTSS>= $enhRight)&&($distTSS<=$immediateDownstream)) {
640 if ($genes{$chr}->{$ID}{'TFpromScore'}<$score) {
641 $genes{$chr}->{$ID}{'TFpromScore'}=$score;
642 $genes{$chr}->{$ID}{'TFpromDist'} = $distTSS;
643 }
644 } elsif (($distTSS >= $immediateDownstream)&&($distTE<=0)) {
645 if ($genes{$chr}->{$ID}{'TFintraScore'}<$score) {
646 $genes{$chr}->{$ID}{'TFintraScore'}=$score;
647 $genes{$chr}->{$ID}{'TFintraDist'} = $distTSS;
648 }
649 }
650 if (($distTSS>= $enhLeft)&&($distTE<=$kb5)) {
651 if ($genes{$chr}->{$ID}{'TFallScore'}<$score) {
652 $genes{$chr}->{$ID}{'TFallScore'}=$score;
653 $genes{$chr}->{$ID}{'TFallDist'} = $distTSS;
654 }
655 }
656
657 if (($distTSS>= 0)&&($distTSS<=$immediateDownstream)) {
658 if ($genes{$chr}->{$ID}{'TFImmDownScore'}<$score) {
659 $genes{$chr}->{$ID}{'TFImmDownScore'}=$score;
660 $genes{$chr}->{$ID}{'TFImmDownDist'} = $distTSS;
661 }
662 }
663 if (($distTSS<= 0)&&($distTSS>=$enhRight)) {
664 if ($genes{$chr}->{$ID}{'TFpromSimpleScore'}<$score) {
665 $genes{$chr}->{$ID}{'TFpromSimpleScore'}=$score;
666 $genes{$chr}->{$ID}{'TFpromSimpleDist'} = $distTSS;
667 }
668 }
669
670 my ($firstIntronStart,$firstIntronEnd)=($genes{$chr}->{$ID}{'firstIntronStart'},$genes{$chr}->{$ID}{'firstIntronEnd'});
671 ($firstIntronStart,$firstIntronEnd)= ($firstIntronEnd,$firstIntronStart) if ($firstIntronStart>$firstIntronEnd) ;
672
673 my ($firstExonStart,$firstExonEnd) = ($genes{$chr}->{$ID}{'firstExonStart'},$genes{$chr}->{$ID}{'firstExonEnd'}) ;
674 ($firstExonStart,$firstExonEnd)= ($firstExonEnd,$firstExonStart) if ($firstExonStart>$firstExonEnd) ;
675
676 if ($maxPos>=$firstIntronStart && $maxPos <= $firstIntronEnd) {
677 if ($genes{$chr}->{$ID}{'TFFirstIntronScore'}<$score) {
678 $genes{$chr}->{$ID}{'TFFirstIntronScore'}=$score;
679 $genes{$chr}->{$ID}{'TFFirstIntronDist'} = $distTSS;
680 }
681
682 if (($distTSS >= $immediateDownstream)&&($distTE<=0)) {
683 if ($genes{$chr}->{$ID}{'TFFirstIntronAndIntraScore'}<$score) {
684 $genes{$chr}->{$ID}{'TFFirstIntronAndIntraScore'}=$score;
685 $genes{$chr}->{$ID}{'TFFirstIntronAndIntraDist'} = $distTSS;
686 }
687 }
688 } elsif (($distTSS >= $immediateDownstream)&&($distTE<=0)) {
689 if ($genes{$chr}->{$ID}{'TFintraMinusFirstIntronScore'}<$score) {
690 $genes{$chr}->{$ID}{'TFintraMinusFirstIntronScore'}=$score;
691 $genes{$chr}->{$ID}{'TFintraMinusFirstIntronDist'} = $distTSS;
692 }
693 }
694 if (($distTSS>= $longEnhLeft)&&($distTSS<$enhRight)) {
695 if ($genes{$chr}->{$ID}{'TFenh60kbScore'}<$score) {
696 $genes{$chr}->{$ID}{'TFenh60kbScore'}=$score;
697 $genes{$chr}->{$ID}{'TFenh60kbDist'} = $distTSS;
698 }
699 }
700 if (($distTE>=0)&&($distTE<=$kb5)) {
701 if ($genes{$chr}->{$ID}{'TF5kbDownScore'}<$score) {
702 $genes{$chr}->{$ID}{'TF5kbDownScore'}=$score;
703 $genes{$chr}->{$ID}{'TF5kbDownDist'} = $distTSS;
704 }
705 }
706 if ($distTSS>=0 && $distTE<=0) {
707 my $typeIntra = &getTypeIntra($genes{$chr}->{$ID},$maxPos);
708 if ($typeIntra eq "f_exon") {
709 if ($genes{$chr}->{$ID}{'TF_FirstExonScore'}<$score) {
710 $genes{$chr}->{$ID}{'TF_FirstExonScore'}=$score;
711 $genes{$chr}->{$ID}{'TF_FirstExonDist'} = $distTSS;
712 }
713
714 if ($genes{$chr}->{$ID}{'TF_FirstExonAndIntraScore'}<$score && ($distTSS >= $immediateDownstream)&&($distTE<=0)) {
715 $genes{$chr}->{$ID}{'TF_FirstExonAndIntraScore'}=$score;
716 $genes{$chr}->{$ID}{'TF_FirstExonAndIntraDist'} = $distTSS;
717 }
718
719 } else {
720 if ($typeIntra eq "exon") {
721 if ($genes{$chr}->{$ID}{'TF_OtherExonsScore'}<$score) {
722 $genes{$chr}->{$ID}{'TF_OtherExonsScore'}=$score;
723 $genes{$chr}->{$ID}{'TF_OtherExonsDist'} = $distTSS;
724 }
725
726 if (($distTSS >= $immediateDownstream)&&($distTE<=0)) {
727 if ($genes{$chr}->{$ID}{'TF_OtherExonsAndIntraScore'}<$score) {
728 $genes{$chr}->{$ID}{'TF_OtherExonsAndIntraScore'}=$score;
729 $genes{$chr}->{$ID}{'TF_OtherExonsAndIntraDist'} = $distTSS;
730 }
731 }
732
733 } elsif ($typeIntra eq "intron") {
734 if ($genes{$chr}->{$ID}{'TF_OtherIntronsScore'}<$score) {
735 $genes{$chr}->{$ID}{'TF_OtherIntronsScore'}=$score;
736 $genes{$chr}->{$ID}{'TF_OtherIntronsDist'} = $distTSS;
737 }
738
739 if (($distTSS >= $immediateDownstream)&&($distTE<=0)) {
740 if ($genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraScore'}<$score) {
741 $genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraScore'}=$score;
742 $genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraDist'} = $distTSS;
743 }
744 }
745 } elsif ($typeIntra eq "jonction") {
746 if ($genes{$chr}->{$ID}{'TF_junctionScore'}<$score) {
747 $genes{$chr}->{$ID}{'TF_junctionScore'}=$score;
748 $genes{$chr}->{$ID}{'TF_junctionDist'} = $distTSS;
749 }
750
751 if ($genes{$chr}->{$ID}{'TF_junctionAndIntraScore'}<$score && ($distTSS >= $immediateDownstream)&&($distTE<=0)) {
752 $genes{$chr}->{$ID}{'TF_junctionAndIntraScore'}=$score;
753 $genes{$chr}->{$ID}{'TF_junctionAndIntraDist'} = $distTSS;
754 }
755
756 }
757
758 }
759 }
760
761
762 }
763 $numberOfAllSites++;
764 }
765
766 close FILE;
767 print "\t$TF1Filename is read!\n" if ($verbose) ;
768 print "$numberOfAllSites sites\n" ;
769 }
770
771 #-----------read file with sites RNApolII-----
772 $numberOfAllSites = 0;
773
774 if ($RNApolFilename eq "") {
775 print "No file with peaks of RNA pol II!\n" if ($verbose) ;
776 } else {
777 open (FILE, "<$RNApolFilename ") or die "Cannot open file $RNApolFilename !!!!: $!";
778 $_ = <FILE>;
779 my @a = split /\t/;
780 my $correction = 0;
781
782 if ($a[0]=~m/chr/) {
783 $correction = -1;
784 }
785 #seek (FILE, 0, 0);
786 while (<FILE>) {
787 chomp;
788
789 my @a = split /\t/;
790
791 my $chr = $a[1+$correction];
792 my $firstPos = $a[2+$correction];
793 my $LastPos = $a[3+$correction];
794 my $maxPos = $a[4+$correction];
795 my $score = $a[5+$correction];
796 #print "$numberOfAllSites: $score\n";
797 next if ($score < $cutoff_rp);
798
799 for my $ID (keys %{$genes{$chr}}) {
800
801 my $distTSS = ($maxPos - $genes{$chr}->{$ID}{'TSS'})*$genes{$chr}->{$ID}{'strand'};
802 my $distTE = ($maxPos - $genes{$chr}->{$ID}{'TE'})*$genes{$chr}->{$ID}{'strand'};
803
804 if (($distTSS>= $enhRight)&&($distTSS<=$immediateDownstream)) {
805 if ($genes{$chr}->{$ID}{'RNApolScore'}<$score) {
806 $genes{$chr}->{$ID}{'RNApolScore'}=$score;
807 $genes{$chr}->{$ID}{'RNApolDist'} = $distTSS;
808 }
809 }
810 if ($distTSS>=0 && $distTE<=0) {
811 my $typeIntra = &getTypeIntra($genes{$chr}->{$ID},$maxPos);
812 if ($typeIntra eq "jonction") {
813 if ($genes{$chr}->{$ID}{'RNApol_junctionScore'}<$score) {
814 $genes{$chr}->{$ID}{'RNApol_junctionScore'}=$score;
815 $genes{$chr}->{$ID}{'RNApol_junctionDist'} = $distTSS;
816 }
817 }
818 }
819 }
820
821 $numberOfAllSites++;
822 }
823 close FILE;
824 print "\t$RNApolFilename is read!\n$numberOfAllSites sites\n" if ($verbose) ;
825
826 }
827
828 #-----------read file with sites K36me3-----
829 $numberOfAllSites = 0;
830 my @K36Score;
831
832 if ($H3K36Me3polFilename eq "") {
833 print "No file with peaks of H3K36me3!\n" if ($verbose) ;
834 } else {
835 open (FILE, "<$H3K36Me3polFilename ") or die "Cannot open file $H3K36Me3polFilename !!!!: $!";
836
837 $_ = <FILE>;
838 my @a = split /\t/;
839 my $correction = 0;
840
841 if ($a[0]=~m/chr/) {
842 $correction = -1;
843 }
844
845 while (<FILE>) {
846 chomp;
847
848 my @a = split /\t/;
849
850 my $chr = $a[1+$correction];
851 my $firstPos = $a[2+$correction];
852 my $lastPos = $a[3+$correction];
853 my $maxPos = $a[4+$correction];
854 my $score= $a[5+$correction];
855
856 for my $ID (keys %{$genes{$chr}}) {
857
858 my $distTSS = ($maxPos - $genes{$chr}->{$ID}{'TSS'})*$genes{$chr}->{$ID}{'strand'};
859 my $distTE = ($maxPos - $genes{$chr}->{$ID}{'TE'})*$genes{$chr}->{$ID}{'strand'};
860
861 my $scoreToadd = 0;
862
863 if (($firstPos>=$genes{$chr}->{$ID}{'left'})&&($lastPos<=$genes{$chr}->{$ID}{'right'})) {
864 $scoreToadd = $score/2.*($lastPos-$firstPos+1)/($genes{$chr}->{$ID}{'right'}-$genes{$chr}->{$ID}{'left'}+1);
865 }
866 if (($firstPos>=$genes{$chr}->{$ID}{'left'})&&($firstPos<$genes{$chr}->{$ID}{'right'})&&($lastPos>$genes{$chr}->{$ID}{'right'})) {
867 $scoreToadd = $score/2.*($genes{$chr}->{$ID}{'right'}-$firstPos+1)/($genes{$chr}->{$ID}{'right'}-$genes{$chr}->{$ID}{'left'}+1);
868 }
869 if (($firstPos<$genes{$chr}->{$ID}{'left'})&&($lastPos>$genes{$chr}->{$ID}{'left'})&&($lastPos<=$genes{$chr}->{$ID}{'right'})) {
870 $scoreToadd = $score/2.*($lastPos-$genes{$chr}->{$ID}{'left'}+1)/($genes{$chr}->{$ID}{'right'}-$genes{$chr}->{$ID}{'left'}+1);
871 }
872 if (($firstPos<$genes{$chr}->{$ID}{'left'})&&($lastPos>$genes{$chr}->{$ID}{'right'})) {
873 my $scoreToadd = $score/2.*($lastPos-$firstPos+1);
874 }
875 $genes{$chr}->{$ID}{'K36score'} += $scoreToadd;
876 }
877 $numberOfAllSites++;
878 }
879 close FILE;
880 print "\t$H3K36Me3polFilename is read!\n$numberOfAllSites sites\n" if ($verbose) ;
881 }
882
883 #-----------read file with sites H3K9me3-----
884 $numberOfAllSites = 0;
885
886
887 if ($H3K9Me3polFilename eq "") {
888 print "No file with peaks of H3K9me3!\n" if ($verbose) ;
889 } else {
890 open (FILE, "<$H3K9Me3polFilename ") or die "Cannot open file $H3K9Me3polFilename !!!!: $!";
891
892 $_ = <FILE>;
893 my @a = split /\t/;
894 my $correction = 0;
895
896 if ($a[0]=~m/chr/) {
897 $correction = -1;
898 }
899
900 while (<FILE>) {
901 chomp;
902 my @a = split /\t/;
903 my $chr = $a[1+$correction];
904 my $firstPos = $a[2+$correction];
905 my $LastPos = $a[3+$correction];
906 my $maxPos = $a[4+$correction];
907 my $score = $a[5+$correction];
908
909 next if ($score < $cutoff_k9);
910
911 for my $ID (keys %{$genes{$chr}}) {
912
913 my $distTSS = ($maxPos - $genes{$chr}->{$ID}{'TSS'})*$genes{$chr}->{$ID}{'strand'};
914 my $distTE = ($maxPos - $genes{$chr}->{$ID}{'TE'})*$genes{$chr}->{$ID}{'strand'};
915
916 if (($distTSS>= $enhRight)&&($distTSS<=$immediateDownstream)) {
917 if ($genes{$chr}->{$ID}{'K9promScore'}<$score) {
918 $genes{$chr}->{$ID}{'K9promScore'}=$score;
919 $genes{$chr}->{$ID}{'K9promDist'} = $distTSS;
920 }
921 } elsif (($distTSS >= -$K9dist)&&($distTSS<=$K9dist)) {
922 if ($genes{$chr}->{$ID}{'K9largeScore'}<$score) {
923 $genes{$chr}->{$ID}{'K9largeScore'}=$score;
924 $genes{$chr}->{$ID}{'K9largeDist'} = $distTSS;
925 }
926 }
927 if (($distTSS>= $enhLeft)&&($distTSS<$enhRight)) {
928 if ($genes{$chr}->{$ID}{'K9enhScore'}<$score) {
929 $genes{$chr}->{$ID}{'K9enhScore'}=$score;
930 $genes{$chr}->{$ID}{'K9enhDist'} = $distTSS;
931 }
932 }
933 }
934 $numberOfAllSites++;
935 }
936 close FILE;
937 print "\t$H3K9Me3polFilename is read!\n$numberOfAllSites sites\n" if ($verbose) ;
938 }
939
940 #-----------output all-----
941 #unless($initialTable eq "") {}
942
943
944 open (OUT , ">$outname") or die "Cannot open file $outname!!!!: $!";
945
946 print OUT "name\tchr\tstart\tend\tstrand\tReg\tfoldChange\t";
947
948 if ($GCislands ne "") {
949 print OUT "GC-island\t";
950 }
951
952 if ( $fluoFile ne "") {
953 print OUT "fluorescence\t";
954 }
955
956 if ($RNApolFilename ne "") {
957 print OUT "RNApolII_score\tRNApolII_distTSS\tRNApol_junctionScore\tRNApol_junctionDist\t";
958 }
959 if ($H3K36Me3polFilename ne "") {
960 print OUT "H3K36me3_score\t";
961 }
962 if ($H3K9Me3polFilename ne "") {
963 print OUT "H3K9me3_score_prom\tH3K9me3_distTSS_prom\tH3K9me3_score_large\tH3K9me3_distTSS_large\tH3K9me3_score_enh\tH3K9me3_distTSS_enh\t";
964 }
965 if ($TF1Filename ne "") {
966 print OUT "TF_score_Gene\tTF_distTSS_Gene\t";
967 print OUT "TF_score_Promoter\tTF_distTSS_Promoter\t";
968 print OUT "TF_score_ImmDown\tTF_distTSS_ImmDown\t";
969 print OUT "TF_score_PromoterORImmDown\tTF_distTSS_PromoterORImmDown\t";
970 print OUT "TF_score_Enhancer\tTF_distTSS_Enhancer\t";
971 print OUT "TF_score_Intragenic\tTF_distTSS_Intragenic\t";
972 print OUT "TF_score_GeneDownstream\tTF_distTSS_GeneDownstream\t";
973 print OUT "TF_score_FirstExon\tTF_distTSS_FirstExon\t";
974 print OUT "TF_score_FisrtIntron\tTF_distTSS_FisrtIntron\t";
975 print OUT "TF_score_FirstExonAND>$immediateDownstream\tTF_distTSS_FirstExonAND>$immediateDownstream\t";
976 print OUT "TF_score_FisrtIntronAND>$immediateDownstream\tTF_distTSS_FisrtIntronAND>$immediateDownstream\t";
977 #print OUT "TF_score_IntraMinusFisrtIntron\tTF_distTSS_IntraMinusFisrtIntron\t";
978
979 #print OUT "TF_score_enh60kb\tTF_distTSS_enh60kb\t";
980
981 print OUT "TF_score_Exons2,3,4,etc\tTF_distTSS_Exons2,3,4,etc\t";
982 print OUT "TF_score_Exons2,3,4,etcAND>$immediateDownstream\tTF_distTSS_Exons2,3,4,etcAND>$immediateDownstream\t";
983
984 print OUT "TF_score_Introns2,3,4,etc\tTF_distTSS_Introns2,3,4,etc\t";
985 print OUT "TF_score_Introns2,3,4,etcAND>$immediateDownstream\tTF_distTSS_Introns2,3,4,etcAND>$immediateDownstream\t";
986
987 print OUT "TF_score_EIjunction\tTF_distTSS_EIjunction\t";
988 print OUT "TF_score_EIjunctionAND>$immediateDownstream\tTF_distTSS_EIjunctionAND>$immediateDownstream";
989 }
990
991 print OUT "\n";
992
993 for my $chr (keys %genes) {
994 for my $ID (keys %{$genes{$chr}}) {
995 print OUT "$genes{$chr}->{$ID}{'name'}\t$chr\t$genes{$chr}->{$ID}{'left'}\t$genes{$chr}->{$ID}{'right'}\t$genes{$chr}->{$ID}{'strand'}\t$genes{$chr}->{$ID}{'reg'}\t$genes{$chr}->{$ID}{'foldChange'}\t";
996
997 if ($GCislands ne "") {
998 print OUT "$genes{$chr}->{$ID}{'GCisland'}\t";
999 }
1000
1001 if ( $fluoFile ne "") {
1002 print OUT "$genes{$chr}->{$ID}{'fluo'}\t";
1003 }
1004
1005 if ($RNApolFilename ne "") {
1006 print OUT "$genes{$chr}->{$ID}{'RNApolScore'}\t$genes{$chr}->{$ID}{'RNApolDist'}\t$genes{$chr}->{$ID}{'RNApol_junctionScore'}\t$genes{$chr}->{$ID}{'RNApol_junctionDist'}\t";
1007 }
1008 if ($H3K36Me3polFilename ne "") {
1009 print OUT "$genes{$chr}->{$ID}{'K36score'}\t";
1010 }
1011 if ($H3K9Me3polFilename ne "") {
1012 print OUT "$genes{$chr}->{$ID}{'K9promScore'}\t$genes{$chr}->{$ID}{'K9promDist'}\t$genes{$chr}->{$ID}{'K9largeScore'}\t$genes{$chr}->{$ID}{'K9largeDist'}\t$genes{$chr}->{$ID}{'K9enhScore'}\t$genes{$chr}->{$ID}{'K9enhDist'}\t";
1013 }
1014 if ($TF1Filename ne "") {
1015 print OUT "$genes{$chr}->{$ID}{'TFallScore'}\t$genes{$chr}->{$ID}{'TFallDist'}\t";
1016 print OUT "$genes{$chr}->{$ID}{'TFpromSimpleScore'}\t$genes{$chr}->{$ID}{'TFpromSimpleDist'}\t";
1017 print OUT "$genes{$chr}->{$ID}{'TFImmDownScore'}\t$genes{$chr}->{$ID}{'TFImmDownDist'}\t";
1018 print OUT "$genes{$chr}->{$ID}{'TFpromScore'}\t$genes{$chr}->{$ID}{'TFpromDist'}\t";
1019 print OUT "$genes{$chr}->{$ID}{'TFenhScore'}\t$genes{$chr}->{$ID}{'TFenhDist'}\t";
1020 print OUT "$genes{$chr}->{$ID}{'TFintraScore'}\t$genes{$chr}->{$ID}{'TFintraDist'}\t";
1021 print OUT "$genes{$chr}->{$ID}{'TF5kbDownScore'}\t$genes{$chr}->{$ID}{'TF5kbDownDist'}\t";
1022 print OUT "$genes{$chr}->{$ID}{'TF_FirstExonScore'}\t$genes{$chr}->{$ID}{'TF_FirstExonDist'}\t";
1023 print OUT "$genes{$chr}->{$ID}{'TF_FirstExonAndIntraScore'}\t$genes{$chr}->{$ID}{'TF_FirstExonAndIntraDist'}\t";
1024 print OUT "$genes{$chr}->{$ID}{'TFFirstIntronScore'}\t$genes{$chr}->{$ID}{'TFFirstIntronDist'}\t";
1025 print OUT "$genes{$chr}->{$ID}{'TFFirstIntronAndIntraScore'}\t$genes{$chr}->{$ID}{'TFFirstIntronAndIntraDist'}\t";
1026 #print OUT "$genes{$chr}->{$ID}{'TFintraMinusFirstIntronScore'}\t$genes{$chr}->{$ID}{'TFintraMinusFirstIntronDist'}\t";
1027
1028 #print OUT "$genes{$chr}->{$ID}{'TFenh60kbScore'}\t$genes{$chr}->{$ID}{'TFenh60kbDist'}\t";
1029
1030 print OUT "$genes{$chr}->{$ID}{'TF_OtherExonsScore'}\t$genes{$chr}->{$ID}{'TF_OtherExonsDist'}\t";
1031 print OUT "$genes{$chr}->{$ID}{'TF_OtherExonsAndIntraScore'}\t$genes{$chr}->{$ID}{'TF_OtherExonsAndIntraDist'}\t";
1032
1033 print OUT "$genes{$chr}->{$ID}{'TF_OtherIntronsScore'}\t$genes{$chr}->{$ID}{'TF_OtherIntronsDist'}\t";
1034 print OUT "$genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraScore'}\t$genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraDist'}\t";
1035
1036 print OUT "$genes{$chr}->{$ID}{'TF_junctionScore'}\t$genes{$chr}->{$ID}{'TF_junctionDist'}\t";
1037 print OUT "$genes{$chr}->{$ID}{'TF_junctionAndIntraScore'}\t$genes{$chr}->{$ID}{'TF_junctionAndIntraDist'}";
1038 }
1039 print OUT "\n";
1040 }
1041 }
1042
1043 close OUT;
1044
1045 ###################################
1046 sub med {
1047 my @arr = @_;
1048 my $med = 0;
1049 @arr = sort {$a <=> $b} @arr;
1050 if ((scalar(@arr)/2) =~ m/[\.\,]5/) {
1051 return $arr[floor(scalar(@arr)/2)];
1052 } else {
1053 return ($arr[scalar(@arr)/2]+$arr[scalar(@arr)/2-1])/2;
1054 }
1055 $med;
1056 }
1057
1058 sub checkIfGC {
1059 my ($TSS,$strand,$dist,$GCislandsChr)=@_;
1060 my $ifGC = 0;
1061 my $leftProm=$TSS-$dist;
1062 my $rightProm = $TSS;
1063 if ($strand== -1) {
1064 my $leftProm=$TSS;
1065 my $rightProm = $TSS+$dist;
1066 } #print "$leftProm\t"; print "$rightProm\n";
1067 for my $leftGC (keys %{$GCislandsChr}) {
1068 my $rightGC = $GCislandsChr->{$leftGC};
1069 if ($leftGC>=$leftProm&&$leftGC<=$rightProm || $rightGC>=$leftProm&&$rightGC<=$rightProm) {
1070 return "GC-island";
1071 }
1072 }
1073 return $ifGC ;
1074 }
1075
1076 sub getFirstIntron {
1077 my ($exonCount,$exonStarts,$exonEnds,$strand) = @_;
1078 my ($left,$right);
1079 if ($exonCount == 1) {
1080 return (0,0);
1081 }
1082 if ($strand == 1) {
1083 $left = (split ",", $exonEnds)[0];
1084 $right = (split (",", $exonStarts))[1];
1085 } else {
1086 $left = (split (",", $exonEnds))[$exonCount-2];
1087 $right = (split (",", $exonStarts))[$exonCount-1];
1088 }
1089 ($left,$right);
1090 }
1091
1092 sub getFirstExon {
1093 my ($exonCount,$exonStarts,$exonEnds,$strand) = @_;
1094 my ($left,$right);
1095 if ($exonCount == 1) {
1096 return (0,0);
1097 }
1098 if ($strand == 1) {
1099 $left = (split ",", $exonStarts)[0];
1100 $right = (split (",", $exonEnds))[0]-$jonctionSize;
1101 } else {
1102 $left = (split (",", $exonStarts))[$exonCount-1]+$jonctionSize;
1103 $right = (split (",", $exonEnds))[$exonCount-1];
1104 }
1105 ($left,$right);
1106 }
1107
1108
1109 sub getIntronExon {
1110 my ($pos,$exonCount,$exonStarts,$exonEnds,$strand) = @_;
1111 my (@left,@right);
1112 @left = (split ",", $exonStarts);
1113 @right = (split (",", $exonEnds));
1114
1115 for (my $i = 0; $i<$exonCount;$i++) {
1116 #print "$left[$i] <= $pos ? $pos <= $right[$i]\n";
1117 if (($left[$i]+$jonctionSize < $pos) && ($pos < $right[$i]-$jonctionSize)) {
1118 #print "URA!\n";
1119 return "exon";
1120 } elsif (($i+1<$exonCount)&&($right[$i]+$jonctionSize < $pos) && ($pos < $left[$i+1]-$jonctionSize)) {
1121 return "intron";
1122 }
1123 }
1124 return "jonction";
1125 }
1126
1127
1128 sub getTypeIntra {
1129
1130 my ($geneEntry, $pos) = @_;
1131 my $type;
1132
1133 if (($pos >= $geneEntry->{'firstIntronStart'})&&($pos <=$geneEntry->{'firstIntronEnd'})) {
1134 return "f_intron";
1135 }
1136 if (($pos >= $geneEntry->{'firstExonStart'})&&($pos <=$geneEntry->{'firstExonEnd'})) {
1137 return "f_exon";
1138 }
1139 $type = getIntronExon ($pos, $geneEntry->{'exonCount'},$geneEntry->{'exonStarts'},$geneEntry->{'exonEnds'},$geneEntry->{'strand'});
1140 return $type;
1141 }
1142
1143 sub getRNAlength {
1144 my ($exonStarts,$exonEnds) = @_;
1145 my (@left,@right);
1146 @left = (split ",", $exonStarts);
1147 @right = (split (",", $exonEnds));
1148 my $length = 0;
1149 for (my $i = 0; $i<scalar(@right);$i++) {
1150 $length+=$right[$i]-$left[$i];
1151 }
1152 #print STDERR "length = $length\n";
1153 return $length;
1154 }