Mercurial > repos > alermine > nebula
comparison [APliBio]Nebula tools suite/Nebula/AnnotateGenes/geneAnnotation.pl @ 0:2ec3ba0e9e70 draft
Uploaded
author | alermine |
---|---|
date | Thu, 25 Oct 2012 08:18:25 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2ec3ba0e9e70 |
---|---|
1 #:::::::::::::::::::::::::g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@j:::: | |
2 #::::::::::::::::::::::;g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@E::: | |
3 #:ttt:::::::::::::::::@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@p@; | |
4 #:t:::::::::::::::::g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
5 #:t::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
6 #:::::::::::::z;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
7 #::::::::::::i@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
8 #::::::::::::@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@$@@@@ | |
9 #:::::::::::3@@@@@@@@@@@@@@@@@@@@@@@@@B@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
10 #::::::::::3@@@@@@@@@@@@@@@@@@@@@BEEESSE5EEEEBBM@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
11 #::::::::::3@@@@@@@@@@@@@@@@@@@@BEEEEEE35EE55E2355E5SBMB@@@@@@@@@@@@@@@@@$ | |
12 #::::::::::@@@@@@@@@@@@@@@@@@@EEEE55533t3tttt::::::!!!!7755E755SBBMMM@@@MM | |
13 #::::::::::3@@@@@@@@@@@@@@@@@@EEEE2t3ttttt:::::::::::::::::::::::!7?5225EE | |
14 #::::::::::3@@@@@@@@@@@@@@@@@@EEEEE31t::::::::::::::::::::::::::::::::3E5@ | |
15 #::::::::::3@@@@@@@@@@@@@@@@@@EEEEEEtt:::::::::::::::::::::::::::::::::353 | |
16 #::::::::::3@@@@@@@@@@@@@@@@@@EEEEEE1ttz::::::::::::::::::::::::::::::::35 | |
17 #:::::::::::@@@@@@@@@@@@@@@@@@EEEEEEEtz1::::::::::::::::::::::::::::::::t: | |
18 #:::::::::!3@@@@@@@@@@@@@@@@@@@EEEEEttt::::::::::::::::::::::::::::::::;zz | |
19 #::::::::::@@@@@@@@@@@@@@@@@@@@EEEEEttt:::::z;z:::::::::::::::::::::::::13 | |
20 #::::::::::3B@@@@@@@@@@@@@@@@@@EEEEEEE3tt:czzztti;:::::::::::::::::::::::3 | |
21 #::::ttt::::3@@@@@@@@@@@@@@@@EEEEE5EE25Ezt1EEEz5Etzzz;;;;::::::::::::::::: | |
22 #:::::::::::I9@@@@@@@@@@@@@@@@@@@@@@@@@@EEEEEE@@@@@@@@@@@@@@Ez;::::::::::: | |
23 #:::::::::::::E@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@Ez:::::: | |
24 #::::::::::::::E@@@@@@@@@@@@@@@@@@@@@@@@@@@@@BE5EBB@@@@@@@@@@@@@@@EEE::::: | |
25 #:::::::::::::::@@@@@@@@@@@@@@@@@@@@@@@@@@@@E1::35@@@@@@@@@@ME3MMME2:::::: | |
26 #:::::::::::::::?@@@@@@@@@@@@@@@@@@M@@@@@@@EE:::::3SB@@BBESEEt:::::::::::: | |
27 #::::::::::::::::J$@@@@@@@B@@@@@@@@@@@@@@@@EE:::::::!35E33t::::::::::::::: | |
28 #:::::::::::::::::3@E@@@EE5EESE5EESE@@@@@@@Et::::::::::::tz::::::::::::::: | |
29 #:::::::::::::::::J@E$@EEE5133555SE@@@@@@@@Et::::::::::::::::::::::::::::: | |
30 #::::::::::::::::::E@E@EEEEtt3523EEE@@@@@@@E:::::::::::::::::::::::::::::: | |
31 #:t::::::::::::::::JEE3@@@EEEEEEEEEE@@@@@@@E:::::::::t;::::::::::::::::::: | |
32 #:t:::::::::::::::::!5ES@EEEEEEEEES@@@@@@@@@E;:::;;;:3Ez:::::::::::::::::: | |
33 #:t::::::::::::::::::::JE@@EEEEEEE@@@@@@@@@@@@@@@@ME!:::;::::::::::::::::: | |
34 #:tz::::::::::::::::::::JE@@@EEEE@@@@@@@@@@@@@@EE!:::::::t:::::::::::::::: | |
35 #:t::::::::::::::::::::::3@@@@@@@@@@@@@@@@@@ESBE:::::::::::::::::::::::::: | |
36 #:::::::::::::::::::::::::Q@@@@@@@@@@@@@@@@EE3EE;:::::zzzz:::::::::::::::: | |
37 #:::::::::::::::::::::::::3@@@@@@@@@@@@@@@@@@@@@@NN@@@@@@Ez::::::::::::::: | |
38 #:zt:::::::::::::::::::::::3@@@@EE@@@@@@@@@@EEEEt::;z113E5t::::::::::::::: | |
39 #::tt:::::::::::::::::::::::3@@@E@@@@@@@@@@@@@@@@BEt::::::::::::::::t::::: | |
40 #:tt:t:::::::::::::::::::::::?S@@@@@@@@@@@BBEEE51!::::::::::::::zzzEt::::: | |
41 #::::::::::::::::::::::::::::::3Q@@@@@@@BEEEEEt:::::::::::::;zz@@@EE:::::: | |
42 #::::::::::::::::::::::::::::::::75B@@@@@EEEtt;:::::::::;zz@@@@BEEEtz::::: | |
43 #::::::::::::::::::::::::::::::::::::?9@@@@@@@@@@@E2Ezg@@@@@B@@@EEEE1t:::: | |
44 #:::::::::::::::::::::::::::::::::::::::3@@@@@@@@@@@@@@@@@@@E@EEEEEEEzzz:: | |
45 #::::::::::::::::::::::::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@EEEEEEE5ttttt | |
46 #:::::::::::::::::::::::::::::::;g@@@@@@@@@@@@@@@@@@@@@@@@@@EEEEEEEEEEEtzt | |
47 #::::::::::::::::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@E@@EEEEEEEEEEEE@@@ | |
48 #::::::::::::::::::::::::::g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@EEEE3EEEE@@@@@@@ | |
49 #:::::::::::::::::::::;;g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@EEEt33@@@@@@@@@@ | |
50 #:::::::::::::::::;g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@E@@@@@@EEEtg@@@@@@@@@@@@ | |
51 #::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@EEEE@@@@@@@@@@@@@@@@@@@@@@@@ | |
52 #:::::::::::::@@@@@@@@@@@@@@@@@$@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
53 #::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
54 # | |
55 # Copyleft ↄ⃝ 2012 Institut Curie | |
56 # Author(s): Valentina Boeva, Alban Lermine (Institut Curie) 2012 | |
57 # Contact: valentina.boeva@curie.fr, alban.lermine@curie.fr | |
58 # This software is distributed under the terms of the GNU General | |
59 # Public License, either Version 2, June 1991 or Version 3, June 2007. | |
60 | |
61 #!/usr/bin/perl -w | |
62 #outputs statistics for all genes in the list | |
63 | |
64 #different boundaries | |
65 #no motif p-value for binding sites | |
66 #read directly No-resp/Up/down category | |
67 | |
68 #all isoforms from the file with genes | |
69 | |
70 #RNApolII sites on junctions | |
71 | |
72 use strict; | |
73 use POSIX; | |
74 | |
75 my $usage = qq{ | |
76 $0 | |
77 | |
78 ----------------------------- | |
79 mandatory parameters: | |
80 | |
81 -g filename file with all genes | |
82 -rp filename file with sites of RNApolII | |
83 -k36 filename file with sites of K36me3 | |
84 -tf filename file with sites of TF1 | |
85 | |
86 -i filename file with a table where to add colomnes | |
87 -add values which colomns to add | |
88 | |
89 ----------------------------- | |
90 optional parameters: | |
91 | |
92 -o filename output filename (defaut "genes.output.txt") | |
93 -v verbose mode | |
94 -mir filename file with positions of miRNA | |
95 -k9 filename ile with sites of K9me3 | |
96 | |
97 -c_rp value cutoff for -rp | |
98 -c_k9 value cutoff for -k9 | |
99 -c_k36 value cutoff for -k36 | |
100 | |
101 -selG filename selected genes (up-down-regulated) | |
102 -fluo filename file with fluorescence | |
103 -gc filename file with gc-islands | |
104 | |
105 -long for each gene take the longest isoform | |
106 | |
107 | |
108 }; | |
109 | |
110 if(scalar(@ARGV) == 0){ | |
111 print $usage; | |
112 exit(0); | |
113 } | |
114 | |
115 ## mandatory arguments | |
116 | |
117 my $RNApolFilename = ""; | |
118 my $H3K36Me3polFilename = ""; | |
119 my $H3K9Me3polFilename = ""; | |
120 my $TF1Filename = ""; | |
121 my $TF2Filename = ""; | |
122 my $GenesFilename = ""; | |
123 my $MirFilename = ""; | |
124 my $TF1FilenameALL = ""; | |
125 my $TF2FilenameALL = ""; | |
126 my $SelectedGenesFilename = ""; | |
127 my $fluoFile = ""; | |
128 my $initialTable = ""; | |
129 my $colomnesToAdd = ""; | |
130 | |
131 my $enhLeft = -30000; | |
132 my $longEnhLeft = -60000; | |
133 my $enhRight = -1500; | |
134 my $immediateDownstream = 2000; | |
135 my $K9dist = 5000; | |
136 my $kb5 = 5000; | |
137 my $INFINITY = 10000000000; | |
138 my $jonctionSize = 50; | |
139 ## optional arguments | |
140 my $outname = "genes.output.txt"; | |
141 my $verbose = 0; | |
142 my $GCislands = ""; | |
143 | |
144 my $longest = 0; | |
145 | |
146 #my $cutoff_tf1 = 0; | |
147 #my $cutoff_tf2 = 0; | |
148 my $cutoff_tf1All = 0; | |
149 my $cutoff_tf2All = 0; | |
150 my $cutoff_rp = 0; | |
151 my $cutoff_k9 = 0; | |
152 my $cutoff_k36 = 0; | |
153 my $ifTFcoord = 0; | |
154 | |
155 ## parse command line arguments | |
156 | |
157 while(scalar(@ARGV) > 0){ | |
158 my $this_arg = shift @ARGV; | |
159 if ( $this_arg eq '-h') {print "$usage\n"; exit; } | |
160 | |
161 elsif ( $this_arg eq '-selG') {$SelectedGenesFilename = shift @ARGV;} | |
162 | |
163 elsif ( $this_arg eq '-g') {$GenesFilename = shift @ARGV;} | |
164 elsif ( $this_arg eq '-rp') {$RNApolFilename = shift @ARGV;} | |
165 elsif ( $this_arg eq '-k36') {$H3K36Me3polFilename = shift @ARGV;} | |
166 elsif ( $this_arg eq '-k9') {$H3K9Me3polFilename = shift @ARGV;} | |
167 | |
168 elsif ( $this_arg eq '-tf') {$TF1Filename = shift @ARGV;} | |
169 | |
170 elsif ( $this_arg eq '-v') {$verbose = 1;} | |
171 | |
172 elsif ( $this_arg eq '-long') {$longest = 1;} | |
173 | |
174 | |
175 elsif ( $this_arg eq '-o') {$outname = shift @ARGV;} | |
176 elsif ( $this_arg eq '-mir') {$MirFilename = shift @ARGV;} | |
177 | |
178 elsif ( $this_arg eq '-c_rp') {$cutoff_rp = shift @ARGV;} | |
179 elsif ( $this_arg eq '-c_k9') {$cutoff_k9 = shift @ARGV;} | |
180 elsif ( $this_arg eq '-c_k36') {$cutoff_k36 = shift @ARGV;} | |
181 | |
182 elsif ( $this_arg eq '-fluo') {$fluoFile = shift @ARGV;} | |
183 elsif ( $this_arg eq '-gc') {$GCislands = shift @ARGV;} | |
184 | |
185 elsif ( $this_arg eq '-i') {$initialTable = shift @ARGV;} | |
186 elsif ( $this_arg eq '-add') {$colomnesToAdd = shift @ARGV;} | |
187 | |
188 elsif ( $this_arg eq '-lp') {$enhRight = shift @ARGV;} | |
189 elsif ( $this_arg eq '-rightp') {$immediateDownstream = shift @ARGV;} | |
190 elsif ( $this_arg eq '-enh') {$enhLeft = shift @ARGV;} | |
191 elsif ( $this_arg eq '-dg') {$kb5 = shift @ARGV;} | |
192 | |
193 elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";} | |
194 } | |
195 | |
196 | |
197 if ( $GenesFilename eq "" ){ | |
198 die "you should specify a file with genes \n"; | |
199 } | |
200 if(( $RNApolFilename eq "")&&($H3K36Me3polFilename eq "")&&($TF1Filename eq "")&&($H3K9Me3polFilename eq "")){ | |
201 die "you should specify at least one file with peaks\n"; | |
202 } | |
203 | |
204 | |
205 #-----------read selected genes---------------- | |
206 my %selectedGenes; | |
207 my %selectedGenesFoldChange; | |
208 if ( $SelectedGenesFilename ne "") { | |
209 open (FILE, "<$SelectedGenesFilename ") or die "Cannot open file $SelectedGenesFilename !!!!: $!"; | |
210 while (<FILE>) { | |
211 chomp; | |
212 my @a = split/\t/; | |
213 $selectedGenes{$a[1]} = $a[3]; | |
214 $selectedGenesFoldChange{$a[1]} = $a[2]; | |
215 #print "gene:$a[1],reg:$selectedGenes{$a[1]},FC:$selectedGenesFoldChange{$a[1]}\n"; | |
216 } | |
217 | |
218 close FILE; | |
219 print "\t\t$SelectedGenesFilename is read!\n" if ($verbose); | |
220 } | |
221 | |
222 #-----------read genes with fluorescence--------- | |
223 my %fluoGenes; | |
224 if ( $fluoFile ne "") { | |
225 open (FILE, "<$fluoFile ") or die "Cannot open file $fluoFile !!!!: $!"; | |
226 my $gene = ""; | |
227 my $med = 0; | |
228 my %h; | |
229 while (<FILE>) { | |
230 chomp; | |
231 my @a = split/\t/; | |
232 | |
233 next if (scalar(@a)<5); | |
234 next if ($a[0] eq "probesets"); | |
235 next unless ($a[0] =~m/\S/); | |
236 next unless ($a[4] =~m/\S/); | |
237 if ($gene ne "" && $gene ne $a[2]) { | |
238 #calcMed | |
239 $med = med(keys %h); | |
240 $fluoGenes{$gene} = $med; | |
241 $med=0; | |
242 delete @h{keys %h}; | |
243 } else { | |
244 #$h{$a[4]} = 1; | |
245 } | |
246 $gene = $a[2]; | |
247 $h{$a[4]} = 1; | |
248 #print "keys ", scalar(keys %h),"\t",keys %h,"\n"; | |
249 } | |
250 if ($gene ne "") { | |
251 $med = med(keys %h); | |
252 $fluoGenes{$gene} = $med; | |
253 } | |
254 close FILE; | |
255 print "\t\t$fluoFile is read!\n" if ($verbose);; | |
256 } | |
257 #-----------read GC-islands---------------- | |
258 my %GCislands; | |
259 if ($GCislands ne "") { | |
260 open (FILE, "<$GCislands ") or die "Cannot open file $GCislands !!!!: $!"; | |
261 | |
262 while (<FILE>) { | |
263 chomp; | |
264 my @a = split/\t/; | |
265 #bin chrom chromStart chromEnd name length cpgNum gcNum perCpg perGc obsExp | |
266 #107 chr1 36568608 36569851 CpG: 128 1243 128 766 20.6 61.6 1.09 | |
267 my $chr = $a[1]; | |
268 my $start = $a[2]; | |
269 my $end = $a[3]; | |
270 $GCislands{$chr}->{$start}=$end; | |
271 } | |
272 close FILE; | |
273 if ($verbose) { | |
274 print "$GCislands is read\n"; | |
275 } | |
276 } elsif ($verbose) { | |
277 print "you did not specify a file with GC-islands\n"; | |
278 } | |
279 | |
280 #-----------read genes---------------- | |
281 | |
282 my %genes; | |
283 | |
284 my $count = 0; | |
285 | |
286 open (GENES, "<$GenesFilename") or die "Cannot open file $GenesFilename!!!!: $!"; | |
287 <GENES>; | |
288 while (<GENES>) { | |
289 chomp; | |
290 if (/(chr.*)\s([+-])\s(\d+)\s(\d+)\s(\d+)\s(\d+)\s(\d+)\s(\S+)\s(\S+)\s\S+\s(\S+)/){ | |
291 my $name = $10; | |
292 my $chr = $1; | |
293 my $strand = $2; | |
294 if ($strand eq '+') { | |
295 $strand = 1; | |
296 } | |
297 else { | |
298 $strand = -1; | |
299 } | |
300 my $leftPos = $3; | |
301 my $rightPos = $4; | |
302 my $cdsStart= $5; | |
303 my $cdsEnd= $6; | |
304 my $exonCount= $7; | |
305 my $exonStarts= $8; | |
306 my $exonEnds= $9; | |
307 my $ID = "$name\t$chr:$leftPos-$rightPos\t$count"; | |
308 my $foldChange = 1; | |
309 my $reg = "NA"; | |
310 my $fluo = "NA"; | |
311 if ( $SelectedGenesFilename ne "") { | |
312 #print "$name\t"; | |
313 if (exists($selectedGenes{$name})) { | |
314 $reg = $selectedGenes{$name}; | |
315 $foldChange = $selectedGenesFoldChange{$name}; | |
316 } | |
317 } | |
318 if ( $fluoFile ne "") { | |
319 if (exists($fluoGenes{$name})) { | |
320 $fluo = $fluoGenes{$name}; | |
321 } | |
322 } | |
323 unless (exists($genes{$chr})) { | |
324 my %h; | |
325 $genes{$chr} = \%h; | |
326 } | |
327 | |
328 my $RNAlength = 0; | |
329 my $skip = 0; | |
330 | |
331 #print "$ID\n"; | |
332 if($longest) { | |
333 $RNAlength = getRNAlength($exonStarts,$exonEnds); | |
334 for my $IDgene (keys %{$genes{$chr}}) { | |
335 my $nameGene= (split('\t', $IDgene))[0]; | |
336 if ($nameGene eq $name && $RNAlength > $genes{$chr}->{$IDgene}{'RNAlength'}) { | |
337 #print "found longer isofome: $ID longer than $IDgene\n"; | |
338 # print "$RNAlength > ".$genes{$chr}->{$IDgene}{'RNAlength'}."\n"; | |
339 | |
340 $ID=$IDgene; | |
341 } elsif ($nameGene eq $name && $RNAlength <= $genes{$chr}->{$IDgene}{'RNAlength'}) { | |
342 #print "found shorter isofome: $ID shorted than $IDgene\nwill skip it\n"; | |
343 #print "$RNAlength <= ".$genes{$chr}->{$IDgene}{'RNAlength'}."\n"; | |
344 $skip = 1; | |
345 } | |
346 } | |
347 } | |
348 | |
349 | |
350 unless ($skip) { | |
351 | |
352 unless (exists($genes{$chr}->{$ID})) { | |
353 my %h1; | |
354 $genes{$chr}->{$ID} = \%h1; | |
355 $count++; | |
356 } | |
357 | |
358 $genes{$chr}->{$ID}{'name'} = $name ; | |
359 $genes{$chr}->{$ID}{'left'} = $leftPos ; | |
360 $genes{$chr}->{$ID}{'right'} = $rightPos ; | |
361 $genes{$chr}->{$ID}{'cdsStart'} = $cdsStart; | |
362 $genes{$chr}->{$ID}{'cdsEnd'} = $cdsEnd; | |
363 $genes{$chr}->{$ID}{'strand'} = $strand; | |
364 $genes{$chr}->{$ID}{'exonCount'} = $exonCount; | |
365 $genes{$chr}->{$ID}{'exonStarts'} = $exonStarts; | |
366 $genes{$chr}->{$ID}{'exonEnds'} = $exonEnds; | |
367 $genes{$chr}->{$ID}{'TSS'} = ($strand == 1) ? $leftPos :$rightPos ; | |
368 $genes{$chr}->{$ID}{'TE'} = ($strand == -1) ? $leftPos :$rightPos ; | |
369 $genes{$chr}->{$ID}{'reg'} = $reg; | |
370 $genes{$chr}->{$ID}{'foldChange'} = $foldChange; | |
371 $genes{$chr}->{$ID}{'length'} = abs ($leftPos-$rightPos); | |
372 $genes{$chr}->{$ID}{'RNAlength'} = $RNAlength ; | |
373 | |
374 $genes{$chr}->{$ID}{'fluo'} = $fluo; | |
375 | |
376 $genes{$chr}->{$ID}{'RNApolScore'} = 0; | |
377 $genes{$chr}->{$ID}{'RNApolDist'} = $INFINITY; | |
378 $genes{$chr}->{$ID}{'RNApol_junctionScore'} = 0; | |
379 $genes{$chr}->{$ID}{'RNApol_junctionDist'} = $INFINITY; | |
380 | |
381 $genes{$chr}->{$ID}{'K36score'} = 0; | |
382 $genes{$chr}->{$ID}{'K9promScore'} = 0; | |
383 $genes{$chr}->{$ID}{'K9promDist'} = $INFINITY; | |
384 $genes{$chr}->{$ID}{'K9largeScore'} = 0; | |
385 $genes{$chr}->{$ID}{'K9largeDist'} = $INFINITY; | |
386 $genes{$chr}->{$ID}{'TFpromScore'} = 0; | |
387 $genes{$chr}->{$ID}{'TFpromDist'} = $INFINITY; | |
388 $genes{$chr}->{$ID}{'TFenhScore'} = 0; | |
389 $genes{$chr}->{$ID}{'TFenhDist'} = $INFINITY; | |
390 $genes{$chr}->{$ID}{'TFintraScore'} = 0; | |
391 $genes{$chr}->{$ID}{'TFintraDist'} = $INFINITY; | |
392 $genes{$chr}->{$ID}{'TFallScore'} = 0; | |
393 $genes{$chr}->{$ID}{'TFallDist'} = $INFINITY; | |
394 | |
395 | |
396 $genes{$chr}->{$ID}{'TFFirstIntronAndIntraScore'} = 0; | |
397 $genes{$chr}->{$ID}{'TFFirstIntronAndIntraDist'} = $INFINITY; | |
398 $genes{$chr}->{$ID}{'TFFirstIntronScore'} = 0; | |
399 $genes{$chr}->{$ID}{'TFFirstIntronDist'} = $INFINITY; | |
400 $genes{$chr}->{$ID}{'TFintraMinusFirstIntronScore'} = 0; | |
401 $genes{$chr}->{$ID}{'TFintraMinusFirstIntronDist'} = $INFINITY; | |
402 $genes{$chr}->{$ID}{'TFImmDownScore'} = 0; | |
403 $genes{$chr}->{$ID}{'TFImmDownDist'} = $INFINITY; | |
404 $genes{$chr}->{$ID}{'TFpromSimpleScore'} = 0; | |
405 $genes{$chr}->{$ID}{'TFpromSimpleDist'} = $INFINITY; | |
406 $genes{$chr}->{$ID}{'TFenh60kbScore'} = 0; | |
407 $genes{$chr}->{$ID}{'TFenh60kbDist'} = $INFINITY; | |
408 | |
409 $genes{$chr}->{$ID}{'TF_FirstExonScore'} = 0; | |
410 $genes{$chr}->{$ID}{'TF_FirstExonDist'} = $INFINITY; | |
411 $genes{$chr}->{$ID}{'TF_junctionScore'} = 0; | |
412 $genes{$chr}->{$ID}{'TF_junctionDist'} = $INFINITY; | |
413 | |
414 $genes{$chr}->{$ID}{'TF_junctionAndIntraScore'} = 0; | |
415 $genes{$chr}->{$ID}{'TF_junctionAndIntraDist'} = $INFINITY; | |
416 $genes{$chr}->{$ID}{'TF_FirstExonAndIntraScore'} = 0; | |
417 $genes{$chr}->{$ID}{'TF_FirstExonAndIntraDist'} = $INFINITY; | |
418 | |
419 $genes{$chr}->{$ID}{'TF_OtherExonsScore'} = 0; | |
420 $genes{$chr}->{$ID}{'TF_OtherExonsDist'} = $INFINITY; | |
421 $genes{$chr}->{$ID}{'TF_OtherExonsAndIntraScore'} = 0; | |
422 $genes{$chr}->{$ID}{'TF_OtherExonsAndIntraDist'} = $INFINITY; | |
423 | |
424 | |
425 $genes{$chr}->{$ID}{'TF_OtherIntronsScore'} = 0; | |
426 $genes{$chr}->{$ID}{'TF_OtherIntronsDist'} = $INFINITY; | |
427 $genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraScore'} = 0; | |
428 $genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraDist'} = $INFINITY; | |
429 | |
430 $genes{$chr}->{$ID}{'TF5kbDownScore'} = 0; | |
431 $genes{$chr}->{$ID}{'TF5kbDownDist'} = $INFINITY; | |
432 $genes{$chr}->{$ID}{'K9enhScore'} = 0; | |
433 $genes{$chr}->{$ID}{'K9enhDist'} = $INFINITY; | |
434 | |
435 ($genes{$chr}->{$ID}{'firstIntronStart'},$genes{$chr}->{$ID}{'firstIntronEnd'}) = getFirstIntron ($exonCount,$exonStarts,$exonEnds,$strand); | |
436 ($genes{$chr}->{$ID}{'firstExonStart'},$genes{$chr}->{$ID}{'firstExonEnd'}) = getFirstExon ($exonCount,$exonStarts,$exonEnds,$strand); | |
437 | |
438 $genes{$chr}->{$ID}{'exonCount'} = $exonCount; | |
439 $genes{$chr}->{$ID}{'exonStarts'} = $exonStarts; | |
440 $genes{$chr}->{$ID}{'exonEnds'} = $exonEnds; | |
441 | |
442 | |
443 $genes{$chr}->{$ID}{'GCisland'} = 0; | |
444 if ($GCislands ne "") { | |
445 $genes{$chr}->{$ID}{'GCisland'} = checkIfGC ($genes{$chr}->{$ID}{'TSS'},$strand,2000,$GCislands{$chr}); | |
446 } | |
447 } | |
448 } | |
449 } | |
450 | |
451 print "Total genes (including isoforms) : $count\n" ; | |
452 close GENES; | |
453 print "\t\t$GenesFilename is read!\n" if ($verbose);; | |
454 #for my $gName (sort keys %{$genes{'chr18'}}) { | |
455 | |
456 # print "$gName\t$genes{'chr18'}->{$gName}{'TSS'}\n"; | |
457 #} | |
458 | |
459 #-----------read file with sites miRNA, store as genes----- | |
460 | |
461 if ( $MirFilename eq ""){ | |
462 print "you did not specify file with miRNA\n" if ($verbose);; | |
463 } | |
464 else { | |
465 $count = 0; | |
466 open (MIR, "<$MirFilename ") or die "Cannot open file $MirFilename !!!!: $!"; | |
467 #chr1 20669090 20669163 mmu-mir-206 960 + | |
468 while (<MIR>) { | |
469 chomp; | |
470 my ($name, $chr, $leftPos, $rightPos, $strand ); | |
471 #1 . miRNA 20669091 20669163 . + . ACC="MI0000249"; ID="mmu-mir-206"; | |
472 if (/([0-9XYM]+)\s.\smiRNA\s(\d+)\s(\d+)\s.\s([+-])\s.\sACC=.*ID=\"(.*)\"/) { | |
473 $name = $5; | |
474 $chr = $1; | |
475 $leftPos = $2; | |
476 $rightPos = $3; | |
477 $strand = $4; | |
478 } | |
479 elsif (/(.*)\s(\d+)\s(\d+)\s(.*)\s(.*)\s(.*)/){ | |
480 $name = $4; | |
481 $chr = $1; | |
482 $leftPos = $2; | |
483 $rightPos = $3; | |
484 $strand = $6; | |
485 } else { | |
486 next; | |
487 } | |
488 | |
489 unless ($chr =~ m/chr/) { | |
490 $chr = "chr".$chr; | |
491 } | |
492 my $ID = "$name\t$chr:$leftPos-$rightPos\t$count"; | |
493 | |
494 if ($strand eq '+') { | |
495 $strand = 1; | |
496 } | |
497 else { | |
498 $strand = -1; | |
499 } | |
500 | |
501 unless (exists($genes{$chr})) { | |
502 my %h; | |
503 $genes{$chr} = \%h; | |
504 } | |
505 unless (exists($genes{$chr}->{$ID})) { | |
506 my %h1; | |
507 $genes{$chr}->{$ID} = \%h1; | |
508 $count++; | |
509 } | |
510 $genes{$chr}->{$ID}{'name'} = $name ; | |
511 $genes{$chr}->{$ID}{'left'} = $leftPos ; | |
512 $genes{$chr}->{$ID}{'right'} = $rightPos ; | |
513 $genes{$chr}->{$ID}{'cdsStart'} = $leftPos ; | |
514 $genes{$chr}->{$ID}{'cdsEnd'} = $rightPos ; | |
515 $genes{$chr}->{$ID}{'strand'} = $strand; | |
516 $genes{$chr}->{$ID}{'length'} = abs ($leftPos-$rightPos); | |
517 $genes{$chr}->{$ID}{'exonCount'} = 1; | |
518 $genes{$chr}->{$ID}{'exonStarts'} = $leftPos ; | |
519 $genes{$chr}->{$ID}{'exonEnds'} = $rightPos ; | |
520 $genes{$chr}->{$ID}{'TSS'} = ($strand == 1) ? $leftPos :$rightPos ; | |
521 $genes{$chr}->{$ID}{'TE'} = ($strand == -1) ? $leftPos :$rightPos ; | |
522 $genes{$chr}->{$ID}{'reg'} = "miRNA"; | |
523 $genes{$chr}->{$ID}{'foldChange'} = 1; | |
524 | |
525 $genes{$chr}->{$ID}{'fluo'} = "N/A"; | |
526 | |
527 | |
528 $genes{$chr}->{$ID}{'RNApolScore'} = 0; | |
529 $genes{$chr}->{$ID}{'RNApolDist'} = $INFINITY; | |
530 $genes{$chr}->{$ID}{'RNApol_junctionScore'} = 0; | |
531 $genes{$chr}->{$ID}{'RNApol_junctionDist'} = $INFINITY; | |
532 $genes{$chr}->{$ID}{'K36score'} = 0; | |
533 $genes{$chr}->{$ID}{'K9promScore'} = 0; | |
534 $genes{$chr}->{$ID}{'K9promDist'} = $INFINITY; | |
535 $genes{$chr}->{$ID}{'K9largeScore'} = 0; | |
536 $genes{$chr}->{$ID}{'K9largeDist'} = $INFINITY; | |
537 $genes{$chr}->{$ID}{'TFpromScore'} = 0; | |
538 $genes{$chr}->{$ID}{'TFpromDist'} = $INFINITY; | |
539 $genes{$chr}->{$ID}{'TFenhScore'} = 0; | |
540 $genes{$chr}->{$ID}{'TFenhDist'} = $INFINITY; | |
541 $genes{$chr}->{$ID}{'TFintraScore'} = 0; | |
542 $genes{$chr}->{$ID}{'TFintraDist'} = $INFINITY; | |
543 $genes{$chr}->{$ID}{'TFallScore'} = 0; | |
544 $genes{$chr}->{$ID}{'TFallDist'} = $INFINITY; | |
545 | |
546 $genes{$chr}->{$ID}{'TFFirstIntronAndIntraScore'} = 0; | |
547 $genes{$chr}->{$ID}{'TFFirstIntronAndIntraDist'} = $INFINITY; | |
548 $genes{$chr}->{$ID}{'TFFirstIntronScore'} = 0; | |
549 $genes{$chr}->{$ID}{'TFFirstIntronDist'} = $INFINITY; | |
550 $genes{$chr}->{$ID}{'TFintraMinusFirstIntronScore'} = 0; | |
551 $genes{$chr}->{$ID}{'TFintraMinusFirstIntronDist'} = $INFINITY; | |
552 $genes{$chr}->{$ID}{'TFImmDownScore'} = 0; | |
553 $genes{$chr}->{$ID}{'TFImmDownDist'} = $INFINITY; | |
554 $genes{$chr}->{$ID}{'TFpromSimpleScore'} = 0; | |
555 $genes{$chr}->{$ID}{'TFpromSimpleDist'} = $INFINITY; | |
556 | |
557 $genes{$chr}->{$ID}{'TF_FirstExonScore'} = 0; | |
558 $genes{$chr}->{$ID}{'TF_FirstExonDist'} = $INFINITY; | |
559 $genes{$chr}->{$ID}{'TF_OtherExonsScore'} = 0; | |
560 $genes{$chr}->{$ID}{'TF_OtherExonsDist'} = $INFINITY; | |
561 $genes{$chr}->{$ID}{'TF_OtherExonsAndIntraScore'} = 0; | |
562 $genes{$chr}->{$ID}{'TF_OtherExonsAndIntraDist'} = $INFINITY; | |
563 $genes{$chr}->{$ID}{'TF_junctionScore'} = 0; | |
564 $genes{$chr}->{$ID}{'TF_junctionDist'} = $INFINITY; | |
565 $genes{$chr}->{$ID}{'TF_OtherIntronsScore'} = 0; | |
566 $genes{$chr}->{$ID}{'TF_OtherIntronsDist'} = $INFINITY; | |
567 $genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraScore'} = 0; | |
568 $genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraDist'} = $INFINITY; | |
569 | |
570 $genes{$chr}->{$ID}{'TF_junctionAndIntraScore'} = 0; | |
571 $genes{$chr}->{$ID}{'TF_junctionAndIntraDist'} = $INFINITY; | |
572 $genes{$chr}->{$ID}{'TF_FirstExonAndIntraScore'} = 0; | |
573 $genes{$chr}->{$ID}{'TF_FirstExonAndIntraDist'} = $INFINITY; | |
574 | |
575 $genes{$chr}->{$ID}{'TFenh60kbScore'} = 0; | |
576 $genes{$chr}->{$ID}{'TFenh60kbDist'} = $INFINITY; | |
577 | |
578 $genes{$chr}->{$ID}{'TF5kbDownScore'} = 0; | |
579 $genes{$chr}->{$ID}{'TF5kbDownDist'} = $INFINITY; | |
580 $genes{$chr}->{$ID}{'K9enhScore'} = 0; | |
581 $genes{$chr}->{$ID}{'K9enhDist'} = $INFINITY; | |
582 | |
583 ($genes{$chr}->{$ID}{'firstIntronStart'},$genes{$chr}->{$ID}{'firstIntronEnd'}) = (0,0); | |
584 ($genes{$chr}->{$ID}{'firstExonStart'},$genes{$chr}->{$ID}{'firstExonEnd'}) = (0,0); | |
585 | |
586 $genes{$chr}->{$ID}{'GCisland'} = 0; | |
587 | |
588 $genes{$chr}->{$ID}{'exonCount'} = 1; | |
589 $genes{$chr}->{$ID}{'exonStarts'} = $leftPos ; | |
590 $genes{$chr}->{$ID}{'exonEnds'} = $rightPos ; | |
591 | |
592 | |
593 } | |
594 | |
595 | |
596 close MIR; | |
597 print "\t\t$MirFilename is read!\n" if ($verbose) ; | |
598 print "$count miRNA\n" if ($verbose);; | |
599 } | |
600 | |
601 #-----------read file with sites of TF1----- | |
602 my $numberOfAllSites = 0; | |
603 | |
604 if ($TF1Filename eq "") { | |
605 print "No file with peaks of TF1!\n" if ($verbose) ; | |
606 } else { | |
607 open (FILE, "<$TF1Filename ") or die "Cannot open file $TF1Filename !!!!: $!"; | |
608 $_ = <FILE>; | |
609 my $correction = 0; | |
610 my @a = split /\t/; | |
611 if ( $a[1] =~ m/chr/ ) { | |
612 $correction = 1; | |
613 } | |
614 | |
615 while (<FILE>) { | |
616 chomp; | |
617 | |
618 my @a = split /\t/; | |
619 | |
620 my $chr = $a[0+$correction]; | |
621 my $firstPos = $a[1+$correction]; | |
622 my $LastPos = $a[2+$correction]; | |
623 my $maxPos = $a[3+$correction]; | |
624 if ($maxPos=~/\D/) { | |
625 $maxPos = int(($firstPos+$LastPos)/2); | |
626 } | |
627 my $score = $a[4+$correction]; | |
628 | |
629 for my $ID (keys %{$genes{$chr}}) { | |
630 | |
631 my $distTSS = ($maxPos - $genes{$chr}->{$ID}{'TSS'})*$genes{$chr}->{$ID}{'strand'}; | |
632 my $distTE = ($maxPos - $genes{$chr}->{$ID}{'TE'})*$genes{$chr}->{$ID}{'strand'}; | |
633 | |
634 if (($distTSS>= $enhLeft)&&($distTSS<$enhRight)) { | |
635 if ($genes{$chr}->{$ID}{'TFenhScore'}<$score) { | |
636 $genes{$chr}->{$ID}{'TFenhScore'}=$score; | |
637 $genes{$chr}->{$ID}{'TFenhDist'} = $distTSS; | |
638 } | |
639 } elsif (($distTSS>= $enhRight)&&($distTSS<=$immediateDownstream)) { | |
640 if ($genes{$chr}->{$ID}{'TFpromScore'}<$score) { | |
641 $genes{$chr}->{$ID}{'TFpromScore'}=$score; | |
642 $genes{$chr}->{$ID}{'TFpromDist'} = $distTSS; | |
643 } | |
644 } elsif (($distTSS >= $immediateDownstream)&&($distTE<=0)) { | |
645 if ($genes{$chr}->{$ID}{'TFintraScore'}<$score) { | |
646 $genes{$chr}->{$ID}{'TFintraScore'}=$score; | |
647 $genes{$chr}->{$ID}{'TFintraDist'} = $distTSS; | |
648 } | |
649 } | |
650 if (($distTSS>= $enhLeft)&&($distTE<=$kb5)) { | |
651 if ($genes{$chr}->{$ID}{'TFallScore'}<$score) { | |
652 $genes{$chr}->{$ID}{'TFallScore'}=$score; | |
653 $genes{$chr}->{$ID}{'TFallDist'} = $distTSS; | |
654 } | |
655 } | |
656 | |
657 if (($distTSS>= 0)&&($distTSS<=$immediateDownstream)) { | |
658 if ($genes{$chr}->{$ID}{'TFImmDownScore'}<$score) { | |
659 $genes{$chr}->{$ID}{'TFImmDownScore'}=$score; | |
660 $genes{$chr}->{$ID}{'TFImmDownDist'} = $distTSS; | |
661 } | |
662 } | |
663 if (($distTSS<= 0)&&($distTSS>=$enhRight)) { | |
664 if ($genes{$chr}->{$ID}{'TFpromSimpleScore'}<$score) { | |
665 $genes{$chr}->{$ID}{'TFpromSimpleScore'}=$score; | |
666 $genes{$chr}->{$ID}{'TFpromSimpleDist'} = $distTSS; | |
667 } | |
668 } | |
669 | |
670 my ($firstIntronStart,$firstIntronEnd)=($genes{$chr}->{$ID}{'firstIntronStart'},$genes{$chr}->{$ID}{'firstIntronEnd'}); | |
671 ($firstIntronStart,$firstIntronEnd)= ($firstIntronEnd,$firstIntronStart) if ($firstIntronStart>$firstIntronEnd) ; | |
672 | |
673 my ($firstExonStart,$firstExonEnd) = ($genes{$chr}->{$ID}{'firstExonStart'},$genes{$chr}->{$ID}{'firstExonEnd'}) ; | |
674 ($firstExonStart,$firstExonEnd)= ($firstExonEnd,$firstExonStart) if ($firstExonStart>$firstExonEnd) ; | |
675 | |
676 if ($maxPos>=$firstIntronStart && $maxPos <= $firstIntronEnd) { | |
677 if ($genes{$chr}->{$ID}{'TFFirstIntronScore'}<$score) { | |
678 $genes{$chr}->{$ID}{'TFFirstIntronScore'}=$score; | |
679 $genes{$chr}->{$ID}{'TFFirstIntronDist'} = $distTSS; | |
680 } | |
681 | |
682 if (($distTSS >= $immediateDownstream)&&($distTE<=0)) { | |
683 if ($genes{$chr}->{$ID}{'TFFirstIntronAndIntraScore'}<$score) { | |
684 $genes{$chr}->{$ID}{'TFFirstIntronAndIntraScore'}=$score; | |
685 $genes{$chr}->{$ID}{'TFFirstIntronAndIntraDist'} = $distTSS; | |
686 } | |
687 } | |
688 } elsif (($distTSS >= $immediateDownstream)&&($distTE<=0)) { | |
689 if ($genes{$chr}->{$ID}{'TFintraMinusFirstIntronScore'}<$score) { | |
690 $genes{$chr}->{$ID}{'TFintraMinusFirstIntronScore'}=$score; | |
691 $genes{$chr}->{$ID}{'TFintraMinusFirstIntronDist'} = $distTSS; | |
692 } | |
693 } | |
694 if (($distTSS>= $longEnhLeft)&&($distTSS<$enhRight)) { | |
695 if ($genes{$chr}->{$ID}{'TFenh60kbScore'}<$score) { | |
696 $genes{$chr}->{$ID}{'TFenh60kbScore'}=$score; | |
697 $genes{$chr}->{$ID}{'TFenh60kbDist'} = $distTSS; | |
698 } | |
699 } | |
700 if (($distTE>=0)&&($distTE<=$kb5)) { | |
701 if ($genes{$chr}->{$ID}{'TF5kbDownScore'}<$score) { | |
702 $genes{$chr}->{$ID}{'TF5kbDownScore'}=$score; | |
703 $genes{$chr}->{$ID}{'TF5kbDownDist'} = $distTSS; | |
704 } | |
705 } | |
706 if ($distTSS>=0 && $distTE<=0) { | |
707 my $typeIntra = &getTypeIntra($genes{$chr}->{$ID},$maxPos); | |
708 if ($typeIntra eq "f_exon") { | |
709 if ($genes{$chr}->{$ID}{'TF_FirstExonScore'}<$score) { | |
710 $genes{$chr}->{$ID}{'TF_FirstExonScore'}=$score; | |
711 $genes{$chr}->{$ID}{'TF_FirstExonDist'} = $distTSS; | |
712 } | |
713 | |
714 if ($genes{$chr}->{$ID}{'TF_FirstExonAndIntraScore'}<$score && ($distTSS >= $immediateDownstream)&&($distTE<=0)) { | |
715 $genes{$chr}->{$ID}{'TF_FirstExonAndIntraScore'}=$score; | |
716 $genes{$chr}->{$ID}{'TF_FirstExonAndIntraDist'} = $distTSS; | |
717 } | |
718 | |
719 } else { | |
720 if ($typeIntra eq "exon") { | |
721 if ($genes{$chr}->{$ID}{'TF_OtherExonsScore'}<$score) { | |
722 $genes{$chr}->{$ID}{'TF_OtherExonsScore'}=$score; | |
723 $genes{$chr}->{$ID}{'TF_OtherExonsDist'} = $distTSS; | |
724 } | |
725 | |
726 if (($distTSS >= $immediateDownstream)&&($distTE<=0)) { | |
727 if ($genes{$chr}->{$ID}{'TF_OtherExonsAndIntraScore'}<$score) { | |
728 $genes{$chr}->{$ID}{'TF_OtherExonsAndIntraScore'}=$score; | |
729 $genes{$chr}->{$ID}{'TF_OtherExonsAndIntraDist'} = $distTSS; | |
730 } | |
731 } | |
732 | |
733 } elsif ($typeIntra eq "intron") { | |
734 if ($genes{$chr}->{$ID}{'TF_OtherIntronsScore'}<$score) { | |
735 $genes{$chr}->{$ID}{'TF_OtherIntronsScore'}=$score; | |
736 $genes{$chr}->{$ID}{'TF_OtherIntronsDist'} = $distTSS; | |
737 } | |
738 | |
739 if (($distTSS >= $immediateDownstream)&&($distTE<=0)) { | |
740 if ($genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraScore'}<$score) { | |
741 $genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraScore'}=$score; | |
742 $genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraDist'} = $distTSS; | |
743 } | |
744 } | |
745 } elsif ($typeIntra eq "jonction") { | |
746 if ($genes{$chr}->{$ID}{'TF_junctionScore'}<$score) { | |
747 $genes{$chr}->{$ID}{'TF_junctionScore'}=$score; | |
748 $genes{$chr}->{$ID}{'TF_junctionDist'} = $distTSS; | |
749 } | |
750 | |
751 if ($genes{$chr}->{$ID}{'TF_junctionAndIntraScore'}<$score && ($distTSS >= $immediateDownstream)&&($distTE<=0)) { | |
752 $genes{$chr}->{$ID}{'TF_junctionAndIntraScore'}=$score; | |
753 $genes{$chr}->{$ID}{'TF_junctionAndIntraDist'} = $distTSS; | |
754 } | |
755 | |
756 } | |
757 | |
758 } | |
759 } | |
760 | |
761 | |
762 } | |
763 $numberOfAllSites++; | |
764 } | |
765 | |
766 close FILE; | |
767 print "\t$TF1Filename is read!\n" if ($verbose) ; | |
768 print "$numberOfAllSites sites\n" ; | |
769 } | |
770 | |
771 #-----------read file with sites RNApolII----- | |
772 $numberOfAllSites = 0; | |
773 | |
774 if ($RNApolFilename eq "") { | |
775 print "No file with peaks of RNA pol II!\n" if ($verbose) ; | |
776 } else { | |
777 open (FILE, "<$RNApolFilename ") or die "Cannot open file $RNApolFilename !!!!: $!"; | |
778 $_ = <FILE>; | |
779 my @a = split /\t/; | |
780 my $correction = 0; | |
781 | |
782 if ($a[0]=~m/chr/) { | |
783 $correction = -1; | |
784 } | |
785 #seek (FILE, 0, 0); | |
786 while (<FILE>) { | |
787 chomp; | |
788 | |
789 my @a = split /\t/; | |
790 | |
791 my $chr = $a[1+$correction]; | |
792 my $firstPos = $a[2+$correction]; | |
793 my $LastPos = $a[3+$correction]; | |
794 my $maxPos = $a[4+$correction]; | |
795 my $score = $a[5+$correction]; | |
796 #print "$numberOfAllSites: $score\n"; | |
797 next if ($score < $cutoff_rp); | |
798 | |
799 for my $ID (keys %{$genes{$chr}}) { | |
800 | |
801 my $distTSS = ($maxPos - $genes{$chr}->{$ID}{'TSS'})*$genes{$chr}->{$ID}{'strand'}; | |
802 my $distTE = ($maxPos - $genes{$chr}->{$ID}{'TE'})*$genes{$chr}->{$ID}{'strand'}; | |
803 | |
804 if (($distTSS>= $enhRight)&&($distTSS<=$immediateDownstream)) { | |
805 if ($genes{$chr}->{$ID}{'RNApolScore'}<$score) { | |
806 $genes{$chr}->{$ID}{'RNApolScore'}=$score; | |
807 $genes{$chr}->{$ID}{'RNApolDist'} = $distTSS; | |
808 } | |
809 } | |
810 if ($distTSS>=0 && $distTE<=0) { | |
811 my $typeIntra = &getTypeIntra($genes{$chr}->{$ID},$maxPos); | |
812 if ($typeIntra eq "jonction") { | |
813 if ($genes{$chr}->{$ID}{'RNApol_junctionScore'}<$score) { | |
814 $genes{$chr}->{$ID}{'RNApol_junctionScore'}=$score; | |
815 $genes{$chr}->{$ID}{'RNApol_junctionDist'} = $distTSS; | |
816 } | |
817 } | |
818 } | |
819 } | |
820 | |
821 $numberOfAllSites++; | |
822 } | |
823 close FILE; | |
824 print "\t$RNApolFilename is read!\n$numberOfAllSites sites\n" if ($verbose) ; | |
825 | |
826 } | |
827 | |
828 #-----------read file with sites K36me3----- | |
829 $numberOfAllSites = 0; | |
830 my @K36Score; | |
831 | |
832 if ($H3K36Me3polFilename eq "") { | |
833 print "No file with peaks of H3K36me3!\n" if ($verbose) ; | |
834 } else { | |
835 open (FILE, "<$H3K36Me3polFilename ") or die "Cannot open file $H3K36Me3polFilename !!!!: $!"; | |
836 | |
837 $_ = <FILE>; | |
838 my @a = split /\t/; | |
839 my $correction = 0; | |
840 | |
841 if ($a[0]=~m/chr/) { | |
842 $correction = -1; | |
843 } | |
844 | |
845 while (<FILE>) { | |
846 chomp; | |
847 | |
848 my @a = split /\t/; | |
849 | |
850 my $chr = $a[1+$correction]; | |
851 my $firstPos = $a[2+$correction]; | |
852 my $lastPos = $a[3+$correction]; | |
853 my $maxPos = $a[4+$correction]; | |
854 my $score= $a[5+$correction]; | |
855 | |
856 for my $ID (keys %{$genes{$chr}}) { | |
857 | |
858 my $distTSS = ($maxPos - $genes{$chr}->{$ID}{'TSS'})*$genes{$chr}->{$ID}{'strand'}; | |
859 my $distTE = ($maxPos - $genes{$chr}->{$ID}{'TE'})*$genes{$chr}->{$ID}{'strand'}; | |
860 | |
861 my $scoreToadd = 0; | |
862 | |
863 if (($firstPos>=$genes{$chr}->{$ID}{'left'})&&($lastPos<=$genes{$chr}->{$ID}{'right'})) { | |
864 $scoreToadd = $score/2.*($lastPos-$firstPos+1)/($genes{$chr}->{$ID}{'right'}-$genes{$chr}->{$ID}{'left'}+1); | |
865 } | |
866 if (($firstPos>=$genes{$chr}->{$ID}{'left'})&&($firstPos<$genes{$chr}->{$ID}{'right'})&&($lastPos>$genes{$chr}->{$ID}{'right'})) { | |
867 $scoreToadd = $score/2.*($genes{$chr}->{$ID}{'right'}-$firstPos+1)/($genes{$chr}->{$ID}{'right'}-$genes{$chr}->{$ID}{'left'}+1); | |
868 } | |
869 if (($firstPos<$genes{$chr}->{$ID}{'left'})&&($lastPos>$genes{$chr}->{$ID}{'left'})&&($lastPos<=$genes{$chr}->{$ID}{'right'})) { | |
870 $scoreToadd = $score/2.*($lastPos-$genes{$chr}->{$ID}{'left'}+1)/($genes{$chr}->{$ID}{'right'}-$genes{$chr}->{$ID}{'left'}+1); | |
871 } | |
872 if (($firstPos<$genes{$chr}->{$ID}{'left'})&&($lastPos>$genes{$chr}->{$ID}{'right'})) { | |
873 my $scoreToadd = $score/2.*($lastPos-$firstPos+1); | |
874 } | |
875 $genes{$chr}->{$ID}{'K36score'} += $scoreToadd; | |
876 } | |
877 $numberOfAllSites++; | |
878 } | |
879 close FILE; | |
880 print "\t$H3K36Me3polFilename is read!\n$numberOfAllSites sites\n" if ($verbose) ; | |
881 } | |
882 | |
883 #-----------read file with sites H3K9me3----- | |
884 $numberOfAllSites = 0; | |
885 | |
886 | |
887 if ($H3K9Me3polFilename eq "") { | |
888 print "No file with peaks of H3K9me3!\n" if ($verbose) ; | |
889 } else { | |
890 open (FILE, "<$H3K9Me3polFilename ") or die "Cannot open file $H3K9Me3polFilename !!!!: $!"; | |
891 | |
892 $_ = <FILE>; | |
893 my @a = split /\t/; | |
894 my $correction = 0; | |
895 | |
896 if ($a[0]=~m/chr/) { | |
897 $correction = -1; | |
898 } | |
899 | |
900 while (<FILE>) { | |
901 chomp; | |
902 my @a = split /\t/; | |
903 my $chr = $a[1+$correction]; | |
904 my $firstPos = $a[2+$correction]; | |
905 my $LastPos = $a[3+$correction]; | |
906 my $maxPos = $a[4+$correction]; | |
907 my $score = $a[5+$correction]; | |
908 | |
909 next if ($score < $cutoff_k9); | |
910 | |
911 for my $ID (keys %{$genes{$chr}}) { | |
912 | |
913 my $distTSS = ($maxPos - $genes{$chr}->{$ID}{'TSS'})*$genes{$chr}->{$ID}{'strand'}; | |
914 my $distTE = ($maxPos - $genes{$chr}->{$ID}{'TE'})*$genes{$chr}->{$ID}{'strand'}; | |
915 | |
916 if (($distTSS>= $enhRight)&&($distTSS<=$immediateDownstream)) { | |
917 if ($genes{$chr}->{$ID}{'K9promScore'}<$score) { | |
918 $genes{$chr}->{$ID}{'K9promScore'}=$score; | |
919 $genes{$chr}->{$ID}{'K9promDist'} = $distTSS; | |
920 } | |
921 } elsif (($distTSS >= -$K9dist)&&($distTSS<=$K9dist)) { | |
922 if ($genes{$chr}->{$ID}{'K9largeScore'}<$score) { | |
923 $genes{$chr}->{$ID}{'K9largeScore'}=$score; | |
924 $genes{$chr}->{$ID}{'K9largeDist'} = $distTSS; | |
925 } | |
926 } | |
927 if (($distTSS>= $enhLeft)&&($distTSS<$enhRight)) { | |
928 if ($genes{$chr}->{$ID}{'K9enhScore'}<$score) { | |
929 $genes{$chr}->{$ID}{'K9enhScore'}=$score; | |
930 $genes{$chr}->{$ID}{'K9enhDist'} = $distTSS; | |
931 } | |
932 } | |
933 } | |
934 $numberOfAllSites++; | |
935 } | |
936 close FILE; | |
937 print "\t$H3K9Me3polFilename is read!\n$numberOfAllSites sites\n" if ($verbose) ; | |
938 } | |
939 | |
940 #-----------output all----- | |
941 #unless($initialTable eq "") {} | |
942 | |
943 | |
944 open (OUT , ">$outname") or die "Cannot open file $outname!!!!: $!"; | |
945 | |
946 print OUT "name\tchr\tstart\tend\tstrand\tReg\tfoldChange\t"; | |
947 | |
948 if ($GCislands ne "") { | |
949 print OUT "GC-island\t"; | |
950 } | |
951 | |
952 if ( $fluoFile ne "") { | |
953 print OUT "fluorescence\t"; | |
954 } | |
955 | |
956 if ($RNApolFilename ne "") { | |
957 print OUT "RNApolII_score\tRNApolII_distTSS\tRNApol_junctionScore\tRNApol_junctionDist\t"; | |
958 } | |
959 if ($H3K36Me3polFilename ne "") { | |
960 print OUT "H3K36me3_score\t"; | |
961 } | |
962 if ($H3K9Me3polFilename ne "") { | |
963 print OUT "H3K9me3_score_prom\tH3K9me3_distTSS_prom\tH3K9me3_score_large\tH3K9me3_distTSS_large\tH3K9me3_score_enh\tH3K9me3_distTSS_enh\t"; | |
964 } | |
965 if ($TF1Filename ne "") { | |
966 print OUT "TF_score_Gene\tTF_distTSS_Gene\t"; | |
967 print OUT "TF_score_Promoter\tTF_distTSS_Promoter\t"; | |
968 print OUT "TF_score_ImmDown\tTF_distTSS_ImmDown\t"; | |
969 print OUT "TF_score_PromoterORImmDown\tTF_distTSS_PromoterORImmDown\t"; | |
970 print OUT "TF_score_Enhancer\tTF_distTSS_Enhancer\t"; | |
971 print OUT "TF_score_Intragenic\tTF_distTSS_Intragenic\t"; | |
972 print OUT "TF_score_GeneDownstream\tTF_distTSS_GeneDownstream\t"; | |
973 print OUT "TF_score_FirstExon\tTF_distTSS_FirstExon\t"; | |
974 print OUT "TF_score_FisrtIntron\tTF_distTSS_FisrtIntron\t"; | |
975 print OUT "TF_score_FirstExonAND>$immediateDownstream\tTF_distTSS_FirstExonAND>$immediateDownstream\t"; | |
976 print OUT "TF_score_FisrtIntronAND>$immediateDownstream\tTF_distTSS_FisrtIntronAND>$immediateDownstream\t"; | |
977 #print OUT "TF_score_IntraMinusFisrtIntron\tTF_distTSS_IntraMinusFisrtIntron\t"; | |
978 | |
979 #print OUT "TF_score_enh60kb\tTF_distTSS_enh60kb\t"; | |
980 | |
981 print OUT "TF_score_Exons2,3,4,etc\tTF_distTSS_Exons2,3,4,etc\t"; | |
982 print OUT "TF_score_Exons2,3,4,etcAND>$immediateDownstream\tTF_distTSS_Exons2,3,4,etcAND>$immediateDownstream\t"; | |
983 | |
984 print OUT "TF_score_Introns2,3,4,etc\tTF_distTSS_Introns2,3,4,etc\t"; | |
985 print OUT "TF_score_Introns2,3,4,etcAND>$immediateDownstream\tTF_distTSS_Introns2,3,4,etcAND>$immediateDownstream\t"; | |
986 | |
987 print OUT "TF_score_EIjunction\tTF_distTSS_EIjunction\t"; | |
988 print OUT "TF_score_EIjunctionAND>$immediateDownstream\tTF_distTSS_EIjunctionAND>$immediateDownstream"; | |
989 } | |
990 | |
991 print OUT "\n"; | |
992 | |
993 for my $chr (keys %genes) { | |
994 for my $ID (keys %{$genes{$chr}}) { | |
995 print OUT "$genes{$chr}->{$ID}{'name'}\t$chr\t$genes{$chr}->{$ID}{'left'}\t$genes{$chr}->{$ID}{'right'}\t$genes{$chr}->{$ID}{'strand'}\t$genes{$chr}->{$ID}{'reg'}\t$genes{$chr}->{$ID}{'foldChange'}\t"; | |
996 | |
997 if ($GCislands ne "") { | |
998 print OUT "$genes{$chr}->{$ID}{'GCisland'}\t"; | |
999 } | |
1000 | |
1001 if ( $fluoFile ne "") { | |
1002 print OUT "$genes{$chr}->{$ID}{'fluo'}\t"; | |
1003 } | |
1004 | |
1005 if ($RNApolFilename ne "") { | |
1006 print OUT "$genes{$chr}->{$ID}{'RNApolScore'}\t$genes{$chr}->{$ID}{'RNApolDist'}\t$genes{$chr}->{$ID}{'RNApol_junctionScore'}\t$genes{$chr}->{$ID}{'RNApol_junctionDist'}\t"; | |
1007 } | |
1008 if ($H3K36Me3polFilename ne "") { | |
1009 print OUT "$genes{$chr}->{$ID}{'K36score'}\t"; | |
1010 } | |
1011 if ($H3K9Me3polFilename ne "") { | |
1012 print OUT "$genes{$chr}->{$ID}{'K9promScore'}\t$genes{$chr}->{$ID}{'K9promDist'}\t$genes{$chr}->{$ID}{'K9largeScore'}\t$genes{$chr}->{$ID}{'K9largeDist'}\t$genes{$chr}->{$ID}{'K9enhScore'}\t$genes{$chr}->{$ID}{'K9enhDist'}\t"; | |
1013 } | |
1014 if ($TF1Filename ne "") { | |
1015 print OUT "$genes{$chr}->{$ID}{'TFallScore'}\t$genes{$chr}->{$ID}{'TFallDist'}\t"; | |
1016 print OUT "$genes{$chr}->{$ID}{'TFpromSimpleScore'}\t$genes{$chr}->{$ID}{'TFpromSimpleDist'}\t"; | |
1017 print OUT "$genes{$chr}->{$ID}{'TFImmDownScore'}\t$genes{$chr}->{$ID}{'TFImmDownDist'}\t"; | |
1018 print OUT "$genes{$chr}->{$ID}{'TFpromScore'}\t$genes{$chr}->{$ID}{'TFpromDist'}\t"; | |
1019 print OUT "$genes{$chr}->{$ID}{'TFenhScore'}\t$genes{$chr}->{$ID}{'TFenhDist'}\t"; | |
1020 print OUT "$genes{$chr}->{$ID}{'TFintraScore'}\t$genes{$chr}->{$ID}{'TFintraDist'}\t"; | |
1021 print OUT "$genes{$chr}->{$ID}{'TF5kbDownScore'}\t$genes{$chr}->{$ID}{'TF5kbDownDist'}\t"; | |
1022 print OUT "$genes{$chr}->{$ID}{'TF_FirstExonScore'}\t$genes{$chr}->{$ID}{'TF_FirstExonDist'}\t"; | |
1023 print OUT "$genes{$chr}->{$ID}{'TF_FirstExonAndIntraScore'}\t$genes{$chr}->{$ID}{'TF_FirstExonAndIntraDist'}\t"; | |
1024 print OUT "$genes{$chr}->{$ID}{'TFFirstIntronScore'}\t$genes{$chr}->{$ID}{'TFFirstIntronDist'}\t"; | |
1025 print OUT "$genes{$chr}->{$ID}{'TFFirstIntronAndIntraScore'}\t$genes{$chr}->{$ID}{'TFFirstIntronAndIntraDist'}\t"; | |
1026 #print OUT "$genes{$chr}->{$ID}{'TFintraMinusFirstIntronScore'}\t$genes{$chr}->{$ID}{'TFintraMinusFirstIntronDist'}\t"; | |
1027 | |
1028 #print OUT "$genes{$chr}->{$ID}{'TFenh60kbScore'}\t$genes{$chr}->{$ID}{'TFenh60kbDist'}\t"; | |
1029 | |
1030 print OUT "$genes{$chr}->{$ID}{'TF_OtherExonsScore'}\t$genes{$chr}->{$ID}{'TF_OtherExonsDist'}\t"; | |
1031 print OUT "$genes{$chr}->{$ID}{'TF_OtherExonsAndIntraScore'}\t$genes{$chr}->{$ID}{'TF_OtherExonsAndIntraDist'}\t"; | |
1032 | |
1033 print OUT "$genes{$chr}->{$ID}{'TF_OtherIntronsScore'}\t$genes{$chr}->{$ID}{'TF_OtherIntronsDist'}\t"; | |
1034 print OUT "$genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraScore'}\t$genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraDist'}\t"; | |
1035 | |
1036 print OUT "$genes{$chr}->{$ID}{'TF_junctionScore'}\t$genes{$chr}->{$ID}{'TF_junctionDist'}\t"; | |
1037 print OUT "$genes{$chr}->{$ID}{'TF_junctionAndIntraScore'}\t$genes{$chr}->{$ID}{'TF_junctionAndIntraDist'}"; | |
1038 } | |
1039 print OUT "\n"; | |
1040 } | |
1041 } | |
1042 | |
1043 close OUT; | |
1044 | |
1045 ################################### | |
1046 sub med { | |
1047 my @arr = @_; | |
1048 my $med = 0; | |
1049 @arr = sort {$a <=> $b} @arr; | |
1050 if ((scalar(@arr)/2) =~ m/[\.\,]5/) { | |
1051 return $arr[floor(scalar(@arr)/2)]; | |
1052 } else { | |
1053 return ($arr[scalar(@arr)/2]+$arr[scalar(@arr)/2-1])/2; | |
1054 } | |
1055 $med; | |
1056 } | |
1057 | |
1058 sub checkIfGC { | |
1059 my ($TSS,$strand,$dist,$GCislandsChr)=@_; | |
1060 my $ifGC = 0; | |
1061 my $leftProm=$TSS-$dist; | |
1062 my $rightProm = $TSS; | |
1063 if ($strand== -1) { | |
1064 my $leftProm=$TSS; | |
1065 my $rightProm = $TSS+$dist; | |
1066 } #print "$leftProm\t"; print "$rightProm\n"; | |
1067 for my $leftGC (keys %{$GCislandsChr}) { | |
1068 my $rightGC = $GCislandsChr->{$leftGC}; | |
1069 if ($leftGC>=$leftProm&&$leftGC<=$rightProm || $rightGC>=$leftProm&&$rightGC<=$rightProm) { | |
1070 return "GC-island"; | |
1071 } | |
1072 } | |
1073 return $ifGC ; | |
1074 } | |
1075 | |
1076 sub getFirstIntron { | |
1077 my ($exonCount,$exonStarts,$exonEnds,$strand) = @_; | |
1078 my ($left,$right); | |
1079 if ($exonCount == 1) { | |
1080 return (0,0); | |
1081 } | |
1082 if ($strand == 1) { | |
1083 $left = (split ",", $exonEnds)[0]; | |
1084 $right = (split (",", $exonStarts))[1]; | |
1085 } else { | |
1086 $left = (split (",", $exonEnds))[$exonCount-2]; | |
1087 $right = (split (",", $exonStarts))[$exonCount-1]; | |
1088 } | |
1089 ($left,$right); | |
1090 } | |
1091 | |
1092 sub getFirstExon { | |
1093 my ($exonCount,$exonStarts,$exonEnds,$strand) = @_; | |
1094 my ($left,$right); | |
1095 if ($exonCount == 1) { | |
1096 return (0,0); | |
1097 } | |
1098 if ($strand == 1) { | |
1099 $left = (split ",", $exonStarts)[0]; | |
1100 $right = (split (",", $exonEnds))[0]-$jonctionSize; | |
1101 } else { | |
1102 $left = (split (",", $exonStarts))[$exonCount-1]+$jonctionSize; | |
1103 $right = (split (",", $exonEnds))[$exonCount-1]; | |
1104 } | |
1105 ($left,$right); | |
1106 } | |
1107 | |
1108 | |
1109 sub getIntronExon { | |
1110 my ($pos,$exonCount,$exonStarts,$exonEnds,$strand) = @_; | |
1111 my (@left,@right); | |
1112 @left = (split ",", $exonStarts); | |
1113 @right = (split (",", $exonEnds)); | |
1114 | |
1115 for (my $i = 0; $i<$exonCount;$i++) { | |
1116 #print "$left[$i] <= $pos ? $pos <= $right[$i]\n"; | |
1117 if (($left[$i]+$jonctionSize < $pos) && ($pos < $right[$i]-$jonctionSize)) { | |
1118 #print "URA!\n"; | |
1119 return "exon"; | |
1120 } elsif (($i+1<$exonCount)&&($right[$i]+$jonctionSize < $pos) && ($pos < $left[$i+1]-$jonctionSize)) { | |
1121 return "intron"; | |
1122 } | |
1123 } | |
1124 return "jonction"; | |
1125 } | |
1126 | |
1127 | |
1128 sub getTypeIntra { | |
1129 | |
1130 my ($geneEntry, $pos) = @_; | |
1131 my $type; | |
1132 | |
1133 if (($pos >= $geneEntry->{'firstIntronStart'})&&($pos <=$geneEntry->{'firstIntronEnd'})) { | |
1134 return "f_intron"; | |
1135 } | |
1136 if (($pos >= $geneEntry->{'firstExonStart'})&&($pos <=$geneEntry->{'firstExonEnd'})) { | |
1137 return "f_exon"; | |
1138 } | |
1139 $type = getIntronExon ($pos, $geneEntry->{'exonCount'},$geneEntry->{'exonStarts'},$geneEntry->{'exonEnds'},$geneEntry->{'strand'}); | |
1140 return $type; | |
1141 } | |
1142 | |
1143 sub getRNAlength { | |
1144 my ($exonStarts,$exonEnds) = @_; | |
1145 my (@left,@right); | |
1146 @left = (split ",", $exonStarts); | |
1147 @right = (split (",", $exonEnds)); | |
1148 my $length = 0; | |
1149 for (my $i = 0; $i<scalar(@right);$i++) { | |
1150 $length+=$right[$i]-$left[$i]; | |
1151 } | |
1152 #print STDERR "length = $length\n"; | |
1153 return $length; | |
1154 } |