2
|
1 #!/usr/bin/perl
|
|
2
|
|
3 # TFBScluster version 2.0 - cluster together TFBSs from distinct
|
|
4 # TFBSsearch generated libraries.
|
|
5 #
|
|
6 # (c) Ian Donaldson 2003 and Mike Chapman (TFBS overlap subs)
|
|
7 #
|
|
8 # This program is free software; you can redistribute it and/or
|
|
9 # modify it under the terms of the GNU General Public License
|
|
10 # as published by the Free Software Foundation; either version 2
|
|
11 # of the License, or (at your option) any later version.
|
|
12 #
|
|
13 # This program is distributed in the hope that it will be useful,
|
|
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
16 # GNU General Public License for more details.
|
|
17 #
|
|
18 # You should have received a copy of the GNU General Public License
|
|
19 # along with this program; if not, write to the Free Software
|
|
20 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
21
|
|
22 use strict;
|
|
23 use FileHandle;
|
|
24 #use integer; # Force floating point into integers
|
|
25
|
|
26 $|=1;
|
|
27
|
|
28 ###########
|
|
29 # Program to determine whether a TF site is in close proximity to
|
|
30 # one or more other TF sites.
|
|
31 # September 2003. Ian Donaldson.
|
|
32 # Revised 8 Sept. 2003 to not include the query TF site in the threshold.
|
|
33 # This is to allow one to determine whether a TF is near to another of the
|
|
34 # same type.
|
|
35 # Revised 9 Sept to alter the threshold size to only include the core of a
|
|
36 # pattern i.e. gata of nngatann.
|
|
37 # Revised 19 Sept to replace query and subject libraries with the statement
|
|
38 # of all interested libraries.
|
|
39 # Revised 22 Sept to scan output for overlapping patterns.
|
|
40 # NOTE: Any overlap in comparative record start with start pattern
|
|
41 # Revised 30 Sept to ignore duplication of TFBS in a group record caused by
|
|
42 # palindrome. By skipping if name and positions the same.
|
|
43 # Revised 6 Oct code for tree searching method to deal with overlapping TFBSs
|
|
44 ###########
|
|
45
|
|
46 #### Command line usage
|
|
47 if(@ARGV != 7)
|
|
48 {
|
|
49 die ("USAGE:
|
|
50 $0
|
|
51 TF libraries \(comma delimited NO SPACES\)
|
|
52 Number of flanking 'N's for subject files \(comma delimited NO SPACES\)
|
|
53 Minimum number of occurences \(comma delimited NO SPACES\)
|
|
54 TF IDs \(comma delimited NO SPACES\)
|
|
55 Single range value in bp \(+/-\) query start and end values
|
|
56 Include overlapping TFBSs \(include/exclude\)
|
|
57 Output file\n\n");
|
|
58 }
|
|
59
|
|
60 #### File handles
|
|
61 my $subject = new FileHandle;
|
|
62 my $combined = new FileHandle;
|
|
63 my $sorted = new FileHandle;
|
|
64 my $groups = new FileHandle;
|
|
65 my $filtered_groups = new FileHandle;
|
|
66 my $output = new FileHandle;
|
|
67 my $output2 = new FileHandle;
|
|
68
|
|
69 #### Variables
|
|
70 my @subject_files = (); # Array containing the names of selected subject files
|
|
71 my @flanking_n = (); # Array containing the number of flanking 'n' for each pattern
|
|
72 my @min_occur = (); # Array containing the minimum occurences for the TF library
|
|
73 my @ids = (); # Array containing user defined IDs for the TF libraries
|
|
74 my @file_sizes = ();
|
|
75 my @sorted_file_sizes = ();
|
|
76 my $range = $ARGV[4];
|
|
77 my $allow = $ARGV[5];
|
|
78 my @regex_ids = ();
|
|
79
|
|
80 #####################################################
|
|
81 #### Deal with user arguments processed into an array
|
|
82 #####################################################
|
|
83
|
|
84 #### Convert TF file names string to an array
|
|
85 @subject_files = split(/,/, $ARGV[0]);
|
|
86
|
|
87 #### Convert flanking 'N' numbers string into an array
|
|
88 @flanking_n = split(/,/, $ARGV[1]);
|
|
89
|
|
90 #### Convert minimum occurences string into an array
|
|
91 @min_occur = split(/,/, $ARGV[2]);
|
|
92
|
|
93 #### Convert minimum occurences string into an array
|
|
94 @ids = split(/,/, $ARGV[3]);
|
|
95
|
|
96 foreach my $id_string (@ids) {
|
|
97 if($id_string !~ /^[\w\d_,]+$/) {
|
|
98 die("A non-letter/number character was detected in your label string!\n");
|
|
99 }
|
|
100 }
|
|
101
|
|
102 #### Record how large they are
|
|
103 for(my $i=0; $i < $#subject_files + 1; $i++) {
|
|
104 my $size = (-s $subject_files[$i]); # -s performed on an unopened file!
|
|
105
|
|
106 push(@file_sizes, ["$subject_files[$i]", "$size", "$flanking_n[$i]", "$min_occur[$i]", "$ids[$i]"]);
|
|
107 }
|
|
108
|
|
109 #### Sort file sizes array by file sizes
|
|
110 # ARRAY NOT SORTED BUT COPIED TO ALLOW SORTING AT A LATER DATE
|
|
111 # @sorted_file_sizes = sort{$a->[1] <=> $b->[1]} @file_sizes;
|
|
112 push(@sorted_file_sizes, @file_sizes);
|
|
113
|
|
114 #### Summary file information
|
|
115 print STDOUT "TFBScluster analysis:\n",
|
|
116 "--------------------\n";
|
|
117
|
|
118 print STDOUT "TFBS library information:\n";
|
|
119
|
|
120 #### Show file summary
|
|
121 for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) {
|
|
122 print STDOUT "TFBS lib. ID = $sorted_file_sizes[$i][4].\n",
|
|
123 "Extended conservation = $sorted_file_sizes[$i][2].\n",
|
|
124 "Minimum occurrence = $sorted_file_sizes[$i][3].\n\n";
|
|
125 }
|
|
126
|
|
127 #print STDOUT "\n";
|
|
128
|
|
129
|
|
130 #####################################################
|
|
131 #### Information required by tree searching algorithm
|
|
132 #####################################################
|
|
133
|
|
134 #### Array containing the minimum number of each TF, also corresp names
|
|
135 my @tf_min = ();
|
|
136 my @tf_names = ();
|
|
137
|
|
138 for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) {
|
|
139 push(@tf_min, $sorted_file_sizes[$i][3]);
|
|
140 }
|
|
141
|
|
142 for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) {
|
|
143 push(@tf_names, $sorted_file_sizes[$i][4]);
|
|
144 }
|
|
145
|
|
146 #### TEST
|
|
147 #print "ARRAY1 = @tf_min\n\n";
|
|
148
|
|
149 #####################################################################
|
|
150 #### Open a file to store all the TF data from each selected library.
|
|
151 #####################################################################
|
|
152 $combined->open(">TFcombined\.$$") or die("Could not open TFcombined file!\n");
|
|
153
|
|
154 #### Copy each TF file to another file and sort it.
|
|
155 for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) {
|
|
156 #### Save necessary parts of the current subject file to chr. specific arrays
|
|
157 $subject->open("<$sorted_file_sizes[$i][0]") or die("Could not open subject file!\n");
|
|
158
|
|
159 #### Message to user
|
|
160 #print "Adding data for TF file $i\.\t";
|
|
161
|
|
162 SUBLINE: while(defined(my $sub_line = <$subject>)) {
|
|
163 my ($sub_seqname, $sub_source, $sub_feature, $sub_start, $sub_end, $sub_score,
|
|
164 $sub_strand, $sub_frame, $sub_attribute) = '';
|
|
165
|
|
166 #### Skip line if GFF comment or blank line
|
|
167 if($sub_line =~ /(^\s|^#)/)
|
|
168 {
|
|
169 next SUBLINE;
|
|
170 }
|
|
171
|
|
172 #### Split each line by TAB
|
|
173 ($sub_seqname, $sub_source, $sub_feature, $sub_start, $sub_end, $sub_score,
|
|
174 $sub_strand, $sub_frame, $sub_attribute) = split(/\t/, $sub_line, 9);
|
|
175
|
|
176 #### Clean up attribute
|
|
177 #($sub_attribute) = $sub_attribute =~ /[ACTG\-]+/g;
|
|
178 chomp($sub_attribute);
|
|
179
|
|
180 #### Adjust thresold of subject positions to reflect the core sequence
|
|
181 #### Possibly make an argument?!
|
|
182 $sub_start = $sub_start + $sorted_file_sizes[$i][2];
|
|
183 $sub_end = $sub_end - $sorted_file_sizes[$i][2];
|
|
184
|
|
185 #### Determine chromosome
|
|
186 (my $sub_chr) = $sub_seqname;
|
|
187
|
|
188 #### Add modified line to analysis library file
|
|
189 $combined->print("$sub_seqname\t$sub_source\t$sub_feature\t$sub_start\t$sub_end\t",
|
|
190 "$sub_score\t$sub_strand\t$sub_frame\t$sorted_file_sizes[$i][4]\n");
|
|
191 # "$sub_score\t$sub_strand\t$sub_frame\t$sub_attribute",
|
|
192 # "\_\_$sorted_file_sizes[$i][4]\n");
|
|
193 }
|
|
194
|
|
195 #### Message
|
|
196 #print STDOUT "\[DONE\]\n";
|
|
197
|
|
198 $subject->close;
|
|
199 }
|
|
200
|
|
201 #### Spacer on screen
|
|
202 #print STDOUT "\n";
|
|
203
|
|
204 $combined->close;
|
|
205
|
|
206
|
|
207 #############################
|
|
208 #### Sort the TFcombined file
|
|
209 #############################
|
|
210 #print STDOUT "Sorting TFcombined file.\t";
|
|
211
|
|
212 #system("sort +0.3 -0.5 +3n -4n TFcombined\.$$ > TFcombined_sorted_temp\.$$");
|
|
213 system("sort +0 -1 +3n -4n TFcombined\.$$ > TFcombined_sorted\.$$");
|
|
214
|
|
215 #print "HELLO\n";
|
|
216
|
|
217 ###################################
|
|
218 #### Convert all chr01 back to chr1
|
|
219 ###################################
|
|
220 #system("/home/donaldson/bin/TFBScluster/nozero_before_1-9.pl TFcombined_sorted_temp\.$$ TFcombined_sorted\.$$");
|
|
221
|
|
222 #print STDOUT "\[DONE\]\n\n";
|
|
223
|
|
224
|
|
225 #####################################
|
|
226 #### Sort the sorted file into groups
|
|
227 #####################################
|
|
228 #### Work thru each line of the combined TF file. Store record of all TFs downstream
|
|
229 #### WITHIN the predefined distance
|
|
230 #print STDOUT "Organising the sorted file into groups.\t";
|
|
231
|
|
232 my $last = 0;
|
|
233
|
|
234 $sorted->open("<TFcombined_sorted\.$$") or die("Could not open sorted TFcombined file!\n");
|
|
235
|
|
236 #### Rewind combined TF file to start
|
|
237 seek($sorted, 0, 0);
|
|
238
|
|
239 COMBLINE: while(defined(my $comb_line = <$sorted>)) {
|
|
240 #### Get info about the line
|
|
241 my @comb_line_array = split(/\t/, $comb_line);
|
|
242 my $comb_seqname = $comb_line_array[0];
|
|
243 (my $comb_chr) = $comb_seqname;
|
|
244
|
|
245 my $comb_start = $comb_line_array[3];
|
|
246
|
|
247 #### Store the start of the next line
|
|
248 my $next_line_pos = tell($sorted);
|
|
249
|
|
250 #### Variable to hold lines
|
|
251 my $group_holder = '';
|
|
252
|
|
253 $group_holder = $comb_line;
|
|
254
|
|
255 #### Read thru the next lines until the end position is not within the specified
|
|
256 #### range of the start line start
|
|
257 my $count_hit = 0;
|
|
258
|
|
259 GROUPLINE: while(defined(my $group_line = <$sorted>)) {
|
|
260 my @group_line_array = split(/\t/, $group_line);
|
|
261 my $group_seqname = $group_line_array[0];
|
|
262 (my $group_chr) = $group_seqname;
|
|
263
|
|
264 my $group_end = $group_line_array[4];
|
|
265
|
|
266 #### CHR
|
|
267 #if( (($group_end - $comb_start + 1) < $range ) and ($comb_chr eq $group_chr) ) {
|
|
268 if( (($group_end - $comb_start + 1) <= $range ) and ($comb_chr eq $group_chr) ) {
|
|
269 $group_holder .= $group_line;
|
|
270 $count_hit++;
|
|
271 }
|
|
272
|
|
273 else {
|
|
274 last GROUPLINE;
|
|
275 }
|
|
276 }
|
|
277
|
|
278 if($count_hit > 0) {
|
|
279 #Make the record
|
|
280 $groups->open(">>TFgroups\.$$") or die("Could not open TF groups file!\n");
|
|
281
|
|
282 $groups->print("$group_holder\/\/\n");
|
|
283
|
|
284 $groups->close;
|
|
285
|
|
286 #### Move to the end of the last line
|
|
287 seek($sorted, $next_line_pos , 0);
|
|
288
|
|
289 next COMBLINE;
|
|
290 }
|
|
291
|
|
292 else {
|
|
293 #### Move to the end of the last line
|
|
294 seek($sorted, $next_line_pos , 0);
|
|
295
|
|
296 next COMBLINE;
|
|
297 }
|
|
298 }
|
|
299
|
|
300 $sorted->close;
|
|
301
|
|
302 #print STDOUT "\[DONE\]\n\n";
|
|
303
|
|
304
|
|
305 ###################################################################################
|
|
306 #### Look through the groups file to find records matching the user defined params
|
|
307 ###################################################################################
|
|
308 my $record = '';
|
|
309 my $target = 0;
|
|
310 my $count_pass = 0;
|
|
311
|
|
312 #### Another user message
|
|
313 print STDOUT "You have chosen to search for groups containing at least:\n";
|
|
314
|
|
315 for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) {
|
|
316 print STDOUT "$sorted_file_sizes[$i][3] $sorted_file_sizes[$i][4] site(s).\n";
|
|
317
|
|
318 #### Increment the desired number of matches for a group to be selected
|
|
319 $target++;
|
|
320 }
|
|
321
|
|
322 #### Another user message
|
|
323 #print STDOUT "\nOutput will be written to \[$ARGV[6]\].\n";
|
|
324 print STDOUT "\nCombining overlapping clusters:\n";
|
|
325
|
|
326 #### Open an output file
|
|
327 $output->open(">$ARGV[6]\_v1") or die("Could not open output file!\n");
|
|
328
|
|
329 #### Open the TFgroups files
|
|
330 $groups->open("<TFgroups\.$$") or die("Could not open TFgroups file!\n");
|
|
331
|
|
332 #### Open the filtered record test file
|
|
333 $filtered_groups->open(">filtered_groups\.$$") or die("Could not open filtered groups file!\n");
|
|
334
|
|
335 #### Take each record of the groups file
|
|
336 RECORD: while($record = GetNewRecord($groups)) {
|
|
337 #### What about if the positions overlap?
|
|
338 my @record_array = ();
|
|
339 my $last_record_start = 0;
|
|
340 my $last_record_end = 0;
|
|
341 my $last_record_attribute = 0;
|
|
342
|
|
343 my $save_filtered_group = '';
|
|
344
|
|
345 #### Take each line of the record beginning with chr...
|
|
346 RECORDLINE: while($record =~ /(\w.+\n)/mg) {
|
|
347 my $record_line = $1;
|
|
348
|
|
349 my @record_line_array = ();
|
|
350 @record_line_array = split(/\t/, $record_line);
|
|
351
|
|
352 my $record_seqname = $record_line_array[0];
|
|
353 my $record_start = $record_line_array[3];
|
|
354 my $record_end = $record_line_array[4];
|
|
355 my $record_strand = $record_line_array[6];
|
|
356 my $record_attribute = $record_line_array[8];
|
|
357 chomp($record_attribute);
|
|
358
|
|
359 #print "$record_seqname\t$record_start\t$record_end\t$record_strand\t$record_attribute\n";
|
|
360 #exit;
|
|
361
|
|
362 #### If the last motif exactly overlaps and is same ID - skip adding to array
|
|
363 if( ($record_start == $last_record_start) and
|
|
364 ($record_end == $last_record_end) and
|
|
365 ($record_attribute eq $last_record_attribute) ) {
|
|
366
|
|
367 $last_record_start = $record_start;
|
|
368 $last_record_end = $record_end;
|
|
369 $last_record_attribute = $record_attribute;
|
|
370
|
|
371 next RECORDLINE;
|
|
372 }
|
|
373
|
|
374 $last_record_start = $record_start;
|
|
375 $last_record_end = $record_end;
|
|
376 $last_record_attribute = $record_attribute;
|
|
377
|
|
378 #### File 2D array
|
|
379 push(@record_array, ["$record_seqname", "$record_start", "$record_end", "$record_strand", "$record_attribute"]);
|
|
380
|
|
381 #### Test file
|
|
382 $save_filtered_group .= "$record_seqname\t$record_start\t$record_end\t$record_strand\t$record_attribute\n";
|
|
383 }
|
|
384
|
|
385 #### Test file record marker
|
|
386 $save_filtered_group .= "//\n";
|
|
387
|
|
388 ######################################################################
|
|
389 #### Make sure the record contains the minimum number of specified TFs
|
|
390 ######################################################################
|
|
391
|
|
392 #### Counter to see whether all params matched
|
|
393 my $pass = 0;
|
|
394 @regex_ids = (); # Array to hold regexs as they are used
|
|
395 my @regex_totals = (); # Array to hold info on regex totals in the record
|
|
396
|
|
397 #### Work thru each of the input parameter lists
|
|
398 for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) {
|
|
399 #### Site name for regex
|
|
400 my $regex = $sorted_file_sizes[$i][4];
|
|
401
|
|
402 push(@regex_ids, $regex);
|
|
403
|
|
404 #### Min number of hits for regex
|
|
405 my $min_regex = $sorted_file_sizes[$i][3];
|
|
406
|
|
407 #### Search for current regex and tally
|
|
408 my $regex_hits = 0;
|
|
409
|
|
410 #### Work thru each of the non-repeating record lines array
|
|
411 for(my $k=0; $k < $#record_array + 1; $k++) {
|
|
412 my $line_regex = $record_array[$k][4];
|
|
413
|
|
414 if($regex eq $line_regex) {
|
|
415 $regex_hits++;
|
|
416 }
|
|
417 }
|
|
418
|
|
419 #### Were the min number of regex hits found?
|
|
420 if($regex_hits >= $min_regex) {
|
|
421 $pass++;
|
|
422 }
|
|
423 }
|
|
424
|
|
425 #####################################################################
|
|
426 #### If there are the minimum number of TFBS then check they are not
|
|
427 #### sufficiently overlapping to reduce the numbers below the minimum
|
|
428 #####################################################################
|
|
429 my $good_cluster = '';
|
|
430
|
|
431 if($pass == $target) {
|
|
432 #### Test file
|
|
433 $filtered_groups->print("$save_filtered_group");
|
|
434
|
|
435 # Declarations
|
|
436 my ($end);
|
|
437 my (@starts, @ends, @choices);
|
|
438
|
|
439 # Assign start and end positions
|
|
440
|
|
441 grep { $starts[$_] = $record_array[$_][1]; $ends[$_] = $record_array[$_][2] }
|
|
442 (0 .. $#record_array);
|
|
443
|
|
444 $end = -1; # First choice does not refer to a transcription factor
|
|
445
|
|
446 # Loop through all transcription factors
|
|
447
|
|
448 I_LOOP:
|
|
449 for my $i (0 .. @starts) {
|
|
450 my $next_factor = 0;
|
|
451
|
|
452 # Now loop through all possible following transcription factors
|
|
453 # Note that $starts[0] and @ends[0] refer to the first transcription factor
|
|
454 # whereas $choices[1] will refer to the first transcription factor.
|
|
455
|
|
456 for my $j ( $i .. $#starts) {
|
|
457 if ($starts[$j] > $end) {
|
|
458 $next_factor = ($j + 1);
|
|
459 last;
|
|
460 }
|
|
461 }
|
|
462
|
|
463 push @{ $choices[$i] }, $next_factor;
|
|
464
|
|
465 # If no factor follows, we have a terminating factor and progress to the next.
|
|
466 # Must first modify $end to be the end of the next transcription factor.
|
|
467
|
|
468 unless ($next_factor) {
|
|
469 $end = $ends[$i];
|
|
470 next I_LOOP;
|
|
471 }
|
|
472
|
|
473 # Now need to check the factors overlapping with $next_factor and add them
|
|
474 # as possibilities. Note that in some circumstances, this may result in a
|
|
475 # redundant path. This will not give spurious results, however.
|
|
476
|
|
477 for my $k ( $next_factor .. $#starts ) {
|
|
478 if ($starts[$k] <= $ends[$next_factor - 1]) {
|
|
479 push @{ $choices[$i] }, ($k + 1);
|
|
480 }
|
|
481 }
|
|
482
|
|
483 # Finally, modify $end to be the end of the next transcription factor
|
|
484
|
|
485 $end = $ends[$i];
|
|
486
|
|
487 # And go to next loop
|
|
488 }
|
|
489
|
|
490
|
|
491 #### Print out @choices array to file
|
|
492 foreach (@choices) {
|
|
493 $filtered_groups->print("@{ $_ } \n");
|
|
494 }
|
|
495
|
|
496 $filtered_groups->print("//\n");
|
|
497
|
|
498
|
|
499 #####################################################
|
|
500 #### Information required by tree searching algorithm
|
|
501 #### Only for records that contain min number of TFBS
|
|
502 #####################################################
|
|
503
|
|
504 #### Array relating each TF to their minimum values
|
|
505 my @tf_relate = ();
|
|
506
|
|
507 #$tf_relate[0] = 0;
|
|
508
|
|
509 #### Get TF ID for current record
|
|
510 for(my $i=0; $i < $#record_array+1; $i++) {
|
|
511 #my $next_i = $i + 1;
|
|
512
|
|
513 my $current_attrib = $record_array[$i][4];
|
|
514
|
|
515 #### Scan @sorted_files_sizes array for matching TF ID and save
|
|
516 #### row number. This will relate directly to @tf_min
|
|
517 for(my $f=0; $f < $#sorted_file_sizes + 1; $f++) {
|
|
518
|
|
519 if($current_attrib =~ /$sorted_file_sizes[$f][4]/) {
|
|
520 #$tf_relate[$next_i] = "$f";
|
|
521 $tf_relate[$i] = "$f";
|
|
522 }
|
|
523 }
|
|
524 }
|
|
525
|
|
526 #### TEST
|
|
527 # print "ARRAY2 = @tf_relate\n\n";
|
|
528 # for(0..$#tf_relate) {
|
|
529 # print "[$_] $tf_relate[$_]\n";
|
|
530 # }
|
|
531
|
|
532 #### DUMMY ARRAYS
|
|
533 #@choices = ();
|
|
534 #@choices = (
|
|
535 # [ 1,2,3 ],
|
|
536 # [ 4,5,6 ],
|
|
537 # [ 4,5,6 ],
|
|
538 # [ 4,5,6 ],
|
|
539 # [ 7 ],
|
|
540 # [ 7 ],
|
|
541 # [ 7 ],
|
|
542 # [ 8,9,10 ],
|
|
543 # [ 0 ],
|
|
544 # [ 0 ],
|
|
545 # [ 0 ]
|
|
546 # );
|
|
547
|
|
548 #@tf_min = ();
|
|
549 #@tf_min = ( 3, 1, 1);
|
|
550
|
|
551 #@tf_relate = ();
|
|
552 #@tf_relate = (0,0,0,1,1,1,2,2,2,2);
|
|
553
|
|
554
|
|
555
|
|
556 #### References for test_cluster sub
|
|
557 my $choices_to_pass = \@choices; #### Tree decisions
|
|
558 my $required_to_pass = \@tf_min; #### Min TFBS numbers
|
|
559 my $order = \@tf_relate; #### Relate TFBSs to min numbers
|
|
560
|
|
561
|
|
562 ########################################
|
|
563 #### Run tree searching algorithm (Mike)
|
|
564 ########################################
|
|
565 if($allow eq 'exclude') {
|
|
566 #$good_cluster = tf_cluster_tree($choices_to_pass, $required_to_pass, $order);
|
|
567 $good_cluster = tf_cluster_tree(\@choices, \@tf_min, \@tf_relate);
|
|
568
|
|
569 #print "GOODCLUSTER = $good_cluster\n";
|
|
570 }
|
|
571
|
|
572 else {
|
|
573 $good_cluster = 1;
|
|
574 }
|
|
575 }
|
|
576
|
|
577
|
|
578 #### If all parameters are matched create a summary of the record
|
|
579 #### Work thru each line of the record string start end and TF ID to an array
|
|
580
|
|
581 #### Carry on if overlapping not a problem ####
|
|
582 if( ($pass == $target) and ($good_cluster == 1) ){
|
|
583 my $regex_chr = $record_array[0][0];
|
|
584 my $regex_start = $record_array[0][1];
|
|
585 my $regex_end = $record_array[$#record_array][2];
|
|
586 my $joined_ids = join("-", @regex_ids);
|
|
587
|
|
588 $output->print("$regex_chr\t",
|
|
589 "TFBScluster\t",
|
|
590 "CNS\t",
|
|
591 "$regex_start\t",
|
|
592 "$regex_end\t",
|
|
593 ".\t",
|
|
594 "+\t",
|
|
595 ".\t",
|
|
596 "$joined_ids\n");
|
|
597 }
|
|
598 }
|
|
599
|
|
600 $groups->close;
|
|
601 $filtered_groups->close;
|
|
602 $output->close;
|
|
603
|
|
604 #### Space on screen
|
|
605 #print STDOUT "\n";
|
|
606
|
|
607 #### Read each line of output file and combine lines that overlap
|
|
608 my $version = 1;
|
|
609
|
|
610 #### Remain in loop until last command given
|
|
611 while(1) {
|
|
612 my $last_seqname = '';
|
|
613 my $last_chr = '';
|
|
614 my $last_start = '';
|
|
615 my $last_end = '';
|
|
616 my $changes = 0;
|
|
617 my $outline_count = 0;
|
|
618
|
|
619 my ($out_seqname, $out_source, $out_feature, $out_start, $out_end, $out_score,
|
|
620 $out_strand, $out_frame, $out_attribute, $out_chr) = '';
|
|
621
|
|
622 $output->open("<$ARGV[6]\_v$version") or die("Could not open output file!\n");
|
|
623
|
|
624 #### If the output file is empty then exit loop and finish
|
|
625 if(-z $output) {
|
|
626 $output->close;
|
|
627
|
|
628 system("rm $ARGV[6]\_v$version");
|
|
629
|
|
630 exit;
|
|
631 #die("\nNo clusters were found!\n");
|
|
632 }
|
|
633
|
|
634 $version++;
|
|
635
|
|
636 $output2->open(">$ARGV[6]\_v$version") or die("Could not open output2 file!\n");
|
|
637
|
|
638 OUTLINE: while(defined(my $out_line = <$output>)) {
|
|
639 #### Skip line if GFF comment or blank line
|
|
640 if($out_line =~ /^\s/)
|
|
641 {
|
|
642 next OUTLINE;
|
|
643 }
|
|
644
|
|
645 #### Tally lines read
|
|
646 $outline_count++;
|
|
647
|
|
648 ($out_seqname, $out_source, $out_feature, $out_start, $out_end, $out_score,
|
|
649 $out_strand, $out_frame, $out_attribute) = split(/\t/, $out_line, 9);
|
|
650
|
|
651 $out_chr = $out_seqname;
|
|
652
|
|
653 #### Handle the first line
|
|
654 if($outline_count == 1) {
|
|
655 $last_seqname = $out_seqname;
|
|
656 $last_chr = $out_chr;
|
|
657 $last_start = $out_start;
|
|
658 $last_end = $out_end;
|
|
659
|
|
660 next OUTLINE;
|
|
661 }
|
|
662
|
|
663 #### Remaining lines
|
|
664
|
|
665 #### If the patterns are on different chromosomes
|
|
666 #### CHR
|
|
667 if($last_chr ne $out_chr) {
|
|
668 #### Print the last line to the file and save the current
|
|
669 $output2->print("$last_seqname\t$out_source\t$out_feature\t$last_start\t$last_end\t",
|
|
670 "$out_score\t$out_strand\t$out_frame\t$out_attribute");
|
|
671
|
|
672 $last_seqname = $out_seqname;
|
|
673 $last_chr = $out_chr;
|
|
674 $last_start = $out_start;
|
|
675 $last_end = $out_end;
|
|
676
|
|
677 next OUTLINE;
|
|
678 }
|
|
679
|
|
680 #### If they overlap change current line start to the previous
|
|
681 if( ($last_end > $out_start) and ($last_end <= $out_end) ) {
|
|
682 $last_end = $out_end;
|
|
683
|
|
684 #### Record the number of changes
|
|
685 $changes++;
|
|
686 }
|
|
687
|
|
688 #### If not just print to output
|
|
689 else {
|
|
690 $output2->print("$last_seqname\t$out_source\t$out_feature\t$last_start\t$last_end\t",
|
|
691 "$out_score\t$out_strand\t$out_frame\t$out_attribute");
|
|
692
|
|
693 $last_seqname = $out_seqname;
|
|
694 $last_chr = $out_chr;
|
|
695 $last_start = $out_start;
|
|
696 $last_end = $out_end;
|
|
697 }
|
|
698 }
|
|
699
|
|
700 #### Print last line to outfile
|
|
701 $output2->print("$last_seqname\t$out_source\t$out_feature\t$last_start\t$last_end\t",
|
|
702 "$out_score\t$out_strand\t$out_frame\t$out_attribute");
|
|
703
|
|
704 #### Message
|
|
705 my $previous = $version - 1;
|
|
706
|
|
707 print STDOUT "Records in output file version $previous \($outline_count patterns\).\n";
|
|
708
|
|
709
|
|
710 $output->close;
|
|
711 $output2->close;
|
|
712
|
|
713 if($changes == 0) {
|
|
714 my $joined_ids = join("-", @regex_ids);
|
|
715 #### Copy last version to file name without v number
|
|
716 #system("cp $ARGV[6]\_v$version $ARGV[6]");
|
|
717
|
|
718 #### Open final output file and convert the attribute
|
|
719 open(FINAL_VER, "<$ARGV[6]\_v$version") or die("Could not open final ver GFF!\n");
|
|
720 open(FINAL, ">$ARGV[6]") or die("Could not open final GFF!\n");
|
|
721
|
|
722 while(defined(my $final_line = <FINAL_VER>)) {
|
|
723
|
|
724 my ($final_seqname, $final_source, $final_feature, $final_start, $final_end,
|
|
725 $final_score, $final_strand, $final_frame, $final_attribute) =
|
|
726 split(/\t/, $final_line, 9);
|
|
727
|
|
728 my $pattern_length = ($final_end - $final_start) + 1;
|
|
729
|
|
730 print FINAL "$final_seqname\t$final_source\t$final_feature\t$final_start\t",
|
|
731 "$final_end\t$final_score\t$final_strand\t$final_frame\t",
|
|
732 "$joined_ids\_len$pattern_length\n";
|
|
733
|
|
734 }
|
|
735
|
|
736 close(FINAL_VER);
|
|
737 close(FINAL);
|
|
738
|
|
739 last;
|
|
740 }
|
|
741 }
|
|
742
|
|
743 #### Spacer for summary file
|
|
744 print STDOUT "\n";
|
|
745
|
|
746 #### Remove intermediate files
|
|
747 system("rm ./TFcombined\.$$ ./TFcombined_sorted\.$$ ./TFgroups\.$$ ./filtered_groups\.$$ $ARGV[6]\_v*");
|
|
748
|
|
749 exit;
|
|
750
|
|
751
|
|
752 ############
|
|
753 #Subroutines
|
|
754 ############
|
|
755 sub GetNewRecord
|
|
756 # Load record from a library file, delimited by //
|
|
757 {
|
|
758 my $fh = shift (@_);
|
|
759 my $record = '';
|
|
760 my $saveinputsep = $/;
|
|
761 $/ = "//\n";
|
|
762
|
|
763 $record = <$fh>;
|
|
764
|
|
765 $/ = $saveinputsep;
|
|
766 return $record;
|
|
767 }
|
|
768
|
|
769 sub tf_cluster_tree {
|
|
770
|
|
771 my @path;
|
|
772 my $node_count;
|
|
773 my $node = 0;
|
|
774 my $success = 0;
|
|
775 my $choice = -1;
|
|
776 my @node_contains;
|
|
777 my @node_to_choice;
|
|
778 my @count;
|
|
779 my @branches;
|
|
780 $branches[0] = -1;
|
|
781 my $path_ref;
|
|
782 my $branch_ref;
|
|
783 my $node_contains;
|
|
784 my $node_to_choice;
|
|
785 my ($choice_ref,
|
|
786 $required_ref,
|
|
787 $tf_ref) = @_;
|
|
788 my @choices = @$choice_ref;
|
|
789 my @required = @$required_ref;
|
|
790 my @tfs = @$tf_ref;
|
|
791 ($node,
|
|
792 $choice,
|
|
793 $node_count,
|
|
794 $choice_ref,
|
|
795 $path_ref,
|
|
796 $branch_ref,
|
|
797 $node_to_choice,
|
|
798 $node_contains) = next_node ($node,
|
|
799 $choice,
|
|
800 $node_count,
|
|
801 $choice_ref,
|
|
802 \@path,
|
|
803 \@branches,
|
|
804 $tf_ref);
|
|
805 @choices = @$choice_ref;
|
|
806 @path = @$path_ref;
|
|
807 @branches = @$branch_ref;
|
|
808 $node_contains[$node] = $node_contains;
|
|
809 $node_to_choice[$node] = $node_to_choice;
|
|
810
|
|
811 BLOCK_A:
|
|
812
|
|
813 while (1) {
|
|
814 no strict "vars";
|
|
815
|
|
816 if ( node_terminating($choice, $choice_ref) ) {
|
|
817 push @path, $node;
|
|
818 @count = undef;
|
|
819 grep { $count[ $node_contains[$_] ]++ } @path;
|
|
820 #print "Count is @count.\n";
|
|
821 my $score = grep { $count[$_] >= $required[$_] }
|
|
822 (0 .. $#count);
|
|
823 #print "Path is @path\n";
|
|
824 if ($score == scalar @required) {
|
|
825 $success = 1;
|
|
826 last BLOCK_A;
|
|
827 }
|
|
828 pop @path;
|
|
829 ($node,
|
|
830 $choice,
|
|
831 $path_ref)
|
|
832 = last_unexplored_node(\@path, $choice, $choice_ref,
|
|
833 \@node_to_choice, \@branches);
|
|
834 last BLOCK_A if ($node == -1);
|
|
835 @path = @$path_ref;
|
|
836 }
|
|
837 if ( node_fully_explored($choice, $node, $choice_ref, \@branches) ) {
|
|
838 ($node,
|
|
839 $choice,
|
|
840 $path_ref)
|
|
841 = last_unexplored_node(\@path, $choice, $choice_ref,
|
|
842 \@node_to_choice, \@branches);
|
|
843 @path = @$path_ref;
|
|
844 last BLOCK_A if ($node == -1);
|
|
845 }
|
|
846 ($node,
|
|
847 $choice,
|
|
848 $node_count,
|
|
849 $choice_ref,
|
|
850 $path_ref,
|
|
851 $branch_ref,
|
|
852 $node_to_choice,
|
|
853 $node_contains,) = next_node ($node,
|
|
854 $choice,
|
|
855 $node_count,
|
|
856 $choice_ref,
|
|
857 \@path,
|
|
858 \@branches,
|
|
859 $tf_ref);
|
|
860 @choices = @$choice_ref;
|
|
861 @path = @$path_ref;
|
|
862 @branches = @$branch_ref;
|
|
863 $node_contains[$node] = $node_contains;
|
|
864 $node_to_choice[$node] = $node_to_choice;
|
|
865 }
|
|
866 return $success;
|
|
867 }
|
|
868
|
|
869
|
|
870 sub next_node {
|
|
871
|
|
872 my ($node,
|
|
873 my $choice,
|
|
874 my $node_count,
|
|
875 my $choice_ref,
|
|
876 my $path_ref,
|
|
877 my $branch_ref,
|
|
878 my $tf_ref) = @_;
|
|
879 my @choices = @$choice_ref;
|
|
880 my @path = @$path_ref;
|
|
881 my @branches = @$branch_ref;
|
|
882 my @tfs = @$tf_ref;
|
|
883 my $new_choice = $choices[$choice][0];
|
|
884 push @{ $choices[$choice] }, shift @{ $choices[$choice] };
|
|
885 $choice = $new_choice;
|
|
886 my $node_to_choice = $choice;
|
|
887 push @path, $node if $node;
|
|
888 $branches[$node]++;
|
|
889 $node = $node_count++;
|
|
890 my $node_contains = $tfs[$choice - 1] if $choice;
|
|
891 return (
|
|
892 $node,
|
|
893 $choice,
|
|
894 $node_count,
|
|
895 \@choices,
|
|
896 \@path,
|
|
897 \@branches,
|
|
898 $node_to_choice,
|
|
899 $node_contains);
|
|
900 }
|
|
901
|
|
902 sub node_fully_explored {
|
|
903 no strict "vars";
|
|
904
|
|
905 my $choice = shift;
|
|
906 my $node = shift;
|
|
907 my $choices_ref = shift;
|
|
908 my $branch_ref = shift;
|
|
909 my @choices = @$choices_ref;
|
|
910 my @branches = @$branch_ref;
|
|
911 if ($branches[$node] == (scalar @{ $choices[$choice] })) {
|
|
912 return 1 }
|
|
913 else { return 0 }
|
|
914 }
|
|
915
|
|
916 sub last_unexplored_node {
|
|
917 no strict "vars";
|
|
918
|
|
919 my $path_ref = shift;
|
|
920 my $choice = shift;
|
|
921 my $choice_ref = shift;
|
|
922 my $node_to_choice_ref = shift;
|
|
923 my $branch_ref = shift;
|
|
924 my @path = @$path_ref;
|
|
925 my @node_to_choice = @$node_to_choice_ref;
|
|
926 my @branches = @$branch_ref;
|
|
927 my $node;
|
|
928
|
|
929 do {
|
|
930 $node = pop @path;
|
|
931 $choice = $node_to_choice[$node];
|
|
932 if ( $node == 0 and node_fully_explored($choice, $node,
|
|
933 $choice_ref, \@branches) ) {
|
|
934 $node = -1;
|
|
935 last;
|
|
936 }
|
|
937 } while ( node_fully_explored($choice, $node, $choice_ref,
|
|
938 \@branches) );
|
|
939 return ($node, $choice, \@path);
|
|
940 }
|
|
941
|
|
942 sub node_terminating {
|
|
943
|
|
944 my $choice = shift;
|
|
945 my $choices_ref = shift;
|
|
946 my @choices = @$choices_ref;
|
|
947 if ($choices[$choice][0]) { return 0 }
|
|
948 else { return 1 }
|
|
949
|
|
950 }
|
|
951
|
|
952
|
|
953
|
|
954
|
|
955
|
|
956
|
|
957
|
|
958
|
|
959
|
|
960
|
|
961
|
|
962
|
|
963
|
|
964
|