Mercurial > repos > pjbriggs > motif_tools

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CountUniqueIDs.pl	Mon Apr 09 04:56:28 2018 -0400
@@ -0,0 +1,42 @@
+#! /usr/bin/perl -w
+
+use strict;
+
+#### Read thru a GFF file of motifs return the number of unique ids
+#### Ian Donaldson Sept 2008
+
+#### Usage
+unless(@ARGV == 2) {
+   die("USAGE: $0 | GFF file | Output file\n\n");
+}
+
+#### Ready output file
+open(GFF, "<$ARGV[0]") or die("Could not open GFF file!!\n\n");
+open(OUTPUT, ">$ARGV[1]") or die("Could not open output file!!\n\n");
+
+#### Hash to hold ids
+my %id_hash = ();
+
+#### Work thru GFF file
+while(defined(my $gff_line = <GFF>)) {
+   if($gff_line =~ /(^#|^\s)/) { next }
+
+   my @gff_line_bits = split(/\t/, $gff_line);
+
+   my $id = $gff_line_bits[0];
+
+   $id_hash{$id}=1;
+}
+
+my @all_keys = sort(keys(%id_hash));
+
+my $elements = scalar(@all_keys);
+
+#print OUTPUT "There are $elements unique sequences in the file\n";
+print OUTPUT "$elements non-redundant sequences\n";
+
+#### Close files
+close(GFF);
+close(OUTPUT);
+
+exit;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Scan_IUPAC_output_each_match.pl	Mon Apr 09 04:56:28 2018 -0400
@@ -0,0 +1,210 @@
+#! /usr/bin/perl
+
+use strict;
+use FileHandle;
+use Bio::SeqIO;
+#use Statistics::Descriptive;
+
+#####
+# Program to count all occurences of a particular REGEX
+# in a file containing mutiple FASTA sequences.
+# 11 September 2003. Ian Donaldson.
+# Revised to convert IUPAC to regex
+# Revised to read a multiple FASTA file
+# was CountRegexGFF_IUPAC_1input_nosummary.pl
+#####
+
+#### File handles
+my $input = new FileHandle;
+my $output = new FileHandle;
+
+#### Variables
+my $file_number = 0;
+my $count_fwd_regex = 0;
+my $count_rvs_regex = 0;
+my $count_all_regex = 0;
+my $seq_tally = 0;
+my @seq_totals = ();
+
+#### Command line usage
+if(@ARGV != 5) {
+   die ("USAGE:
+   $0
+   IUPAC
+   Multiple FASTA input file
+   Output
+   Label
+   Skip palindromic (0=F+R-default|1=F only)\n\n");
+}
+
+#### Search forward strand only?
+my $skip = $ARGV[4];
+unless($skip =~ /^[01]$/) {
+   die("Only accept 0 or 1 for Skip!\n");
+}
+
+#### Process IUPAC string
+my $iupac = $ARGV[0];
+chomp $iupac;
+$iupac = uc($iupac);
+
+if($iupac !~ /^[ACGTRYMKWSBDHVN]+$/) {
+   die("A non-IUPAC character was detected in your input string!\n");
+}
+
+#### Forward strand IUPAC
+my @fwd_iupac_letters = split(//, $iupac);
+my @fwd_regex_list = ();
+
+foreach my $letter (@fwd_iupac_letters) {
+   my $converted_iupac = iupac2regex($letter);
+   push(@fwd_regex_list, $converted_iupac);
+}
+
+my $fwd_regex = join('', @fwd_regex_list);
+
+
+#### Reverse strand IUPAC
+my $revcomp_iupac = RevCompIUPAC($iupac);
+my @rev_iupac_letters = split(//, $revcomp_iupac);
+my @rev_regex_list = ();
+
+foreach my $letter (@rev_iupac_letters) {
+   my $converted_iupac = iupac2regex($letter);
+   push(@rev_regex_list, $converted_iupac);
+}
+
+my $rvs_regex = join('', @rev_regex_list);
+
+#### Other variables
+my $label = $ARGV[3];
+
+if($label !~ /^[\w\d]+$/) {
+   die("A non-letter/number character was detected in your label string!\n");
+}
+
+my $length = length($iupac);
+
+#### Open output file
+$output->open(">$ARGV[2]") or die "Could not open output file $ARGV[2]!\n";
+#$output->print("##gff-version 2\n");
+
+#if($skip == 0) {
+#   $output->print("##Pattern search: $iupac and $revcomp_iupac\n");
+#}
+
+#else {
+#   $output->print("##Pattern search: $iupac\n");
+#}
+
+#### Work thru FASTA entries in the input file with SeqIO
+my $seqio = Bio::SeqIO->new(-file => "$ARGV[1]" , '-format' => 'Fasta');
+
+while( my $seqobj = $seqio->next_seq() ) {
+   $seq_tally++;
+   my $this_seq_tally = 0;
+   my $sequence = $seqobj->seq(); # actual sequence as a string
+   my $seq_id = $seqobj->id(); # header
+   #print(">$seq_id\n$seq\n\n");
+
+   #$output->print(">$seq_id\n");
+
+   #### Clean up $sequence to leave only nucleotides
+   $sequence =~ s/[\s\W\d]//g;
+
+   while ($sequence =~ /($fwd_regex)/ig) {
+      $this_seq_tally++;
+      $count_fwd_regex++;
+      $count_all_regex++;
+
+      my $end_position = pos($sequence);
+      my $start_position = $end_position - ($length - 1);
+      $output->print("$seq_id\tRegexSearch\tCNS\t$start_position\t$end_position\t.\t+\t.\t$label\n");
+   }
+
+   #### Count reverse REGEX
+   unless($skip == 1) {
+      while ($sequence =~ /($rvs_regex)/ig) {
+         $this_seq_tally++;
+         $count_rvs_regex++;
+         $count_all_regex++;
+
+         my $end_position = pos($sequence);
+         my $start_position = $end_position - ($length - 1);
+         $output->print("$seq_id\tRegexSearch\tCNS\t$start_position\t$end_position\t.\t-\t.\t$label\n");
+      }
+
+   push(@seq_totals, $this_seq_tally);
+   #$output->print("$this_seq_tally matches\n");
+   }
+}
+
+#### Mean motifs per seq
+#my $stat = Statistics::Descriptive::Full->new();
+#$stat->add_data(@seq_totals);
+#my $mean = $stat->mean();
+
+
+#### Print a summary file
+if($skip == 0) {
+#   $output->print("##Forward: $fwd_regex. Reverse: $rvs_regex.\n",
+#                  "##$count_fwd_regex on the forward strand.\n",
+#                  "##$count_rvs_regex on the reverse strand.\n",
+#                  "##$count_all_regex in total.\n",
+#                  "##$seq_tally sequences.  Mean motifs per seq = $mean\n");
+#
+   print STDOUT "There were $count_all_regex instances of $fwd_regex and $rvs_regex.\n";
+}
+
+if($skip == 1) {
+#   $output->print("##Forward: $fwd_regex.\n",
+#                  "##$count_fwd_regex on the forward strand.\n",
+#                  "##$seq_tally sequences.  Mean motifs per seq = $mean\n");
+#
+   print STDOUT "There were $count_fwd_regex instances of $fwd_regex on the forward strand.\n";
+}
+
+$output->close;
+
+exit;
+
+sub iupac2regex {
+# Convert IUPAC codes to REGEX
+   my $iupac = shift;
+
+   #### Series of regexes to convert
+   if($iupac =~ /A/) { return 'A' }
+   if($iupac =~ /C/) { return 'C' }
+   if($iupac =~ /G/) { return 'G' }
+   if($iupac =~ /T/) { return 'T' }
+   if($iupac =~ /M/) { return '[AC]' }
+   if($iupac =~ /R/) { return '[AG]' }
+   if($iupac =~ /W/) { return '[AT]' }
+   if($iupac =~ /S/) { return '[CG]' }
+   if($iupac =~ /Y/) { return '[CT]' }
+   if($iupac =~ /K/) { return '[GT]' }
+   if($iupac =~ /V/) { return '[ACG]' }
+   if($iupac =~ /H/) { return '[ACT]' }
+   if($iupac =~ /D/) { return '[AGT]' }
+   if($iupac =~ /B/) { return '[CGT]' }
+   if($iupac =~ /N/) { return '[ACGT]' }
+
+   die("IUPAC not recognised by sub iupac2regex!\n");
+}
+
+sub RevCompIUPAC {
+   my $iupac_string = shift;
+   my @converted_list = ();
+
+   my @iupac_string_list = split(//, $iupac_string);
+
+   @iupac_string_list = reverse(@iupac_string_list);
+
+   foreach my $letter (@iupac_string_list) {
+      $letter =~ tr/ACGTRYMKWSBDHVN/TGCAYRKMWSVHDBN/;
+      push(@converted_list, $letter);
+   }
+
+   my $joined_list = join('', @converted_list);
+   return $joined_list;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Scan_IUPAC_output_matches_per_seq.pl	Mon Apr 09 04:56:28 2018 -0400
@@ -0,0 +1,209 @@
+#! /usr/bin/perl
+
+use strict;
+use FileHandle;
+use Bio::SeqIO;
+#use Statistics::Descriptive;
+
+#####
+# Program to count all occurences of a particular REGEX
+# in a file containing mutiple FASTA sequences.
+# 11 September 2003. Ian Donaldson.
+# Revised to convert IUPAC to regex
+# Revised to read a multiple FASTA file
+# was CountRegexGFF_IUPAC_1input_simple_output.pl
+#####
+
+#### File handles
+my $input = new FileHandle;
+my $output = new FileHandle;
+
+#### Variables
+my $file_number = 0;
+my $count_fwd_regex = 0;
+my $count_rvs_regex = 0;
+my $count_all_regex = 0;
+my $seq_tally = 0;
+my @seq_totals = ();
+
+#### Command line usage
+if(@ARGV != 4) {
+   die ("USAGE:
+   $0
+   IUPAC
+   Multiple FASTA input file
+   Output
+   Skip palindromic (0=F+R-default|1=F only)\n\n");
+}
+
+#### Search forward strand only?
+my $skip = $ARGV[3];
+unless($skip =~ /^[01]$/) {
+   die("Only accept 0 or 1 for Skip!\n");
+}
+
+#### Process IUPAC string
+my $iupac = $ARGV[0];
+chomp $iupac;
+$iupac = uc($iupac);
+
+if($iupac !~ /^[ACGTRYMKWSBDHVN]+$/) {
+   die("A non-IUPAC character was detected in your input string!\n");
+}
+
+#### Forward strand IUPAC
+my @fwd_iupac_letters = split(//, $iupac);
+my @fwd_regex_list = ();
+
+foreach my $letter (@fwd_iupac_letters) {
+   my $converted_iupac = iupac2regex($letter);
+   push(@fwd_regex_list, $converted_iupac);
+}
+
+my $fwd_regex = join('', @fwd_regex_list);
+
+
+#### Reverse strand IUPAC
+my $revcomp_iupac = RevCompIUPAC($iupac);
+my @rev_iupac_letters = split(//, $revcomp_iupac);
+my @rev_regex_list = ();
+
+foreach my $letter (@rev_iupac_letters) {
+   my $converted_iupac = iupac2regex($letter);
+   push(@rev_regex_list, $converted_iupac);
+}
+
+my $rvs_regex = join('', @rev_regex_list);
+
+#### Other variables
+#my $label = $ARGV[3];
+#
+#if($label !~ /^[\w\d]+$/) {
+#   die("A non-letter/number character was detected in your label string!\n");
+#}
+
+my $length = length($iupac);
+
+#### Open output file
+$output->open(">$ARGV[2]") or die "Could not open output file $ARGV[2]!\n";
+#$output->print("##gff-version 2\n");
+
+#if($skip == 0) {
+#   $output->print("##Pattern search: $iupac and $revcomp_iupac\n");
+#}
+
+#else {
+#   $output->print("##Pattern search: $iupac\n");
+#}
+
+#### Work thru FASTA entries in the input file with SeqIO
+my $seqio = Bio::SeqIO->new(-file => "$ARGV[1]" , '-format' => 'Fasta');
+
+while( my $seqobj = $seqio->next_seq() ) {
+   $seq_tally++;
+   my $this_seq_tally = 0;
+   my $sequence = $seqobj->seq(); # actual sequence as a string
+   my $seq_id = $seqobj->id(); # header
+   #print(">$seq_id\n$seq\n\n");
+
+   #$output->print(">$seq_id\n");
+
+   #### Clean up $sequence to leave only nucleotides
+   #$sequence =~ s/[\s\W\d]//g;
+
+   while ($sequence =~ /($fwd_regex)/ig) {
+      $this_seq_tally++;
+      $count_fwd_regex++;
+      $count_all_regex++;
+
+      #my $end_position = pos($sequence);
+      #my $start_position = $end_position - ($length - 1);
+      #$output->print("$seq_id\tRegexSearch\tCNS\t$start_position\t$end_position\t.\t+\t.\t$label\n");
+   }
+
+   #### Count reverse REGEX
+   unless($skip == 1) {
+      while ($sequence =~ /($rvs_regex)/ig) {
+         $this_seq_tally++;
+         $count_rvs_regex++;
+         $count_all_regex++;
+
+         #my $end_position = pos($sequence);
+         #my $start_position = $end_position - ($length - 1);
+         #$output->print("$seq_id\tRegexSearch\tCNS\t$start_position\t$end_position\t.\t-\t.\t$label\n");
+      }
+   }
+
+   push(@seq_totals, $this_seq_tally);
+   $output->print("$seq_id\t$this_seq_tally\n");
+}
+
+#### Mean motifs per seq
+#my $stat = Statistics::Descriptive::Full->new();
+#$stat->add_data(@seq_totals);
+#my $mean = $stat->mean();
+
+
+#### Print a summary file
+#if($skip == 0) {
+#   $output->print("##Forward: $fwd_regex. Reverse: $rvs_regex.\n",
+#                  "##$count_fwd_regex on the forward strand.\n",
+#                  "##$count_rvs_regex on the reverse strand.\n",
+#                  "##$count_all_regex in total.\n",
+#                  "##$seq_tally sequences.  Mean motifs per seq = $mean\n");
+#
+#   print STDOUT "There were $count_all_regex instances of $fwd_regex and $rvs_regex.\n\n";
+#}
+
+#if($skip == 1) {
+#   $output->print("##Forward: $fwd_regex.\n",
+#                  "##$count_fwd_regex on the forward strand.\n",
+#                  "##$seq_tally sequences.  Mean motifs per seq = $mean\n");
+#
+#   print STDOUT "There were $count_fwd_regex instances of $fwd_regex on the forward strand.\n\n";
+#}
+
+$output->close;
+
+exit;
+
+sub iupac2regex {
+# Convert IUPAC codes to REGEX
+   my $iupac = shift;
+
+   #### Series of regexes to convert
+   if($iupac =~ /A/) { return 'A' }
+   if($iupac =~ /C/) { return 'C' }
+   if($iupac =~ /G/) { return 'G' }
+   if($iupac =~ /T/) { return 'T' }
+   if($iupac =~ /M/) { return '[AC]' }
+   if($iupac =~ /R/) { return '[AG]' }
+   if($iupac =~ /W/) { return '[AT]' }
+   if($iupac =~ /S/) { return '[CG]' }
+   if($iupac =~ /Y/) { return '[CT]' }
+   if($iupac =~ /K/) { return '[GT]' }
+   if($iupac =~ /V/) { return '[ACG]' }
+   if($iupac =~ /H/) { return '[ACT]' }
+   if($iupac =~ /D/) { return '[AGT]' }
+   if($iupac =~ /B/) { return '[CGT]' }
+   if($iupac =~ /N/) { return '[ACGT]' }
+
+   die("IUPAC not recognised by sub iupac2regex!\n");
+}
+
+sub RevCompIUPAC {
+   my $iupac_string = shift;
+   my @converted_list = ();
+
+   my @iupac_string_list = split(//, $iupac_string);
+
+   @iupac_string_list = reverse(@iupac_string_list);
+
+   foreach my $letter (@iupac_string_list) {
+      $letter =~ tr/ACGTRYMKWSBDHVN/TGCAYRKMWSVHDBN/;
+      push(@converted_list, $letter);
+   }
+
+   my $joined_list = join('', @converted_list);
+   return $joined_list;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TFBScluster_candidates.pl	Mon Apr 09 04:56:28 2018 -0400
@@ -0,0 +1,964 @@
+#!/usr/bin/perl
+
+# TFBScluster version 2.0 - cluster together TFBSs from distinct
+#                           TFBSsearch generated libraries.
+#
+# (c) Ian Donaldson 2003 and Mike Chapman (TFBS overlap subs)
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+use strict;
+use FileHandle;
+#use integer; # Force floating point into integers
+
+$|=1;
+
+###########
+# Program to determine whether a TF site is in close proximity to
+# one or more other TF sites.
+# September 2003.  Ian Donaldson.
+# Revised 8 Sept. 2003 to not include the query TF site in the threshold.
+# This is to allow one to determine whether a TF is near to another of the
+# same type.
+# Revised 9 Sept to alter the threshold size to only include the core of a
+# pattern i.e. gata of nngatann.
+# Revised 19 Sept to replace query and subject libraries with the statement
+# of all interested libraries.
+# Revised 22 Sept to scan output for overlapping patterns.
+# NOTE: Any overlap in comparative record start with start pattern
+# Revised 30 Sept to ignore duplication of TFBS in a group record caused by
+# palindrome.  By skipping if name and positions the same.
+# Revised 6 Oct code for tree searching method to deal with overlapping TFBSs
+###########
+
+#### Command line usage
+if(@ARGV != 7)
+{
+   die ("USAGE:
+   $0
+   TF libraries \(comma delimited NO SPACES\)
+   Number of flanking 'N's for subject files \(comma delimited NO SPACES\)
+   Minimum number of occurences \(comma delimited NO SPACES\)
+   TF IDs \(comma delimited NO SPACES\)
+   Single range value in bp \(+/-\) query start and end values
+   Include overlapping TFBSs \(include/exclude\)
+   Output file\n\n");
+}
+
+#### File handles
+my $subject = new FileHandle;
+my $combined = new FileHandle;
+my $sorted = new FileHandle;
+my $groups = new FileHandle;
+my $filtered_groups = new FileHandle;
+my $output = new FileHandle;
+my $output2 = new FileHandle;
+
+#### Variables
+my @subject_files = (); # Array containing the names of selected subject files
+my @flanking_n = (); # Array containing the number of flanking 'n' for each pattern
+my @min_occur = (); # Array containing the minimum occurences for the TF library
+my @ids = (); # Array containing user defined IDs for the TF libraries
+my @file_sizes = ();
+my @sorted_file_sizes = ();
+my $range = $ARGV[4];
+my $allow = $ARGV[5];
+my @regex_ids = ();
+
+#####################################################
+#### Deal with user arguments processed into an array
+#####################################################
+
+#### Convert TF file names string to an array
+@subject_files = split(/,/, $ARGV[0]);
+
+#### Convert flanking 'N' numbers string into an array
+@flanking_n = split(/,/, $ARGV[1]);
+
+#### Convert minimum occurences string into an array
+@min_occur = split(/,/, $ARGV[2]);
+
+#### Convert minimum occurences string into an array
+@ids = split(/,/, $ARGV[3]);
+
+foreach my $id_string (@ids) {
+  if($id_string !~ /^[\w\d_,]+$/) {
+     die("A non-letter/number character was detected in your label string!\n");
+  }
+}
+
+#### Record how large they are
+for(my $i=0; $i < $#subject_files + 1; $i++) {
+   my $size = (-s $subject_files[$i]); # -s performed on an unopened file!
+
+   push(@file_sizes, ["$subject_files[$i]", "$size", "$flanking_n[$i]", "$min_occur[$i]", "$ids[$i]"]);
+}
+
+#### Sort file sizes array by file sizes
+# ARRAY NOT SORTED BUT COPIED TO ALLOW SORTING AT A LATER DATE
+# @sorted_file_sizes = sort{$a->[1] <=> $b->[1]} @file_sizes;
+push(@sorted_file_sizes, @file_sizes);
+
+#### Summary file information
+print STDOUT "TFBScluster analysis:\n",
+             "--------------------\n";
+
+print STDOUT "TFBS library information:\n";
+
+#### Show file summary
+for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) {
+   print STDOUT "TFBS lib. ID = $sorted_file_sizes[$i][4].\n",
+                "Extended conservation = $sorted_file_sizes[$i][2].\n",
+		"Minimum occurrence = $sorted_file_sizes[$i][3].\n\n";
+}
+
+#print STDOUT "\n";
+
+
+#####################################################
+#### Information required by tree searching algorithm
+#####################################################
+
+#### Array containing the minimum number of each TF, also corresp names
+my @tf_min = ();
+my @tf_names = ();
+
+for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) {
+   push(@tf_min, $sorted_file_sizes[$i][3]);
+}
+
+for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) {
+   push(@tf_names, $sorted_file_sizes[$i][4]);
+}
+
+#### TEST
+#print "ARRAY1 = @tf_min\n\n";
+
+#####################################################################
+#### Open a file to store all the TF data from each selected library.
+#####################################################################
+$combined->open(">TFcombined\.$$") or die("Could not open TFcombined file!\n");
+
+#### Copy each TF file to another file and sort it.
+for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) {
+   #### Save necessary parts of the current subject file to chr. specific arrays
+   $subject->open("<$sorted_file_sizes[$i][0]") or die("Could not open subject file!\n");
+
+   #### Message to user
+   #print "Adding data for TF file $i\.\t";
+
+   SUBLINE: while(defined(my $sub_line = <$subject>)) {
+      my ($sub_seqname, $sub_source, $sub_feature, $sub_start, $sub_end, $sub_score,
+          $sub_strand, $sub_frame, $sub_attribute) = '';
+
+      #### Skip line if GFF comment or blank line
+      if($sub_line =~ /(^\s|^#)/)
+      {
+         next SUBLINE;
+      }
+
+      #### Split each line by TAB
+      ($sub_seqname, $sub_source, $sub_feature, $sub_start, $sub_end, $sub_score,
+       $sub_strand, $sub_frame, $sub_attribute) = split(/\t/, $sub_line, 9);
+
+      #### Clean up attribute
+      #($sub_attribute) = $sub_attribute =~ /[ACTG\-]+/g;
+      chomp($sub_attribute);
+
+      #### Adjust thresold of subject positions to reflect the core sequence
+      #### Possibly make an argument?!
+      $sub_start = $sub_start + $sorted_file_sizes[$i][2];
+      $sub_end = $sub_end - $sorted_file_sizes[$i][2];
+
+      #### Determine chromosome
+      (my $sub_chr) = $sub_seqname;
+
+      #### Add modified line to analysis library file
+      $combined->print("$sub_seqname\t$sub_source\t$sub_feature\t$sub_start\t$sub_end\t",
+                       "$sub_score\t$sub_strand\t$sub_frame\t$sorted_file_sizes[$i][4]\n");
+#                       "$sub_score\t$sub_strand\t$sub_frame\t$sub_attribute",
+#		       "\_\_$sorted_file_sizes[$i][4]\n");
+   }
+
+   #### Message
+   #print STDOUT "\[DONE\]\n";
+
+   $subject->close;
+}
+
+#### Spacer on screen
+#print STDOUT "\n";
+
+$combined->close;
+
+
+#############################
+#### Sort the TFcombined file
+#############################
+#print STDOUT "Sorting TFcombined file.\t";
+
+#system("sort +0.3 -0.5 +3n -4n TFcombined\.$$ > TFcombined_sorted_temp\.$$");
+system("sort +0 -1 +3n -4n TFcombined\.$$ > TFcombined_sorted\.$$");
+
+#print "HELLO\n";
+
+###################################
+#### Convert all chr01 back to chr1
+###################################
+#system("/home/donaldson/bin/TFBScluster/nozero_before_1-9.pl TFcombined_sorted_temp\.$$ TFcombined_sorted\.$$");
+
+#print STDOUT "\[DONE\]\n\n";
+
+
+#####################################
+#### Sort the sorted file into groups
+#####################################
+#### Work thru each line of the combined TF file.  Store record of all TFs downstream
+#### WITHIN the predefined distance
+#print STDOUT "Organising the sorted file into groups.\t";
+
+my $last = 0;
+
+$sorted->open("<TFcombined_sorted\.$$") or die("Could not open sorted TFcombined file!\n");
+
+#### Rewind combined TF file to start
+seek($sorted, 0, 0);
+
+COMBLINE: while(defined(my $comb_line = <$sorted>)) {
+   #### Get info about the line
+   my @comb_line_array = split(/\t/, $comb_line);
+   my $comb_seqname = $comb_line_array[0];
+   (my $comb_chr) = $comb_seqname;
+
+   my $comb_start = $comb_line_array[3];
+
+   #### Store the start of the next line
+   my $next_line_pos = tell($sorted);
+
+   #### Variable to hold lines
+   my $group_holder = '';
+
+   $group_holder = $comb_line;
+
+   #### Read thru the next lines until the end position is not within the specified
+   #### range of the start line start
+   my $count_hit = 0;
+
+   GROUPLINE: while(defined(my $group_line = <$sorted>)) {
+      my @group_line_array = split(/\t/, $group_line);
+      my $group_seqname = $group_line_array[0];
+      (my $group_chr) = $group_seqname;
+
+      my $group_end = $group_line_array[4];
+
+      #### CHR
+      #if( (($group_end - $comb_start + 1) < $range ) and ($comb_chr eq $group_chr) ) {
+      if( (($group_end - $comb_start + 1) <= $range ) and ($comb_chr eq $group_chr) ) {
+         $group_holder .= $group_line;
+         $count_hit++;
+      }
+
+      else {
+	 last GROUPLINE;
+      }
+   }
+
+   if($count_hit > 0) {
+      #Make the record
+      $groups->open(">>TFgroups\.$$") or die("Could not open TF groups file!\n");
+
+      $groups->print("$group_holder\/\/\n");
+
+      $groups->close;
+
+      #### Move to the end of the last line
+      seek($sorted, $next_line_pos , 0);
+
+      next COMBLINE;
+   }
+
+   else {
+      #### Move to the end of the last line
+      seek($sorted, $next_line_pos , 0);
+
+      next COMBLINE;
+   }
+}
+
+$sorted->close;
+
+#print STDOUT "\[DONE\]\n\n";
+
+
+###################################################################################
+#### Look through the groups file to find records matching the user defined params
+###################################################################################
+my $record = '';
+my $target = 0;
+my $count_pass = 0;
+
+#### Another user message
+print STDOUT "You have chosen to search for groups containing at least:\n";
+
+for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) {
+   print STDOUT "$sorted_file_sizes[$i][3] $sorted_file_sizes[$i][4] site(s).\n";
+
+   #### Increment the desired number of matches for a group to be selected
+   $target++;
+}
+
+#### Another user message
+#print STDOUT "\nOutput will be written to \[$ARGV[6]\].\n";
+print STDOUT "\nCombining overlapping clusters:\n";
+
+#### Open an output file
+$output->open(">$ARGV[6]\_v1") or die("Could not open output file!\n");
+
+#### Open the TFgroups files
+$groups->open("<TFgroups\.$$") or die("Could not open TFgroups file!\n");
+
+#### Open the filtered record test file
+$filtered_groups->open(">filtered_groups\.$$") or die("Could not open filtered groups file!\n");
+
+#### Take each record of the groups file
+RECORD: while($record = GetNewRecord($groups)) {
+   #### What about if the positions overlap?
+   my @record_array = ();
+   my $last_record_start = 0;
+   my $last_record_end = 0;
+   my $last_record_attribute = 0;
+
+   my $save_filtered_group = '';
+
+   #### Take each line of the record beginning with chr...
+   RECORDLINE: while($record =~ /(\w.+\n)/mg) {
+      my $record_line = $1;
+
+      my @record_line_array = ();
+      @record_line_array = split(/\t/, $record_line);
+
+      my $record_seqname = $record_line_array[0];
+      my $record_start = $record_line_array[3];
+      my $record_end = $record_line_array[4];
+      my $record_strand = $record_line_array[6];
+      my $record_attribute = $record_line_array[8];
+      chomp($record_attribute);
+
+      #print "$record_seqname\t$record_start\t$record_end\t$record_strand\t$record_attribute\n";
+      #exit;
+
+      #### If the last motif exactly overlaps and is same ID - skip adding to array
+      if( ($record_start == $last_record_start) and
+          ($record_end == $last_record_end) and
+	  ($record_attribute eq $last_record_attribute) ) {
+
+         $last_record_start = $record_start;
+	 $last_record_end = $record_end;
+	 $last_record_attribute = $record_attribute;
+
+         next RECORDLINE;
+      }
+
+      $last_record_start = $record_start;
+      $last_record_end = $record_end;
+      $last_record_attribute = $record_attribute;
+
+      #### File 2D array
+      push(@record_array, ["$record_seqname", "$record_start", "$record_end", "$record_strand", "$record_attribute"]);
+
+      #### Test file
+      $save_filtered_group .= "$record_seqname\t$record_start\t$record_end\t$record_strand\t$record_attribute\n";
+   }
+
+   #### Test file record marker
+   $save_filtered_group .= "//\n";
+
+   ######################################################################
+   #### Make sure the record contains the minimum number of specified TFs
+   ######################################################################
+
+   #### Counter to see whether all params matched
+   my $pass = 0;
+   @regex_ids = ();  # Array to hold regexs as they are used
+   my @regex_totals = ();  # Array to hold info on regex totals in the record
+
+   #### Work thru each of the input parameter lists
+   for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) {
+      #### Site name for regex
+      my $regex = $sorted_file_sizes[$i][4];
+
+      push(@regex_ids, $regex);
+
+      #### Min number of hits for regex
+      my $min_regex = $sorted_file_sizes[$i][3];
+
+      #### Search for current regex and tally
+      my $regex_hits = 0;
+
+      #### Work thru each of the non-repeating record lines array
+      for(my $k=0; $k < $#record_array + 1; $k++) {
+         my $line_regex = $record_array[$k][4];
+
+	 if($regex eq $line_regex) {
+	    $regex_hits++;
+	 }
+      }
+
+      #### Were the min number of regex hits found?
+      if($regex_hits >= $min_regex) {
+         $pass++;
+      }
+   }
+
+   #####################################################################
+   #### If there are the minimum number of TFBS then check they are not
+   #### sufficiently overlapping to reduce the numbers below the minimum
+   #####################################################################
+   my $good_cluster = '';
+
+   if($pass == $target) {
+      #### Test file
+      $filtered_groups->print("$save_filtered_group");
+
+      # Declarations
+      my ($end);
+      my (@starts, @ends, @choices);
+
+      # Assign start and end positions
+
+      grep { $starts[$_] = $record_array[$_][1]; $ends[$_] = $record_array[$_][2] }
+      (0 .. $#record_array);
+
+      $end = -1; # First choice does not refer to a transcription factor
+
+      # Loop through all transcription factors
+
+      I_LOOP:
+      for my $i (0 .. @starts) {
+         my $next_factor = 0;
+
+         # Now loop through all possible following transcription factors
+         # Note that $starts[0] and @ends[0] refer to the first transcription factor
+         # whereas $choices[1] will refer to the first transcription factor.
+
+         for my $j ( $i .. $#starts) {
+            if ($starts[$j] > $end) {
+               $next_factor = ($j + 1);
+               last;
+            }
+         }
+
+         push @{ $choices[$i] }, $next_factor;
+
+         # If no factor follows, we have a terminating factor and progress to the next.
+         # Must first modify $end to be the end of the next transcription factor.
+
+         unless ($next_factor) {
+            $end = $ends[$i];
+            next I_LOOP;
+         }
+
+         # Now need to check the factors overlapping with $next_factor and add them
+         # as possibilities. Note that in some circumstances, this may result in a
+         # redundant path. This will not give spurious results, however.
+
+         for my $k ( $next_factor .. $#starts ) {
+            if ($starts[$k] <= $ends[$next_factor - 1]) {
+               push @{ $choices[$i] }, ($k + 1);
+            }
+         }
+
+         # Finally, modify $end to be the end of the next transcription factor
+
+         $end = $ends[$i];
+
+         # And go to next loop
+      }
+
+
+      #### Print out @choices array to file
+      foreach (@choices) {
+         $filtered_groups->print("@{ $_ } \n");
+      }
+
+      $filtered_groups->print("//\n");
+
+
+      #####################################################
+      #### Information required by tree searching algorithm
+      #### Only for records that contain min number of TFBS
+      #####################################################
+
+      #### Array relating each TF to their minimum values
+      my @tf_relate = ();
+
+      #$tf_relate[0] = 0;
+
+      #### Get TF ID for current record
+      for(my $i=0; $i < $#record_array+1; $i++) {
+         #my $next_i = $i + 1;
+
+         my $current_attrib = $record_array[$i][4];
+
+         #### Scan @sorted_files_sizes array for matching TF ID and save
+         #### row number.  This will relate directly to @tf_min
+         for(my $f=0; $f < $#sorted_file_sizes + 1; $f++) {
+
+            if($current_attrib =~ /$sorted_file_sizes[$f][4]/) {
+               #$tf_relate[$next_i] = "$f";
+               $tf_relate[$i] = "$f";
+            }
+         }
+      }
+
+      #### TEST
+#      print "ARRAY2 = @tf_relate\n\n";
+#      for(0..$#tf_relate) {
+#         print "[$_] $tf_relate[$_]\n";
+#      }
+
+      #### DUMMY ARRAYS
+      #@choices = ();
+      #@choices = (
+#	[ 1,2,3 ],
+#	[ 4,5,6 ],
+#	[ 4,5,6 ],
+#	[ 4,5,6 ],
+#	[ 7 ],
+#	[ 7 ],
+#	[ 7 ],
+#	[ 8,9,10 ],
+#	[ 0 ],
+#	[ 0 ],
+#	[ 0 ]
+#        );
+
+      #@tf_min = ();
+      #@tf_min = ( 3, 1, 1);
+
+      #@tf_relate = ();
+      #@tf_relate = (0,0,0,1,1,1,2,2,2,2);
+
+
+
+      #### References for test_cluster sub
+      my $choices_to_pass = \@choices;      #### Tree decisions
+      my $required_to_pass = \@tf_min;        #### Min TFBS numbers
+      my $order = \@tf_relate;  #### Relate TFBSs to min numbers
+
+
+      ########################################
+      #### Run tree searching algorithm (Mike)
+      ########################################
+      if($allow eq 'exclude') {
+         #$good_cluster = tf_cluster_tree($choices_to_pass, $required_to_pass, $order);
+         $good_cluster = tf_cluster_tree(\@choices, \@tf_min, \@tf_relate);
+
+         #print "GOODCLUSTER = $good_cluster\n";
+      }
+
+      else {
+         $good_cluster = 1;
+      }
+   }
+
+
+   #### If all parameters are matched create a summary of the record
+   #### Work thru each line of the record string start end and TF ID to an array
+
+   #### Carry on if overlapping not a problem ####
+   if( ($pass == $target) and ($good_cluster == 1) ){
+      my $regex_chr = $record_array[0][0];
+      my $regex_start = $record_array[0][1];
+      my $regex_end = $record_array[$#record_array][2];
+      my $joined_ids = join("-", @regex_ids);
+
+      $output->print("$regex_chr\t",
+                     "TFBScluster\t",
+		     "CNS\t",
+		     "$regex_start\t",
+		     "$regex_end\t",
+		     ".\t",
+		     "+\t",
+		     ".\t",
+		     "$joined_ids\n");
+   }
+}
+
+$groups->close;
+$filtered_groups->close;
+$output->close;
+
+#### Space on screen
+#print STDOUT "\n";
+
+#### Read each line of output file and combine lines that overlap
+my $version = 1;
+
+#### Remain in loop until last command given
+while(1) {
+   my $last_seqname = '';
+   my $last_chr = '';
+   my $last_start = '';
+   my $last_end = '';
+   my $changes = 0;
+   my $outline_count = 0;
+
+   my ($out_seqname, $out_source, $out_feature, $out_start, $out_end, $out_score,
+       $out_strand, $out_frame, $out_attribute, $out_chr) = '';
+
+   $output->open("<$ARGV[6]\_v$version") or die("Could not open output file!\n");
+
+   #### If the output file is empty then exit loop and finish
+   if(-z $output) {
+      $output->close;
+
+      system("rm $ARGV[6]\_v$version");
+
+      exit;
+      #die("\nNo clusters were found!\n");
+   }
+
+   $version++;
+
+   $output2->open(">$ARGV[6]\_v$version") or die("Could not open output2 file!\n");
+
+   OUTLINE: while(defined(my $out_line = <$output>)) {
+      #### Skip line if GFF comment or blank line
+      if($out_line =~ /^\s/)
+      {
+         next OUTLINE;
+      }
+
+      #### Tally lines read
+      $outline_count++;
+
+      ($out_seqname, $out_source, $out_feature, $out_start, $out_end, $out_score,
+       $out_strand, $out_frame, $out_attribute) = split(/\t/, $out_line, 9);
+
+      $out_chr = $out_seqname;
+
+      #### Handle the first line
+      if($outline_count == 1) {
+         $last_seqname = $out_seqname;
+         $last_chr = $out_chr;
+         $last_start = $out_start;
+         $last_end = $out_end;
+
+         next OUTLINE;
+      }
+
+      #### Remaining lines
+
+      #### If the patterns are on different chromosomes
+      #### CHR
+      if($last_chr ne $out_chr) {
+         #### Print the last line to the file and save the current
+         $output2->print("$last_seqname\t$out_source\t$out_feature\t$last_start\t$last_end\t",
+                         "$out_score\t$out_strand\t$out_frame\t$out_attribute");
+
+         $last_seqname = $out_seqname;
+         $last_chr = $out_chr;
+         $last_start = $out_start;
+         $last_end = $out_end;
+
+         next OUTLINE;
+      }
+
+      #### If they overlap change current line start to the previous
+      if( ($last_end > $out_start) and ($last_end <= $out_end) ) {
+         $last_end = $out_end;
+
+         #### Record the number of changes
+         $changes++;
+      }
+
+      #### If not just print to output
+      else {
+            $output2->print("$last_seqname\t$out_source\t$out_feature\t$last_start\t$last_end\t",
+                      "$out_score\t$out_strand\t$out_frame\t$out_attribute");
+
+         $last_seqname = $out_seqname;
+         $last_chr = $out_chr;
+         $last_start = $out_start;
+         $last_end = $out_end;
+      }
+   }
+
+   #### Print last line to outfile
+   $output2->print("$last_seqname\t$out_source\t$out_feature\t$last_start\t$last_end\t",
+                   "$out_score\t$out_strand\t$out_frame\t$out_attribute");
+
+   #### Message
+   my $previous = $version - 1;
+
+   print STDOUT "Records in output file version $previous \($outline_count patterns\).\n";
+
+
+   $output->close;
+   $output2->close;
+
+   if($changes == 0) {
+      my $joined_ids = join("-", @regex_ids);
+      #### Copy last version to file name without v number
+      #system("cp $ARGV[6]\_v$version $ARGV[6]");
+
+      #### Open final output file and convert the attribute
+      open(FINAL_VER, "<$ARGV[6]\_v$version") or die("Could not open final ver GFF!\n");
+      open(FINAL, ">$ARGV[6]") or die("Could not open final GFF!\n");
+
+      while(defined(my $final_line = <FINAL_VER>)) {
+
+         my ($final_seqname, $final_source, $final_feature, $final_start, $final_end,
+          $final_score, $final_strand, $final_frame, $final_attribute) =
+          split(/\t/, $final_line, 9);
+
+         my $pattern_length = ($final_end - $final_start) + 1;
+
+         print FINAL "$final_seqname\t$final_source\t$final_feature\t$final_start\t",
+                     "$final_end\t$final_score\t$final_strand\t$final_frame\t",
+                     "$joined_ids\_len$pattern_length\n";
+
+      }
+
+      close(FINAL_VER);
+      close(FINAL);
+
+      last;
+   }
+}
+
+#### Spacer for summary file
+print STDOUT "\n";
+
+#### Remove intermediate files
+system("rm ./TFcombined\.$$ ./TFcombined_sorted\.$$ ./TFgroups\.$$ ./filtered_groups\.$$ $ARGV[6]\_v*");
+
+exit;
+
+
+############
+#Subroutines
+############
+sub GetNewRecord
+# Load record from a library file, delimited by //
+{
+   my $fh = shift (@_);
+   my $record = '';
+   my $saveinputsep = $/;
+   $/ = "//\n";
+
+   $record = <$fh>;
+
+   $/ = $saveinputsep;
+   return $record;
+}
+
+sub tf_cluster_tree {
+
+my @path;
+my $node_count;
+my $node = 0;
+my $success = 0;
+my $choice = -1;
+my @node_contains;
+my @node_to_choice;
+my @count;
+my @branches;
+$branches[0] = -1;
+my $path_ref;
+my $branch_ref;
+my $node_contains;
+my $node_to_choice;
+my ($choice_ref,
+	$required_ref,
+	$tf_ref) = @_;
+my @choices = @$choice_ref;
+my @required = @$required_ref;
+my @tfs = @$tf_ref;
+($node,
+ $choice,
+ $node_count,
+ $choice_ref,
+ $path_ref,
+ $branch_ref,
+ $node_to_choice,
+ $node_contains) = next_node ($node,
+			      $choice,
+			      $node_count,
+			      $choice_ref,
+			      \@path,
+			      \@branches,
+			      $tf_ref);
+@choices = @$choice_ref;
+@path = @$path_ref;
+@branches = @$branch_ref;
+$node_contains[$node] = $node_contains;
+$node_to_choice[$node] = $node_to_choice;
+
+BLOCK_A:
+
+while (1) {
+no strict "vars";
+
+	if ( node_terminating($choice, $choice_ref) ) {
+		push @path, $node;
+		@count = undef;
+		grep { $count[ $node_contains[$_] ]++ } @path;
+		#print "Count is @count.\n";
+		my $score = grep { $count[$_] >= $required[$_] }
+			(0 .. $#count);
+		#print "Path is @path\n";
+		if ($score == scalar @required) {
+			$success = 1;
+			last BLOCK_A;
+		}
+		pop @path;
+		($node,
+		 $choice,
+	 	 $path_ref)
+		= last_unexplored_node(\@path, $choice, $choice_ref,
+		\@node_to_choice, \@branches);
+		last BLOCK_A if ($node == -1);
+		@path = @$path_ref;
+	}
+	if ( node_fully_explored($choice, $node, $choice_ref, \@branches) ) {
+		($node,
+		 $choice,
+		 $path_ref)
+		= last_unexplored_node(\@path, $choice, $choice_ref,
+		\@node_to_choice, \@branches);
+		@path = @$path_ref;
+		last BLOCK_A if ($node == -1);
+	}
+	($node,
+	 $choice,
+	 $node_count,
+	 $choice_ref,
+	 $path_ref,
+	 $branch_ref,
+	 $node_to_choice,
+	 $node_contains,) = next_node ($node,
+	 			       $choice,
+				       $node_count,
+				       $choice_ref,
+				       \@path,
+				       \@branches,
+				       $tf_ref);
+	@choices = @$choice_ref;
+	@path = @$path_ref;
+	@branches = @$branch_ref;
+	$node_contains[$node] = $node_contains;
+	$node_to_choice[$node] = $node_to_choice;
+	}
+return $success;
+}
+
+
+sub next_node {
+
+	my ($node,
+	my $choice,
+	my $node_count,
+	my $choice_ref,
+	my $path_ref,
+	my $branch_ref,
+	my $tf_ref) = @_;
+	my @choices = @$choice_ref;
+	my @path = @$path_ref;
+	my @branches = @$branch_ref;
+	my @tfs = @$tf_ref;
+	my $new_choice = $choices[$choice][0];
+	push @{ $choices[$choice] }, shift @{ $choices[$choice] };
+	$choice = $new_choice;
+	my $node_to_choice = $choice;
+	push @path, $node if $node;
+	$branches[$node]++;
+	$node = $node_count++;
+	my $node_contains = $tfs[$choice - 1] if $choice;
+	return (
+		$node,
+		$choice,
+		$node_count,
+		\@choices,
+		\@path,
+		\@branches,
+		$node_to_choice,
+		$node_contains);
+}
+
+sub node_fully_explored {
+no strict "vars";
+
+	my $choice = shift;
+	my $node = shift;
+	my $choices_ref = shift;
+	my $branch_ref = shift;
+	my @choices = @$choices_ref;
+	my @branches = @$branch_ref;
+	if ($branches[$node] == (scalar @{ $choices[$choice] })) {
+		return 1 }
+	else { return 0 }
+}
+
+sub last_unexplored_node {
+no strict "vars";
+
+	my $path_ref = shift;
+	my $choice = shift;
+	my $choice_ref = shift;
+	my $node_to_choice_ref = shift;
+	my $branch_ref = shift;
+	my @path = @$path_ref;
+	my @node_to_choice = @$node_to_choice_ref;
+	my @branches = @$branch_ref;
+	my $node;
+
+	do {
+		$node = pop @path;
+		$choice = $node_to_choice[$node];
+		if ( $node == 0 and node_fully_explored($choice, $node,
+			$choice_ref, \@branches) ) {
+			$node = -1;
+			last;
+		}
+	} while ( node_fully_explored($choice, $node, $choice_ref,
+		\@branches) );
+	return ($node, $choice, \@path);
+}
+
+sub node_terminating {
+
+	my $choice = shift;
+	my $choices_ref = shift;
+	my @choices = @$choices_ref;
+	if ($choices[$choice][0]) { return 0 }
+	else { return 1 }
+
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+