Mercurial > repos > pjbriggs > motif_tools
changeset 2:2f48cf393d25 draft
Add Perl scripts missing from previous upload.
author | pjbriggs |
---|---|
date | Mon, 09 Apr 2018 04:56:28 -0400 |
parents | 764d562755bd |
children | 856008c4a5f3 |
files | CountUniqueIDs.pl Scan_IUPAC_output_each_match.pl Scan_IUPAC_output_matches_per_seq.pl TFBScluster_candidates.pl |
diffstat | 4 files changed, 1425 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CountUniqueIDs.pl Mon Apr 09 04:56:28 2018 -0400 @@ -0,0 +1,42 @@ +#! /usr/bin/perl -w + +use strict; + +#### Read thru a GFF file of motifs return the number of unique ids +#### Ian Donaldson Sept 2008 + +#### Usage +unless(@ARGV == 2) { + die("USAGE: $0 | GFF file | Output file\n\n"); +} + +#### Ready output file +open(GFF, "<$ARGV[0]") or die("Could not open GFF file!!\n\n"); +open(OUTPUT, ">$ARGV[1]") or die("Could not open output file!!\n\n"); + +#### Hash to hold ids +my %id_hash = (); + +#### Work thru GFF file +while(defined(my $gff_line = <GFF>)) { + if($gff_line =~ /(^#|^\s)/) { next } + + my @gff_line_bits = split(/\t/, $gff_line); + + my $id = $gff_line_bits[0]; + + $id_hash{$id}=1; +} + +my @all_keys = sort(keys(%id_hash)); + +my $elements = scalar(@all_keys); + +#print OUTPUT "There are $elements unique sequences in the file\n"; +print OUTPUT "$elements non-redundant sequences\n"; + +#### Close files +close(GFF); +close(OUTPUT); + +exit;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Scan_IUPAC_output_each_match.pl Mon Apr 09 04:56:28 2018 -0400 @@ -0,0 +1,210 @@ +#! /usr/bin/perl + +use strict; +use FileHandle; +use Bio::SeqIO; +#use Statistics::Descriptive; + +##### +# Program to count all occurences of a particular REGEX +# in a file containing mutiple FASTA sequences. +# 11 September 2003. Ian Donaldson. +# Revised to convert IUPAC to regex +# Revised to read a multiple FASTA file +# was CountRegexGFF_IUPAC_1input_nosummary.pl +##### + +#### File handles +my $input = new FileHandle; +my $output = new FileHandle; + +#### Variables +my $file_number = 0; +my $count_fwd_regex = 0; +my $count_rvs_regex = 0; +my $count_all_regex = 0; +my $seq_tally = 0; +my @seq_totals = (); + +#### Command line usage +if(@ARGV != 5) { + die ("USAGE: + $0 + IUPAC + Multiple FASTA input file + Output + Label + Skip palindromic (0=F+R-default|1=F only)\n\n"); +} + +#### Search forward strand only? +my $skip = $ARGV[4]; +unless($skip =~ /^[01]$/) { + die("Only accept 0 or 1 for Skip!\n"); +} + +#### Process IUPAC string +my $iupac = $ARGV[0]; +chomp $iupac; +$iupac = uc($iupac); + +if($iupac !~ /^[ACGTRYMKWSBDHVN]+$/) { + die("A non-IUPAC character was detected in your input string!\n"); +} + +#### Forward strand IUPAC +my @fwd_iupac_letters = split(//, $iupac); +my @fwd_regex_list = (); + +foreach my $letter (@fwd_iupac_letters) { + my $converted_iupac = iupac2regex($letter); + push(@fwd_regex_list, $converted_iupac); +} + +my $fwd_regex = join('', @fwd_regex_list); + + +#### Reverse strand IUPAC +my $revcomp_iupac = RevCompIUPAC($iupac); +my @rev_iupac_letters = split(//, $revcomp_iupac); +my @rev_regex_list = (); + +foreach my $letter (@rev_iupac_letters) { + my $converted_iupac = iupac2regex($letter); + push(@rev_regex_list, $converted_iupac); +} + +my $rvs_regex = join('', @rev_regex_list); + +#### Other variables +my $label = $ARGV[3]; + +if($label !~ /^[\w\d]+$/) { + die("A non-letter/number character was detected in your label string!\n"); +} + +my $length = length($iupac); + +#### Open output file +$output->open(">$ARGV[2]") or die "Could not open output file $ARGV[2]!\n"; +#$output->print("##gff-version 2\n"); + +#if($skip == 0) { +# $output->print("##Pattern search: $iupac and $revcomp_iupac\n"); +#} + +#else { +# $output->print("##Pattern search: $iupac\n"); +#} + +#### Work thru FASTA entries in the input file with SeqIO +my $seqio = Bio::SeqIO->new(-file => "$ARGV[1]" , '-format' => 'Fasta'); + +while( my $seqobj = $seqio->next_seq() ) { + $seq_tally++; + my $this_seq_tally = 0; + my $sequence = $seqobj->seq(); # actual sequence as a string + my $seq_id = $seqobj->id(); # header + #print(">$seq_id\n$seq\n\n"); + + #$output->print(">$seq_id\n"); + + #### Clean up $sequence to leave only nucleotides + $sequence =~ s/[\s\W\d]//g; + + while ($sequence =~ /($fwd_regex)/ig) { + $this_seq_tally++; + $count_fwd_regex++; + $count_all_regex++; + + my $end_position = pos($sequence); + my $start_position = $end_position - ($length - 1); + $output->print("$seq_id\tRegexSearch\tCNS\t$start_position\t$end_position\t.\t+\t.\t$label\n"); + } + + #### Count reverse REGEX + unless($skip == 1) { + while ($sequence =~ /($rvs_regex)/ig) { + $this_seq_tally++; + $count_rvs_regex++; + $count_all_regex++; + + my $end_position = pos($sequence); + my $start_position = $end_position - ($length - 1); + $output->print("$seq_id\tRegexSearch\tCNS\t$start_position\t$end_position\t.\t-\t.\t$label\n"); + } + + push(@seq_totals, $this_seq_tally); + #$output->print("$this_seq_tally matches\n"); + } +} + +#### Mean motifs per seq +#my $stat = Statistics::Descriptive::Full->new(); +#$stat->add_data(@seq_totals); +#my $mean = $stat->mean(); + + +#### Print a summary file +if($skip == 0) { +# $output->print("##Forward: $fwd_regex. Reverse: $rvs_regex.\n", +# "##$count_fwd_regex on the forward strand.\n", +# "##$count_rvs_regex on the reverse strand.\n", +# "##$count_all_regex in total.\n", +# "##$seq_tally sequences. Mean motifs per seq = $mean\n"); +# + print STDOUT "There were $count_all_regex instances of $fwd_regex and $rvs_regex.\n"; +} + +if($skip == 1) { +# $output->print("##Forward: $fwd_regex.\n", +# "##$count_fwd_regex on the forward strand.\n", +# "##$seq_tally sequences. Mean motifs per seq = $mean\n"); +# + print STDOUT "There were $count_fwd_regex instances of $fwd_regex on the forward strand.\n"; +} + +$output->close; + +exit; + +sub iupac2regex { +# Convert IUPAC codes to REGEX + my $iupac = shift; + + #### Series of regexes to convert + if($iupac =~ /A/) { return 'A' } + if($iupac =~ /C/) { return 'C' } + if($iupac =~ /G/) { return 'G' } + if($iupac =~ /T/) { return 'T' } + if($iupac =~ /M/) { return '[AC]' } + if($iupac =~ /R/) { return '[AG]' } + if($iupac =~ /W/) { return '[AT]' } + if($iupac =~ /S/) { return '[CG]' } + if($iupac =~ /Y/) { return '[CT]' } + if($iupac =~ /K/) { return '[GT]' } + if($iupac =~ /V/) { return '[ACG]' } + if($iupac =~ /H/) { return '[ACT]' } + if($iupac =~ /D/) { return '[AGT]' } + if($iupac =~ /B/) { return '[CGT]' } + if($iupac =~ /N/) { return '[ACGT]' } + + die("IUPAC not recognised by sub iupac2regex!\n"); +} + +sub RevCompIUPAC { + my $iupac_string = shift; + my @converted_list = (); + + my @iupac_string_list = split(//, $iupac_string); + + @iupac_string_list = reverse(@iupac_string_list); + + foreach my $letter (@iupac_string_list) { + $letter =~ tr/ACGTRYMKWSBDHVN/TGCAYRKMWSVHDBN/; + push(@converted_list, $letter); + } + + my $joined_list = join('', @converted_list); + return $joined_list; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Scan_IUPAC_output_matches_per_seq.pl Mon Apr 09 04:56:28 2018 -0400 @@ -0,0 +1,209 @@ +#! /usr/bin/perl + +use strict; +use FileHandle; +use Bio::SeqIO; +#use Statistics::Descriptive; + +##### +# Program to count all occurences of a particular REGEX +# in a file containing mutiple FASTA sequences. +# 11 September 2003. Ian Donaldson. +# Revised to convert IUPAC to regex +# Revised to read a multiple FASTA file +# was CountRegexGFF_IUPAC_1input_simple_output.pl +##### + +#### File handles +my $input = new FileHandle; +my $output = new FileHandle; + +#### Variables +my $file_number = 0; +my $count_fwd_regex = 0; +my $count_rvs_regex = 0; +my $count_all_regex = 0; +my $seq_tally = 0; +my @seq_totals = (); + +#### Command line usage +if(@ARGV != 4) { + die ("USAGE: + $0 + IUPAC + Multiple FASTA input file + Output + Skip palindromic (0=F+R-default|1=F only)\n\n"); +} + +#### Search forward strand only? +my $skip = $ARGV[3]; +unless($skip =~ /^[01]$/) { + die("Only accept 0 or 1 for Skip!\n"); +} + +#### Process IUPAC string +my $iupac = $ARGV[0]; +chomp $iupac; +$iupac = uc($iupac); + +if($iupac !~ /^[ACGTRYMKWSBDHVN]+$/) { + die("A non-IUPAC character was detected in your input string!\n"); +} + +#### Forward strand IUPAC +my @fwd_iupac_letters = split(//, $iupac); +my @fwd_regex_list = (); + +foreach my $letter (@fwd_iupac_letters) { + my $converted_iupac = iupac2regex($letter); + push(@fwd_regex_list, $converted_iupac); +} + +my $fwd_regex = join('', @fwd_regex_list); + + +#### Reverse strand IUPAC +my $revcomp_iupac = RevCompIUPAC($iupac); +my @rev_iupac_letters = split(//, $revcomp_iupac); +my @rev_regex_list = (); + +foreach my $letter (@rev_iupac_letters) { + my $converted_iupac = iupac2regex($letter); + push(@rev_regex_list, $converted_iupac); +} + +my $rvs_regex = join('', @rev_regex_list); + +#### Other variables +#my $label = $ARGV[3]; +# +#if($label !~ /^[\w\d]+$/) { +# die("A non-letter/number character was detected in your label string!\n"); +#} + +my $length = length($iupac); + +#### Open output file +$output->open(">$ARGV[2]") or die "Could not open output file $ARGV[2]!\n"; +#$output->print("##gff-version 2\n"); + +#if($skip == 0) { +# $output->print("##Pattern search: $iupac and $revcomp_iupac\n"); +#} + +#else { +# $output->print("##Pattern search: $iupac\n"); +#} + +#### Work thru FASTA entries in the input file with SeqIO +my $seqio = Bio::SeqIO->new(-file => "$ARGV[1]" , '-format' => 'Fasta'); + +while( my $seqobj = $seqio->next_seq() ) { + $seq_tally++; + my $this_seq_tally = 0; + my $sequence = $seqobj->seq(); # actual sequence as a string + my $seq_id = $seqobj->id(); # header + #print(">$seq_id\n$seq\n\n"); + + #$output->print(">$seq_id\n"); + + #### Clean up $sequence to leave only nucleotides + #$sequence =~ s/[\s\W\d]//g; + + while ($sequence =~ /($fwd_regex)/ig) { + $this_seq_tally++; + $count_fwd_regex++; + $count_all_regex++; + + #my $end_position = pos($sequence); + #my $start_position = $end_position - ($length - 1); + #$output->print("$seq_id\tRegexSearch\tCNS\t$start_position\t$end_position\t.\t+\t.\t$label\n"); + } + + #### Count reverse REGEX + unless($skip == 1) { + while ($sequence =~ /($rvs_regex)/ig) { + $this_seq_tally++; + $count_rvs_regex++; + $count_all_regex++; + + #my $end_position = pos($sequence); + #my $start_position = $end_position - ($length - 1); + #$output->print("$seq_id\tRegexSearch\tCNS\t$start_position\t$end_position\t.\t-\t.\t$label\n"); + } + } + + push(@seq_totals, $this_seq_tally); + $output->print("$seq_id\t$this_seq_tally\n"); +} + +#### Mean motifs per seq +#my $stat = Statistics::Descriptive::Full->new(); +#$stat->add_data(@seq_totals); +#my $mean = $stat->mean(); + + +#### Print a summary file +#if($skip == 0) { +# $output->print("##Forward: $fwd_regex. Reverse: $rvs_regex.\n", +# "##$count_fwd_regex on the forward strand.\n", +# "##$count_rvs_regex on the reverse strand.\n", +# "##$count_all_regex in total.\n", +# "##$seq_tally sequences. Mean motifs per seq = $mean\n"); +# +# print STDOUT "There were $count_all_regex instances of $fwd_regex and $rvs_regex.\n\n"; +#} + +#if($skip == 1) { +# $output->print("##Forward: $fwd_regex.\n", +# "##$count_fwd_regex on the forward strand.\n", +# "##$seq_tally sequences. Mean motifs per seq = $mean\n"); +# +# print STDOUT "There were $count_fwd_regex instances of $fwd_regex on the forward strand.\n\n"; +#} + +$output->close; + +exit; + +sub iupac2regex { +# Convert IUPAC codes to REGEX + my $iupac = shift; + + #### Series of regexes to convert + if($iupac =~ /A/) { return 'A' } + if($iupac =~ /C/) { return 'C' } + if($iupac =~ /G/) { return 'G' } + if($iupac =~ /T/) { return 'T' } + if($iupac =~ /M/) { return '[AC]' } + if($iupac =~ /R/) { return '[AG]' } + if($iupac =~ /W/) { return '[AT]' } + if($iupac =~ /S/) { return '[CG]' } + if($iupac =~ /Y/) { return '[CT]' } + if($iupac =~ /K/) { return '[GT]' } + if($iupac =~ /V/) { return '[ACG]' } + if($iupac =~ /H/) { return '[ACT]' } + if($iupac =~ /D/) { return '[AGT]' } + if($iupac =~ /B/) { return '[CGT]' } + if($iupac =~ /N/) { return '[ACGT]' } + + die("IUPAC not recognised by sub iupac2regex!\n"); +} + +sub RevCompIUPAC { + my $iupac_string = shift; + my @converted_list = (); + + my @iupac_string_list = split(//, $iupac_string); + + @iupac_string_list = reverse(@iupac_string_list); + + foreach my $letter (@iupac_string_list) { + $letter =~ tr/ACGTRYMKWSBDHVN/TGCAYRKMWSVHDBN/; + push(@converted_list, $letter); + } + + my $joined_list = join('', @converted_list); + return $joined_list; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TFBScluster_candidates.pl Mon Apr 09 04:56:28 2018 -0400 @@ -0,0 +1,964 @@ +#!/usr/bin/perl + +# TFBScluster version 2.0 - cluster together TFBSs from distinct +# TFBSsearch generated libraries. +# +# (c) Ian Donaldson 2003 and Mike Chapman (TFBS overlap subs) +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +use strict; +use FileHandle; +#use integer; # Force floating point into integers + +$|=1; + +########### +# Program to determine whether a TF site is in close proximity to +# one or more other TF sites. +# September 2003. Ian Donaldson. +# Revised 8 Sept. 2003 to not include the query TF site in the threshold. +# This is to allow one to determine whether a TF is near to another of the +# same type. +# Revised 9 Sept to alter the threshold size to only include the core of a +# pattern i.e. gata of nngatann. +# Revised 19 Sept to replace query and subject libraries with the statement +# of all interested libraries. +# Revised 22 Sept to scan output for overlapping patterns. +# NOTE: Any overlap in comparative record start with start pattern +# Revised 30 Sept to ignore duplication of TFBS in a group record caused by +# palindrome. By skipping if name and positions the same. +# Revised 6 Oct code for tree searching method to deal with overlapping TFBSs +########### + +#### Command line usage +if(@ARGV != 7) +{ + die ("USAGE: + $0 + TF libraries \(comma delimited NO SPACES\) + Number of flanking 'N's for subject files \(comma delimited NO SPACES\) + Minimum number of occurences \(comma delimited NO SPACES\) + TF IDs \(comma delimited NO SPACES\) + Single range value in bp \(+/-\) query start and end values + Include overlapping TFBSs \(include/exclude\) + Output file\n\n"); +} + +#### File handles +my $subject = new FileHandle; +my $combined = new FileHandle; +my $sorted = new FileHandle; +my $groups = new FileHandle; +my $filtered_groups = new FileHandle; +my $output = new FileHandle; +my $output2 = new FileHandle; + +#### Variables +my @subject_files = (); # Array containing the names of selected subject files +my @flanking_n = (); # Array containing the number of flanking 'n' for each pattern +my @min_occur = (); # Array containing the minimum occurences for the TF library +my @ids = (); # Array containing user defined IDs for the TF libraries +my @file_sizes = (); +my @sorted_file_sizes = (); +my $range = $ARGV[4]; +my $allow = $ARGV[5]; +my @regex_ids = (); + +##################################################### +#### Deal with user arguments processed into an array +##################################################### + +#### Convert TF file names string to an array +@subject_files = split(/,/, $ARGV[0]); + +#### Convert flanking 'N' numbers string into an array +@flanking_n = split(/,/, $ARGV[1]); + +#### Convert minimum occurences string into an array +@min_occur = split(/,/, $ARGV[2]); + +#### Convert minimum occurences string into an array +@ids = split(/,/, $ARGV[3]); + +foreach my $id_string (@ids) { + if($id_string !~ /^[\w\d_,]+$/) { + die("A non-letter/number character was detected in your label string!\n"); + } +} + +#### Record how large they are +for(my $i=0; $i < $#subject_files + 1; $i++) { + my $size = (-s $subject_files[$i]); # -s performed on an unopened file! + + push(@file_sizes, ["$subject_files[$i]", "$size", "$flanking_n[$i]", "$min_occur[$i]", "$ids[$i]"]); +} + +#### Sort file sizes array by file sizes +# ARRAY NOT SORTED BUT COPIED TO ALLOW SORTING AT A LATER DATE +# @sorted_file_sizes = sort{$a->[1] <=> $b->[1]} @file_sizes; +push(@sorted_file_sizes, @file_sizes); + +#### Summary file information +print STDOUT "TFBScluster analysis:\n", + "--------------------\n"; + +print STDOUT "TFBS library information:\n"; + +#### Show file summary +for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) { + print STDOUT "TFBS lib. ID = $sorted_file_sizes[$i][4].\n", + "Extended conservation = $sorted_file_sizes[$i][2].\n", + "Minimum occurrence = $sorted_file_sizes[$i][3].\n\n"; +} + +#print STDOUT "\n"; + + +##################################################### +#### Information required by tree searching algorithm +##################################################### + +#### Array containing the minimum number of each TF, also corresp names +my @tf_min = (); +my @tf_names = (); + +for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) { + push(@tf_min, $sorted_file_sizes[$i][3]); +} + +for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) { + push(@tf_names, $sorted_file_sizes[$i][4]); +} + +#### TEST +#print "ARRAY1 = @tf_min\n\n"; + +##################################################################### +#### Open a file to store all the TF data from each selected library. +##################################################################### +$combined->open(">TFcombined\.$$") or die("Could not open TFcombined file!\n"); + +#### Copy each TF file to another file and sort it. +for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) { + #### Save necessary parts of the current subject file to chr. specific arrays + $subject->open("<$sorted_file_sizes[$i][0]") or die("Could not open subject file!\n"); + + #### Message to user + #print "Adding data for TF file $i\.\t"; + + SUBLINE: while(defined(my $sub_line = <$subject>)) { + my ($sub_seqname, $sub_source, $sub_feature, $sub_start, $sub_end, $sub_score, + $sub_strand, $sub_frame, $sub_attribute) = ''; + + #### Skip line if GFF comment or blank line + if($sub_line =~ /(^\s|^#)/) + { + next SUBLINE; + } + + #### Split each line by TAB + ($sub_seqname, $sub_source, $sub_feature, $sub_start, $sub_end, $sub_score, + $sub_strand, $sub_frame, $sub_attribute) = split(/\t/, $sub_line, 9); + + #### Clean up attribute + #($sub_attribute) = $sub_attribute =~ /[ACTG\-]+/g; + chomp($sub_attribute); + + #### Adjust thresold of subject positions to reflect the core sequence + #### Possibly make an argument?! + $sub_start = $sub_start + $sorted_file_sizes[$i][2]; + $sub_end = $sub_end - $sorted_file_sizes[$i][2]; + + #### Determine chromosome + (my $sub_chr) = $sub_seqname; + + #### Add modified line to analysis library file + $combined->print("$sub_seqname\t$sub_source\t$sub_feature\t$sub_start\t$sub_end\t", + "$sub_score\t$sub_strand\t$sub_frame\t$sorted_file_sizes[$i][4]\n"); +# "$sub_score\t$sub_strand\t$sub_frame\t$sub_attribute", +# "\_\_$sorted_file_sizes[$i][4]\n"); + } + + #### Message + #print STDOUT "\[DONE\]\n"; + + $subject->close; +} + +#### Spacer on screen +#print STDOUT "\n"; + +$combined->close; + + +############################# +#### Sort the TFcombined file +############################# +#print STDOUT "Sorting TFcombined file.\t"; + +#system("sort +0.3 -0.5 +3n -4n TFcombined\.$$ > TFcombined_sorted_temp\.$$"); +system("sort +0 -1 +3n -4n TFcombined\.$$ > TFcombined_sorted\.$$"); + +#print "HELLO\n"; + +################################### +#### Convert all chr01 back to chr1 +################################### +#system("/home/donaldson/bin/TFBScluster/nozero_before_1-9.pl TFcombined_sorted_temp\.$$ TFcombined_sorted\.$$"); + +#print STDOUT "\[DONE\]\n\n"; + + +##################################### +#### Sort the sorted file into groups +##################################### +#### Work thru each line of the combined TF file. Store record of all TFs downstream +#### WITHIN the predefined distance +#print STDOUT "Organising the sorted file into groups.\t"; + +my $last = 0; + +$sorted->open("<TFcombined_sorted\.$$") or die("Could not open sorted TFcombined file!\n"); + +#### Rewind combined TF file to start +seek($sorted, 0, 0); + +COMBLINE: while(defined(my $comb_line = <$sorted>)) { + #### Get info about the line + my @comb_line_array = split(/\t/, $comb_line); + my $comb_seqname = $comb_line_array[0]; + (my $comb_chr) = $comb_seqname; + + my $comb_start = $comb_line_array[3]; + + #### Store the start of the next line + my $next_line_pos = tell($sorted); + + #### Variable to hold lines + my $group_holder = ''; + + $group_holder = $comb_line; + + #### Read thru the next lines until the end position is not within the specified + #### range of the start line start + my $count_hit = 0; + + GROUPLINE: while(defined(my $group_line = <$sorted>)) { + my @group_line_array = split(/\t/, $group_line); + my $group_seqname = $group_line_array[0]; + (my $group_chr) = $group_seqname; + + my $group_end = $group_line_array[4]; + + #### CHR + #if( (($group_end - $comb_start + 1) < $range ) and ($comb_chr eq $group_chr) ) { + if( (($group_end - $comb_start + 1) <= $range ) and ($comb_chr eq $group_chr) ) { + $group_holder .= $group_line; + $count_hit++; + } + + else { + last GROUPLINE; + } + } + + if($count_hit > 0) { + #Make the record + $groups->open(">>TFgroups\.$$") or die("Could not open TF groups file!\n"); + + $groups->print("$group_holder\/\/\n"); + + $groups->close; + + #### Move to the end of the last line + seek($sorted, $next_line_pos , 0); + + next COMBLINE; + } + + else { + #### Move to the end of the last line + seek($sorted, $next_line_pos , 0); + + next COMBLINE; + } +} + +$sorted->close; + +#print STDOUT "\[DONE\]\n\n"; + + +################################################################################### +#### Look through the groups file to find records matching the user defined params +################################################################################### +my $record = ''; +my $target = 0; +my $count_pass = 0; + +#### Another user message +print STDOUT "You have chosen to search for groups containing at least:\n"; + +for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) { + print STDOUT "$sorted_file_sizes[$i][3] $sorted_file_sizes[$i][4] site(s).\n"; + + #### Increment the desired number of matches for a group to be selected + $target++; +} + +#### Another user message +#print STDOUT "\nOutput will be written to \[$ARGV[6]\].\n"; +print STDOUT "\nCombining overlapping clusters:\n"; + +#### Open an output file +$output->open(">$ARGV[6]\_v1") or die("Could not open output file!\n"); + +#### Open the TFgroups files +$groups->open("<TFgroups\.$$") or die("Could not open TFgroups file!\n"); + +#### Open the filtered record test file +$filtered_groups->open(">filtered_groups\.$$") or die("Could not open filtered groups file!\n"); + +#### Take each record of the groups file +RECORD: while($record = GetNewRecord($groups)) { + #### What about if the positions overlap? + my @record_array = (); + my $last_record_start = 0; + my $last_record_end = 0; + my $last_record_attribute = 0; + + my $save_filtered_group = ''; + + #### Take each line of the record beginning with chr... + RECORDLINE: while($record =~ /(\w.+\n)/mg) { + my $record_line = $1; + + my @record_line_array = (); + @record_line_array = split(/\t/, $record_line); + + my $record_seqname = $record_line_array[0]; + my $record_start = $record_line_array[3]; + my $record_end = $record_line_array[4]; + my $record_strand = $record_line_array[6]; + my $record_attribute = $record_line_array[8]; + chomp($record_attribute); + + #print "$record_seqname\t$record_start\t$record_end\t$record_strand\t$record_attribute\n"; + #exit; + + #### If the last motif exactly overlaps and is same ID - skip adding to array + if( ($record_start == $last_record_start) and + ($record_end == $last_record_end) and + ($record_attribute eq $last_record_attribute) ) { + + $last_record_start = $record_start; + $last_record_end = $record_end; + $last_record_attribute = $record_attribute; + + next RECORDLINE; + } + + $last_record_start = $record_start; + $last_record_end = $record_end; + $last_record_attribute = $record_attribute; + + #### File 2D array + push(@record_array, ["$record_seqname", "$record_start", "$record_end", "$record_strand", "$record_attribute"]); + + #### Test file + $save_filtered_group .= "$record_seqname\t$record_start\t$record_end\t$record_strand\t$record_attribute\n"; + } + + #### Test file record marker + $save_filtered_group .= "//\n"; + + ###################################################################### + #### Make sure the record contains the minimum number of specified TFs + ###################################################################### + + #### Counter to see whether all params matched + my $pass = 0; + @regex_ids = (); # Array to hold regexs as they are used + my @regex_totals = (); # Array to hold info on regex totals in the record + + #### Work thru each of the input parameter lists + for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) { + #### Site name for regex + my $regex = $sorted_file_sizes[$i][4]; + + push(@regex_ids, $regex); + + #### Min number of hits for regex + my $min_regex = $sorted_file_sizes[$i][3]; + + #### Search for current regex and tally + my $regex_hits = 0; + + #### Work thru each of the non-repeating record lines array + for(my $k=0; $k < $#record_array + 1; $k++) { + my $line_regex = $record_array[$k][4]; + + if($regex eq $line_regex) { + $regex_hits++; + } + } + + #### Were the min number of regex hits found? + if($regex_hits >= $min_regex) { + $pass++; + } + } + + ##################################################################### + #### If there are the minimum number of TFBS then check they are not + #### sufficiently overlapping to reduce the numbers below the minimum + ##################################################################### + my $good_cluster = ''; + + if($pass == $target) { + #### Test file + $filtered_groups->print("$save_filtered_group"); + + # Declarations + my ($end); + my (@starts, @ends, @choices); + + # Assign start and end positions + + grep { $starts[$_] = $record_array[$_][1]; $ends[$_] = $record_array[$_][2] } + (0 .. $#record_array); + + $end = -1; # First choice does not refer to a transcription factor + + # Loop through all transcription factors + + I_LOOP: + for my $i (0 .. @starts) { + my $next_factor = 0; + + # Now loop through all possible following transcription factors + # Note that $starts[0] and @ends[0] refer to the first transcription factor + # whereas $choices[1] will refer to the first transcription factor. + + for my $j ( $i .. $#starts) { + if ($starts[$j] > $end) { + $next_factor = ($j + 1); + last; + } + } + + push @{ $choices[$i] }, $next_factor; + + # If no factor follows, we have a terminating factor and progress to the next. + # Must first modify $end to be the end of the next transcription factor. + + unless ($next_factor) { + $end = $ends[$i]; + next I_LOOP; + } + + # Now need to check the factors overlapping with $next_factor and add them + # as possibilities. Note that in some circumstances, this may result in a + # redundant path. This will not give spurious results, however. + + for my $k ( $next_factor .. $#starts ) { + if ($starts[$k] <= $ends[$next_factor - 1]) { + push @{ $choices[$i] }, ($k + 1); + } + } + + # Finally, modify $end to be the end of the next transcription factor + + $end = $ends[$i]; + + # And go to next loop + } + + + #### Print out @choices array to file + foreach (@choices) { + $filtered_groups->print("@{ $_ } \n"); + } + + $filtered_groups->print("//\n"); + + + ##################################################### + #### Information required by tree searching algorithm + #### Only for records that contain min number of TFBS + ##################################################### + + #### Array relating each TF to their minimum values + my @tf_relate = (); + + #$tf_relate[0] = 0; + + #### Get TF ID for current record + for(my $i=0; $i < $#record_array+1; $i++) { + #my $next_i = $i + 1; + + my $current_attrib = $record_array[$i][4]; + + #### Scan @sorted_files_sizes array for matching TF ID and save + #### row number. This will relate directly to @tf_min + for(my $f=0; $f < $#sorted_file_sizes + 1; $f++) { + + if($current_attrib =~ /$sorted_file_sizes[$f][4]/) { + #$tf_relate[$next_i] = "$f"; + $tf_relate[$i] = "$f"; + } + } + } + + #### TEST +# print "ARRAY2 = @tf_relate\n\n"; +# for(0..$#tf_relate) { +# print "[$_] $tf_relate[$_]\n"; +# } + + #### DUMMY ARRAYS + #@choices = (); + #@choices = ( +# [ 1,2,3 ], +# [ 4,5,6 ], +# [ 4,5,6 ], +# [ 4,5,6 ], +# [ 7 ], +# [ 7 ], +# [ 7 ], +# [ 8,9,10 ], +# [ 0 ], +# [ 0 ], +# [ 0 ] +# ); + + #@tf_min = (); + #@tf_min = ( 3, 1, 1); + + #@tf_relate = (); + #@tf_relate = (0,0,0,1,1,1,2,2,2,2); + + + + #### References for test_cluster sub + my $choices_to_pass = \@choices; #### Tree decisions + my $required_to_pass = \@tf_min; #### Min TFBS numbers + my $order = \@tf_relate; #### Relate TFBSs to min numbers + + + ######################################## + #### Run tree searching algorithm (Mike) + ######################################## + if($allow eq 'exclude') { + #$good_cluster = tf_cluster_tree($choices_to_pass, $required_to_pass, $order); + $good_cluster = tf_cluster_tree(\@choices, \@tf_min, \@tf_relate); + + #print "GOODCLUSTER = $good_cluster\n"; + } + + else { + $good_cluster = 1; + } + } + + + #### If all parameters are matched create a summary of the record + #### Work thru each line of the record string start end and TF ID to an array + + #### Carry on if overlapping not a problem #### + if( ($pass == $target) and ($good_cluster == 1) ){ + my $regex_chr = $record_array[0][0]; + my $regex_start = $record_array[0][1]; + my $regex_end = $record_array[$#record_array][2]; + my $joined_ids = join("-", @regex_ids); + + $output->print("$regex_chr\t", + "TFBScluster\t", + "CNS\t", + "$regex_start\t", + "$regex_end\t", + ".\t", + "+\t", + ".\t", + "$joined_ids\n"); + } +} + +$groups->close; +$filtered_groups->close; +$output->close; + +#### Space on screen +#print STDOUT "\n"; + +#### Read each line of output file and combine lines that overlap +my $version = 1; + +#### Remain in loop until last command given +while(1) { + my $last_seqname = ''; + my $last_chr = ''; + my $last_start = ''; + my $last_end = ''; + my $changes = 0; + my $outline_count = 0; + + my ($out_seqname, $out_source, $out_feature, $out_start, $out_end, $out_score, + $out_strand, $out_frame, $out_attribute, $out_chr) = ''; + + $output->open("<$ARGV[6]\_v$version") or die("Could not open output file!\n"); + + #### If the output file is empty then exit loop and finish + if(-z $output) { + $output->close; + + system("rm $ARGV[6]\_v$version"); + + exit; + #die("\nNo clusters were found!\n"); + } + + $version++; + + $output2->open(">$ARGV[6]\_v$version") or die("Could not open output2 file!\n"); + + OUTLINE: while(defined(my $out_line = <$output>)) { + #### Skip line if GFF comment or blank line + if($out_line =~ /^\s/) + { + next OUTLINE; + } + + #### Tally lines read + $outline_count++; + + ($out_seqname, $out_source, $out_feature, $out_start, $out_end, $out_score, + $out_strand, $out_frame, $out_attribute) = split(/\t/, $out_line, 9); + + $out_chr = $out_seqname; + + #### Handle the first line + if($outline_count == 1) { + $last_seqname = $out_seqname; + $last_chr = $out_chr; + $last_start = $out_start; + $last_end = $out_end; + + next OUTLINE; + } + + #### Remaining lines + + #### If the patterns are on different chromosomes + #### CHR + if($last_chr ne $out_chr) { + #### Print the last line to the file and save the current + $output2->print("$last_seqname\t$out_source\t$out_feature\t$last_start\t$last_end\t", + "$out_score\t$out_strand\t$out_frame\t$out_attribute"); + + $last_seqname = $out_seqname; + $last_chr = $out_chr; + $last_start = $out_start; + $last_end = $out_end; + + next OUTLINE; + } + + #### If they overlap change current line start to the previous + if( ($last_end > $out_start) and ($last_end <= $out_end) ) { + $last_end = $out_end; + + #### Record the number of changes + $changes++; + } + + #### If not just print to output + else { + $output2->print("$last_seqname\t$out_source\t$out_feature\t$last_start\t$last_end\t", + "$out_score\t$out_strand\t$out_frame\t$out_attribute"); + + $last_seqname = $out_seqname; + $last_chr = $out_chr; + $last_start = $out_start; + $last_end = $out_end; + } + } + + #### Print last line to outfile + $output2->print("$last_seqname\t$out_source\t$out_feature\t$last_start\t$last_end\t", + "$out_score\t$out_strand\t$out_frame\t$out_attribute"); + + #### Message + my $previous = $version - 1; + + print STDOUT "Records in output file version $previous \($outline_count patterns\).\n"; + + + $output->close; + $output2->close; + + if($changes == 0) { + my $joined_ids = join("-", @regex_ids); + #### Copy last version to file name without v number + #system("cp $ARGV[6]\_v$version $ARGV[6]"); + + #### Open final output file and convert the attribute + open(FINAL_VER, "<$ARGV[6]\_v$version") or die("Could not open final ver GFF!\n"); + open(FINAL, ">$ARGV[6]") or die("Could not open final GFF!\n"); + + while(defined(my $final_line = <FINAL_VER>)) { + + my ($final_seqname, $final_source, $final_feature, $final_start, $final_end, + $final_score, $final_strand, $final_frame, $final_attribute) = + split(/\t/, $final_line, 9); + + my $pattern_length = ($final_end - $final_start) + 1; + + print FINAL "$final_seqname\t$final_source\t$final_feature\t$final_start\t", + "$final_end\t$final_score\t$final_strand\t$final_frame\t", + "$joined_ids\_len$pattern_length\n"; + + } + + close(FINAL_VER); + close(FINAL); + + last; + } +} + +#### Spacer for summary file +print STDOUT "\n"; + +#### Remove intermediate files +system("rm ./TFcombined\.$$ ./TFcombined_sorted\.$$ ./TFgroups\.$$ ./filtered_groups\.$$ $ARGV[6]\_v*"); + +exit; + + +############ +#Subroutines +############ +sub GetNewRecord +# Load record from a library file, delimited by // +{ + my $fh = shift (@_); + my $record = ''; + my $saveinputsep = $/; + $/ = "//\n"; + + $record = <$fh>; + + $/ = $saveinputsep; + return $record; +} + +sub tf_cluster_tree { + +my @path; +my $node_count; +my $node = 0; +my $success = 0; +my $choice = -1; +my @node_contains; +my @node_to_choice; +my @count; +my @branches; +$branches[0] = -1; +my $path_ref; +my $branch_ref; +my $node_contains; +my $node_to_choice; +my ($choice_ref, + $required_ref, + $tf_ref) = @_; +my @choices = @$choice_ref; +my @required = @$required_ref; +my @tfs = @$tf_ref; +($node, + $choice, + $node_count, + $choice_ref, + $path_ref, + $branch_ref, + $node_to_choice, + $node_contains) = next_node ($node, + $choice, + $node_count, + $choice_ref, + \@path, + \@branches, + $tf_ref); +@choices = @$choice_ref; +@path = @$path_ref; +@branches = @$branch_ref; +$node_contains[$node] = $node_contains; +$node_to_choice[$node] = $node_to_choice; + +BLOCK_A: + +while (1) { +no strict "vars"; + + if ( node_terminating($choice, $choice_ref) ) { + push @path, $node; + @count = undef; + grep { $count[ $node_contains[$_] ]++ } @path; + #print "Count is @count.\n"; + my $score = grep { $count[$_] >= $required[$_] } + (0 .. $#count); + #print "Path is @path\n"; + if ($score == scalar @required) { + $success = 1; + last BLOCK_A; + } + pop @path; + ($node, + $choice, + $path_ref) + = last_unexplored_node(\@path, $choice, $choice_ref, + \@node_to_choice, \@branches); + last BLOCK_A if ($node == -1); + @path = @$path_ref; + } + if ( node_fully_explored($choice, $node, $choice_ref, \@branches) ) { + ($node, + $choice, + $path_ref) + = last_unexplored_node(\@path, $choice, $choice_ref, + \@node_to_choice, \@branches); + @path = @$path_ref; + last BLOCK_A if ($node == -1); + } + ($node, + $choice, + $node_count, + $choice_ref, + $path_ref, + $branch_ref, + $node_to_choice, + $node_contains,) = next_node ($node, + $choice, + $node_count, + $choice_ref, + \@path, + \@branches, + $tf_ref); + @choices = @$choice_ref; + @path = @$path_ref; + @branches = @$branch_ref; + $node_contains[$node] = $node_contains; + $node_to_choice[$node] = $node_to_choice; + } +return $success; +} + + +sub next_node { + + my ($node, + my $choice, + my $node_count, + my $choice_ref, + my $path_ref, + my $branch_ref, + my $tf_ref) = @_; + my @choices = @$choice_ref; + my @path = @$path_ref; + my @branches = @$branch_ref; + my @tfs = @$tf_ref; + my $new_choice = $choices[$choice][0]; + push @{ $choices[$choice] }, shift @{ $choices[$choice] }; + $choice = $new_choice; + my $node_to_choice = $choice; + push @path, $node if $node; + $branches[$node]++; + $node = $node_count++; + my $node_contains = $tfs[$choice - 1] if $choice; + return ( + $node, + $choice, + $node_count, + \@choices, + \@path, + \@branches, + $node_to_choice, + $node_contains); +} + +sub node_fully_explored { +no strict "vars"; + + my $choice = shift; + my $node = shift; + my $choices_ref = shift; + my $branch_ref = shift; + my @choices = @$choices_ref; + my @branches = @$branch_ref; + if ($branches[$node] == (scalar @{ $choices[$choice] })) { + return 1 } + else { return 0 } +} + +sub last_unexplored_node { +no strict "vars"; + + my $path_ref = shift; + my $choice = shift; + my $choice_ref = shift; + my $node_to_choice_ref = shift; + my $branch_ref = shift; + my @path = @$path_ref; + my @node_to_choice = @$node_to_choice_ref; + my @branches = @$branch_ref; + my $node; + + do { + $node = pop @path; + $choice = $node_to_choice[$node]; + if ( $node == 0 and node_fully_explored($choice, $node, + $choice_ref, \@branches) ) { + $node = -1; + last; + } + } while ( node_fully_explored($choice, $node, $choice_ref, + \@branches) ); + return ($node, $choice, \@path); +} + +sub node_terminating { + + my $choice = shift; + my $choices_ref = shift; + my @choices = @$choices_ref; + if ($choices[$choice][0]) { return 0 } + else { return 1 } + +} + + + + + + + + + + + + + +