Mercurial > repos > devteam > microsatellite_birthdeath
diff microsatellite_birthdeath.pl @ 0:4e31fad3f08e draft
Uploaded tool tarball.
author | devteam |
---|---|
date | Wed, 25 Sep 2013 11:26:02 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/microsatellite_birthdeath.pl Wed Sep 25 11:26:02 2013 -0400 @@ -0,0 +1,4333 @@ +#!/usr/bin/perl -w +use strict; +use warnings; +use Term::ANSIColor; +use Pod::Checker; +use File::Basename; +use IO::Handle; +use Cwd; +use File::Path qw(make_path remove_tree); +use File::Temp qw/ tempfile tempdir /; +my $tdir = tempdir( CLEANUP => 1 ); +chdir $tdir; +my $dir = getcwd; +# print "current dit=$dir\n"; +#<STDIN>; +use vars qw (%treesToReject %template $printer $interr_poscord $interrcord $no_of_interruptionscord $stringfile @tags +$infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species +$gapcord %thresholdhash $tree_decipherer @sp_ident %revHash %sameHash %treesToIgnore %alternate @exactspecies_orig @exactspecies @exacttags +$mono_flanksimplicityRepno $di_flanksimplicityRepno $prop_of_seq_allowedtoAT $prop_of_seq_allowedtoCG); +use FileHandle; +use IO::Handle; # 5.004 or higher + + + +#my @ar = ("/Users/ydk/work/rhesus_microsat/results/galay/chr22_5sp.maf.txt", "/Users/ydk/work/rhesus_microsat/results/galay/dataset_11.dat", +#"/Users/ydk/work/rhesus_microsat/results/galay/chr22_5spec.maf.summ","hg18,panTro2,ponAbe2,rheMac2,calJac1","((((hg18, panTro2), ponAbe2), rheMac2), calJac1)","9,10,12,12", +#"10","0.8"); +my @ar = @ARGV; +my ($maf, $orth, $summout, $species_set, $tree_definition, $thresholds, $FLANK_SUPPORT, $SIMILARITY_THRESH) = @ar; +$SIMILARITY_THRESH=$SIMILARITY_THRESH/100; +######################### +$SIMILARITY_THRESH = $SIMILARITY_THRESH/100; +my $EDGE_DISTANCE = 10; +my $COMPLEXITY_SUPPORT = 20; +load_thresholds("9_10_12_12"); +my $FLANKINDEL_MAXTHRESH = 0.3; + +my $mono_flanksimplicityRepno=6; +my $di_flanksimplicityRepno=10; +my $prop_of_seq_allowedtoAT=0.5; +my $prop_of_seq_allowedtoCG=0.66; + +######################### +my $tspecies_set = $species_set; + +my %speciesReplacement = (); +my %speciesReplacementTag = (); +my %replacementArr= (); +my %replacementArrTag= (); +my %backReplacementArr= (); +my %backReplacementArrTag= (); +$tree_definition=~s/\s+//g; + +my $tree_definition_split = $tree_definition; +$tree_definition_split =~ s/[\(\)]//g; +my @gotSpecies = ($tree_definition_split =~ /(,)/g); +# print "gotSpecies = @gotSpecies\n"; + +if (scalar(@gotSpecies)+1 ==5){ + + $speciesReplacement{1}="calJac1"; + $speciesReplacement{2}="rheMac2"; + $speciesReplacement{3}="ponAbe2"; + $speciesReplacement{4}="panTro2"; + $speciesReplacement{5}="hg18"; + + $speciesReplacementTag{1}="M"; + $speciesReplacementTag{2}="R"; + $speciesReplacementTag{3}="O"; + $speciesReplacementTag{4}="C"; + $speciesReplacementTag{5}="H"; + $species_set="hg18,panTro2,ponAbe2,rheMac2,calJac1"; + +} +if (scalar(@gotSpecies)+1 ==4){ + + $speciesReplacement{1}="rheMac2"; + $speciesReplacement{2}="ponAbe2"; + $speciesReplacement{3}="panTro2"; + $speciesReplacement{4}="hg18"; + + $speciesReplacementTag{1}="R"; + $speciesReplacementTag{2}="O"; + $speciesReplacementTag{3}="C"; + $speciesReplacementTag{4}="H"; + $species_set="hg18,panTro2,ponAbe2,rheMac2"; + +} + +# $tree_definition = "((((hg18,panTro2),ponAbe2),rheMac2),calJac1)"; +my $tree_definition_copy = $tree_definition; +my $tree_definition_orig = $tree_definition; +my $brackets = 0; + +while (1){ + #last if $tree_definition_copy !~ /\(/; + $brackets++; +# print "brackets = $brackets\n"; + last if $brackets > 6; + $tree_definition_copy =~ s/^\(//g; + $tree_definition_copy =~ s/\)$//g; +# print "tree_definition_copy = $tree_definition_copy\n"; + my @arr = (); + + if ($tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)\)$/){ + @arr = $tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)$/; +# print "arr = @arr\n"; + $tree_definition_copy = $2; + $replacementArr{$1} = $speciesReplacement{$brackets}; + $backReplacementArr{$speciesReplacement{$brackets}}=$1; + + $replacementArrTag{$1} = $speciesReplacementTag{$brackets}; + $backReplacementArrTag{$speciesReplacementTag{$brackets}}=$1; +# print "replacing $1 with $replacementArr{$1}\n"; + + $sp_ident[$brackets-1] = $1; + + } + elsif ($tree_definition_copy =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/){ + @arr = $tree_definition_copy =~ /^([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/; +# print "arr = @arr\n"; + $tree_definition_copy = $1; + $replacementArr{$2} = $speciesReplacement{$brackets}; + $backReplacementArr{$speciesReplacement{$brackets}}=$2; + + $replacementArrTag{$2} = $speciesReplacementTag{$brackets}; + $backReplacementArrTag{$speciesReplacementTag{$brackets}}=$2; +# print "replacing $2 with $replacementArr{$2}\n"; + + $sp_ident[$brackets-1] = $2; + } + elsif ($tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/){ + @arr = $tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/; +# print "arr = @arr .. TERMINAL\n"; + $tree_definition_copy = $1; + $replacementArr{$2} = $speciesReplacement{$brackets}; + $replacementArr{$1} = $speciesReplacement{$brackets+1}; + $backReplacementArr{$speciesReplacement{$brackets}}=$2; + $backReplacementArr{$speciesReplacement{$brackets+1}}=$1; + + $replacementArrTag{$1} = $speciesReplacementTag{$brackets+1}; + $backReplacementArrTag{$speciesReplacementTag{$brackets+1}}=$1; + + $replacementArrTag{$2} = $speciesReplacementTag{$brackets}; + $backReplacementArrTag{$speciesReplacementTag{$brackets}}=$2; +# print "replacing $1 with $replacementArr{$1}\n"; +# print "replacing $2 with $replacementArr{$2}\n"; +# print "replacing $1 with $replacementArrTag{$1}\n"; +# print "replacing $2 with $replacementArrTag{$2}\n"; + + $sp_ident[$brackets-1] = $2; + $sp_ident[$brackets] = $1; + + + last; + + } + elsif ($tree_definition_copy =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_\(\),]+)\)$/){ + $tree_definition_copy =~ s/^\(//g; + $tree_definition_copy =~ s/\)$//g; + $brackets--; + } +} + +foreach my $key (keys %replacementArr){ + my $replacement = $replacementArr{$key}; + $tree_definition =~ s/$key/$replacement/g; +} +@sp_ident = reverse(@sp_ident); +# print "modified tree_definition = $tree_definition\n"; +# print "done .. tree_definition = $tree_definition\n"; +# print "sp_ident = @sp_ident\n"; +#<STDIN>; + + +my $complexity=int($COMPLEXITY_SUPPORT * (1/40)); + +#print "complexity=$complexity\n"; +#<STDIN>; + +$printer = 1; + +my $rando = int(rand(1000)); +my $localdate = `date`; +$localdate =~ /([0-9]+):([0-9]+):([0-9]+)/; +my $info = $rando.$1.$2.$3; + +#--------------------------------------------------------------------------- +# GETTING INPUT INFORMATION AND OPENING INPUT AND OUTPUT FILES + + +my @thresharr = (0, split(/,/,$thresholds)); +my $randno=int(rand(100000)); +my $megamatch = $randno.".megamatch.net.axt"; #"/gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr1.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt"; +my $megamatchlck = $megamatch.".lck"; +unlink $megamatchlck; + +my $selected= $orth; +#my $eventfile = $orth; + $selected = $selected."_SELECTED"; +#$selected = $selected."_".$SIMILARITY_THRESH; +#my $runtime = $selected.".runtime"; + +my $inputtags = "H:C:O:R:M"; +$inputtags = $ARGV[3] if exists $ARGV[3] && $ARGV[3] =~ /[A-Z]:[A-Z]/; + +my @all_tags = split(/:/, $inputtags); +my $inputsp = "hg18:panTro2:ponAbe2:rheMac2:calJac1"; +$inputsp = $ARGV[4] if exists $ARGV[4] && $ARGV[3] =~ /[0-9]+:/; +#@sp_ident = split(/:/,$inputsp); +my $junkfile = $orth."_junk"; + +my $sh = load_sameHash(1); +my $rh = load_revHash(1); + +#print "inputs are : \n"; foreach(@ARGV){print $_,"\n";} +#open (SELECT, ">$selected") or die "Cannot open selected file: $selected: $!"; +open (SUMMARY, ">$summout") or die "Cannot open summout file: $summout: $!"; +#open (RUN, ">$runtime") or die "Cannot open orth file: $runtime: $!"; +#my $ctlfile = "baseml\.ctl"; #$ARGV[4]; +#my $treefile = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/"; #1 THIS IS THE THE TREE UNDER CONSIDERATION, IN NEWICK +my %registeredTrees = (); +my @removalReasons = +("microsatellite is compound", +"complex structure", +"if no. if micros is more than no. of species", +"if more than one micro per species ", +"if microsat contains N", +"different motif than required ", +"more than zero interruptions", +"microsat could not form key ", +"orthologous microsats of different motif size ", +"orthologous microsats of different motifs ", +"microsats belong to different alignment blocks altogether", +"microsat near edge", +"microsat in low complexity region", +"microsat flanks dont align well", +"phylogeny not informative"); +my %allowedhash=(); +#--------------------------------------------------------------------------- +# WORKING ON MAKING THE MEGAMATCH FILE +my $chromt=int(rand(10000)); +my $p_chr=$chromt; + +my $tree_definition_orig_copy = $tree_definition_orig; + +$tree_definition=~s/,/, /g; +$tree_definition =~ s/, +/, /g; +$tree_definition_orig=~s/,/, /g; +$tree_definition_orig =~ s/, +/, /g; +my @exactspeciesset_unarranged = split(/,/,$species_set); +my @exactspeciesset_unarranged_orig = split(/,/,$tspecies_set); +my $largesttree = "$tree_definition;"; +my $largesttree_orig = "$tree_definition_orig;"; +# print "largesttree = $largesttree\n"; +$tree_definition =~ s/\(//g; +$tree_definition =~ s/\)//g; +$tree_definition=~s/[\)\(, ]/\t/g; +$tree_definition =~ s/\t+/\t/g; + +$tree_definition_orig =~ s/\(//g; +$tree_definition_orig =~ s/\)//g; +$tree_definition_orig =~s/[\)\(, ]/\t/g; +$tree_definition_orig =~ s/\t+/\t/g; +# print "tree_definition = $tree_definition tree_definition_orig = $tree_definition_orig\n"; + +my @treespecies=split(/\t+/,$tree_definition); +my @treespecies_orig=split(/\t+/,$tree_definition_orig); +# print "tree_definition = $tree_definition .. treespecies=@treespecies ... treespecies_orig=@treespecies_orig\n"; +#<STDIN>; + +foreach my $spec (@treespecies){ + foreach my $espec (@exactspeciesset_unarranged){ +# print "spec=$spec and espec=$espec\n"; + push @exactspecies, $spec if $spec eq $espec; + } +} + +foreach my $spec (@treespecies_orig){ + foreach my $espec (@exactspeciesset_unarranged_orig){ +# print "spec=$spec and espec=$espec\n"; + push @exactspecies_orig, $spec if $spec eq $espec; + } +} + +my $focalspec = $exactspecies[0]; +my $focalspec_orig = $exactspecies_orig[0]; +# print "exactspecies=@exactspecies ... focalspec=$focalspec\n"; +# print "texactspecies=@exactspecies_orig ... focalspec_orig=$focalspec_orig\n"; +#<STDIN>; +my $arranged_species_set = join(".",@exactspecies); +my $arranged_species_set_orig = join(".",@exactspecies_orig); + +@exacttags=@exactspecies; +my @exacttags_orig=@exactspecies_orig; + +foreach my $extag (@exacttags){ + $extag =~ s/hg18/H/g; + $extag =~ s/panTro2/C/g; + $extag =~ s/ponAbe2/O/g; + $extag =~ s/rheMac2/R/g; + $extag =~ s/calJac1/M/g; +} + +foreach my $extag (@exacttags_orig){ + $extag =~ s/hg18/H/g; + $extag =~ s/panTro2/C/g; + $extag =~ s/ponAbe2/O/g; + $extag =~ s/rheMac2/R/g; + $extag =~ s/calJac1/M/g; +} + +my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt"); +#print "sending to maftoAxt_multispecies: $maf, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n"; + +maftoAxt_multispecies($maf, $tree_definition_orig_copy, $chr_name, $tspecies_set); +#print "made files\n"; +my @filterseqfiles= ($chr_name); +$largesttree =~ s/hg18/H/g; +$largesttree =~ s/panTro2/C/g; +$largesttree =~ s/ponAbe2/O/g; +$largesttree =~ s/rheMac2/R/g; +$largesttree =~ s/calJac1/M/g; +#<STDIN>; +#--------------------------------------------------------------------------- + +my ($lagestnodes, $largestbranches) = get_nodes($largesttree); +shift (@$lagestnodes); +my @extendedtitle=(); + +my $title = (); +my $parttitle = (); +my @titlearr = (); +my @firsttitle=($focalspec_orig."chrom", $focalspec_orig."start", $focalspec_orig."end", $focalspec_orig."motif", $focalspec_orig."motifsize", $focalspec_orig."threshold"); + +my @finames= qw(chr start end motif motifsize microsat mutation mutation.position mutation.from mutation.to insertion.details deletion.details); + +my @fititle=(); + +foreach my $spec (split(",",$tspecies_set)){ + push @fititle, $spec; + foreach my $name (@finames){ + push @fititle, $spec.".".$name; + } +} + + +my @othertitle=qw(somechr somestart somened event source); + +my @fnames = (); +push @fnames, qw(insertions_num deletions_num motinsertions_num motinsertionsf_num motdeletions_num motdeletionsf_num noninsertions_num nondeletions_num) ; +push @fnames, qw(binsertions_num bdeletions_num bmotinsertions_num bmotinsertionsf_num bmotdeletions_num bmotdeletionsf_num bnoninsertions_num bnondeletions_num) ; +push @fnames, qw(dinsertions_num ddeletions_num dmotinsertions_num dmotinsertionsf_num dmotdeletions_num dmotdeletionsf_num dnoninsertions_num dnondeletions_num) ; +push @fnames, qw(ninsertions_num ndeletions_num nmotinsertions_num nmotinsertionsf_num nmotdeletions_num nmotdeletionsf_num nnoninsertions_num nnondeletions_num) ; +push @fnames, qw(substitutions_num bsubstitutions_num dsubstitutions_num nsubstitutions_num indels_num subs_num); + +my @fullnames = (); +# print "revising\n"; +# print "H = $backReplacementArrTag{H}\n"; +# print "C = $backReplacementArrTag{C}\n"; +# print "O = $backReplacementArrTag{O}\n"; +# print "R = $backReplacementArrTag{R}\n"; +# print "M = $backReplacementArrTag{M}\n"; + +foreach my $lnode (@$lagestnodes){ + my @pair = @$lnode; + my @nodemutarr = (); + for my $p (@pair){ +# print "p = $p\n"; + $p =~ s/[\(\), ]+//g; + $p =~ s/([A-Z])/$1./g; + $p =~ s/\.$//g; + + $p =~ s/H/$backReplacementArrTag{H}/g; + $p =~ s/C/$backReplacementArrTag{C}/g; + $p =~ s/O/$backReplacementArrTag{O}/g; + $p =~ s/R/$backReplacementArrTag{R}/g; + $p =~ s/M/$backReplacementArrTag{M}/g; + foreach my $n (@fnames) { push @fullnames, $p.".".$n;} + } +} + +#print SUMMARY "#",join("\t", @firsttitle, @fititle, @othertitle); + +#print SUMMARY "\t",join("\t", @fullnames); +my $header = join("\t",@firsttitle, @fititle, @othertitle, @fullnames, @fnames, "tree", "cleancase"); +# print "header= $header\n"; +#<STDIN>; + +#print SUMMARY "\t",join("\t", @fnames); +#$title= $title."\t".join("\t", @fnames); + +#print SUMMARY "\t","tree","\t", "cleancase", "\n"; +#$title= $title."\t"."tree"."\t"."cleancase". "\n"; + +#print $title; #<STDIN>; + +#print "all_tags = @all_tags\n"; + +for my $no (3 ... $#all_tags+1){ +# print "no=$no\n"; #<STDIN>; + @tags = @all_tags[0 ... $no-1]; +# print "all_tags=>@all_tags< , tags = >@tags<\n" if $printer == 1; #<STDIN>; + %template=(); + my @nextcounter = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + #next if scalar(@tags) < 4; +# print "now doing tags = @tags, no = $no\n"; + open (ORTH, "<$orth") or die "Cannot open orth file: $orth: $!"; + +# print SUMMARY join "\t", qw (species chr start end branch motif microsat mutation position from to insertion deletion); + + + ##################### T E M P O R A R Y ##################### + my @finaltitle=(); + my @singletitle = qw (species chr start end motif motifsize microsat strand microsatsize col10 col11 col12 col13); + my $endtitle = (); + foreach my $tag (@tags){ + my @tempsingle = (); + + foreach my $single (@singletitle){ + push @tempsingle, $tag.$single; + } + @finaltitle = (@finaltitle, @tempsingle); + } + +# print SUMMARY join("\t",@finaltitle),"\n"; + + ############################################################# + + #--------------------------------------------------------------------------- + # GET THE TREE FROM TREE FILE + my $tree = (); + $tree = "((H, C), O)" if $no == 3; + $tree = "(((H, C), O), R)" if $no == 4; + $tree = "((((H, C), O), R), M)" if $no == 5; +# $tree=~s/;$//g; +# print "our tree = $tree\n"; + #--------------------------------------------------------------------------- + # LOADING HASH CONTAINING ALL POSSIBLE TREES: + $tree_decipherer = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/tree_analysis_".join("",@tags).".txt"; + %template=(); + %alternate=(); + load_allPossibleTrees($tree_decipherer, \%template, \%alternate); + + #--------------------------------------------------------------------------- + # LOADING THE TREES TO REJECT FOR BIRTH ANALYSIS + %treesToReject=(); + %treesToIgnore=(); + load_treesToReject(@tags); + load_treesToIgnore(@tags); + #--------------------------------------------------------------------------- + # LOADING INPUT DATA INTO HASHES AND ARRAYS + + + #1 THIS IS THE POINT WHERE WE CAN FILTER OUT LARGE MICROSAT CLUSTERS + #2 AS WELL AS MULTIPLE-ALIGNMENT-BLOCKS-SPANNING MICROSATS (KIND OF + #3 IMPLICIT IN THE FIRST PART OF THE SENTENCE ITSELF IN MOST CASES). + + my %orths=(); + my $counterm = 0; + my $loaded = 0; + my %seen = (); + my @allowedchrs = (); +# print "no = $no\n"; #<STDIN>; + + while (my $line = <ORTH>){ + # print "line=$line\n"; + my $register1 = $line =~ s/>$exactspecies_orig[0]/>$replacementArrTag{$exactspecies_orig[0]}/g; + my $register2 = $line =~ s/>$exactspecies_orig[1]/>$replacementArrTag{$exactspecies_orig[1]}/g; + my $register3 = $line =~ s/>$exactspecies_orig[2]/>$replacementArrTag{$exactspecies_orig[2]}/g; + my $register4 = $line =~ s/>$exactspecies_orig[3]/>$replacementArrTag{$exactspecies_orig[3]}/g; + my $register5 = $line =~ s/>$exactspecies_orig[4]/>$replacementArrTag{$exactspecies_orig[4]}/g if exists $exactspecies_orig[4]; + + # print "line = $line\n"; <STDIN>; + + + # next if $register1 + $register2 + $register3 + $register4 + $register5 > scalar(@tags); + my @micros = split(/>/,$line); # LOADING ALL THE MICROSAT ENTRIES FROM THE CLUSTER INTO @micros + #print "micros=",printarr(@micros),"\n"; #<STDIN>; + shift @micros; # EMPTYING THE FIRST, EMTPY ELEMENT OF THE ARRAY + + $no_of_species = adjustCoordinates($micros[0]); +# print "A: $no_of_species\n"; + next if $no_of_species != $no; +# print "no = $no ... no_of_species=$no_of_species\n";#<STDIN>; + $counterm++; + #------------------------------------------------ + $nextcounter[0]++ if $line =~ /compound/; + next if $line =~ /compound/; # GETTING RID OF COMPOUND MICROSATS + #------------------------------------------------ + #next if $line =~ /[A-Za-z]>[a-zA-Z]/; + #------------------------------------------------ + chomp $line; + my $match_count = ($line =~ s/>/>/g); # COUNTING THE NUMBER OF MICROSAT ENTRIES IN THE CLUSTER + #print "number of species = $match_count\n"; + my $stopper = 0; + foreach my $mic (@micros){ + my @local = split(/\t/,$mic); + if ($local[$typecord] =~ /\./ || exists($local[$no_of_interruptionscord+2])) {$stopper = 1; $nextcounter[1]++; + last; } + # REMOVING CLUSTERS WITH THE CYRPTIC, (UNRESOLVABLY COMPLEX) MICROSAT ENTRIES IN THEM + } + next if $stopper ==1; + #------------------------------------------------ + $nextcounter[2]++ if (scalar(@micros) >$no_of_species); + + next if (scalar(@micros) >$no_of_species); #1 REMOVING MICROSAT CLUSTERS WITH MORE NUMBER OF MICROSAT ENTRIES THAN THE NUMBER OF SPECIES IN THE DATASET. + #2 THIS IS SO BECAUSE SUCH CLUSTERS IMPLY THAT IN AT LEAST ONE SPECIES, THERE IS MORE THAN ONE MICROSAT ENTRY + #3 IN THE CLUSTER. THUS, HERE WE ARE GETTING RID OF MICROSATS CLUSTERS THAT INCLUDE MULTUPLE, NEIGHBORING + #4 MICROSATS, AND STICK TO CLEAN MICROSATS THAT DO NOT HAVE ANY MICROSATS IN NEIGHBORHOOD. + #5 THIS 'NEIGHBORHOOD-RANGE' HAD BEEN DECIDED PREVIOUSLY IN OUR CODE multiSpecies_orthFinder4.pl + my $nexter = 0; + foreach my $tag (@tags){ + my $tagcount = ($line =~ s/>$tag\t/>$tag\t/g); + if ($tagcount > 1) { $nexter =1; #print colored ['red'],"multiple entires per species : $tagcount of $tag\n" if $printer == 1; + next; + } + } + + if ($nexter == 1){ + $nextcounter[3]++; + next; + } + #------------------------------------------------ + foreach my $mic (@micros){ #1 REMOVING MICROSATELLITES WITH ANY 'N's IN THEM + my @local = split(/\t/,$mic); + if ($local[$microsatcord] =~ /N/) {$stopper =1; $nextcounter[4]++; + last;} + } + next if $stopper ==1; + #print "till here 1\n"; #<STDIN>; + #------------------------------------------------ + my @micros_copy = @micros; + + my $tempmicro = shift(@micros_copy); #1 CURRENTLY OBTAINING INFORMATION FOR THE FIRST + #2 MICROSAT IN THE CLUSTER. + my @tempfields = split(/\t/,$tempmicro); + my $prevtype = $tempfields[$typecord]; + my $tempmotif = $tempfields[$motifcord]; + + my $tempfirstmotif = (); + if (scalar(@tempfields) > $microsatcord + 2){ + if ($tempfields[$no_of_interruptionscord] >= 1) { #1 DISCARDING MICROSATS WITH MORE THAN ZERO INTERRUPTIONS + #2 IN THE FIRST MICROSAT OF THE CLUSTER + $nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1; + } + } + if ($nexter == 1){ + $nextcounter[6]++; + next; + } #1 DONE OBTAINING INFORMATION REGARDING + #2 THE FIRST MICROSAT FROM THE CLUSTER + + if ($tempmotif =~ /^\[/){ + $tempmotif =~ s/^\[//g; + $tempmotif =~ /([a-zA-Z]+)\].*/; + $tempfirstmotif = $1; #1 OBTAINING THE FIRTS MOTIF OF MICROSAT + } + else {$tempfirstmotif = $tempmotif;} + my $prevmotif = $tempfirstmotif; + + my $key = (); + # print "searching temp micro for 0-9 $focalspec chr0-9a-zA-Z 0-9 0-9 \n"; + # print "tempmicro = $tempmicro .. looking for ([0-9]+)\s+($focalspec_orig)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\n"; <STDIN>; + if ($tempmicro =~ /([0-9]+)\s+($focalspec_orig)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { + # print "B: $no_of_species\n"; + $key = join("_",$2, $3, $4, $5); + } + else{ +# print "counld not form a key for temp\n"; # if $printer == 1; + $nextcounter[7]++; + next; + } + #----------------- #1 NOW, AFTER OBTAINING INFORMATION ABOUT + #2 THE FIRST MICROSAT IN THE CLUSTER, THE + #3 FOLLOWING LOOP GOES THROUGH THE OTHER MICROSATS + #4 TO SEE IF THEY SHARE THE REQUIRED FEATURES (BELOW) + + foreach my $micro (@micros_copy){ + my @fields = split(/\t/,$micro); + #----------------- + if (scalar(@fields) > $microsatcord + 2){ #1 DISCARDING MICROSATS WITH MORE THAN ONE INTERRUPTIONS + if ($fields[$no_of_interruptionscord] >= 1) {$nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1; + $nextcounter[6]++; + last; } + } + #----------------- + if (($prevtype ne "0") && ($prevtype ne $fields[$typecord])) { + $nexter =1; #print colored ['yellow'],"microsat of different type \n" if $printer == 1; + $nextcounter[8]++; + last; } #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG + #----------------- #2 TO DIFFERENT TYPES (MONOS, DIS, TRIS ETC.) + $prevtype = $fields[$typecord]; + + my $motif = $fields[$motifcord]; + my $firstmotif = (); + + if ($motif =~ /^\[/){ + $motif =~ s/^\[//g; + $motif =~ /([a-zA-Z]+)\].*/; + $firstmotif = $1; + } + else {$firstmotif = $motif;} + + my $motifpattern = $firstmotif.$firstmotif; + my $prevmotifpattern = $prevmotif.$prevmotif; + + if (($prevmotif ne "0")&&(($motifpattern !~ /$prevmotif/i)||($prevmotifpattern !~ /$firstmotif/i)) ) { + $nexter =1; #print colored ['green'],"different motifs used \n$line\n" if $printer == 1; + $nextcounter[9]++; + last; + } #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG + #2 TO DIFFERENT MOTIFS + my $prevmotif = $firstmotif; + #----------------- + + for my $t (0 ... $#tags){ #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSAT ENTRIES BELONG + #2 DIFFERENT ALIGNMENT BLOCKS + if ($micro =~ /([0-9]+)\s+($focalspec_orig)\s([_0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { + my $key2 = join("_",$2, $3, $4, $5); +# print "key = $key .. key2 = $key2\n"; #<STDIN>; + if ($key2 ne $key){ +# print "microsats belong to diffferent alignment blocks altogether\n" if $printer == 1; + $nextcounter[10]++; + $nexter = 1; last; + } + } + else{ +# print "counld not form a key for $line\n"; # if $printer == 1; + #<STDIN>; + $nexter = 1; last; + } + } + } + #print "D2: $no_of_species\n"; + + ##################### + if ($nexter == 1){ +# print "nexting\n"; # if $printer == 1; + next; + } + else{ +# print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n$key:\n$line\nvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n" if $printer == 1; + push (@{$orths{$key}},$line); + $loaded++; + if ($line =~ /($focalspec_orig)\s([_a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) { + +# print "$line\n" if $printer == 1; #if $line =~ /Contig/; +# print "################ ################\n" if $printer == 1; + push @allowedchrs, $2 if !exists $allowedhash{$2}; + $allowedhash{$2} = 1; + my $key = join("\t",$1, $2, $3, $4); +# print "C: $no_of_species .. key = $key\n";#<STDIN>; +# print "print the shit: $key\n" ; #if $printer == 1; + $seen{$key} = 1; + } + else { #print "Key could not be formed in SPUT for ($focalspec_orig) (chrom) ([0-9]+) ([0-9]+)\n"; + } + } + } + close ORTH; +# print "now studying where we lost microsatellites: @nextcounter\n"; + for my $reason (0 ... $#nextcounter){ + #print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n"; + } +# print "\ntotal number of keys formed = ", scalar(keys %orths), " = \n"; +# print "done filtering .. counterm = $counterm and loaded = $loaded\n"; + #---------------------------------------------------------------------------------------------------------------- + # NOW GENERATING THE ALIGNMENT FILE WITH RELELEVENT ALIGNMENTS STORED ONLY. +# print "adding files @filterseqfiles \n"; + #<STDIN>; + + while (1){ + if (-e $megamatchlck){ +# print "waiting to write into $megamatchlck\n"; + sleep 10; + } + else{ + open (MEGAMLCK, ">$megamatchlck") or die "Cannot open megamatchlck file $megamatchlck: $!"; + open (MEGAM, ">$megamatch") or die "Cannot open megamatch file $megamatch: $!"; + last; + } + } + foreach my $seqfile (@filterseqfiles){ + my $fullpath = $seqfile; + open (MATCH, "<$fullpath") or die "Cannot open MATCH file $fullpath: $!"; + my $matchlines = 0; + + while (my $line = <MATCH>) { + #print "checking $line"; + if ($line =~ /($focalspec_orig)\s([a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) { + my $key = join("\t",$1, $2, $3, $4); +# print "key = $key\n"; + #print "------------------------------------------------------\n"; + #print "asking $line\n"; + if (exists $seen{$key}){ + + #print "seen $line \n"; <STDIN>; + while (1){ + $matchlines++; + print MEGAM $line; + $line = <MATCH>; + print MEGAM "\n" if $line !~ /[0-9a-zA-Z]/; + last if $line !~/[0-9a-zA-Z]/; + } + } + else{ +# print "not seen\n"; + } + } + } +# print "matchlines = $matchlines\n"; + close MATCH; + } + close MEGAMLCK; + + unlink $megamatchlck; + close MEGAM; + undef %seen; +# print "done writitn to $megamatch\n";#<STDIN>; + #---------------------------------------------------------------------------------------------------------------- + #<STDIN>; + #--------------------------------------------------------------------------- + # NOW, AFTER FILTERING MANY MICROSATS, AND LOADING THE FILTERED ONES INTO + # THE HASH %orths , WE GO THROUGH THE ALIGNMENT FILE, AND STUDY THE + # FLANKING SEQUENCES OF ALL THESE MICROSATS, TO FILTER THEM FURTHER + #$printer = 1; + + my $microreadcounter=0; + my $contigsentered=0; + my $contignotrightcounter=0; + my $keynotformedcounter=0; + my $keynotfoundcounter= 0; + my $dotcounter = 0; + +# print "opening $megamatch\n"; + + open (BO, "<$megamatch") or die "Cannot open alignment file: $megamatch: $!"; +# print "doing $megamatch\n " ; + + #<STDIN>; + + while (my $line = <BO>){ +# print $line; #<STDIN>; +# print "." if $dotcounter % 100 ==0; +# print "\n" if $dotcounter % 5000 ==0; +# print "dotcounter = $dotcounter\n " if $printer == 1; + next if $line !~ /^[0-9]+/; + $dotcounter++; +# print colored ['green'], "~" x 60, "\n" if $printer == 1; +# print colored ['green'], $line;# if $printer == 1; + chomp $line; + my @fields2 = split(/\t/,$line); + my $key2 = (); + my $alignment_no = (); #1 TEMPORARY + if ($line =~ /([0-9]+)\s+($focalspec_orig)\s([_\-s0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { +# $key2 = join("\t",$1, $2, $4, $5); + $key2 = join("_",$2, $3, $4, $5); +# print "key = $key2\n"; + +# print "key = $key2\n"; + $alignment_no=$1; + } + else {print "seq line $line incompatible\n"; $keynotformedcounter++; next;} + + $no_of_species = adjustCoordinates($line); + + $contignotrightcounter++ if $no_of_species != $no; +# print "contignotrightcounter=$contignotrightcounter\n"; +# print "no_of_species=$no_of_species\n"; +# print "no=$no\n"; + + next if $no_of_species != $no; +# print "D: $no_of_species\n"; +# print "E: $no_of_species\n"; + #<STDIN>; + # print "key = $key2\n" if $printer == 1; + my @clusters = (); #1 EXTRACTING MICROSATS CORRESPONDING TO THIS + #2 ALIGNMENT BLOCK + if (exists($orths{$key2})){ + @clusters = @{$orths{$key2}}; + $contigsentered++; + delete $orths{$key2}; + } + else{ +# print "orth does not exist\n"; + $keynotfoundcounter++; + next; + } + + my %sequences=(); #1 WILL STORE SEQUENCES IN THE CURRENT ALIGNMENT BLOCK + my $humseq = (); + foreach my $tag (@tags){ #1 READING THE ALIGNMENT FILE AND CAPTURING SEQUENCES + my $seq = <BO>; #2 OF ALL SPECIES. + chomp $seq; + $sequences{$tag} = " ".$seq; + #print "sequences = $sequences{$tag}\n" if $printer == 1; + $humseq = $seq if $tag =~ /H/; + } + + + foreach my $cluster (@clusters){ #1 NOW, GOING THROUGH THE CLUSTER OF MICROSATS +# print "x" x 60, "\n" if $printer == 1; +# print colored ['red'],"cluster = $cluster\n"; + $largesttree =~ s/hg18/H/g; + $largesttree =~ s/panTro2/C/g; + $largesttree =~ s/ponAbe2/O/g; + $largesttree =~ s/rheMac2/R/g; + $largesttree =~ s/calJac1/M/g; + + $microreadcounter++; + my @micros = split(/>/,$cluster); + + + + + + + shift @micros; + + my $edge_microsat=0; #1 THIS WILL HAVE VALUE "1" IF MICROSAT IS FOUND + #2 TO BE TOO CLOSE TO THE EDGES OF ALIGNMENT BLOCK + + my @starts= (); my %start_hash=(); #1 STORES THE START AND END COORDINATES OF MICROSATELLITES + my @ends = (); my %end_hash=(); #2 SO THAT LATER, WE WILL BE ABLE TO FIND THE EXTREME + #3 COORDINATE VALUES OF THE ORTHOLOGOUS MIROSATELLITES. + + my %microhash=(); + my %microsathash=(); + my %nonmicrosathash=(); + my $motif=(); #1 BASIC MOTIF OF THE MICROSATELLITE.. THERE'S ONLY 1 +# print "tags=@tags\n"; + for my $i (0 ... $#tags){ #1 FINDING THE MICROSAT, AND THE ALIGNMENT SEQUENCE + #2 CORRESPONDING TO THE PARTICULAR SPECIES (AS PER + #3 THE VARIABLE $TAG; + my $tag = $tags[$i]; + # print $seq; + my $locus="NULL"; #1 THIS WILL STORE THE MICROSAT OF THIS SPECIES. + #2 IF THERE IS NO MICROSAT, IT WILL REMAIN "NULL" + + foreach my $micro (@micros){ + # print "micro=$micro, tag=$tag\n"; + if ($micro =~ /^$tag/){ #1 MICROSAT OF THIS SPECIES FOUND.. + $locus = $micro; + my @fields = split(/\t/,$micro); + $motif = $fields[$motifcord]; + $microsathash{$tag}=$fields[$microsatcord]; + # print "fields=@fields, and startcord=$startcord = $fields[$startcord]\n"; + push(@starts, $fields[$startcord]); + push(@ends, $fields[$endcord]); + $start_hash{$tag}=$fields[$startcord]; + $end_hash{$tag}=$fields[$endcord]; + last; + } + else{$microsathash{$tag}="NULL"} + } + $microhash{$tag}=$locus; + + } + + + + my $extreme_start = smallest_number(@starts); #1 THESE TWO ARE THE EXTREME COORDINATES OF THE + my $extreme_end = largest_number(@ends); #2 MICROSAT CLUSTER ACCROSS ALL THE SPECIES IN + #3 WHOM IT IS FOUND TO BE ORTHOLOGOUS. + +# print "starts=@starts... ends=@ends\n"; + + my %up_flanks = (); #1 CONTAINS UPSTEAM FLANKING REGIONS FOR EACH SPECIES + my %down_flanks = (); #1 CONTAINS DOWNDTREAM FLANKING REGIONS FOR EACH SPECIES + + my %up_largeflanks = (); + my %down_largeflanks = (); + + my %locusandflanks = (); + my %locusandlargeflanks = (); + + my %up_internal_flanks=(); #1 CONTAINS SEQUENCE BETWEEN THE $extreme_start and the + #2 ACTUAL START OF MICROSATELLITE IN THE SPECIES + my %down_internal_flanks=(); #1 CONTAINS SEQUENCE BETWEEN THE $extreme_end and the + #2 ACTUAL end OF MICROSATELLITE IN THE SPECIES + + my %alignment=(); #1 CONTAINS ACTUAL ALIGNMENT SEQUENCE BETWEEN THE TWO + #2 EXTEME VALUES. + + my %microsatstarts=(); #1 WITHIN EACH ALIGNMENT, IF THERE EXISTS A MICROSATELLITE + #2 THIS HASH CONTAINS THE START SITE OF THE MICROSATELLITE + #3 WIHIN THE ALIGNMENT + next if !defined $extreme_start; + next if !defined $extreme_end; + next if $extreme_start > length($sequences{$tags[0]}); + next if $extreme_start < 0; + next if $extreme_end > length($sequences{$tags[0]}); + + for my $i (0 ... $#tags){ #1 NOW THAT WE HAVE GATHERED INFORMATION REGARDING + #2 SEQUENCE ALIGNMENT AND MICROSATELLITE COORDINATES + #3 AS WELL AS THE EXTREME COORDINATES OF THE + #4 MICROSAT CLUSTER, WE WILL PROCEED TO EXTRACT THE + #5 FLANKING SEQUENCE OF ALL ORGS, AND STUDY IT IN + #6 MORE DETAIL. + my $tag = $tags[$i]; +# print "tag=$tag.. seqlength = ",length($sequences{$tag})," extreme_start=$extreme_start and extreme_end=$extreme_end\n"; + my $upstream_gaps = (substr($sequences{$tag}, 0, $extreme_start) =~ s/\-/-/g); #1 NOW MEASURING THE NUMBER OF GAPS IN THE UPSTEAM + #2 AND DOWNSTREAM SEQUENCES OF THE MICROSATs IN THIS + #3 CLUSTER. +# print "seq length $tag = $sequences{$tag} = ",length($sequences{$tag})," extreme_end=$extreme_end\n" ; + my $downstream_gaps = (substr($sequences{$tag}, $extreme_end) =~ s/\-/-/g); + if (($extreme_start - $upstream_gaps )< $EDGE_DISTANCE || (length($sequences{$tag}) - $extreme_end - $downstream_gaps) < $EDGE_DISTANCE){ + $edge_microsat=1; + + last; + } + else{ + $up_flanks{$tag} = substr($sequences{$tag}, $extreme_start - $FLANK_SUPPORT, $FLANK_SUPPORT); + $down_flanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $FLANK_SUPPORT); + + $up_largeflanks{$tag} = substr($sequences{$tag}, $extreme_start - $COMPLEXITY_SUPPORT, $COMPLEXITY_SUPPORT); + $down_largeflanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $COMPLEXITY_SUPPORT); + + + $alignment{$tag} = substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1); + $locusandflanks{$tag} = $up_flanks{$tag}."[".$alignment{$tag}."]".$down_flanks{$tag}; + $locusandlargeflanks{$tag} = $up_largeflanks{$tag}."[".$alignment{$tag}."]".$down_largeflanks{$tag}; + + if ($microhash{$tag} ne "NULL"){ + $up_internal_flanks{$tag} = substr($sequences{$tag}, $extreme_start , $start_hash{$tag}-$extreme_start); + $down_internal_flanks{$tag} = substr($sequences{$tag}, $end_hash{$tag} , $extreme_end-$end_hash{$tag}); + $microsatstarts{$tag}=$start_hash{$tag}-$extreme_start; +# print "tag = $tag, internal flanks = $up_internal_flanks{$tag} and $down_internal_flanks{$tag} and start = $microsatstarts{$tag}\n" if $printer == 1; + } + else{ + $nonmicrosathash{$tag}=substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1); + + } + # print "up flank for species $tag = $up_flanks{$tag} \ndown flank for species $tag = $down_flanks{$tag} \n" if $printer == 1; + + } + + } + $nextcounter[11]++ if $edge_microsat==1; + next if $edge_microsat==1; + + + my $low_complexity = 0; #1 VALUE WILL BE 1 IF ANY OF THE FLANKING REGIONS + #2 IS FOUND TO BE OF LOW COMPLEXITY, BY USING THE + #3 FUNCTION sub test_complexity + + + for my $i (0 ... $#tags){ +# print "i = $tags[$i]\n" if $printer == 1; + if (test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW" || test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW"){ +# print "i = $i, low complexity regions: $up_largeflanks{$tags[$i]}: ",test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT), " and $down_largeflanks{$tags[$i]} = ",test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT),"\n" if $printer == 1; + $low_complexity =1; last; + } + } + + $nextcounter[12]++ if $low_complexity==1; + next if $low_complexity == 1; + + + my $sequence_dissimilarity = 0; #1 THIS VALYE WILL BE 1 IF THE SEQUENCE SIMILARITY + #2 BETWEEN ANY OF THE SPECIES AGAINST THE HUMAN + #3 FLANKING SEQUENCES IS BELOW A CERTAIN THRESHOLD + #4 AS DESCRIBED IN FUNCTION sub sequence_similarity + my %donepair = (); + for my $i (0 ... $#tags){ + # print "i = $tags[$i]\n" if $printer == 1; +# next if $i == 0; + # print colored ['magenta'],"THIS IS UP\n" if $printer == 1; + + for my $b (0 ... $#tags){ + next if $b == $i; + my $pair = (); + $pair = $i."_".$b if $i < $b; + $pair = $b."_".$i if $b < $i; + next if exists $donepair{$pair}; + my ($up_similarity,$upnucdiffs, $upindeldiffs) = sequence_similarity($up_flanks{$tags[$i]}, $up_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info); + my ($down_similarity,$downnucdiffs, $downindeldiffs) = sequence_similarity($down_flanks{$tags[$i]}, $down_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info); + $donepair{$pair} = $up_similarity."_".$down_similarity; + +# print RUN "$up_similarity $upnucdiffs $upindeldiffs $down_similarity $downnucdiffs $downindeldiffs\n"; + + if ( $up_similarity < $SIMILARITY_THRESH || $down_similarity < $SIMILARITY_THRESH){ + $sequence_dissimilarity =1; + last; + } + } + } + $nextcounter[13]++ if $sequence_dissimilarity==1; + + next if $sequence_dissimilarity == 1; + my ($simplified_microsat, $Hchrom, $Hstart, $Hend, $locusmotif, $locusmotifsize) = summarize_microsat($cluster, $humseq); +# print "simplified_microsat=$simplified_microsat\n"; + my ($tree_analysis, $conformation) = treeStudy($simplified_microsat); +# print "tree_analysis = $tree_analysis .. conformation=$conformation\n"; + #<STDIN>; + +# print SELECT "\"$conformation\"\t$tree_analysis\n"; + + next if $tree_analysis =~ /DISCARD/; + if (exists $treesToReject{$tree_analysis}){ + $nextcounter[14]++; + next; + } + +# print "F: $no_of_species\n"; + +# my $adjuster=(); +# if ($no_of_species == 4){ +# my @sields = split(/\t/,$simplified_microsat); +# my $somend = pop(@sields); +# my $somestart = pop(@sields); +# my $somechr = pop(@sields); +# $adjuster = "NA\t" x 13 ; +# $simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend; +# } +# if ($no_of_species == 3){ +# my @sields = split(/\t/,$simplified_microsat); +# my $somend = pop(@sields); +# my $somestart = pop(@sields); +# my $somechr = pop(@sields); +# $adjuster = "NA\t" x 26 ; +# $simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend; +# } +# + $registeredTrees{$tree_analysis} = 1 if !exists $registeredTrees{$tree_analysis}; + $registeredTrees{$tree_analysis}++ if exists $registeredTrees{$tree_analysis}; + + if (exists $treesToIgnore{$tree_analysis}){ + my @appendarr = (); + +# print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; + #print "SUMMARY ",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; +# print SELECT $Hchrom,"\t",$Hstart,"\t",$Hend,"\t","NOEVENT", "\t\t", $cluster,"\n"; + + foreach my $lnode (@$lagestnodes){ + my @pair = @$lnode; + my @nodemutarr = (); + for my $p (@pair){ + my @mutinfoarray1 = (); + for (1 ... 38){ +# push (@mutinfoarray1, "NA") + } +# print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; +# print join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; + } + + } + for (1 ... 38){ + push (@appendarr, "NA") + } +# print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n"; +# print join ("\t", @appendarr,"NULL", "NULL"),"\n"; + # print "SUMMARY ",join ("\t", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>; + next; + } +# print colored ['blue'],"cluster = $cluster\n"; + + my ($mutations_array, $nodes, $branches_hash, $alivehash, $primaryalignment) = peel_onion($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts); + + if ($mutations_array eq "NULL"){ + # print "cluster = $cluster \n"; <STDIN>; + my @appendarr = (); + + # print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize, "\t"; + + # foreach my $lnode (@$lagestnodes){ + # my @pair = @$lnode; + # my @nodemutarr = (); + # for my $p (@pair){ + # my @mutinfoarray1 = (); + # for (1 ... 38){ + # push (@mutinfoarray1, "NA") + # } + # print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; + # print join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; + # } + # } + # for (1 ... 38){ + # push (@appendarr, "NA") + # } + # print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n"; + # print join ("\t", @appendarr,"NULL", "NULL"),"\n"; + # print join ("\t","SUMMARY", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>; + next; + } + + +# print "sent: \n" if $printer == 1; +# print "nodes = @$nodes, branches array:\n" if $mutations_array ne "NULL" && $printer == 1; + + my ($newstatus, $newmutations_array, $newnodes, $newbranches_hash, $newalivehash, $finalalignment) = fillAlignmentGaps($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts); +# print "newmutations_array returned = \n",join("\n",@$newmutations_array),"\n" if $newmutations_array ne "NULL" && $printer == 1; + my @finalmutations_array= (); + @finalmutations_array = selectMutationArray($mutations_array, $newmutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array ne "NULL"; + @finalmutations_array = selectMutationArray($mutations_array, $mutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array eq "NULL"; +# print "alt = $alternate{$conformation}\n"; + + my ($besttree, $treescore) = selectBetterTree($tree_analysis, $alternate{$conformation}, \@finalmutations_array); + my $cleancase = "UNCLEAN"; + $cleancase = checkCleanCase($besttree, $finalalignment) if $treescore > 0 && $finalalignment ne "NULL" && $finalalignment =~ /\!/; + $cleancase = checkCleanCase($besttree, $primaryalignment) if $treescore > 0 && $finalalignment eq "NULL" && $primaryalignment =~ /\!/ && $primaryalignment ne "NULL"; + $cleancase = "CLEAN" if $finalalignment eq "NULL" && $primaryalignment !~ /\!/ && $primaryalignment ne "NULL"; + $cleancase = "CLEAN" if $finalalignment ne "NULL" && $finalalignment !~ /\!/ ; +# print "besttree = $besttree ... cleancase=$cleancase\n"; #<STDIN>; + + my @selects = ("-C","+C","-H","+H","-HC","+HC","-O","+O","-H.-C","-H.-O","-HC,+C","-HC,+H","-HC.-O","-HCO,+HC","-HCO,+O","-O.-C","-O.-H", + "+C.+O","+H.+C","+H.+O","+HC,-C","+HC,-H","+HC.+O","+HCO,-C","+HCO,-H","+HCO,-HC","+HCO,-O","+O.+C","+O.+H","+H.+C.+O","-H.-C.-O","+HCO","-HCO"); + next if (oneOf(@selects, $besttree) == 0); + if ( ($besttree =~ /,/ || $besttree =~ /\./) && $cleancase eq "UNCLEAN"){ + $besttree = "$besttree / $tree_analysis"; + } + + $besttree = "NULL" if $treescore <= 0; + + while ($besttree =~ /[A-Z][A-Z]/){ + $besttree =~ s/([A-Z])([A-Z])/$1:$2/g; + } + + if ($besttree !~ /NULL/){ + my @elements = ($besttree =~ /([A-Z])/g); + + foreach my $ele (@elements){ +# print "replacing $ele with $backReplacementArrTag{$ele}\n"; + $besttree =~ s/$ele/$backReplacementArrTag{$ele}/g if exists $backReplacementArrTag{$ele}; + } + } + my $endendstate = $focalspec_orig.".".$Hchrom."\t".$Hstart."\t".$Hend."\t".$locusmotif."\t".$locusmotifsize."\t".$tree_analysis."\t"; + next if $endendstate =~ /NA\tNA\tNA/; + + print SUMMARY $focalspec_orig,".",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t"; +# print "SUMMARY\t", $focalspec_orig,".",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t",$tree_analysis,"\t" ; + + my @mutinfoarray =(); + + foreach my $lnode (@$lagestnodes){ + my @pair = @$lnode; + my $joint = "(".join(", ",@pair).")"; + my @nodemutarr = (); + + for my $p (@pair){ + foreach my $mut (@finalmutations_array){ + $mut =~ /node=([A-Z, \(\)]+)/; + push @nodemutarr, $mut if $p eq $1; + } + @mutinfoarray = summarizeMutations(\@nodemutarr, $besttree); + + # print SUMMARY join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t"; + # print join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t"; + } + } + +# print "G: $no_of_species\n"; + + my @alignmentarr = (); + + foreach my $key (keys %alignment){ + push @alignmentarr, $backReplacementArrTag{$key}.":".$alignment{$key}; + + } +# print "alignmentarr = @alignmentarr"; <STDIN>; + + @mutinfoarray = summarizeMutations(\@finalmutations_array, $besttree); + print SUMMARY join ("\t", @mutinfoarray ),"\t"; + print SUMMARY join(",",@alignmentarr),"\n"; +# print join("\t","--------------","\n",$besttree, join("",@tags)),"\n" if scalar(@tags) < 5; +# <STDIN> if scalar(@tags) < 5; +# print $cleancase, "\n"; +# print join ("\t", @mutinfoarray,$cleancase,join(",",@alignmentarr)),"\n"; #<STDIN>; +# print "summarized\n"; #<STDIN>; + + + + my %indelcatch = (); + my %substcatch = (); + my %typecatch = (); + my %nodescatch = (); + my $mutconcat = join("\t", @finalmutations_array)."\n"; + my %indelposcatch = (); + my %subsposcatch = (); + + foreach my $fmut ( @finalmutations_array){ +# next if $fmut !~ /indeltype=[a-zA-Z]+/; + #print RUN $fmut, "\n"; + $fmut =~ /node=([a-zA-Z, \(\)]+)/; + my $lnode = $1; + $nodescatch{$1}=1; + + if ($fmut =~ /type=substitution/){ + # print "fmut=$fmut\n"; + $fmut =~ /from=([a-zA-Z\-]+)\tto=([a-zA-Z\-]+)/; + my $from=$1; + # print "from=$from\n"; + my $to=$2; + # print "to=$to\n"; + push @{$substcatch{$lnode}} , ("from:".$from." to:".$to); + $fmut =~ /position=([0-9]+)/; + push @{$subsposcatch{$lnode}}, $1; + } + + if ($fmut =~ /insertion=[a-zA-Z\-]+/){ + $fmut =~ /insertion=([a-zA-Z\-]+)/; + push @{$indelcatch{$lnode}} , $1; + $fmut =~ /indeltype=([a-zA-Z]+)/; + push @{$typecatch{$lnode}}, $1; + $fmut =~ /position=([0-9]+)/; + push @{$indelposcatch{$lnode}}, $1; + } + if ($fmut =~ /deletion=[a-zA-Z\-]+/){ + $fmut =~ /deletion=([a-zA-Z\-]+)/; + push @{$indelcatch{$lnode}} , $1; + $fmut =~ /indeltype=([a-zA-Z]+)/; + push @{$typecatch{$lnode}}, $1; + $fmut =~ /position=([0-9]+)/; + push @{$indelposcatch{$lnode}}, $1; + } + } + + # print $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t" if $printer == 1; + # print join ("<\t>", @mutinfoarray),"\n" if $printer == 1; + # print "where mutinfoarray = @mutinfoarray\n" if $printer == 1; + # #print RUN "."; + + # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1; + # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1; + + # print colored ['red'],"finalmutations_array=\n" if $printer == 1; +# foreach (@finalmutations_array) { +# print colored ['red'], "$_\n" if $_ =~ /type=substitution/ && $printer == 1 ; +# print colored ['yellow'], "$_\n" if $_ !~ /type=substitution/ && $printer == 1 ; + +# }# if $line =~ /cal/;# && $line =~ /chr4/; + +# print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1; +# print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1; +# print "tree analysis = $tree_analysis\n" if $printer == 1; + + # my $mutations = "@$mutations_array"; + + + next; + for my $keys (@$nodes) {foreach my $key (@$keys){ + #print "key = $key, => $branches_hash->{$key}\n"; + } + # print "x" x 50, "\n"; + } + my ($birth_steps, $death_steps) = decipher_history($mutations_array,join("",@tags),$nodes,$branches_hash,$tree_analysis,$conformation, $alivehash, $simplified_microsat); + } + } + close BO; +# print "now studying where we lost microsatellites:\n"; +# print "x" x 60,"\n"; + for my $reason (0 ... $#nextcounter){ +# print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n"; + } +# print "x" x 60,"\n"; +# print "In total we read $microreadcounter microsatellites after reading through $contigsentered contigs\n"; +# print " we lost $keynotformedcounter contigs as they did not form the key, \n"; +# print "$contignotrightcounter contigs as they were not of the right species configuration\n"; +# print "$keynotfoundcounter contigs as they did not contain the microsats\n"; +# print "... In total we went through a file that had $dotcounter contigs...\n"; +# print join ("\n","remaining orth keys = ", (keys %orths),""); +# print "------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ \n"; +# print "now printing counted trees: \n"; + if (scalar(keys %registeredTrees) > 0){ + foreach my $keyb ( sort (keys %registeredTrees) ) + { +# print "$keyb : $registeredTrees{$keyb}\n"; + } + } + + +} +close SUMMARY; + +my @summarizarr = ("+C=+C +R.+C -HCOR,+C", +"+H=+H +R.+H -HCOR,+H", +"-C=-C -R.-C +HCOR,-C", +"-H=-H -R.-H +HCOR,-H", +"+HC=+HC", +"-HC=-HC", +"+O=+O -HCOR,+O", +"-O=-O +HCOR,-O", +"+HCO=+HCO", +"-HCO=-HCO", +"+R=+R +R.+C +R.+H", +"-R=-R -R.-C -R.-H"); + +foreach my $line (@summarizarr){ + next if $line !~ /[A-Za-z0-9]/; +# print $line; + chomp $line; + my @fields = split(/=/,$line); +# print "title = $fields[0]\n"; + my @parts=split(/ +/, $fields[1]); + my %partshash = (); + foreach my $part (@parts){$partshash{$part}=1;} + my $count=0; + foreach my $key ( sort keys %registeredTrees ){ + next if !exists $partshash{$key}; +# print "now adding $registeredTrees{$key} from $key\n"; + $count+=$registeredTrees{$key}; + } +# print "$fields[0] : $count\n"; +} +my $rootdir = $dir; +$rootdir =~ s/\/[A-Za-z0-9\-_]+$//; +chdir $rootdir; +remove_tree($dir); + + +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +sub largest_number{ + my $counter = 0; + my($max) = shift(@_); + foreach my $temp (@_) { + + #print "finding largest array: $maxcounter \n"; + if($temp > $max){ + $max = $temp; + } + } + return($max); +} + +sub smallest_number{ + my $counter = 0; + my($min) = shift(@_); + foreach my $temp (@_) { + #print "finding largest array: $maxcounter \n"; + if($temp < $min){ + $min = $temp; + } + } + return($min); +} +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +sub baseml_parser{ + my $outputfile = $_[0]; + open(BOUT,"<$outputfile") or die "Cannot open output of upstream baseml $outputfile: $!"; + my @info = (); + my @branchields = (); + my @distanceields = (); + my @bout = <BOUT>; + #print colored ['red'], @bout ,"\n"; + for my $b (0 ... $#bout){ + my $bine=$bout[$b]; + #print colored ['yellow'], "sentence = ",$bine; + if ($bine =~ /TREE/){ + $bine=$bout[$b++]; + $bine=$bout[$b++]; + $bine=$bout[$b++]; + #print "FOUND",$bine; + chomp $bine; + $bine =~ s/^\s+//g; + @branchields = split(/\s+/,$bine); + $bine=$bout[$b++]; + chomp $bine; + $bine =~ s/^\s+//g; + @distanceields = split(/\s+/,$bine); + #print "LASTING..............\n"; + last; + } + else{ + } + } + + close BOUT; +# print "branchfields = @branchields and distanceields = @distanceields\n" if $printer == 1; + my %distance_hash=(); + for my $d (0 ... $#branchields){ + $distance_hash{$branchields[$d]} = $distanceields[$d]; + } + + $info[0] = $distance_hash{"9..1"} + $distance_hash{"9..2"}; + $info[1] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+ $distance_hash{"8..3"}; + $info[2] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+$distance_hash{"7..8"}+$distance_hash{"7..4"}; + $info[3] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+$distance_hash{"7..8"}+$distance_hash{"6..7"}+$distance_hash{"6..5"}; + +# print "\nsending back: @info\n" if $printer == 1; + + return join("\t",@info); + +} + + +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +sub test_complexity{ + my $printer = 0; + my $sequence = $_[0]; + #print "sequence = $sequence\n"; + my $COMPLEXITY_SUPPORT = $_[1]; + my $complexity=int($COMPLEXITY_SUPPORT * (1/40)); #1 THIS IS AN ARBITRARY THRESHOLD SET FOR LOW COMPLEXITY. + #2 THE INSPIRATION WAS WEB MILLER'S MAIL SENT ON + #3 19 Apr 2008 WHERE HE CLASSED AS HIGH COMPLEXITY + #4 REGION, IF 40 BP OF SEQUENCE HAS AT LEAST 3 OF + #5 EACH NUCLEOTIDE. HENCE, I NORMALIZE THIS PARAMETER + #6 FOR THE ACTUAL LENGTH OF $FLANK_SUPPORT SET BY + #7 THE USER. + #8 WEB MILLER SENT THE MAIL TO YDK104@PSU.EDU + + + + my $As = ($sequence=~ s/A/A/gi); + my $Ts = ($sequence=~ s/T/T/gi); + my $Gs = ($sequence=~ s/G/G/gi); + my $Cs = ($sequence=~ s/C/C/gi); + my $dashes = ($sequence=~ s/\-/-/gi); + $dashes = 0 if $sequence !~ /\-/; +# print "seq = $sequence, As=$As, Ts=$Ts, Gs=$Gs, Cs=$Cs, dashes=$dashes\n"; + return "LOW" if $dashes > length($sequence)/2; + + my $ans = (); + + return "HIGH" if $As >= $complexity && $Ts >= $complexity && $Cs >= $complexity && $Gs >= $complexity; + + my @nts = ("A","T","G","C","-"); + + my $lowcomplex = 0; + + foreach my $nt (@nts){ + $lowcomplex =1 if $sequence =~ /(($nt\-*){$mono_flanksimplicityRepno,})/i; + $lowcomplex =1 if $sequence =~ /(($nt[A-Za-z]){$di_flanksimplicityRepno,})/i; + $lowcomplex =1 if $sequence =~ /(([A-Za-z]$nt){$di_flanksimplicityRepno,})/i; + my $nont = ($sequence=~ s/$nt/$nt/gi); + $lowcomplex = 1 if $nont > (length($sequence) * $prop_of_seq_allowedtoAT) && ($nt =~ /[AT\-]/); + $lowcomplex = 1 if $nont > (length($sequence) * $prop_of_seq_allowedtoCG) && ($nt =~ /[CG]/); + } +# print "leaving for now.. $sequence\n" if $printer == 1 && $lowcomplex == 0; + #<STDIN>; + return "HIGH" if $lowcomplex == 0; + return "LOW" ; +} +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +sub sequence_similarity{ + my $printer = 0; + my @seq1 = split(/\s*/, $_[0]); + my @seq2 = split(/\s*/, $_[1]); + my $similarity_thresh = $_[2]; + my $info = $_[3]; +# print "input = @_\n" if $printer == 1; + my $seq1str = $_[0]; + my $seq2str = $_[1]; + $seq1str=~s/\-//g; $seq2str=~s/\-//g; + my $similarity=0; + + my $nucdiffs=0; + my $nucsims=0; + my $indeldiffs=0; + + for my $i (0...$#seq1){ + $similarity++ if $seq1[$i] =~ /$seq2[$i]/i ; #|| $seq1[$i] =~ /\-/i || $seq2[$i] =~ /\-/i ; + $nucsims++ if $seq1[$i] =~ /$seq2[$i]/i && ($seq1[$i] =~ /[a-zA-Z]/i && $seq2[$i] =~ /[a-zA-Z]/i); + $nucdiffs++ if $seq1[$i] !~ /$seq2[$i]/i && ($seq1[$i] =~ /[a-zA-Z]/i && $seq2[$i] =~ /[a-zA-Z]/i); + $indeldiffs++ if $seq1[$i] !~ /$seq2[$i]/i && $seq1[$i] =~ /\-/i || $seq2[$i] =~ /\-/i; + } + my $sim = $similarity/length($_[0]); + return ( $sim, $nucdiffs, $indeldiffs ); #<= $similarity_thresh; +} +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- + +sub load_treesToReject{ + my @rejectlist = (); + my $alltags = join("",@_); + @rejectlist = qw (-HCOR +HCOR) if $alltags eq "HCORM"; + @rejectlist = qw ( -HCO|+R +HCO|-R) if $alltags eq "HCOR"; + @rejectlist = qw ( -HC|+O +HC|-O) if $alltags eq "HCO"; + + %treesToReject=(); + $treesToReject{$_} = $_ foreach (@rejectlist); + #print "loaded to reject for $alltags; ", $treesToReject{$_},"\n" foreach (@rejectlist); #<STDIN>; +} +#-------------------------------------------------------------------------------------------------------- +sub load_treesToIgnore{ + my @rejectlist = (); + my $alltags = join("",@_); + @rejectlist = qw (-HCOR +HCOR +HCORM -HCORM) if $alltags eq "HCORM"; + @rejectlist = qw ( -HCO|+R +HCO|-R +HCOR -HCOR) if $alltags eq "HCOR"; + @rejectlist = qw ( -HC|+O +HC|-O +HCO -HCO) if $alltags eq "HCO"; + + %treesToIgnore=(); + $treesToIgnore{$_} = $_ foreach (@rejectlist); + #print "loaded ", $treesToIgnore{$_},"\n" foreach (@rejectlist); +} +#-------------------------------------------------------------------------------------------------------- +sub load_thresholds{ + my @threshold_array=split(/[,_]/,$_[0]); + unshift @threshold_array, "0"; + for my $size (1 ... 4){ + $thresholdhash{$size}=$threshold_array[$size]; + } +} +#-------------------------------------------------------------------------------------------------------- +sub load_allPossibleTrees{ + #1 THIS FILE STORES ALL POSSIBLE SCENARIOS OF MICROSATELLITE + #2 BIRTH AND DEATH EVENTS ON A 5-PRIMATE TREE OF H,C,O,R,M + #3 IN FORM OF A TEXT FILE. THIS WILL BE USED AS A TEMPLET + #4 TO COMPARE EACH MICROSATELLITE CLUSTER TO UNDERSTAND THE + #5 EVOLUTION OF EACH LOCUS. WE WILL THEN DISCARD SOME + #6 MICROSATS ACCRODING TO THEIR EVOLUTIONARY BEHAVIOUR ON + #7 THE TREE. MOST PROBABLY WE WILL REMOVE THOSE MICROSATS + #8 THAT ARE NOT SUFFICIENTLY INFORMATIVE, LIKE IN CASE OF + #9 AN OUTGROUP MICROSATELLITE BEING DIFFERENT FRON ALL OTHER + #10 SPECIES IN THE TREE. + my $tree_list = $_[0]; +# print "file to be loaded: $tree_list\n"; + + my @trarr = (); + @trarr = ("#H C O CONCLUSION ALTERNATE", +"+ + + +HCO NA", +"+ _ _ +H NA", +"_ + _ +C NA", +"_ _ + -HC|+O NA", +"+ _ + -C +H", +"_ + + -H +C", +"+ + _ +HC|-O NA", +"_ _ _ -HCO NA") if $tree_list =~ /_HCO\.txt/; + @trarr = ("#H C O R CONCLUSION ALTERNATE", +"_ _ _ _ -HCOR NA", +"+ + + + +HCOR NA", +"+ + + _ +HCO|-R +H.+C.+O", +"+ + _ _ +HC +H.+C;-O", +"+ _ _ _ +H +HC,-C;+HC,-C", +"_ + _ _ +C +HC,-H;+HC,-H", +"_ _ + _ +O -HC|-H.-C", +"_ _ + + -HC -H.-C", +"+ _ _ + +H|-C.-O +HC,-C", +"_ + _ + +C -H.-O", +"_ + + _ -H +C.+O", +"_ _ _ + -HCO|+R NA", +"+ _ + _ +H.+O|-C NA", +"_ + + + -H -HC,+C", +"+ _ + + -C -HC,+H", +"+ + _ + -O +HC") if $tree_list =~ /_HCOR\.txt/; + + @trarr = ("#H C O R M CONCLUSION ALTERNATE", +"+ + _ + + -O -HCO,+HC|-HCO,+HC;-HCO,(+H.+C)", +"+ _ + + + -C -HC,+H;+HCO,(+H.+O)", +"_ + + + + -H -HC,+C;-HCO,(+C.+O)", +"_ _ + _ _ +O +HCO,-HC;+HCO,(-H.-C)", +"_ + _ _ _ +C +HC,-H;+HCO,(-H.-O)", +"+ _ _ _ _ +H +HC,-C;+HCO,(-C.-O)", +"+ + + _ _ +HCO +H.+C.+O", +"_ _ _ + + -HCO -HC.-O;-H.-C.-O", +"+ _ _ + + -O.-C|-HCO,+H +R.+H;-HCO,(+R.+H)", +"_ + _ + + -O.-H|-HCO,+C +R.+C;-HCO,(+R.+C)", +"_ + + _ _ +HCO,-H|+O.+C NA", +"+ _ + _ _ +HCO,-C|+O.+H NA", +"_ _ + + + -HC -H.-C|-HCO,+O", +"+ + _ _ _ +HC +H.+C|+HCO,-O|-HCO,+HC;-HCO,(+H.+C)", +"+ + + + + +HCORM NA", +"_ _ + _ + DISCARD +O;+HCO,-HC;+HCO,(-H.-C)", +"_ + _ _ + +C +HC,-H;+HCO,(-H.-O)", +"+ _ _ _ + +H +HC,-C;+HCO,(-C.-O)", +"+ + _ _ + +HC -R.-O|+HCO,-O|+H.+C;-HCO,+HC;-HCO,(+H.+C)", +"+ _ + _ + DISCARD -R.-C|+HCO,-C|+H.+O NA", +"_ + + _ + DISCARD -R.-H|+HCO,-H|+C.+O NA", +"_ _ _ _ + DISCARD -HCOR NA", +"_ _ _ + _ DISCARD +R;-HC.-O;-H.-C.-O", +"+ + _ + _ -O +R.+HC|-HCO,+HC;+H.+C.+R|-HCO,(+H.+C)", +"+ + + + _ +HCOR NA", +"+ + + _ + DISCARD -R;+HCO;+HC.+O;+H.+C.+O", +"+ _ + + _ -C -HC,+H;+H.+O.+R|-HCO,(+H.+O)", +"_ + + + _ -H -HC,+C;+C.+O.+R|-HCO,(+C.+O)", +"_ _ + + _ -HC +R.+O|-HCO,+O|+HCO,-HC", +"_ + _ + _ +C +R.+C|-HCO,+C|-HC,+C +HCO,(-H.-O)", +"+ _ _ + _ +H +R.+H|-C.-O +HCO,(-C.-O)" +) if $tree_list =~ /_HCORM\.txt/; + + + my $template_p = $_[1]; + my $alternate_p = $_[2]; + #1 THIS IS THE HASH IN WHICH INFORMATION FROM THE ABOVE FILE + #2 GETS STORED, USING THE WHILE LOOP BELOW. HERE, THE KEY + #3 OF EACH ROW IS THE EVOLUTIONARY CONFIGURATION OF A LOCUS + #4 ON THE PRIMATE TREE, BASED ON PRESENCE/ABSENCE OF A MICROSAT + #5 AT THAT LOCUS, LIKE SAY "+ + + _ _" .. EACH COLUMN BELONGS + #6 TO ONE SPECIES; HERE THE COLUMN NAMES ARE "H C O R M". + #7 THE VALUE FOR EACH ENTRY IS THE MEANING OF THE ABOVE + #8 CONFIGURATION (I.E., CONFIGURAION OF THE KEY. HERE, THE + #9 VALUE WILL BE +HCO, SIGNIFYING A BIRTH IN HUMAN-CHIMP-ORANG + #10 COMMON ANCESTOR. THIS HASH HAS BEEN LOADED HERE TO BE USED + #11 LATER BY THE SUBROUTINE sub treeStudy{} THAT STUDIES + #12 EVOLUTIONARY CONFIGURAION OF EACH MICROSAT LOCUS, AS + #13 MENTIONED ABOVE. + my @keys_array=(); + foreach my $line (@trarr){ +# print $line,"\n"; + next if $line =~ /^#/; + chomp $line; + my @fields = split("\t", $line); + push @keys_array, $fields[0]; +# print "loading: $fields[0]\n"; + $template_p->{$fields[0]}[0] = $fields[1]; + $template_p->{$fields[0]}[1] = 0; + $alternate_p->{$fields[0]} = $fields[2]; +# $alternate_p->{$fields[1]} = $fields[2]; +# print "loading alternate_p $fields[1] $fields[2]\n"; #<STDIN> if $fields[1] eq "+H"; + } +# print "loaded the trees with keys: @keys_array\n"; + return $template_p, \@keys_array, $alternate_p; +} + +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +sub checkCleanCase{ + my $printer = 0; + my $tree = $_[0]; + my $finalalignment = $_[1]; + + #print "IN checkCleanCase: @_\n"; + #<STDIN>; + my @indivspecies = $tree =~ /[A-Z]/g; + $finalalignment =~ s/\./_/g; + my @captured = $finalalignment =~ /[A-Za-z, \(\):]+\![:A-Za-z, \(\)]/g; + + my $unclean = 0; + + foreach my $sp (@indivspecies){ + foreach my $cap (@captured){ + $cap =~ s/:[A-Za-z\-]+//g; + my @sps = $cap =~ /[A-Z]+/g; + my $spsc = join("", @sps); +# print "checking whether imp species $sp is present in $cap i.e, in $spsc\n " if $printer == 1; + if ($spsc =~ /$sp/){ +# print "foind : $sp\n"; + $unclean = 1; last; + } + } + last if $unclean == 1; + } + #<STDIN>; + return "CLEAN" if $unclean == 0; + return "UNCLEAN"; +} + +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------------------------------- + + +sub adjustCoordinates{ + my $line = $_[0]; + return 0 if !defined $line; + #print "------x------x------x------x------x------x------x------x------\n"; + #print $line,"\n\n"; + my $no_of_species = $line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)|(scaffold[0-9a-zA-Z\._\-]+)|(supercontig[0-9a-zA-Z\._\-]+)/x/ig; + #print $line,"\n"; + #print "------x------x------x------x------x------x------x------x------\n\n\n"; +# my @got = ($line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)/x/g); +# print "line = $line\n"; + $infocord = 2 + (4*$no_of_species) - 1; + $typecord = 2 + (4*$no_of_species) + 1 - 1; + $motifcord = 2 + (4*$no_of_species) + 2 - 1; + $gapcord = $motifcord+1; + $startcord = $gapcord+1; + $strandcord = $startcord+1; + $endcord = $strandcord + 1; + $microsatcord = $endcord + 1; + $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; + $interr_poscord = $microsatcord + 3; + $no_of_interruptionscord = $microsatcord + 4; + $interrcord = $microsatcord + 2; +# print "$line\n startcord = $startcord, and endcord = $endcord and no_of_species = $no_of_species\n" if $line !~ /calJac/i; + + return $no_of_species; +} + + +sub printhash{ + my $alivehash = $_[0]; + my @tags = @$_[1]; +# print "print hash\n"; + foreach my $tag (@tags){ +# print "$tag=",$alivehash->{$tag},"\n" if exists $alivehash->{$tag}; + } + + return "\n" +} +sub peel_onion{ + my $printer = 0; +# print "received: @_\n" ; #<STDIN>; + $printer = 0; + my ($tree, $sequences, $alignment, $tagarray, $microsathash, $nonmicrosathash, $motif, $tree_analysis, $threshold, $microsatstarts) = @_; +# print "in peel onion.. tree = $tree \n" if $printer == 1; + my %sequence_hash=(); + + +# for my $i (0 ... $#sequences){ $sequence_hash{$species[$i]}=$sequences->[$i]; } + + + my %node_sequences=(); + + my %node_alignments = (); #NEW, Nov 28 2008 + my @tags=(); + my @locus_sequences=(); + my %alivehash=(); + foreach my $tag (@$tagarray) { + #print "adding: $tag\n"; + push(@tags, $tag); + $node_sequences{$tag}=join ".",split(/\s*/,$microsathash->{$tag}) if $microsathash->{$tag} ne "NULL"; + $alivehash{$tag}= $tag if $microsathash->{$tag} ne "NULL"; + $node_sequences{$tag}=join ".",split(/\s*/,$nonmicrosathash->{$tag}) if $microsathash->{$tag} eq "NULL"; + $node_alignments{$tag}=join ".",split(/\s*/,$alignment->{$tag}) ; + push @locus_sequences, $node_sequences{$tag}; +# print "adding to node_seq: $tag = ",$node_alignments{$tag},"\n"; + } + + #<STDIN>; + + my ($nodes_arr, $branches_hash) = get_nodes($tree); + my @nodes=@$nodes_arr; +# print "recieved nodes = " if $printer == 1; +# foreach my $key (@nodes) {print "@$key " if $printer == 1;} + +# print "\n" if $printer == 1; + + #POPULATE branches_hash WITH INFORMATION ABOUT LIVESTATUS + foreach my $keys (@nodes){ + my @pair = @$keys; + my $joint = "(".join(", ",@pair).")"; + my $copykey = join "", @pair; + $copykey =~ s/[\W ]+//g; +# print "for node: $keys, copykey = $copykey and joint = $joint\n" if $printer == 1; + my $livestatus = 1; + foreach my $copy (split(/\s*/,$copykey)){ + $livestatus = 0 if !exists $alivehash{$copy}; + } + $alivehash{$joint} = $joint if !exists $alivehash{$joint} && $livestatus == 1; +# print "alivehash = $alivehash{$joint}\n" if exists $alivehash{$joint} && $printer == 1; + } + + @nodes = reverse(@nodes); #1 THIS IS IN ORDER TO GO THROUGH THE TREE FROM LEAVES TO ROOT. + + my @mutations_array=(); + + my $joint = (); + foreach my $node (@nodes){ + my @pair = @$node; +# print "now in the nodes for loop, pair = @pair\n and sequences=\n" if $printer == 1; + $joint = "(".join(", ",@pair).")"; + my @pair_sequences=(); + + foreach my $tag (@pair){ +# print "$tag: $node_alignments{$tag}\n" if $printer == 1; +# print $node_alignments{$tag},"\n" if $printer == 1; + push @pair_sequences, $node_alignments{$tag}; + } +# print "ppeel onion joint = $joint , pair_sequences=>@pair_sequences< , pair=>@pair<\n" if $printer == 1; + + my ($compared, $substitutions_list) = base_by_base_simple($motif,\@pair_sequences, scalar(@pair_sequences), @pair, $joint); + $node_alignments{$joint}=$compared; + push( @mutations_array,split(/:/,$substitutions_list)); +# print "newly added to node_sequences: $node_alignments{$joint} and list of mutations =\n", join("\n",@mutations_array),"\n" if $printer == 1; + } + + + my $analayzed_mutations = analyze_mutations(\@mutations_array, \@nodes, $branches_hash, $alignment, \@tags, \%alivehash, \%node_sequences, $microsatstarts, $motif); + + return ($analayzed_mutations, \@nodes, $branches_hash, \%alivehash, $node_alignments{$joint}) if scalar @mutations_array > 0; + return ("NULL",\@nodes,$branches_hash, \%alivehash, "NULL") if scalar @mutations_array == 0; +} + +#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# +#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# + +#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# +#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# + +sub get_nodes{ + my $printer = 0; + + my $tree=$_[0]; + #$tree =~ s/ +//g; + $tree =~ s/\t+//g; + $tree=~s/;//g; +# print "tree=$tree\n" if $printer == 1; + my @nodes = (); + my @onions=($tree); + my %branches=(); + foreach my $bite (@onions){ + $bite=~ s/^\(|\)$//g; + chomp $bite; +# print "tree = $bite \n"; +# <STDIN>; + $bite=~ /([ ,\(\)A-Z]+)\,\s*([ ,\(\)A-Z]+)/; + #$tree =~ /(\(\(\(H, C\), O\), R\))\, (M)/; + my @raw_nodes = ($1, $2); +# print "raw nodes = $1 and $2\n" if $printer == 1; + push(@nodes, [@raw_nodes]); + foreach my $node (@raw_nodes) {push (@onions, $node) if $node =~ /,/;} + foreach my $node (@raw_nodes) {$branches{$node}="(".$bite.")"; } +# print "onions = @onions\n" if $printer == 1;<STDIN> if $printer == 1; + } + $printer = 0; + return \@nodes, \%branches; +} + + +#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# +#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# +#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# +#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# +sub analyze_mutations{ + my ($mutations_array, $nodes, $branches_hash, $alignment, $tags, $alivehash, $node_sequences, $microsatstarts, $motif) = @_; + my $locuslength = length($alignment->{$tags->[0]}); + my $printer = 0; + + +# print " IN analyzed_mutations....\n" if $printer == 1; # \n mutations array = @$mutations_array, \nAND locuslength = $locuslength\n" if $printer == 1; + my %mutation_hash=(); + my %froms_megahash=(); + my %tos_megahash=(); + my %position_hash=(); + my @solutions_array=(); + foreach my $mutation (@$mutations_array){ +# print "loadin mutation: $mutation\n" if $printer == 1; + my %localhash= $mutation =~ /([\S ]+)=([\S ]+)/g; + $mutation_hash{$localhash{"position"}} = {%localhash}; + push @{$position_hash{$localhash{"position"}}},$localhash{"node"}; +# print "feeding position hash with $localhash{position}: $position_hash{$localhash{position}}[0]\n" if $printer == 1; + $froms_megahash{$localhash{"position"}}{$localhash{"node"}}=$localhash{"from"}; + $tos_megahash{$localhash{"position"}}{$localhash{"node"}}=$localhash{"to"}; +# print "just a trial: $mutation_hash{$localhash{position}}{position}\n" if $printer == 1; +# print "loadin in tos_megahash: $localhash{position} {$localhash{node} = $localhash{to}\n" if $printer == 1; +# print "loadin in from: $localhash{position} {$localhash{node} = $localhash{from}\n" if $printer == 1; + } + +# print "now going through each position in loculength:\n" if $printer == 1; + ## <STDIN> if $printer == 1; + + for my $pos (0 ... $locuslength-1){ +# print "at position: $pos\n" if $printer == 1; + + if (exists($mutation_hash{$pos})){ + my @local_nodes=@{$position_hash{$pos}}; +# print "found mutation: @{$position_hash{$pos}} : @local_nodes\n" if $printer == 1; + + foreach my $local_node (@local_nodes){ +# print "at local node: $local_node ... from state = $froms_megahash{$pos}{$local_node}\n" if $printer == 1; + my $open_insertion=(); + my $open_deletion=(); + my $open_to_substitution=(); + my $open_from_substitution=(); + if ($froms_megahash{$pos}{$local_node} eq "-"){ + # print "here exists a microsatellite from $local_node to $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};; + # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}} && $printer == 1; + #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; + $open_insertion=$tos_megahash{$pos}{$local_node}; + for my $posnext ($pos+1 ... $locuslength-1){ +# print "in first if .... studying posnext: $posnext\n" if $printer == 1; + last if !exists ($froms_megahash{$posnext}{$local_node}); +# print "for posnext: $posnext, there exists $froms_megahash{$posnext}{$local_node}.. already, open_insertion = $open_insertion.. checking is $froms_megahash{$posnext}{$local_node} matters\n" if $printer == 1; + $open_insertion = $open_insertion.$tos_megahash{$posnext}{$local_node} if $froms_megahash{$posnext}{$local_node} eq "-"; +# print "now open_insertion=$open_insertion\n" if $printer == 1; + delete $mutation_hash{$posnext} if $froms_megahash{$posnext}{$local_node} eq "-"; + } +# print "1 Feeding in: ", join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_insertion", "deletion="),"\n" if $printer == 1; + push (@solutions_array, join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_insertion", "deletion=")); + } + elsif ($tos_megahash{$pos}{$local_node} eq "-"){ + # print "here exists a microsatellite to $local_node from $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};; + # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; + #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; + $open_deletion=$froms_megahash{$pos}{$local_node}; + for my $posnext ($pos+1 ... $locuslength-1){ +# print "in 1st elsif studying posnext: $posnext\n" if $printer == 1; +# print "nexting as nextpos does not exist\n" if !exists ($tos_megahash{$posnext}{$local_node}) && $printer == 1; + last if !exists ($tos_megahash{$posnext}{$local_node}); +# print "for posnext: $posnext, there exists $tos_megahash{$posnext}{$local_node}\n" if $printer == 1; + $open_deletion = $open_deletion.$froms_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} eq "-"; + delete $mutation_hash{$posnext} if $tos_megahash{$posnext}{$local_node} eq "-"; + } +# print "2 Feeding in:", join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_deletion"), "\n" if $printer == 1; + push (@solutions_array, join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_deletion")); + } + elsif ($tos_megahash{$pos}{$local_node} ne "-"){ + # print "here exists a microsatellite from $local_node to $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};; + # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; + #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; + # print "microsatstart = $microsatstarts->{$local_node} \n" if exists $microsatstarts->{$local_node} && $pos < $microsatstarts->{$local_node} && $printer == 1; + next if exists $microsatstarts->{$local_node} && $pos < $microsatstarts->{$local_node}; + $open_to_substitution=$tos_megahash{$pos}{$local_node}; + $open_from_substitution=$froms_megahash{$pos}{$local_node}; +# print "open from substitution: $open_from_substitution \n" if $printer == 1; + for my $posnext ($pos+1 ... $locuslength-1){ + #print "in last elsif studying posnext: $posnext\n"; + last if !exists ($tos_megahash{$posnext}{$local_node}); +# print "for posnext: $posnext, there exists $tos_megahash{$posnext}{$local_node}\n" if $printer == 1; + $open_to_substitution = $open_to_substitution.$tos_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} ne "-"; + $open_from_substitution = $open_from_substitution.$froms_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} ne "-"; + delete $mutation_hash{$posnext} if $tos_megahash{$posnext}{$local_node} ne "-" && $froms_megahash{$posnext}{$local_node} ; + } +# print "open from substitution: $open_from_substitution \n" if $printer == 1; + + #IS THE STRETCH OF SUBSTITUTION MICROSATELLITE-LIKE? + my @motif_parts=split(/\s*/,$motif); + #GENERATING THE FLEXIBLE LEFT END + my $left_query=(); + for my $k (1 ... $#motif_parts) { + $left_query= $motif_parts[$k]."|)"; + $left_query="(".$left_query; + } + $left_query=$left_query."?"; +# print "left_quewry = $left_query\n" if $printer == 1; + #GENERATING THE FLEXIBLE RIGHT END + my $right_query=(); + for my $k (0 ... ($#motif_parts-1)) { + $right_query= "(|".$motif_parts[$k]; + $right_query=$right_query.")"; + } + $right_query=$right_query."?"; +# print "right_query = $right_query\n" if $printer == 1; +# print "Hence, searching for: ^$left_query($motif)+$right_query\$\n" if $printer == 1; + + my $motifcomb=$motif x 50; +# print "motifcomb = $motifcomb\n" if $printer == 1; + if ( ($motifcomb =~/$open_to_substitution/i) && (length ($open_to_substitution) >= length($motif)) ){ +# print "sequence microsat-like\n" if $printer == 1; + my $all_microsat_like = 0; +# print "3 feeding in: ", join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_from_substitution"), "\n" if $printer == 1; + push (@solutions_array, join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_from_substitution")); +# print "4 feeding in: ", join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_to_substitution", "deletion="), "\n" if $printer == 1; + push (@solutions_array, join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_to_substitution", "deletion=")); + + } + else{ +# print "5 feeding in: ", join("\t", "node=$local_node","type=substitution" ,"position=$pos", "from=$open_from_substitution", "to=$open_to_substitution", "insertion=", "deletion="), "\n" if $printer == 1; + push (@solutions_array, join("\t", "node=$local_node","type=substitution" ,"position=$pos", "from=$open_from_substitution", "to=$open_to_substitution", "insertion=", "deletion=")); + } + #IS THE FROM-SEQUENCE MICROSATELLITE-LIKE? + + } + #<STDIN> if $printer ==1; + } + #<STDIN> if $printer ==1; + } + } +# print "\n", "#" x 50, "\n" if $printer == 1; + foreach my $tag (@$tags){ +# print "$tag: $alignment->{$tag}\n" if $printer == 1; + } +# print "\n", "#" x 50, "\n" if $printer == 1; +# print "returning SOLUTIONS ARRAY : \n",join("\n", @solutions_array),"\n" if $printer == 1; + #print "end\n"; + #<STDIN> if + return \@solutions_array; +} +#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# +#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# +#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# +#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# + +sub base_by_base_simple{ + my $printer = 0; + my ($motif, $locus, $no, $pair0, $pair1, $joint) = @_; + my @seq_array=(); +# print "IN SUBROUTUNE base_by_base_simple.. information received = @_\n" if $printer == 1; +# print "pair0 = $pair0 and pair1 = $pair1\n" if $printer == 1; + + my @example=split(/\./,$locus->[0]); +# print "example, for length = @example\n" if $printer == 1; + for my $i (0...$no-1){push(@seq_array, [split(/\./,$locus->[$i])]); } + + my @compared_sequence=(); + my @substitutions_list; + for my $i (0...scalar(@example)-1){ + + #print "i = $i\n" if $printer == 1; + #print "comparing $seq_array[0][$i] and $seq_array[1][$i] \n" ;#if $printer == 1; + if ($seq_array[0][$i] =~ /!/ && $seq_array[1][$i] !~ /!/){ + + my $resolution= resolve_base($seq_array[0][$i],$seq_array[1][$i], $pair1 ,"keep" ); + # print "ancestral = $resolution\n" if $printer == 1; + + if ($resolution =~ /$seq_array[1][$i]/i && $resolution !~ /!/){ + push @substitutions_list, add_mutation($i, $pair0, $seq_array[0][$i], $resolution ); + } + elsif ( $resolution !~ /!/){ + push @substitutions_list, add_mutation($i, $pair1, $seq_array[1][$i], $resolution); + } + push @compared_sequence,$resolution; + } + elsif ($seq_array[0][$i] !~ /!/ && $seq_array[1][$i] =~ /!/){ + + my $resolution= resolve_base($seq_array[1][$i],$seq_array[0][$i], $pair0, "invert" ); + # print "ancestral = $resolution\n" if $printer == 1; + + if ($resolution =~ /$seq_array[0][$i]/i && $resolution !~ /!/){ + push @substitutions_list, add_mutation($i, $pair1, $seq_array[1][$i], $resolution); + } + elsif ( $resolution !~ /!/){ + push @substitutions_list, add_mutation($i, $pair0, $seq_array[0][$i], $resolution); + } + push @compared_sequence,$resolution; + } + elsif($seq_array[0][$i] =~ /!/ && $seq_array[1][$i] =~ /!/){ + push @compared_sequence, add_bases($seq_array[0][$i],$seq_array[1][$i], $pair0, $pair1, $joint ); + } + else{ + if($seq_array[0][$i] !~ /^$seq_array[1][$i]$/i){ + push @compared_sequence, $pair0.":".$seq_array[0][$i]."!".$pair1.":".$seq_array[1][$i]; + } + else{ + # print "perfect match\n" if $printer == 1; + push @compared_sequence, $seq_array[0][$i]; + } + } + } +# print "returning: comared = @compared_sequence \nand substitutions list =\n", join("\n",@substitutions_list),"\n" if $printer == 1; + return join(".",@compared_sequence), join(":", @substitutions_list) if scalar (@substitutions_list) > 0; + return join(".",@compared_sequence), "" if scalar (@substitutions_list) == 0; +} + + +sub resolve_base{ + my $printer = 0; +# print "IN SUBROUTUNE resolve_base.. information received = @_\n" if $printer == 1; + my ($optional, $single, $singlesp, $arg) = @_; + my @options=split(/!/,$optional); + foreach my $option(@options) { + $option=~s/[A-Z\(\) ,]+://g; + if ($option =~ /$single/i){ +# print "option = $option , returning single: $single\n" if $printer == 1; + return $single; + } + } +# print "returning ",$optional."!".$singlesp.":".$single. "\n" if $arg eq "keep" && $printer == 1; +# print "returning ",$singlesp.":".$single."!".$optional. "\n" if $arg eq "invert" && $printer == 1; + return $optional."!".$singlesp.":".$single if $arg eq "keep"; + return $singlesp.":".$single."!".$optional if $arg eq "invert"; + +} + +sub same_length{ + my $printer = 0; + my @locus = @_; + my $temp = shift @locus; + $temp=~s/-|,//g; + foreach my $l (@locus){ + $l=~s/-|,//g; + return 0 if length($l) != length($temp); + $temp = $l; + } + return 1; +} +sub treeStudy{ + my $printer = 1; +# print "template DEFINED.. received: @_\n" if defined %template; +# print "only received = @_" if !defined %template; + my $stopper = 0; +# if (!defined %template){ TEMP MASKED OCT 18 2012 + $stopper = 1; + %template=(); +# print "tree decipherer = $tree_decipherer\n" if $printer == 1; + my ( $template_ref, $keys_array)=load_allPossibleTrees($tree_decipherer, \%template); +# print "return = $template_ref and @{$keys_array}\n" if $printer == 1; + foreach my $key (@$keys_array){ +# print "addding : $template_ref->{$key} for $key\n" if $printer == 1; + $template{$key} = $template_ref->{$key}; + } +# } TEMP MASK OCT 18 2012 END +# <STDIN>; + for my $templet ( keys %template ) { + # print "$templet => @{$template{$templet}}\n"; + } +# <STDIN> if !defined %template; + + my $strict = 0; + + my $H = 0; + my $Hchr = 1; + my $Hstart = 2; + my $Hend = 3; + my $Hmotif = 4; + my $Hmotiflen = 5; + my $Hmicro = 6; + my $Hstrand = 7; + my $Hmicrolen = 8; + my $Hinterpos = 9; + my $Hrelativepos = 10; + my $Hinter = 11; + my $Hinterlen = 12; + + my $C = 13; + my $Cchr = 14; + my $Cstart = 15; + my $Cend = 16; + my $Cmotif = 17; + my $Cmotiflen = 18; + my $Cmicro = 19; + my $Cstrand = 20; + my $Cmicrolen = 21; + my $Cinterpos = 22; + my $Crelativepos = 23; + my $Cinter = 24; + my $Cinterlen = 25; + + my $O = 26; + my $Ochr = 27; + my $Ostart = 28; + my $Oend = 29; + my $Omotif = 30; + my $Omotiflen = 31; + my $Omicro = 32; + my $Ostrand = 33; + my $Omicrolen = 34; + my $Ointerpos = 35; + my $Orelativepos = 36; + my $Ointer = 37; + my $Ointerlen = 38; + + my $R = 39; + my $Rchr = 40; + my $Rstart = 41; + my $Rend = 42; + my $Rmotif = 43; + my $Rmotiflen = 44; + my $Rmicro = 45; + my $Rstrand = 46; + my $Rmicrolen = 47; + my $Rinterpos = 48; + my $Rrelativepos = 49; + my $Rinter = 50; + my $Rinterlen = 51; + + my $Mchr = 52; + my $Mstart = 53; + my $Mend = 54; + my $M = 55; + my $Mmotif = 56; + my $Mmotiflen = 57; + my $Mmicro = 58; + my $Mstrand = 59; + my $Mmicrolen = 60; + my $Minterpos = 61; + my $Mrelativepos = 62; + my $Minter = 63; + my $Minterlen = 64; + + #-------------------------------------------------------------------------------# + my @analysis=(); + + + my %speciesOrder = (); + $speciesOrder{"H"} = 0; + $speciesOrder{"C"} = 1; + $speciesOrder{"O"} = 2; + $speciesOrder{"R"} = 3; + $speciesOrder{"M"} = 4; + #-------------------------------------------------------------------------------# + + my $line = $_[0]; + chomp $line; + + my @f = split(/\t/,$line); +# print "received array : @f.. recieved tags = @tags\n" if $printer == 1; + + # collect all motifs + my @motifs=(); + @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif], $f[$Mmotif]) if $tags[$#tags] =~ /M/; + @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif]) if $tags[$#tags] =~ /R/; + @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif]) if $tags[$#tags] =~ /O/; +# print "motifs in the array = $f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif]\n" if $tags[$#tags] =~ /R/;; +# print "motifs = @motifs\n" if $printer == 1; + my @translation = (); + foreach my $motif (@motifs){ + push(@translation, "_") if $motif eq "NA"; + push(@translation, "+") if $motif ne "NA"; + } + my $translate = join(" ", @translation); +# print "translate = >$translate< and analysis = $template{$translate}[0].. on the other hand, ",$template{"- - +"}[0],"\n"; + my @analyses = split(/\|/,$template{$translate}[0]); +# print "motifs = @motifs, analyses = @analyses\n" if $printer == 1; + + if (scalar(@analyses) == 1) { + #print "analysis = $analyses[0]\n"; + if ($analyses[0] !~ /,|\./ ){ + if ($analyses[0] =~ /\+/){ + my $analysis = $analyses[0]; + $analysis =~ s/\+|\-//g; + my @species = split(/\s*/,$analysis); + my @currentMotifs = (); + foreach my $specie (@species){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); #print "pushing into currentMotifs: $speciesOrder{$specie}: $motifs[$speciesOrder{$specie}]\n" if $printer == 1; + } +# print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; + $template{$translate}[1]++ if $strict == 1 && consistency(@currentMotifs) ne "NULL"; + $template{$translate}[1]++ if $strict == 0; +# print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; + } + else{ + my $analysis = $analyses[0]; + $analysis =~ s/\+|\-//g; + my @species = split(/\s*/,$analysis); + my @currentMotifs = (); + my @complementarySpecies = (); + my $allSpecies = join("",@tags); + foreach my $specie (@species){ $allSpecies =~ s/$specie//g; } + foreach my $specie (split(/\s*/,$allSpecies)){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); #print "pushing into currentMotifs: $speciesOrder{$specie}: $motifs[$speciesOrder{$specie}]\n" if $printer == 1;; + } +# print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; + $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; + $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; +# print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; + } + } + + elsif ($analyses[0] =~ /,/) { + my @events = split(/,/,$analyses[0]); +# print "events = @events \n " if $printer == 1; + if ($events[0] =~ /\+/){ + my $analysis1 = $events[0]; + $analysis1 =~ s/\+|\-//g; + my $analysis2 = $events[1]; + $analysis2 =~ s/\+|\-//g; + my @nSpecies = split(/\s*/,$analysis2); +# print "original anslysis = $analysis1 " if $printer == 1; + foreach my $specie (@nSpecies){ $analysis1=~ s/$specie//g;} +# print "processed anslysis = $analysis1 \n" if $printer == 1; + my @currentMotifs = (); + foreach my $specie (split(/\s*/,$analysis1)){push(@currentMotifs, $motifs[$speciesOrder{$specie}]); } +# print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; + $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; + $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; +# print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; + } + else{ + my $analysis1 = $events[0]; + $analysis1 =~ s/\+|\-//g; + my $analysis2 = $events[1]; + $analysis2 =~ s/\+|\-//g; + my @pSpecies = split(/\s*/,$analysis2); + my @currentMotifs = (); + foreach my $specie (@pSpecies){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); } +# print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; + $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; + $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; +# print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; + + } + + } + elsif ($analyses[0] =~ /\./) { + my @events = split(/\./,$analyses[0]); + foreach my $event (@events){ +# print "event = $event \n" if $printer == 1; + if ($event =~ /\+/){ + my $analysis = $event; + $analysis =~ s/\+|\-//g; + my @species = split(/\s*/,$analysis); + my @currentMotifs = (); + foreach my $specie (@species){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); } + #print consistency(@currentMotifs),"<- \n"; +# print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; + $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; + $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; +# print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; + } + else{ + my $analysis = $event; + $analysis =~ s/\+|\-//g; + my @species = split(/\s*/,$analysis); + my @currentMotifs = (); + my @complementarySpecies = (); + my $allSpecies = join("",@tags); + foreach my $specie (@species){ $allSpecies =~ s/$specie//g; } + foreach my $specie (split(/\s*/,$allSpecies)){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); } + #print consistency(@currentMotifs),"<- \n"; +# print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; + $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; + $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; +# print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; + } + } + + } + } + else{ + my $finalanalysis = (); + $template{$translate}[1]++; + foreach my $analysis (@analyses){ ;} + } + # test if motifs where microsats are present, as indeed of same the motif composition + + + + for my $templet ( keys %template ) { + if (@{ $template{$templet} }[1] > 0){ + + $template{$templet}[1] = 0; +# print "now returning: @{$template{$templet}}[0], $templet\n"; + return (@{$template{$templet}}[0], $templet); + } + } + undef %template; +# print "sending NULL\n" if $printer == 1; + return ("NULL", "NULL"); + +} + + +sub consistency{ + my @motifs = @_; +# print "in consistency \n" if $printer == 1; +# print "motifs sent = >",join("|",@motifs),"< \n" if $printer == 1; + return $motifs[0] if scalar(@motifs) == 1; + my $prevmotif = shift(@motifs); + my $stopper = 0; + for my $i (0 ... $#motifs){ + next if $motifs[$i] eq "NA"; + my $templet = $motifs[$i].$motifs[$i]; + if ($templet !~ /$prevmotif/i){ + $stopper = 1; last; + } + } + return $prevmotif if $stopper == 0; + return "NULL" if $stopper == 1; +} +sub summarize_microsat{ + my $printer = 1; + my $line = $_[0]; + my $humseq = $_[1]; + + my @gaps = $line =~ /[0-9]+\t[0-9]+\t[\+\-]/g; + my @starts = $line =~ /[0-9]+\t[\+\-]/g; + my @ends = $line =~ /[\+\-]\t[0-9]+/g; +# print "starts = @starts\tends = @ends\n" if $printer == 1; + for my $i (0 ... $#gaps) {$gaps[$i] =~ s/\t[0-9]+\t[\+\-]//g;} + for my $i (0 ... $#starts) {$starts[$i] =~ s/\t[\+\-]//g;} + for my $i (0 ... $#ends) {$ends[$i] =~ s/[\+\-]\t//g;} + + my $minstart = array_smallest_number(@starts); + my $maxend = array_largest_number(@ends); + + my $humupstream_st = substr($humseq, 0, $minstart); + my $humupstream_en = substr($humseq, 0, $maxend); + my $no_of_gaps_to_start = 0; + my $no_of_gaps_to_end = 0; + $no_of_gaps_to_start = ($humupstream_st =~ s/\-/x/g) if $humupstream_st=~/\-/; + $no_of_gaps_to_end = ($humupstream_en =~ s/\-/x/g) if $humupstream_en=~/\-/; + + my $locusmotif = (); +# print "IN SUB SUMMARIZE_MICROSAT $line\n" if $printer == 1; + #return "NULL" if $line =~ /compound/; + my $Hstart = "NA"; + my $Hend = "NA"; + chomp $line; + my $match_count = ($line =~ s/>/>/g); + #print "number of species = $match_count\n"; + my @micros = split(/>/,$line); + shift @micros; + my $stopper = 0; + + + foreach my $mic (@micros){ + my @local = split(/\t/,$mic); + if ($local[$microsatcord] =~ /N/) {$stopper =1; last;} + } + return "NULL" if $stopper ==1; + + #------------------------------------------------------ + + my @arranged = (); + for my $arr (0 ... $#exacttags) {$arranged[$arr] = '0';} + + foreach my $micro (@micros){ + for my $i (0 ... $#exacttags){ + if ($micro =~ /^$exacttags[$i]/){ + $arranged[$i] = $micro; + last; + } + } + } +# print "arranged = @arranged \n" ; <STDIN>;; + + my @endstatement = (); + my $turn = 0; + my $species_counter = 0; + # print scalar(@arranged),"\n"; + + my $species_no=0; + + my $orthHchr = 0; + + foreach my $micro (@arranged) { + $micro =~ s/\t\t/\t \t/g; + $micro =~ s/\t,/\t ,/g; + $micro =~ s/,\t/, \t/g; +# print "------------------------------------------------------------------------------------------\n" if $printer == 1; + chomp $micro; + if ($micro eq '0'){ + push(@endstatement, join("\t",$exacttags[$species_counter],"NA","NA","NA","NA",0 ,"NA", "NA", 0,"NA","NA","NA", "NA" )); + $species_counter++; + # print join("|","ENDSTATEMENT:",@endstatement),"\n" if $printer == 1; + next; + } + # print $micro,"\n"; +# print "micro = $micro \n" if $printer == 1; + my @fields = split(/\t/,$micro); + my $microcopy = $fields[$microsatcord]; + $microcopy =~ s/\[|\]|-//g; + my $microsatlength = length($microcopy); +# print "microsat = $fields[$microsatcord] and microsatlength = $microsatlength\n" if $printer == 1; +# print "sp_ident = @sp_ident.. species_no=$species_no\n"; + $micro =~ /$sp_ident[$species_no]\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)/; +# print "$micro =~ /$sp_ident[$species_no] ([0-9a-zA-Z_]+) ([0-9]+) ([0-9]+)/\n"; + my $sp_chr=$1; + my $sp_start=$2 + $fields[$startcord] - $fields[$gapcord]; + my $sp_end= $sp_start + $microsatlength - 1; + + $species_no++; + + $micro =~ /$focalspec_orig\s(\S+)\s([0-9]+)\s([0-9]+)/; + $orthHchr=$1; + $Hstart=$2+$minstart-$no_of_gaps_to_start; + $Hend=$2+$maxend-$no_of_gaps_to_end; +# print "Hstart = $Hstart = $fields[4] + $fields[$startcord] - $fields[$gapcord]\n" if $printer == 1; + + my $motif = $fields[$motifcord]; + my $firstmotif = (); + my $strand = $fields[$strandcord]; + # print "strand = $strand\n"; + + + if ($motif =~ /^\[/){ + $motif =~ s/^\[//g; + $motif =~ /([a-zA-Z]+)\].*/; + $firstmotif = $1; + } + + else {$firstmotif = $motif;} +# print "firstmotif =$firstmotif : \n" if $printer == 1; + $firstmotif = allCaps($firstmotif); + + if (exists $revHash{$firstmotif} && $turn == 0) { + $turn=1 if $species_counter==0; + $firstmotif = $revHash{$firstmotif}; + } + + elsif (exists $revHash{$firstmotif} && $turn == 1) {$firstmotif = $revHash{$firstmotif}; $turn = 1;} +# print "changed firstmotif =$firstmotif\n" if $printer == 1; + # <STDIN>; + $locusmotif = $firstmotif; + + if (scalar(@fields) > $microsatcord + 2){ +# print "fields = @fields ... interr_poscord=$interr_poscord=$fields[$interr_poscord] .. interrcord=$interrcord=$fields[$interrcord]\n" if $printer == 1; + + my @interposes = (); + @interposes = split(",",$fields[$interr_poscord]) if $fields[$interr_poscord] =~ /,/; + $interposes[0] = $fields[$interr_poscord] if $fields[$interr_poscord] !~ /,/ ; +# print "interposes=@interposes\n" if $printer == 1; + my @relativeposes = (); + my @interruptions = (); + @interruptions = split(",",$fields[$interrcord]) if $fields[$interrcord] =~ /,/; + $interruptions[0] = $fields[$interrcord] if $fields[$interrcord] !~ /,/; + my @interlens = (); + + + for my $i (0 ... $#interposes){ + + my $interpos = $interposes[$i]; + my $nexter = 0; + my $interruption = $interruptions[$i]; + my $interlen = length($interruption); + push (@interlens, $interlen); + + + my $relativepos = (100 * $interpos) / $microsatlength; +# print "relativepos = $relativepos ,interpos=$interpos, interruption=$interruption, interlen=$interlen \n" if $printer == 1; + $relativepos = (100 * ($interpos-$interlen)) / $microsatlength if $relativepos > 50; +# print "--> = $relativepos\n" if $printer == 1; + $interruption = "IND" if length($interruption) < 1; + + if ($turn == 1){ + $fields[$microsatcord] = switch_micro($fields[$microsatcord]); + $interruption = switch_nucl($interruption) unless $interruption eq "IND"; + $interpos = ($microsatlength - $interpos) - $interlen + 2; +# print "turn interpos = $interpos for $fields[$microsatcord]\n" if $printer == 1; + $relativepos = (100 * $interpos) / $microsatlength; + $relativepos = (100 * ($interpos-$interlen)) / $microsatlength if $relativepos > 50; + + + $strand = '+' if $strand eq '-'; + $strand = '-' if $strand eq '+'; + } +# print "final relativepos = $relativepos\n" if $printer == 1; + push(@relativeposes, $relativepos); + } + push(@endstatement,join("\t",($exacttags[$species_counter],$sp_chr, $sp_start, $sp_end, $firstmotif,length($firstmotif),$fields[$microsatcord],$strand,$microsatlength,join(",",@interposes),join(",",@relativeposes),join(",",@interruptions), join(",",@interlens)))); + } + + else{ + push(@endstatement, join("\t",$exacttags[$species_counter],$sp_chr, $sp_start, $sp_end, $firstmotif,length($firstmotif),$fields[$microsatcord],$strand,$microsatlength,"NA","NA","NA", "NA")); + } + + $species_counter++; + } + + $locusmotif = $sameHash{$locusmotif} if exists $sameHash{$locusmotif}; + $locusmotif = $revHash{$locusmotif} if exists $revHash{$locusmotif}; + + my $endst = join("\t", @endstatement, $orthHchr, $Hstart, $Hend); +# print join("\t", @endstatement, $orthHchr, $Hstart, $Hend), "\n" if $printer == 1; + + + return (join("\t", @endstatement, $orthHchr, $Hstart, $Hend), $orthHchr, $Hstart, $Hend, $locusmotif, length($locusmotif)); + +} + +sub switch_nucl{ + my @strand = split(/\s*/,$_[0]); + for my $i (0 ... $#strand){ + if ($strand[$i] =~ /c/i) {$strand[$i] = "G";next;} + if ($strand[$i] =~ /a/i) {$strand[$i] = "T";next;} + if ($strand[$i] =~ /t/i) { $strand[$i] = "A";next;} + if ($strand[$i] =~ /g/i) {$strand[$i] = "C";next;} + } + return join("",@strand); +} + + +sub switch_micro{ + my $micro = reverse($_[0]); + my @strand = split(/\s*/,$micro); + for my $i (0 ... $#strand){ + if ($strand[$i] =~ /c/i) {$strand[$i] = "G";next;} + if ($strand[$i] =~ /a/i) {$strand[$i] = "T";next;} + if ($strand[$i] =~ /t/i) { $strand[$i] = "A";next;} + if ($strand[$i] =~ /g/i) {$strand[$i] = "C";next;} + if ($strand[$i] =~ /\[/i) {$strand[$i] = "]";next;} + if ($strand[$i] =~ /\]/i) {$strand[$i] = "[";next;} + } + return join("",@strand); +} +sub decipher_history{ + my $printer = 0; + my ($mutations_array, $tags_string, $nodes, $branches_hash, $tree_analysis, $confirmation_string, $alivehash) = @_; + my %mutations_hash=(); + foreach my $mutation (@$mutations_array){ +# print "mutation = $mutation\n" if $printer == 1; + my %local = $mutation =~ /([\S ]+)=([\S ]+)/g; + push @{$mutations_hash{$local{"node"}}},$mutation; +# print "just for confirmation: $local{node} pushed as: $mutation\n" if $printer == 1; + } + my @nodes; + my @birth_steps=(); + my @death_steps=(); + + my @tags=split(/\s*/,$tags_string); + my @confirmation=split(/\s+/,$confirmation_string); + my %info=(); + + for my $i (0 ... $#tags){ + $info{$tags[$i]}=$confirmation[$i]; +# print "feeding info: $tags[$i] = $info{$tags[$i]}\n" if $printer == 1; + } + + for my $keys (@$nodes) { + foreach my $key (@$keys){ +# print "current key = $key\n"; + my $copykey = $key; + $copykey =~ s/[\W ]+//g; + my @copykeys=split(/\s*/,$copykey); + my $states=(); + foreach my $copy (@copykeys){ + $states=$states.$info{$copy}; + } +# print "reduced key = $copykey and state = $states\n" if $printer == 1; + + if (exists $mutations_hash{$key}) { + + if ($states=~/\+/){ + push @birth_steps, @{$mutations_hash{$key}}; + $birth_steps[$#birth_steps] =~ s/\S+=//g; + delete $mutations_hash{$key}; + } + else{ + push @death_steps, @{$mutations_hash{$key}}; + $death_steps[$#death_steps] =~ s/\S+=//g; + delete $mutations_hash{$key}; + } + } + } + } +# print "conformation = $confirmation_string\n" if $printer == 1; + push (@birth_steps, "NULL") if scalar(@birth_steps) == 0; + push (@death_steps, "NULL") if scalar(@death_steps) == 0; +# print "birth steps = ",join("\n",@birth_steps)," and death steps = ",join("\n",@death_steps),"\n" if $printer == 1; + return \@birth_steps, \@death_steps; +} + +sub fillAlignmentGaps{ + my $printer = 0; +# print "received: @_\n" if $printer == 1; + my ($tree, $sequences, $alignment, $tagarray, $microsathash, $nonmicrosathash, $motif, $tree_analysis, $threshold, $microsatstarts) = @_; +# print "in fillAlignmentGaps.. tree = $tree \n" if $printer == 1; + my %sequence_hash=(); + + my @phases = (); + my $concat = $motif.$motif; + my $motifsize = length($motif); + + for my $i (1 ... $motifsize){ + push @phases, substr($concat, $i, $motifsize); + } + + my $concatalignment = (); + foreach my $tag (@tags){ + $concatalignment = $concatalignment.$alignment->{$tag}; + } +# print "returningg NULL","NULL","NULL", "NULL\n" if $concatalignment !~ /-/; + return 0, "NULL","NULL","NULL", "NULL","NULL" if $concatalignment !~ /-/; + + + + my %node_sequences_temp=(); + my %node_alignments_temp =(); #NEW, Nov 28 2008 + + my @tags=(); + my @locus_sequences=(); + my %alivehash=(); + +# print "IN fillAlignmentGaps\n";# <STDIN>; + my %fillrecord = (); + + my $change = 0; + foreach my $tag (@$tagarray) { + #print "adding: $tag\n"; + push(@tags, $tag); + if (exists $microsathash->{$tag}){ + my $micro = $microsathash->{$tag}; + my $orig_micro = $micro; + ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases); + $change = 1 if uc($micro) ne uc($orig_micro); + $node_sequences_temp{$tag}=$micro if $microsathash->{$tag} ne "NULL"; + } + if (exists $nonmicrosathash->{$tag}){ + my $micro = $nonmicrosathash->{$tag}; + my $orig_micro = $micro; + ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases); + $change = 1 if uc($micro) ne uc($orig_micro); + $node_sequences_temp{$tag}=$micro if $nonmicrosathash->{$tag} ne "NULL"; + } + + if (exists $alignment->{$tag}){ + my $micro = $alignment->{$tag}; + my $orig_micro = $micro; + ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases); + $change = 1 if uc($micro) ne uc($orig_micro); + $node_alignments_temp{$tag}=$micro if $alignment->{$tag} ne "NULL"; + } + + #print "adding to node_sequences: $tag = ",$node_sequences_temp{$tag},"\n" if $printer == 1; + #print "adding to node_alignments: $tag = ",$node_alignments_temp{$tag},"\n" if $printer == 1; + } + + + my %node_sequences=(); + my %node_alignments =(); #NEW, Nov 28 2008 + foreach my $tag (@$tagarray) { + $node_sequences{$tag} = join ".",split(/\s*/,$node_sequences_temp{$tag}); + $node_alignments{$tag} = join ".",split(/\s*/,$node_alignments_temp{$tag}); + } +# print "\n", "#" x 50, "\n" if $printer == 1; + foreach my $tag (@tags){ +# print "$tag: $alignment->{$tag} = $node_alignments{$tag}\n" if $printer == 1; + } +# print "\n", "#" x 50, "\n" if $printer == 1; +# print "change = $change\n"; + #<STDIN> if $concatalignment=~/\-/; + +# <STDIN> if $printer == 1 && $concatalignment =~ /\-/; + + return 0, "NULL","NULL","NULL", "NULL", "NULL" if $change == 0; + + my ($nodes_arr, $branches_hash) = get_nodes($tree); + my @nodes=@$nodes_arr; +# print "recieved nodes = @nodes\n" if $printer == 1; + + + #POPULATE branches_hash WITH INFORMATION ABOUT LIVESTATUS + foreach my $keys (@nodes){ + my @pair = @$keys; + my $joint = "(".join(", ",@pair).")"; + my $copykey = join "", @pair; + $copykey =~ s/[\W ]+//g; +# print "for node: $keys, copykey = $copykey and joint = $joint\n" if $printer == 1; + my $livestatus = 1; + foreach my $copy (split(/\s*/,$copykey)){ + $livestatus = 0 if !exists $alivehash{$copy}; + } + $alivehash{$joint} = $joint if !exists $alivehash{$joint} && $livestatus == 1; +# print "alivehash = $alivehash{$joint}\n" if exists $alivehash{$joint} && $printer == 1; + } + + + + @nodes = reverse(@nodes); #1 THIS IS IN ORDER TO GO THROUGH THE TREE FROM LEAVES TO ROOT. + + my @mutations_array=(); + + my $joint = (); + foreach my $node (@nodes){ + my @pair = @$node; +# print "now in the nodes for loop, pair = @pair\n and sequences=\n" if $printer == 1; + $joint = "(".join(", ",@pair).")"; +# print "joint = $joint \n" if $printer == 1; + my @pair_sequences=(); + + foreach my $tag (@pair){ +# print "tag = $tag: " if $printer == 1; +# print $node_alignments{$tag},"\n" if $printer == 1; + push @pair_sequences, $node_alignments{$tag}; + } +# print "fillgap\n"; + my ($compared, $substitutions_list) = base_by_base_simple($motif,\@pair_sequences, scalar(@pair_sequences), @pair, $joint); + $node_alignments{$joint}=$compared; + push( @mutations_array,split(/:/,$substitutions_list)); +# print "newly added to node_sequences: $node_alignments{$joint} and list of mutations = @mutations_array\n" if $printer == 1; + } +# print "now sending for analyze_mutations: mutation_array=@mutations_array, nodes=@nodes, branches_hash=$branches_hash, alignment=$alignment, tags=@tags, alivehash=%alivehash, node_sequences=\%node_sequences, microsatstarts=$microsatstarts, motif=$motif\n" if $printer == 1; +# <STDIN> if $printer == 1; + + my $analayzed_mutations = analyze_mutations(\@mutations_array, \@nodes, $branches_hash, $alignment, \@tags, \%alivehash, \%node_sequences, $microsatstarts, $motif); + +# print "returningt: ", $analayzed_mutations, \@nodes,"\n" if scalar @mutations_array > 0;; +# print "returningy: NULL, NULL, NULL " if scalar @mutations_array == 0 && $printer == 1; +# print "final node alignment after filling for $joint= " if $printer == 1; +# print "$node_alignments{$joint}\n" if $printer == 1; + + + return 1, $analayzed_mutations, \@nodes, $branches_hash, \%alivehash, $node_alignments{$joint} if scalar @mutations_array > 0 ; + return 1, "NULL","NULL","NULL", "NULL", "NULL" if scalar @mutations_array == 0; +} + + + +sub add_mutation{ + my $printer = 0; +# print "IN SUBROUTUNE add_mutation.. information received = @_\n" if $printer == 1; + my ($i , $bite, $to, $from) = @_; +# print "bite = $bite.. all received info = ",join("^", @_),"\n" if $printer == 1; +# print "to=$to\n" if $printer == 1; +# print "tis split = ",join(" and ",split(/!/,$to)),"\n" if $printer == 1; + my @toields = split "!",$to; +# print "toilds = @toields\n" if $printer == 1; + my @mutations=(); + + foreach my $toield (@toields){ + my @toinfo=split(":",$toield); +# print " at toinfo=@toinfo \n" if $printer == 1; + next if $toinfo[1] =~ /$from/i; + my @mutation = @toinfo if $toinfo[1] !~ /$from/i; +# print "adding to mutaton list: ", join(",", "node=$mutation[0]","type=substitution" ,"position=$i", "from=$from", "to=$mutation[1]", "insertion=", "deletion="),"\n" if $printer == 1; + push (@mutations, join("\t", "node=$mutation[0]","type=substitution" ,"position=$i", "from=$from", "to=$mutation[1]", "insertion=", "deletion=")); + } + return @mutations; +} + + +sub add_bases{ + + my $printer = 0; +# print "IN SUBROUTUNE add_bases.. information received = @_\n" if $printer == 1; + my ($optional0, $optional1, $pair0, $pair1,$joint) = @_; + my $total_list=(); + + my @total_list0=split(/!/,$optional0); + my @total_list1=split(/!/,$optional1); + my @all_list=(); + my %total_hash0=(); + foreach my $entry (@total_list0) { + $entry = uc $entry; + $entry =~ /(\S+):(\S+)/; + $total_hash0{$2}=$1; + push @all_list, $2; + } + + my %total_hash1=(); + foreach my $entry (@total_list1) { + $entry = uc $entry; + $entry =~ /(\S+):(\S+)/; + $total_hash1{$2}=$1; + push @all_list, $2; + } + + my %alphabetical_hash=(); + my @return_options=(); + + for my $i (0 ... $#all_list){ + my $alph = $all_list[$i]; + if (exists $total_hash0{$alph} && exists $total_hash1{$alph}){ + push(@return_options, $joint.":".$alph); + delete $total_hash0{$alph}; delete $total_hash1{$alph}; + } + if (exists $total_hash0{$alph} && !exists $total_hash1{$alph}){ + push(@return_options, $pair0.":".$alph); + delete $total_hash0{$alph}; + } + if (!exists $total_hash0{$alph} && exists $total_hash1{$alph}){ + push(@return_options, $pair1.":".$alph); + delete $total_hash1{$alph}; + } + + } +# print "returning ",join "!",@return_options,"\n" if $printer == 1; + return join "!",@return_options; + +} + + +sub fillgaps{ +# print "IN fillgaps: @_\n"; + my ($micro, $phasesinput) = @_; + #print "in microsathash ,,.. micro = $micro\n"; + return $micro if $micro !~ /\-/; + my $orig_micro = $micro; + my @phases = @$phasesinput; + + my %tested_patterns = (); + + foreach my $phase (@phases){ + # print "considering phase: $phase\n"; + my @phase_prefixes = (); + my @prephase_left_contexts = (); + my @prephase_right_contexts = (); + my @pregapsize = (); + my @prepostfilins = (); + + my @phase_suffixes; + my @suffphase_left_contexts; + my @suffphase_right_contexts; + my @suffgapsize; + my @suffpostfilins; + + my @postfilins = (); + my $motifsize = length($phases[0]); + + my $change = 0; + + for my $u (0 ... $motifsize-1){ + my $concat = $phase.$phase.$phase.$phase; + my @concatarr = split(/\s*/, $concat); + my $l = 0; + while ($l < $u){ + shift @concatarr; + $l++; + } + $concat = join ("", @concatarr); + + for my $t (0 ... $motifsize-1){ + for my $k (1 ... $motifsize-1){ + push @phase_prefixes, substr($concat, $motifsize+$t, $k); + push @prephase_left_contexts, substr ($concat, $t, $motifsize); + push @prephase_right_contexts, substr ($concat, $motifsize+$t+$k+($motifsize-$k), 1); + push @pregapsize, $k; + push @prepostfilins, substr($concat, $motifsize+$t+$k, ($motifsize-$k)); + # print "reading: $concat, t=$t, k=$k prefix: $prephase_left_contexts[$#prephase_left_contexts] $phase_prefixes[$#phase_prefixes] -x$pregapsize[$#pregapsize] $prephase_right_contexts[$#prephase_right_contexts]\n"; + # print "phase_prefixes = $phase_prefixes[$#phase_prefixes]\n"; + # print "prephase_left_contexts = $prephase_left_contexts[$#prephase_left_contexts]\n"; + # print "prephase_right_contexts = $prephase_right_contexts[$#prephase_right_contexts]\n"; + # print "pregapsize = $pregapsize[$#pregapsize]\n"; + # print "prepostfilins = $prepostfilins[$#prepostfilins]\n"; + } + } + } + + # print "looking if $micro =~ /($phase\-{$motifsize})/i || $micro =~ /^(\-{$motifsize,}$phase)/i\n"; + if ($micro =~ /($phase\-{$motifsize,})$/i || $micro =~ /^(\-{$motifsize,}$phase)/i){ + # print "micro: $micro needs further gap removal: $1\n"; + while ($micro =~ /$phase(\-{$motifsize,})$/i || $micro =~ /^(\-{$motifsize,})$phase/i){ + # print "micro: $micro needs further gap removal: $1\n"; + + # print "phase being considered = $phase\n"; + my $num = (); + $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi if $micro =~ /$phase\-{$motifsize,}/i; + $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi if $micro =~ /\-{$motifsize,}$phase/i; + # print "num = $num\n"; + $change = 1 if $num == 1; + } + } + + elsif ($micro =~ /(($phase)+)\-{$motifsize,}(($phase)+)/i){ + while ($micro =~ /(($phase)+)\-{$motifsize,}(($phase)+)/i){ + # print "checking lengths of $1 and $3 for $micro... \n"; + my $num = (); + if (length($1) >= length($3)){ + # print "$micro matches (($phase)+)\-{$motifsize,}(($phase)+) = $1, >= , $3 \n"; + $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi ; + } + if (length($1) < length($3)){ + # print "$micro matches (($phase)+)\-{$motifsize,}(($phase)+) = $1, < , $3 \n"; + $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi ; + } + # print "micro changed to $micro\n"; + } + } + elsif ($micro =~ /([A-Z]+)\-{$motifsize,}(($phase)+)/i){ + while ($micro =~ /([A-Z]+)\-{$motifsize,}(($phase)+)/i){ + # print "$micro matches ([A-Z]+)\-{$motifsize}(($phase)+) = 1=$1, - , 3=$3 \n"; + my $num = 0; + $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi ; + } + } + elsif ($micro =~ /(($phase)+)\-{$motifsize,}([A-Z]+)/i){ + while ($micro =~ /(($phase)+)\-{$motifsize,}([A-Z]+)/i){ + # print "$micro matches (($phase)+)\-{$motifsize,}([A-Z]+) = 1=$1, - , 3=$3 \n"; + my $num = 0; + $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi ; + } + } + + # print "$orig_micro to $micro\n"; + + #s <STDIN>; + + for my $h (0 ... $#phase_prefixes){ + # print "searching using prefix : $prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]\n"; + my $pattern = $prephase_left_contexts[$h].$phase_prefixes[$h].$pregapsize[$h].$prephase_right_contexts[$h]; + # print "returning orig_micro = $orig_micro, micro = $micro \n" if exists $tested_patterns{$pattern}; + if ($micro =~ /$prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]/i){ + return $orig_micro if exists $tested_patterns{$pattern}; + while ($micro =~ /($prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h])/i){ + $tested_patterns{$pattern} = $pattern; + # print "micro: $micro needs further gap removal: $1\n"; + + # print "prefix being considered = $phase_prefixes[$h]\n"; + my $num = (); + $num = ($micro =~ s/$prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]/$prephase_left_contexts[$h]$phase_prefixes[$h]$prepostfilins[$h]$prephase_right_contexts[$h]/gi) ; + # print "num = $num, micro = $micro\n"; + $change = 1 if $num == 1; + + return $orig_micro if $num > 1; + } + } + + } + } + return $orig_micro if length($micro) != length($orig_micro); + return $micro; +} + +sub selectMutationArray{ + my $printer =0; + + my $oldmutspt = $_[0]; + my $newmutspt = $_[1]; + my $tagstringpt = $_[2]; + my $alivehashpt = $_[3]; + my $alignmentpt = $_[4]; + my $motif = $_[5]; + + my @alivehasharr=(); + + my @tags = @$tagstringpt; + my $alignmentln = length($alignmentpt->{$tags[0]}); + + foreach my $key (keys %$alivehashpt) { push @alivehasharr, $key; } + + my %newside = (); + my %oldside = (); + my %newmuts = (); + + my %commons = (); + my %olds = (); + foreach my $old (@$oldmutspt){ + $olds{$old} = 1; + } + foreach my $new (@$newmutspt){ + $commons{$new} = 1 if exists $olds{$new};; + } + + + foreach my $pos ( 0 ... $alignmentln){ + #print "pos = $pos\n" if $printer == 1; + my $newyes = 0; + foreach my $mut (@$newmutspt){ + $newmuts{$mut} = 1; + chomp $mut; + $newyes++; + $mut =~ s/=\t/= \t/g; + $mut =~ s/=$/= /g; + + $mut =~ /node=([A-Z\(\), ]+)\stype=([a-zA-Z ]+)\sposition=([0-9 ]+)\sfrom=([a-zA-Z\- ]+)\sto=([a-zA-Z\- ]+)\sinsertion=([a-zA-Z\- ]+)\sdeletion=([a-zA-Z\- ]+)/; + my $node = $1; + next if $3 != $pos; +# print "new mut = $mut\n" if $printer == 1; +# print "node = $node, pos = $3 ... and alivehasharr = >@alivehasharr<\n" if $printer == 1; + my $alivenode = 0; + foreach my $key (@alivehasharr){ + $alivenode = 1 if $key =~ /$node/; + } + # next if $alivenode == 0; + my $indel_type = " "; + if ($2 eq "insertion" || $2 eq "deletion"){ + my $thisindel = (); + $thisindel = $6 if $2 eq "insertion"; + $thisindel = $7 if $2 eq "deletion"; + + $indel_type = "i".checkIndelType($node, $thisindel, $motif,$alignmentpt,$3, $2) if $2 eq "insertion"; + $indel_type = "d".checkIndelType($node, $thisindel, $motif,$alignmentpt, $3, $2) if $2 eq "deletion"; + $indel_type = $indel_type."f" if $indel_type =~ /mot/ && length($thisindel) >= length($motif); + } +# print "indeltype = $indel_type\n" if $printer == 1; + my $added = 0; + + if (exists $newside{$pos} && $indel_type =~ /[a-z]+/){ +# print "we have a preexisting one for $pos\n" if $printer == 1; + my @preexisting = @{$newside{$pos}}; + foreach my $pre (@preexisting){ +# print "looking at $pre\n" if $printer == 1; + next if $pre !~ /node=$node/; + next if $pre !~ /indeltype=([a-z]+)/; + my $currtype = $1; + + if ($currtype =~ /inon/ && $indel_type =~ /dmot/){ + delete $newside{$pos}; + push @{$newside{$pos}}, $pre; + $added = 1; + } + if ($currtype =~ /dnon/ && $indel_type =~ /imot/){ + delete $newside{$pos}; + push @{$newside{$pos}}, $pre; + $added = 1; + } + if ($currtype =~ /dmot/ && $indel_type =~ /inon/){ + delete $newside{$pos}; + push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; + $added = 1; + } + if ($currtype =~ /imot/ && $indel_type =~ /dnon/){ + delete $newside{$pos}; + push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; + $added = 1; + } + } + } +# print "added = $added\n" if $printer == 1; + push @{$newside{$pos}}, $mut."\tindeltype=$indel_type" if $added == 0; +# print "for new pos,: $pos we have: @{$newside{$pos}}\n " if $printer == 1; + } + } + + foreach my $pos ( 0 ... $alignmentln){ + my $oldyes = 0; + foreach my $mut (@$oldmutspt){ + chomp $mut; + $oldyes++; + $mut =~ s/=\t/= \t/g; + $mut =~ s/=$/= /g; + $mut =~ /node=([A-Z\(\), ]+)\ttype=([a-zA-Z ]+)\tposition=([0-9 ]+)\tfrom=([a-zA-Z\- ]+)\tto=([a-zA-Z\- ]+)\tinsertion=([a-zA-Z\- ]+)\tdeletion=([a-zA-Z\- ]+)/; + my $node = $1; + next if $3 != $pos; +# print "old mut = $mut\n" if $printer == 1; + my $alivenode = 0; + foreach my $key (@alivehasharr){ + $alivenode = 1 if $key =~ /$node/; + } + #next if $alivenode == 0; + my $indel_type = " "; + if ($2 eq "insertion" || $2 eq "deletion"){ + $indel_type = "i".checkIndelType($node, $6, $motif,$alignmentpt, $3, $2) if $2 eq "insertion"; + $indel_type = "d".checkIndelType($node, $7, $motif,$alignmentpt, $3, $2) if $2 eq "deletion"; + next if $indel_type =~/non/; + } + else{ next;} + + my $imp=0; + $imp = 1 if $indel_type =~ /dmot/ && $alivenode == 0; + $imp = 1 if $indel_type =~ /imot/ && $alivenode == 1; + + + if (exists $newside{$pos} && $indel_type =~ /[a-z]+/){ + my @preexisting = @{$newside{$pos}}; +# print "we have a preexisting one for $pos: @preexisting\n" if $printer == 1; + next if $imp == 0; + + if (scalar(@preexisting) == 1){ + my $foundmut = $preexisting[0]; + $foundmut=~ /node=([A-Z, \(\)]+)/; + next if $1 eq $node; + + if (exists $oldside{$pos} || exists $commons{$foundmut}){ +# print "not replacing, but just adding\n" if $printer == 1; + push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; + push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type"; + next; + } + + delete $newside{$pos}; + push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type"; + push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; +# print "now new one is : @{$newside{$pos}}\n" if $printer == 1; + } +# print "for pos: $pos: @{$newside{$pos}}\n" if $printer == 1; + next; + } + + + my @news = @{$newside{$pos}} if exists $newside{$pos}; +# print "mut = $mut and news = @news\n" if $printer == 1; + push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type"; + push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; + } + } +# print "in the end, our collected mutations = \n" if $printer == 1; + my @returnarr = (); + foreach my $key (keys %newside) {push @returnarr,@{$newside{$key}};} +# print join("\n", @returnarr),"\n" if $printer == 1; + #<STDIN>; + return @returnarr; + +} + + +sub checkIndelType{ + my $printer = 0; + my $node = $_[0]; + my $indel = $_[1]; + my $motif = $_[2]; + my $alignmentpt = $_[3]; + my $posit = $_[4]; + my $type = $_[5]; + my @phases =(); + my %prephases = (); + my %postphases = (); + #print "motif = $motif\n"; +# print "IN checkIndelType ... received: @_\n" if $printer == 1; + my $concat = $motif.$motif.$motif.$motif; + my $motiflength = length($motif); + + if ($motiflength > length ($indel)){ + return "non" if $motif !~ /$indel/i; + return checkIndelType_ComplexAnalysis($node, $indel, $motif, $alignmentpt, $posit, $type); + } + + my $firstpass = 0; + for my $y (0 ... $motiflength-1){ + my $phase = substr($concat, $motiflength+$y, $motiflength); + push @phases, $phase; + $firstpass = 1 if $indel =~ /$phase/i; + for my $k (0 ... length($motif)-1){ +# print "at: motiflength=$motiflength , y=$y , k=$k.. for pre: $motiflength+$y-$k and post: $motiflength+$y-$k+$motiflength in $concat\n" if $printer == 1; + my $pre = substr($concat, $motiflength+$y-$k, $k ); + my $post = substr($concat, $motiflength+$y+$motiflength, $k); +# print "adding to phases : $phase - $pre and $post\n" if $printer == 1; + push @{$prephases{$phase}} , $pre; + push @{$postphases{$phase}} , $post; + } + + } +# print "firstpass 1= $firstpass\n" if $printer == 1; + return "non" if $firstpass ==0; + $firstpass =0; + + foreach my $phase (@phases){ + my @pres = @{$prephases{$phase}}; + my @posts = @{$postphases{$phase}}; + + foreach my $pre (@pres){ + foreach my $post (@posts){ + + $firstpass = 1 if $indel =~ /($pre)?($phase)+($post)?/i && length($indel) > (3 * length($motif)); + $firstpass = 1 if $indel =~ /^($pre)?($phase)+($post)?$/i && length($indel) < (3 * length($motif)); +# print "matched here : ($pre)?($phase)+($post)?\n" if $printer == 1; + last if $firstpass == 1; + } + last if $firstpass == 1; + } + last if $firstpass == 1; + } +# print "firstpass 2= $firstpass\n" if $printer == 1; + return "non" if $firstpass ==0; + return "mot" if $firstpass ==1; +} + + +sub checkIndelType_ComplexAnalysis{ + my $printer = 0; + my $node = $_[0]; + my $indel = $_[1]; + my $motif = $_[2]; + my $alignmentpt = $_[3]; + my $pos = $_[4]; + my $type = $_[5]; + my @speciesinvolved = $node =~ /[A-Z]+/g; + + my @seqs = (); + my $residualseq = length($motif) - length($indel); +# print "IN COMPLEX ANALYSIS ... received: @_ .... speciesinvolved = @speciesinvolved\n" if $printer == 1; +# print "we have position = $pos, sseq = $alignmentpt->{$speciesinvolved[0]}\n" if $printer == 1; +# print "residualseq = $residualseq\n" if $printer == 1; +# print "pos=$pos... got: @_\n" if $printer == 1; + foreach my $sp (@speciesinvolved){ + my $spseq = $alignmentpt->{$sp}; + #print "orig spseq = $spseq\n"; + my $subseq = (); + + if ($type eq "deletion"){ + my @indelparts = split(/\s*/,$indel); + my @seqparts = split(/\s*/,$spseq); + + for my $p ($pos ... $pos+length($indel)-1){ + $seqparts[$p] = shift @indelparts; + } + $spseq = join("",@seqparts); + } + #print "mod spseq = $spseq\n"; + # $spseq=~ s/\-//g if $type !~ /deletion/; +# print "substr($spseq, $pos-($residualseq), length($indel)+$residualseq+$residualseq)\n" if $pos > 0 && $pos < (length($spseq) - length($motif)) && $printer == 1; +# print "substr($spseq, 0, length($indel)+$residualseq)\n" if $pos == 0 && $printer == 1; +# print "substr($spseq, $pos - $residualseq, length($indel)+$residualseq)\n" if $pos >= (length($spseq) - length($motif)) && $printer == 1; + + $subseq = substr($spseq, $pos-($residualseq), length($indel)+$residualseq+$residualseq) if $pos > 0 && $pos < (length($spseq) - length($motif)) ; + $subseq = substr($spseq, 0, length($indel)+$residualseq) if $pos == 0; + $subseq = substr($spseq, $pos - $residualseq, length($indel)+$residualseq) if $pos >= (length($spseq) - length($motif)) ; +# print "spseq = $spseq . subseq=$subseq . type = $type\n" if $printer == 1; + #<STDIN> if $subseq !~ /[a-z\-]/i; + $subseq =~ s/\-/$indel/g if $type =~ /insertion/; + push @seqs, $subseq; +# print "seqs = @seqs\n" if $printer == 1; + } + return "non" if checkIfSeqsIdentical(@seqs) eq "NO"; +# print "checking for $seqs[0] \n" if $printer == 1; + + my @phases =(); + my %prephases = (); + my %postphases = (); + my $concat = $motif.$motif.$motif.$motif; + my $motiflength = length($motif); + + my $firstpass = 0; + + for my $y (0 ... $motiflength-1){ + my $phase = substr($concat, $motiflength+$y, $motiflength); + push @phases, $phase; + $firstpass = 1 if $seqs[0] =~ /$phase/i; + for my $k (0 ... length($motif)-1){ + my $pre = substr($concat, $motiflength+$y-$k, $k ); + my $post = substr($concat, $motiflength+$y+$motiflength, $k); +# print "adding to phases : $phase - $pre and $post\n" if $printer == 1; + push @{$prephases{$phase}} , $pre; + push @{$postphases{$phase}} , $post; + } + + } +# print "firstpass 1= $firstpass.. also, res-d = ",(length($seqs[0]))%(length($motif)),"\n" if $printer == 1; + return "non" if $firstpass ==0; + $firstpass =0; + foreach my $phase (@phases){ + + $firstpass = 1 if $seqs[0] =~ /^($phase)+$/i && ((length($seqs[0]))%(length($motif))) == 0; + + if (((length($seqs[0]))%(length($motif))) != 0){ + my @pres = @{$prephases{$phase}}; + my @posts = @{$postphases{$phase}}; + foreach my $pre (@pres){ + foreach my $post (@posts){ + next if $pre !~ /\S/ && $post !~ /\S/; + $firstpass = 1 if ($seqs[0] =~ /^($pre)($phase)+($post)$/i || $seqs[0] =~ /^($pre)($phase)+$/i || $seqs[0] =~ /^($phase)+($post)$/i); +# print "caught with $pre $phase $post\n" if $printer == 1; + last if $firstpass == 1; + } + last if $firstpass == 1; + } + } + + last if $firstpass == 1; + } + + #print "indel = $indel.. motif = $motif.. firstpass 2= mot\n" if $firstpass ==1; + #print "indel = $indel.. motif = $motif.. firstpass 2= non\n" if $firstpass ==0; + #<STDIN>;# if $firstpass ==1; + return "non" if $firstpass ==0; + return "mot" if $firstpass ==1; + +} + +sub checkIfSeqsIdentical{ + my @seqs = @_; + my $identical = 1; + + for my $j (1 ... $#seqs){ + $identical = 0 if uc($seqs[0]) ne uc($seqs[$j]); + } + return "NO" if $identical == 0; + return "YES" if $identical == 1; + +} + +sub summarizeMutations{ + my $mutspt = $_[0]; + my @muts = @$mutspt; + my $tree = $_[1]; + + my @returnarr = (); + + for (1 ... 38){ + push @returnarr, "NA"; + } + push @returnarr, "NULL"; + return @returnarr if $tree eq "NULL" || scalar(@muts) < 1; + + + my @bspecies = (); + my @dspecies = (); + my $treecopy = $tree; + $treecopy =~ s/[\(\)]//g; + my @treeparts = split(/[\.,]+/, $treecopy); + + for my $part (@treeparts){ + if ($part =~ /\+/){ + $part =~ s/\+//g; + #my @sp = split(/\s*/, $part); + #foreach my $p (@sp) {push @bspecies, $p;} + push @bspecies, $part; + } + if ($part =~ /\-/){ + $part =~ s/\-//g; + #my @sp = split(/\s*/, $part); + #foreach my $p (@sp) {push @dspecies, $p;} + push @dspecies, $part; + } + + } + #print "-------------------------------------------------------\n"; + + my ($insertions, $deletions, $motinsertions, $motinsertionsf, $motdeletions, $motdeletionsf, $noninsertions, $nondeletions) = (0,0,0,0,0,0,0,0); + my ($binsertions, $bdeletions, $bmotinsertions,$bmotinsertionsf, $bmotdeletions, $bmotdeletionsf, $bnoninsertions, $bnondeletions) = (0,0,0,0,0,0,0,0); + my ($dinsertions, $ddeletions, $dmotinsertions,$dmotinsertionsf, $dmotdeletions, $dmotdeletionsf, $dnoninsertions, $dnondeletions) = (0,0,0,0,0,0,0,0); + my ($ninsertions, $ndeletions, $nmotinsertions,$nmotinsertionsf, $nmotdeletions, $nmotdeletionsf, $nnoninsertions, $nnondeletions) = (0,0,0,0,0,0,0,0); + my ($substitutions, $bsubstitutions, $dsubstitutions, $nsubstitutions, $indels, $subs) = (0,0,0,0,"NA","NA"); + + my @insertionsarr = (" "); + my @deletionsarr = (" "); + + my @substitutionsarr = (" "); + + + foreach my $mut (@muts){ + # print "mut = $mut\n"; + chomp $mut; + $mut =~ s/=\t/= /g; + $mut =~ s/=$/= /g; + my %mhash = (); + my @mields = split(/\t/,$mut); + + foreach my $m (@mields){ + my @fields = split(/=/,$m); + next if $fields[1] eq " "; + $mhash{$fields[0]} = $fields[1]; + } + + my $myutype = (); + my $decided = 0; + + my $localnode = $mhash{"node"}; + $localnode =~ s/[\(\)\. ,]//g; + + + foreach my $s (@bspecies){ + if ($localnode eq $s) { + $decided = 1; $myutype = "b"; + } + } + + foreach my $s (@dspecies){ + if ($localnode eq $s) { + $decided = 1; $myutype = "d"; + } + } + + $myutype = "n" if $decided != 1; + + + # print "tree=$tree, birth species=@bspecies, death species=@dspecies, node=$mhash{node} .. myutype=$myutype .. \n"; + # <STDIN> if $mhash{"type"} eq "insertion" && $myutype eq "b"; + + + if ($mhash{"type"} eq "substitution"){ + $substitutions++; + $bsubstitutions++ if $myutype eq "b"; + $dsubstitutions++ if $myutype eq "d"; + $nsubstitutions++ if $myutype eq "n"; + # print "substitution: from= $mhash{from}, to = $mhash{to}, and type = myutype\n"; + push @substitutionsarr, "$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "b"; + push @substitutionsarr, "$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "d"; + push @substitutionsarr, "$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "n"; + # print "substitutionsarr = @substitutionsarr\n"; + # <STDIN>; + } + else{ + #print "tree=$tree, birth species=@bspecies, death species=@dspecies, node=$mhash{node} .. myutype=$myutype .. indeltype=$mhash{indeltype}\n"; + if ($mhash{"type"} eq "deletion"){ + $deletions++; + + $motdeletions++ if $mhash{"indeltype"} =~ /dmot/; + $motdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/; + + $nondeletions++ if $mhash{"indeltype"} =~ /dnon/; + + $bdeletions++ if $myutype eq "b"; + $ddeletions++ if $myutype eq "d"; + $ndeletions++ if $myutype eq "n"; + + $bmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "b"; + $bmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "b"; + $bnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "b"; + + $dmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "d"; + $dmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "d"; + $dnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "d"; + + $nmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "n"; + $nmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "n"; + $nnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "n"; + + push @deletionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "b"; + push @deletionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "d"; + push @deletionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "n"; + } + + if ($mhash{"type"} eq "insertion"){ + $insertions++; + + $motinsertions++ if $mhash{"indeltype"} =~ /imot/; + $motinsertionsf++ if $mhash{"indeltype"} =~ /imotf/; + $noninsertions++ if $mhash{"indeltype"} =~ /inon/; + + $binsertions++ if $myutype eq "b"; + $dinsertions++ if $myutype eq "d"; + $ninsertions++ if $myutype eq "n"; + + $bmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "b"; + $bmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "b"; + $bnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "b"; + + $dmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "d"; + $dmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "d"; + $dnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "d"; + + $nmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "n"; + $nmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "n"; + $nnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "n"; + + push @insertionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "b"; + push @insertionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "d"; + push @insertionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "n"; + + } + } + } + + + + $indels = "ins=".join(",",@insertionsarr).";dels=".join(",",@deletionsarr) if scalar(@insertionsarr) > 1 || scalar(@deletionsarr) > 1 ; + $subs = join(",",@substitutionsarr) if scalar(@substitutionsarr) > 1; + $indels =~ s/ //g; + $subs =~ s/ //g ; + + #print "indels = $indels, subs=$subs\n"; + ##<STDIN> if $indels =~ /[a-zA-Z0-9]/ || $subs =~ /[a-zA-Z0-9]/ ; + #print "tree = $tree, indels = $indels, subs = $subs, bspecies = @bspecies, dspecies = @dspecies \n"; + my @returnarray = (); + + push (@returnarray, $indels, $subs) ; + + push @returnarray, $tree; + + my @copy = @returnarray; + #print "\n\nreturnarray = @returnarray ... binsertions=$binsertions dinsertions=$dinsertions bsubstitutions=$bsubstitutions dsubstitutions=$dsubstitutions\n"; + #<STDIN>; + return (@returnarray); + +} + +sub selectBetterTree{ + my $printer = 1; + my $treestudy = $_[0]; + my $alt = $_[1]; + my $mutspt = $_[2]; + my @muts = @$mutspt; + my @trees = (); my @alternatetrees=(); + + @trees = split(/\|/,$treestudy) if $treestudy =~ /\|/; + @alternatetrees = split(/[\|;]/,$alt) if $alt =~ /[\|;\(\)]/; + + $trees[0] = $treestudy if $treestudy !~ /\|/; + $alternatetrees[0] = $alt if $alt !~ /[\|;\(\)]/; + + my @alltrees = (@trees, @alternatetrees); +# push(@alltrees,@alternatetrees); + + my %mutspecies = (); +# print "IN selectBetterTree..treestudy=$treestudy. alt=$alt. for: @_. trees=@trees<. alternatetrees=@alternatetrees\n" if $printer == 1; + #<STDIN>; + foreach my $mut (@muts){ +# print colored ['green'],"mut = $mut\n" if $printer == 1; + $mut =~ /node=([A-Z,\(\) ]+)/; + my $node = $1; + $node =~s/[,\(\) ]+//g; + my @indivspecies = $node =~ /[A-Z]+/g; + #print "adding node: $node\n" if $printer == 1; + $mutspecies{$node} = $node; + + } + + my @treerecords = (); + my $treecount = -1; + foreach my $tree (@alltrees){ +# print "checking with tree $tree\n" if $printer == 1; + $treecount++; + $treerecords[$treecount] = 0; + my @indivspecies = ($tree =~ /[A-Z]+/g); +# print "indivspecies=@indivspecies\n" if $printer == 1; + foreach my $species (@indivspecies){ +# print "checkin if exists species: $species\n" if $printer == 1; + $treerecords[$treecount]+=2 if exists $mutspecies{$species} && $mutspecies{$species} !~ /indeltype=[a-z]mot/; + $treerecords[$treecount]+=1.5 if exists $mutspecies{$species} && $mutspecies{$species} =~ /indeltype=[a-z]mot/; + $treerecords[$treecount]-- if !exists $mutspecies{$species}; + } +# print "for tree $tree, our treecount = $treerecords[$treecount]\n" if $printer == 1; + } + + my @best_tree = array_largest_number_arrayPosition(@treerecords); +# print "treerecords = @treerecords. hence, best tree = @best_tree = $alltrees[$best_tree[0]], $treerecords[$best_tree[0]]\n" if $printer == 1; + #<STDIN>; + return ($alltrees[$best_tree[0]], $treerecords[$best_tree[0]]) if scalar(@best_tree) == 1; +# print "best_tree[0] = $best_tree[0], and treerecords = $treerecords[$best_tree[0]]\n" if $printer == 1; + return ("NULL", -1) if $treerecords[$best_tree[0]] < 1; + my $rando = int(rand($#trees)); + return ($alltrees[$rando], $treerecords[$rando]) if scalar(@best_tree) > 1; + +} + + + + +sub load_sameHash{ + #my $g = %$_[0]; + $sameHash{"CAGT"}="AGTC"; + $sameHash{"ATGA"}="AATG"; + $sameHash{"CAAC"}="AACC"; + $sameHash{"GGAA"}="AAGG"; + $sameHash{"TAAG"}="AAGT"; + $sameHash{"CGAG"}="AGCG"; + $sameHash{"TAGG"}="AGGT"; + $sameHash{"GCAG"}="AGGC"; + $sameHash{"TAGA"}="ATAG"; + $sameHash{"TGA"}="ATG"; + $sameHash{"CAAG"}="AAGC"; + $sameHash{"CTAA"}="AACT"; + $sameHash{"CAAT"}="AATC"; + $sameHash{"GTAG"}="AGGT"; + $sameHash{"GAAG"}="AAGG"; + $sameHash{"CGA"}="ACG"; + $sameHash{"GTAA"}="AAGT"; + $sameHash{"ACAA"}="AAAC"; + $sameHash{"GCGG"}="GGGC"; + $sameHash{"ATCA"}="AATC"; + $sameHash{"TAAC"}="AACT"; + $sameHash{"GGCA"}="AGGC"; + $sameHash{"TGAG"}="AGTG"; + $sameHash{"AACA"}="AAAC"; + $sameHash{"GAGC"}="AGCG"; + $sameHash{"ACCA"}="AACC"; + $sameHash{"TGAA"}="AATG"; + $sameHash{"ACA"}="AAC"; + $sameHash{"GAAC"}="AACG"; + $sameHash{"GCA"}="AGC"; + $sameHash{"CCAC"}="ACCC"; + $sameHash{"CATA"}="ATAC"; + $sameHash{"CAC"}="ACC"; + $sameHash{"TACA"}="ATAC"; + $sameHash{"GGAC"}="ACGG"; + $sameHash{"AGA"}="AAG"; + $sameHash{"ATAA"}="AAAT"; + $sameHash{"CA"}="AC"; + $sameHash{"CCCA"}="ACCC"; + $sameHash{"TCAA"}="AATC"; + $sameHash{"CAGA"}="AGAC"; + $sameHash{"AATA"}="AAAT"; + $sameHash{"CCA"}="ACC"; + $sameHash{"AGAA"}="AAAG"; + $sameHash{"AGTA"}="AAGT"; + $sameHash{"GACG"}="ACGG"; + $sameHash{"TCAG"}="AGTC"; + $sameHash{"ACGA"}="AACG"; + $sameHash{"CGCA"}="ACGC"; + $sameHash{"GAGT"}="AGTG"; + $sameHash{"GA"}="AG"; + $sameHash{"TA"}="AT"; + $sameHash{"TAA"}="AAT"; + $sameHash{"CAG"}="AGC"; + $sameHash{"GATA"}="ATAG"; + $sameHash{"GTA"}="AGT"; + $sameHash{"CCAA"}="AACC"; + $sameHash{"TAG"}="AGT"; + $sameHash{"CAAA"}="AAAC"; + $sameHash{"AAGA"}="AAAG"; + $sameHash{"CACG"}="ACGC"; + $sameHash{"GTCA"}="AGTC"; + $sameHash{"GGA"}="AGG"; + $sameHash{"GGAT"}="ATGG"; + $sameHash{"CGGG"}="GGGC"; + $sameHash{"CGGA"}="ACGG"; + $sameHash{"AGGA"}="AAGG"; + $sameHash{"TAAA"}="AAAT"; + $sameHash{"GAGA"}="AGAG"; + $sameHash{"ACTA"}="AACT"; + $sameHash{"GCGA"}="AGCG"; + $sameHash{"CACA"}="ACAC"; + $sameHash{"AGAT"}="ATAG"; + $sameHash{"GAGG"}="AGGG"; + $sameHash{"CGAC"}="ACCG"; + $sameHash{"GGAG"}="AGGG"; + $sameHash{"GCCA"}="AGCC"; + $sameHash{"CCAG"}="AGCC"; + $sameHash{"GAAA"}="AAAG"; + $sameHash{"CAGG"}="AGGC"; + $sameHash{"GAC"}="ACG"; + $sameHash{"CAA"}="AAC"; + $sameHash{"GACC"}="ACCG"; + $sameHash{"GGCG"}="GGGC"; + $sameHash{"GGTA"}="AGGT"; + $sameHash{"AGCA"}="AAGC"; + $sameHash{"GATG"}="ATGG"; + $sameHash{"GTGA"}="AGTG"; + $sameHash{"ACAG"}="AGAC"; + $sameHash{"CGG"}="GGC"; + $sameHash{"ATA"}="AAT"; + $sameHash{"GACA"}="AGAC"; + $sameHash{"GCAA"}="AAGC"; + $sameHash{"CAGC"}="AGCC"; + $sameHash{"GGGA"}="AGGG"; + $sameHash{"GAG"}="AGG"; + $sameHash{"ACAT"}="ATAC"; + $sameHash{"GAAT"}="AATG"; + $sameHash{"CACC"}="ACCC"; + $sameHash{"GAT"}="ATG"; + $sameHash{"GCG"}="GGC"; + $sameHash{"GCAC"}="ACGC"; + $sameHash{"GAA"}="AAG"; + $sameHash{"TGGA"}="ATGG"; + $sameHash{"CCGA"}="ACCG"; + $sameHash{"CGAA"}="AACG"; +} + + + +sub load_revHash{ + $revHash{"CTGA"}="AGTC"; + $revHash{"TCTT"}="AAAG"; + $revHash{"CTAG"}="AGCT"; + $revHash{"GGTG"}="ACCC"; + $revHash{"GCC"}="GGC"; + $revHash{"GCTT"}="AAGC"; + $revHash{"GCGT"}="ACGC"; + $revHash{"GTTG"}="AACC"; + $revHash{"CTCC"}="AGGG"; + $revHash{"ATC"}="ATG"; + $revHash{"CGAT"}="ATCG"; + $revHash{"TTAA"}="AATT"; + $revHash{"GTTC"}="AACG"; + $revHash{"CTGC"}="AGGC"; + $revHash{"TCGA"}="ATCG"; + $revHash{"ATCT"}="ATAG"; + $revHash{"GGTT"}="AACC"; + $revHash{"CTTA"}="AAGT"; + $revHash{"TGGC"}="AGCC"; + $revHash{"CCG"}="GGC"; + $revHash{"CGGC"}="GGCC"; + $revHash{"TTAG"}="AACT"; + $revHash{"GTG"}="ACC"; + $revHash{"CTTT"}="AAAG"; + $revHash{"TGCA"}="ATGC"; + $revHash{"CGCT"}="AGCG"; + $revHash{"TTCC"}="AAGG"; + $revHash{"CT"}="AG"; + $revHash{"C"}="G"; + $revHash{"CTCT"}="AGAG"; + $revHash{"ACTT"}="AAGT"; + $revHash{"GGTC"}="ACCG"; + $revHash{"ATTC"}="AATG"; + $revHash{"GGGT"}="ACCC"; + $revHash{"CCTA"}="AGGT"; + $revHash{"CGCG"}="GCGC"; + $revHash{"GTGT"}="ACAC"; + $revHash{"GCCC"}="GGGC"; + $revHash{"GTCG"}="ACCG"; + $revHash{"TCCC"}="AGGG"; + $revHash{"TTCA"}="AATG"; + $revHash{"AGTT"}="AACT"; + $revHash{"CCCT"}="AGGG"; + $revHash{"CCGC"}="GGGC"; + $revHash{"CTT"}="AAG"; + $revHash{"TTGG"}="AACC"; + $revHash{"ATT"}="AAT"; + $revHash{"TAGC"}="AGCT"; + $revHash{"ACTG"}="AGTC"; + $revHash{"TCAC"}="AGTG"; + $revHash{"CTGT"}="AGAC"; + $revHash{"TGTG"}="ACAC"; + $revHash{"ATCC"}="ATGG"; + $revHash{"GTGG"}="ACCC"; + $revHash{"TGGG"}="ACCC"; + $revHash{"TCGG"}="ACCG"; + $revHash{"CGGT"}="ACCG"; + $revHash{"GCTC"}="AGCG"; + $revHash{"TACG"}="ACGT"; + $revHash{"GTTT"}="AAAC"; + $revHash{"CAT"}="ATG"; + $revHash{"CATG"}="ATGC"; + $revHash{"GTTA"}="AACT"; + $revHash{"CACT"}="AGTG"; + $revHash{"TCAT"}="AATG"; + $revHash{"TTA"}="AAT"; + $revHash{"TGTA"}="ATAC"; + $revHash{"TTTC"}="AAAG"; + $revHash{"TACT"}="AAGT"; + $revHash{"TGTT"}="AAAC"; + $revHash{"CTA"}="AGT"; + $revHash{"GACT"}="AGTC"; + $revHash{"TTGC"}="AAGC"; + $revHash{"TTC"}="AAG"; + $revHash{"GCT"}="AGC"; + $revHash{"GCAT"}="ATGC"; + $revHash{"TGGT"}="AACC"; + $revHash{"CCT"}="AGG"; + $revHash{"CATC"}="ATGG"; + $revHash{"CCAT"}="ATGG"; + $revHash{"CCCG"}="GGGC"; + $revHash{"TGCC"}="AGGC"; + $revHash{"TG"}="AC"; + $revHash{"TGCT"}="AAGC"; + $revHash{"GCCG"}="GGCC"; + $revHash{"TCTG"}="AGAC"; + $revHash{"TGT"}="AAC"; + $revHash{"TTAT"}="AAAT"; + $revHash{"TAGT"}="AACT"; + $revHash{"TATG"}="ATAC"; + $revHash{"TTTA"}="AAAT"; + $revHash{"CGTA"}="ACGT"; + $revHash{"TA"}="AT"; + $revHash{"TGTC"}="AGAC"; + $revHash{"CTAT"}="ATAG"; + $revHash{"TATA"}="ATAT"; + $revHash{"TAC"}="AGT"; + $revHash{"TC"}="AG"; + $revHash{"CATT"}="AATG"; + $revHash{"TCG"}="ACG"; + $revHash{"ATTT"}="AAAT"; + $revHash{"CGTG"}="ACGC"; + $revHash{"CTG"}="AGC"; + $revHash{"TCGT"}="AACG"; + $revHash{"TCCG"}="ACGG"; + $revHash{"GTT"}="AAC"; + $revHash{"ATGT"}="ATAC"; + $revHash{"CTTG"}="AAGC"; + $revHash{"CCTT"}="AAGG"; + $revHash{"GATC"}="ATCG"; + $revHash{"CTGG"}="AGCC"; + $revHash{"TTCT"}="AAAG"; + $revHash{"CGTC"}="ACGG"; + $revHash{"CG"}="GC"; + $revHash{"TATT"}="AAAT"; + $revHash{"CTCG"}="AGCG"; + $revHash{"TCTC"}="AGAG"; + $revHash{"TCCT"}="AAGG"; + $revHash{"TGG"}="ACC"; + $revHash{"ACTC"}="AGTG"; + $revHash{"CTC"}="AGG"; + $revHash{"CGC"}="GGC"; + $revHash{"TTG"}="AAC"; + $revHash{"ACCT"}="AGGT"; + $revHash{"TCTA"}="ATAG"; + $revHash{"GTAC"}="ACGT"; + $revHash{"TTGA"}="AATC"; + $revHash{"GTCC"}="ACGG"; + $revHash{"GATT"}="AATC"; + $revHash{"T"}="A"; + $revHash{"CGTT"}="AACG"; + $revHash{"GTC"}="ACG"; + $revHash{"GCCT"}="AGGC"; + $revHash{"TGC"}="AGC"; + $revHash{"TTTG"}="AAAC"; + $revHash{"GGCT"}="AGCC"; + $revHash{"TCA"}="ATG"; + $revHash{"GTGC"}="ACGC"; + $revHash{"TGAT"}="AATC"; + $revHash{"TAT"}="AAT"; + $revHash{"CTAC"}="AGGT"; + $revHash{"TGCG"}="ACGC"; + $revHash{"CTCA"}="AGTG"; + $revHash{"CTTC"}="AAGG"; + $revHash{"GCTG"}="AGCC"; + $revHash{"TATC"}="ATAG"; + $revHash{"TAAT"}="AATT"; + $revHash{"ACT"}="AGT"; + $revHash{"TCGC"}="AGCG"; + $revHash{"GGT"}="ACC"; + $revHash{"TCC"}="AGG"; + $revHash{"TTGT"}="AAAC"; + $revHash{"TGAC"}="AGTC"; + $revHash{"TTAC"}="AAGT"; + $revHash{"CGT"}="ACG"; + $revHash{"ATTA"}="AATT"; + $revHash{"ATTG"}="AATC"; + $revHash{"CCTC"}="AGGG"; + $revHash{"CCGG"}="GGCC"; + $revHash{"CCGT"}="ACGG"; + $revHash{"TCCA"}="ATGG"; + $revHash{"CGCC"}="GGGC"; + $revHash{"GT"}="AC"; + $revHash{"TTCG"}="AACG"; + $revHash{"CCTG"}="AGGC"; + $revHash{"TCT"}="AAG"; + $revHash{"GTAT"}="ATAC"; + $revHash{"GTCT"}="AGAC"; + $revHash{"GCTA"}="AGCT"; + $revHash{"TACC"}="AGGT"; +} + + +sub allCaps{ + my $motif = $_[0]; + $motif =~ s/a/A/g; + $motif =~ s/c/C/g; + $motif =~ s/t/T/g; + $motif =~ s/g/G/g; + return $motif; +} + + +sub all_caps{ + my @strand = split(/\s*/,$_[0]); + for my $i (0 ... $#strand){ + if ($strand[$i] =~ /c/) {$strand[$i] = "C";next;} + if ($strand[$i] =~ /a/) {$strand[$i] = "A";next;} + if ($strand[$i] =~ /t/) { $strand[$i] = "T";next;} + if ($strand[$i] =~ /g/) {$strand[$i] = "G";next;} + } + return join("",@strand); +} +sub array_mean{ + return "NA" if scalar(@_) == 0; + my $sum = 0; + foreach my $val (@_){ + $sum = $sum + $val; + } + return ($sum/scalar(@_)); +} +sub array_sum{ + return "NA" if scalar(@_) == 0; + my $sum = 0; + foreach my $val (@_){ + $sum = $sum + $val; + } + return ($sum); +} + +sub variance{ + return "NA" if scalar(@_) == 0; + return 0 if scalar(@_) == 1; + my $mean = array_mean(@_); + my $num = 0; + return 0 if scalar(@_) == 1; +# print "mean = $mean .. array = >@_<\n"; + foreach my $ele (@_){ + # print "$num = $num + ($ele-$mean)*($ele-$mean)\n"; + $num = $num + ($ele-$mean)*($ele-$mean); + } + my $var = $num / scalar(@_); + return $var; +} + +sub array_95confIntervals{ + return "NA" if scalar(@_) <= 0; + my @sorted = sort { $a <=> $b } @_; +# print "@sorted=",scalar(@sorted), "\n"; + my $aDeechNo = int((scalar(@sorted) * 2.5) / 100); + my $saaDeNo = int((scalar(@sorted) * 97.5) / 100); + + return ($sorted[$aDeechNo], $sorted[$saaDeNo]); +} + +sub array_median{ + return "NA" if scalar(@_) == 0; + return $_[0] if scalar(@_) == 1; + my @sorted = sort { $a <=> $b } @_; + my $totalno = scalar(@sorted); + + #print "sorted = @sorted\n"; + + my $pick = (); + if ($totalno % 2 == 1){ + #print "odd set .. totalno = $totalno\n"; + my $mid = $totalno / 2; + my $onehalfno = $mid - $mid % 1; + my $secondhalfno = $onehalfno + 1; + my $onehalf = $sorted[$onehalfno-1]; + my $secondhalf = $sorted[$secondhalfno-1]; + #print "onehalfno = $onehalfno and secondhalfno = $secondhalfno \n onehalf = $onehalf and secondhalf = $secondhalf\n"; + + $pick = $secondhalf; + } + else{ + #print "even set .. totalno = $totalno\n"; + my $mid = $totalno / 2; + my $onehalfno = $mid; + my $secondhalfno = $onehalfno + 1; + my $onehalf = $sorted[$onehalfno-1]; + my $secondhalf = $sorted[$secondhalfno-1]; + #print "onehalfno = $onehalfno and secondhalfno = $secondhalfno \n onehalf = $onehalf and secondhalf = $secondhalf\n"; + $pick = ($onehalf + $secondhalf )/2; + + } + #print "pick = $pick..\n"; + return $pick; + +} + + +sub array_numerical_sort{ + return "NA" if scalar(@_) == 0; + my @sorted = sort { $a <=> $b } @_; + return (@sorted); +} + +sub array_smallest_number{ + return "NA" if scalar(@_) == 0; + return $_[0] if scalar(@_) == 1; + my @sorted = sort { $a <=> $b } @_; + return $sorted[0]; +} + + +sub array_largest_number{ + return "NA" if scalar(@_) == 0; + return $_[0] if scalar(@_) == 1; + my @sorted = sort { $a <=> $b } @_; + return $sorted[$#sorted]; +} + + +sub array_largest_number_arrayPosition{ + return "NA" if scalar(@_) == 0; + return 0 if scalar(@_) == 1; + my $maxpos = 0; + my @maxposes = (); + my @maxvals = (); + my $maxval = array_smallest_number(@_); + for my $i (0 ... $#_){ + if ($_[$i] > $maxval){ + $maxval = $_[$i]; + $maxpos = $i; + } + if ($_[$i] == $maxval){ + $maxval = $_[$i]; + if (scalar(@maxposes) == 0){ + push @maxposes, $i; + push @maxvals, $_[$i]; + + } + elsif ($maxvals[0] == $maxval){ + push @maxposes, $i; + push @maxvals, $_[$i]; + } + else{ + @maxposes = (); @maxvals = (); + push @maxposes, $i; + push @maxvals, $_[$i]; + } + + } + + } + return $maxpos if scalar(@maxposes) < 2; + return (@maxposes); +} + +sub array_smallest_number_arrayPosition{ + return "NA" if scalar(@_) == 0; + return 0 if scalar(@_) == 1; + my $minpos = 0; + my @minposes = (); + my @minvals = (); + my $minval = array_largest_number(@_); + my $maxval = array_smallest_number(@_); + #print "starting with $maxval, ending with $minval\n"; + for my $i (0 ... $#_){ + if ($_[$i] < $minval){ + $minval = $_[$i]; + $minpos = $i; + } + if ($_[$i] == $minval){ + $minval = $_[$i]; + if (scalar(@minposes) == 0){ + push @minposes, $i; + push @minvals, $_[$i]; + + } + elsif ($minvals[0] == $minval){ + push @minposes, $i; + push @minvals, $_[$i]; + } + else{ + @minposes = (); @minvals = (); + push @minposes, $i; + push @minvals, $_[$i]; + } + + } + + } + #print "minposes=@minposes\n"; + + return $minpos if scalar(@minposes) < 2; + return (@minposes); +} + +sub basic_stats{ + my @arr = @_; +# print " array_smallest_number= ", array_smallest_number(@arr)," array_largest_number= ", array_largest_number(@arr), " array_mean= ",array_mean(@arr),"\n"; + return ":"; +} +#xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx + +sub maftoAxt_multispecies { + my $printer = 0; + #print "in maftoAxt_multispecies : got @_\n"; + my $fname=$_[0]; + open(IN,"<$_[0]") or die "Cannot open $_[0]: $! \n"; + my $treedefinition = $_[1]; + #print "treedefinition= $treedefinition\n"; + + my @treedefinitions = MakeTrees($treedefinition); + + + open(OUT,">$_[2]") or die "Cannot open $_[2]: $! \n"; + my $counter = 0; + my $exactspeciesset = $_[3]; + my @exactspeciesset_unarranged = split(/,/,$exactspeciesset); + + $treedefinition=~s/[\)\(, ]/\t/g; + my @species=split(/\t+/,$treedefinition); + @exactspeciesset_unarranged = @species; +# print "species=@species\n"; + + my @exactspeciesarr=(); + + + foreach my $def (@treedefinitions){ + $def=~s/[\)\(, ]/\t/g; + my @specs = split(/\t+/,$def); + my @exactspecies=(); + foreach my $spec (@specs){ + foreach my $espec (@exactspeciesset_unarranged){ + # print "pushing >$spec< nd >$espec<\n" if $spec eq $espec && $espec =~ /[a-zA-Z0-9]/; + push @exactspecies, $spec if $spec eq $espec && $espec =~ /[a-zA-Z0-9]/; + } + + } + #print "exactspecies = >@exactspecies<\n"; + push @exactspeciesarr, [@exactspecies]; + } + #<STDIN>; + #print "exactspeciesarr=@exactspeciesarr\n"; + + ########### + my $select = 2; + #select = 1 if all species need sequences to be present for each block otherwise, it is 0 + #select = 2 only the allowed set make up the alignment. use the removeset + # information to detect alignmenets that have other important genomes aligned. + ########### + my @allowedset = (); + @allowedset = split(/;/,allowedSetOfSpecies(join("_",@species))) if $select == 0; + @allowedset = join("_",0,@species) if $select == 1; + #print "species = @species , allowedset =",join("\n", @allowedset) ," \n"; + + + foreach my $set (@exactspeciesarr){ + my @openset = @$set; + push @allowedset, join("_",0,@openset) if $select == 2; + # print "openset = >@openset<, allowedset = @allowedset and exactspecies = @exactspecies\n"; + } +# <STDIN>; + my $start = 0; + my @sequences = (); + my @titles = (); + my $species_counter = "0"; + my $countermatch = 0; + my $outsideSpecies=0; + + while(my $line = <IN>){ + next if $line =~ /^#/; + next if $line =~ /^i/; + # print "$line .. species = @species\n"; + chomp $line; + my @fields = split(/\s+/,$line); + chomp $line; + if ($line =~ /^a /){ + $start = 1; + } + + if ($line =~ /^s /){ + # print "fields1 = $fields[1] , start = $start\n"; + + foreach my $sp (@species){ + if ($fields[1] =~ /$sp/){ + $species_counter = $species_counter."_".$sp; + push(@sequences, $fields[6]); + my @sp_info = split(/\./,$fields[1]); + my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]); + push(@titles, $title); + + } + } + } + + if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start = 1)){ + + my $arranged = reorderSpecies($species_counter, @species); +# print "species_counter=$species_counter .. arranged = $arranged\n"; + + my $stopper = 1; + my $arrno = 0; + foreach my $set (@allowedset){ +# print "checking $set with $arranged\n"; + if ($arranged eq $set){ +# print "checked $set with $arranged\n"; + $stopper = 0; last; + } + $arrno++; + } + + if ($stopper == 0) { + # print " accepted\n"; + @titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged; + + @sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged; + my $filteredseq = filter_gaps(@sequences); + + if ($filteredseq ne "SHORT"){ + $counter++; + print OUT join (" ",$counter, @titles), "\n"; + print OUT $filteredseq, "\n"; + print OUT "\n"; + $countermatch++; + } + # my @filtered_seq = split(/\t/,filter_gaps(@sequences) ); + } + else{#print "\n"; + } + + @sequences = (); @titles = (); $start = 0;$species_counter = "0"; + next; + + } + } + close IN; + close OUT; +# print "countermatch = $countermatch\n"; +# <STDIN>; +} + +sub reorderSpecies{ + my @inarr=@_; + my $currSpecies = shift (@inarr); + my $ordered_species = 0; + my @species=@inarr; + foreach my $order (@species){ + $ordered_species = $ordered_species."_".$order if $currSpecies=~ /$order/; + } + return $ordered_species; + +} + +sub filter_gaps{ + my @sequences = @_; +# print "sequences sent are @sequences\n"; + my $seq_length = length($sequences[0]); + my $seq_no = scalar(@sequences); + my $allgaps = (); + for (1 ... $seq_no){ + $allgaps = $allgaps."-"; + } + + my @seq_array = (); + my $seq_counter = 0; + foreach my $seq (@sequences){ +# my @sequence = split(/\s*/,$seq); + $seq_array[$seq_counter] = [split(/\s*/,$seq)]; +# push @seq_array, [@sequence]; + $seq_counter++; + } + my $g = 0; + while ( $g < $seq_length){ + last if (!exists $seq_array[0][$g]); + my $bases = (); + for my $u (0 ... $#seq_array){ + $bases = $bases.$seq_array[$u][$g]; + } +# print $bases, "\n"; + if ($bases eq $allgaps){ +# print "bases are $bases, position is $g \n"; + for my $seq (@seq_array){ + splice(@$seq , $g, 1); + } + } + else { + $g++; + } + } + + my @outs = (); + + foreach my $seq (@seq_array){ + push(@outs, join("",@$seq)); + } + return "SHORT" if length($outs[0]) <=100; + return (join("\n", @outs)); +} + + +sub allowedSetOfSpecies{ + my @allowed_species = split(/_/,$_[0]); + unshift @allowed_species, 0; +# print "allowed set = @allowed_species \n"; + my @output = (); + for (0 ... scalar(@allowed_species) - 4){ + push(@output, join("_",@allowed_species)); + pop @allowed_species; + } + return join(";",reverse(@output)); + +} + + +sub orderInfo{ + my @info = split(/;/,$_[0]); +# print "info = @info"; + my @old = split(/_/,$_[1]); + my @new = split(/_/,$_[2]); + shift @old; shift @new; + my @outinfo = (); + foreach my $spe (@new){ + for my $no (0 ... $#old){ + if ($spe eq $old[$no]){ + push(@outinfo, $info[$no]); + } + } + } +# print "outinfo = @outinfo \n"; + return join(";", @outinfo); +} + +#xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx + +sub printarr { +# print ">::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n"; +# foreach my $line (@_) {print "$line\n";} +# print "::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::<\n"; +} + +sub oneOf{ + my @arr = @_; + my $element = $arr[$#arr]; + @arr = @arr[0 ... $#arr-1]; + my $present = 0; + + foreach my $el (@arr){ + $present = 1 if $el eq $element; + } + return $present; +} + +#xxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx + +sub MakeTrees{ + my $tree = $_[0]; +# my @parts=($tree); + my @parts=(); + +# print "parts=@parts\n"; + + while (1){ + $tree =~ s/^\(//g; + $tree =~ s/\)$//g; + my @arr = (); + + if ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)\)$/){ + @arr = $tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)$/; + push @parts, "(".$tree.")"; + $tree = $2; + } + elsif ($tree =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/){ + @arr = $tree =~ /^([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/; + push @parts, "(".$tree.")"; + $tree = $1; + } + elsif ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/){ + last; + } + } + #print "parts=@parts\n"; + return @parts; +} + +