comparison microsatellite_birthdeath.pl @ 0:4e31fad3f08e draft

Uploaded tool tarball.
author devteam
date Wed, 25 Sep 2013 11:26:02 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4e31fad3f08e
1 #!/usr/bin/perl -w
2 use strict;
3 use warnings;
4 use Term::ANSIColor;
5 use Pod::Checker;
6 use File::Basename;
7 use IO::Handle;
8 use Cwd;
9 use File::Path qw(make_path remove_tree);
10 use File::Temp qw/ tempfile tempdir /;
11 my $tdir = tempdir( CLEANUP => 1 );
12 chdir $tdir;
13 my $dir = getcwd;
14 # print "current dit=$dir\n";
15 #<STDIN>;
16 use vars qw (%treesToReject %template $printer $interr_poscord $interrcord $no_of_interruptionscord $stringfile @tags
17 $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species
18 $gapcord %thresholdhash $tree_decipherer @sp_ident %revHash %sameHash %treesToIgnore %alternate @exactspecies_orig @exactspecies @exacttags
19 $mono_flanksimplicityRepno $di_flanksimplicityRepno $prop_of_seq_allowedtoAT $prop_of_seq_allowedtoCG);
20 use FileHandle;
21 use IO::Handle; # 5.004 or higher
22
23
24
25 #my @ar = ("/Users/ydk/work/rhesus_microsat/results/galay/chr22_5sp.maf.txt", "/Users/ydk/work/rhesus_microsat/results/galay/dataset_11.dat",
26 #"/Users/ydk/work/rhesus_microsat/results/galay/chr22_5spec.maf.summ","hg18,panTro2,ponAbe2,rheMac2,calJac1","((((hg18, panTro2), ponAbe2), rheMac2), calJac1)","9,10,12,12",
27 #"10","0.8");
28 my @ar = @ARGV;
29 my ($maf, $orth, $summout, $species_set, $tree_definition, $thresholds, $FLANK_SUPPORT, $SIMILARITY_THRESH) = @ar;
30 $SIMILARITY_THRESH=$SIMILARITY_THRESH/100;
31 #########################
32 $SIMILARITY_THRESH = $SIMILARITY_THRESH/100;
33 my $EDGE_DISTANCE = 10;
34 my $COMPLEXITY_SUPPORT = 20;
35 load_thresholds("9_10_12_12");
36 my $FLANKINDEL_MAXTHRESH = 0.3;
37
38 my $mono_flanksimplicityRepno=6;
39 my $di_flanksimplicityRepno=10;
40 my $prop_of_seq_allowedtoAT=0.5;
41 my $prop_of_seq_allowedtoCG=0.66;
42
43 #########################
44 my $tspecies_set = $species_set;
45
46 my %speciesReplacement = ();
47 my %speciesReplacementTag = ();
48 my %replacementArr= ();
49 my %replacementArrTag= ();
50 my %backReplacementArr= ();
51 my %backReplacementArrTag= ();
52 $tree_definition=~s/\s+//g;
53
54 my $tree_definition_split = $tree_definition;
55 $tree_definition_split =~ s/[\(\)]//g;
56 my @gotSpecies = ($tree_definition_split =~ /(,)/g);
57 # print "gotSpecies = @gotSpecies\n";
58
59 if (scalar(@gotSpecies)+1 ==5){
60
61 $speciesReplacement{1}="calJac1";
62 $speciesReplacement{2}="rheMac2";
63 $speciesReplacement{3}="ponAbe2";
64 $speciesReplacement{4}="panTro2";
65 $speciesReplacement{5}="hg18";
66
67 $speciesReplacementTag{1}="M";
68 $speciesReplacementTag{2}="R";
69 $speciesReplacementTag{3}="O";
70 $speciesReplacementTag{4}="C";
71 $speciesReplacementTag{5}="H";
72 $species_set="hg18,panTro2,ponAbe2,rheMac2,calJac1";
73
74 }
75 if (scalar(@gotSpecies)+1 ==4){
76
77 $speciesReplacement{1}="rheMac2";
78 $speciesReplacement{2}="ponAbe2";
79 $speciesReplacement{3}="panTro2";
80 $speciesReplacement{4}="hg18";
81
82 $speciesReplacementTag{1}="R";
83 $speciesReplacementTag{2}="O";
84 $speciesReplacementTag{3}="C";
85 $speciesReplacementTag{4}="H";
86 $species_set="hg18,panTro2,ponAbe2,rheMac2";
87
88 }
89
90 # $tree_definition = "((((hg18,panTro2),ponAbe2),rheMac2),calJac1)";
91 my $tree_definition_copy = $tree_definition;
92 my $tree_definition_orig = $tree_definition;
93 my $brackets = 0;
94
95 while (1){
96 #last if $tree_definition_copy !~ /\(/;
97 $brackets++;
98 # print "brackets = $brackets\n";
99 last if $brackets > 6;
100 $tree_definition_copy =~ s/^\(//g;
101 $tree_definition_copy =~ s/\)$//g;
102 # print "tree_definition_copy = $tree_definition_copy\n";
103 my @arr = ();
104
105 if ($tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)\)$/){
106 @arr = $tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)$/;
107 # print "arr = @arr\n";
108 $tree_definition_copy = $2;
109 $replacementArr{$1} = $speciesReplacement{$brackets};
110 $backReplacementArr{$speciesReplacement{$brackets}}=$1;
111
112 $replacementArrTag{$1} = $speciesReplacementTag{$brackets};
113 $backReplacementArrTag{$speciesReplacementTag{$brackets}}=$1;
114 # print "replacing $1 with $replacementArr{$1}\n";
115
116 $sp_ident[$brackets-1] = $1;
117
118 }
119 elsif ($tree_definition_copy =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/){
120 @arr = $tree_definition_copy =~ /^([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/;
121 # print "arr = @arr\n";
122 $tree_definition_copy = $1;
123 $replacementArr{$2} = $speciesReplacement{$brackets};
124 $backReplacementArr{$speciesReplacement{$brackets}}=$2;
125
126 $replacementArrTag{$2} = $speciesReplacementTag{$brackets};
127 $backReplacementArrTag{$speciesReplacementTag{$brackets}}=$2;
128 # print "replacing $2 with $replacementArr{$2}\n";
129
130 $sp_ident[$brackets-1] = $2;
131 }
132 elsif ($tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/){
133 @arr = $tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/;
134 # print "arr = @arr .. TERMINAL\n";
135 $tree_definition_copy = $1;
136 $replacementArr{$2} = $speciesReplacement{$brackets};
137 $replacementArr{$1} = $speciesReplacement{$brackets+1};
138 $backReplacementArr{$speciesReplacement{$brackets}}=$2;
139 $backReplacementArr{$speciesReplacement{$brackets+1}}=$1;
140
141 $replacementArrTag{$1} = $speciesReplacementTag{$brackets+1};
142 $backReplacementArrTag{$speciesReplacementTag{$brackets+1}}=$1;
143
144 $replacementArrTag{$2} = $speciesReplacementTag{$brackets};
145 $backReplacementArrTag{$speciesReplacementTag{$brackets}}=$2;
146 # print "replacing $1 with $replacementArr{$1}\n";
147 # print "replacing $2 with $replacementArr{$2}\n";
148 # print "replacing $1 with $replacementArrTag{$1}\n";
149 # print "replacing $2 with $replacementArrTag{$2}\n";
150
151 $sp_ident[$brackets-1] = $2;
152 $sp_ident[$brackets] = $1;
153
154
155 last;
156
157 }
158 elsif ($tree_definition_copy =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_\(\),]+)\)$/){
159 $tree_definition_copy =~ s/^\(//g;
160 $tree_definition_copy =~ s/\)$//g;
161 $brackets--;
162 }
163 }
164
165 foreach my $key (keys %replacementArr){
166 my $replacement = $replacementArr{$key};
167 $tree_definition =~ s/$key/$replacement/g;
168 }
169 @sp_ident = reverse(@sp_ident);
170 # print "modified tree_definition = $tree_definition\n";
171 # print "done .. tree_definition = $tree_definition\n";
172 # print "sp_ident = @sp_ident\n";
173 #<STDIN>;
174
175
176 my $complexity=int($COMPLEXITY_SUPPORT * (1/40));
177
178 #print "complexity=$complexity\n";
179 #<STDIN>;
180
181 $printer = 1;
182
183 my $rando = int(rand(1000));
184 my $localdate = `date`;
185 $localdate =~ /([0-9]+):([0-9]+):([0-9]+)/;
186 my $info = $rando.$1.$2.$3;
187
188 #---------------------------------------------------------------------------
189 # GETTING INPUT INFORMATION AND OPENING INPUT AND OUTPUT FILES
190
191
192 my @thresharr = (0, split(/,/,$thresholds));
193 my $randno=int(rand(100000));
194 my $megamatch = $randno.".megamatch.net.axt"; #"/gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr1.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt";
195 my $megamatchlck = $megamatch.".lck";
196 unlink $megamatchlck;
197
198 my $selected= $orth;
199 #my $eventfile = $orth;
200 $selected = $selected."_SELECTED";
201 #$selected = $selected."_".$SIMILARITY_THRESH;
202 #my $runtime = $selected.".runtime";
203
204 my $inputtags = "H:C:O:R:M";
205 $inputtags = $ARGV[3] if exists $ARGV[3] && $ARGV[3] =~ /[A-Z]:[A-Z]/;
206
207 my @all_tags = split(/:/, $inputtags);
208 my $inputsp = "hg18:panTro2:ponAbe2:rheMac2:calJac1";
209 $inputsp = $ARGV[4] if exists $ARGV[4] && $ARGV[3] =~ /[0-9]+:/;
210 #@sp_ident = split(/:/,$inputsp);
211 my $junkfile = $orth."_junk";
212
213 my $sh = load_sameHash(1);
214 my $rh = load_revHash(1);
215
216 #print "inputs are : \n"; foreach(@ARGV){print $_,"\n";}
217 #open (SELECT, ">$selected") or die "Cannot open selected file: $selected: $!";
218 open (SUMMARY, ">$summout") or die "Cannot open summout file: $summout: $!";
219 #open (RUN, ">$runtime") or die "Cannot open orth file: $runtime: $!";
220 #my $ctlfile = "baseml\.ctl"; #$ARGV[4];
221 #my $treefile = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/"; #1 THIS IS THE THE TREE UNDER CONSIDERATION, IN NEWICK
222 my %registeredTrees = ();
223 my @removalReasons =
224 ("microsatellite is compound",
225 "complex structure",
226 "if no. if micros is more than no. of species",
227 "if more than one micro per species ",
228 "if microsat contains N",
229 "different motif than required ",
230 "more than zero interruptions",
231 "microsat could not form key ",
232 "orthologous microsats of different motif size ",
233 "orthologous microsats of different motifs ",
234 "microsats belong to different alignment blocks altogether",
235 "microsat near edge",
236 "microsat in low complexity region",
237 "microsat flanks dont align well",
238 "phylogeny not informative");
239 my %allowedhash=();
240 #---------------------------------------------------------------------------
241 # WORKING ON MAKING THE MEGAMATCH FILE
242 my $chromt=int(rand(10000));
243 my $p_chr=$chromt;
244
245 my $tree_definition_orig_copy = $tree_definition_orig;
246
247 $tree_definition=~s/,/, /g;
248 $tree_definition =~ s/, +/, /g;
249 $tree_definition_orig=~s/,/, /g;
250 $tree_definition_orig =~ s/, +/, /g;
251 my @exactspeciesset_unarranged = split(/,/,$species_set);
252 my @exactspeciesset_unarranged_orig = split(/,/,$tspecies_set);
253 my $largesttree = "$tree_definition;";
254 my $largesttree_orig = "$tree_definition_orig;";
255 # print "largesttree = $largesttree\n";
256 $tree_definition =~ s/\(//g;
257 $tree_definition =~ s/\)//g;
258 $tree_definition=~s/[\)\(, ]/\t/g;
259 $tree_definition =~ s/\t+/\t/g;
260
261 $tree_definition_orig =~ s/\(//g;
262 $tree_definition_orig =~ s/\)//g;
263 $tree_definition_orig =~s/[\)\(, ]/\t/g;
264 $tree_definition_orig =~ s/\t+/\t/g;
265 # print "tree_definition = $tree_definition tree_definition_orig = $tree_definition_orig\n";
266
267 my @treespecies=split(/\t+/,$tree_definition);
268 my @treespecies_orig=split(/\t+/,$tree_definition_orig);
269 # print "tree_definition = $tree_definition .. treespecies=@treespecies ... treespecies_orig=@treespecies_orig\n";
270 #<STDIN>;
271
272 foreach my $spec (@treespecies){
273 foreach my $espec (@exactspeciesset_unarranged){
274 # print "spec=$spec and espec=$espec\n";
275 push @exactspecies, $spec if $spec eq $espec;
276 }
277 }
278
279 foreach my $spec (@treespecies_orig){
280 foreach my $espec (@exactspeciesset_unarranged_orig){
281 # print "spec=$spec and espec=$espec\n";
282 push @exactspecies_orig, $spec if $spec eq $espec;
283 }
284 }
285
286 my $focalspec = $exactspecies[0];
287 my $focalspec_orig = $exactspecies_orig[0];
288 # print "exactspecies=@exactspecies ... focalspec=$focalspec\n";
289 # print "texactspecies=@exactspecies_orig ... focalspec_orig=$focalspec_orig\n";
290 #<STDIN>;
291 my $arranged_species_set = join(".",@exactspecies);
292 my $arranged_species_set_orig = join(".",@exactspecies_orig);
293
294 @exacttags=@exactspecies;
295 my @exacttags_orig=@exactspecies_orig;
296
297 foreach my $extag (@exacttags){
298 $extag =~ s/hg18/H/g;
299 $extag =~ s/panTro2/C/g;
300 $extag =~ s/ponAbe2/O/g;
301 $extag =~ s/rheMac2/R/g;
302 $extag =~ s/calJac1/M/g;
303 }
304
305 foreach my $extag (@exacttags_orig){
306 $extag =~ s/hg18/H/g;
307 $extag =~ s/panTro2/C/g;
308 $extag =~ s/ponAbe2/O/g;
309 $extag =~ s/rheMac2/R/g;
310 $extag =~ s/calJac1/M/g;
311 }
312
313 my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt");
314 #print "sending to maftoAxt_multispecies: $maf, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n";
315
316 maftoAxt_multispecies($maf, $tree_definition_orig_copy, $chr_name, $tspecies_set);
317 #print "made files\n";
318 my @filterseqfiles= ($chr_name);
319 $largesttree =~ s/hg18/H/g;
320 $largesttree =~ s/panTro2/C/g;
321 $largesttree =~ s/ponAbe2/O/g;
322 $largesttree =~ s/rheMac2/R/g;
323 $largesttree =~ s/calJac1/M/g;
324 #<STDIN>;
325 #---------------------------------------------------------------------------
326
327 my ($lagestnodes, $largestbranches) = get_nodes($largesttree);
328 shift (@$lagestnodes);
329 my @extendedtitle=();
330
331 my $title = ();
332 my $parttitle = ();
333 my @titlearr = ();
334 my @firsttitle=($focalspec_orig."chrom", $focalspec_orig."start", $focalspec_orig."end", $focalspec_orig."motif", $focalspec_orig."motifsize", $focalspec_orig."threshold");
335
336 my @finames= qw(chr start end motif motifsize microsat mutation mutation.position mutation.from mutation.to insertion.details deletion.details);
337
338 my @fititle=();
339
340 foreach my $spec (split(",",$tspecies_set)){
341 push @fititle, $spec;
342 foreach my $name (@finames){
343 push @fititle, $spec.".".$name;
344 }
345 }
346
347
348 my @othertitle=qw(somechr somestart somened event source);
349
350 my @fnames = ();
351 push @fnames, qw(insertions_num deletions_num motinsertions_num motinsertionsf_num motdeletions_num motdeletionsf_num noninsertions_num nondeletions_num) ;
352 push @fnames, qw(binsertions_num bdeletions_num bmotinsertions_num bmotinsertionsf_num bmotdeletions_num bmotdeletionsf_num bnoninsertions_num bnondeletions_num) ;
353 push @fnames, qw(dinsertions_num ddeletions_num dmotinsertions_num dmotinsertionsf_num dmotdeletions_num dmotdeletionsf_num dnoninsertions_num dnondeletions_num) ;
354 push @fnames, qw(ninsertions_num ndeletions_num nmotinsertions_num nmotinsertionsf_num nmotdeletions_num nmotdeletionsf_num nnoninsertions_num nnondeletions_num) ;
355 push @fnames, qw(substitutions_num bsubstitutions_num dsubstitutions_num nsubstitutions_num indels_num subs_num);
356
357 my @fullnames = ();
358 # print "revising\n";
359 # print "H = $backReplacementArrTag{H}\n";
360 # print "C = $backReplacementArrTag{C}\n";
361 # print "O = $backReplacementArrTag{O}\n";
362 # print "R = $backReplacementArrTag{R}\n";
363 # print "M = $backReplacementArrTag{M}\n";
364
365 foreach my $lnode (@$lagestnodes){
366 my @pair = @$lnode;
367 my @nodemutarr = ();
368 for my $p (@pair){
369 # print "p = $p\n";
370 $p =~ s/[\(\), ]+//g;
371 $p =~ s/([A-Z])/$1./g;
372 $p =~ s/\.$//g;
373
374 $p =~ s/H/$backReplacementArrTag{H}/g;
375 $p =~ s/C/$backReplacementArrTag{C}/g;
376 $p =~ s/O/$backReplacementArrTag{O}/g;
377 $p =~ s/R/$backReplacementArrTag{R}/g;
378 $p =~ s/M/$backReplacementArrTag{M}/g;
379 foreach my $n (@fnames) { push @fullnames, $p.".".$n;}
380 }
381 }
382
383 #print SUMMARY "#",join("\t", @firsttitle, @fititle, @othertitle);
384
385 #print SUMMARY "\t",join("\t", @fullnames);
386 my $header = join("\t",@firsttitle, @fititle, @othertitle, @fullnames, @fnames, "tree", "cleancase");
387 # print "header= $header\n";
388 #<STDIN>;
389
390 #print SUMMARY "\t",join("\t", @fnames);
391 #$title= $title."\t".join("\t", @fnames);
392
393 #print SUMMARY "\t","tree","\t", "cleancase", "\n";
394 #$title= $title."\t"."tree"."\t"."cleancase". "\n";
395
396 #print $title; #<STDIN>;
397
398 #print "all_tags = @all_tags\n";
399
400 for my $no (3 ... $#all_tags+1){
401 # print "no=$no\n"; #<STDIN>;
402 @tags = @all_tags[0 ... $no-1];
403 # print "all_tags=>@all_tags< , tags = >@tags<\n" if $printer == 1; #<STDIN>;
404 %template=();
405 my @nextcounter = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
406 #next if scalar(@tags) < 4;
407 # print "now doing tags = @tags, no = $no\n";
408 open (ORTH, "<$orth") or die "Cannot open orth file: $orth: $!";
409
410 # print SUMMARY join "\t", qw (species chr start end branch motif microsat mutation position from to insertion deletion);
411
412
413 ##################### T E M P O R A R Y #####################
414 my @finaltitle=();
415 my @singletitle = qw (species chr start end motif motifsize microsat strand microsatsize col10 col11 col12 col13);
416 my $endtitle = ();
417 foreach my $tag (@tags){
418 my @tempsingle = ();
419
420 foreach my $single (@singletitle){
421 push @tempsingle, $tag.$single;
422 }
423 @finaltitle = (@finaltitle, @tempsingle);
424 }
425
426 # print SUMMARY join("\t",@finaltitle),"\n";
427
428 #############################################################
429
430 #---------------------------------------------------------------------------
431 # GET THE TREE FROM TREE FILE
432 my $tree = ();
433 $tree = "((H, C), O)" if $no == 3;
434 $tree = "(((H, C), O), R)" if $no == 4;
435 $tree = "((((H, C), O), R), M)" if $no == 5;
436 # $tree=~s/;$//g;
437 # print "our tree = $tree\n";
438 #---------------------------------------------------------------------------
439 # LOADING HASH CONTAINING ALL POSSIBLE TREES:
440 $tree_decipherer = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/tree_analysis_".join("",@tags).".txt";
441 %template=();
442 %alternate=();
443 load_allPossibleTrees($tree_decipherer, \%template, \%alternate);
444
445 #---------------------------------------------------------------------------
446 # LOADING THE TREES TO REJECT FOR BIRTH ANALYSIS
447 %treesToReject=();
448 %treesToIgnore=();
449 load_treesToReject(@tags);
450 load_treesToIgnore(@tags);
451 #---------------------------------------------------------------------------
452 # LOADING INPUT DATA INTO HASHES AND ARRAYS
453
454
455 #1 THIS IS THE POINT WHERE WE CAN FILTER OUT LARGE MICROSAT CLUSTERS
456 #2 AS WELL AS MULTIPLE-ALIGNMENT-BLOCKS-SPANNING MICROSATS (KIND OF
457 #3 IMPLICIT IN THE FIRST PART OF THE SENTENCE ITSELF IN MOST CASES).
458
459 my %orths=();
460 my $counterm = 0;
461 my $loaded = 0;
462 my %seen = ();
463 my @allowedchrs = ();
464 # print "no = $no\n"; #<STDIN>;
465
466 while (my $line = <ORTH>){
467 # print "line=$line\n";
468 my $register1 = $line =~ s/>$exactspecies_orig[0]/>$replacementArrTag{$exactspecies_orig[0]}/g;
469 my $register2 = $line =~ s/>$exactspecies_orig[1]/>$replacementArrTag{$exactspecies_orig[1]}/g;
470 my $register3 = $line =~ s/>$exactspecies_orig[2]/>$replacementArrTag{$exactspecies_orig[2]}/g;
471 my $register4 = $line =~ s/>$exactspecies_orig[3]/>$replacementArrTag{$exactspecies_orig[3]}/g;
472 my $register5 = $line =~ s/>$exactspecies_orig[4]/>$replacementArrTag{$exactspecies_orig[4]}/g if exists $exactspecies_orig[4];
473
474 # print "line = $line\n"; <STDIN>;
475
476
477 # next if $register1 + $register2 + $register3 + $register4 + $register5 > scalar(@tags);
478 my @micros = split(/>/,$line); # LOADING ALL THE MICROSAT ENTRIES FROM THE CLUSTER INTO @micros
479 #print "micros=",printarr(@micros),"\n"; #<STDIN>;
480 shift @micros; # EMPTYING THE FIRST, EMTPY ELEMENT OF THE ARRAY
481
482 $no_of_species = adjustCoordinates($micros[0]);
483 # print "A: $no_of_species\n";
484 next if $no_of_species != $no;
485 # print "no = $no ... no_of_species=$no_of_species\n";#<STDIN>;
486 $counterm++;
487 #------------------------------------------------
488 $nextcounter[0]++ if $line =~ /compound/;
489 next if $line =~ /compound/; # GETTING RID OF COMPOUND MICROSATS
490 #------------------------------------------------
491 #next if $line =~ /[A-Za-z]>[a-zA-Z]/;
492 #------------------------------------------------
493 chomp $line;
494 my $match_count = ($line =~ s/>/>/g); # COUNTING THE NUMBER OF MICROSAT ENTRIES IN THE CLUSTER
495 #print "number of species = $match_count\n";
496 my $stopper = 0;
497 foreach my $mic (@micros){
498 my @local = split(/\t/,$mic);
499 if ($local[$typecord] =~ /\./ || exists($local[$no_of_interruptionscord+2])) {$stopper = 1; $nextcounter[1]++;
500 last; }
501 # REMOVING CLUSTERS WITH THE CYRPTIC, (UNRESOLVABLY COMPLEX) MICROSAT ENTRIES IN THEM
502 }
503 next if $stopper ==1;
504 #------------------------------------------------
505 $nextcounter[2]++ if (scalar(@micros) >$no_of_species);
506
507 next if (scalar(@micros) >$no_of_species); #1 REMOVING MICROSAT CLUSTERS WITH MORE NUMBER OF MICROSAT ENTRIES THAN THE NUMBER OF SPECIES IN THE DATASET.
508 #2 THIS IS SO BECAUSE SUCH CLUSTERS IMPLY THAT IN AT LEAST ONE SPECIES, THERE IS MORE THAN ONE MICROSAT ENTRY
509 #3 IN THE CLUSTER. THUS, HERE WE ARE GETTING RID OF MICROSATS CLUSTERS THAT INCLUDE MULTUPLE, NEIGHBORING
510 #4 MICROSATS, AND STICK TO CLEAN MICROSATS THAT DO NOT HAVE ANY MICROSATS IN NEIGHBORHOOD.
511 #5 THIS 'NEIGHBORHOOD-RANGE' HAD BEEN DECIDED PREVIOUSLY IN OUR CODE multiSpecies_orthFinder4.pl
512 my $nexter = 0;
513 foreach my $tag (@tags){
514 my $tagcount = ($line =~ s/>$tag\t/>$tag\t/g);
515 if ($tagcount > 1) { $nexter =1; #print colored ['red'],"multiple entires per species : $tagcount of $tag\n" if $printer == 1;
516 next;
517 }
518 }
519
520 if ($nexter == 1){
521 $nextcounter[3]++;
522 next;
523 }
524 #------------------------------------------------
525 foreach my $mic (@micros){ #1 REMOVING MICROSATELLITES WITH ANY 'N's IN THEM
526 my @local = split(/\t/,$mic);
527 if ($local[$microsatcord] =~ /N/) {$stopper =1; $nextcounter[4]++;
528 last;}
529 }
530 next if $stopper ==1;
531 #print "till here 1\n"; #<STDIN>;
532 #------------------------------------------------
533 my @micros_copy = @micros;
534
535 my $tempmicro = shift(@micros_copy); #1 CURRENTLY OBTAINING INFORMATION FOR THE FIRST
536 #2 MICROSAT IN THE CLUSTER.
537 my @tempfields = split(/\t/,$tempmicro);
538 my $prevtype = $tempfields[$typecord];
539 my $tempmotif = $tempfields[$motifcord];
540
541 my $tempfirstmotif = ();
542 if (scalar(@tempfields) > $microsatcord + 2){
543 if ($tempfields[$no_of_interruptionscord] >= 1) { #1 DISCARDING MICROSATS WITH MORE THAN ZERO INTERRUPTIONS
544 #2 IN THE FIRST MICROSAT OF THE CLUSTER
545 $nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1;
546 }
547 }
548 if ($nexter == 1){
549 $nextcounter[6]++;
550 next;
551 } #1 DONE OBTAINING INFORMATION REGARDING
552 #2 THE FIRST MICROSAT FROM THE CLUSTER
553
554 if ($tempmotif =~ /^\[/){
555 $tempmotif =~ s/^\[//g;
556 $tempmotif =~ /([a-zA-Z]+)\].*/;
557 $tempfirstmotif = $1; #1 OBTAINING THE FIRTS MOTIF OF MICROSAT
558 }
559 else {$tempfirstmotif = $tempmotif;}
560 my $prevmotif = $tempfirstmotif;
561
562 my $key = ();
563 # print "searching temp micro for 0-9 $focalspec chr0-9a-zA-Z 0-9 0-9 \n";
564 # print "tempmicro = $tempmicro .. looking for ([0-9]+)\s+($focalspec_orig)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\n"; <STDIN>;
565 if ($tempmicro =~ /([0-9]+)\s+($focalspec_orig)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
566 # print "B: $no_of_species\n";
567 $key = join("_",$2, $3, $4, $5);
568 }
569 else{
570 # print "counld not form a key for temp\n"; # if $printer == 1;
571 $nextcounter[7]++;
572 next;
573 }
574 #----------------- #1 NOW, AFTER OBTAINING INFORMATION ABOUT
575 #2 THE FIRST MICROSAT IN THE CLUSTER, THE
576 #3 FOLLOWING LOOP GOES THROUGH THE OTHER MICROSATS
577 #4 TO SEE IF THEY SHARE THE REQUIRED FEATURES (BELOW)
578
579 foreach my $micro (@micros_copy){
580 my @fields = split(/\t/,$micro);
581 #-----------------
582 if (scalar(@fields) > $microsatcord + 2){ #1 DISCARDING MICROSATS WITH MORE THAN ONE INTERRUPTIONS
583 if ($fields[$no_of_interruptionscord] >= 1) {$nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1;
584 $nextcounter[6]++;
585 last; }
586 }
587 #-----------------
588 if (($prevtype ne "0") && ($prevtype ne $fields[$typecord])) {
589 $nexter =1; #print colored ['yellow'],"microsat of different type \n" if $printer == 1;
590 $nextcounter[8]++;
591 last; } #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG
592 #----------------- #2 TO DIFFERENT TYPES (MONOS, DIS, TRIS ETC.)
593 $prevtype = $fields[$typecord];
594
595 my $motif = $fields[$motifcord];
596 my $firstmotif = ();
597
598 if ($motif =~ /^\[/){
599 $motif =~ s/^\[//g;
600 $motif =~ /([a-zA-Z]+)\].*/;
601 $firstmotif = $1;
602 }
603 else {$firstmotif = $motif;}
604
605 my $motifpattern = $firstmotif.$firstmotif;
606 my $prevmotifpattern = $prevmotif.$prevmotif;
607
608 if (($prevmotif ne "0")&&(($motifpattern !~ /$prevmotif/i)||($prevmotifpattern !~ /$firstmotif/i)) ) {
609 $nexter =1; #print colored ['green'],"different motifs used \n$line\n" if $printer == 1;
610 $nextcounter[9]++;
611 last;
612 } #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG
613 #2 TO DIFFERENT MOTIFS
614 my $prevmotif = $firstmotif;
615 #-----------------
616
617 for my $t (0 ... $#tags){ #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSAT ENTRIES BELONG
618 #2 DIFFERENT ALIGNMENT BLOCKS
619 if ($micro =~ /([0-9]+)\s+($focalspec_orig)\s([_0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
620 my $key2 = join("_",$2, $3, $4, $5);
621 # print "key = $key .. key2 = $key2\n"; #<STDIN>;
622 if ($key2 ne $key){
623 # print "microsats belong to diffferent alignment blocks altogether\n" if $printer == 1;
624 $nextcounter[10]++;
625 $nexter = 1; last;
626 }
627 }
628 else{
629 # print "counld not form a key for $line\n"; # if $printer == 1;
630 #<STDIN>;
631 $nexter = 1; last;
632 }
633 }
634 }
635 #print "D2: $no_of_species\n";
636
637 #####################
638 if ($nexter == 1){
639 # print "nexting\n"; # if $printer == 1;
640 next;
641 }
642 else{
643 # print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n$key:\n$line\nvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n" if $printer == 1;
644 push (@{$orths{$key}},$line);
645 $loaded++;
646 if ($line =~ /($focalspec_orig)\s([_a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) {
647
648 # print "$line\n" if $printer == 1; #if $line =~ /Contig/;
649 # print "################ ################\n" if $printer == 1;
650 push @allowedchrs, $2 if !exists $allowedhash{$2};
651 $allowedhash{$2} = 1;
652 my $key = join("\t",$1, $2, $3, $4);
653 # print "C: $no_of_species .. key = $key\n";#<STDIN>;
654 # print "print the shit: $key\n" ; #if $printer == 1;
655 $seen{$key} = 1;
656 }
657 else { #print "Key could not be formed in SPUT for ($focalspec_orig) (chrom) ([0-9]+) ([0-9]+)\n";
658 }
659 }
660 }
661 close ORTH;
662 # print "now studying where we lost microsatellites: @nextcounter\n";
663 for my $reason (0 ... $#nextcounter){
664 #print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n";
665 }
666 # print "\ntotal number of keys formed = ", scalar(keys %orths), " = \n";
667 # print "done filtering .. counterm = $counterm and loaded = $loaded\n";
668 #----------------------------------------------------------------------------------------------------------------
669 # NOW GENERATING THE ALIGNMENT FILE WITH RELELEVENT ALIGNMENTS STORED ONLY.
670 # print "adding files @filterseqfiles \n";
671 #<STDIN>;
672
673 while (1){
674 if (-e $megamatchlck){
675 # print "waiting to write into $megamatchlck\n";
676 sleep 10;
677 }
678 else{
679 open (MEGAMLCK, ">$megamatchlck") or die "Cannot open megamatchlck file $megamatchlck: $!";
680 open (MEGAM, ">$megamatch") or die "Cannot open megamatch file $megamatch: $!";
681 last;
682 }
683 }
684 foreach my $seqfile (@filterseqfiles){
685 my $fullpath = $seqfile;
686 open (MATCH, "<$fullpath") or die "Cannot open MATCH file $fullpath: $!";
687 my $matchlines = 0;
688
689 while (my $line = <MATCH>) {
690 #print "checking $line";
691 if ($line =~ /($focalspec_orig)\s([a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) {
692 my $key = join("\t",$1, $2, $3, $4);
693 # print "key = $key\n";
694 #print "------------------------------------------------------\n";
695 #print "asking $line\n";
696 if (exists $seen{$key}){
697
698 #print "seen $line \n"; <STDIN>;
699 while (1){
700 $matchlines++;
701 print MEGAM $line;
702 $line = <MATCH>;
703 print MEGAM "\n" if $line !~ /[0-9a-zA-Z]/;
704 last if $line !~/[0-9a-zA-Z]/;
705 }
706 }
707 else{
708 # print "not seen\n";
709 }
710 }
711 }
712 # print "matchlines = $matchlines\n";
713 close MATCH;
714 }
715 close MEGAMLCK;
716
717 unlink $megamatchlck;
718 close MEGAM;
719 undef %seen;
720 # print "done writitn to $megamatch\n";#<STDIN>;
721 #----------------------------------------------------------------------------------------------------------------
722 #<STDIN>;
723 #---------------------------------------------------------------------------
724 # NOW, AFTER FILTERING MANY MICROSATS, AND LOADING THE FILTERED ONES INTO
725 # THE HASH %orths , WE GO THROUGH THE ALIGNMENT FILE, AND STUDY THE
726 # FLANKING SEQUENCES OF ALL THESE MICROSATS, TO FILTER THEM FURTHER
727 #$printer = 1;
728
729 my $microreadcounter=0;
730 my $contigsentered=0;
731 my $contignotrightcounter=0;
732 my $keynotformedcounter=0;
733 my $keynotfoundcounter= 0;
734 my $dotcounter = 0;
735
736 # print "opening $megamatch\n";
737
738 open (BO, "<$megamatch") or die "Cannot open alignment file: $megamatch: $!";
739 # print "doing $megamatch\n " ;
740
741 #<STDIN>;
742
743 while (my $line = <BO>){
744 # print $line; #<STDIN>;
745 # print "." if $dotcounter % 100 ==0;
746 # print "\n" if $dotcounter % 5000 ==0;
747 # print "dotcounter = $dotcounter\n " if $printer == 1;
748 next if $line !~ /^[0-9]+/;
749 $dotcounter++;
750 # print colored ['green'], "~" x 60, "\n" if $printer == 1;
751 # print colored ['green'], $line;# if $printer == 1;
752 chomp $line;
753 my @fields2 = split(/\t/,$line);
754 my $key2 = ();
755 my $alignment_no = (); #1 TEMPORARY
756 if ($line =~ /([0-9]+)\s+($focalspec_orig)\s([_\-s0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
757 # $key2 = join("\t",$1, $2, $4, $5);
758 $key2 = join("_",$2, $3, $4, $5);
759 # print "key = $key2\n";
760
761 # print "key = $key2\n";
762 $alignment_no=$1;
763 }
764 else {print "seq line $line incompatible\n"; $keynotformedcounter++; next;}
765
766 $no_of_species = adjustCoordinates($line);
767
768 $contignotrightcounter++ if $no_of_species != $no;
769 # print "contignotrightcounter=$contignotrightcounter\n";
770 # print "no_of_species=$no_of_species\n";
771 # print "no=$no\n";
772
773 next if $no_of_species != $no;
774 # print "D: $no_of_species\n";
775 # print "E: $no_of_species\n";
776 #<STDIN>;
777 # print "key = $key2\n" if $printer == 1;
778 my @clusters = (); #1 EXTRACTING MICROSATS CORRESPONDING TO THIS
779 #2 ALIGNMENT BLOCK
780 if (exists($orths{$key2})){
781 @clusters = @{$orths{$key2}};
782 $contigsentered++;
783 delete $orths{$key2};
784 }
785 else{
786 # print "orth does not exist\n";
787 $keynotfoundcounter++;
788 next;
789 }
790
791 my %sequences=(); #1 WILL STORE SEQUENCES IN THE CURRENT ALIGNMENT BLOCK
792 my $humseq = ();
793 foreach my $tag (@tags){ #1 READING THE ALIGNMENT FILE AND CAPTURING SEQUENCES
794 my $seq = <BO>; #2 OF ALL SPECIES.
795 chomp $seq;
796 $sequences{$tag} = " ".$seq;
797 #print "sequences = $sequences{$tag}\n" if $printer == 1;
798 $humseq = $seq if $tag =~ /H/;
799 }
800
801
802 foreach my $cluster (@clusters){ #1 NOW, GOING THROUGH THE CLUSTER OF MICROSATS
803 # print "x" x 60, "\n" if $printer == 1;
804 # print colored ['red'],"cluster = $cluster\n";
805 $largesttree =~ s/hg18/H/g;
806 $largesttree =~ s/panTro2/C/g;
807 $largesttree =~ s/ponAbe2/O/g;
808 $largesttree =~ s/rheMac2/R/g;
809 $largesttree =~ s/calJac1/M/g;
810
811 $microreadcounter++;
812 my @micros = split(/>/,$cluster);
813
814
815
816
817
818
819 shift @micros;
820
821 my $edge_microsat=0; #1 THIS WILL HAVE VALUE "1" IF MICROSAT IS FOUND
822 #2 TO BE TOO CLOSE TO THE EDGES OF ALIGNMENT BLOCK
823
824 my @starts= (); my %start_hash=(); #1 STORES THE START AND END COORDINATES OF MICROSATELLITES
825 my @ends = (); my %end_hash=(); #2 SO THAT LATER, WE WILL BE ABLE TO FIND THE EXTREME
826 #3 COORDINATE VALUES OF THE ORTHOLOGOUS MIROSATELLITES.
827
828 my %microhash=();
829 my %microsathash=();
830 my %nonmicrosathash=();
831 my $motif=(); #1 BASIC MOTIF OF THE MICROSATELLITE.. THERE'S ONLY 1
832 # print "tags=@tags\n";
833 for my $i (0 ... $#tags){ #1 FINDING THE MICROSAT, AND THE ALIGNMENT SEQUENCE
834 #2 CORRESPONDING TO THE PARTICULAR SPECIES (AS PER
835 #3 THE VARIABLE $TAG;
836 my $tag = $tags[$i];
837 # print $seq;
838 my $locus="NULL"; #1 THIS WILL STORE THE MICROSAT OF THIS SPECIES.
839 #2 IF THERE IS NO MICROSAT, IT WILL REMAIN "NULL"
840
841 foreach my $micro (@micros){
842 # print "micro=$micro, tag=$tag\n";
843 if ($micro =~ /^$tag/){ #1 MICROSAT OF THIS SPECIES FOUND..
844 $locus = $micro;
845 my @fields = split(/\t/,$micro);
846 $motif = $fields[$motifcord];
847 $microsathash{$tag}=$fields[$microsatcord];
848 # print "fields=@fields, and startcord=$startcord = $fields[$startcord]\n";
849 push(@starts, $fields[$startcord]);
850 push(@ends, $fields[$endcord]);
851 $start_hash{$tag}=$fields[$startcord];
852 $end_hash{$tag}=$fields[$endcord];
853 last;
854 }
855 else{$microsathash{$tag}="NULL"}
856 }
857 $microhash{$tag}=$locus;
858
859 }
860
861
862
863 my $extreme_start = smallest_number(@starts); #1 THESE TWO ARE THE EXTREME COORDINATES OF THE
864 my $extreme_end = largest_number(@ends); #2 MICROSAT CLUSTER ACCROSS ALL THE SPECIES IN
865 #3 WHOM IT IS FOUND TO BE ORTHOLOGOUS.
866
867 # print "starts=@starts... ends=@ends\n";
868
869 my %up_flanks = (); #1 CONTAINS UPSTEAM FLANKING REGIONS FOR EACH SPECIES
870 my %down_flanks = (); #1 CONTAINS DOWNDTREAM FLANKING REGIONS FOR EACH SPECIES
871
872 my %up_largeflanks = ();
873 my %down_largeflanks = ();
874
875 my %locusandflanks = ();
876 my %locusandlargeflanks = ();
877
878 my %up_internal_flanks=(); #1 CONTAINS SEQUENCE BETWEEN THE $extreme_start and the
879 #2 ACTUAL START OF MICROSATELLITE IN THE SPECIES
880 my %down_internal_flanks=(); #1 CONTAINS SEQUENCE BETWEEN THE $extreme_end and the
881 #2 ACTUAL end OF MICROSATELLITE IN THE SPECIES
882
883 my %alignment=(); #1 CONTAINS ACTUAL ALIGNMENT SEQUENCE BETWEEN THE TWO
884 #2 EXTEME VALUES.
885
886 my %microsatstarts=(); #1 WITHIN EACH ALIGNMENT, IF THERE EXISTS A MICROSATELLITE
887 #2 THIS HASH CONTAINS THE START SITE OF THE MICROSATELLITE
888 #3 WIHIN THE ALIGNMENT
889 next if !defined $extreme_start;
890 next if !defined $extreme_end;
891 next if $extreme_start > length($sequences{$tags[0]});
892 next if $extreme_start < 0;
893 next if $extreme_end > length($sequences{$tags[0]});
894
895 for my $i (0 ... $#tags){ #1 NOW THAT WE HAVE GATHERED INFORMATION REGARDING
896 #2 SEQUENCE ALIGNMENT AND MICROSATELLITE COORDINATES
897 #3 AS WELL AS THE EXTREME COORDINATES OF THE
898 #4 MICROSAT CLUSTER, WE WILL PROCEED TO EXTRACT THE
899 #5 FLANKING SEQUENCE OF ALL ORGS, AND STUDY IT IN
900 #6 MORE DETAIL.
901 my $tag = $tags[$i];
902 # print "tag=$tag.. seqlength = ",length($sequences{$tag})," extreme_start=$extreme_start and extreme_end=$extreme_end\n";
903 my $upstream_gaps = (substr($sequences{$tag}, 0, $extreme_start) =~ s/\-/-/g); #1 NOW MEASURING THE NUMBER OF GAPS IN THE UPSTEAM
904 #2 AND DOWNSTREAM SEQUENCES OF THE MICROSATs IN THIS
905 #3 CLUSTER.
906 # print "seq length $tag = $sequences{$tag} = ",length($sequences{$tag})," extreme_end=$extreme_end\n" ;
907 my $downstream_gaps = (substr($sequences{$tag}, $extreme_end) =~ s/\-/-/g);
908 if (($extreme_start - $upstream_gaps )< $EDGE_DISTANCE || (length($sequences{$tag}) - $extreme_end - $downstream_gaps) < $EDGE_DISTANCE){
909 $edge_microsat=1;
910
911 last;
912 }
913 else{
914 $up_flanks{$tag} = substr($sequences{$tag}, $extreme_start - $FLANK_SUPPORT, $FLANK_SUPPORT);
915 $down_flanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $FLANK_SUPPORT);
916
917 $up_largeflanks{$tag} = substr($sequences{$tag}, $extreme_start - $COMPLEXITY_SUPPORT, $COMPLEXITY_SUPPORT);
918 $down_largeflanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $COMPLEXITY_SUPPORT);
919
920
921 $alignment{$tag} = substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1);
922 $locusandflanks{$tag} = $up_flanks{$tag}."[".$alignment{$tag}."]".$down_flanks{$tag};
923 $locusandlargeflanks{$tag} = $up_largeflanks{$tag}."[".$alignment{$tag}."]".$down_largeflanks{$tag};
924
925 if ($microhash{$tag} ne "NULL"){
926 $up_internal_flanks{$tag} = substr($sequences{$tag}, $extreme_start , $start_hash{$tag}-$extreme_start);
927 $down_internal_flanks{$tag} = substr($sequences{$tag}, $end_hash{$tag} , $extreme_end-$end_hash{$tag});
928 $microsatstarts{$tag}=$start_hash{$tag}-$extreme_start;
929 # print "tag = $tag, internal flanks = $up_internal_flanks{$tag} and $down_internal_flanks{$tag} and start = $microsatstarts{$tag}\n" if $printer == 1;
930 }
931 else{
932 $nonmicrosathash{$tag}=substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1);
933
934 }
935 # print "up flank for species $tag = $up_flanks{$tag} \ndown flank for species $tag = $down_flanks{$tag} \n" if $printer == 1;
936
937 }
938
939 }
940 $nextcounter[11]++ if $edge_microsat==1;
941 next if $edge_microsat==1;
942
943
944 my $low_complexity = 0; #1 VALUE WILL BE 1 IF ANY OF THE FLANKING REGIONS
945 #2 IS FOUND TO BE OF LOW COMPLEXITY, BY USING THE
946 #3 FUNCTION sub test_complexity
947
948
949 for my $i (0 ... $#tags){
950 # print "i = $tags[$i]\n" if $printer == 1;
951 if (test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW" || test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW"){
952 # print "i = $i, low complexity regions: $up_largeflanks{$tags[$i]}: ",test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT), " and $down_largeflanks{$tags[$i]} = ",test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT),"\n" if $printer == 1;
953 $low_complexity =1; last;
954 }
955 }
956
957 $nextcounter[12]++ if $low_complexity==1;
958 next if $low_complexity == 1;
959
960
961 my $sequence_dissimilarity = 0; #1 THIS VALYE WILL BE 1 IF THE SEQUENCE SIMILARITY
962 #2 BETWEEN ANY OF THE SPECIES AGAINST THE HUMAN
963 #3 FLANKING SEQUENCES IS BELOW A CERTAIN THRESHOLD
964 #4 AS DESCRIBED IN FUNCTION sub sequence_similarity
965 my %donepair = ();
966 for my $i (0 ... $#tags){
967 # print "i = $tags[$i]\n" if $printer == 1;
968 # next if $i == 0;
969 # print colored ['magenta'],"THIS IS UP\n" if $printer == 1;
970
971 for my $b (0 ... $#tags){
972 next if $b == $i;
973 my $pair = ();
974 $pair = $i."_".$b if $i < $b;
975 $pair = $b."_".$i if $b < $i;
976 next if exists $donepair{$pair};
977 my ($up_similarity,$upnucdiffs, $upindeldiffs) = sequence_similarity($up_flanks{$tags[$i]}, $up_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info);
978 my ($down_similarity,$downnucdiffs, $downindeldiffs) = sequence_similarity($down_flanks{$tags[$i]}, $down_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info);
979 $donepair{$pair} = $up_similarity."_".$down_similarity;
980
981 # print RUN "$up_similarity $upnucdiffs $upindeldiffs $down_similarity $downnucdiffs $downindeldiffs\n";
982
983 if ( $up_similarity < $SIMILARITY_THRESH || $down_similarity < $SIMILARITY_THRESH){
984 $sequence_dissimilarity =1;
985 last;
986 }
987 }
988 }
989 $nextcounter[13]++ if $sequence_dissimilarity==1;
990
991 next if $sequence_dissimilarity == 1;
992 my ($simplified_microsat, $Hchrom, $Hstart, $Hend, $locusmotif, $locusmotifsize) = summarize_microsat($cluster, $humseq);
993 # print "simplified_microsat=$simplified_microsat\n";
994 my ($tree_analysis, $conformation) = treeStudy($simplified_microsat);
995 # print "tree_analysis = $tree_analysis .. conformation=$conformation\n";
996 #<STDIN>;
997
998 # print SELECT "\"$conformation\"\t$tree_analysis\n";
999
1000 next if $tree_analysis =~ /DISCARD/;
1001 if (exists $treesToReject{$tree_analysis}){
1002 $nextcounter[14]++;
1003 next;
1004 }
1005
1006 # print "F: $no_of_species\n";
1007
1008 # my $adjuster=();
1009 # if ($no_of_species == 4){
1010 # my @sields = split(/\t/,$simplified_microsat);
1011 # my $somend = pop(@sields);
1012 # my $somestart = pop(@sields);
1013 # my $somechr = pop(@sields);
1014 # $adjuster = "NA\t" x 13 ;
1015 # $simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend;
1016 # }
1017 # if ($no_of_species == 3){
1018 # my @sields = split(/\t/,$simplified_microsat);
1019 # my $somend = pop(@sields);
1020 # my $somestart = pop(@sields);
1021 # my $somechr = pop(@sields);
1022 # $adjuster = "NA\t" x 26 ;
1023 # $simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend;
1024 # }
1025 #
1026 $registeredTrees{$tree_analysis} = 1 if !exists $registeredTrees{$tree_analysis};
1027 $registeredTrees{$tree_analysis}++ if exists $registeredTrees{$tree_analysis};
1028
1029 if (exists $treesToIgnore{$tree_analysis}){
1030 my @appendarr = ();
1031
1032 # print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t";
1033 #print "SUMMARY ",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t";
1034 # print SELECT $Hchrom,"\t",$Hstart,"\t",$Hend,"\t","NOEVENT", "\t\t", $cluster,"\n";
1035
1036 foreach my $lnode (@$lagestnodes){
1037 my @pair = @$lnode;
1038 my @nodemutarr = ();
1039 for my $p (@pair){
1040 my @mutinfoarray1 = ();
1041 for (1 ... 38){
1042 # push (@mutinfoarray1, "NA")
1043 }
1044 # print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t";
1045 # print join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t";
1046 }
1047
1048 }
1049 for (1 ... 38){
1050 push (@appendarr, "NA")
1051 }
1052 # print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n";
1053 # print join ("\t", @appendarr,"NULL", "NULL"),"\n";
1054 # print "SUMMARY ",join ("\t", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>;
1055 next;
1056 }
1057 # print colored ['blue'],"cluster = $cluster\n";
1058
1059 my ($mutations_array, $nodes, $branches_hash, $alivehash, $primaryalignment) = peel_onion($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts);
1060
1061 if ($mutations_array eq "NULL"){
1062 # print "cluster = $cluster \n"; <STDIN>;
1063 my @appendarr = ();
1064
1065 # print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize, "\t";
1066
1067 # foreach my $lnode (@$lagestnodes){
1068 # my @pair = @$lnode;
1069 # my @nodemutarr = ();
1070 # for my $p (@pair){
1071 # my @mutinfoarray1 = ();
1072 # for (1 ... 38){
1073 # push (@mutinfoarray1, "NA")
1074 # }
1075 # print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t";
1076 # print join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t";
1077 # }
1078 # }
1079 # for (1 ... 38){
1080 # push (@appendarr, "NA")
1081 # }
1082 # print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n";
1083 # print join ("\t", @appendarr,"NULL", "NULL"),"\n";
1084 # print join ("\t","SUMMARY", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>;
1085 next;
1086 }
1087
1088
1089 # print "sent: \n" if $printer == 1;
1090 # print "nodes = @$nodes, branches array:\n" if $mutations_array ne "NULL" && $printer == 1;
1091
1092 my ($newstatus, $newmutations_array, $newnodes, $newbranches_hash, $newalivehash, $finalalignment) = fillAlignmentGaps($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts);
1093 # print "newmutations_array returned = \n",join("\n",@$newmutations_array),"\n" if $newmutations_array ne "NULL" && $printer == 1;
1094 my @finalmutations_array= ();
1095 @finalmutations_array = selectMutationArray($mutations_array, $newmutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array ne "NULL";
1096 @finalmutations_array = selectMutationArray($mutations_array, $mutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array eq "NULL";
1097 # print "alt = $alternate{$conformation}\n";
1098
1099 my ($besttree, $treescore) = selectBetterTree($tree_analysis, $alternate{$conformation}, \@finalmutations_array);
1100 my $cleancase = "UNCLEAN";
1101 $cleancase = checkCleanCase($besttree, $finalalignment) if $treescore > 0 && $finalalignment ne "NULL" && $finalalignment =~ /\!/;
1102 $cleancase = checkCleanCase($besttree, $primaryalignment) if $treescore > 0 && $finalalignment eq "NULL" && $primaryalignment =~ /\!/ && $primaryalignment ne "NULL";
1103 $cleancase = "CLEAN" if $finalalignment eq "NULL" && $primaryalignment !~ /\!/ && $primaryalignment ne "NULL";
1104 $cleancase = "CLEAN" if $finalalignment ne "NULL" && $finalalignment !~ /\!/ ;
1105 # print "besttree = $besttree ... cleancase=$cleancase\n"; #<STDIN>;
1106
1107 my @selects = ("-C","+C","-H","+H","-HC","+HC","-O","+O","-H.-C","-H.-O","-HC,+C","-HC,+H","-HC.-O","-HCO,+HC","-HCO,+O","-O.-C","-O.-H",
1108 "+C.+O","+H.+C","+H.+O","+HC,-C","+HC,-H","+HC.+O","+HCO,-C","+HCO,-H","+HCO,-HC","+HCO,-O","+O.+C","+O.+H","+H.+C.+O","-H.-C.-O","+HCO","-HCO");
1109 next if (oneOf(@selects, $besttree) == 0);
1110 if ( ($besttree =~ /,/ || $besttree =~ /\./) && $cleancase eq "UNCLEAN"){
1111 $besttree = "$besttree / $tree_analysis";
1112 }
1113
1114 $besttree = "NULL" if $treescore <= 0;
1115
1116 while ($besttree =~ /[A-Z][A-Z]/){
1117 $besttree =~ s/([A-Z])([A-Z])/$1:$2/g;
1118 }
1119
1120 if ($besttree !~ /NULL/){
1121 my @elements = ($besttree =~ /([A-Z])/g);
1122
1123 foreach my $ele (@elements){
1124 # print "replacing $ele with $backReplacementArrTag{$ele}\n";
1125 $besttree =~ s/$ele/$backReplacementArrTag{$ele}/g if exists $backReplacementArrTag{$ele};
1126 }
1127 }
1128 my $endendstate = $focalspec_orig.".".$Hchrom."\t".$Hstart."\t".$Hend."\t".$locusmotif."\t".$locusmotifsize."\t".$tree_analysis."\t";
1129 next if $endendstate =~ /NA\tNA\tNA/;
1130
1131 print SUMMARY $focalspec_orig,".",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t";
1132 # print "SUMMARY\t", $focalspec_orig,".",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t",$tree_analysis,"\t" ;
1133
1134 my @mutinfoarray =();
1135
1136 foreach my $lnode (@$lagestnodes){
1137 my @pair = @$lnode;
1138 my $joint = "(".join(", ",@pair).")";
1139 my @nodemutarr = ();
1140
1141 for my $p (@pair){
1142 foreach my $mut (@finalmutations_array){
1143 $mut =~ /node=([A-Z, \(\)]+)/;
1144 push @nodemutarr, $mut if $p eq $1;
1145 }
1146 @mutinfoarray = summarizeMutations(\@nodemutarr, $besttree);
1147
1148 # print SUMMARY join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t";
1149 # print join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t";
1150 }
1151 }
1152
1153 # print "G: $no_of_species\n";
1154
1155 my @alignmentarr = ();
1156
1157 foreach my $key (keys %alignment){
1158 push @alignmentarr, $backReplacementArrTag{$key}.":".$alignment{$key};
1159
1160 }
1161 # print "alignmentarr = @alignmentarr"; <STDIN>;
1162
1163 @mutinfoarray = summarizeMutations(\@finalmutations_array, $besttree);
1164 print SUMMARY join ("\t", @mutinfoarray ),"\t";
1165 print SUMMARY join(",",@alignmentarr),"\n";
1166 # print join("\t","--------------","\n",$besttree, join("",@tags)),"\n" if scalar(@tags) < 5;
1167 # <STDIN> if scalar(@tags) < 5;
1168 # print $cleancase, "\n";
1169 # print join ("\t", @mutinfoarray,$cleancase,join(",",@alignmentarr)),"\n"; #<STDIN>;
1170 # print "summarized\n"; #<STDIN>;
1171
1172
1173
1174 my %indelcatch = ();
1175 my %substcatch = ();
1176 my %typecatch = ();
1177 my %nodescatch = ();
1178 my $mutconcat = join("\t", @finalmutations_array)."\n";
1179 my %indelposcatch = ();
1180 my %subsposcatch = ();
1181
1182 foreach my $fmut ( @finalmutations_array){
1183 # next if $fmut !~ /indeltype=[a-zA-Z]+/;
1184 #print RUN $fmut, "\n";
1185 $fmut =~ /node=([a-zA-Z, \(\)]+)/;
1186 my $lnode = $1;
1187 $nodescatch{$1}=1;
1188
1189 if ($fmut =~ /type=substitution/){
1190 # print "fmut=$fmut\n";
1191 $fmut =~ /from=([a-zA-Z\-]+)\tto=([a-zA-Z\-]+)/;
1192 my $from=$1;
1193 # print "from=$from\n";
1194 my $to=$2;
1195 # print "to=$to\n";
1196 push @{$substcatch{$lnode}} , ("from:".$from." to:".$to);
1197 $fmut =~ /position=([0-9]+)/;
1198 push @{$subsposcatch{$lnode}}, $1;
1199 }
1200
1201 if ($fmut =~ /insertion=[a-zA-Z\-]+/){
1202 $fmut =~ /insertion=([a-zA-Z\-]+)/;
1203 push @{$indelcatch{$lnode}} , $1;
1204 $fmut =~ /indeltype=([a-zA-Z]+)/;
1205 push @{$typecatch{$lnode}}, $1;
1206 $fmut =~ /position=([0-9]+)/;
1207 push @{$indelposcatch{$lnode}}, $1;
1208 }
1209 if ($fmut =~ /deletion=[a-zA-Z\-]+/){
1210 $fmut =~ /deletion=([a-zA-Z\-]+)/;
1211 push @{$indelcatch{$lnode}} , $1;
1212 $fmut =~ /indeltype=([a-zA-Z]+)/;
1213 push @{$typecatch{$lnode}}, $1;
1214 $fmut =~ /position=([0-9]+)/;
1215 push @{$indelposcatch{$lnode}}, $1;
1216 }
1217 }
1218
1219 # print $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t" if $printer == 1;
1220 # print join ("<\t>", @mutinfoarray),"\n" if $printer == 1;
1221 # print "where mutinfoarray = @mutinfoarray\n" if $printer == 1;
1222 # #print RUN ".";
1223
1224 # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
1225 # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
1226
1227 # print colored ['red'],"finalmutations_array=\n" if $printer == 1;
1228 # foreach (@finalmutations_array) {
1229 # print colored ['red'], "$_\n" if $_ =~ /type=substitution/ && $printer == 1 ;
1230 # print colored ['yellow'], "$_\n" if $_ !~ /type=substitution/ && $printer == 1 ;
1231
1232 # }# if $line =~ /cal/;# && $line =~ /chr4/;
1233
1234 # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
1235 # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
1236 # print "tree analysis = $tree_analysis\n" if $printer == 1;
1237
1238 # my $mutations = "@$mutations_array";
1239
1240
1241 next;
1242 for my $keys (@$nodes) {foreach my $key (@$keys){
1243 #print "key = $key, => $branches_hash->{$key}\n";
1244 }
1245 # print "x" x 50, "\n";
1246 }
1247 my ($birth_steps, $death_steps) = decipher_history($mutations_array,join("",@tags),$nodes,$branches_hash,$tree_analysis,$conformation, $alivehash, $simplified_microsat);
1248 }
1249 }
1250 close BO;
1251 # print "now studying where we lost microsatellites:\n";
1252 # print "x" x 60,"\n";
1253 for my $reason (0 ... $#nextcounter){
1254 # print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n";
1255 }
1256 # print "x" x 60,"\n";
1257 # print "In total we read $microreadcounter microsatellites after reading through $contigsentered contigs\n";
1258 # print " we lost $keynotformedcounter contigs as they did not form the key, \n";
1259 # print "$contignotrightcounter contigs as they were not of the right species configuration\n";
1260 # print "$keynotfoundcounter contigs as they did not contain the microsats\n";
1261 # print "... In total we went through a file that had $dotcounter contigs...\n";
1262 # print join ("\n","remaining orth keys = ", (keys %orths),"");
1263 # print "------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ \n";
1264 # print "now printing counted trees: \n";
1265 if (scalar(keys %registeredTrees) > 0){
1266 foreach my $keyb ( sort (keys %registeredTrees) )
1267 {
1268 # print "$keyb : $registeredTrees{$keyb}\n";
1269 }
1270 }
1271
1272
1273 }
1274 close SUMMARY;
1275
1276 my @summarizarr = ("+C=+C +R.+C -HCOR,+C",
1277 "+H=+H +R.+H -HCOR,+H",
1278 "-C=-C -R.-C +HCOR,-C",
1279 "-H=-H -R.-H +HCOR,-H",
1280 "+HC=+HC",
1281 "-HC=-HC",
1282 "+O=+O -HCOR,+O",
1283 "-O=-O +HCOR,-O",
1284 "+HCO=+HCO",
1285 "-HCO=-HCO",
1286 "+R=+R +R.+C +R.+H",
1287 "-R=-R -R.-C -R.-H");
1288
1289 foreach my $line (@summarizarr){
1290 next if $line !~ /[A-Za-z0-9]/;
1291 # print $line;
1292 chomp $line;
1293 my @fields = split(/=/,$line);
1294 # print "title = $fields[0]\n";
1295 my @parts=split(/ +/, $fields[1]);
1296 my %partshash = ();
1297 foreach my $part (@parts){$partshash{$part}=1;}
1298 my $count=0;
1299 foreach my $key ( sort keys %registeredTrees ){
1300 next if !exists $partshash{$key};
1301 # print "now adding $registeredTrees{$key} from $key\n";
1302 $count+=$registeredTrees{$key};
1303 }
1304 # print "$fields[0] : $count\n";
1305 }
1306 my $rootdir = $dir;
1307 $rootdir =~ s/\/[A-Za-z0-9\-_]+$//;
1308 chdir $rootdir;
1309 remove_tree($dir);
1310
1311
1312 #--------------------------------------------------------------------------------------------------------
1313 #--------------------------------------------------------------------------------------------------------
1314 #--------------------------------------------------------------------------------------------------------
1315 #--------------------------------------------------------------------------------------------------------
1316 #--------------------------------------------------------------------------------------------------------
1317 sub largest_number{
1318 my $counter = 0;
1319 my($max) = shift(@_);
1320 foreach my $temp (@_) {
1321
1322 #print "finding largest array: $maxcounter \n";
1323 if($temp > $max){
1324 $max = $temp;
1325 }
1326 }
1327 return($max);
1328 }
1329
1330 sub smallest_number{
1331 my $counter = 0;
1332 my($min) = shift(@_);
1333 foreach my $temp (@_) {
1334 #print "finding largest array: $maxcounter \n";
1335 if($temp < $min){
1336 $min = $temp;
1337 }
1338 }
1339 return($min);
1340 }
1341 #--------------------------------------------------------------------------------------------------------
1342 #--------------------------------------------------------------------------------------------------------
1343 #--------------------------------------------------------------------------------------------------------
1344 #--------------------------------------------------------------------------------------------------------
1345 #--------------------------------------------------------------------------------------------------------
1346 sub baseml_parser{
1347 my $outputfile = $_[0];
1348 open(BOUT,"<$outputfile") or die "Cannot open output of upstream baseml $outputfile: $!";
1349 my @info = ();
1350 my @branchields = ();
1351 my @distanceields = ();
1352 my @bout = <BOUT>;
1353 #print colored ['red'], @bout ,"\n";
1354 for my $b (0 ... $#bout){
1355 my $bine=$bout[$b];
1356 #print colored ['yellow'], "sentence = ",$bine;
1357 if ($bine =~ /TREE/){
1358 $bine=$bout[$b++];
1359 $bine=$bout[$b++];
1360 $bine=$bout[$b++];
1361 #print "FOUND",$bine;
1362 chomp $bine;
1363 $bine =~ s/^\s+//g;
1364 @branchields = split(/\s+/,$bine);
1365 $bine=$bout[$b++];
1366 chomp $bine;
1367 $bine =~ s/^\s+//g;
1368 @distanceields = split(/\s+/,$bine);
1369 #print "LASTING..............\n";
1370 last;
1371 }
1372 else{
1373 }
1374 }
1375
1376 close BOUT;
1377 # print "branchfields = @branchields and distanceields = @distanceields\n" if $printer == 1;
1378 my %distance_hash=();
1379 for my $d (0 ... $#branchields){
1380 $distance_hash{$branchields[$d]} = $distanceields[$d];
1381 }
1382
1383 $info[0] = $distance_hash{"9..1"} + $distance_hash{"9..2"};
1384 $info[1] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+ $distance_hash{"8..3"};
1385 $info[2] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+$distance_hash{"7..8"}+$distance_hash{"7..4"};
1386 $info[3] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+$distance_hash{"7..8"}+$distance_hash{"6..7"}+$distance_hash{"6..5"};
1387
1388 # print "\nsending back: @info\n" if $printer == 1;
1389
1390 return join("\t",@info);
1391
1392 }
1393
1394
1395 #--------------------------------------------------------------------------------------------------------
1396 #--------------------------------------------------------------------------------------------------------
1397 #--------------------------------------------------------------------------------------------------------
1398 #--------------------------------------------------------------------------------------------------------
1399 #--------------------------------------------------------------------------------------------------------
1400 sub test_complexity{
1401 my $printer = 0;
1402 my $sequence = $_[0];
1403 #print "sequence = $sequence\n";
1404 my $COMPLEXITY_SUPPORT = $_[1];
1405 my $complexity=int($COMPLEXITY_SUPPORT * (1/40)); #1 THIS IS AN ARBITRARY THRESHOLD SET FOR LOW COMPLEXITY.
1406 #2 THE INSPIRATION WAS WEB MILLER'S MAIL SENT ON
1407 #3 19 Apr 2008 WHERE HE CLASSED AS HIGH COMPLEXITY
1408 #4 REGION, IF 40 BP OF SEQUENCE HAS AT LEAST 3 OF
1409 #5 EACH NUCLEOTIDE. HENCE, I NORMALIZE THIS PARAMETER
1410 #6 FOR THE ACTUAL LENGTH OF $FLANK_SUPPORT SET BY
1411 #7 THE USER.
1412 #8 WEB MILLER SENT THE MAIL TO YDK104@PSU.EDU
1413
1414
1415
1416 my $As = ($sequence=~ s/A/A/gi);
1417 my $Ts = ($sequence=~ s/T/T/gi);
1418 my $Gs = ($sequence=~ s/G/G/gi);
1419 my $Cs = ($sequence=~ s/C/C/gi);
1420 my $dashes = ($sequence=~ s/\-/-/gi);
1421 $dashes = 0 if $sequence !~ /\-/;
1422 # print "seq = $sequence, As=$As, Ts=$Ts, Gs=$Gs, Cs=$Cs, dashes=$dashes\n";
1423 return "LOW" if $dashes > length($sequence)/2;
1424
1425 my $ans = ();
1426
1427 return "HIGH" if $As >= $complexity && $Ts >= $complexity && $Cs >= $complexity && $Gs >= $complexity;
1428
1429 my @nts = ("A","T","G","C","-");
1430
1431 my $lowcomplex = 0;
1432
1433 foreach my $nt (@nts){
1434 $lowcomplex =1 if $sequence =~ /(($nt\-*){$mono_flanksimplicityRepno,})/i;
1435 $lowcomplex =1 if $sequence =~ /(($nt[A-Za-z]){$di_flanksimplicityRepno,})/i;
1436 $lowcomplex =1 if $sequence =~ /(([A-Za-z]$nt){$di_flanksimplicityRepno,})/i;
1437 my $nont = ($sequence=~ s/$nt/$nt/gi);
1438 $lowcomplex = 1 if $nont > (length($sequence) * $prop_of_seq_allowedtoAT) && ($nt =~ /[AT\-]/);
1439 $lowcomplex = 1 if $nont > (length($sequence) * $prop_of_seq_allowedtoCG) && ($nt =~ /[CG]/);
1440 }
1441 # print "leaving for now.. $sequence\n" if $printer == 1 && $lowcomplex == 0;
1442 #<STDIN>;
1443 return "HIGH" if $lowcomplex == 0;
1444 return "LOW" ;
1445 }
1446 #--------------------------------------------------------------------------------------------------------
1447 #--------------------------------------------------------------------------------------------------------
1448 #--------------------------------------------------------------------------------------------------------
1449 #--------------------------------------------------------------------------------------------------------
1450 #--------------------------------------------------------------------------------------------------------
1451 sub sequence_similarity{
1452 my $printer = 0;
1453 my @seq1 = split(/\s*/, $_[0]);
1454 my @seq2 = split(/\s*/, $_[1]);
1455 my $similarity_thresh = $_[2];
1456 my $info = $_[3];
1457 # print "input = @_\n" if $printer == 1;
1458 my $seq1str = $_[0];
1459 my $seq2str = $_[1];
1460 $seq1str=~s/\-//g; $seq2str=~s/\-//g;
1461 my $similarity=0;
1462
1463 my $nucdiffs=0;
1464 my $nucsims=0;
1465 my $indeldiffs=0;
1466
1467 for my $i (0...$#seq1){
1468 $similarity++ if $seq1[$i] =~ /$seq2[$i]/i ; #|| $seq1[$i] =~ /\-/i || $seq2[$i] =~ /\-/i ;
1469 $nucsims++ if $seq1[$i] =~ /$seq2[$i]/i && ($seq1[$i] =~ /[a-zA-Z]/i && $seq2[$i] =~ /[a-zA-Z]/i);
1470 $nucdiffs++ if $seq1[$i] !~ /$seq2[$i]/i && ($seq1[$i] =~ /[a-zA-Z]/i && $seq2[$i] =~ /[a-zA-Z]/i);
1471 $indeldiffs++ if $seq1[$i] !~ /$seq2[$i]/i && $seq1[$i] =~ /\-/i || $seq2[$i] =~ /\-/i;
1472 }
1473 my $sim = $similarity/length($_[0]);
1474 return ( $sim, $nucdiffs, $indeldiffs ); #<= $similarity_thresh;
1475 }
1476 #--------------------------------------------------------------------------------------------------------
1477 #--------------------------------------------------------------------------------------------------------
1478 #--------------------------------------------------------------------------------------------------------
1479
1480 sub load_treesToReject{
1481 my @rejectlist = ();
1482 my $alltags = join("",@_);
1483 @rejectlist = qw (-HCOR +HCOR) if $alltags eq "HCORM";
1484 @rejectlist = qw ( -HCO|+R +HCO|-R) if $alltags eq "HCOR";
1485 @rejectlist = qw ( -HC|+O +HC|-O) if $alltags eq "HCO";
1486
1487 %treesToReject=();
1488 $treesToReject{$_} = $_ foreach (@rejectlist);
1489 #print "loaded to reject for $alltags; ", $treesToReject{$_},"\n" foreach (@rejectlist); #<STDIN>;
1490 }
1491 #--------------------------------------------------------------------------------------------------------
1492 sub load_treesToIgnore{
1493 my @rejectlist = ();
1494 my $alltags = join("",@_);
1495 @rejectlist = qw (-HCOR +HCOR +HCORM -HCORM) if $alltags eq "HCORM";
1496 @rejectlist = qw ( -HCO|+R +HCO|-R +HCOR -HCOR) if $alltags eq "HCOR";
1497 @rejectlist = qw ( -HC|+O +HC|-O +HCO -HCO) if $alltags eq "HCO";
1498
1499 %treesToIgnore=();
1500 $treesToIgnore{$_} = $_ foreach (@rejectlist);
1501 #print "loaded ", $treesToIgnore{$_},"\n" foreach (@rejectlist);
1502 }
1503 #--------------------------------------------------------------------------------------------------------
1504 sub load_thresholds{
1505 my @threshold_array=split(/[,_]/,$_[0]);
1506 unshift @threshold_array, "0";
1507 for my $size (1 ... 4){
1508 $thresholdhash{$size}=$threshold_array[$size];
1509 }
1510 }
1511 #--------------------------------------------------------------------------------------------------------
1512 sub load_allPossibleTrees{
1513 #1 THIS FILE STORES ALL POSSIBLE SCENARIOS OF MICROSATELLITE
1514 #2 BIRTH AND DEATH EVENTS ON A 5-PRIMATE TREE OF H,C,O,R,M
1515 #3 IN FORM OF A TEXT FILE. THIS WILL BE USED AS A TEMPLET
1516 #4 TO COMPARE EACH MICROSATELLITE CLUSTER TO UNDERSTAND THE
1517 #5 EVOLUTION OF EACH LOCUS. WE WILL THEN DISCARD SOME
1518 #6 MICROSATS ACCRODING TO THEIR EVOLUTIONARY BEHAVIOUR ON
1519 #7 THE TREE. MOST PROBABLY WE WILL REMOVE THOSE MICROSATS
1520 #8 THAT ARE NOT SUFFICIENTLY INFORMATIVE, LIKE IN CASE OF
1521 #9 AN OUTGROUP MICROSATELLITE BEING DIFFERENT FRON ALL OTHER
1522 #10 SPECIES IN THE TREE.
1523 my $tree_list = $_[0];
1524 # print "file to be loaded: $tree_list\n";
1525
1526 my @trarr = ();
1527 @trarr = ("#H C O CONCLUSION ALTERNATE",
1528 "+ + + +HCO NA",
1529 "+ _ _ +H NA",
1530 "_ + _ +C NA",
1531 "_ _ + -HC|+O NA",
1532 "+ _ + -C +H",
1533 "_ + + -H +C",
1534 "+ + _ +HC|-O NA",
1535 "_ _ _ -HCO NA") if $tree_list =~ /_HCO\.txt/;
1536 @trarr = ("#H C O R CONCLUSION ALTERNATE",
1537 "_ _ _ _ -HCOR NA",
1538 "+ + + + +HCOR NA",
1539 "+ + + _ +HCO|-R +H.+C.+O",
1540 "+ + _ _ +HC +H.+C;-O",
1541 "+ _ _ _ +H +HC,-C;+HC,-C",
1542 "_ + _ _ +C +HC,-H;+HC,-H",
1543 "_ _ + _ +O -HC|-H.-C",
1544 "_ _ + + -HC -H.-C",
1545 "+ _ _ + +H|-C.-O +HC,-C",
1546 "_ + _ + +C -H.-O",
1547 "_ + + _ -H +C.+O",
1548 "_ _ _ + -HCO|+R NA",
1549 "+ _ + _ +H.+O|-C NA",
1550 "_ + + + -H -HC,+C",
1551 "+ _ + + -C -HC,+H",
1552 "+ + _ + -O +HC") if $tree_list =~ /_HCOR\.txt/;
1553
1554 @trarr = ("#H C O R M CONCLUSION ALTERNATE",
1555 "+ + _ + + -O -HCO,+HC|-HCO,+HC;-HCO,(+H.+C)",
1556 "+ _ + + + -C -HC,+H;+HCO,(+H.+O)",
1557 "_ + + + + -H -HC,+C;-HCO,(+C.+O)",
1558 "_ _ + _ _ +O +HCO,-HC;+HCO,(-H.-C)",
1559 "_ + _ _ _ +C +HC,-H;+HCO,(-H.-O)",
1560 "+ _ _ _ _ +H +HC,-C;+HCO,(-C.-O)",
1561 "+ + + _ _ +HCO +H.+C.+O",
1562 "_ _ _ + + -HCO -HC.-O;-H.-C.-O",
1563 "+ _ _ + + -O.-C|-HCO,+H +R.+H;-HCO,(+R.+H)",
1564 "_ + _ + + -O.-H|-HCO,+C +R.+C;-HCO,(+R.+C)",
1565 "_ + + _ _ +HCO,-H|+O.+C NA",
1566 "+ _ + _ _ +HCO,-C|+O.+H NA",
1567 "_ _ + + + -HC -H.-C|-HCO,+O",
1568 "+ + _ _ _ +HC +H.+C|+HCO,-O|-HCO,+HC;-HCO,(+H.+C)",
1569 "+ + + + + +HCORM NA",
1570 "_ _ + _ + DISCARD +O;+HCO,-HC;+HCO,(-H.-C)",
1571 "_ + _ _ + +C +HC,-H;+HCO,(-H.-O)",
1572 "+ _ _ _ + +H +HC,-C;+HCO,(-C.-O)",
1573 "+ + _ _ + +HC -R.-O|+HCO,-O|+H.+C;-HCO,+HC;-HCO,(+H.+C)",
1574 "+ _ + _ + DISCARD -R.-C|+HCO,-C|+H.+O NA",
1575 "_ + + _ + DISCARD -R.-H|+HCO,-H|+C.+O NA",
1576 "_ _ _ _ + DISCARD -HCOR NA",
1577 "_ _ _ + _ DISCARD +R;-HC.-O;-H.-C.-O",
1578 "+ + _ + _ -O +R.+HC|-HCO,+HC;+H.+C.+R|-HCO,(+H.+C)",
1579 "+ + + + _ +HCOR NA",
1580 "+ + + _ + DISCARD -R;+HCO;+HC.+O;+H.+C.+O",
1581 "+ _ + + _ -C -HC,+H;+H.+O.+R|-HCO,(+H.+O)",
1582 "_ + + + _ -H -HC,+C;+C.+O.+R|-HCO,(+C.+O)",
1583 "_ _ + + _ -HC +R.+O|-HCO,+O|+HCO,-HC",
1584 "_ + _ + _ +C +R.+C|-HCO,+C|-HC,+C +HCO,(-H.-O)",
1585 "+ _ _ + _ +H +R.+H|-C.-O +HCO,(-C.-O)"
1586 ) if $tree_list =~ /_HCORM\.txt/;
1587
1588
1589 my $template_p = $_[1];
1590 my $alternate_p = $_[2];
1591 #1 THIS IS THE HASH IN WHICH INFORMATION FROM THE ABOVE FILE
1592 #2 GETS STORED, USING THE WHILE LOOP BELOW. HERE, THE KEY
1593 #3 OF EACH ROW IS THE EVOLUTIONARY CONFIGURATION OF A LOCUS
1594 #4 ON THE PRIMATE TREE, BASED ON PRESENCE/ABSENCE OF A MICROSAT
1595 #5 AT THAT LOCUS, LIKE SAY "+ + + _ _" .. EACH COLUMN BELONGS
1596 #6 TO ONE SPECIES; HERE THE COLUMN NAMES ARE "H C O R M".
1597 #7 THE VALUE FOR EACH ENTRY IS THE MEANING OF THE ABOVE
1598 #8 CONFIGURATION (I.E., CONFIGURAION OF THE KEY. HERE, THE
1599 #9 VALUE WILL BE +HCO, SIGNIFYING A BIRTH IN HUMAN-CHIMP-ORANG
1600 #10 COMMON ANCESTOR. THIS HASH HAS BEEN LOADED HERE TO BE USED
1601 #11 LATER BY THE SUBROUTINE sub treeStudy{} THAT STUDIES
1602 #12 EVOLUTIONARY CONFIGURAION OF EACH MICROSAT LOCUS, AS
1603 #13 MENTIONED ABOVE.
1604 my @keys_array=();
1605 foreach my $line (@trarr){
1606 # print $line,"\n";
1607 next if $line =~ /^#/;
1608 chomp $line;
1609 my @fields = split("\t", $line);
1610 push @keys_array, $fields[0];
1611 # print "loading: $fields[0]\n";
1612 $template_p->{$fields[0]}[0] = $fields[1];
1613 $template_p->{$fields[0]}[1] = 0;
1614 $alternate_p->{$fields[0]} = $fields[2];
1615 # $alternate_p->{$fields[1]} = $fields[2];
1616 # print "loading alternate_p $fields[1] $fields[2]\n"; #<STDIN> if $fields[1] eq "+H";
1617 }
1618 # print "loaded the trees with keys: @keys_array\n";
1619 return $template_p, \@keys_array, $alternate_p;
1620 }
1621
1622 #--------------------------------------------------------------------------------------------------------
1623 #--------------------------------------------------------------------------------------------------------
1624 #--------------------------------------------------------------------------------------------------------
1625 #--------------------------------------------------------------------------------------------------------
1626 #--------------------------------------------------------------------------------------------------------
1627 sub checkCleanCase{
1628 my $printer = 0;
1629 my $tree = $_[0];
1630 my $finalalignment = $_[1];
1631
1632 #print "IN checkCleanCase: @_\n";
1633 #<STDIN>;
1634 my @indivspecies = $tree =~ /[A-Z]/g;
1635 $finalalignment =~ s/\./_/g;
1636 my @captured = $finalalignment =~ /[A-Za-z, \(\):]+\![:A-Za-z, \(\)]/g;
1637
1638 my $unclean = 0;
1639
1640 foreach my $sp (@indivspecies){
1641 foreach my $cap (@captured){
1642 $cap =~ s/:[A-Za-z\-]+//g;
1643 my @sps = $cap =~ /[A-Z]+/g;
1644 my $spsc = join("", @sps);
1645 # print "checking whether imp species $sp is present in $cap i.e, in $spsc\n " if $printer == 1;
1646 if ($spsc =~ /$sp/){
1647 # print "foind : $sp\n";
1648 $unclean = 1; last;
1649 }
1650 }
1651 last if $unclean == 1;
1652 }
1653 #<STDIN>;
1654 return "CLEAN" if $unclean == 0;
1655 return "UNCLEAN";
1656 }
1657
1658 #--------------------------------------------------------------------------------------------------------
1659 #--------------------------------------------------------------------------------------------------------
1660 #--------------------------------------------------------------------------------------------------------
1661 #--------------------------------------------------------------------------------------------------------
1662 #--------------------------------------------------------------------------------------------------------
1663
1664
1665 sub adjustCoordinates{
1666 my $line = $_[0];
1667 return 0 if !defined $line;
1668 #print "------x------x------x------x------x------x------x------x------\n";
1669 #print $line,"\n\n";
1670 my $no_of_species = $line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)|(scaffold[0-9a-zA-Z\._\-]+)|(supercontig[0-9a-zA-Z\._\-]+)/x/ig;
1671 #print $line,"\n";
1672 #print "------x------x------x------x------x------x------x------x------\n\n\n";
1673 # my @got = ($line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)/x/g);
1674 # print "line = $line\n";
1675 $infocord = 2 + (4*$no_of_species) - 1;
1676 $typecord = 2 + (4*$no_of_species) + 1 - 1;
1677 $motifcord = 2 + (4*$no_of_species) + 2 - 1;
1678 $gapcord = $motifcord+1;
1679 $startcord = $gapcord+1;
1680 $strandcord = $startcord+1;
1681 $endcord = $strandcord + 1;
1682 $microsatcord = $endcord + 1;
1683 $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
1684 $interr_poscord = $microsatcord + 3;
1685 $no_of_interruptionscord = $microsatcord + 4;
1686 $interrcord = $microsatcord + 2;
1687 # print "$line\n startcord = $startcord, and endcord = $endcord and no_of_species = $no_of_species\n" if $line !~ /calJac/i;
1688
1689 return $no_of_species;
1690 }
1691
1692
1693 sub printhash{
1694 my $alivehash = $_[0];
1695 my @tags = @$_[1];
1696 # print "print hash\n";
1697 foreach my $tag (@tags){
1698 # print "$tag=",$alivehash->{$tag},"\n" if exists $alivehash->{$tag};
1699 }
1700
1701 return "\n"
1702 }
1703 sub peel_onion{
1704 my $printer = 0;
1705 # print "received: @_\n" ; #<STDIN>;
1706 $printer = 0;
1707 my ($tree, $sequences, $alignment, $tagarray, $microsathash, $nonmicrosathash, $motif, $tree_analysis, $threshold, $microsatstarts) = @_;
1708 # print "in peel onion.. tree = $tree \n" if $printer == 1;
1709 my %sequence_hash=();
1710
1711
1712 # for my $i (0 ... $#sequences){ $sequence_hash{$species[$i]}=$sequences->[$i]; }
1713
1714
1715 my %node_sequences=();
1716
1717 my %node_alignments = (); #NEW, Nov 28 2008
1718 my @tags=();
1719 my @locus_sequences=();
1720 my %alivehash=();
1721 foreach my $tag (@$tagarray) {
1722 #print "adding: $tag\n";
1723 push(@tags, $tag);
1724 $node_sequences{$tag}=join ".",split(/\s*/,$microsathash->{$tag}) if $microsathash->{$tag} ne "NULL";
1725 $alivehash{$tag}= $tag if $microsathash->{$tag} ne "NULL";
1726 $node_sequences{$tag}=join ".",split(/\s*/,$nonmicrosathash->{$tag}) if $microsathash->{$tag} eq "NULL";
1727 $node_alignments{$tag}=join ".",split(/\s*/,$alignment->{$tag}) ;
1728 push @locus_sequences, $node_sequences{$tag};
1729 # print "adding to node_seq: $tag = ",$node_alignments{$tag},"\n";
1730 }
1731
1732 #<STDIN>;
1733
1734 my ($nodes_arr, $branches_hash) = get_nodes($tree);
1735 my @nodes=@$nodes_arr;
1736 # print "recieved nodes = " if $printer == 1;
1737 # foreach my $key (@nodes) {print "@$key " if $printer == 1;}
1738
1739 # print "\n" if $printer == 1;
1740
1741 #POPULATE branches_hash WITH INFORMATION ABOUT LIVESTATUS
1742 foreach my $keys (@nodes){
1743 my @pair = @$keys;
1744 my $joint = "(".join(", ",@pair).")";
1745 my $copykey = join "", @pair;
1746 $copykey =~ s/[\W ]+//g;
1747 # print "for node: $keys, copykey = $copykey and joint = $joint\n" if $printer == 1;
1748 my $livestatus = 1;
1749 foreach my $copy (split(/\s*/,$copykey)){
1750 $livestatus = 0 if !exists $alivehash{$copy};
1751 }
1752 $alivehash{$joint} = $joint if !exists $alivehash{$joint} && $livestatus == 1;
1753 # print "alivehash = $alivehash{$joint}\n" if exists $alivehash{$joint} && $printer == 1;
1754 }
1755
1756 @nodes = reverse(@nodes); #1 THIS IS IN ORDER TO GO THROUGH THE TREE FROM LEAVES TO ROOT.
1757
1758 my @mutations_array=();
1759
1760 my $joint = ();
1761 foreach my $node (@nodes){
1762 my @pair = @$node;
1763 # print "now in the nodes for loop, pair = @pair\n and sequences=\n" if $printer == 1;
1764 $joint = "(".join(", ",@pair).")";
1765 my @pair_sequences=();
1766
1767 foreach my $tag (@pair){
1768 # print "$tag: $node_alignments{$tag}\n" if $printer == 1;
1769 # print $node_alignments{$tag},"\n" if $printer == 1;
1770 push @pair_sequences, $node_alignments{$tag};
1771 }
1772 # print "ppeel onion joint = $joint , pair_sequences=>@pair_sequences< , pair=>@pair<\n" if $printer == 1;
1773
1774 my ($compared, $substitutions_list) = base_by_base_simple($motif,\@pair_sequences, scalar(@pair_sequences), @pair, $joint);
1775 $node_alignments{$joint}=$compared;
1776 push( @mutations_array,split(/:/,$substitutions_list));
1777 # print "newly added to node_sequences: $node_alignments{$joint} and list of mutations =\n", join("\n",@mutations_array),"\n" if $printer == 1;
1778 }
1779
1780
1781 my $analayzed_mutations = analyze_mutations(\@mutations_array, \@nodes, $branches_hash, $alignment, \@tags, \%alivehash, \%node_sequences, $microsatstarts, $motif);
1782
1783 return ($analayzed_mutations, \@nodes, $branches_hash, \%alivehash, $node_alignments{$joint}) if scalar @mutations_array > 0;
1784 return ("NULL",\@nodes,$branches_hash, \%alivehash, "NULL") if scalar @mutations_array == 0;
1785 }
1786
1787 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
1788 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
1789
1790 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
1791 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
1792
1793 sub get_nodes{
1794 my $printer = 0;
1795
1796 my $tree=$_[0];
1797 #$tree =~ s/ +//g;
1798 $tree =~ s/\t+//g;
1799 $tree=~s/;//g;
1800 # print "tree=$tree\n" if $printer == 1;
1801 my @nodes = ();
1802 my @onions=($tree);
1803 my %branches=();
1804 foreach my $bite (@onions){
1805 $bite=~ s/^\(|\)$//g;
1806 chomp $bite;
1807 # print "tree = $bite \n";
1808 # <STDIN>;
1809 $bite=~ /([ ,\(\)A-Z]+)\,\s*([ ,\(\)A-Z]+)/;
1810 #$tree =~ /(\(\(\(H, C\), O\), R\))\, (M)/;
1811 my @raw_nodes = ($1, $2);
1812 # print "raw nodes = $1 and $2\n" if $printer == 1;
1813 push(@nodes, [@raw_nodes]);
1814 foreach my $node (@raw_nodes) {push (@onions, $node) if $node =~ /,/;}
1815 foreach my $node (@raw_nodes) {$branches{$node}="(".$bite.")"; }
1816 # print "onions = @onions\n" if $printer == 1;<STDIN> if $printer == 1;
1817 }
1818 $printer = 0;
1819 return \@nodes, \%branches;
1820 }
1821
1822
1823 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
1824 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
1825 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
1826 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
1827 sub analyze_mutations{
1828 my ($mutations_array, $nodes, $branches_hash, $alignment, $tags, $alivehash, $node_sequences, $microsatstarts, $motif) = @_;
1829 my $locuslength = length($alignment->{$tags->[0]});
1830 my $printer = 0;
1831
1832
1833 # print " IN analyzed_mutations....\n" if $printer == 1; # \n mutations array = @$mutations_array, \nAND locuslength = $locuslength\n" if $printer == 1;
1834 my %mutation_hash=();
1835 my %froms_megahash=();
1836 my %tos_megahash=();
1837 my %position_hash=();
1838 my @solutions_array=();
1839 foreach my $mutation (@$mutations_array){
1840 # print "loadin mutation: $mutation\n" if $printer == 1;
1841 my %localhash= $mutation =~ /([\S ]+)=([\S ]+)/g;
1842 $mutation_hash{$localhash{"position"}} = {%localhash};
1843 push @{$position_hash{$localhash{"position"}}},$localhash{"node"};
1844 # print "feeding position hash with $localhash{position}: $position_hash{$localhash{position}}[0]\n" if $printer == 1;
1845 $froms_megahash{$localhash{"position"}}{$localhash{"node"}}=$localhash{"from"};
1846 $tos_megahash{$localhash{"position"}}{$localhash{"node"}}=$localhash{"to"};
1847 # print "just a trial: $mutation_hash{$localhash{position}}{position}\n" if $printer == 1;
1848 # print "loadin in tos_megahash: $localhash{position} {$localhash{node} = $localhash{to}\n" if $printer == 1;
1849 # print "loadin in from: $localhash{position} {$localhash{node} = $localhash{from}\n" if $printer == 1;
1850 }
1851
1852 # print "now going through each position in loculength:\n" if $printer == 1;
1853 ## <STDIN> if $printer == 1;
1854
1855 for my $pos (0 ... $locuslength-1){
1856 # print "at position: $pos\n" if $printer == 1;
1857
1858 if (exists($mutation_hash{$pos})){
1859 my @local_nodes=@{$position_hash{$pos}};
1860 # print "found mutation: @{$position_hash{$pos}} : @local_nodes\n" if $printer == 1;
1861
1862 foreach my $local_node (@local_nodes){
1863 # print "at local node: $local_node ... from state = $froms_megahash{$pos}{$local_node}\n" if $printer == 1;
1864 my $open_insertion=();
1865 my $open_deletion=();
1866 my $open_to_substitution=();
1867 my $open_from_substitution=();
1868 if ($froms_megahash{$pos}{$local_node} eq "-"){
1869 # print "here exists a microsatellite from $local_node to $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};;
1870 # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}} && $printer == 1;
1871 #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};
1872 $open_insertion=$tos_megahash{$pos}{$local_node};
1873 for my $posnext ($pos+1 ... $locuslength-1){
1874 # print "in first if .... studying posnext: $posnext\n" if $printer == 1;
1875 last if !exists ($froms_megahash{$posnext}{$local_node});
1876 # print "for posnext: $posnext, there exists $froms_megahash{$posnext}{$local_node}.. already, open_insertion = $open_insertion.. checking is $froms_megahash{$posnext}{$local_node} matters\n" if $printer == 1;
1877 $open_insertion = $open_insertion.$tos_megahash{$posnext}{$local_node} if $froms_megahash{$posnext}{$local_node} eq "-";
1878 # print "now open_insertion=$open_insertion\n" if $printer == 1;
1879 delete $mutation_hash{$posnext} if $froms_megahash{$posnext}{$local_node} eq "-";
1880 }
1881 # print "1 Feeding in: ", join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_insertion", "deletion="),"\n" if $printer == 1;
1882 push (@solutions_array, join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_insertion", "deletion="));
1883 }
1884 elsif ($tos_megahash{$pos}{$local_node} eq "-"){
1885 # print "here exists a microsatellite to $local_node from $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};;
1886 # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};
1887 #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};
1888 $open_deletion=$froms_megahash{$pos}{$local_node};
1889 for my $posnext ($pos+1 ... $locuslength-1){
1890 # print "in 1st elsif studying posnext: $posnext\n" if $printer == 1;
1891 # print "nexting as nextpos does not exist\n" if !exists ($tos_megahash{$posnext}{$local_node}) && $printer == 1;
1892 last if !exists ($tos_megahash{$posnext}{$local_node});
1893 # print "for posnext: $posnext, there exists $tos_megahash{$posnext}{$local_node}\n" if $printer == 1;
1894 $open_deletion = $open_deletion.$froms_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} eq "-";
1895 delete $mutation_hash{$posnext} if $tos_megahash{$posnext}{$local_node} eq "-";
1896 }
1897 # print "2 Feeding in:", join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_deletion"), "\n" if $printer == 1;
1898 push (@solutions_array, join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_deletion"));
1899 }
1900 elsif ($tos_megahash{$pos}{$local_node} ne "-"){
1901 # print "here exists a microsatellite from $local_node to $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};;
1902 # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};
1903 #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};
1904 # print "microsatstart = $microsatstarts->{$local_node} \n" if exists $microsatstarts->{$local_node} && $pos < $microsatstarts->{$local_node} && $printer == 1;
1905 next if exists $microsatstarts->{$local_node} && $pos < $microsatstarts->{$local_node};
1906 $open_to_substitution=$tos_megahash{$pos}{$local_node};
1907 $open_from_substitution=$froms_megahash{$pos}{$local_node};
1908 # print "open from substitution: $open_from_substitution \n" if $printer == 1;
1909 for my $posnext ($pos+1 ... $locuslength-1){
1910 #print "in last elsif studying posnext: $posnext\n";
1911 last if !exists ($tos_megahash{$posnext}{$local_node});
1912 # print "for posnext: $posnext, there exists $tos_megahash{$posnext}{$local_node}\n" if $printer == 1;
1913 $open_to_substitution = $open_to_substitution.$tos_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} ne "-";
1914 $open_from_substitution = $open_from_substitution.$froms_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} ne "-";
1915 delete $mutation_hash{$posnext} if $tos_megahash{$posnext}{$local_node} ne "-" && $froms_megahash{$posnext}{$local_node} ;
1916 }
1917 # print "open from substitution: $open_from_substitution \n" if $printer == 1;
1918
1919 #IS THE STRETCH OF SUBSTITUTION MICROSATELLITE-LIKE?
1920 my @motif_parts=split(/\s*/,$motif);
1921 #GENERATING THE FLEXIBLE LEFT END
1922 my $left_query=();
1923 for my $k (1 ... $#motif_parts) {
1924 $left_query= $motif_parts[$k]."|)";
1925 $left_query="(".$left_query;
1926 }
1927 $left_query=$left_query."?";
1928 # print "left_quewry = $left_query\n" if $printer == 1;
1929 #GENERATING THE FLEXIBLE RIGHT END
1930 my $right_query=();
1931 for my $k (0 ... ($#motif_parts-1)) {
1932 $right_query= "(|".$motif_parts[$k];
1933 $right_query=$right_query.")";
1934 }
1935 $right_query=$right_query."?";
1936 # print "right_query = $right_query\n" if $printer == 1;
1937 # print "Hence, searching for: ^$left_query($motif)+$right_query\$\n" if $printer == 1;
1938
1939 my $motifcomb=$motif x 50;
1940 # print "motifcomb = $motifcomb\n" if $printer == 1;
1941 if ( ($motifcomb =~/$open_to_substitution/i) && (length ($open_to_substitution) >= length($motif)) ){
1942 # print "sequence microsat-like\n" if $printer == 1;
1943 my $all_microsat_like = 0;
1944 # print "3 feeding in: ", join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_from_substitution"), "\n" if $printer == 1;
1945 push (@solutions_array, join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_from_substitution"));
1946 # print "4 feeding in: ", join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_to_substitution", "deletion="), "\n" if $printer == 1;
1947 push (@solutions_array, join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_to_substitution", "deletion="));
1948
1949 }
1950 else{
1951 # print "5 feeding in: ", join("\t", "node=$local_node","type=substitution" ,"position=$pos", "from=$open_from_substitution", "to=$open_to_substitution", "insertion=", "deletion="), "\n" if $printer == 1;
1952 push (@solutions_array, join("\t", "node=$local_node","type=substitution" ,"position=$pos", "from=$open_from_substitution", "to=$open_to_substitution", "insertion=", "deletion="));
1953 }
1954 #IS THE FROM-SEQUENCE MICROSATELLITE-LIKE?
1955
1956 }
1957 #<STDIN> if $printer ==1;
1958 }
1959 #<STDIN> if $printer ==1;
1960 }
1961 }
1962 # print "\n", "#" x 50, "\n" if $printer == 1;
1963 foreach my $tag (@$tags){
1964 # print "$tag: $alignment->{$tag}\n" if $printer == 1;
1965 }
1966 # print "\n", "#" x 50, "\n" if $printer == 1;
1967 # print "returning SOLUTIONS ARRAY : \n",join("\n", @solutions_array),"\n" if $printer == 1;
1968 #print "end\n";
1969 #<STDIN> if
1970 return \@solutions_array;
1971 }
1972 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
1973 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
1974 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
1975 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
1976
1977 sub base_by_base_simple{
1978 my $printer = 0;
1979 my ($motif, $locus, $no, $pair0, $pair1, $joint) = @_;
1980 my @seq_array=();
1981 # print "IN SUBROUTUNE base_by_base_simple.. information received = @_\n" if $printer == 1;
1982 # print "pair0 = $pair0 and pair1 = $pair1\n" if $printer == 1;
1983
1984 my @example=split(/\./,$locus->[0]);
1985 # print "example, for length = @example\n" if $printer == 1;
1986 for my $i (0...$no-1){push(@seq_array, [split(/\./,$locus->[$i])]); }
1987
1988 my @compared_sequence=();
1989 my @substitutions_list;
1990 for my $i (0...scalar(@example)-1){
1991
1992 #print "i = $i\n" if $printer == 1;
1993 #print "comparing $seq_array[0][$i] and $seq_array[1][$i] \n" ;#if $printer == 1;
1994 if ($seq_array[0][$i] =~ /!/ && $seq_array[1][$i] !~ /!/){
1995
1996 my $resolution= resolve_base($seq_array[0][$i],$seq_array[1][$i], $pair1 ,"keep" );
1997 # print "ancestral = $resolution\n" if $printer == 1;
1998
1999 if ($resolution =~ /$seq_array[1][$i]/i && $resolution !~ /!/){
2000 push @substitutions_list, add_mutation($i, $pair0, $seq_array[0][$i], $resolution );
2001 }
2002 elsif ( $resolution !~ /!/){
2003 push @substitutions_list, add_mutation($i, $pair1, $seq_array[1][$i], $resolution);
2004 }
2005 push @compared_sequence,$resolution;
2006 }
2007 elsif ($seq_array[0][$i] !~ /!/ && $seq_array[1][$i] =~ /!/){
2008
2009 my $resolution= resolve_base($seq_array[1][$i],$seq_array[0][$i], $pair0, "invert" );
2010 # print "ancestral = $resolution\n" if $printer == 1;
2011
2012 if ($resolution =~ /$seq_array[0][$i]/i && $resolution !~ /!/){
2013 push @substitutions_list, add_mutation($i, $pair1, $seq_array[1][$i], $resolution);
2014 }
2015 elsif ( $resolution !~ /!/){
2016 push @substitutions_list, add_mutation($i, $pair0, $seq_array[0][$i], $resolution);
2017 }
2018 push @compared_sequence,$resolution;
2019 }
2020 elsif($seq_array[0][$i] =~ /!/ && $seq_array[1][$i] =~ /!/){
2021 push @compared_sequence, add_bases($seq_array[0][$i],$seq_array[1][$i], $pair0, $pair1, $joint );
2022 }
2023 else{
2024 if($seq_array[0][$i] !~ /^$seq_array[1][$i]$/i){
2025 push @compared_sequence, $pair0.":".$seq_array[0][$i]."!".$pair1.":".$seq_array[1][$i];
2026 }
2027 else{
2028 # print "perfect match\n" if $printer == 1;
2029 push @compared_sequence, $seq_array[0][$i];
2030 }
2031 }
2032 }
2033 # print "returning: comared = @compared_sequence \nand substitutions list =\n", join("\n",@substitutions_list),"\n" if $printer == 1;
2034 return join(".",@compared_sequence), join(":", @substitutions_list) if scalar (@substitutions_list) > 0;
2035 return join(".",@compared_sequence), "" if scalar (@substitutions_list) == 0;
2036 }
2037
2038
2039 sub resolve_base{
2040 my $printer = 0;
2041 # print "IN SUBROUTUNE resolve_base.. information received = @_\n" if $printer == 1;
2042 my ($optional, $single, $singlesp, $arg) = @_;
2043 my @options=split(/!/,$optional);
2044 foreach my $option(@options) {
2045 $option=~s/[A-Z\(\) ,]+://g;
2046 if ($option =~ /$single/i){
2047 # print "option = $option , returning single: $single\n" if $printer == 1;
2048 return $single;
2049 }
2050 }
2051 # print "returning ",$optional."!".$singlesp.":".$single. "\n" if $arg eq "keep" && $printer == 1;
2052 # print "returning ",$singlesp.":".$single."!".$optional. "\n" if $arg eq "invert" && $printer == 1;
2053 return $optional."!".$singlesp.":".$single if $arg eq "keep";
2054 return $singlesp.":".$single."!".$optional if $arg eq "invert";
2055
2056 }
2057
2058 sub same_length{
2059 my $printer = 0;
2060 my @locus = @_;
2061 my $temp = shift @locus;
2062 $temp=~s/-|,//g;
2063 foreach my $l (@locus){
2064 $l=~s/-|,//g;
2065 return 0 if length($l) != length($temp);
2066 $temp = $l;
2067 }
2068 return 1;
2069 }
2070 sub treeStudy{
2071 my $printer = 1;
2072 # print "template DEFINED.. received: @_\n" if defined %template;
2073 # print "only received = @_" if !defined %template;
2074 my $stopper = 0;
2075 # if (!defined %template){ TEMP MASKED OCT 18 2012
2076 $stopper = 1;
2077 %template=();
2078 # print "tree decipherer = $tree_decipherer\n" if $printer == 1;
2079 my ( $template_ref, $keys_array)=load_allPossibleTrees($tree_decipherer, \%template);
2080 # print "return = $template_ref and @{$keys_array}\n" if $printer == 1;
2081 foreach my $key (@$keys_array){
2082 # print "addding : $template_ref->{$key} for $key\n" if $printer == 1;
2083 $template{$key} = $template_ref->{$key};
2084 }
2085 # } TEMP MASK OCT 18 2012 END
2086 # <STDIN>;
2087 for my $templet ( keys %template ) {
2088 # print "$templet => @{$template{$templet}}\n";
2089 }
2090 # <STDIN> if !defined %template;
2091
2092 my $strict = 0;
2093
2094 my $H = 0;
2095 my $Hchr = 1;
2096 my $Hstart = 2;
2097 my $Hend = 3;
2098 my $Hmotif = 4;
2099 my $Hmotiflen = 5;
2100 my $Hmicro = 6;
2101 my $Hstrand = 7;
2102 my $Hmicrolen = 8;
2103 my $Hinterpos = 9;
2104 my $Hrelativepos = 10;
2105 my $Hinter = 11;
2106 my $Hinterlen = 12;
2107
2108 my $C = 13;
2109 my $Cchr = 14;
2110 my $Cstart = 15;
2111 my $Cend = 16;
2112 my $Cmotif = 17;
2113 my $Cmotiflen = 18;
2114 my $Cmicro = 19;
2115 my $Cstrand = 20;
2116 my $Cmicrolen = 21;
2117 my $Cinterpos = 22;
2118 my $Crelativepos = 23;
2119 my $Cinter = 24;
2120 my $Cinterlen = 25;
2121
2122 my $O = 26;
2123 my $Ochr = 27;
2124 my $Ostart = 28;
2125 my $Oend = 29;
2126 my $Omotif = 30;
2127 my $Omotiflen = 31;
2128 my $Omicro = 32;
2129 my $Ostrand = 33;
2130 my $Omicrolen = 34;
2131 my $Ointerpos = 35;
2132 my $Orelativepos = 36;
2133 my $Ointer = 37;
2134 my $Ointerlen = 38;
2135
2136 my $R = 39;
2137 my $Rchr = 40;
2138 my $Rstart = 41;
2139 my $Rend = 42;
2140 my $Rmotif = 43;
2141 my $Rmotiflen = 44;
2142 my $Rmicro = 45;
2143 my $Rstrand = 46;
2144 my $Rmicrolen = 47;
2145 my $Rinterpos = 48;
2146 my $Rrelativepos = 49;
2147 my $Rinter = 50;
2148 my $Rinterlen = 51;
2149
2150 my $Mchr = 52;
2151 my $Mstart = 53;
2152 my $Mend = 54;
2153 my $M = 55;
2154 my $Mmotif = 56;
2155 my $Mmotiflen = 57;
2156 my $Mmicro = 58;
2157 my $Mstrand = 59;
2158 my $Mmicrolen = 60;
2159 my $Minterpos = 61;
2160 my $Mrelativepos = 62;
2161 my $Minter = 63;
2162 my $Minterlen = 64;
2163
2164 #-------------------------------------------------------------------------------#
2165 my @analysis=();
2166
2167
2168 my %speciesOrder = ();
2169 $speciesOrder{"H"} = 0;
2170 $speciesOrder{"C"} = 1;
2171 $speciesOrder{"O"} = 2;
2172 $speciesOrder{"R"} = 3;
2173 $speciesOrder{"M"} = 4;
2174 #-------------------------------------------------------------------------------#
2175
2176 my $line = $_[0];
2177 chomp $line;
2178
2179 my @f = split(/\t/,$line);
2180 # print "received array : @f.. recieved tags = @tags\n" if $printer == 1;
2181
2182 # collect all motifs
2183 my @motifs=();
2184 @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif], $f[$Mmotif]) if $tags[$#tags] =~ /M/;
2185 @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif]) if $tags[$#tags] =~ /R/;
2186 @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif]) if $tags[$#tags] =~ /O/;
2187 # print "motifs in the array = $f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif]\n" if $tags[$#tags] =~ /R/;;
2188 # print "motifs = @motifs\n" if $printer == 1;
2189 my @translation = ();
2190 foreach my $motif (@motifs){
2191 push(@translation, "_") if $motif eq "NA";
2192 push(@translation, "+") if $motif ne "NA";
2193 }
2194 my $translate = join(" ", @translation);
2195 # print "translate = >$translate< and analysis = $template{$translate}[0].. on the other hand, ",$template{"- - +"}[0],"\n";
2196 my @analyses = split(/\|/,$template{$translate}[0]);
2197 # print "motifs = @motifs, analyses = @analyses\n" if $printer == 1;
2198
2199 if (scalar(@analyses) == 1) {
2200 #print "analysis = $analyses[0]\n";
2201 if ($analyses[0] !~ /,|\./ ){
2202 if ($analyses[0] =~ /\+/){
2203 my $analysis = $analyses[0];
2204 $analysis =~ s/\+|\-//g;
2205 my @species = split(/\s*/,$analysis);
2206 my @currentMotifs = ();
2207 foreach my $specie (@species){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); #print "pushing into currentMotifs: $speciesOrder{$specie}: $motifs[$speciesOrder{$specie}]\n" if $printer == 1;
2208 }
2209 # print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1;
2210 $template{$translate}[1]++ if $strict == 1 && consistency(@currentMotifs) ne "NULL";
2211 $template{$translate}[1]++ if $strict == 0;
2212 # print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1;
2213 }
2214 else{
2215 my $analysis = $analyses[0];
2216 $analysis =~ s/\+|\-//g;
2217 my @species = split(/\s*/,$analysis);
2218 my @currentMotifs = ();
2219 my @complementarySpecies = ();
2220 my $allSpecies = join("",@tags);
2221 foreach my $specie (@species){ $allSpecies =~ s/$specie//g; }
2222 foreach my $specie (split(/\s*/,$allSpecies)){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); #print "pushing into currentMotifs: $speciesOrder{$specie}: $motifs[$speciesOrder{$specie}]\n" if $printer == 1;;
2223 }
2224 # print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1;
2225 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL";
2226 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0;
2227 # print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1;
2228 }
2229 }
2230
2231 elsif ($analyses[0] =~ /,/) {
2232 my @events = split(/,/,$analyses[0]);
2233 # print "events = @events \n " if $printer == 1;
2234 if ($events[0] =~ /\+/){
2235 my $analysis1 = $events[0];
2236 $analysis1 =~ s/\+|\-//g;
2237 my $analysis2 = $events[1];
2238 $analysis2 =~ s/\+|\-//g;
2239 my @nSpecies = split(/\s*/,$analysis2);
2240 # print "original anslysis = $analysis1 " if $printer == 1;
2241 foreach my $specie (@nSpecies){ $analysis1=~ s/$specie//g;}
2242 # print "processed anslysis = $analysis1 \n" if $printer == 1;
2243 my @currentMotifs = ();
2244 foreach my $specie (split(/\s*/,$analysis1)){push(@currentMotifs, $motifs[$speciesOrder{$specie}]); }
2245 # print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1;
2246 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL";
2247 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0;
2248 # print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1;
2249 }
2250 else{
2251 my $analysis1 = $events[0];
2252 $analysis1 =~ s/\+|\-//g;
2253 my $analysis2 = $events[1];
2254 $analysis2 =~ s/\+|\-//g;
2255 my @pSpecies = split(/\s*/,$analysis2);
2256 my @currentMotifs = ();
2257 foreach my $specie (@pSpecies){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); }
2258 # print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1;
2259 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL";
2260 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0;
2261 # print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1;
2262
2263 }
2264
2265 }
2266 elsif ($analyses[0] =~ /\./) {
2267 my @events = split(/\./,$analyses[0]);
2268 foreach my $event (@events){
2269 # print "event = $event \n" if $printer == 1;
2270 if ($event =~ /\+/){
2271 my $analysis = $event;
2272 $analysis =~ s/\+|\-//g;
2273 my @species = split(/\s*/,$analysis);
2274 my @currentMotifs = ();
2275 foreach my $specie (@species){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); }
2276 #print consistency(@currentMotifs),"<- \n";
2277 # print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1;
2278 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL";
2279 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0;
2280 # print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1;
2281 }
2282 else{
2283 my $analysis = $event;
2284 $analysis =~ s/\+|\-//g;
2285 my @species = split(/\s*/,$analysis);
2286 my @currentMotifs = ();
2287 my @complementarySpecies = ();
2288 my $allSpecies = join("",@tags);
2289 foreach my $specie (@species){ $allSpecies =~ s/$specie//g; }
2290 foreach my $specie (split(/\s*/,$allSpecies)){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); }
2291 #print consistency(@currentMotifs),"<- \n";
2292 # print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1;
2293 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL";
2294 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0;
2295 # print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1;
2296 }
2297 }
2298
2299 }
2300 }
2301 else{
2302 my $finalanalysis = ();
2303 $template{$translate}[1]++;
2304 foreach my $analysis (@analyses){ ;}
2305 }
2306 # test if motifs where microsats are present, as indeed of same the motif composition
2307
2308
2309
2310 for my $templet ( keys %template ) {
2311 if (@{ $template{$templet} }[1] > 0){
2312
2313 $template{$templet}[1] = 0;
2314 # print "now returning: @{$template{$templet}}[0], $templet\n";
2315 return (@{$template{$templet}}[0], $templet);
2316 }
2317 }
2318 undef %template;
2319 # print "sending NULL\n" if $printer == 1;
2320 return ("NULL", "NULL");
2321
2322 }
2323
2324
2325 sub consistency{
2326 my @motifs = @_;
2327 # print "in consistency \n" if $printer == 1;
2328 # print "motifs sent = >",join("|",@motifs),"< \n" if $printer == 1;
2329 return $motifs[0] if scalar(@motifs) == 1;
2330 my $prevmotif = shift(@motifs);
2331 my $stopper = 0;
2332 for my $i (0 ... $#motifs){
2333 next if $motifs[$i] eq "NA";
2334 my $templet = $motifs[$i].$motifs[$i];
2335 if ($templet !~ /$prevmotif/i){
2336 $stopper = 1; last;
2337 }
2338 }
2339 return $prevmotif if $stopper == 0;
2340 return "NULL" if $stopper == 1;
2341 }
2342 sub summarize_microsat{
2343 my $printer = 1;
2344 my $line = $_[0];
2345 my $humseq = $_[1];
2346
2347 my @gaps = $line =~ /[0-9]+\t[0-9]+\t[\+\-]/g;
2348 my @starts = $line =~ /[0-9]+\t[\+\-]/g;
2349 my @ends = $line =~ /[\+\-]\t[0-9]+/g;
2350 # print "starts = @starts\tends = @ends\n" if $printer == 1;
2351 for my $i (0 ... $#gaps) {$gaps[$i] =~ s/\t[0-9]+\t[\+\-]//g;}
2352 for my $i (0 ... $#starts) {$starts[$i] =~ s/\t[\+\-]//g;}
2353 for my $i (0 ... $#ends) {$ends[$i] =~ s/[\+\-]\t//g;}
2354
2355 my $minstart = array_smallest_number(@starts);
2356 my $maxend = array_largest_number(@ends);
2357
2358 my $humupstream_st = substr($humseq, 0, $minstart);
2359 my $humupstream_en = substr($humseq, 0, $maxend);
2360 my $no_of_gaps_to_start = 0;
2361 my $no_of_gaps_to_end = 0;
2362 $no_of_gaps_to_start = ($humupstream_st =~ s/\-/x/g) if $humupstream_st=~/\-/;
2363 $no_of_gaps_to_end = ($humupstream_en =~ s/\-/x/g) if $humupstream_en=~/\-/;
2364
2365 my $locusmotif = ();
2366 # print "IN SUB SUMMARIZE_MICROSAT $line\n" if $printer == 1;
2367 #return "NULL" if $line =~ /compound/;
2368 my $Hstart = "NA";
2369 my $Hend = "NA";
2370 chomp $line;
2371 my $match_count = ($line =~ s/>/>/g);
2372 #print "number of species = $match_count\n";
2373 my @micros = split(/>/,$line);
2374 shift @micros;
2375 my $stopper = 0;
2376
2377
2378 foreach my $mic (@micros){
2379 my @local = split(/\t/,$mic);
2380 if ($local[$microsatcord] =~ /N/) {$stopper =1; last;}
2381 }
2382 return "NULL" if $stopper ==1;
2383
2384 #------------------------------------------------------
2385
2386 my @arranged = ();
2387 for my $arr (0 ... $#exacttags) {$arranged[$arr] = '0';}
2388
2389 foreach my $micro (@micros){
2390 for my $i (0 ... $#exacttags){
2391 if ($micro =~ /^$exacttags[$i]/){
2392 $arranged[$i] = $micro;
2393 last;
2394 }
2395 }
2396 }
2397 # print "arranged = @arranged \n" ; <STDIN>;;
2398
2399 my @endstatement = ();
2400 my $turn = 0;
2401 my $species_counter = 0;
2402 # print scalar(@arranged),"\n";
2403
2404 my $species_no=0;
2405
2406 my $orthHchr = 0;
2407
2408 foreach my $micro (@arranged) {
2409 $micro =~ s/\t\t/\t \t/g;
2410 $micro =~ s/\t,/\t ,/g;
2411 $micro =~ s/,\t/, \t/g;
2412 # print "------------------------------------------------------------------------------------------\n" if $printer == 1;
2413 chomp $micro;
2414 if ($micro eq '0'){
2415 push(@endstatement, join("\t",$exacttags[$species_counter],"NA","NA","NA","NA",0 ,"NA", "NA", 0,"NA","NA","NA", "NA" ));
2416 $species_counter++;
2417 # print join("|","ENDSTATEMENT:",@endstatement),"\n" if $printer == 1;
2418 next;
2419 }
2420 # print $micro,"\n";
2421 # print "micro = $micro \n" if $printer == 1;
2422 my @fields = split(/\t/,$micro);
2423 my $microcopy = $fields[$microsatcord];
2424 $microcopy =~ s/\[|\]|-//g;
2425 my $microsatlength = length($microcopy);
2426 # print "microsat = $fields[$microsatcord] and microsatlength = $microsatlength\n" if $printer == 1;
2427 # print "sp_ident = @sp_ident.. species_no=$species_no\n";
2428 $micro =~ /$sp_ident[$species_no]\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)/;
2429 # print "$micro =~ /$sp_ident[$species_no] ([0-9a-zA-Z_]+) ([0-9]+) ([0-9]+)/\n";
2430 my $sp_chr=$1;
2431 my $sp_start=$2 + $fields[$startcord] - $fields[$gapcord];
2432 my $sp_end= $sp_start + $microsatlength - 1;
2433
2434 $species_no++;
2435
2436 $micro =~ /$focalspec_orig\s(\S+)\s([0-9]+)\s([0-9]+)/;
2437 $orthHchr=$1;
2438 $Hstart=$2+$minstart-$no_of_gaps_to_start;
2439 $Hend=$2+$maxend-$no_of_gaps_to_end;
2440 # print "Hstart = $Hstart = $fields[4] + $fields[$startcord] - $fields[$gapcord]\n" if $printer == 1;
2441
2442 my $motif = $fields[$motifcord];
2443 my $firstmotif = ();
2444 my $strand = $fields[$strandcord];
2445 # print "strand = $strand\n";
2446
2447
2448 if ($motif =~ /^\[/){
2449 $motif =~ s/^\[//g;
2450 $motif =~ /([a-zA-Z]+)\].*/;
2451 $firstmotif = $1;
2452 }
2453
2454 else {$firstmotif = $motif;}
2455 # print "firstmotif =$firstmotif : \n" if $printer == 1;
2456 $firstmotif = allCaps($firstmotif);
2457
2458 if (exists $revHash{$firstmotif} && $turn == 0) {
2459 $turn=1 if $species_counter==0;
2460 $firstmotif = $revHash{$firstmotif};
2461 }
2462
2463 elsif (exists $revHash{$firstmotif} && $turn == 1) {$firstmotif = $revHash{$firstmotif}; $turn = 1;}
2464 # print "changed firstmotif =$firstmotif\n" if $printer == 1;
2465 # <STDIN>;
2466 $locusmotif = $firstmotif;
2467
2468 if (scalar(@fields) > $microsatcord + 2){
2469 # print "fields = @fields ... interr_poscord=$interr_poscord=$fields[$interr_poscord] .. interrcord=$interrcord=$fields[$interrcord]\n" if $printer == 1;
2470
2471 my @interposes = ();
2472 @interposes = split(",",$fields[$interr_poscord]) if $fields[$interr_poscord] =~ /,/;
2473 $interposes[0] = $fields[$interr_poscord] if $fields[$interr_poscord] !~ /,/ ;
2474 # print "interposes=@interposes\n" if $printer == 1;
2475 my @relativeposes = ();
2476 my @interruptions = ();
2477 @interruptions = split(",",$fields[$interrcord]) if $fields[$interrcord] =~ /,/;
2478 $interruptions[0] = $fields[$interrcord] if $fields[$interrcord] !~ /,/;
2479 my @interlens = ();
2480
2481
2482 for my $i (0 ... $#interposes){
2483
2484 my $interpos = $interposes[$i];
2485 my $nexter = 0;
2486 my $interruption = $interruptions[$i];
2487 my $interlen = length($interruption);
2488 push (@interlens, $interlen);
2489
2490
2491 my $relativepos = (100 * $interpos) / $microsatlength;
2492 # print "relativepos = $relativepos ,interpos=$interpos, interruption=$interruption, interlen=$interlen \n" if $printer == 1;
2493 $relativepos = (100 * ($interpos-$interlen)) / $microsatlength if $relativepos > 50;
2494 # print "--> = $relativepos\n" if $printer == 1;
2495 $interruption = "IND" if length($interruption) < 1;
2496
2497 if ($turn == 1){
2498 $fields[$microsatcord] = switch_micro($fields[$microsatcord]);
2499 $interruption = switch_nucl($interruption) unless $interruption eq "IND";
2500 $interpos = ($microsatlength - $interpos) - $interlen + 2;
2501 # print "turn interpos = $interpos for $fields[$microsatcord]\n" if $printer == 1;
2502 $relativepos = (100 * $interpos) / $microsatlength;
2503 $relativepos = (100 * ($interpos-$interlen)) / $microsatlength if $relativepos > 50;
2504
2505
2506 $strand = '+' if $strand eq '-';
2507 $strand = '-' if $strand eq '+';
2508 }
2509 # print "final relativepos = $relativepos\n" if $printer == 1;
2510 push(@relativeposes, $relativepos);
2511 }
2512 push(@endstatement,join("\t",($exacttags[$species_counter],$sp_chr, $sp_start, $sp_end, $firstmotif,length($firstmotif),$fields[$microsatcord],$strand,$microsatlength,join(",",@interposes),join(",",@relativeposes),join(",",@interruptions), join(",",@interlens))));
2513 }
2514
2515 else{
2516 push(@endstatement, join("\t",$exacttags[$species_counter],$sp_chr, $sp_start, $sp_end, $firstmotif,length($firstmotif),$fields[$microsatcord],$strand,$microsatlength,"NA","NA","NA", "NA"));
2517 }
2518
2519 $species_counter++;
2520 }
2521
2522 $locusmotif = $sameHash{$locusmotif} if exists $sameHash{$locusmotif};
2523 $locusmotif = $revHash{$locusmotif} if exists $revHash{$locusmotif};
2524
2525 my $endst = join("\t", @endstatement, $orthHchr, $Hstart, $Hend);
2526 # print join("\t", @endstatement, $orthHchr, $Hstart, $Hend), "\n" if $printer == 1;
2527
2528
2529 return (join("\t", @endstatement, $orthHchr, $Hstart, $Hend), $orthHchr, $Hstart, $Hend, $locusmotif, length($locusmotif));
2530
2531 }
2532
2533 sub switch_nucl{
2534 my @strand = split(/\s*/,$_[0]);
2535 for my $i (0 ... $#strand){
2536 if ($strand[$i] =~ /c/i) {$strand[$i] = "G";next;}
2537 if ($strand[$i] =~ /a/i) {$strand[$i] = "T";next;}
2538 if ($strand[$i] =~ /t/i) { $strand[$i] = "A";next;}
2539 if ($strand[$i] =~ /g/i) {$strand[$i] = "C";next;}
2540 }
2541 return join("",@strand);
2542 }
2543
2544
2545 sub switch_micro{
2546 my $micro = reverse($_[0]);
2547 my @strand = split(/\s*/,$micro);
2548 for my $i (0 ... $#strand){
2549 if ($strand[$i] =~ /c/i) {$strand[$i] = "G";next;}
2550 if ($strand[$i] =~ /a/i) {$strand[$i] = "T";next;}
2551 if ($strand[$i] =~ /t/i) { $strand[$i] = "A";next;}
2552 if ($strand[$i] =~ /g/i) {$strand[$i] = "C";next;}
2553 if ($strand[$i] =~ /\[/i) {$strand[$i] = "]";next;}
2554 if ($strand[$i] =~ /\]/i) {$strand[$i] = "[";next;}
2555 }
2556 return join("",@strand);
2557 }
2558 sub decipher_history{
2559 my $printer = 0;
2560 my ($mutations_array, $tags_string, $nodes, $branches_hash, $tree_analysis, $confirmation_string, $alivehash) = @_;
2561 my %mutations_hash=();
2562 foreach my $mutation (@$mutations_array){
2563 # print "mutation = $mutation\n" if $printer == 1;
2564 my %local = $mutation =~ /([\S ]+)=([\S ]+)/g;
2565 push @{$mutations_hash{$local{"node"}}},$mutation;
2566 # print "just for confirmation: $local{node} pushed as: $mutation\n" if $printer == 1;
2567 }
2568 my @nodes;
2569 my @birth_steps=();
2570 my @death_steps=();
2571
2572 my @tags=split(/\s*/,$tags_string);
2573 my @confirmation=split(/\s+/,$confirmation_string);
2574 my %info=();
2575
2576 for my $i (0 ... $#tags){
2577 $info{$tags[$i]}=$confirmation[$i];
2578 # print "feeding info: $tags[$i] = $info{$tags[$i]}\n" if $printer == 1;
2579 }
2580
2581 for my $keys (@$nodes) {
2582 foreach my $key (@$keys){
2583 # print "current key = $key\n";
2584 my $copykey = $key;
2585 $copykey =~ s/[\W ]+//g;
2586 my @copykeys=split(/\s*/,$copykey);
2587 my $states=();
2588 foreach my $copy (@copykeys){
2589 $states=$states.$info{$copy};
2590 }
2591 # print "reduced key = $copykey and state = $states\n" if $printer == 1;
2592
2593 if (exists $mutations_hash{$key}) {
2594
2595 if ($states=~/\+/){
2596 push @birth_steps, @{$mutations_hash{$key}};
2597 $birth_steps[$#birth_steps] =~ s/\S+=//g;
2598 delete $mutations_hash{$key};
2599 }
2600 else{
2601 push @death_steps, @{$mutations_hash{$key}};
2602 $death_steps[$#death_steps] =~ s/\S+=//g;
2603 delete $mutations_hash{$key};
2604 }
2605 }
2606 }
2607 }
2608 # print "conformation = $confirmation_string\n" if $printer == 1;
2609 push (@birth_steps, "NULL") if scalar(@birth_steps) == 0;
2610 push (@death_steps, "NULL") if scalar(@death_steps) == 0;
2611 # print "birth steps = ",join("\n",@birth_steps)," and death steps = ",join("\n",@death_steps),"\n" if $printer == 1;
2612 return \@birth_steps, \@death_steps;
2613 }
2614
2615 sub fillAlignmentGaps{
2616 my $printer = 0;
2617 # print "received: @_\n" if $printer == 1;
2618 my ($tree, $sequences, $alignment, $tagarray, $microsathash, $nonmicrosathash, $motif, $tree_analysis, $threshold, $microsatstarts) = @_;
2619 # print "in fillAlignmentGaps.. tree = $tree \n" if $printer == 1;
2620 my %sequence_hash=();
2621
2622 my @phases = ();
2623 my $concat = $motif.$motif;
2624 my $motifsize = length($motif);
2625
2626 for my $i (1 ... $motifsize){
2627 push @phases, substr($concat, $i, $motifsize);
2628 }
2629
2630 my $concatalignment = ();
2631 foreach my $tag (@tags){
2632 $concatalignment = $concatalignment.$alignment->{$tag};
2633 }
2634 # print "returningg NULL","NULL","NULL", "NULL\n" if $concatalignment !~ /-/;
2635 return 0, "NULL","NULL","NULL", "NULL","NULL" if $concatalignment !~ /-/;
2636
2637
2638
2639 my %node_sequences_temp=();
2640 my %node_alignments_temp =(); #NEW, Nov 28 2008
2641
2642 my @tags=();
2643 my @locus_sequences=();
2644 my %alivehash=();
2645
2646 # print "IN fillAlignmentGaps\n";# <STDIN>;
2647 my %fillrecord = ();
2648
2649 my $change = 0;
2650 foreach my $tag (@$tagarray) {
2651 #print "adding: $tag\n";
2652 push(@tags, $tag);
2653 if (exists $microsathash->{$tag}){
2654 my $micro = $microsathash->{$tag};
2655 my $orig_micro = $micro;
2656 ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases);
2657 $change = 1 if uc($micro) ne uc($orig_micro);
2658 $node_sequences_temp{$tag}=$micro if $microsathash->{$tag} ne "NULL";
2659 }
2660 if (exists $nonmicrosathash->{$tag}){
2661 my $micro = $nonmicrosathash->{$tag};
2662 my $orig_micro = $micro;
2663 ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases);
2664 $change = 1 if uc($micro) ne uc($orig_micro);
2665 $node_sequences_temp{$tag}=$micro if $nonmicrosathash->{$tag} ne "NULL";
2666 }
2667
2668 if (exists $alignment->{$tag}){
2669 my $micro = $alignment->{$tag};
2670 my $orig_micro = $micro;
2671 ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases);
2672 $change = 1 if uc($micro) ne uc($orig_micro);
2673 $node_alignments_temp{$tag}=$micro if $alignment->{$tag} ne "NULL";
2674 }
2675
2676 #print "adding to node_sequences: $tag = ",$node_sequences_temp{$tag},"\n" if $printer == 1;
2677 #print "adding to node_alignments: $tag = ",$node_alignments_temp{$tag},"\n" if $printer == 1;
2678 }
2679
2680
2681 my %node_sequences=();
2682 my %node_alignments =(); #NEW, Nov 28 2008
2683 foreach my $tag (@$tagarray) {
2684 $node_sequences{$tag} = join ".",split(/\s*/,$node_sequences_temp{$tag});
2685 $node_alignments{$tag} = join ".",split(/\s*/,$node_alignments_temp{$tag});
2686 }
2687 # print "\n", "#" x 50, "\n" if $printer == 1;
2688 foreach my $tag (@tags){
2689 # print "$tag: $alignment->{$tag} = $node_alignments{$tag}\n" if $printer == 1;
2690 }
2691 # print "\n", "#" x 50, "\n" if $printer == 1;
2692 # print "change = $change\n";
2693 #<STDIN> if $concatalignment=~/\-/;
2694
2695 # <STDIN> if $printer == 1 && $concatalignment =~ /\-/;
2696
2697 return 0, "NULL","NULL","NULL", "NULL", "NULL" if $change == 0;
2698
2699 my ($nodes_arr, $branches_hash) = get_nodes($tree);
2700 my @nodes=@$nodes_arr;
2701 # print "recieved nodes = @nodes\n" if $printer == 1;
2702
2703
2704 #POPULATE branches_hash WITH INFORMATION ABOUT LIVESTATUS
2705 foreach my $keys (@nodes){
2706 my @pair = @$keys;
2707 my $joint = "(".join(", ",@pair).")";
2708 my $copykey = join "", @pair;
2709 $copykey =~ s/[\W ]+//g;
2710 # print "for node: $keys, copykey = $copykey and joint = $joint\n" if $printer == 1;
2711 my $livestatus = 1;
2712 foreach my $copy (split(/\s*/,$copykey)){
2713 $livestatus = 0 if !exists $alivehash{$copy};
2714 }
2715 $alivehash{$joint} = $joint if !exists $alivehash{$joint} && $livestatus == 1;
2716 # print "alivehash = $alivehash{$joint}\n" if exists $alivehash{$joint} && $printer == 1;
2717 }
2718
2719
2720
2721 @nodes = reverse(@nodes); #1 THIS IS IN ORDER TO GO THROUGH THE TREE FROM LEAVES TO ROOT.
2722
2723 my @mutations_array=();
2724
2725 my $joint = ();
2726 foreach my $node (@nodes){
2727 my @pair = @$node;
2728 # print "now in the nodes for loop, pair = @pair\n and sequences=\n" if $printer == 1;
2729 $joint = "(".join(", ",@pair).")";
2730 # print "joint = $joint \n" if $printer == 1;
2731 my @pair_sequences=();
2732
2733 foreach my $tag (@pair){
2734 # print "tag = $tag: " if $printer == 1;
2735 # print $node_alignments{$tag},"\n" if $printer == 1;
2736 push @pair_sequences, $node_alignments{$tag};
2737 }
2738 # print "fillgap\n";
2739 my ($compared, $substitutions_list) = base_by_base_simple($motif,\@pair_sequences, scalar(@pair_sequences), @pair, $joint);
2740 $node_alignments{$joint}=$compared;
2741 push( @mutations_array,split(/:/,$substitutions_list));
2742 # print "newly added to node_sequences: $node_alignments{$joint} and list of mutations = @mutations_array\n" if $printer == 1;
2743 }
2744 # print "now sending for analyze_mutations: mutation_array=@mutations_array, nodes=@nodes, branches_hash=$branches_hash, alignment=$alignment, tags=@tags, alivehash=%alivehash, node_sequences=\%node_sequences, microsatstarts=$microsatstarts, motif=$motif\n" if $printer == 1;
2745 # <STDIN> if $printer == 1;
2746
2747 my $analayzed_mutations = analyze_mutations(\@mutations_array, \@nodes, $branches_hash, $alignment, \@tags, \%alivehash, \%node_sequences, $microsatstarts, $motif);
2748
2749 # print "returningt: ", $analayzed_mutations, \@nodes,"\n" if scalar @mutations_array > 0;;
2750 # print "returningy: NULL, NULL, NULL " if scalar @mutations_array == 0 && $printer == 1;
2751 # print "final node alignment after filling for $joint= " if $printer == 1;
2752 # print "$node_alignments{$joint}\n" if $printer == 1;
2753
2754
2755 return 1, $analayzed_mutations, \@nodes, $branches_hash, \%alivehash, $node_alignments{$joint} if scalar @mutations_array > 0 ;
2756 return 1, "NULL","NULL","NULL", "NULL", "NULL" if scalar @mutations_array == 0;
2757 }
2758
2759
2760
2761 sub add_mutation{
2762 my $printer = 0;
2763 # print "IN SUBROUTUNE add_mutation.. information received = @_\n" if $printer == 1;
2764 my ($i , $bite, $to, $from) = @_;
2765 # print "bite = $bite.. all received info = ",join("^", @_),"\n" if $printer == 1;
2766 # print "to=$to\n" if $printer == 1;
2767 # print "tis split = ",join(" and ",split(/!/,$to)),"\n" if $printer == 1;
2768 my @toields = split "!",$to;
2769 # print "toilds = @toields\n" if $printer == 1;
2770 my @mutations=();
2771
2772 foreach my $toield (@toields){
2773 my @toinfo=split(":",$toield);
2774 # print " at toinfo=@toinfo \n" if $printer == 1;
2775 next if $toinfo[1] =~ /$from/i;
2776 my @mutation = @toinfo if $toinfo[1] !~ /$from/i;
2777 # print "adding to mutaton list: ", join(",", "node=$mutation[0]","type=substitution" ,"position=$i", "from=$from", "to=$mutation[1]", "insertion=", "deletion="),"\n" if $printer == 1;
2778 push (@mutations, join("\t", "node=$mutation[0]","type=substitution" ,"position=$i", "from=$from", "to=$mutation[1]", "insertion=", "deletion="));
2779 }
2780 return @mutations;
2781 }
2782
2783
2784 sub add_bases{
2785
2786 my $printer = 0;
2787 # print "IN SUBROUTUNE add_bases.. information received = @_\n" if $printer == 1;
2788 my ($optional0, $optional1, $pair0, $pair1,$joint) = @_;
2789 my $total_list=();
2790
2791 my @total_list0=split(/!/,$optional0);
2792 my @total_list1=split(/!/,$optional1);
2793 my @all_list=();
2794 my %total_hash0=();
2795 foreach my $entry (@total_list0) {
2796 $entry = uc $entry;
2797 $entry =~ /(\S+):(\S+)/;
2798 $total_hash0{$2}=$1;
2799 push @all_list, $2;
2800 }
2801
2802 my %total_hash1=();
2803 foreach my $entry (@total_list1) {
2804 $entry = uc $entry;
2805 $entry =~ /(\S+):(\S+)/;
2806 $total_hash1{$2}=$1;
2807 push @all_list, $2;
2808 }
2809
2810 my %alphabetical_hash=();
2811 my @return_options=();
2812
2813 for my $i (0 ... $#all_list){
2814 my $alph = $all_list[$i];
2815 if (exists $total_hash0{$alph} && exists $total_hash1{$alph}){
2816 push(@return_options, $joint.":".$alph);
2817 delete $total_hash0{$alph}; delete $total_hash1{$alph};
2818 }
2819 if (exists $total_hash0{$alph} && !exists $total_hash1{$alph}){
2820 push(@return_options, $pair0.":".$alph);
2821 delete $total_hash0{$alph};
2822 }
2823 if (!exists $total_hash0{$alph} && exists $total_hash1{$alph}){
2824 push(@return_options, $pair1.":".$alph);
2825 delete $total_hash1{$alph};
2826 }
2827
2828 }
2829 # print "returning ",join "!",@return_options,"\n" if $printer == 1;
2830 return join "!",@return_options;
2831
2832 }
2833
2834
2835 sub fillgaps{
2836 # print "IN fillgaps: @_\n";
2837 my ($micro, $phasesinput) = @_;
2838 #print "in microsathash ,,.. micro = $micro\n";
2839 return $micro if $micro !~ /\-/;
2840 my $orig_micro = $micro;
2841 my @phases = @$phasesinput;
2842
2843 my %tested_patterns = ();
2844
2845 foreach my $phase (@phases){
2846 # print "considering phase: $phase\n";
2847 my @phase_prefixes = ();
2848 my @prephase_left_contexts = ();
2849 my @prephase_right_contexts = ();
2850 my @pregapsize = ();
2851 my @prepostfilins = ();
2852
2853 my @phase_suffixes;
2854 my @suffphase_left_contexts;
2855 my @suffphase_right_contexts;
2856 my @suffgapsize;
2857 my @suffpostfilins;
2858
2859 my @postfilins = ();
2860 my $motifsize = length($phases[0]);
2861
2862 my $change = 0;
2863
2864 for my $u (0 ... $motifsize-1){
2865 my $concat = $phase.$phase.$phase.$phase;
2866 my @concatarr = split(/\s*/, $concat);
2867 my $l = 0;
2868 while ($l < $u){
2869 shift @concatarr;
2870 $l++;
2871 }
2872 $concat = join ("", @concatarr);
2873
2874 for my $t (0 ... $motifsize-1){
2875 for my $k (1 ... $motifsize-1){
2876 push @phase_prefixes, substr($concat, $motifsize+$t, $k);
2877 push @prephase_left_contexts, substr ($concat, $t, $motifsize);
2878 push @prephase_right_contexts, substr ($concat, $motifsize+$t+$k+($motifsize-$k), 1);
2879 push @pregapsize, $k;
2880 push @prepostfilins, substr($concat, $motifsize+$t+$k, ($motifsize-$k));
2881 # print "reading: $concat, t=$t, k=$k prefix: $prephase_left_contexts[$#prephase_left_contexts] $phase_prefixes[$#phase_prefixes] -x$pregapsize[$#pregapsize] $prephase_right_contexts[$#prephase_right_contexts]\n";
2882 # print "phase_prefixes = $phase_prefixes[$#phase_prefixes]\n";
2883 # print "prephase_left_contexts = $prephase_left_contexts[$#prephase_left_contexts]\n";
2884 # print "prephase_right_contexts = $prephase_right_contexts[$#prephase_right_contexts]\n";
2885 # print "pregapsize = $pregapsize[$#pregapsize]\n";
2886 # print "prepostfilins = $prepostfilins[$#prepostfilins]\n";
2887 }
2888 }
2889 }
2890
2891 # print "looking if $micro =~ /($phase\-{$motifsize})/i || $micro =~ /^(\-{$motifsize,}$phase)/i\n";
2892 if ($micro =~ /($phase\-{$motifsize,})$/i || $micro =~ /^(\-{$motifsize,}$phase)/i){
2893 # print "micro: $micro needs further gap removal: $1\n";
2894 while ($micro =~ /$phase(\-{$motifsize,})$/i || $micro =~ /^(\-{$motifsize,})$phase/i){
2895 # print "micro: $micro needs further gap removal: $1\n";
2896
2897 # print "phase being considered = $phase\n";
2898 my $num = ();
2899 $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi if $micro =~ /$phase\-{$motifsize,}/i;
2900 $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi if $micro =~ /\-{$motifsize,}$phase/i;
2901 # print "num = $num\n";
2902 $change = 1 if $num == 1;
2903 }
2904 }
2905
2906 elsif ($micro =~ /(($phase)+)\-{$motifsize,}(($phase)+)/i){
2907 while ($micro =~ /(($phase)+)\-{$motifsize,}(($phase)+)/i){
2908 # print "checking lengths of $1 and $3 for $micro... \n";
2909 my $num = ();
2910 if (length($1) >= length($3)){
2911 # print "$micro matches (($phase)+)\-{$motifsize,}(($phase)+) = $1, >= , $3 \n";
2912 $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi ;
2913 }
2914 if (length($1) < length($3)){
2915 # print "$micro matches (($phase)+)\-{$motifsize,}(($phase)+) = $1, < , $3 \n";
2916 $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi ;
2917 }
2918 # print "micro changed to $micro\n";
2919 }
2920 }
2921 elsif ($micro =~ /([A-Z]+)\-{$motifsize,}(($phase)+)/i){
2922 while ($micro =~ /([A-Z]+)\-{$motifsize,}(($phase)+)/i){
2923 # print "$micro matches ([A-Z]+)\-{$motifsize}(($phase)+) = 1=$1, - , 3=$3 \n";
2924 my $num = 0;
2925 $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi ;
2926 }
2927 }
2928 elsif ($micro =~ /(($phase)+)\-{$motifsize,}([A-Z]+)/i){
2929 while ($micro =~ /(($phase)+)\-{$motifsize,}([A-Z]+)/i){
2930 # print "$micro matches (($phase)+)\-{$motifsize,}([A-Z]+) = 1=$1, - , 3=$3 \n";
2931 my $num = 0;
2932 $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi ;
2933 }
2934 }
2935
2936 # print "$orig_micro to $micro\n";
2937
2938 #s <STDIN>;
2939
2940 for my $h (0 ... $#phase_prefixes){
2941 # print "searching using prefix : $prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]\n";
2942 my $pattern = $prephase_left_contexts[$h].$phase_prefixes[$h].$pregapsize[$h].$prephase_right_contexts[$h];
2943 # print "returning orig_micro = $orig_micro, micro = $micro \n" if exists $tested_patterns{$pattern};
2944 if ($micro =~ /$prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]/i){
2945 return $orig_micro if exists $tested_patterns{$pattern};
2946 while ($micro =~ /($prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h])/i){
2947 $tested_patterns{$pattern} = $pattern;
2948 # print "micro: $micro needs further gap removal: $1\n";
2949
2950 # print "prefix being considered = $phase_prefixes[$h]\n";
2951 my $num = ();
2952 $num = ($micro =~ s/$prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]/$prephase_left_contexts[$h]$phase_prefixes[$h]$prepostfilins[$h]$prephase_right_contexts[$h]/gi) ;
2953 # print "num = $num, micro = $micro\n";
2954 $change = 1 if $num == 1;
2955
2956 return $orig_micro if $num > 1;
2957 }
2958 }
2959
2960 }
2961 }
2962 return $orig_micro if length($micro) != length($orig_micro);
2963 return $micro;
2964 }
2965
2966 sub selectMutationArray{
2967 my $printer =0;
2968
2969 my $oldmutspt = $_[0];
2970 my $newmutspt = $_[1];
2971 my $tagstringpt = $_[2];
2972 my $alivehashpt = $_[3];
2973 my $alignmentpt = $_[4];
2974 my $motif = $_[5];
2975
2976 my @alivehasharr=();
2977
2978 my @tags = @$tagstringpt;
2979 my $alignmentln = length($alignmentpt->{$tags[0]});
2980
2981 foreach my $key (keys %$alivehashpt) { push @alivehasharr, $key; }
2982
2983 my %newside = ();
2984 my %oldside = ();
2985 my %newmuts = ();
2986
2987 my %commons = ();
2988 my %olds = ();
2989 foreach my $old (@$oldmutspt){
2990 $olds{$old} = 1;
2991 }
2992 foreach my $new (@$newmutspt){
2993 $commons{$new} = 1 if exists $olds{$new};;
2994 }
2995
2996
2997 foreach my $pos ( 0 ... $alignmentln){
2998 #print "pos = $pos\n" if $printer == 1;
2999 my $newyes = 0;
3000 foreach my $mut (@$newmutspt){
3001 $newmuts{$mut} = 1;
3002 chomp $mut;
3003 $newyes++;
3004 $mut =~ s/=\t/= \t/g;
3005 $mut =~ s/=$/= /g;
3006
3007 $mut =~ /node=([A-Z\(\), ]+)\stype=([a-zA-Z ]+)\sposition=([0-9 ]+)\sfrom=([a-zA-Z\- ]+)\sto=([a-zA-Z\- ]+)\sinsertion=([a-zA-Z\- ]+)\sdeletion=([a-zA-Z\- ]+)/;
3008 my $node = $1;
3009 next if $3 != $pos;
3010 # print "new mut = $mut\n" if $printer == 1;
3011 # print "node = $node, pos = $3 ... and alivehasharr = >@alivehasharr<\n" if $printer == 1;
3012 my $alivenode = 0;
3013 foreach my $key (@alivehasharr){
3014 $alivenode = 1 if $key =~ /$node/;
3015 }
3016 # next if $alivenode == 0;
3017 my $indel_type = " ";
3018 if ($2 eq "insertion" || $2 eq "deletion"){
3019 my $thisindel = ();
3020 $thisindel = $6 if $2 eq "insertion";
3021 $thisindel = $7 if $2 eq "deletion";
3022
3023 $indel_type = "i".checkIndelType($node, $thisindel, $motif,$alignmentpt,$3, $2) if $2 eq "insertion";
3024 $indel_type = "d".checkIndelType($node, $thisindel, $motif,$alignmentpt, $3, $2) if $2 eq "deletion";
3025 $indel_type = $indel_type."f" if $indel_type =~ /mot/ && length($thisindel) >= length($motif);
3026 }
3027 # print "indeltype = $indel_type\n" if $printer == 1;
3028 my $added = 0;
3029
3030 if (exists $newside{$pos} && $indel_type =~ /[a-z]+/){
3031 # print "we have a preexisting one for $pos\n" if $printer == 1;
3032 my @preexisting = @{$newside{$pos}};
3033 foreach my $pre (@preexisting){
3034 # print "looking at $pre\n" if $printer == 1;
3035 next if $pre !~ /node=$node/;
3036 next if $pre !~ /indeltype=([a-z]+)/;
3037 my $currtype = $1;
3038
3039 if ($currtype =~ /inon/ && $indel_type =~ /dmot/){
3040 delete $newside{$pos};
3041 push @{$newside{$pos}}, $pre;
3042 $added = 1;
3043 }
3044 if ($currtype =~ /dnon/ && $indel_type =~ /imot/){
3045 delete $newside{$pos};
3046 push @{$newside{$pos}}, $pre;
3047 $added = 1;
3048 }
3049 if ($currtype =~ /dmot/ && $indel_type =~ /inon/){
3050 delete $newside{$pos};
3051 push @{$newside{$pos}}, $mut."\tindeltype=$indel_type";
3052 $added = 1;
3053 }
3054 if ($currtype =~ /imot/ && $indel_type =~ /dnon/){
3055 delete $newside{$pos};
3056 push @{$newside{$pos}}, $mut."\tindeltype=$indel_type";
3057 $added = 1;
3058 }
3059 }
3060 }
3061 # print "added = $added\n" if $printer == 1;
3062 push @{$newside{$pos}}, $mut."\tindeltype=$indel_type" if $added == 0;
3063 # print "for new pos,: $pos we have: @{$newside{$pos}}\n " if $printer == 1;
3064 }
3065 }
3066
3067 foreach my $pos ( 0 ... $alignmentln){
3068 my $oldyes = 0;
3069 foreach my $mut (@$oldmutspt){
3070 chomp $mut;
3071 $oldyes++;
3072 $mut =~ s/=\t/= \t/g;
3073 $mut =~ s/=$/= /g;
3074 $mut =~ /node=([A-Z\(\), ]+)\ttype=([a-zA-Z ]+)\tposition=([0-9 ]+)\tfrom=([a-zA-Z\- ]+)\tto=([a-zA-Z\- ]+)\tinsertion=([a-zA-Z\- ]+)\tdeletion=([a-zA-Z\- ]+)/;
3075 my $node = $1;
3076 next if $3 != $pos;
3077 # print "old mut = $mut\n" if $printer == 1;
3078 my $alivenode = 0;
3079 foreach my $key (@alivehasharr){
3080 $alivenode = 1 if $key =~ /$node/;
3081 }
3082 #next if $alivenode == 0;
3083 my $indel_type = " ";
3084 if ($2 eq "insertion" || $2 eq "deletion"){
3085 $indel_type = "i".checkIndelType($node, $6, $motif,$alignmentpt, $3, $2) if $2 eq "insertion";
3086 $indel_type = "d".checkIndelType($node, $7, $motif,$alignmentpt, $3, $2) if $2 eq "deletion";
3087 next if $indel_type =~/non/;
3088 }
3089 else{ next;}
3090
3091 my $imp=0;
3092 $imp = 1 if $indel_type =~ /dmot/ && $alivenode == 0;
3093 $imp = 1 if $indel_type =~ /imot/ && $alivenode == 1;
3094
3095
3096 if (exists $newside{$pos} && $indel_type =~ /[a-z]+/){
3097 my @preexisting = @{$newside{$pos}};
3098 # print "we have a preexisting one for $pos: @preexisting\n" if $printer == 1;
3099 next if $imp == 0;
3100
3101 if (scalar(@preexisting) == 1){
3102 my $foundmut = $preexisting[0];
3103 $foundmut=~ /node=([A-Z, \(\)]+)/;
3104 next if $1 eq $node;
3105
3106 if (exists $oldside{$pos} || exists $commons{$foundmut}){
3107 # print "not replacing, but just adding\n" if $printer == 1;
3108 push @{$newside{$pos}}, $mut."\tindeltype=$indel_type";
3109 push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type";
3110 next;
3111 }
3112
3113 delete $newside{$pos};
3114 push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type";
3115 push @{$newside{$pos}}, $mut."\tindeltype=$indel_type";
3116 # print "now new one is : @{$newside{$pos}}\n" if $printer == 1;
3117 }
3118 # print "for pos: $pos: @{$newside{$pos}}\n" if $printer == 1;
3119 next;
3120 }
3121
3122
3123 my @news = @{$newside{$pos}} if exists $newside{$pos};
3124 # print "mut = $mut and news = @news\n" if $printer == 1;
3125 push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type";
3126 push @{$newside{$pos}}, $mut."\tindeltype=$indel_type";
3127 }
3128 }
3129 # print "in the end, our collected mutations = \n" if $printer == 1;
3130 my @returnarr = ();
3131 foreach my $key (keys %newside) {push @returnarr,@{$newside{$key}};}
3132 # print join("\n", @returnarr),"\n" if $printer == 1;
3133 #<STDIN>;
3134 return @returnarr;
3135
3136 }
3137
3138
3139 sub checkIndelType{
3140 my $printer = 0;
3141 my $node = $_[0];
3142 my $indel = $_[1];
3143 my $motif = $_[2];
3144 my $alignmentpt = $_[3];
3145 my $posit = $_[4];
3146 my $type = $_[5];
3147 my @phases =();
3148 my %prephases = ();
3149 my %postphases = ();
3150 #print "motif = $motif\n";
3151 # print "IN checkIndelType ... received: @_\n" if $printer == 1;
3152 my $concat = $motif.$motif.$motif.$motif;
3153 my $motiflength = length($motif);
3154
3155 if ($motiflength > length ($indel)){
3156 return "non" if $motif !~ /$indel/i;
3157 return checkIndelType_ComplexAnalysis($node, $indel, $motif, $alignmentpt, $posit, $type);
3158 }
3159
3160 my $firstpass = 0;
3161 for my $y (0 ... $motiflength-1){
3162 my $phase = substr($concat, $motiflength+$y, $motiflength);
3163 push @phases, $phase;
3164 $firstpass = 1 if $indel =~ /$phase/i;
3165 for my $k (0 ... length($motif)-1){
3166 # print "at: motiflength=$motiflength , y=$y , k=$k.. for pre: $motiflength+$y-$k and post: $motiflength+$y-$k+$motiflength in $concat\n" if $printer == 1;
3167 my $pre = substr($concat, $motiflength+$y-$k, $k );
3168 my $post = substr($concat, $motiflength+$y+$motiflength, $k);
3169 # print "adding to phases : $phase - $pre and $post\n" if $printer == 1;
3170 push @{$prephases{$phase}} , $pre;
3171 push @{$postphases{$phase}} , $post;
3172 }
3173
3174 }
3175 # print "firstpass 1= $firstpass\n" if $printer == 1;
3176 return "non" if $firstpass ==0;
3177 $firstpass =0;
3178
3179 foreach my $phase (@phases){
3180 my @pres = @{$prephases{$phase}};
3181 my @posts = @{$postphases{$phase}};
3182
3183 foreach my $pre (@pres){
3184 foreach my $post (@posts){
3185
3186 $firstpass = 1 if $indel =~ /($pre)?($phase)+($post)?/i && length($indel) > (3 * length($motif));
3187 $firstpass = 1 if $indel =~ /^($pre)?($phase)+($post)?$/i && length($indel) < (3 * length($motif));
3188 # print "matched here : ($pre)?($phase)+($post)?\n" if $printer == 1;
3189 last if $firstpass == 1;
3190 }
3191 last if $firstpass == 1;
3192 }
3193 last if $firstpass == 1;
3194 }
3195 # print "firstpass 2= $firstpass\n" if $printer == 1;
3196 return "non" if $firstpass ==0;
3197 return "mot" if $firstpass ==1;
3198 }
3199
3200
3201 sub checkIndelType_ComplexAnalysis{
3202 my $printer = 0;
3203 my $node = $_[0];
3204 my $indel = $_[1];
3205 my $motif = $_[2];
3206 my $alignmentpt = $_[3];
3207 my $pos = $_[4];
3208 my $type = $_[5];
3209 my @speciesinvolved = $node =~ /[A-Z]+/g;
3210
3211 my @seqs = ();
3212 my $residualseq = length($motif) - length($indel);
3213 # print "IN COMPLEX ANALYSIS ... received: @_ .... speciesinvolved = @speciesinvolved\n" if $printer == 1;
3214 # print "we have position = $pos, sseq = $alignmentpt->{$speciesinvolved[0]}\n" if $printer == 1;
3215 # print "residualseq = $residualseq\n" if $printer == 1;
3216 # print "pos=$pos... got: @_\n" if $printer == 1;
3217 foreach my $sp (@speciesinvolved){
3218 my $spseq = $alignmentpt->{$sp};
3219 #print "orig spseq = $spseq\n";
3220 my $subseq = ();
3221
3222 if ($type eq "deletion"){
3223 my @indelparts = split(/\s*/,$indel);
3224 my @seqparts = split(/\s*/,$spseq);
3225
3226 for my $p ($pos ... $pos+length($indel)-1){
3227 $seqparts[$p] = shift @indelparts;
3228 }
3229 $spseq = join("",@seqparts);
3230 }
3231 #print "mod spseq = $spseq\n";
3232 # $spseq=~ s/\-//g if $type !~ /deletion/;
3233 # print "substr($spseq, $pos-($residualseq), length($indel)+$residualseq+$residualseq)\n" if $pos > 0 && $pos < (length($spseq) - length($motif)) && $printer == 1;
3234 # print "substr($spseq, 0, length($indel)+$residualseq)\n" if $pos == 0 && $printer == 1;
3235 # print "substr($spseq, $pos - $residualseq, length($indel)+$residualseq)\n" if $pos >= (length($spseq) - length($motif)) && $printer == 1;
3236
3237 $subseq = substr($spseq, $pos-($residualseq), length($indel)+$residualseq+$residualseq) if $pos > 0 && $pos < (length($spseq) - length($motif)) ;
3238 $subseq = substr($spseq, 0, length($indel)+$residualseq) if $pos == 0;
3239 $subseq = substr($spseq, $pos - $residualseq, length($indel)+$residualseq) if $pos >= (length($spseq) - length($motif)) ;
3240 # print "spseq = $spseq . subseq=$subseq . type = $type\n" if $printer == 1;
3241 #<STDIN> if $subseq !~ /[a-z\-]/i;
3242 $subseq =~ s/\-/$indel/g if $type =~ /insertion/;
3243 push @seqs, $subseq;
3244 # print "seqs = @seqs\n" if $printer == 1;
3245 }
3246 return "non" if checkIfSeqsIdentical(@seqs) eq "NO";
3247 # print "checking for $seqs[0] \n" if $printer == 1;
3248
3249 my @phases =();
3250 my %prephases = ();
3251 my %postphases = ();
3252 my $concat = $motif.$motif.$motif.$motif;
3253 my $motiflength = length($motif);
3254
3255 my $firstpass = 0;
3256
3257 for my $y (0 ... $motiflength-1){
3258 my $phase = substr($concat, $motiflength+$y, $motiflength);
3259 push @phases, $phase;
3260 $firstpass = 1 if $seqs[0] =~ /$phase/i;
3261 for my $k (0 ... length($motif)-1){
3262 my $pre = substr($concat, $motiflength+$y-$k, $k );
3263 my $post = substr($concat, $motiflength+$y+$motiflength, $k);
3264 # print "adding to phases : $phase - $pre and $post\n" if $printer == 1;
3265 push @{$prephases{$phase}} , $pre;
3266 push @{$postphases{$phase}} , $post;
3267 }
3268
3269 }
3270 # print "firstpass 1= $firstpass.. also, res-d = ",(length($seqs[0]))%(length($motif)),"\n" if $printer == 1;
3271 return "non" if $firstpass ==0;
3272 $firstpass =0;
3273 foreach my $phase (@phases){
3274
3275 $firstpass = 1 if $seqs[0] =~ /^($phase)+$/i && ((length($seqs[0]))%(length($motif))) == 0;
3276
3277 if (((length($seqs[0]))%(length($motif))) != 0){
3278 my @pres = @{$prephases{$phase}};
3279 my @posts = @{$postphases{$phase}};
3280 foreach my $pre (@pres){
3281 foreach my $post (@posts){
3282 next if $pre !~ /\S/ && $post !~ /\S/;
3283 $firstpass = 1 if ($seqs[0] =~ /^($pre)($phase)+($post)$/i || $seqs[0] =~ /^($pre)($phase)+$/i || $seqs[0] =~ /^($phase)+($post)$/i);
3284 # print "caught with $pre $phase $post\n" if $printer == 1;
3285 last if $firstpass == 1;
3286 }
3287 last if $firstpass == 1;
3288 }
3289 }
3290
3291 last if $firstpass == 1;
3292 }
3293
3294 #print "indel = $indel.. motif = $motif.. firstpass 2= mot\n" if $firstpass ==1;
3295 #print "indel = $indel.. motif = $motif.. firstpass 2= non\n" if $firstpass ==0;
3296 #<STDIN>;# if $firstpass ==1;
3297 return "non" if $firstpass ==0;
3298 return "mot" if $firstpass ==1;
3299
3300 }
3301
3302 sub checkIfSeqsIdentical{
3303 my @seqs = @_;
3304 my $identical = 1;
3305
3306 for my $j (1 ... $#seqs){
3307 $identical = 0 if uc($seqs[0]) ne uc($seqs[$j]);
3308 }
3309 return "NO" if $identical == 0;
3310 return "YES" if $identical == 1;
3311
3312 }
3313
3314 sub summarizeMutations{
3315 my $mutspt = $_[0];
3316 my @muts = @$mutspt;
3317 my $tree = $_[1];
3318
3319 my @returnarr = ();
3320
3321 for (1 ... 38){
3322 push @returnarr, "NA";
3323 }
3324 push @returnarr, "NULL";
3325 return @returnarr if $tree eq "NULL" || scalar(@muts) < 1;
3326
3327
3328 my @bspecies = ();
3329 my @dspecies = ();
3330 my $treecopy = $tree;
3331 $treecopy =~ s/[\(\)]//g;
3332 my @treeparts = split(/[\.,]+/, $treecopy);
3333
3334 for my $part (@treeparts){
3335 if ($part =~ /\+/){
3336 $part =~ s/\+//g;
3337 #my @sp = split(/\s*/, $part);
3338 #foreach my $p (@sp) {push @bspecies, $p;}
3339 push @bspecies, $part;
3340 }
3341 if ($part =~ /\-/){
3342 $part =~ s/\-//g;
3343 #my @sp = split(/\s*/, $part);
3344 #foreach my $p (@sp) {push @dspecies, $p;}
3345 push @dspecies, $part;
3346 }
3347
3348 }
3349 #print "-------------------------------------------------------\n";
3350
3351 my ($insertions, $deletions, $motinsertions, $motinsertionsf, $motdeletions, $motdeletionsf, $noninsertions, $nondeletions) = (0,0,0,0,0,0,0,0);
3352 my ($binsertions, $bdeletions, $bmotinsertions,$bmotinsertionsf, $bmotdeletions, $bmotdeletionsf, $bnoninsertions, $bnondeletions) = (0,0,0,0,0,0,0,0);
3353 my ($dinsertions, $ddeletions, $dmotinsertions,$dmotinsertionsf, $dmotdeletions, $dmotdeletionsf, $dnoninsertions, $dnondeletions) = (0,0,0,0,0,0,0,0);
3354 my ($ninsertions, $ndeletions, $nmotinsertions,$nmotinsertionsf, $nmotdeletions, $nmotdeletionsf, $nnoninsertions, $nnondeletions) = (0,0,0,0,0,0,0,0);
3355 my ($substitutions, $bsubstitutions, $dsubstitutions, $nsubstitutions, $indels, $subs) = (0,0,0,0,"NA","NA");
3356
3357 my @insertionsarr = (" ");
3358 my @deletionsarr = (" ");
3359
3360 my @substitutionsarr = (" ");
3361
3362
3363 foreach my $mut (@muts){
3364 # print "mut = $mut\n";
3365 chomp $mut;
3366 $mut =~ s/=\t/= /g;
3367 $mut =~ s/=$/= /g;
3368 my %mhash = ();
3369 my @mields = split(/\t/,$mut);
3370
3371 foreach my $m (@mields){
3372 my @fields = split(/=/,$m);
3373 next if $fields[1] eq " ";
3374 $mhash{$fields[0]} = $fields[1];
3375 }
3376
3377 my $myutype = ();
3378 my $decided = 0;
3379
3380 my $localnode = $mhash{"node"};
3381 $localnode =~ s/[\(\)\. ,]//g;
3382
3383
3384 foreach my $s (@bspecies){
3385 if ($localnode eq $s) {
3386 $decided = 1; $myutype = "b";
3387 }
3388 }
3389
3390 foreach my $s (@dspecies){
3391 if ($localnode eq $s) {
3392 $decided = 1; $myutype = "d";
3393 }
3394 }
3395
3396 $myutype = "n" if $decided != 1;
3397
3398
3399 # print "tree=$tree, birth species=@bspecies, death species=@dspecies, node=$mhash{node} .. myutype=$myutype .. \n";
3400 # <STDIN> if $mhash{"type"} eq "insertion" && $myutype eq "b";
3401
3402
3403 if ($mhash{"type"} eq "substitution"){
3404 $substitutions++;
3405 $bsubstitutions++ if $myutype eq "b";
3406 $dsubstitutions++ if $myutype eq "d";
3407 $nsubstitutions++ if $myutype eq "n";
3408 # print "substitution: from= $mhash{from}, to = $mhash{to}, and type = myutype\n";
3409 push @substitutionsarr, "$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "b";
3410 push @substitutionsarr, "$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "d";
3411 push @substitutionsarr, "$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "n";
3412 # print "substitutionsarr = @substitutionsarr\n";
3413 # <STDIN>;
3414 }
3415 else{
3416 #print "tree=$tree, birth species=@bspecies, death species=@dspecies, node=$mhash{node} .. myutype=$myutype .. indeltype=$mhash{indeltype}\n";
3417 if ($mhash{"type"} eq "deletion"){
3418 $deletions++;
3419
3420 $motdeletions++ if $mhash{"indeltype"} =~ /dmot/;
3421 $motdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/;
3422
3423 $nondeletions++ if $mhash{"indeltype"} =~ /dnon/;
3424
3425 $bdeletions++ if $myutype eq "b";
3426 $ddeletions++ if $myutype eq "d";
3427 $ndeletions++ if $myutype eq "n";
3428
3429 $bmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "b";
3430 $bmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "b";
3431 $bnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "b";
3432
3433 $dmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "d";
3434 $dmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "d";
3435 $dnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "d";
3436
3437 $nmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "n";
3438 $nmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "n";
3439 $nnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "n";
3440
3441 push @deletionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "b";
3442 push @deletionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "d";
3443 push @deletionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "n";
3444 }
3445
3446 if ($mhash{"type"} eq "insertion"){
3447 $insertions++;
3448
3449 $motinsertions++ if $mhash{"indeltype"} =~ /imot/;
3450 $motinsertionsf++ if $mhash{"indeltype"} =~ /imotf/;
3451 $noninsertions++ if $mhash{"indeltype"} =~ /inon/;
3452
3453 $binsertions++ if $myutype eq "b";
3454 $dinsertions++ if $myutype eq "d";
3455 $ninsertions++ if $myutype eq "n";
3456
3457 $bmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "b";
3458 $bmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "b";
3459 $bnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "b";
3460
3461 $dmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "d";
3462 $dmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "d";
3463 $dnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "d";
3464
3465 $nmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "n";
3466 $nmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "n";
3467 $nnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "n";
3468
3469 push @insertionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "b";
3470 push @insertionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "d";
3471 push @insertionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "n";
3472
3473 }
3474 }
3475 }
3476
3477
3478
3479 $indels = "ins=".join(",",@insertionsarr).";dels=".join(",",@deletionsarr) if scalar(@insertionsarr) > 1 || scalar(@deletionsarr) > 1 ;
3480 $subs = join(",",@substitutionsarr) if scalar(@substitutionsarr) > 1;
3481 $indels =~ s/ //g;
3482 $subs =~ s/ //g ;
3483
3484 #print "indels = $indels, subs=$subs\n";
3485 ##<STDIN> if $indels =~ /[a-zA-Z0-9]/ || $subs =~ /[a-zA-Z0-9]/ ;
3486 #print "tree = $tree, indels = $indels, subs = $subs, bspecies = @bspecies, dspecies = @dspecies \n";
3487 my @returnarray = ();
3488
3489 push (@returnarray, $indels, $subs) ;
3490
3491 push @returnarray, $tree;
3492
3493 my @copy = @returnarray;
3494 #print "\n\nreturnarray = @returnarray ... binsertions=$binsertions dinsertions=$dinsertions bsubstitutions=$bsubstitutions dsubstitutions=$dsubstitutions\n";
3495 #<STDIN>;
3496 return (@returnarray);
3497
3498 }
3499
3500 sub selectBetterTree{
3501 my $printer = 1;
3502 my $treestudy = $_[0];
3503 my $alt = $_[1];
3504 my $mutspt = $_[2];
3505 my @muts = @$mutspt;
3506 my @trees = (); my @alternatetrees=();
3507
3508 @trees = split(/\|/,$treestudy) if $treestudy =~ /\|/;
3509 @alternatetrees = split(/[\|;]/,$alt) if $alt =~ /[\|;\(\)]/;
3510
3511 $trees[0] = $treestudy if $treestudy !~ /\|/;
3512 $alternatetrees[0] = $alt if $alt !~ /[\|;\(\)]/;
3513
3514 my @alltrees = (@trees, @alternatetrees);
3515 # push(@alltrees,@alternatetrees);
3516
3517 my %mutspecies = ();
3518 # print "IN selectBetterTree..treestudy=$treestudy. alt=$alt. for: @_. trees=@trees<. alternatetrees=@alternatetrees\n" if $printer == 1;
3519 #<STDIN>;
3520 foreach my $mut (@muts){
3521 # print colored ['green'],"mut = $mut\n" if $printer == 1;
3522 $mut =~ /node=([A-Z,\(\) ]+)/;
3523 my $node = $1;
3524 $node =~s/[,\(\) ]+//g;
3525 my @indivspecies = $node =~ /[A-Z]+/g;
3526 #print "adding node: $node\n" if $printer == 1;
3527 $mutspecies{$node} = $node;
3528
3529 }
3530
3531 my @treerecords = ();
3532 my $treecount = -1;
3533 foreach my $tree (@alltrees){
3534 # print "checking with tree $tree\n" if $printer == 1;
3535 $treecount++;
3536 $treerecords[$treecount] = 0;
3537 my @indivspecies = ($tree =~ /[A-Z]+/g);
3538 # print "indivspecies=@indivspecies\n" if $printer == 1;
3539 foreach my $species (@indivspecies){
3540 # print "checkin if exists species: $species\n" if $printer == 1;
3541 $treerecords[$treecount]+=2 if exists $mutspecies{$species} && $mutspecies{$species} !~ /indeltype=[a-z]mot/;
3542 $treerecords[$treecount]+=1.5 if exists $mutspecies{$species} && $mutspecies{$species} =~ /indeltype=[a-z]mot/;
3543 $treerecords[$treecount]-- if !exists $mutspecies{$species};
3544 }
3545 # print "for tree $tree, our treecount = $treerecords[$treecount]\n" if $printer == 1;
3546 }
3547
3548 my @best_tree = array_largest_number_arrayPosition(@treerecords);
3549 # print "treerecords = @treerecords. hence, best tree = @best_tree = $alltrees[$best_tree[0]], $treerecords[$best_tree[0]]\n" if $printer == 1;
3550 #<STDIN>;
3551 return ($alltrees[$best_tree[0]], $treerecords[$best_tree[0]]) if scalar(@best_tree) == 1;
3552 # print "best_tree[0] = $best_tree[0], and treerecords = $treerecords[$best_tree[0]]\n" if $printer == 1;
3553 return ("NULL", -1) if $treerecords[$best_tree[0]] < 1;
3554 my $rando = int(rand($#trees));
3555 return ($alltrees[$rando], $treerecords[$rando]) if scalar(@best_tree) > 1;
3556
3557 }
3558
3559
3560
3561
3562 sub load_sameHash{
3563 #my $g = %$_[0];
3564 $sameHash{"CAGT"}="AGTC";
3565 $sameHash{"ATGA"}="AATG";
3566 $sameHash{"CAAC"}="AACC";
3567 $sameHash{"GGAA"}="AAGG";
3568 $sameHash{"TAAG"}="AAGT";
3569 $sameHash{"CGAG"}="AGCG";
3570 $sameHash{"TAGG"}="AGGT";
3571 $sameHash{"GCAG"}="AGGC";
3572 $sameHash{"TAGA"}="ATAG";
3573 $sameHash{"TGA"}="ATG";
3574 $sameHash{"CAAG"}="AAGC";
3575 $sameHash{"CTAA"}="AACT";
3576 $sameHash{"CAAT"}="AATC";
3577 $sameHash{"GTAG"}="AGGT";
3578 $sameHash{"GAAG"}="AAGG";
3579 $sameHash{"CGA"}="ACG";
3580 $sameHash{"GTAA"}="AAGT";
3581 $sameHash{"ACAA"}="AAAC";
3582 $sameHash{"GCGG"}="GGGC";
3583 $sameHash{"ATCA"}="AATC";
3584 $sameHash{"TAAC"}="AACT";
3585 $sameHash{"GGCA"}="AGGC";
3586 $sameHash{"TGAG"}="AGTG";
3587 $sameHash{"AACA"}="AAAC";
3588 $sameHash{"GAGC"}="AGCG";
3589 $sameHash{"ACCA"}="AACC";
3590 $sameHash{"TGAA"}="AATG";
3591 $sameHash{"ACA"}="AAC";
3592 $sameHash{"GAAC"}="AACG";
3593 $sameHash{"GCA"}="AGC";
3594 $sameHash{"CCAC"}="ACCC";
3595 $sameHash{"CATA"}="ATAC";
3596 $sameHash{"CAC"}="ACC";
3597 $sameHash{"TACA"}="ATAC";
3598 $sameHash{"GGAC"}="ACGG";
3599 $sameHash{"AGA"}="AAG";
3600 $sameHash{"ATAA"}="AAAT";
3601 $sameHash{"CA"}="AC";
3602 $sameHash{"CCCA"}="ACCC";
3603 $sameHash{"TCAA"}="AATC";
3604 $sameHash{"CAGA"}="AGAC";
3605 $sameHash{"AATA"}="AAAT";
3606 $sameHash{"CCA"}="ACC";
3607 $sameHash{"AGAA"}="AAAG";
3608 $sameHash{"AGTA"}="AAGT";
3609 $sameHash{"GACG"}="ACGG";
3610 $sameHash{"TCAG"}="AGTC";
3611 $sameHash{"ACGA"}="AACG";
3612 $sameHash{"CGCA"}="ACGC";
3613 $sameHash{"GAGT"}="AGTG";
3614 $sameHash{"GA"}="AG";
3615 $sameHash{"TA"}="AT";
3616 $sameHash{"TAA"}="AAT";
3617 $sameHash{"CAG"}="AGC";
3618 $sameHash{"GATA"}="ATAG";
3619 $sameHash{"GTA"}="AGT";
3620 $sameHash{"CCAA"}="AACC";
3621 $sameHash{"TAG"}="AGT";
3622 $sameHash{"CAAA"}="AAAC";
3623 $sameHash{"AAGA"}="AAAG";
3624 $sameHash{"CACG"}="ACGC";
3625 $sameHash{"GTCA"}="AGTC";
3626 $sameHash{"GGA"}="AGG";
3627 $sameHash{"GGAT"}="ATGG";
3628 $sameHash{"CGGG"}="GGGC";
3629 $sameHash{"CGGA"}="ACGG";
3630 $sameHash{"AGGA"}="AAGG";
3631 $sameHash{"TAAA"}="AAAT";
3632 $sameHash{"GAGA"}="AGAG";
3633 $sameHash{"ACTA"}="AACT";
3634 $sameHash{"GCGA"}="AGCG";
3635 $sameHash{"CACA"}="ACAC";
3636 $sameHash{"AGAT"}="ATAG";
3637 $sameHash{"GAGG"}="AGGG";
3638 $sameHash{"CGAC"}="ACCG";
3639 $sameHash{"GGAG"}="AGGG";
3640 $sameHash{"GCCA"}="AGCC";
3641 $sameHash{"CCAG"}="AGCC";
3642 $sameHash{"GAAA"}="AAAG";
3643 $sameHash{"CAGG"}="AGGC";
3644 $sameHash{"GAC"}="ACG";
3645 $sameHash{"CAA"}="AAC";
3646 $sameHash{"GACC"}="ACCG";
3647 $sameHash{"GGCG"}="GGGC";
3648 $sameHash{"GGTA"}="AGGT";
3649 $sameHash{"AGCA"}="AAGC";
3650 $sameHash{"GATG"}="ATGG";
3651 $sameHash{"GTGA"}="AGTG";
3652 $sameHash{"ACAG"}="AGAC";
3653 $sameHash{"CGG"}="GGC";
3654 $sameHash{"ATA"}="AAT";
3655 $sameHash{"GACA"}="AGAC";
3656 $sameHash{"GCAA"}="AAGC";
3657 $sameHash{"CAGC"}="AGCC";
3658 $sameHash{"GGGA"}="AGGG";
3659 $sameHash{"GAG"}="AGG";
3660 $sameHash{"ACAT"}="ATAC";
3661 $sameHash{"GAAT"}="AATG";
3662 $sameHash{"CACC"}="ACCC";
3663 $sameHash{"GAT"}="ATG";
3664 $sameHash{"GCG"}="GGC";
3665 $sameHash{"GCAC"}="ACGC";
3666 $sameHash{"GAA"}="AAG";
3667 $sameHash{"TGGA"}="ATGG";
3668 $sameHash{"CCGA"}="ACCG";
3669 $sameHash{"CGAA"}="AACG";
3670 }
3671
3672
3673
3674 sub load_revHash{
3675 $revHash{"CTGA"}="AGTC";
3676 $revHash{"TCTT"}="AAAG";
3677 $revHash{"CTAG"}="AGCT";
3678 $revHash{"GGTG"}="ACCC";
3679 $revHash{"GCC"}="GGC";
3680 $revHash{"GCTT"}="AAGC";
3681 $revHash{"GCGT"}="ACGC";
3682 $revHash{"GTTG"}="AACC";
3683 $revHash{"CTCC"}="AGGG";
3684 $revHash{"ATC"}="ATG";
3685 $revHash{"CGAT"}="ATCG";
3686 $revHash{"TTAA"}="AATT";
3687 $revHash{"GTTC"}="AACG";
3688 $revHash{"CTGC"}="AGGC";
3689 $revHash{"TCGA"}="ATCG";
3690 $revHash{"ATCT"}="ATAG";
3691 $revHash{"GGTT"}="AACC";
3692 $revHash{"CTTA"}="AAGT";
3693 $revHash{"TGGC"}="AGCC";
3694 $revHash{"CCG"}="GGC";
3695 $revHash{"CGGC"}="GGCC";
3696 $revHash{"TTAG"}="AACT";
3697 $revHash{"GTG"}="ACC";
3698 $revHash{"CTTT"}="AAAG";
3699 $revHash{"TGCA"}="ATGC";
3700 $revHash{"CGCT"}="AGCG";
3701 $revHash{"TTCC"}="AAGG";
3702 $revHash{"CT"}="AG";
3703 $revHash{"C"}="G";
3704 $revHash{"CTCT"}="AGAG";
3705 $revHash{"ACTT"}="AAGT";
3706 $revHash{"GGTC"}="ACCG";
3707 $revHash{"ATTC"}="AATG";
3708 $revHash{"GGGT"}="ACCC";
3709 $revHash{"CCTA"}="AGGT";
3710 $revHash{"CGCG"}="GCGC";
3711 $revHash{"GTGT"}="ACAC";
3712 $revHash{"GCCC"}="GGGC";
3713 $revHash{"GTCG"}="ACCG";
3714 $revHash{"TCCC"}="AGGG";
3715 $revHash{"TTCA"}="AATG";
3716 $revHash{"AGTT"}="AACT";
3717 $revHash{"CCCT"}="AGGG";
3718 $revHash{"CCGC"}="GGGC";
3719 $revHash{"CTT"}="AAG";
3720 $revHash{"TTGG"}="AACC";
3721 $revHash{"ATT"}="AAT";
3722 $revHash{"TAGC"}="AGCT";
3723 $revHash{"ACTG"}="AGTC";
3724 $revHash{"TCAC"}="AGTG";
3725 $revHash{"CTGT"}="AGAC";
3726 $revHash{"TGTG"}="ACAC";
3727 $revHash{"ATCC"}="ATGG";
3728 $revHash{"GTGG"}="ACCC";
3729 $revHash{"TGGG"}="ACCC";
3730 $revHash{"TCGG"}="ACCG";
3731 $revHash{"CGGT"}="ACCG";
3732 $revHash{"GCTC"}="AGCG";
3733 $revHash{"TACG"}="ACGT";
3734 $revHash{"GTTT"}="AAAC";
3735 $revHash{"CAT"}="ATG";
3736 $revHash{"CATG"}="ATGC";
3737 $revHash{"GTTA"}="AACT";
3738 $revHash{"CACT"}="AGTG";
3739 $revHash{"TCAT"}="AATG";
3740 $revHash{"TTA"}="AAT";
3741 $revHash{"TGTA"}="ATAC";
3742 $revHash{"TTTC"}="AAAG";
3743 $revHash{"TACT"}="AAGT";
3744 $revHash{"TGTT"}="AAAC";
3745 $revHash{"CTA"}="AGT";
3746 $revHash{"GACT"}="AGTC";
3747 $revHash{"TTGC"}="AAGC";
3748 $revHash{"TTC"}="AAG";
3749 $revHash{"GCT"}="AGC";
3750 $revHash{"GCAT"}="ATGC";
3751 $revHash{"TGGT"}="AACC";
3752 $revHash{"CCT"}="AGG";
3753 $revHash{"CATC"}="ATGG";
3754 $revHash{"CCAT"}="ATGG";
3755 $revHash{"CCCG"}="GGGC";
3756 $revHash{"TGCC"}="AGGC";
3757 $revHash{"TG"}="AC";
3758 $revHash{"TGCT"}="AAGC";
3759 $revHash{"GCCG"}="GGCC";
3760 $revHash{"TCTG"}="AGAC";
3761 $revHash{"TGT"}="AAC";
3762 $revHash{"TTAT"}="AAAT";
3763 $revHash{"TAGT"}="AACT";
3764 $revHash{"TATG"}="ATAC";
3765 $revHash{"TTTA"}="AAAT";
3766 $revHash{"CGTA"}="ACGT";
3767 $revHash{"TA"}="AT";
3768 $revHash{"TGTC"}="AGAC";
3769 $revHash{"CTAT"}="ATAG";
3770 $revHash{"TATA"}="ATAT";
3771 $revHash{"TAC"}="AGT";
3772 $revHash{"TC"}="AG";
3773 $revHash{"CATT"}="AATG";
3774 $revHash{"TCG"}="ACG";
3775 $revHash{"ATTT"}="AAAT";
3776 $revHash{"CGTG"}="ACGC";
3777 $revHash{"CTG"}="AGC";
3778 $revHash{"TCGT"}="AACG";
3779 $revHash{"TCCG"}="ACGG";
3780 $revHash{"GTT"}="AAC";
3781 $revHash{"ATGT"}="ATAC";
3782 $revHash{"CTTG"}="AAGC";
3783 $revHash{"CCTT"}="AAGG";
3784 $revHash{"GATC"}="ATCG";
3785 $revHash{"CTGG"}="AGCC";
3786 $revHash{"TTCT"}="AAAG";
3787 $revHash{"CGTC"}="ACGG";
3788 $revHash{"CG"}="GC";
3789 $revHash{"TATT"}="AAAT";
3790 $revHash{"CTCG"}="AGCG";
3791 $revHash{"TCTC"}="AGAG";
3792 $revHash{"TCCT"}="AAGG";
3793 $revHash{"TGG"}="ACC";
3794 $revHash{"ACTC"}="AGTG";
3795 $revHash{"CTC"}="AGG";
3796 $revHash{"CGC"}="GGC";
3797 $revHash{"TTG"}="AAC";
3798 $revHash{"ACCT"}="AGGT";
3799 $revHash{"TCTA"}="ATAG";
3800 $revHash{"GTAC"}="ACGT";
3801 $revHash{"TTGA"}="AATC";
3802 $revHash{"GTCC"}="ACGG";
3803 $revHash{"GATT"}="AATC";
3804 $revHash{"T"}="A";
3805 $revHash{"CGTT"}="AACG";
3806 $revHash{"GTC"}="ACG";
3807 $revHash{"GCCT"}="AGGC";
3808 $revHash{"TGC"}="AGC";
3809 $revHash{"TTTG"}="AAAC";
3810 $revHash{"GGCT"}="AGCC";
3811 $revHash{"TCA"}="ATG";
3812 $revHash{"GTGC"}="ACGC";
3813 $revHash{"TGAT"}="AATC";
3814 $revHash{"TAT"}="AAT";
3815 $revHash{"CTAC"}="AGGT";
3816 $revHash{"TGCG"}="ACGC";
3817 $revHash{"CTCA"}="AGTG";
3818 $revHash{"CTTC"}="AAGG";
3819 $revHash{"GCTG"}="AGCC";
3820 $revHash{"TATC"}="ATAG";
3821 $revHash{"TAAT"}="AATT";
3822 $revHash{"ACT"}="AGT";
3823 $revHash{"TCGC"}="AGCG";
3824 $revHash{"GGT"}="ACC";
3825 $revHash{"TCC"}="AGG";
3826 $revHash{"TTGT"}="AAAC";
3827 $revHash{"TGAC"}="AGTC";
3828 $revHash{"TTAC"}="AAGT";
3829 $revHash{"CGT"}="ACG";
3830 $revHash{"ATTA"}="AATT";
3831 $revHash{"ATTG"}="AATC";
3832 $revHash{"CCTC"}="AGGG";
3833 $revHash{"CCGG"}="GGCC";
3834 $revHash{"CCGT"}="ACGG";
3835 $revHash{"TCCA"}="ATGG";
3836 $revHash{"CGCC"}="GGGC";
3837 $revHash{"GT"}="AC";
3838 $revHash{"TTCG"}="AACG";
3839 $revHash{"CCTG"}="AGGC";
3840 $revHash{"TCT"}="AAG";
3841 $revHash{"GTAT"}="ATAC";
3842 $revHash{"GTCT"}="AGAC";
3843 $revHash{"GCTA"}="AGCT";
3844 $revHash{"TACC"}="AGGT";
3845 }
3846
3847
3848 sub allCaps{
3849 my $motif = $_[0];
3850 $motif =~ s/a/A/g;
3851 $motif =~ s/c/C/g;
3852 $motif =~ s/t/T/g;
3853 $motif =~ s/g/G/g;
3854 return $motif;
3855 }
3856
3857
3858 sub all_caps{
3859 my @strand = split(/\s*/,$_[0]);
3860 for my $i (0 ... $#strand){
3861 if ($strand[$i] =~ /c/) {$strand[$i] = "C";next;}
3862 if ($strand[$i] =~ /a/) {$strand[$i] = "A";next;}
3863 if ($strand[$i] =~ /t/) { $strand[$i] = "T";next;}
3864 if ($strand[$i] =~ /g/) {$strand[$i] = "G";next;}
3865 }
3866 return join("",@strand);
3867 }
3868 sub array_mean{
3869 return "NA" if scalar(@_) == 0;
3870 my $sum = 0;
3871 foreach my $val (@_){
3872 $sum = $sum + $val;
3873 }
3874 return ($sum/scalar(@_));
3875 }
3876 sub array_sum{
3877 return "NA" if scalar(@_) == 0;
3878 my $sum = 0;
3879 foreach my $val (@_){
3880 $sum = $sum + $val;
3881 }
3882 return ($sum);
3883 }
3884
3885 sub variance{
3886 return "NA" if scalar(@_) == 0;
3887 return 0 if scalar(@_) == 1;
3888 my $mean = array_mean(@_);
3889 my $num = 0;
3890 return 0 if scalar(@_) == 1;
3891 # print "mean = $mean .. array = >@_<\n";
3892 foreach my $ele (@_){
3893 # print "$num = $num + ($ele-$mean)*($ele-$mean)\n";
3894 $num = $num + ($ele-$mean)*($ele-$mean);
3895 }
3896 my $var = $num / scalar(@_);
3897 return $var;
3898 }
3899
3900 sub array_95confIntervals{
3901 return "NA" if scalar(@_) <= 0;
3902 my @sorted = sort { $a <=> $b } @_;
3903 # print "@sorted=",scalar(@sorted), "\n";
3904 my $aDeechNo = int((scalar(@sorted) * 2.5) / 100);
3905 my $saaDeNo = int((scalar(@sorted) * 97.5) / 100);
3906
3907 return ($sorted[$aDeechNo], $sorted[$saaDeNo]);
3908 }
3909
3910 sub array_median{
3911 return "NA" if scalar(@_) == 0;
3912 return $_[0] if scalar(@_) == 1;
3913 my @sorted = sort { $a <=> $b } @_;
3914 my $totalno = scalar(@sorted);
3915
3916 #print "sorted = @sorted\n";
3917
3918 my $pick = ();
3919 if ($totalno % 2 == 1){
3920 #print "odd set .. totalno = $totalno\n";
3921 my $mid = $totalno / 2;
3922 my $onehalfno = $mid - $mid % 1;
3923 my $secondhalfno = $onehalfno + 1;
3924 my $onehalf = $sorted[$onehalfno-1];
3925 my $secondhalf = $sorted[$secondhalfno-1];
3926 #print "onehalfno = $onehalfno and secondhalfno = $secondhalfno \n onehalf = $onehalf and secondhalf = $secondhalf\n";
3927
3928 $pick = $secondhalf;
3929 }
3930 else{
3931 #print "even set .. totalno = $totalno\n";
3932 my $mid = $totalno / 2;
3933 my $onehalfno = $mid;
3934 my $secondhalfno = $onehalfno + 1;
3935 my $onehalf = $sorted[$onehalfno-1];
3936 my $secondhalf = $sorted[$secondhalfno-1];
3937 #print "onehalfno = $onehalfno and secondhalfno = $secondhalfno \n onehalf = $onehalf and secondhalf = $secondhalf\n";
3938 $pick = ($onehalf + $secondhalf )/2;
3939
3940 }
3941 #print "pick = $pick..\n";
3942 return $pick;
3943
3944 }
3945
3946
3947 sub array_numerical_sort{
3948 return "NA" if scalar(@_) == 0;
3949 my @sorted = sort { $a <=> $b } @_;
3950 return (@sorted);
3951 }
3952
3953 sub array_smallest_number{
3954 return "NA" if scalar(@_) == 0;
3955 return $_[0] if scalar(@_) == 1;
3956 my @sorted = sort { $a <=> $b } @_;
3957 return $sorted[0];
3958 }
3959
3960
3961 sub array_largest_number{
3962 return "NA" if scalar(@_) == 0;
3963 return $_[0] if scalar(@_) == 1;
3964 my @sorted = sort { $a <=> $b } @_;
3965 return $sorted[$#sorted];
3966 }
3967
3968
3969 sub array_largest_number_arrayPosition{
3970 return "NA" if scalar(@_) == 0;
3971 return 0 if scalar(@_) == 1;
3972 my $maxpos = 0;
3973 my @maxposes = ();
3974 my @maxvals = ();
3975 my $maxval = array_smallest_number(@_);
3976 for my $i (0 ... $#_){
3977 if ($_[$i] > $maxval){
3978 $maxval = $_[$i];
3979 $maxpos = $i;
3980 }
3981 if ($_[$i] == $maxval){
3982 $maxval = $_[$i];
3983 if (scalar(@maxposes) == 0){
3984 push @maxposes, $i;
3985 push @maxvals, $_[$i];
3986
3987 }
3988 elsif ($maxvals[0] == $maxval){
3989 push @maxposes, $i;
3990 push @maxvals, $_[$i];
3991 }
3992 else{
3993 @maxposes = (); @maxvals = ();
3994 push @maxposes, $i;
3995 push @maxvals, $_[$i];
3996 }
3997
3998 }
3999
4000 }
4001 return $maxpos if scalar(@maxposes) < 2;
4002 return (@maxposes);
4003 }
4004
4005 sub array_smallest_number_arrayPosition{
4006 return "NA" if scalar(@_) == 0;
4007 return 0 if scalar(@_) == 1;
4008 my $minpos = 0;
4009 my @minposes = ();
4010 my @minvals = ();
4011 my $minval = array_largest_number(@_);
4012 my $maxval = array_smallest_number(@_);
4013 #print "starting with $maxval, ending with $minval\n";
4014 for my $i (0 ... $#_){
4015 if ($_[$i] < $minval){
4016 $minval = $_[$i];
4017 $minpos = $i;
4018 }
4019 if ($_[$i] == $minval){
4020 $minval = $_[$i];
4021 if (scalar(@minposes) == 0){
4022 push @minposes, $i;
4023 push @minvals, $_[$i];
4024
4025 }
4026 elsif ($minvals[0] == $minval){
4027 push @minposes, $i;
4028 push @minvals, $_[$i];
4029 }
4030 else{
4031 @minposes = (); @minvals = ();
4032 push @minposes, $i;
4033 push @minvals, $_[$i];
4034 }
4035
4036 }
4037
4038 }
4039 #print "minposes=@minposes\n";
4040
4041 return $minpos if scalar(@minposes) < 2;
4042 return (@minposes);
4043 }
4044
4045 sub basic_stats{
4046 my @arr = @_;
4047 # print " array_smallest_number= ", array_smallest_number(@arr)," array_largest_number= ", array_largest_number(@arr), " array_mean= ",array_mean(@arr),"\n";
4048 return ":";
4049 }
4050 #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx
4051
4052 sub maftoAxt_multispecies {
4053 my $printer = 0;
4054 #print "in maftoAxt_multispecies : got @_\n";
4055 my $fname=$_[0];
4056 open(IN,"<$_[0]") or die "Cannot open $_[0]: $! \n";
4057 my $treedefinition = $_[1];
4058 #print "treedefinition= $treedefinition\n";
4059
4060 my @treedefinitions = MakeTrees($treedefinition);
4061
4062
4063 open(OUT,">$_[2]") or die "Cannot open $_[2]: $! \n";
4064 my $counter = 0;
4065 my $exactspeciesset = $_[3];
4066 my @exactspeciesset_unarranged = split(/,/,$exactspeciesset);
4067
4068 $treedefinition=~s/[\)\(, ]/\t/g;
4069 my @species=split(/\t+/,$treedefinition);
4070 @exactspeciesset_unarranged = @species;
4071 # print "species=@species\n";
4072
4073 my @exactspeciesarr=();
4074
4075
4076 foreach my $def (@treedefinitions){
4077 $def=~s/[\)\(, ]/\t/g;
4078 my @specs = split(/\t+/,$def);
4079 my @exactspecies=();
4080 foreach my $spec (@specs){
4081 foreach my $espec (@exactspeciesset_unarranged){
4082 # print "pushing >$spec< nd >$espec<\n" if $spec eq $espec && $espec =~ /[a-zA-Z0-9]/;
4083 push @exactspecies, $spec if $spec eq $espec && $espec =~ /[a-zA-Z0-9]/;
4084 }
4085
4086 }
4087 #print "exactspecies = >@exactspecies<\n";
4088 push @exactspeciesarr, [@exactspecies];
4089 }
4090 #<STDIN>;
4091 #print "exactspeciesarr=@exactspeciesarr\n";
4092
4093 ###########
4094 my $select = 2;
4095 #select = 1 if all species need sequences to be present for each block otherwise, it is 0
4096 #select = 2 only the allowed set make up the alignment. use the removeset
4097 # information to detect alignmenets that have other important genomes aligned.
4098 ###########
4099 my @allowedset = ();
4100 @allowedset = split(/;/,allowedSetOfSpecies(join("_",@species))) if $select == 0;
4101 @allowedset = join("_",0,@species) if $select == 1;
4102 #print "species = @species , allowedset =",join("\n", @allowedset) ," \n";
4103
4104
4105 foreach my $set (@exactspeciesarr){
4106 my @openset = @$set;
4107 push @allowedset, join("_",0,@openset) if $select == 2;
4108 # print "openset = >@openset<, allowedset = @allowedset and exactspecies = @exactspecies\n";
4109 }
4110 # <STDIN>;
4111 my $start = 0;
4112 my @sequences = ();
4113 my @titles = ();
4114 my $species_counter = "0";
4115 my $countermatch = 0;
4116 my $outsideSpecies=0;
4117
4118 while(my $line = <IN>){
4119 next if $line =~ /^#/;
4120 next if $line =~ /^i/;
4121 # print "$line .. species = @species\n";
4122 chomp $line;
4123 my @fields = split(/\s+/,$line);
4124 chomp $line;
4125 if ($line =~ /^a /){
4126 $start = 1;
4127 }
4128
4129 if ($line =~ /^s /){
4130 # print "fields1 = $fields[1] , start = $start\n";
4131
4132 foreach my $sp (@species){
4133 if ($fields[1] =~ /$sp/){
4134 $species_counter = $species_counter."_".$sp;
4135 push(@sequences, $fields[6]);
4136 my @sp_info = split(/\./,$fields[1]);
4137 my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]);
4138 push(@titles, $title);
4139
4140 }
4141 }
4142 }
4143
4144 if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start = 1)){
4145
4146 my $arranged = reorderSpecies($species_counter, @species);
4147 # print "species_counter=$species_counter .. arranged = $arranged\n";
4148
4149 my $stopper = 1;
4150 my $arrno = 0;
4151 foreach my $set (@allowedset){
4152 # print "checking $set with $arranged\n";
4153 if ($arranged eq $set){
4154 # print "checked $set with $arranged\n";
4155 $stopper = 0; last;
4156 }
4157 $arrno++;
4158 }
4159
4160 if ($stopper == 0) {
4161 # print " accepted\n";
4162 @titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged;
4163
4164 @sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged;
4165 my $filteredseq = filter_gaps(@sequences);
4166
4167 if ($filteredseq ne "SHORT"){
4168 $counter++;
4169 print OUT join (" ",$counter, @titles), "\n";
4170 print OUT $filteredseq, "\n";
4171 print OUT "\n";
4172 $countermatch++;
4173 }
4174 # my @filtered_seq = split(/\t/,filter_gaps(@sequences) );
4175 }
4176 else{#print "\n";
4177 }
4178
4179 @sequences = (); @titles = (); $start = 0;$species_counter = "0";
4180 next;
4181
4182 }
4183 }
4184 close IN;
4185 close OUT;
4186 # print "countermatch = $countermatch\n";
4187 # <STDIN>;
4188 }
4189
4190 sub reorderSpecies{
4191 my @inarr=@_;
4192 my $currSpecies = shift (@inarr);
4193 my $ordered_species = 0;
4194 my @species=@inarr;
4195 foreach my $order (@species){
4196 $ordered_species = $ordered_species."_".$order if $currSpecies=~ /$order/;
4197 }
4198 return $ordered_species;
4199
4200 }
4201
4202 sub filter_gaps{
4203 my @sequences = @_;
4204 # print "sequences sent are @sequences\n";
4205 my $seq_length = length($sequences[0]);
4206 my $seq_no = scalar(@sequences);
4207 my $allgaps = ();
4208 for (1 ... $seq_no){
4209 $allgaps = $allgaps."-";
4210 }
4211
4212 my @seq_array = ();
4213 my $seq_counter = 0;
4214 foreach my $seq (@sequences){
4215 # my @sequence = split(/\s*/,$seq);
4216 $seq_array[$seq_counter] = [split(/\s*/,$seq)];
4217 # push @seq_array, [@sequence];
4218 $seq_counter++;
4219 }
4220 my $g = 0;
4221 while ( $g < $seq_length){
4222 last if (!exists $seq_array[0][$g]);
4223 my $bases = ();
4224 for my $u (0 ... $#seq_array){
4225 $bases = $bases.$seq_array[$u][$g];
4226 }
4227 # print $bases, "\n";
4228 if ($bases eq $allgaps){
4229 # print "bases are $bases, position is $g \n";
4230 for my $seq (@seq_array){
4231 splice(@$seq , $g, 1);
4232 }
4233 }
4234 else {
4235 $g++;
4236 }
4237 }
4238
4239 my @outs = ();
4240
4241 foreach my $seq (@seq_array){
4242 push(@outs, join("",@$seq));
4243 }
4244 return "SHORT" if length($outs[0]) <=100;
4245 return (join("\n", @outs));
4246 }
4247
4248
4249 sub allowedSetOfSpecies{
4250 my @allowed_species = split(/_/,$_[0]);
4251 unshift @allowed_species, 0;
4252 # print "allowed set = @allowed_species \n";
4253 my @output = ();
4254 for (0 ... scalar(@allowed_species) - 4){
4255 push(@output, join("_",@allowed_species));
4256 pop @allowed_species;
4257 }
4258 return join(";",reverse(@output));
4259
4260 }
4261
4262
4263 sub orderInfo{
4264 my @info = split(/;/,$_[0]);
4265 # print "info = @info";
4266 my @old = split(/_/,$_[1]);
4267 my @new = split(/_/,$_[2]);
4268 shift @old; shift @new;
4269 my @outinfo = ();
4270 foreach my $spe (@new){
4271 for my $no (0 ... $#old){
4272 if ($spe eq $old[$no]){
4273 push(@outinfo, $info[$no]);
4274 }
4275 }
4276 }
4277 # print "outinfo = @outinfo \n";
4278 return join(";", @outinfo);
4279 }
4280
4281 #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx
4282
4283 sub printarr {
4284 # print ">::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n";
4285 # foreach my $line (@_) {print "$line\n";}
4286 # print "::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::<\n";
4287 }
4288
4289 sub oneOf{
4290 my @arr = @_;
4291 my $element = $arr[$#arr];
4292 @arr = @arr[0 ... $#arr-1];
4293 my $present = 0;
4294
4295 foreach my $el (@arr){
4296 $present = 1 if $el eq $element;
4297 }
4298 return $present;
4299 }
4300
4301 #xxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx
4302
4303 sub MakeTrees{
4304 my $tree = $_[0];
4305 # my @parts=($tree);
4306 my @parts=();
4307
4308 # print "parts=@parts\n";
4309
4310 while (1){
4311 $tree =~ s/^\(//g;
4312 $tree =~ s/\)$//g;
4313 my @arr = ();
4314
4315 if ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)\)$/){
4316 @arr = $tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)$/;
4317 push @parts, "(".$tree.")";
4318 $tree = $2;
4319 }
4320 elsif ($tree =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/){
4321 @arr = $tree =~ /^([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/;
4322 push @parts, "(".$tree.")";
4323 $tree = $1;
4324 }
4325 elsif ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/){
4326 last;
4327 }
4328 }
4329 #print "parts=@parts\n";
4330 return @parts;
4331 }
4332
4333