Mercurial > repos > devteam > microsatellite_birthdeath
comparison microsatellite_birthdeath.pl @ 0:4e31fad3f08e draft
Uploaded tool tarball.
author | devteam |
---|---|
date | Wed, 25 Sep 2013 11:26:02 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4e31fad3f08e |
---|---|
1 #!/usr/bin/perl -w | |
2 use strict; | |
3 use warnings; | |
4 use Term::ANSIColor; | |
5 use Pod::Checker; | |
6 use File::Basename; | |
7 use IO::Handle; | |
8 use Cwd; | |
9 use File::Path qw(make_path remove_tree); | |
10 use File::Temp qw/ tempfile tempdir /; | |
11 my $tdir = tempdir( CLEANUP => 1 ); | |
12 chdir $tdir; | |
13 my $dir = getcwd; | |
14 # print "current dit=$dir\n"; | |
15 #<STDIN>; | |
16 use vars qw (%treesToReject %template $printer $interr_poscord $interrcord $no_of_interruptionscord $stringfile @tags | |
17 $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species | |
18 $gapcord %thresholdhash $tree_decipherer @sp_ident %revHash %sameHash %treesToIgnore %alternate @exactspecies_orig @exactspecies @exacttags | |
19 $mono_flanksimplicityRepno $di_flanksimplicityRepno $prop_of_seq_allowedtoAT $prop_of_seq_allowedtoCG); | |
20 use FileHandle; | |
21 use IO::Handle; # 5.004 or higher | |
22 | |
23 | |
24 | |
25 #my @ar = ("/Users/ydk/work/rhesus_microsat/results/galay/chr22_5sp.maf.txt", "/Users/ydk/work/rhesus_microsat/results/galay/dataset_11.dat", | |
26 #"/Users/ydk/work/rhesus_microsat/results/galay/chr22_5spec.maf.summ","hg18,panTro2,ponAbe2,rheMac2,calJac1","((((hg18, panTro2), ponAbe2), rheMac2), calJac1)","9,10,12,12", | |
27 #"10","0.8"); | |
28 my @ar = @ARGV; | |
29 my ($maf, $orth, $summout, $species_set, $tree_definition, $thresholds, $FLANK_SUPPORT, $SIMILARITY_THRESH) = @ar; | |
30 $SIMILARITY_THRESH=$SIMILARITY_THRESH/100; | |
31 ######################### | |
32 $SIMILARITY_THRESH = $SIMILARITY_THRESH/100; | |
33 my $EDGE_DISTANCE = 10; | |
34 my $COMPLEXITY_SUPPORT = 20; | |
35 load_thresholds("9_10_12_12"); | |
36 my $FLANKINDEL_MAXTHRESH = 0.3; | |
37 | |
38 my $mono_flanksimplicityRepno=6; | |
39 my $di_flanksimplicityRepno=10; | |
40 my $prop_of_seq_allowedtoAT=0.5; | |
41 my $prop_of_seq_allowedtoCG=0.66; | |
42 | |
43 ######################### | |
44 my $tspecies_set = $species_set; | |
45 | |
46 my %speciesReplacement = (); | |
47 my %speciesReplacementTag = (); | |
48 my %replacementArr= (); | |
49 my %replacementArrTag= (); | |
50 my %backReplacementArr= (); | |
51 my %backReplacementArrTag= (); | |
52 $tree_definition=~s/\s+//g; | |
53 | |
54 my $tree_definition_split = $tree_definition; | |
55 $tree_definition_split =~ s/[\(\)]//g; | |
56 my @gotSpecies = ($tree_definition_split =~ /(,)/g); | |
57 # print "gotSpecies = @gotSpecies\n"; | |
58 | |
59 if (scalar(@gotSpecies)+1 ==5){ | |
60 | |
61 $speciesReplacement{1}="calJac1"; | |
62 $speciesReplacement{2}="rheMac2"; | |
63 $speciesReplacement{3}="ponAbe2"; | |
64 $speciesReplacement{4}="panTro2"; | |
65 $speciesReplacement{5}="hg18"; | |
66 | |
67 $speciesReplacementTag{1}="M"; | |
68 $speciesReplacementTag{2}="R"; | |
69 $speciesReplacementTag{3}="O"; | |
70 $speciesReplacementTag{4}="C"; | |
71 $speciesReplacementTag{5}="H"; | |
72 $species_set="hg18,panTro2,ponAbe2,rheMac2,calJac1"; | |
73 | |
74 } | |
75 if (scalar(@gotSpecies)+1 ==4){ | |
76 | |
77 $speciesReplacement{1}="rheMac2"; | |
78 $speciesReplacement{2}="ponAbe2"; | |
79 $speciesReplacement{3}="panTro2"; | |
80 $speciesReplacement{4}="hg18"; | |
81 | |
82 $speciesReplacementTag{1}="R"; | |
83 $speciesReplacementTag{2}="O"; | |
84 $speciesReplacementTag{3}="C"; | |
85 $speciesReplacementTag{4}="H"; | |
86 $species_set="hg18,panTro2,ponAbe2,rheMac2"; | |
87 | |
88 } | |
89 | |
90 # $tree_definition = "((((hg18,panTro2),ponAbe2),rheMac2),calJac1)"; | |
91 my $tree_definition_copy = $tree_definition; | |
92 my $tree_definition_orig = $tree_definition; | |
93 my $brackets = 0; | |
94 | |
95 while (1){ | |
96 #last if $tree_definition_copy !~ /\(/; | |
97 $brackets++; | |
98 # print "brackets = $brackets\n"; | |
99 last if $brackets > 6; | |
100 $tree_definition_copy =~ s/^\(//g; | |
101 $tree_definition_copy =~ s/\)$//g; | |
102 # print "tree_definition_copy = $tree_definition_copy\n"; | |
103 my @arr = (); | |
104 | |
105 if ($tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)\)$/){ | |
106 @arr = $tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)$/; | |
107 # print "arr = @arr\n"; | |
108 $tree_definition_copy = $2; | |
109 $replacementArr{$1} = $speciesReplacement{$brackets}; | |
110 $backReplacementArr{$speciesReplacement{$brackets}}=$1; | |
111 | |
112 $replacementArrTag{$1} = $speciesReplacementTag{$brackets}; | |
113 $backReplacementArrTag{$speciesReplacementTag{$brackets}}=$1; | |
114 # print "replacing $1 with $replacementArr{$1}\n"; | |
115 | |
116 $sp_ident[$brackets-1] = $1; | |
117 | |
118 } | |
119 elsif ($tree_definition_copy =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/){ | |
120 @arr = $tree_definition_copy =~ /^([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/; | |
121 # print "arr = @arr\n"; | |
122 $tree_definition_copy = $1; | |
123 $replacementArr{$2} = $speciesReplacement{$brackets}; | |
124 $backReplacementArr{$speciesReplacement{$brackets}}=$2; | |
125 | |
126 $replacementArrTag{$2} = $speciesReplacementTag{$brackets}; | |
127 $backReplacementArrTag{$speciesReplacementTag{$brackets}}=$2; | |
128 # print "replacing $2 with $replacementArr{$2}\n"; | |
129 | |
130 $sp_ident[$brackets-1] = $2; | |
131 } | |
132 elsif ($tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/){ | |
133 @arr = $tree_definition_copy =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/; | |
134 # print "arr = @arr .. TERMINAL\n"; | |
135 $tree_definition_copy = $1; | |
136 $replacementArr{$2} = $speciesReplacement{$brackets}; | |
137 $replacementArr{$1} = $speciesReplacement{$brackets+1}; | |
138 $backReplacementArr{$speciesReplacement{$brackets}}=$2; | |
139 $backReplacementArr{$speciesReplacement{$brackets+1}}=$1; | |
140 | |
141 $replacementArrTag{$1} = $speciesReplacementTag{$brackets+1}; | |
142 $backReplacementArrTag{$speciesReplacementTag{$brackets+1}}=$1; | |
143 | |
144 $replacementArrTag{$2} = $speciesReplacementTag{$brackets}; | |
145 $backReplacementArrTag{$speciesReplacementTag{$brackets}}=$2; | |
146 # print "replacing $1 with $replacementArr{$1}\n"; | |
147 # print "replacing $2 with $replacementArr{$2}\n"; | |
148 # print "replacing $1 with $replacementArrTag{$1}\n"; | |
149 # print "replacing $2 with $replacementArrTag{$2}\n"; | |
150 | |
151 $sp_ident[$brackets-1] = $2; | |
152 $sp_ident[$brackets] = $1; | |
153 | |
154 | |
155 last; | |
156 | |
157 } | |
158 elsif ($tree_definition_copy =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_\(\),]+)\)$/){ | |
159 $tree_definition_copy =~ s/^\(//g; | |
160 $tree_definition_copy =~ s/\)$//g; | |
161 $brackets--; | |
162 } | |
163 } | |
164 | |
165 foreach my $key (keys %replacementArr){ | |
166 my $replacement = $replacementArr{$key}; | |
167 $tree_definition =~ s/$key/$replacement/g; | |
168 } | |
169 @sp_ident = reverse(@sp_ident); | |
170 # print "modified tree_definition = $tree_definition\n"; | |
171 # print "done .. tree_definition = $tree_definition\n"; | |
172 # print "sp_ident = @sp_ident\n"; | |
173 #<STDIN>; | |
174 | |
175 | |
176 my $complexity=int($COMPLEXITY_SUPPORT * (1/40)); | |
177 | |
178 #print "complexity=$complexity\n"; | |
179 #<STDIN>; | |
180 | |
181 $printer = 1; | |
182 | |
183 my $rando = int(rand(1000)); | |
184 my $localdate = `date`; | |
185 $localdate =~ /([0-9]+):([0-9]+):([0-9]+)/; | |
186 my $info = $rando.$1.$2.$3; | |
187 | |
188 #--------------------------------------------------------------------------- | |
189 # GETTING INPUT INFORMATION AND OPENING INPUT AND OUTPUT FILES | |
190 | |
191 | |
192 my @thresharr = (0, split(/,/,$thresholds)); | |
193 my $randno=int(rand(100000)); | |
194 my $megamatch = $randno.".megamatch.net.axt"; #"/gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr1.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt"; | |
195 my $megamatchlck = $megamatch.".lck"; | |
196 unlink $megamatchlck; | |
197 | |
198 my $selected= $orth; | |
199 #my $eventfile = $orth; | |
200 $selected = $selected."_SELECTED"; | |
201 #$selected = $selected."_".$SIMILARITY_THRESH; | |
202 #my $runtime = $selected.".runtime"; | |
203 | |
204 my $inputtags = "H:C:O:R:M"; | |
205 $inputtags = $ARGV[3] if exists $ARGV[3] && $ARGV[3] =~ /[A-Z]:[A-Z]/; | |
206 | |
207 my @all_tags = split(/:/, $inputtags); | |
208 my $inputsp = "hg18:panTro2:ponAbe2:rheMac2:calJac1"; | |
209 $inputsp = $ARGV[4] if exists $ARGV[4] && $ARGV[3] =~ /[0-9]+:/; | |
210 #@sp_ident = split(/:/,$inputsp); | |
211 my $junkfile = $orth."_junk"; | |
212 | |
213 my $sh = load_sameHash(1); | |
214 my $rh = load_revHash(1); | |
215 | |
216 #print "inputs are : \n"; foreach(@ARGV){print $_,"\n";} | |
217 #open (SELECT, ">$selected") or die "Cannot open selected file: $selected: $!"; | |
218 open (SUMMARY, ">$summout") or die "Cannot open summout file: $summout: $!"; | |
219 #open (RUN, ">$runtime") or die "Cannot open orth file: $runtime: $!"; | |
220 #my $ctlfile = "baseml\.ctl"; #$ARGV[4]; | |
221 #my $treefile = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/"; #1 THIS IS THE THE TREE UNDER CONSIDERATION, IN NEWICK | |
222 my %registeredTrees = (); | |
223 my @removalReasons = | |
224 ("microsatellite is compound", | |
225 "complex structure", | |
226 "if no. if micros is more than no. of species", | |
227 "if more than one micro per species ", | |
228 "if microsat contains N", | |
229 "different motif than required ", | |
230 "more than zero interruptions", | |
231 "microsat could not form key ", | |
232 "orthologous microsats of different motif size ", | |
233 "orthologous microsats of different motifs ", | |
234 "microsats belong to different alignment blocks altogether", | |
235 "microsat near edge", | |
236 "microsat in low complexity region", | |
237 "microsat flanks dont align well", | |
238 "phylogeny not informative"); | |
239 my %allowedhash=(); | |
240 #--------------------------------------------------------------------------- | |
241 # WORKING ON MAKING THE MEGAMATCH FILE | |
242 my $chromt=int(rand(10000)); | |
243 my $p_chr=$chromt; | |
244 | |
245 my $tree_definition_orig_copy = $tree_definition_orig; | |
246 | |
247 $tree_definition=~s/,/, /g; | |
248 $tree_definition =~ s/, +/, /g; | |
249 $tree_definition_orig=~s/,/, /g; | |
250 $tree_definition_orig =~ s/, +/, /g; | |
251 my @exactspeciesset_unarranged = split(/,/,$species_set); | |
252 my @exactspeciesset_unarranged_orig = split(/,/,$tspecies_set); | |
253 my $largesttree = "$tree_definition;"; | |
254 my $largesttree_orig = "$tree_definition_orig;"; | |
255 # print "largesttree = $largesttree\n"; | |
256 $tree_definition =~ s/\(//g; | |
257 $tree_definition =~ s/\)//g; | |
258 $tree_definition=~s/[\)\(, ]/\t/g; | |
259 $tree_definition =~ s/\t+/\t/g; | |
260 | |
261 $tree_definition_orig =~ s/\(//g; | |
262 $tree_definition_orig =~ s/\)//g; | |
263 $tree_definition_orig =~s/[\)\(, ]/\t/g; | |
264 $tree_definition_orig =~ s/\t+/\t/g; | |
265 # print "tree_definition = $tree_definition tree_definition_orig = $tree_definition_orig\n"; | |
266 | |
267 my @treespecies=split(/\t+/,$tree_definition); | |
268 my @treespecies_orig=split(/\t+/,$tree_definition_orig); | |
269 # print "tree_definition = $tree_definition .. treespecies=@treespecies ... treespecies_orig=@treespecies_orig\n"; | |
270 #<STDIN>; | |
271 | |
272 foreach my $spec (@treespecies){ | |
273 foreach my $espec (@exactspeciesset_unarranged){ | |
274 # print "spec=$spec and espec=$espec\n"; | |
275 push @exactspecies, $spec if $spec eq $espec; | |
276 } | |
277 } | |
278 | |
279 foreach my $spec (@treespecies_orig){ | |
280 foreach my $espec (@exactspeciesset_unarranged_orig){ | |
281 # print "spec=$spec and espec=$espec\n"; | |
282 push @exactspecies_orig, $spec if $spec eq $espec; | |
283 } | |
284 } | |
285 | |
286 my $focalspec = $exactspecies[0]; | |
287 my $focalspec_orig = $exactspecies_orig[0]; | |
288 # print "exactspecies=@exactspecies ... focalspec=$focalspec\n"; | |
289 # print "texactspecies=@exactspecies_orig ... focalspec_orig=$focalspec_orig\n"; | |
290 #<STDIN>; | |
291 my $arranged_species_set = join(".",@exactspecies); | |
292 my $arranged_species_set_orig = join(".",@exactspecies_orig); | |
293 | |
294 @exacttags=@exactspecies; | |
295 my @exacttags_orig=@exactspecies_orig; | |
296 | |
297 foreach my $extag (@exacttags){ | |
298 $extag =~ s/hg18/H/g; | |
299 $extag =~ s/panTro2/C/g; | |
300 $extag =~ s/ponAbe2/O/g; | |
301 $extag =~ s/rheMac2/R/g; | |
302 $extag =~ s/calJac1/M/g; | |
303 } | |
304 | |
305 foreach my $extag (@exacttags_orig){ | |
306 $extag =~ s/hg18/H/g; | |
307 $extag =~ s/panTro2/C/g; | |
308 $extag =~ s/ponAbe2/O/g; | |
309 $extag =~ s/rheMac2/R/g; | |
310 $extag =~ s/calJac1/M/g; | |
311 } | |
312 | |
313 my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt"); | |
314 #print "sending to maftoAxt_multispecies: $maf, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n"; | |
315 | |
316 maftoAxt_multispecies($maf, $tree_definition_orig_copy, $chr_name, $tspecies_set); | |
317 #print "made files\n"; | |
318 my @filterseqfiles= ($chr_name); | |
319 $largesttree =~ s/hg18/H/g; | |
320 $largesttree =~ s/panTro2/C/g; | |
321 $largesttree =~ s/ponAbe2/O/g; | |
322 $largesttree =~ s/rheMac2/R/g; | |
323 $largesttree =~ s/calJac1/M/g; | |
324 #<STDIN>; | |
325 #--------------------------------------------------------------------------- | |
326 | |
327 my ($lagestnodes, $largestbranches) = get_nodes($largesttree); | |
328 shift (@$lagestnodes); | |
329 my @extendedtitle=(); | |
330 | |
331 my $title = (); | |
332 my $parttitle = (); | |
333 my @titlearr = (); | |
334 my @firsttitle=($focalspec_orig."chrom", $focalspec_orig."start", $focalspec_orig."end", $focalspec_orig."motif", $focalspec_orig."motifsize", $focalspec_orig."threshold"); | |
335 | |
336 my @finames= qw(chr start end motif motifsize microsat mutation mutation.position mutation.from mutation.to insertion.details deletion.details); | |
337 | |
338 my @fititle=(); | |
339 | |
340 foreach my $spec (split(",",$tspecies_set)){ | |
341 push @fititle, $spec; | |
342 foreach my $name (@finames){ | |
343 push @fititle, $spec.".".$name; | |
344 } | |
345 } | |
346 | |
347 | |
348 my @othertitle=qw(somechr somestart somened event source); | |
349 | |
350 my @fnames = (); | |
351 push @fnames, qw(insertions_num deletions_num motinsertions_num motinsertionsf_num motdeletions_num motdeletionsf_num noninsertions_num nondeletions_num) ; | |
352 push @fnames, qw(binsertions_num bdeletions_num bmotinsertions_num bmotinsertionsf_num bmotdeletions_num bmotdeletionsf_num bnoninsertions_num bnondeletions_num) ; | |
353 push @fnames, qw(dinsertions_num ddeletions_num dmotinsertions_num dmotinsertionsf_num dmotdeletions_num dmotdeletionsf_num dnoninsertions_num dnondeletions_num) ; | |
354 push @fnames, qw(ninsertions_num ndeletions_num nmotinsertions_num nmotinsertionsf_num nmotdeletions_num nmotdeletionsf_num nnoninsertions_num nnondeletions_num) ; | |
355 push @fnames, qw(substitutions_num bsubstitutions_num dsubstitutions_num nsubstitutions_num indels_num subs_num); | |
356 | |
357 my @fullnames = (); | |
358 # print "revising\n"; | |
359 # print "H = $backReplacementArrTag{H}\n"; | |
360 # print "C = $backReplacementArrTag{C}\n"; | |
361 # print "O = $backReplacementArrTag{O}\n"; | |
362 # print "R = $backReplacementArrTag{R}\n"; | |
363 # print "M = $backReplacementArrTag{M}\n"; | |
364 | |
365 foreach my $lnode (@$lagestnodes){ | |
366 my @pair = @$lnode; | |
367 my @nodemutarr = (); | |
368 for my $p (@pair){ | |
369 # print "p = $p\n"; | |
370 $p =~ s/[\(\), ]+//g; | |
371 $p =~ s/([A-Z])/$1./g; | |
372 $p =~ s/\.$//g; | |
373 | |
374 $p =~ s/H/$backReplacementArrTag{H}/g; | |
375 $p =~ s/C/$backReplacementArrTag{C}/g; | |
376 $p =~ s/O/$backReplacementArrTag{O}/g; | |
377 $p =~ s/R/$backReplacementArrTag{R}/g; | |
378 $p =~ s/M/$backReplacementArrTag{M}/g; | |
379 foreach my $n (@fnames) { push @fullnames, $p.".".$n;} | |
380 } | |
381 } | |
382 | |
383 #print SUMMARY "#",join("\t", @firsttitle, @fititle, @othertitle); | |
384 | |
385 #print SUMMARY "\t",join("\t", @fullnames); | |
386 my $header = join("\t",@firsttitle, @fititle, @othertitle, @fullnames, @fnames, "tree", "cleancase"); | |
387 # print "header= $header\n"; | |
388 #<STDIN>; | |
389 | |
390 #print SUMMARY "\t",join("\t", @fnames); | |
391 #$title= $title."\t".join("\t", @fnames); | |
392 | |
393 #print SUMMARY "\t","tree","\t", "cleancase", "\n"; | |
394 #$title= $title."\t"."tree"."\t"."cleancase". "\n"; | |
395 | |
396 #print $title; #<STDIN>; | |
397 | |
398 #print "all_tags = @all_tags\n"; | |
399 | |
400 for my $no (3 ... $#all_tags+1){ | |
401 # print "no=$no\n"; #<STDIN>; | |
402 @tags = @all_tags[0 ... $no-1]; | |
403 # print "all_tags=>@all_tags< , tags = >@tags<\n" if $printer == 1; #<STDIN>; | |
404 %template=(); | |
405 my @nextcounter = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); | |
406 #next if scalar(@tags) < 4; | |
407 # print "now doing tags = @tags, no = $no\n"; | |
408 open (ORTH, "<$orth") or die "Cannot open orth file: $orth: $!"; | |
409 | |
410 # print SUMMARY join "\t", qw (species chr start end branch motif microsat mutation position from to insertion deletion); | |
411 | |
412 | |
413 ##################### T E M P O R A R Y ##################### | |
414 my @finaltitle=(); | |
415 my @singletitle = qw (species chr start end motif motifsize microsat strand microsatsize col10 col11 col12 col13); | |
416 my $endtitle = (); | |
417 foreach my $tag (@tags){ | |
418 my @tempsingle = (); | |
419 | |
420 foreach my $single (@singletitle){ | |
421 push @tempsingle, $tag.$single; | |
422 } | |
423 @finaltitle = (@finaltitle, @tempsingle); | |
424 } | |
425 | |
426 # print SUMMARY join("\t",@finaltitle),"\n"; | |
427 | |
428 ############################################################# | |
429 | |
430 #--------------------------------------------------------------------------- | |
431 # GET THE TREE FROM TREE FILE | |
432 my $tree = (); | |
433 $tree = "((H, C), O)" if $no == 3; | |
434 $tree = "(((H, C), O), R)" if $no == 4; | |
435 $tree = "((((H, C), O), R), M)" if $no == 5; | |
436 # $tree=~s/;$//g; | |
437 # print "our tree = $tree\n"; | |
438 #--------------------------------------------------------------------------- | |
439 # LOADING HASH CONTAINING ALL POSSIBLE TREES: | |
440 $tree_decipherer = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/tree_analysis_".join("",@tags).".txt"; | |
441 %template=(); | |
442 %alternate=(); | |
443 load_allPossibleTrees($tree_decipherer, \%template, \%alternate); | |
444 | |
445 #--------------------------------------------------------------------------- | |
446 # LOADING THE TREES TO REJECT FOR BIRTH ANALYSIS | |
447 %treesToReject=(); | |
448 %treesToIgnore=(); | |
449 load_treesToReject(@tags); | |
450 load_treesToIgnore(@tags); | |
451 #--------------------------------------------------------------------------- | |
452 # LOADING INPUT DATA INTO HASHES AND ARRAYS | |
453 | |
454 | |
455 #1 THIS IS THE POINT WHERE WE CAN FILTER OUT LARGE MICROSAT CLUSTERS | |
456 #2 AS WELL AS MULTIPLE-ALIGNMENT-BLOCKS-SPANNING MICROSATS (KIND OF | |
457 #3 IMPLICIT IN THE FIRST PART OF THE SENTENCE ITSELF IN MOST CASES). | |
458 | |
459 my %orths=(); | |
460 my $counterm = 0; | |
461 my $loaded = 0; | |
462 my %seen = (); | |
463 my @allowedchrs = (); | |
464 # print "no = $no\n"; #<STDIN>; | |
465 | |
466 while (my $line = <ORTH>){ | |
467 # print "line=$line\n"; | |
468 my $register1 = $line =~ s/>$exactspecies_orig[0]/>$replacementArrTag{$exactspecies_orig[0]}/g; | |
469 my $register2 = $line =~ s/>$exactspecies_orig[1]/>$replacementArrTag{$exactspecies_orig[1]}/g; | |
470 my $register3 = $line =~ s/>$exactspecies_orig[2]/>$replacementArrTag{$exactspecies_orig[2]}/g; | |
471 my $register4 = $line =~ s/>$exactspecies_orig[3]/>$replacementArrTag{$exactspecies_orig[3]}/g; | |
472 my $register5 = $line =~ s/>$exactspecies_orig[4]/>$replacementArrTag{$exactspecies_orig[4]}/g if exists $exactspecies_orig[4]; | |
473 | |
474 # print "line = $line\n"; <STDIN>; | |
475 | |
476 | |
477 # next if $register1 + $register2 + $register3 + $register4 + $register5 > scalar(@tags); | |
478 my @micros = split(/>/,$line); # LOADING ALL THE MICROSAT ENTRIES FROM THE CLUSTER INTO @micros | |
479 #print "micros=",printarr(@micros),"\n"; #<STDIN>; | |
480 shift @micros; # EMPTYING THE FIRST, EMTPY ELEMENT OF THE ARRAY | |
481 | |
482 $no_of_species = adjustCoordinates($micros[0]); | |
483 # print "A: $no_of_species\n"; | |
484 next if $no_of_species != $no; | |
485 # print "no = $no ... no_of_species=$no_of_species\n";#<STDIN>; | |
486 $counterm++; | |
487 #------------------------------------------------ | |
488 $nextcounter[0]++ if $line =~ /compound/; | |
489 next if $line =~ /compound/; # GETTING RID OF COMPOUND MICROSATS | |
490 #------------------------------------------------ | |
491 #next if $line =~ /[A-Za-z]>[a-zA-Z]/; | |
492 #------------------------------------------------ | |
493 chomp $line; | |
494 my $match_count = ($line =~ s/>/>/g); # COUNTING THE NUMBER OF MICROSAT ENTRIES IN THE CLUSTER | |
495 #print "number of species = $match_count\n"; | |
496 my $stopper = 0; | |
497 foreach my $mic (@micros){ | |
498 my @local = split(/\t/,$mic); | |
499 if ($local[$typecord] =~ /\./ || exists($local[$no_of_interruptionscord+2])) {$stopper = 1; $nextcounter[1]++; | |
500 last; } | |
501 # REMOVING CLUSTERS WITH THE CYRPTIC, (UNRESOLVABLY COMPLEX) MICROSAT ENTRIES IN THEM | |
502 } | |
503 next if $stopper ==1; | |
504 #------------------------------------------------ | |
505 $nextcounter[2]++ if (scalar(@micros) >$no_of_species); | |
506 | |
507 next if (scalar(@micros) >$no_of_species); #1 REMOVING MICROSAT CLUSTERS WITH MORE NUMBER OF MICROSAT ENTRIES THAN THE NUMBER OF SPECIES IN THE DATASET. | |
508 #2 THIS IS SO BECAUSE SUCH CLUSTERS IMPLY THAT IN AT LEAST ONE SPECIES, THERE IS MORE THAN ONE MICROSAT ENTRY | |
509 #3 IN THE CLUSTER. THUS, HERE WE ARE GETTING RID OF MICROSATS CLUSTERS THAT INCLUDE MULTUPLE, NEIGHBORING | |
510 #4 MICROSATS, AND STICK TO CLEAN MICROSATS THAT DO NOT HAVE ANY MICROSATS IN NEIGHBORHOOD. | |
511 #5 THIS 'NEIGHBORHOOD-RANGE' HAD BEEN DECIDED PREVIOUSLY IN OUR CODE multiSpecies_orthFinder4.pl | |
512 my $nexter = 0; | |
513 foreach my $tag (@tags){ | |
514 my $tagcount = ($line =~ s/>$tag\t/>$tag\t/g); | |
515 if ($tagcount > 1) { $nexter =1; #print colored ['red'],"multiple entires per species : $tagcount of $tag\n" if $printer == 1; | |
516 next; | |
517 } | |
518 } | |
519 | |
520 if ($nexter == 1){ | |
521 $nextcounter[3]++; | |
522 next; | |
523 } | |
524 #------------------------------------------------ | |
525 foreach my $mic (@micros){ #1 REMOVING MICROSATELLITES WITH ANY 'N's IN THEM | |
526 my @local = split(/\t/,$mic); | |
527 if ($local[$microsatcord] =~ /N/) {$stopper =1; $nextcounter[4]++; | |
528 last;} | |
529 } | |
530 next if $stopper ==1; | |
531 #print "till here 1\n"; #<STDIN>; | |
532 #------------------------------------------------ | |
533 my @micros_copy = @micros; | |
534 | |
535 my $tempmicro = shift(@micros_copy); #1 CURRENTLY OBTAINING INFORMATION FOR THE FIRST | |
536 #2 MICROSAT IN THE CLUSTER. | |
537 my @tempfields = split(/\t/,$tempmicro); | |
538 my $prevtype = $tempfields[$typecord]; | |
539 my $tempmotif = $tempfields[$motifcord]; | |
540 | |
541 my $tempfirstmotif = (); | |
542 if (scalar(@tempfields) > $microsatcord + 2){ | |
543 if ($tempfields[$no_of_interruptionscord] >= 1) { #1 DISCARDING MICROSATS WITH MORE THAN ZERO INTERRUPTIONS | |
544 #2 IN THE FIRST MICROSAT OF THE CLUSTER | |
545 $nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1; | |
546 } | |
547 } | |
548 if ($nexter == 1){ | |
549 $nextcounter[6]++; | |
550 next; | |
551 } #1 DONE OBTAINING INFORMATION REGARDING | |
552 #2 THE FIRST MICROSAT FROM THE CLUSTER | |
553 | |
554 if ($tempmotif =~ /^\[/){ | |
555 $tempmotif =~ s/^\[//g; | |
556 $tempmotif =~ /([a-zA-Z]+)\].*/; | |
557 $tempfirstmotif = $1; #1 OBTAINING THE FIRTS MOTIF OF MICROSAT | |
558 } | |
559 else {$tempfirstmotif = $tempmotif;} | |
560 my $prevmotif = $tempfirstmotif; | |
561 | |
562 my $key = (); | |
563 # print "searching temp micro for 0-9 $focalspec chr0-9a-zA-Z 0-9 0-9 \n"; | |
564 # print "tempmicro = $tempmicro .. looking for ([0-9]+)\s+($focalspec_orig)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\n"; <STDIN>; | |
565 if ($tempmicro =~ /([0-9]+)\s+($focalspec_orig)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { | |
566 # print "B: $no_of_species\n"; | |
567 $key = join("_",$2, $3, $4, $5); | |
568 } | |
569 else{ | |
570 # print "counld not form a key for temp\n"; # if $printer == 1; | |
571 $nextcounter[7]++; | |
572 next; | |
573 } | |
574 #----------------- #1 NOW, AFTER OBTAINING INFORMATION ABOUT | |
575 #2 THE FIRST MICROSAT IN THE CLUSTER, THE | |
576 #3 FOLLOWING LOOP GOES THROUGH THE OTHER MICROSATS | |
577 #4 TO SEE IF THEY SHARE THE REQUIRED FEATURES (BELOW) | |
578 | |
579 foreach my $micro (@micros_copy){ | |
580 my @fields = split(/\t/,$micro); | |
581 #----------------- | |
582 if (scalar(@fields) > $microsatcord + 2){ #1 DISCARDING MICROSATS WITH MORE THAN ONE INTERRUPTIONS | |
583 if ($fields[$no_of_interruptionscord] >= 1) {$nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1; | |
584 $nextcounter[6]++; | |
585 last; } | |
586 } | |
587 #----------------- | |
588 if (($prevtype ne "0") && ($prevtype ne $fields[$typecord])) { | |
589 $nexter =1; #print colored ['yellow'],"microsat of different type \n" if $printer == 1; | |
590 $nextcounter[8]++; | |
591 last; } #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG | |
592 #----------------- #2 TO DIFFERENT TYPES (MONOS, DIS, TRIS ETC.) | |
593 $prevtype = $fields[$typecord]; | |
594 | |
595 my $motif = $fields[$motifcord]; | |
596 my $firstmotif = (); | |
597 | |
598 if ($motif =~ /^\[/){ | |
599 $motif =~ s/^\[//g; | |
600 $motif =~ /([a-zA-Z]+)\].*/; | |
601 $firstmotif = $1; | |
602 } | |
603 else {$firstmotif = $motif;} | |
604 | |
605 my $motifpattern = $firstmotif.$firstmotif; | |
606 my $prevmotifpattern = $prevmotif.$prevmotif; | |
607 | |
608 if (($prevmotif ne "0")&&(($motifpattern !~ /$prevmotif/i)||($prevmotifpattern !~ /$firstmotif/i)) ) { | |
609 $nexter =1; #print colored ['green'],"different motifs used \n$line\n" if $printer == 1; | |
610 $nextcounter[9]++; | |
611 last; | |
612 } #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG | |
613 #2 TO DIFFERENT MOTIFS | |
614 my $prevmotif = $firstmotif; | |
615 #----------------- | |
616 | |
617 for my $t (0 ... $#tags){ #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSAT ENTRIES BELONG | |
618 #2 DIFFERENT ALIGNMENT BLOCKS | |
619 if ($micro =~ /([0-9]+)\s+($focalspec_orig)\s([_0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { | |
620 my $key2 = join("_",$2, $3, $4, $5); | |
621 # print "key = $key .. key2 = $key2\n"; #<STDIN>; | |
622 if ($key2 ne $key){ | |
623 # print "microsats belong to diffferent alignment blocks altogether\n" if $printer == 1; | |
624 $nextcounter[10]++; | |
625 $nexter = 1; last; | |
626 } | |
627 } | |
628 else{ | |
629 # print "counld not form a key for $line\n"; # if $printer == 1; | |
630 #<STDIN>; | |
631 $nexter = 1; last; | |
632 } | |
633 } | |
634 } | |
635 #print "D2: $no_of_species\n"; | |
636 | |
637 ##################### | |
638 if ($nexter == 1){ | |
639 # print "nexting\n"; # if $printer == 1; | |
640 next; | |
641 } | |
642 else{ | |
643 # print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n$key:\n$line\nvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n" if $printer == 1; | |
644 push (@{$orths{$key}},$line); | |
645 $loaded++; | |
646 if ($line =~ /($focalspec_orig)\s([_a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) { | |
647 | |
648 # print "$line\n" if $printer == 1; #if $line =~ /Contig/; | |
649 # print "################ ################\n" if $printer == 1; | |
650 push @allowedchrs, $2 if !exists $allowedhash{$2}; | |
651 $allowedhash{$2} = 1; | |
652 my $key = join("\t",$1, $2, $3, $4); | |
653 # print "C: $no_of_species .. key = $key\n";#<STDIN>; | |
654 # print "print the shit: $key\n" ; #if $printer == 1; | |
655 $seen{$key} = 1; | |
656 } | |
657 else { #print "Key could not be formed in SPUT for ($focalspec_orig) (chrom) ([0-9]+) ([0-9]+)\n"; | |
658 } | |
659 } | |
660 } | |
661 close ORTH; | |
662 # print "now studying where we lost microsatellites: @nextcounter\n"; | |
663 for my $reason (0 ... $#nextcounter){ | |
664 #print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n"; | |
665 } | |
666 # print "\ntotal number of keys formed = ", scalar(keys %orths), " = \n"; | |
667 # print "done filtering .. counterm = $counterm and loaded = $loaded\n"; | |
668 #---------------------------------------------------------------------------------------------------------------- | |
669 # NOW GENERATING THE ALIGNMENT FILE WITH RELELEVENT ALIGNMENTS STORED ONLY. | |
670 # print "adding files @filterseqfiles \n"; | |
671 #<STDIN>; | |
672 | |
673 while (1){ | |
674 if (-e $megamatchlck){ | |
675 # print "waiting to write into $megamatchlck\n"; | |
676 sleep 10; | |
677 } | |
678 else{ | |
679 open (MEGAMLCK, ">$megamatchlck") or die "Cannot open megamatchlck file $megamatchlck: $!"; | |
680 open (MEGAM, ">$megamatch") or die "Cannot open megamatch file $megamatch: $!"; | |
681 last; | |
682 } | |
683 } | |
684 foreach my $seqfile (@filterseqfiles){ | |
685 my $fullpath = $seqfile; | |
686 open (MATCH, "<$fullpath") or die "Cannot open MATCH file $fullpath: $!"; | |
687 my $matchlines = 0; | |
688 | |
689 while (my $line = <MATCH>) { | |
690 #print "checking $line"; | |
691 if ($line =~ /($focalspec_orig)\s([a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) { | |
692 my $key = join("\t",$1, $2, $3, $4); | |
693 # print "key = $key\n"; | |
694 #print "------------------------------------------------------\n"; | |
695 #print "asking $line\n"; | |
696 if (exists $seen{$key}){ | |
697 | |
698 #print "seen $line \n"; <STDIN>; | |
699 while (1){ | |
700 $matchlines++; | |
701 print MEGAM $line; | |
702 $line = <MATCH>; | |
703 print MEGAM "\n" if $line !~ /[0-9a-zA-Z]/; | |
704 last if $line !~/[0-9a-zA-Z]/; | |
705 } | |
706 } | |
707 else{ | |
708 # print "not seen\n"; | |
709 } | |
710 } | |
711 } | |
712 # print "matchlines = $matchlines\n"; | |
713 close MATCH; | |
714 } | |
715 close MEGAMLCK; | |
716 | |
717 unlink $megamatchlck; | |
718 close MEGAM; | |
719 undef %seen; | |
720 # print "done writitn to $megamatch\n";#<STDIN>; | |
721 #---------------------------------------------------------------------------------------------------------------- | |
722 #<STDIN>; | |
723 #--------------------------------------------------------------------------- | |
724 # NOW, AFTER FILTERING MANY MICROSATS, AND LOADING THE FILTERED ONES INTO | |
725 # THE HASH %orths , WE GO THROUGH THE ALIGNMENT FILE, AND STUDY THE | |
726 # FLANKING SEQUENCES OF ALL THESE MICROSATS, TO FILTER THEM FURTHER | |
727 #$printer = 1; | |
728 | |
729 my $microreadcounter=0; | |
730 my $contigsentered=0; | |
731 my $contignotrightcounter=0; | |
732 my $keynotformedcounter=0; | |
733 my $keynotfoundcounter= 0; | |
734 my $dotcounter = 0; | |
735 | |
736 # print "opening $megamatch\n"; | |
737 | |
738 open (BO, "<$megamatch") or die "Cannot open alignment file: $megamatch: $!"; | |
739 # print "doing $megamatch\n " ; | |
740 | |
741 #<STDIN>; | |
742 | |
743 while (my $line = <BO>){ | |
744 # print $line; #<STDIN>; | |
745 # print "." if $dotcounter % 100 ==0; | |
746 # print "\n" if $dotcounter % 5000 ==0; | |
747 # print "dotcounter = $dotcounter\n " if $printer == 1; | |
748 next if $line !~ /^[0-9]+/; | |
749 $dotcounter++; | |
750 # print colored ['green'], "~" x 60, "\n" if $printer == 1; | |
751 # print colored ['green'], $line;# if $printer == 1; | |
752 chomp $line; | |
753 my @fields2 = split(/\t/,$line); | |
754 my $key2 = (); | |
755 my $alignment_no = (); #1 TEMPORARY | |
756 if ($line =~ /([0-9]+)\s+($focalspec_orig)\s([_\-s0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { | |
757 # $key2 = join("\t",$1, $2, $4, $5); | |
758 $key2 = join("_",$2, $3, $4, $5); | |
759 # print "key = $key2\n"; | |
760 | |
761 # print "key = $key2\n"; | |
762 $alignment_no=$1; | |
763 } | |
764 else {print "seq line $line incompatible\n"; $keynotformedcounter++; next;} | |
765 | |
766 $no_of_species = adjustCoordinates($line); | |
767 | |
768 $contignotrightcounter++ if $no_of_species != $no; | |
769 # print "contignotrightcounter=$contignotrightcounter\n"; | |
770 # print "no_of_species=$no_of_species\n"; | |
771 # print "no=$no\n"; | |
772 | |
773 next if $no_of_species != $no; | |
774 # print "D: $no_of_species\n"; | |
775 # print "E: $no_of_species\n"; | |
776 #<STDIN>; | |
777 # print "key = $key2\n" if $printer == 1; | |
778 my @clusters = (); #1 EXTRACTING MICROSATS CORRESPONDING TO THIS | |
779 #2 ALIGNMENT BLOCK | |
780 if (exists($orths{$key2})){ | |
781 @clusters = @{$orths{$key2}}; | |
782 $contigsentered++; | |
783 delete $orths{$key2}; | |
784 } | |
785 else{ | |
786 # print "orth does not exist\n"; | |
787 $keynotfoundcounter++; | |
788 next; | |
789 } | |
790 | |
791 my %sequences=(); #1 WILL STORE SEQUENCES IN THE CURRENT ALIGNMENT BLOCK | |
792 my $humseq = (); | |
793 foreach my $tag (@tags){ #1 READING THE ALIGNMENT FILE AND CAPTURING SEQUENCES | |
794 my $seq = <BO>; #2 OF ALL SPECIES. | |
795 chomp $seq; | |
796 $sequences{$tag} = " ".$seq; | |
797 #print "sequences = $sequences{$tag}\n" if $printer == 1; | |
798 $humseq = $seq if $tag =~ /H/; | |
799 } | |
800 | |
801 | |
802 foreach my $cluster (@clusters){ #1 NOW, GOING THROUGH THE CLUSTER OF MICROSATS | |
803 # print "x" x 60, "\n" if $printer == 1; | |
804 # print colored ['red'],"cluster = $cluster\n"; | |
805 $largesttree =~ s/hg18/H/g; | |
806 $largesttree =~ s/panTro2/C/g; | |
807 $largesttree =~ s/ponAbe2/O/g; | |
808 $largesttree =~ s/rheMac2/R/g; | |
809 $largesttree =~ s/calJac1/M/g; | |
810 | |
811 $microreadcounter++; | |
812 my @micros = split(/>/,$cluster); | |
813 | |
814 | |
815 | |
816 | |
817 | |
818 | |
819 shift @micros; | |
820 | |
821 my $edge_microsat=0; #1 THIS WILL HAVE VALUE "1" IF MICROSAT IS FOUND | |
822 #2 TO BE TOO CLOSE TO THE EDGES OF ALIGNMENT BLOCK | |
823 | |
824 my @starts= (); my %start_hash=(); #1 STORES THE START AND END COORDINATES OF MICROSATELLITES | |
825 my @ends = (); my %end_hash=(); #2 SO THAT LATER, WE WILL BE ABLE TO FIND THE EXTREME | |
826 #3 COORDINATE VALUES OF THE ORTHOLOGOUS MIROSATELLITES. | |
827 | |
828 my %microhash=(); | |
829 my %microsathash=(); | |
830 my %nonmicrosathash=(); | |
831 my $motif=(); #1 BASIC MOTIF OF THE MICROSATELLITE.. THERE'S ONLY 1 | |
832 # print "tags=@tags\n"; | |
833 for my $i (0 ... $#tags){ #1 FINDING THE MICROSAT, AND THE ALIGNMENT SEQUENCE | |
834 #2 CORRESPONDING TO THE PARTICULAR SPECIES (AS PER | |
835 #3 THE VARIABLE $TAG; | |
836 my $tag = $tags[$i]; | |
837 # print $seq; | |
838 my $locus="NULL"; #1 THIS WILL STORE THE MICROSAT OF THIS SPECIES. | |
839 #2 IF THERE IS NO MICROSAT, IT WILL REMAIN "NULL" | |
840 | |
841 foreach my $micro (@micros){ | |
842 # print "micro=$micro, tag=$tag\n"; | |
843 if ($micro =~ /^$tag/){ #1 MICROSAT OF THIS SPECIES FOUND.. | |
844 $locus = $micro; | |
845 my @fields = split(/\t/,$micro); | |
846 $motif = $fields[$motifcord]; | |
847 $microsathash{$tag}=$fields[$microsatcord]; | |
848 # print "fields=@fields, and startcord=$startcord = $fields[$startcord]\n"; | |
849 push(@starts, $fields[$startcord]); | |
850 push(@ends, $fields[$endcord]); | |
851 $start_hash{$tag}=$fields[$startcord]; | |
852 $end_hash{$tag}=$fields[$endcord]; | |
853 last; | |
854 } | |
855 else{$microsathash{$tag}="NULL"} | |
856 } | |
857 $microhash{$tag}=$locus; | |
858 | |
859 } | |
860 | |
861 | |
862 | |
863 my $extreme_start = smallest_number(@starts); #1 THESE TWO ARE THE EXTREME COORDINATES OF THE | |
864 my $extreme_end = largest_number(@ends); #2 MICROSAT CLUSTER ACCROSS ALL THE SPECIES IN | |
865 #3 WHOM IT IS FOUND TO BE ORTHOLOGOUS. | |
866 | |
867 # print "starts=@starts... ends=@ends\n"; | |
868 | |
869 my %up_flanks = (); #1 CONTAINS UPSTEAM FLANKING REGIONS FOR EACH SPECIES | |
870 my %down_flanks = (); #1 CONTAINS DOWNDTREAM FLANKING REGIONS FOR EACH SPECIES | |
871 | |
872 my %up_largeflanks = (); | |
873 my %down_largeflanks = (); | |
874 | |
875 my %locusandflanks = (); | |
876 my %locusandlargeflanks = (); | |
877 | |
878 my %up_internal_flanks=(); #1 CONTAINS SEQUENCE BETWEEN THE $extreme_start and the | |
879 #2 ACTUAL START OF MICROSATELLITE IN THE SPECIES | |
880 my %down_internal_flanks=(); #1 CONTAINS SEQUENCE BETWEEN THE $extreme_end and the | |
881 #2 ACTUAL end OF MICROSATELLITE IN THE SPECIES | |
882 | |
883 my %alignment=(); #1 CONTAINS ACTUAL ALIGNMENT SEQUENCE BETWEEN THE TWO | |
884 #2 EXTEME VALUES. | |
885 | |
886 my %microsatstarts=(); #1 WITHIN EACH ALIGNMENT, IF THERE EXISTS A MICROSATELLITE | |
887 #2 THIS HASH CONTAINS THE START SITE OF THE MICROSATELLITE | |
888 #3 WIHIN THE ALIGNMENT | |
889 next if !defined $extreme_start; | |
890 next if !defined $extreme_end; | |
891 next if $extreme_start > length($sequences{$tags[0]}); | |
892 next if $extreme_start < 0; | |
893 next if $extreme_end > length($sequences{$tags[0]}); | |
894 | |
895 for my $i (0 ... $#tags){ #1 NOW THAT WE HAVE GATHERED INFORMATION REGARDING | |
896 #2 SEQUENCE ALIGNMENT AND MICROSATELLITE COORDINATES | |
897 #3 AS WELL AS THE EXTREME COORDINATES OF THE | |
898 #4 MICROSAT CLUSTER, WE WILL PROCEED TO EXTRACT THE | |
899 #5 FLANKING SEQUENCE OF ALL ORGS, AND STUDY IT IN | |
900 #6 MORE DETAIL. | |
901 my $tag = $tags[$i]; | |
902 # print "tag=$tag.. seqlength = ",length($sequences{$tag})," extreme_start=$extreme_start and extreme_end=$extreme_end\n"; | |
903 my $upstream_gaps = (substr($sequences{$tag}, 0, $extreme_start) =~ s/\-/-/g); #1 NOW MEASURING THE NUMBER OF GAPS IN THE UPSTEAM | |
904 #2 AND DOWNSTREAM SEQUENCES OF THE MICROSATs IN THIS | |
905 #3 CLUSTER. | |
906 # print "seq length $tag = $sequences{$tag} = ",length($sequences{$tag})," extreme_end=$extreme_end\n" ; | |
907 my $downstream_gaps = (substr($sequences{$tag}, $extreme_end) =~ s/\-/-/g); | |
908 if (($extreme_start - $upstream_gaps )< $EDGE_DISTANCE || (length($sequences{$tag}) - $extreme_end - $downstream_gaps) < $EDGE_DISTANCE){ | |
909 $edge_microsat=1; | |
910 | |
911 last; | |
912 } | |
913 else{ | |
914 $up_flanks{$tag} = substr($sequences{$tag}, $extreme_start - $FLANK_SUPPORT, $FLANK_SUPPORT); | |
915 $down_flanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $FLANK_SUPPORT); | |
916 | |
917 $up_largeflanks{$tag} = substr($sequences{$tag}, $extreme_start - $COMPLEXITY_SUPPORT, $COMPLEXITY_SUPPORT); | |
918 $down_largeflanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $COMPLEXITY_SUPPORT); | |
919 | |
920 | |
921 $alignment{$tag} = substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1); | |
922 $locusandflanks{$tag} = $up_flanks{$tag}."[".$alignment{$tag}."]".$down_flanks{$tag}; | |
923 $locusandlargeflanks{$tag} = $up_largeflanks{$tag}."[".$alignment{$tag}."]".$down_largeflanks{$tag}; | |
924 | |
925 if ($microhash{$tag} ne "NULL"){ | |
926 $up_internal_flanks{$tag} = substr($sequences{$tag}, $extreme_start , $start_hash{$tag}-$extreme_start); | |
927 $down_internal_flanks{$tag} = substr($sequences{$tag}, $end_hash{$tag} , $extreme_end-$end_hash{$tag}); | |
928 $microsatstarts{$tag}=$start_hash{$tag}-$extreme_start; | |
929 # print "tag = $tag, internal flanks = $up_internal_flanks{$tag} and $down_internal_flanks{$tag} and start = $microsatstarts{$tag}\n" if $printer == 1; | |
930 } | |
931 else{ | |
932 $nonmicrosathash{$tag}=substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1); | |
933 | |
934 } | |
935 # print "up flank for species $tag = $up_flanks{$tag} \ndown flank for species $tag = $down_flanks{$tag} \n" if $printer == 1; | |
936 | |
937 } | |
938 | |
939 } | |
940 $nextcounter[11]++ if $edge_microsat==1; | |
941 next if $edge_microsat==1; | |
942 | |
943 | |
944 my $low_complexity = 0; #1 VALUE WILL BE 1 IF ANY OF THE FLANKING REGIONS | |
945 #2 IS FOUND TO BE OF LOW COMPLEXITY, BY USING THE | |
946 #3 FUNCTION sub test_complexity | |
947 | |
948 | |
949 for my $i (0 ... $#tags){ | |
950 # print "i = $tags[$i]\n" if $printer == 1; | |
951 if (test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW" || test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW"){ | |
952 # print "i = $i, low complexity regions: $up_largeflanks{$tags[$i]}: ",test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT), " and $down_largeflanks{$tags[$i]} = ",test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT),"\n" if $printer == 1; | |
953 $low_complexity =1; last; | |
954 } | |
955 } | |
956 | |
957 $nextcounter[12]++ if $low_complexity==1; | |
958 next if $low_complexity == 1; | |
959 | |
960 | |
961 my $sequence_dissimilarity = 0; #1 THIS VALYE WILL BE 1 IF THE SEQUENCE SIMILARITY | |
962 #2 BETWEEN ANY OF THE SPECIES AGAINST THE HUMAN | |
963 #3 FLANKING SEQUENCES IS BELOW A CERTAIN THRESHOLD | |
964 #4 AS DESCRIBED IN FUNCTION sub sequence_similarity | |
965 my %donepair = (); | |
966 for my $i (0 ... $#tags){ | |
967 # print "i = $tags[$i]\n" if $printer == 1; | |
968 # next if $i == 0; | |
969 # print colored ['magenta'],"THIS IS UP\n" if $printer == 1; | |
970 | |
971 for my $b (0 ... $#tags){ | |
972 next if $b == $i; | |
973 my $pair = (); | |
974 $pair = $i."_".$b if $i < $b; | |
975 $pair = $b."_".$i if $b < $i; | |
976 next if exists $donepair{$pair}; | |
977 my ($up_similarity,$upnucdiffs, $upindeldiffs) = sequence_similarity($up_flanks{$tags[$i]}, $up_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info); | |
978 my ($down_similarity,$downnucdiffs, $downindeldiffs) = sequence_similarity($down_flanks{$tags[$i]}, $down_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info); | |
979 $donepair{$pair} = $up_similarity."_".$down_similarity; | |
980 | |
981 # print RUN "$up_similarity $upnucdiffs $upindeldiffs $down_similarity $downnucdiffs $downindeldiffs\n"; | |
982 | |
983 if ( $up_similarity < $SIMILARITY_THRESH || $down_similarity < $SIMILARITY_THRESH){ | |
984 $sequence_dissimilarity =1; | |
985 last; | |
986 } | |
987 } | |
988 } | |
989 $nextcounter[13]++ if $sequence_dissimilarity==1; | |
990 | |
991 next if $sequence_dissimilarity == 1; | |
992 my ($simplified_microsat, $Hchrom, $Hstart, $Hend, $locusmotif, $locusmotifsize) = summarize_microsat($cluster, $humseq); | |
993 # print "simplified_microsat=$simplified_microsat\n"; | |
994 my ($tree_analysis, $conformation) = treeStudy($simplified_microsat); | |
995 # print "tree_analysis = $tree_analysis .. conformation=$conformation\n"; | |
996 #<STDIN>; | |
997 | |
998 # print SELECT "\"$conformation\"\t$tree_analysis\n"; | |
999 | |
1000 next if $tree_analysis =~ /DISCARD/; | |
1001 if (exists $treesToReject{$tree_analysis}){ | |
1002 $nextcounter[14]++; | |
1003 next; | |
1004 } | |
1005 | |
1006 # print "F: $no_of_species\n"; | |
1007 | |
1008 # my $adjuster=(); | |
1009 # if ($no_of_species == 4){ | |
1010 # my @sields = split(/\t/,$simplified_microsat); | |
1011 # my $somend = pop(@sields); | |
1012 # my $somestart = pop(@sields); | |
1013 # my $somechr = pop(@sields); | |
1014 # $adjuster = "NA\t" x 13 ; | |
1015 # $simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend; | |
1016 # } | |
1017 # if ($no_of_species == 3){ | |
1018 # my @sields = split(/\t/,$simplified_microsat); | |
1019 # my $somend = pop(@sields); | |
1020 # my $somestart = pop(@sields); | |
1021 # my $somechr = pop(@sields); | |
1022 # $adjuster = "NA\t" x 26 ; | |
1023 # $simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend; | |
1024 # } | |
1025 # | |
1026 $registeredTrees{$tree_analysis} = 1 if !exists $registeredTrees{$tree_analysis}; | |
1027 $registeredTrees{$tree_analysis}++ if exists $registeredTrees{$tree_analysis}; | |
1028 | |
1029 if (exists $treesToIgnore{$tree_analysis}){ | |
1030 my @appendarr = (); | |
1031 | |
1032 # print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; | |
1033 #print "SUMMARY ",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; | |
1034 # print SELECT $Hchrom,"\t",$Hstart,"\t",$Hend,"\t","NOEVENT", "\t\t", $cluster,"\n"; | |
1035 | |
1036 foreach my $lnode (@$lagestnodes){ | |
1037 my @pair = @$lnode; | |
1038 my @nodemutarr = (); | |
1039 for my $p (@pair){ | |
1040 my @mutinfoarray1 = (); | |
1041 for (1 ... 38){ | |
1042 # push (@mutinfoarray1, "NA") | |
1043 } | |
1044 # print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; | |
1045 # print join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; | |
1046 } | |
1047 | |
1048 } | |
1049 for (1 ... 38){ | |
1050 push (@appendarr, "NA") | |
1051 } | |
1052 # print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n"; | |
1053 # print join ("\t", @appendarr,"NULL", "NULL"),"\n"; | |
1054 # print "SUMMARY ",join ("\t", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>; | |
1055 next; | |
1056 } | |
1057 # print colored ['blue'],"cluster = $cluster\n"; | |
1058 | |
1059 my ($mutations_array, $nodes, $branches_hash, $alivehash, $primaryalignment) = peel_onion($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts); | |
1060 | |
1061 if ($mutations_array eq "NULL"){ | |
1062 # print "cluster = $cluster \n"; <STDIN>; | |
1063 my @appendarr = (); | |
1064 | |
1065 # print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize, "\t"; | |
1066 | |
1067 # foreach my $lnode (@$lagestnodes){ | |
1068 # my @pair = @$lnode; | |
1069 # my @nodemutarr = (); | |
1070 # for my $p (@pair){ | |
1071 # my @mutinfoarray1 = (); | |
1072 # for (1 ... 38){ | |
1073 # push (@mutinfoarray1, "NA") | |
1074 # } | |
1075 # print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; | |
1076 # print join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; | |
1077 # } | |
1078 # } | |
1079 # for (1 ... 38){ | |
1080 # push (@appendarr, "NA") | |
1081 # } | |
1082 # print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n"; | |
1083 # print join ("\t", @appendarr,"NULL", "NULL"),"\n"; | |
1084 # print join ("\t","SUMMARY", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>; | |
1085 next; | |
1086 } | |
1087 | |
1088 | |
1089 # print "sent: \n" if $printer == 1; | |
1090 # print "nodes = @$nodes, branches array:\n" if $mutations_array ne "NULL" && $printer == 1; | |
1091 | |
1092 my ($newstatus, $newmutations_array, $newnodes, $newbranches_hash, $newalivehash, $finalalignment) = fillAlignmentGaps($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts); | |
1093 # print "newmutations_array returned = \n",join("\n",@$newmutations_array),"\n" if $newmutations_array ne "NULL" && $printer == 1; | |
1094 my @finalmutations_array= (); | |
1095 @finalmutations_array = selectMutationArray($mutations_array, $newmutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array ne "NULL"; | |
1096 @finalmutations_array = selectMutationArray($mutations_array, $mutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array eq "NULL"; | |
1097 # print "alt = $alternate{$conformation}\n"; | |
1098 | |
1099 my ($besttree, $treescore) = selectBetterTree($tree_analysis, $alternate{$conformation}, \@finalmutations_array); | |
1100 my $cleancase = "UNCLEAN"; | |
1101 $cleancase = checkCleanCase($besttree, $finalalignment) if $treescore > 0 && $finalalignment ne "NULL" && $finalalignment =~ /\!/; | |
1102 $cleancase = checkCleanCase($besttree, $primaryalignment) if $treescore > 0 && $finalalignment eq "NULL" && $primaryalignment =~ /\!/ && $primaryalignment ne "NULL"; | |
1103 $cleancase = "CLEAN" if $finalalignment eq "NULL" && $primaryalignment !~ /\!/ && $primaryalignment ne "NULL"; | |
1104 $cleancase = "CLEAN" if $finalalignment ne "NULL" && $finalalignment !~ /\!/ ; | |
1105 # print "besttree = $besttree ... cleancase=$cleancase\n"; #<STDIN>; | |
1106 | |
1107 my @selects = ("-C","+C","-H","+H","-HC","+HC","-O","+O","-H.-C","-H.-O","-HC,+C","-HC,+H","-HC.-O","-HCO,+HC","-HCO,+O","-O.-C","-O.-H", | |
1108 "+C.+O","+H.+C","+H.+O","+HC,-C","+HC,-H","+HC.+O","+HCO,-C","+HCO,-H","+HCO,-HC","+HCO,-O","+O.+C","+O.+H","+H.+C.+O","-H.-C.-O","+HCO","-HCO"); | |
1109 next if (oneOf(@selects, $besttree) == 0); | |
1110 if ( ($besttree =~ /,/ || $besttree =~ /\./) && $cleancase eq "UNCLEAN"){ | |
1111 $besttree = "$besttree / $tree_analysis"; | |
1112 } | |
1113 | |
1114 $besttree = "NULL" if $treescore <= 0; | |
1115 | |
1116 while ($besttree =~ /[A-Z][A-Z]/){ | |
1117 $besttree =~ s/([A-Z])([A-Z])/$1:$2/g; | |
1118 } | |
1119 | |
1120 if ($besttree !~ /NULL/){ | |
1121 my @elements = ($besttree =~ /([A-Z])/g); | |
1122 | |
1123 foreach my $ele (@elements){ | |
1124 # print "replacing $ele with $backReplacementArrTag{$ele}\n"; | |
1125 $besttree =~ s/$ele/$backReplacementArrTag{$ele}/g if exists $backReplacementArrTag{$ele}; | |
1126 } | |
1127 } | |
1128 my $endendstate = $focalspec_orig.".".$Hchrom."\t".$Hstart."\t".$Hend."\t".$locusmotif."\t".$locusmotifsize."\t".$tree_analysis."\t"; | |
1129 next if $endendstate =~ /NA\tNA\tNA/; | |
1130 | |
1131 print SUMMARY $focalspec_orig,".",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t"; | |
1132 # print "SUMMARY\t", $focalspec_orig,".",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t",$tree_analysis,"\t" ; | |
1133 | |
1134 my @mutinfoarray =(); | |
1135 | |
1136 foreach my $lnode (@$lagestnodes){ | |
1137 my @pair = @$lnode; | |
1138 my $joint = "(".join(", ",@pair).")"; | |
1139 my @nodemutarr = (); | |
1140 | |
1141 for my $p (@pair){ | |
1142 foreach my $mut (@finalmutations_array){ | |
1143 $mut =~ /node=([A-Z, \(\)]+)/; | |
1144 push @nodemutarr, $mut if $p eq $1; | |
1145 } | |
1146 @mutinfoarray = summarizeMutations(\@nodemutarr, $besttree); | |
1147 | |
1148 # print SUMMARY join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t"; | |
1149 # print join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t"; | |
1150 } | |
1151 } | |
1152 | |
1153 # print "G: $no_of_species\n"; | |
1154 | |
1155 my @alignmentarr = (); | |
1156 | |
1157 foreach my $key (keys %alignment){ | |
1158 push @alignmentarr, $backReplacementArrTag{$key}.":".$alignment{$key}; | |
1159 | |
1160 } | |
1161 # print "alignmentarr = @alignmentarr"; <STDIN>; | |
1162 | |
1163 @mutinfoarray = summarizeMutations(\@finalmutations_array, $besttree); | |
1164 print SUMMARY join ("\t", @mutinfoarray ),"\t"; | |
1165 print SUMMARY join(",",@alignmentarr),"\n"; | |
1166 # print join("\t","--------------","\n",$besttree, join("",@tags)),"\n" if scalar(@tags) < 5; | |
1167 # <STDIN> if scalar(@tags) < 5; | |
1168 # print $cleancase, "\n"; | |
1169 # print join ("\t", @mutinfoarray,$cleancase,join(",",@alignmentarr)),"\n"; #<STDIN>; | |
1170 # print "summarized\n"; #<STDIN>; | |
1171 | |
1172 | |
1173 | |
1174 my %indelcatch = (); | |
1175 my %substcatch = (); | |
1176 my %typecatch = (); | |
1177 my %nodescatch = (); | |
1178 my $mutconcat = join("\t", @finalmutations_array)."\n"; | |
1179 my %indelposcatch = (); | |
1180 my %subsposcatch = (); | |
1181 | |
1182 foreach my $fmut ( @finalmutations_array){ | |
1183 # next if $fmut !~ /indeltype=[a-zA-Z]+/; | |
1184 #print RUN $fmut, "\n"; | |
1185 $fmut =~ /node=([a-zA-Z, \(\)]+)/; | |
1186 my $lnode = $1; | |
1187 $nodescatch{$1}=1; | |
1188 | |
1189 if ($fmut =~ /type=substitution/){ | |
1190 # print "fmut=$fmut\n"; | |
1191 $fmut =~ /from=([a-zA-Z\-]+)\tto=([a-zA-Z\-]+)/; | |
1192 my $from=$1; | |
1193 # print "from=$from\n"; | |
1194 my $to=$2; | |
1195 # print "to=$to\n"; | |
1196 push @{$substcatch{$lnode}} , ("from:".$from." to:".$to); | |
1197 $fmut =~ /position=([0-9]+)/; | |
1198 push @{$subsposcatch{$lnode}}, $1; | |
1199 } | |
1200 | |
1201 if ($fmut =~ /insertion=[a-zA-Z\-]+/){ | |
1202 $fmut =~ /insertion=([a-zA-Z\-]+)/; | |
1203 push @{$indelcatch{$lnode}} , $1; | |
1204 $fmut =~ /indeltype=([a-zA-Z]+)/; | |
1205 push @{$typecatch{$lnode}}, $1; | |
1206 $fmut =~ /position=([0-9]+)/; | |
1207 push @{$indelposcatch{$lnode}}, $1; | |
1208 } | |
1209 if ($fmut =~ /deletion=[a-zA-Z\-]+/){ | |
1210 $fmut =~ /deletion=([a-zA-Z\-]+)/; | |
1211 push @{$indelcatch{$lnode}} , $1; | |
1212 $fmut =~ /indeltype=([a-zA-Z]+)/; | |
1213 push @{$typecatch{$lnode}}, $1; | |
1214 $fmut =~ /position=([0-9]+)/; | |
1215 push @{$indelposcatch{$lnode}}, $1; | |
1216 } | |
1217 } | |
1218 | |
1219 # print $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t" if $printer == 1; | |
1220 # print join ("<\t>", @mutinfoarray),"\n" if $printer == 1; | |
1221 # print "where mutinfoarray = @mutinfoarray\n" if $printer == 1; | |
1222 # #print RUN "."; | |
1223 | |
1224 # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1; | |
1225 # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1; | |
1226 | |
1227 # print colored ['red'],"finalmutations_array=\n" if $printer == 1; | |
1228 # foreach (@finalmutations_array) { | |
1229 # print colored ['red'], "$_\n" if $_ =~ /type=substitution/ && $printer == 1 ; | |
1230 # print colored ['yellow'], "$_\n" if $_ !~ /type=substitution/ && $printer == 1 ; | |
1231 | |
1232 # }# if $line =~ /cal/;# && $line =~ /chr4/; | |
1233 | |
1234 # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1; | |
1235 # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1; | |
1236 # print "tree analysis = $tree_analysis\n" if $printer == 1; | |
1237 | |
1238 # my $mutations = "@$mutations_array"; | |
1239 | |
1240 | |
1241 next; | |
1242 for my $keys (@$nodes) {foreach my $key (@$keys){ | |
1243 #print "key = $key, => $branches_hash->{$key}\n"; | |
1244 } | |
1245 # print "x" x 50, "\n"; | |
1246 } | |
1247 my ($birth_steps, $death_steps) = decipher_history($mutations_array,join("",@tags),$nodes,$branches_hash,$tree_analysis,$conformation, $alivehash, $simplified_microsat); | |
1248 } | |
1249 } | |
1250 close BO; | |
1251 # print "now studying where we lost microsatellites:\n"; | |
1252 # print "x" x 60,"\n"; | |
1253 for my $reason (0 ... $#nextcounter){ | |
1254 # print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n"; | |
1255 } | |
1256 # print "x" x 60,"\n"; | |
1257 # print "In total we read $microreadcounter microsatellites after reading through $contigsentered contigs\n"; | |
1258 # print " we lost $keynotformedcounter contigs as they did not form the key, \n"; | |
1259 # print "$contignotrightcounter contigs as they were not of the right species configuration\n"; | |
1260 # print "$keynotfoundcounter contigs as they did not contain the microsats\n"; | |
1261 # print "... In total we went through a file that had $dotcounter contigs...\n"; | |
1262 # print join ("\n","remaining orth keys = ", (keys %orths),""); | |
1263 # print "------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ \n"; | |
1264 # print "now printing counted trees: \n"; | |
1265 if (scalar(keys %registeredTrees) > 0){ | |
1266 foreach my $keyb ( sort (keys %registeredTrees) ) | |
1267 { | |
1268 # print "$keyb : $registeredTrees{$keyb}\n"; | |
1269 } | |
1270 } | |
1271 | |
1272 | |
1273 } | |
1274 close SUMMARY; | |
1275 | |
1276 my @summarizarr = ("+C=+C +R.+C -HCOR,+C", | |
1277 "+H=+H +R.+H -HCOR,+H", | |
1278 "-C=-C -R.-C +HCOR,-C", | |
1279 "-H=-H -R.-H +HCOR,-H", | |
1280 "+HC=+HC", | |
1281 "-HC=-HC", | |
1282 "+O=+O -HCOR,+O", | |
1283 "-O=-O +HCOR,-O", | |
1284 "+HCO=+HCO", | |
1285 "-HCO=-HCO", | |
1286 "+R=+R +R.+C +R.+H", | |
1287 "-R=-R -R.-C -R.-H"); | |
1288 | |
1289 foreach my $line (@summarizarr){ | |
1290 next if $line !~ /[A-Za-z0-9]/; | |
1291 # print $line; | |
1292 chomp $line; | |
1293 my @fields = split(/=/,$line); | |
1294 # print "title = $fields[0]\n"; | |
1295 my @parts=split(/ +/, $fields[1]); | |
1296 my %partshash = (); | |
1297 foreach my $part (@parts){$partshash{$part}=1;} | |
1298 my $count=0; | |
1299 foreach my $key ( sort keys %registeredTrees ){ | |
1300 next if !exists $partshash{$key}; | |
1301 # print "now adding $registeredTrees{$key} from $key\n"; | |
1302 $count+=$registeredTrees{$key}; | |
1303 } | |
1304 # print "$fields[0] : $count\n"; | |
1305 } | |
1306 my $rootdir = $dir; | |
1307 $rootdir =~ s/\/[A-Za-z0-9\-_]+$//; | |
1308 chdir $rootdir; | |
1309 remove_tree($dir); | |
1310 | |
1311 | |
1312 #-------------------------------------------------------------------------------------------------------- | |
1313 #-------------------------------------------------------------------------------------------------------- | |
1314 #-------------------------------------------------------------------------------------------------------- | |
1315 #-------------------------------------------------------------------------------------------------------- | |
1316 #-------------------------------------------------------------------------------------------------------- | |
1317 sub largest_number{ | |
1318 my $counter = 0; | |
1319 my($max) = shift(@_); | |
1320 foreach my $temp (@_) { | |
1321 | |
1322 #print "finding largest array: $maxcounter \n"; | |
1323 if($temp > $max){ | |
1324 $max = $temp; | |
1325 } | |
1326 } | |
1327 return($max); | |
1328 } | |
1329 | |
1330 sub smallest_number{ | |
1331 my $counter = 0; | |
1332 my($min) = shift(@_); | |
1333 foreach my $temp (@_) { | |
1334 #print "finding largest array: $maxcounter \n"; | |
1335 if($temp < $min){ | |
1336 $min = $temp; | |
1337 } | |
1338 } | |
1339 return($min); | |
1340 } | |
1341 #-------------------------------------------------------------------------------------------------------- | |
1342 #-------------------------------------------------------------------------------------------------------- | |
1343 #-------------------------------------------------------------------------------------------------------- | |
1344 #-------------------------------------------------------------------------------------------------------- | |
1345 #-------------------------------------------------------------------------------------------------------- | |
1346 sub baseml_parser{ | |
1347 my $outputfile = $_[0]; | |
1348 open(BOUT,"<$outputfile") or die "Cannot open output of upstream baseml $outputfile: $!"; | |
1349 my @info = (); | |
1350 my @branchields = (); | |
1351 my @distanceields = (); | |
1352 my @bout = <BOUT>; | |
1353 #print colored ['red'], @bout ,"\n"; | |
1354 for my $b (0 ... $#bout){ | |
1355 my $bine=$bout[$b]; | |
1356 #print colored ['yellow'], "sentence = ",$bine; | |
1357 if ($bine =~ /TREE/){ | |
1358 $bine=$bout[$b++]; | |
1359 $bine=$bout[$b++]; | |
1360 $bine=$bout[$b++]; | |
1361 #print "FOUND",$bine; | |
1362 chomp $bine; | |
1363 $bine =~ s/^\s+//g; | |
1364 @branchields = split(/\s+/,$bine); | |
1365 $bine=$bout[$b++]; | |
1366 chomp $bine; | |
1367 $bine =~ s/^\s+//g; | |
1368 @distanceields = split(/\s+/,$bine); | |
1369 #print "LASTING..............\n"; | |
1370 last; | |
1371 } | |
1372 else{ | |
1373 } | |
1374 } | |
1375 | |
1376 close BOUT; | |
1377 # print "branchfields = @branchields and distanceields = @distanceields\n" if $printer == 1; | |
1378 my %distance_hash=(); | |
1379 for my $d (0 ... $#branchields){ | |
1380 $distance_hash{$branchields[$d]} = $distanceields[$d]; | |
1381 } | |
1382 | |
1383 $info[0] = $distance_hash{"9..1"} + $distance_hash{"9..2"}; | |
1384 $info[1] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+ $distance_hash{"8..3"}; | |
1385 $info[2] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+$distance_hash{"7..8"}+$distance_hash{"7..4"}; | |
1386 $info[3] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+$distance_hash{"7..8"}+$distance_hash{"6..7"}+$distance_hash{"6..5"}; | |
1387 | |
1388 # print "\nsending back: @info\n" if $printer == 1; | |
1389 | |
1390 return join("\t",@info); | |
1391 | |
1392 } | |
1393 | |
1394 | |
1395 #-------------------------------------------------------------------------------------------------------- | |
1396 #-------------------------------------------------------------------------------------------------------- | |
1397 #-------------------------------------------------------------------------------------------------------- | |
1398 #-------------------------------------------------------------------------------------------------------- | |
1399 #-------------------------------------------------------------------------------------------------------- | |
1400 sub test_complexity{ | |
1401 my $printer = 0; | |
1402 my $sequence = $_[0]; | |
1403 #print "sequence = $sequence\n"; | |
1404 my $COMPLEXITY_SUPPORT = $_[1]; | |
1405 my $complexity=int($COMPLEXITY_SUPPORT * (1/40)); #1 THIS IS AN ARBITRARY THRESHOLD SET FOR LOW COMPLEXITY. | |
1406 #2 THE INSPIRATION WAS WEB MILLER'S MAIL SENT ON | |
1407 #3 19 Apr 2008 WHERE HE CLASSED AS HIGH COMPLEXITY | |
1408 #4 REGION, IF 40 BP OF SEQUENCE HAS AT LEAST 3 OF | |
1409 #5 EACH NUCLEOTIDE. HENCE, I NORMALIZE THIS PARAMETER | |
1410 #6 FOR THE ACTUAL LENGTH OF $FLANK_SUPPORT SET BY | |
1411 #7 THE USER. | |
1412 #8 WEB MILLER SENT THE MAIL TO YDK104@PSU.EDU | |
1413 | |
1414 | |
1415 | |
1416 my $As = ($sequence=~ s/A/A/gi); | |
1417 my $Ts = ($sequence=~ s/T/T/gi); | |
1418 my $Gs = ($sequence=~ s/G/G/gi); | |
1419 my $Cs = ($sequence=~ s/C/C/gi); | |
1420 my $dashes = ($sequence=~ s/\-/-/gi); | |
1421 $dashes = 0 if $sequence !~ /\-/; | |
1422 # print "seq = $sequence, As=$As, Ts=$Ts, Gs=$Gs, Cs=$Cs, dashes=$dashes\n"; | |
1423 return "LOW" if $dashes > length($sequence)/2; | |
1424 | |
1425 my $ans = (); | |
1426 | |
1427 return "HIGH" if $As >= $complexity && $Ts >= $complexity && $Cs >= $complexity && $Gs >= $complexity; | |
1428 | |
1429 my @nts = ("A","T","G","C","-"); | |
1430 | |
1431 my $lowcomplex = 0; | |
1432 | |
1433 foreach my $nt (@nts){ | |
1434 $lowcomplex =1 if $sequence =~ /(($nt\-*){$mono_flanksimplicityRepno,})/i; | |
1435 $lowcomplex =1 if $sequence =~ /(($nt[A-Za-z]){$di_flanksimplicityRepno,})/i; | |
1436 $lowcomplex =1 if $sequence =~ /(([A-Za-z]$nt){$di_flanksimplicityRepno,})/i; | |
1437 my $nont = ($sequence=~ s/$nt/$nt/gi); | |
1438 $lowcomplex = 1 if $nont > (length($sequence) * $prop_of_seq_allowedtoAT) && ($nt =~ /[AT\-]/); | |
1439 $lowcomplex = 1 if $nont > (length($sequence) * $prop_of_seq_allowedtoCG) && ($nt =~ /[CG]/); | |
1440 } | |
1441 # print "leaving for now.. $sequence\n" if $printer == 1 && $lowcomplex == 0; | |
1442 #<STDIN>; | |
1443 return "HIGH" if $lowcomplex == 0; | |
1444 return "LOW" ; | |
1445 } | |
1446 #-------------------------------------------------------------------------------------------------------- | |
1447 #-------------------------------------------------------------------------------------------------------- | |
1448 #-------------------------------------------------------------------------------------------------------- | |
1449 #-------------------------------------------------------------------------------------------------------- | |
1450 #-------------------------------------------------------------------------------------------------------- | |
1451 sub sequence_similarity{ | |
1452 my $printer = 0; | |
1453 my @seq1 = split(/\s*/, $_[0]); | |
1454 my @seq2 = split(/\s*/, $_[1]); | |
1455 my $similarity_thresh = $_[2]; | |
1456 my $info = $_[3]; | |
1457 # print "input = @_\n" if $printer == 1; | |
1458 my $seq1str = $_[0]; | |
1459 my $seq2str = $_[1]; | |
1460 $seq1str=~s/\-//g; $seq2str=~s/\-//g; | |
1461 my $similarity=0; | |
1462 | |
1463 my $nucdiffs=0; | |
1464 my $nucsims=0; | |
1465 my $indeldiffs=0; | |
1466 | |
1467 for my $i (0...$#seq1){ | |
1468 $similarity++ if $seq1[$i] =~ /$seq2[$i]/i ; #|| $seq1[$i] =~ /\-/i || $seq2[$i] =~ /\-/i ; | |
1469 $nucsims++ if $seq1[$i] =~ /$seq2[$i]/i && ($seq1[$i] =~ /[a-zA-Z]/i && $seq2[$i] =~ /[a-zA-Z]/i); | |
1470 $nucdiffs++ if $seq1[$i] !~ /$seq2[$i]/i && ($seq1[$i] =~ /[a-zA-Z]/i && $seq2[$i] =~ /[a-zA-Z]/i); | |
1471 $indeldiffs++ if $seq1[$i] !~ /$seq2[$i]/i && $seq1[$i] =~ /\-/i || $seq2[$i] =~ /\-/i; | |
1472 } | |
1473 my $sim = $similarity/length($_[0]); | |
1474 return ( $sim, $nucdiffs, $indeldiffs ); #<= $similarity_thresh; | |
1475 } | |
1476 #-------------------------------------------------------------------------------------------------------- | |
1477 #-------------------------------------------------------------------------------------------------------- | |
1478 #-------------------------------------------------------------------------------------------------------- | |
1479 | |
1480 sub load_treesToReject{ | |
1481 my @rejectlist = (); | |
1482 my $alltags = join("",@_); | |
1483 @rejectlist = qw (-HCOR +HCOR) if $alltags eq "HCORM"; | |
1484 @rejectlist = qw ( -HCO|+R +HCO|-R) if $alltags eq "HCOR"; | |
1485 @rejectlist = qw ( -HC|+O +HC|-O) if $alltags eq "HCO"; | |
1486 | |
1487 %treesToReject=(); | |
1488 $treesToReject{$_} = $_ foreach (@rejectlist); | |
1489 #print "loaded to reject for $alltags; ", $treesToReject{$_},"\n" foreach (@rejectlist); #<STDIN>; | |
1490 } | |
1491 #-------------------------------------------------------------------------------------------------------- | |
1492 sub load_treesToIgnore{ | |
1493 my @rejectlist = (); | |
1494 my $alltags = join("",@_); | |
1495 @rejectlist = qw (-HCOR +HCOR +HCORM -HCORM) if $alltags eq "HCORM"; | |
1496 @rejectlist = qw ( -HCO|+R +HCO|-R +HCOR -HCOR) if $alltags eq "HCOR"; | |
1497 @rejectlist = qw ( -HC|+O +HC|-O +HCO -HCO) if $alltags eq "HCO"; | |
1498 | |
1499 %treesToIgnore=(); | |
1500 $treesToIgnore{$_} = $_ foreach (@rejectlist); | |
1501 #print "loaded ", $treesToIgnore{$_},"\n" foreach (@rejectlist); | |
1502 } | |
1503 #-------------------------------------------------------------------------------------------------------- | |
1504 sub load_thresholds{ | |
1505 my @threshold_array=split(/[,_]/,$_[0]); | |
1506 unshift @threshold_array, "0"; | |
1507 for my $size (1 ... 4){ | |
1508 $thresholdhash{$size}=$threshold_array[$size]; | |
1509 } | |
1510 } | |
1511 #-------------------------------------------------------------------------------------------------------- | |
1512 sub load_allPossibleTrees{ | |
1513 #1 THIS FILE STORES ALL POSSIBLE SCENARIOS OF MICROSATELLITE | |
1514 #2 BIRTH AND DEATH EVENTS ON A 5-PRIMATE TREE OF H,C,O,R,M | |
1515 #3 IN FORM OF A TEXT FILE. THIS WILL BE USED AS A TEMPLET | |
1516 #4 TO COMPARE EACH MICROSATELLITE CLUSTER TO UNDERSTAND THE | |
1517 #5 EVOLUTION OF EACH LOCUS. WE WILL THEN DISCARD SOME | |
1518 #6 MICROSATS ACCRODING TO THEIR EVOLUTIONARY BEHAVIOUR ON | |
1519 #7 THE TREE. MOST PROBABLY WE WILL REMOVE THOSE MICROSATS | |
1520 #8 THAT ARE NOT SUFFICIENTLY INFORMATIVE, LIKE IN CASE OF | |
1521 #9 AN OUTGROUP MICROSATELLITE BEING DIFFERENT FRON ALL OTHER | |
1522 #10 SPECIES IN THE TREE. | |
1523 my $tree_list = $_[0]; | |
1524 # print "file to be loaded: $tree_list\n"; | |
1525 | |
1526 my @trarr = (); | |
1527 @trarr = ("#H C O CONCLUSION ALTERNATE", | |
1528 "+ + + +HCO NA", | |
1529 "+ _ _ +H NA", | |
1530 "_ + _ +C NA", | |
1531 "_ _ + -HC|+O NA", | |
1532 "+ _ + -C +H", | |
1533 "_ + + -H +C", | |
1534 "+ + _ +HC|-O NA", | |
1535 "_ _ _ -HCO NA") if $tree_list =~ /_HCO\.txt/; | |
1536 @trarr = ("#H C O R CONCLUSION ALTERNATE", | |
1537 "_ _ _ _ -HCOR NA", | |
1538 "+ + + + +HCOR NA", | |
1539 "+ + + _ +HCO|-R +H.+C.+O", | |
1540 "+ + _ _ +HC +H.+C;-O", | |
1541 "+ _ _ _ +H +HC,-C;+HC,-C", | |
1542 "_ + _ _ +C +HC,-H;+HC,-H", | |
1543 "_ _ + _ +O -HC|-H.-C", | |
1544 "_ _ + + -HC -H.-C", | |
1545 "+ _ _ + +H|-C.-O +HC,-C", | |
1546 "_ + _ + +C -H.-O", | |
1547 "_ + + _ -H +C.+O", | |
1548 "_ _ _ + -HCO|+R NA", | |
1549 "+ _ + _ +H.+O|-C NA", | |
1550 "_ + + + -H -HC,+C", | |
1551 "+ _ + + -C -HC,+H", | |
1552 "+ + _ + -O +HC") if $tree_list =~ /_HCOR\.txt/; | |
1553 | |
1554 @trarr = ("#H C O R M CONCLUSION ALTERNATE", | |
1555 "+ + _ + + -O -HCO,+HC|-HCO,+HC;-HCO,(+H.+C)", | |
1556 "+ _ + + + -C -HC,+H;+HCO,(+H.+O)", | |
1557 "_ + + + + -H -HC,+C;-HCO,(+C.+O)", | |
1558 "_ _ + _ _ +O +HCO,-HC;+HCO,(-H.-C)", | |
1559 "_ + _ _ _ +C +HC,-H;+HCO,(-H.-O)", | |
1560 "+ _ _ _ _ +H +HC,-C;+HCO,(-C.-O)", | |
1561 "+ + + _ _ +HCO +H.+C.+O", | |
1562 "_ _ _ + + -HCO -HC.-O;-H.-C.-O", | |
1563 "+ _ _ + + -O.-C|-HCO,+H +R.+H;-HCO,(+R.+H)", | |
1564 "_ + _ + + -O.-H|-HCO,+C +R.+C;-HCO,(+R.+C)", | |
1565 "_ + + _ _ +HCO,-H|+O.+C NA", | |
1566 "+ _ + _ _ +HCO,-C|+O.+H NA", | |
1567 "_ _ + + + -HC -H.-C|-HCO,+O", | |
1568 "+ + _ _ _ +HC +H.+C|+HCO,-O|-HCO,+HC;-HCO,(+H.+C)", | |
1569 "+ + + + + +HCORM NA", | |
1570 "_ _ + _ + DISCARD +O;+HCO,-HC;+HCO,(-H.-C)", | |
1571 "_ + _ _ + +C +HC,-H;+HCO,(-H.-O)", | |
1572 "+ _ _ _ + +H +HC,-C;+HCO,(-C.-O)", | |
1573 "+ + _ _ + +HC -R.-O|+HCO,-O|+H.+C;-HCO,+HC;-HCO,(+H.+C)", | |
1574 "+ _ + _ + DISCARD -R.-C|+HCO,-C|+H.+O NA", | |
1575 "_ + + _ + DISCARD -R.-H|+HCO,-H|+C.+O NA", | |
1576 "_ _ _ _ + DISCARD -HCOR NA", | |
1577 "_ _ _ + _ DISCARD +R;-HC.-O;-H.-C.-O", | |
1578 "+ + _ + _ -O +R.+HC|-HCO,+HC;+H.+C.+R|-HCO,(+H.+C)", | |
1579 "+ + + + _ +HCOR NA", | |
1580 "+ + + _ + DISCARD -R;+HCO;+HC.+O;+H.+C.+O", | |
1581 "+ _ + + _ -C -HC,+H;+H.+O.+R|-HCO,(+H.+O)", | |
1582 "_ + + + _ -H -HC,+C;+C.+O.+R|-HCO,(+C.+O)", | |
1583 "_ _ + + _ -HC +R.+O|-HCO,+O|+HCO,-HC", | |
1584 "_ + _ + _ +C +R.+C|-HCO,+C|-HC,+C +HCO,(-H.-O)", | |
1585 "+ _ _ + _ +H +R.+H|-C.-O +HCO,(-C.-O)" | |
1586 ) if $tree_list =~ /_HCORM\.txt/; | |
1587 | |
1588 | |
1589 my $template_p = $_[1]; | |
1590 my $alternate_p = $_[2]; | |
1591 #1 THIS IS THE HASH IN WHICH INFORMATION FROM THE ABOVE FILE | |
1592 #2 GETS STORED, USING THE WHILE LOOP BELOW. HERE, THE KEY | |
1593 #3 OF EACH ROW IS THE EVOLUTIONARY CONFIGURATION OF A LOCUS | |
1594 #4 ON THE PRIMATE TREE, BASED ON PRESENCE/ABSENCE OF A MICROSAT | |
1595 #5 AT THAT LOCUS, LIKE SAY "+ + + _ _" .. EACH COLUMN BELONGS | |
1596 #6 TO ONE SPECIES; HERE THE COLUMN NAMES ARE "H C O R M". | |
1597 #7 THE VALUE FOR EACH ENTRY IS THE MEANING OF THE ABOVE | |
1598 #8 CONFIGURATION (I.E., CONFIGURAION OF THE KEY. HERE, THE | |
1599 #9 VALUE WILL BE +HCO, SIGNIFYING A BIRTH IN HUMAN-CHIMP-ORANG | |
1600 #10 COMMON ANCESTOR. THIS HASH HAS BEEN LOADED HERE TO BE USED | |
1601 #11 LATER BY THE SUBROUTINE sub treeStudy{} THAT STUDIES | |
1602 #12 EVOLUTIONARY CONFIGURAION OF EACH MICROSAT LOCUS, AS | |
1603 #13 MENTIONED ABOVE. | |
1604 my @keys_array=(); | |
1605 foreach my $line (@trarr){ | |
1606 # print $line,"\n"; | |
1607 next if $line =~ /^#/; | |
1608 chomp $line; | |
1609 my @fields = split("\t", $line); | |
1610 push @keys_array, $fields[0]; | |
1611 # print "loading: $fields[0]\n"; | |
1612 $template_p->{$fields[0]}[0] = $fields[1]; | |
1613 $template_p->{$fields[0]}[1] = 0; | |
1614 $alternate_p->{$fields[0]} = $fields[2]; | |
1615 # $alternate_p->{$fields[1]} = $fields[2]; | |
1616 # print "loading alternate_p $fields[1] $fields[2]\n"; #<STDIN> if $fields[1] eq "+H"; | |
1617 } | |
1618 # print "loaded the trees with keys: @keys_array\n"; | |
1619 return $template_p, \@keys_array, $alternate_p; | |
1620 } | |
1621 | |
1622 #-------------------------------------------------------------------------------------------------------- | |
1623 #-------------------------------------------------------------------------------------------------------- | |
1624 #-------------------------------------------------------------------------------------------------------- | |
1625 #-------------------------------------------------------------------------------------------------------- | |
1626 #-------------------------------------------------------------------------------------------------------- | |
1627 sub checkCleanCase{ | |
1628 my $printer = 0; | |
1629 my $tree = $_[0]; | |
1630 my $finalalignment = $_[1]; | |
1631 | |
1632 #print "IN checkCleanCase: @_\n"; | |
1633 #<STDIN>; | |
1634 my @indivspecies = $tree =~ /[A-Z]/g; | |
1635 $finalalignment =~ s/\./_/g; | |
1636 my @captured = $finalalignment =~ /[A-Za-z, \(\):]+\![:A-Za-z, \(\)]/g; | |
1637 | |
1638 my $unclean = 0; | |
1639 | |
1640 foreach my $sp (@indivspecies){ | |
1641 foreach my $cap (@captured){ | |
1642 $cap =~ s/:[A-Za-z\-]+//g; | |
1643 my @sps = $cap =~ /[A-Z]+/g; | |
1644 my $spsc = join("", @sps); | |
1645 # print "checking whether imp species $sp is present in $cap i.e, in $spsc\n " if $printer == 1; | |
1646 if ($spsc =~ /$sp/){ | |
1647 # print "foind : $sp\n"; | |
1648 $unclean = 1; last; | |
1649 } | |
1650 } | |
1651 last if $unclean == 1; | |
1652 } | |
1653 #<STDIN>; | |
1654 return "CLEAN" if $unclean == 0; | |
1655 return "UNCLEAN"; | |
1656 } | |
1657 | |
1658 #-------------------------------------------------------------------------------------------------------- | |
1659 #-------------------------------------------------------------------------------------------------------- | |
1660 #-------------------------------------------------------------------------------------------------------- | |
1661 #-------------------------------------------------------------------------------------------------------- | |
1662 #-------------------------------------------------------------------------------------------------------- | |
1663 | |
1664 | |
1665 sub adjustCoordinates{ | |
1666 my $line = $_[0]; | |
1667 return 0 if !defined $line; | |
1668 #print "------x------x------x------x------x------x------x------x------\n"; | |
1669 #print $line,"\n\n"; | |
1670 my $no_of_species = $line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)|(scaffold[0-9a-zA-Z\._\-]+)|(supercontig[0-9a-zA-Z\._\-]+)/x/ig; | |
1671 #print $line,"\n"; | |
1672 #print "------x------x------x------x------x------x------x------x------\n\n\n"; | |
1673 # my @got = ($line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)/x/g); | |
1674 # print "line = $line\n"; | |
1675 $infocord = 2 + (4*$no_of_species) - 1; | |
1676 $typecord = 2 + (4*$no_of_species) + 1 - 1; | |
1677 $motifcord = 2 + (4*$no_of_species) + 2 - 1; | |
1678 $gapcord = $motifcord+1; | |
1679 $startcord = $gapcord+1; | |
1680 $strandcord = $startcord+1; | |
1681 $endcord = $strandcord + 1; | |
1682 $microsatcord = $endcord + 1; | |
1683 $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; | |
1684 $interr_poscord = $microsatcord + 3; | |
1685 $no_of_interruptionscord = $microsatcord + 4; | |
1686 $interrcord = $microsatcord + 2; | |
1687 # print "$line\n startcord = $startcord, and endcord = $endcord and no_of_species = $no_of_species\n" if $line !~ /calJac/i; | |
1688 | |
1689 return $no_of_species; | |
1690 } | |
1691 | |
1692 | |
1693 sub printhash{ | |
1694 my $alivehash = $_[0]; | |
1695 my @tags = @$_[1]; | |
1696 # print "print hash\n"; | |
1697 foreach my $tag (@tags){ | |
1698 # print "$tag=",$alivehash->{$tag},"\n" if exists $alivehash->{$tag}; | |
1699 } | |
1700 | |
1701 return "\n" | |
1702 } | |
1703 sub peel_onion{ | |
1704 my $printer = 0; | |
1705 # print "received: @_\n" ; #<STDIN>; | |
1706 $printer = 0; | |
1707 my ($tree, $sequences, $alignment, $tagarray, $microsathash, $nonmicrosathash, $motif, $tree_analysis, $threshold, $microsatstarts) = @_; | |
1708 # print "in peel onion.. tree = $tree \n" if $printer == 1; | |
1709 my %sequence_hash=(); | |
1710 | |
1711 | |
1712 # for my $i (0 ... $#sequences){ $sequence_hash{$species[$i]}=$sequences->[$i]; } | |
1713 | |
1714 | |
1715 my %node_sequences=(); | |
1716 | |
1717 my %node_alignments = (); #NEW, Nov 28 2008 | |
1718 my @tags=(); | |
1719 my @locus_sequences=(); | |
1720 my %alivehash=(); | |
1721 foreach my $tag (@$tagarray) { | |
1722 #print "adding: $tag\n"; | |
1723 push(@tags, $tag); | |
1724 $node_sequences{$tag}=join ".",split(/\s*/,$microsathash->{$tag}) if $microsathash->{$tag} ne "NULL"; | |
1725 $alivehash{$tag}= $tag if $microsathash->{$tag} ne "NULL"; | |
1726 $node_sequences{$tag}=join ".",split(/\s*/,$nonmicrosathash->{$tag}) if $microsathash->{$tag} eq "NULL"; | |
1727 $node_alignments{$tag}=join ".",split(/\s*/,$alignment->{$tag}) ; | |
1728 push @locus_sequences, $node_sequences{$tag}; | |
1729 # print "adding to node_seq: $tag = ",$node_alignments{$tag},"\n"; | |
1730 } | |
1731 | |
1732 #<STDIN>; | |
1733 | |
1734 my ($nodes_arr, $branches_hash) = get_nodes($tree); | |
1735 my @nodes=@$nodes_arr; | |
1736 # print "recieved nodes = " if $printer == 1; | |
1737 # foreach my $key (@nodes) {print "@$key " if $printer == 1;} | |
1738 | |
1739 # print "\n" if $printer == 1; | |
1740 | |
1741 #POPULATE branches_hash WITH INFORMATION ABOUT LIVESTATUS | |
1742 foreach my $keys (@nodes){ | |
1743 my @pair = @$keys; | |
1744 my $joint = "(".join(", ",@pair).")"; | |
1745 my $copykey = join "", @pair; | |
1746 $copykey =~ s/[\W ]+//g; | |
1747 # print "for node: $keys, copykey = $copykey and joint = $joint\n" if $printer == 1; | |
1748 my $livestatus = 1; | |
1749 foreach my $copy (split(/\s*/,$copykey)){ | |
1750 $livestatus = 0 if !exists $alivehash{$copy}; | |
1751 } | |
1752 $alivehash{$joint} = $joint if !exists $alivehash{$joint} && $livestatus == 1; | |
1753 # print "alivehash = $alivehash{$joint}\n" if exists $alivehash{$joint} && $printer == 1; | |
1754 } | |
1755 | |
1756 @nodes = reverse(@nodes); #1 THIS IS IN ORDER TO GO THROUGH THE TREE FROM LEAVES TO ROOT. | |
1757 | |
1758 my @mutations_array=(); | |
1759 | |
1760 my $joint = (); | |
1761 foreach my $node (@nodes){ | |
1762 my @pair = @$node; | |
1763 # print "now in the nodes for loop, pair = @pair\n and sequences=\n" if $printer == 1; | |
1764 $joint = "(".join(", ",@pair).")"; | |
1765 my @pair_sequences=(); | |
1766 | |
1767 foreach my $tag (@pair){ | |
1768 # print "$tag: $node_alignments{$tag}\n" if $printer == 1; | |
1769 # print $node_alignments{$tag},"\n" if $printer == 1; | |
1770 push @pair_sequences, $node_alignments{$tag}; | |
1771 } | |
1772 # print "ppeel onion joint = $joint , pair_sequences=>@pair_sequences< , pair=>@pair<\n" if $printer == 1; | |
1773 | |
1774 my ($compared, $substitutions_list) = base_by_base_simple($motif,\@pair_sequences, scalar(@pair_sequences), @pair, $joint); | |
1775 $node_alignments{$joint}=$compared; | |
1776 push( @mutations_array,split(/:/,$substitutions_list)); | |
1777 # print "newly added to node_sequences: $node_alignments{$joint} and list of mutations =\n", join("\n",@mutations_array),"\n" if $printer == 1; | |
1778 } | |
1779 | |
1780 | |
1781 my $analayzed_mutations = analyze_mutations(\@mutations_array, \@nodes, $branches_hash, $alignment, \@tags, \%alivehash, \%node_sequences, $microsatstarts, $motif); | |
1782 | |
1783 return ($analayzed_mutations, \@nodes, $branches_hash, \%alivehash, $node_alignments{$joint}) if scalar @mutations_array > 0; | |
1784 return ("NULL",\@nodes,$branches_hash, \%alivehash, "NULL") if scalar @mutations_array == 0; | |
1785 } | |
1786 | |
1787 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# | |
1788 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# | |
1789 | |
1790 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# | |
1791 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# | |
1792 | |
1793 sub get_nodes{ | |
1794 my $printer = 0; | |
1795 | |
1796 my $tree=$_[0]; | |
1797 #$tree =~ s/ +//g; | |
1798 $tree =~ s/\t+//g; | |
1799 $tree=~s/;//g; | |
1800 # print "tree=$tree\n" if $printer == 1; | |
1801 my @nodes = (); | |
1802 my @onions=($tree); | |
1803 my %branches=(); | |
1804 foreach my $bite (@onions){ | |
1805 $bite=~ s/^\(|\)$//g; | |
1806 chomp $bite; | |
1807 # print "tree = $bite \n"; | |
1808 # <STDIN>; | |
1809 $bite=~ /([ ,\(\)A-Z]+)\,\s*([ ,\(\)A-Z]+)/; | |
1810 #$tree =~ /(\(\(\(H, C\), O\), R\))\, (M)/; | |
1811 my @raw_nodes = ($1, $2); | |
1812 # print "raw nodes = $1 and $2\n" if $printer == 1; | |
1813 push(@nodes, [@raw_nodes]); | |
1814 foreach my $node (@raw_nodes) {push (@onions, $node) if $node =~ /,/;} | |
1815 foreach my $node (@raw_nodes) {$branches{$node}="(".$bite.")"; } | |
1816 # print "onions = @onions\n" if $printer == 1;<STDIN> if $printer == 1; | |
1817 } | |
1818 $printer = 0; | |
1819 return \@nodes, \%branches; | |
1820 } | |
1821 | |
1822 | |
1823 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# | |
1824 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# | |
1825 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# | |
1826 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# | |
1827 sub analyze_mutations{ | |
1828 my ($mutations_array, $nodes, $branches_hash, $alignment, $tags, $alivehash, $node_sequences, $microsatstarts, $motif) = @_; | |
1829 my $locuslength = length($alignment->{$tags->[0]}); | |
1830 my $printer = 0; | |
1831 | |
1832 | |
1833 # print " IN analyzed_mutations....\n" if $printer == 1; # \n mutations array = @$mutations_array, \nAND locuslength = $locuslength\n" if $printer == 1; | |
1834 my %mutation_hash=(); | |
1835 my %froms_megahash=(); | |
1836 my %tos_megahash=(); | |
1837 my %position_hash=(); | |
1838 my @solutions_array=(); | |
1839 foreach my $mutation (@$mutations_array){ | |
1840 # print "loadin mutation: $mutation\n" if $printer == 1; | |
1841 my %localhash= $mutation =~ /([\S ]+)=([\S ]+)/g; | |
1842 $mutation_hash{$localhash{"position"}} = {%localhash}; | |
1843 push @{$position_hash{$localhash{"position"}}},$localhash{"node"}; | |
1844 # print "feeding position hash with $localhash{position}: $position_hash{$localhash{position}}[0]\n" if $printer == 1; | |
1845 $froms_megahash{$localhash{"position"}}{$localhash{"node"}}=$localhash{"from"}; | |
1846 $tos_megahash{$localhash{"position"}}{$localhash{"node"}}=$localhash{"to"}; | |
1847 # print "just a trial: $mutation_hash{$localhash{position}}{position}\n" if $printer == 1; | |
1848 # print "loadin in tos_megahash: $localhash{position} {$localhash{node} = $localhash{to}\n" if $printer == 1; | |
1849 # print "loadin in from: $localhash{position} {$localhash{node} = $localhash{from}\n" if $printer == 1; | |
1850 } | |
1851 | |
1852 # print "now going through each position in loculength:\n" if $printer == 1; | |
1853 ## <STDIN> if $printer == 1; | |
1854 | |
1855 for my $pos (0 ... $locuslength-1){ | |
1856 # print "at position: $pos\n" if $printer == 1; | |
1857 | |
1858 if (exists($mutation_hash{$pos})){ | |
1859 my @local_nodes=@{$position_hash{$pos}}; | |
1860 # print "found mutation: @{$position_hash{$pos}} : @local_nodes\n" if $printer == 1; | |
1861 | |
1862 foreach my $local_node (@local_nodes){ | |
1863 # print "at local node: $local_node ... from state = $froms_megahash{$pos}{$local_node}\n" if $printer == 1; | |
1864 my $open_insertion=(); | |
1865 my $open_deletion=(); | |
1866 my $open_to_substitution=(); | |
1867 my $open_from_substitution=(); | |
1868 if ($froms_megahash{$pos}{$local_node} eq "-"){ | |
1869 # print "here exists a microsatellite from $local_node to $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};; | |
1870 # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}} && $printer == 1; | |
1871 #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; | |
1872 $open_insertion=$tos_megahash{$pos}{$local_node}; | |
1873 for my $posnext ($pos+1 ... $locuslength-1){ | |
1874 # print "in first if .... studying posnext: $posnext\n" if $printer == 1; | |
1875 last if !exists ($froms_megahash{$posnext}{$local_node}); | |
1876 # print "for posnext: $posnext, there exists $froms_megahash{$posnext}{$local_node}.. already, open_insertion = $open_insertion.. checking is $froms_megahash{$posnext}{$local_node} matters\n" if $printer == 1; | |
1877 $open_insertion = $open_insertion.$tos_megahash{$posnext}{$local_node} if $froms_megahash{$posnext}{$local_node} eq "-"; | |
1878 # print "now open_insertion=$open_insertion\n" if $printer == 1; | |
1879 delete $mutation_hash{$posnext} if $froms_megahash{$posnext}{$local_node} eq "-"; | |
1880 } | |
1881 # print "1 Feeding in: ", join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_insertion", "deletion="),"\n" if $printer == 1; | |
1882 push (@solutions_array, join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_insertion", "deletion=")); | |
1883 } | |
1884 elsif ($tos_megahash{$pos}{$local_node} eq "-"){ | |
1885 # print "here exists a microsatellite to $local_node from $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};; | |
1886 # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; | |
1887 #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; | |
1888 $open_deletion=$froms_megahash{$pos}{$local_node}; | |
1889 for my $posnext ($pos+1 ... $locuslength-1){ | |
1890 # print "in 1st elsif studying posnext: $posnext\n" if $printer == 1; | |
1891 # print "nexting as nextpos does not exist\n" if !exists ($tos_megahash{$posnext}{$local_node}) && $printer == 1; | |
1892 last if !exists ($tos_megahash{$posnext}{$local_node}); | |
1893 # print "for posnext: $posnext, there exists $tos_megahash{$posnext}{$local_node}\n" if $printer == 1; | |
1894 $open_deletion = $open_deletion.$froms_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} eq "-"; | |
1895 delete $mutation_hash{$posnext} if $tos_megahash{$posnext}{$local_node} eq "-"; | |
1896 } | |
1897 # print "2 Feeding in:", join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_deletion"), "\n" if $printer == 1; | |
1898 push (@solutions_array, join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_deletion")); | |
1899 } | |
1900 elsif ($tos_megahash{$pos}{$local_node} ne "-"){ | |
1901 # print "here exists a microsatellite from $local_node to $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};; | |
1902 # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; | |
1903 #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; | |
1904 # print "microsatstart = $microsatstarts->{$local_node} \n" if exists $microsatstarts->{$local_node} && $pos < $microsatstarts->{$local_node} && $printer == 1; | |
1905 next if exists $microsatstarts->{$local_node} && $pos < $microsatstarts->{$local_node}; | |
1906 $open_to_substitution=$tos_megahash{$pos}{$local_node}; | |
1907 $open_from_substitution=$froms_megahash{$pos}{$local_node}; | |
1908 # print "open from substitution: $open_from_substitution \n" if $printer == 1; | |
1909 for my $posnext ($pos+1 ... $locuslength-1){ | |
1910 #print "in last elsif studying posnext: $posnext\n"; | |
1911 last if !exists ($tos_megahash{$posnext}{$local_node}); | |
1912 # print "for posnext: $posnext, there exists $tos_megahash{$posnext}{$local_node}\n" if $printer == 1; | |
1913 $open_to_substitution = $open_to_substitution.$tos_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} ne "-"; | |
1914 $open_from_substitution = $open_from_substitution.$froms_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} ne "-"; | |
1915 delete $mutation_hash{$posnext} if $tos_megahash{$posnext}{$local_node} ne "-" && $froms_megahash{$posnext}{$local_node} ; | |
1916 } | |
1917 # print "open from substitution: $open_from_substitution \n" if $printer == 1; | |
1918 | |
1919 #IS THE STRETCH OF SUBSTITUTION MICROSATELLITE-LIKE? | |
1920 my @motif_parts=split(/\s*/,$motif); | |
1921 #GENERATING THE FLEXIBLE LEFT END | |
1922 my $left_query=(); | |
1923 for my $k (1 ... $#motif_parts) { | |
1924 $left_query= $motif_parts[$k]."|)"; | |
1925 $left_query="(".$left_query; | |
1926 } | |
1927 $left_query=$left_query."?"; | |
1928 # print "left_quewry = $left_query\n" if $printer == 1; | |
1929 #GENERATING THE FLEXIBLE RIGHT END | |
1930 my $right_query=(); | |
1931 for my $k (0 ... ($#motif_parts-1)) { | |
1932 $right_query= "(|".$motif_parts[$k]; | |
1933 $right_query=$right_query.")"; | |
1934 } | |
1935 $right_query=$right_query."?"; | |
1936 # print "right_query = $right_query\n" if $printer == 1; | |
1937 # print "Hence, searching for: ^$left_query($motif)+$right_query\$\n" if $printer == 1; | |
1938 | |
1939 my $motifcomb=$motif x 50; | |
1940 # print "motifcomb = $motifcomb\n" if $printer == 1; | |
1941 if ( ($motifcomb =~/$open_to_substitution/i) && (length ($open_to_substitution) >= length($motif)) ){ | |
1942 # print "sequence microsat-like\n" if $printer == 1; | |
1943 my $all_microsat_like = 0; | |
1944 # print "3 feeding in: ", join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_from_substitution"), "\n" if $printer == 1; | |
1945 push (@solutions_array, join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_from_substitution")); | |
1946 # print "4 feeding in: ", join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_to_substitution", "deletion="), "\n" if $printer == 1; | |
1947 push (@solutions_array, join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_to_substitution", "deletion=")); | |
1948 | |
1949 } | |
1950 else{ | |
1951 # print "5 feeding in: ", join("\t", "node=$local_node","type=substitution" ,"position=$pos", "from=$open_from_substitution", "to=$open_to_substitution", "insertion=", "deletion="), "\n" if $printer == 1; | |
1952 push (@solutions_array, join("\t", "node=$local_node","type=substitution" ,"position=$pos", "from=$open_from_substitution", "to=$open_to_substitution", "insertion=", "deletion=")); | |
1953 } | |
1954 #IS THE FROM-SEQUENCE MICROSATELLITE-LIKE? | |
1955 | |
1956 } | |
1957 #<STDIN> if $printer ==1; | |
1958 } | |
1959 #<STDIN> if $printer ==1; | |
1960 } | |
1961 } | |
1962 # print "\n", "#" x 50, "\n" if $printer == 1; | |
1963 foreach my $tag (@$tags){ | |
1964 # print "$tag: $alignment->{$tag}\n" if $printer == 1; | |
1965 } | |
1966 # print "\n", "#" x 50, "\n" if $printer == 1; | |
1967 # print "returning SOLUTIONS ARRAY : \n",join("\n", @solutions_array),"\n" if $printer == 1; | |
1968 #print "end\n"; | |
1969 #<STDIN> if | |
1970 return \@solutions_array; | |
1971 } | |
1972 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# | |
1973 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# | |
1974 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# | |
1975 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# | |
1976 | |
1977 sub base_by_base_simple{ | |
1978 my $printer = 0; | |
1979 my ($motif, $locus, $no, $pair0, $pair1, $joint) = @_; | |
1980 my @seq_array=(); | |
1981 # print "IN SUBROUTUNE base_by_base_simple.. information received = @_\n" if $printer == 1; | |
1982 # print "pair0 = $pair0 and pair1 = $pair1\n" if $printer == 1; | |
1983 | |
1984 my @example=split(/\./,$locus->[0]); | |
1985 # print "example, for length = @example\n" if $printer == 1; | |
1986 for my $i (0...$no-1){push(@seq_array, [split(/\./,$locus->[$i])]); } | |
1987 | |
1988 my @compared_sequence=(); | |
1989 my @substitutions_list; | |
1990 for my $i (0...scalar(@example)-1){ | |
1991 | |
1992 #print "i = $i\n" if $printer == 1; | |
1993 #print "comparing $seq_array[0][$i] and $seq_array[1][$i] \n" ;#if $printer == 1; | |
1994 if ($seq_array[0][$i] =~ /!/ && $seq_array[1][$i] !~ /!/){ | |
1995 | |
1996 my $resolution= resolve_base($seq_array[0][$i],$seq_array[1][$i], $pair1 ,"keep" ); | |
1997 # print "ancestral = $resolution\n" if $printer == 1; | |
1998 | |
1999 if ($resolution =~ /$seq_array[1][$i]/i && $resolution !~ /!/){ | |
2000 push @substitutions_list, add_mutation($i, $pair0, $seq_array[0][$i], $resolution ); | |
2001 } | |
2002 elsif ( $resolution !~ /!/){ | |
2003 push @substitutions_list, add_mutation($i, $pair1, $seq_array[1][$i], $resolution); | |
2004 } | |
2005 push @compared_sequence,$resolution; | |
2006 } | |
2007 elsif ($seq_array[0][$i] !~ /!/ && $seq_array[1][$i] =~ /!/){ | |
2008 | |
2009 my $resolution= resolve_base($seq_array[1][$i],$seq_array[0][$i], $pair0, "invert" ); | |
2010 # print "ancestral = $resolution\n" if $printer == 1; | |
2011 | |
2012 if ($resolution =~ /$seq_array[0][$i]/i && $resolution !~ /!/){ | |
2013 push @substitutions_list, add_mutation($i, $pair1, $seq_array[1][$i], $resolution); | |
2014 } | |
2015 elsif ( $resolution !~ /!/){ | |
2016 push @substitutions_list, add_mutation($i, $pair0, $seq_array[0][$i], $resolution); | |
2017 } | |
2018 push @compared_sequence,$resolution; | |
2019 } | |
2020 elsif($seq_array[0][$i] =~ /!/ && $seq_array[1][$i] =~ /!/){ | |
2021 push @compared_sequence, add_bases($seq_array[0][$i],$seq_array[1][$i], $pair0, $pair1, $joint ); | |
2022 } | |
2023 else{ | |
2024 if($seq_array[0][$i] !~ /^$seq_array[1][$i]$/i){ | |
2025 push @compared_sequence, $pair0.":".$seq_array[0][$i]."!".$pair1.":".$seq_array[1][$i]; | |
2026 } | |
2027 else{ | |
2028 # print "perfect match\n" if $printer == 1; | |
2029 push @compared_sequence, $seq_array[0][$i]; | |
2030 } | |
2031 } | |
2032 } | |
2033 # print "returning: comared = @compared_sequence \nand substitutions list =\n", join("\n",@substitutions_list),"\n" if $printer == 1; | |
2034 return join(".",@compared_sequence), join(":", @substitutions_list) if scalar (@substitutions_list) > 0; | |
2035 return join(".",@compared_sequence), "" if scalar (@substitutions_list) == 0; | |
2036 } | |
2037 | |
2038 | |
2039 sub resolve_base{ | |
2040 my $printer = 0; | |
2041 # print "IN SUBROUTUNE resolve_base.. information received = @_\n" if $printer == 1; | |
2042 my ($optional, $single, $singlesp, $arg) = @_; | |
2043 my @options=split(/!/,$optional); | |
2044 foreach my $option(@options) { | |
2045 $option=~s/[A-Z\(\) ,]+://g; | |
2046 if ($option =~ /$single/i){ | |
2047 # print "option = $option , returning single: $single\n" if $printer == 1; | |
2048 return $single; | |
2049 } | |
2050 } | |
2051 # print "returning ",$optional."!".$singlesp.":".$single. "\n" if $arg eq "keep" && $printer == 1; | |
2052 # print "returning ",$singlesp.":".$single."!".$optional. "\n" if $arg eq "invert" && $printer == 1; | |
2053 return $optional."!".$singlesp.":".$single if $arg eq "keep"; | |
2054 return $singlesp.":".$single."!".$optional if $arg eq "invert"; | |
2055 | |
2056 } | |
2057 | |
2058 sub same_length{ | |
2059 my $printer = 0; | |
2060 my @locus = @_; | |
2061 my $temp = shift @locus; | |
2062 $temp=~s/-|,//g; | |
2063 foreach my $l (@locus){ | |
2064 $l=~s/-|,//g; | |
2065 return 0 if length($l) != length($temp); | |
2066 $temp = $l; | |
2067 } | |
2068 return 1; | |
2069 } | |
2070 sub treeStudy{ | |
2071 my $printer = 1; | |
2072 # print "template DEFINED.. received: @_\n" if defined %template; | |
2073 # print "only received = @_" if !defined %template; | |
2074 my $stopper = 0; | |
2075 # if (!defined %template){ TEMP MASKED OCT 18 2012 | |
2076 $stopper = 1; | |
2077 %template=(); | |
2078 # print "tree decipherer = $tree_decipherer\n" if $printer == 1; | |
2079 my ( $template_ref, $keys_array)=load_allPossibleTrees($tree_decipherer, \%template); | |
2080 # print "return = $template_ref and @{$keys_array}\n" if $printer == 1; | |
2081 foreach my $key (@$keys_array){ | |
2082 # print "addding : $template_ref->{$key} for $key\n" if $printer == 1; | |
2083 $template{$key} = $template_ref->{$key}; | |
2084 } | |
2085 # } TEMP MASK OCT 18 2012 END | |
2086 # <STDIN>; | |
2087 for my $templet ( keys %template ) { | |
2088 # print "$templet => @{$template{$templet}}\n"; | |
2089 } | |
2090 # <STDIN> if !defined %template; | |
2091 | |
2092 my $strict = 0; | |
2093 | |
2094 my $H = 0; | |
2095 my $Hchr = 1; | |
2096 my $Hstart = 2; | |
2097 my $Hend = 3; | |
2098 my $Hmotif = 4; | |
2099 my $Hmotiflen = 5; | |
2100 my $Hmicro = 6; | |
2101 my $Hstrand = 7; | |
2102 my $Hmicrolen = 8; | |
2103 my $Hinterpos = 9; | |
2104 my $Hrelativepos = 10; | |
2105 my $Hinter = 11; | |
2106 my $Hinterlen = 12; | |
2107 | |
2108 my $C = 13; | |
2109 my $Cchr = 14; | |
2110 my $Cstart = 15; | |
2111 my $Cend = 16; | |
2112 my $Cmotif = 17; | |
2113 my $Cmotiflen = 18; | |
2114 my $Cmicro = 19; | |
2115 my $Cstrand = 20; | |
2116 my $Cmicrolen = 21; | |
2117 my $Cinterpos = 22; | |
2118 my $Crelativepos = 23; | |
2119 my $Cinter = 24; | |
2120 my $Cinterlen = 25; | |
2121 | |
2122 my $O = 26; | |
2123 my $Ochr = 27; | |
2124 my $Ostart = 28; | |
2125 my $Oend = 29; | |
2126 my $Omotif = 30; | |
2127 my $Omotiflen = 31; | |
2128 my $Omicro = 32; | |
2129 my $Ostrand = 33; | |
2130 my $Omicrolen = 34; | |
2131 my $Ointerpos = 35; | |
2132 my $Orelativepos = 36; | |
2133 my $Ointer = 37; | |
2134 my $Ointerlen = 38; | |
2135 | |
2136 my $R = 39; | |
2137 my $Rchr = 40; | |
2138 my $Rstart = 41; | |
2139 my $Rend = 42; | |
2140 my $Rmotif = 43; | |
2141 my $Rmotiflen = 44; | |
2142 my $Rmicro = 45; | |
2143 my $Rstrand = 46; | |
2144 my $Rmicrolen = 47; | |
2145 my $Rinterpos = 48; | |
2146 my $Rrelativepos = 49; | |
2147 my $Rinter = 50; | |
2148 my $Rinterlen = 51; | |
2149 | |
2150 my $Mchr = 52; | |
2151 my $Mstart = 53; | |
2152 my $Mend = 54; | |
2153 my $M = 55; | |
2154 my $Mmotif = 56; | |
2155 my $Mmotiflen = 57; | |
2156 my $Mmicro = 58; | |
2157 my $Mstrand = 59; | |
2158 my $Mmicrolen = 60; | |
2159 my $Minterpos = 61; | |
2160 my $Mrelativepos = 62; | |
2161 my $Minter = 63; | |
2162 my $Minterlen = 64; | |
2163 | |
2164 #-------------------------------------------------------------------------------# | |
2165 my @analysis=(); | |
2166 | |
2167 | |
2168 my %speciesOrder = (); | |
2169 $speciesOrder{"H"} = 0; | |
2170 $speciesOrder{"C"} = 1; | |
2171 $speciesOrder{"O"} = 2; | |
2172 $speciesOrder{"R"} = 3; | |
2173 $speciesOrder{"M"} = 4; | |
2174 #-------------------------------------------------------------------------------# | |
2175 | |
2176 my $line = $_[0]; | |
2177 chomp $line; | |
2178 | |
2179 my @f = split(/\t/,$line); | |
2180 # print "received array : @f.. recieved tags = @tags\n" if $printer == 1; | |
2181 | |
2182 # collect all motifs | |
2183 my @motifs=(); | |
2184 @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif], $f[$Mmotif]) if $tags[$#tags] =~ /M/; | |
2185 @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif]) if $tags[$#tags] =~ /R/; | |
2186 @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif]) if $tags[$#tags] =~ /O/; | |
2187 # print "motifs in the array = $f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif]\n" if $tags[$#tags] =~ /R/;; | |
2188 # print "motifs = @motifs\n" if $printer == 1; | |
2189 my @translation = (); | |
2190 foreach my $motif (@motifs){ | |
2191 push(@translation, "_") if $motif eq "NA"; | |
2192 push(@translation, "+") if $motif ne "NA"; | |
2193 } | |
2194 my $translate = join(" ", @translation); | |
2195 # print "translate = >$translate< and analysis = $template{$translate}[0].. on the other hand, ",$template{"- - +"}[0],"\n"; | |
2196 my @analyses = split(/\|/,$template{$translate}[0]); | |
2197 # print "motifs = @motifs, analyses = @analyses\n" if $printer == 1; | |
2198 | |
2199 if (scalar(@analyses) == 1) { | |
2200 #print "analysis = $analyses[0]\n"; | |
2201 if ($analyses[0] !~ /,|\./ ){ | |
2202 if ($analyses[0] =~ /\+/){ | |
2203 my $analysis = $analyses[0]; | |
2204 $analysis =~ s/\+|\-//g; | |
2205 my @species = split(/\s*/,$analysis); | |
2206 my @currentMotifs = (); | |
2207 foreach my $specie (@species){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); #print "pushing into currentMotifs: $speciesOrder{$specie}: $motifs[$speciesOrder{$specie}]\n" if $printer == 1; | |
2208 } | |
2209 # print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; | |
2210 $template{$translate}[1]++ if $strict == 1 && consistency(@currentMotifs) ne "NULL"; | |
2211 $template{$translate}[1]++ if $strict == 0; | |
2212 # print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; | |
2213 } | |
2214 else{ | |
2215 my $analysis = $analyses[0]; | |
2216 $analysis =~ s/\+|\-//g; | |
2217 my @species = split(/\s*/,$analysis); | |
2218 my @currentMotifs = (); | |
2219 my @complementarySpecies = (); | |
2220 my $allSpecies = join("",@tags); | |
2221 foreach my $specie (@species){ $allSpecies =~ s/$specie//g; } | |
2222 foreach my $specie (split(/\s*/,$allSpecies)){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); #print "pushing into currentMotifs: $speciesOrder{$specie}: $motifs[$speciesOrder{$specie}]\n" if $printer == 1;; | |
2223 } | |
2224 # print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; | |
2225 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; | |
2226 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; | |
2227 # print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; | |
2228 } | |
2229 } | |
2230 | |
2231 elsif ($analyses[0] =~ /,/) { | |
2232 my @events = split(/,/,$analyses[0]); | |
2233 # print "events = @events \n " if $printer == 1; | |
2234 if ($events[0] =~ /\+/){ | |
2235 my $analysis1 = $events[0]; | |
2236 $analysis1 =~ s/\+|\-//g; | |
2237 my $analysis2 = $events[1]; | |
2238 $analysis2 =~ s/\+|\-//g; | |
2239 my @nSpecies = split(/\s*/,$analysis2); | |
2240 # print "original anslysis = $analysis1 " if $printer == 1; | |
2241 foreach my $specie (@nSpecies){ $analysis1=~ s/$specie//g;} | |
2242 # print "processed anslysis = $analysis1 \n" if $printer == 1; | |
2243 my @currentMotifs = (); | |
2244 foreach my $specie (split(/\s*/,$analysis1)){push(@currentMotifs, $motifs[$speciesOrder{$specie}]); } | |
2245 # print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; | |
2246 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; | |
2247 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; | |
2248 # print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; | |
2249 } | |
2250 else{ | |
2251 my $analysis1 = $events[0]; | |
2252 $analysis1 =~ s/\+|\-//g; | |
2253 my $analysis2 = $events[1]; | |
2254 $analysis2 =~ s/\+|\-//g; | |
2255 my @pSpecies = split(/\s*/,$analysis2); | |
2256 my @currentMotifs = (); | |
2257 foreach my $specie (@pSpecies){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); } | |
2258 # print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; | |
2259 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; | |
2260 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; | |
2261 # print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; | |
2262 | |
2263 } | |
2264 | |
2265 } | |
2266 elsif ($analyses[0] =~ /\./) { | |
2267 my @events = split(/\./,$analyses[0]); | |
2268 foreach my $event (@events){ | |
2269 # print "event = $event \n" if $printer == 1; | |
2270 if ($event =~ /\+/){ | |
2271 my $analysis = $event; | |
2272 $analysis =~ s/\+|\-//g; | |
2273 my @species = split(/\s*/,$analysis); | |
2274 my @currentMotifs = (); | |
2275 foreach my $specie (@species){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); } | |
2276 #print consistency(@currentMotifs),"<- \n"; | |
2277 # print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; | |
2278 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; | |
2279 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; | |
2280 # print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; | |
2281 } | |
2282 else{ | |
2283 my $analysis = $event; | |
2284 $analysis =~ s/\+|\-//g; | |
2285 my @species = split(/\s*/,$analysis); | |
2286 my @currentMotifs = (); | |
2287 my @complementarySpecies = (); | |
2288 my $allSpecies = join("",@tags); | |
2289 foreach my $specie (@species){ $allSpecies =~ s/$specie//g; } | |
2290 foreach my $specie (split(/\s*/,$allSpecies)){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); } | |
2291 #print consistency(@currentMotifs),"<- \n"; | |
2292 # print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; | |
2293 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; | |
2294 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; | |
2295 # print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; | |
2296 } | |
2297 } | |
2298 | |
2299 } | |
2300 } | |
2301 else{ | |
2302 my $finalanalysis = (); | |
2303 $template{$translate}[1]++; | |
2304 foreach my $analysis (@analyses){ ;} | |
2305 } | |
2306 # test if motifs where microsats are present, as indeed of same the motif composition | |
2307 | |
2308 | |
2309 | |
2310 for my $templet ( keys %template ) { | |
2311 if (@{ $template{$templet} }[1] > 0){ | |
2312 | |
2313 $template{$templet}[1] = 0; | |
2314 # print "now returning: @{$template{$templet}}[0], $templet\n"; | |
2315 return (@{$template{$templet}}[0], $templet); | |
2316 } | |
2317 } | |
2318 undef %template; | |
2319 # print "sending NULL\n" if $printer == 1; | |
2320 return ("NULL", "NULL"); | |
2321 | |
2322 } | |
2323 | |
2324 | |
2325 sub consistency{ | |
2326 my @motifs = @_; | |
2327 # print "in consistency \n" if $printer == 1; | |
2328 # print "motifs sent = >",join("|",@motifs),"< \n" if $printer == 1; | |
2329 return $motifs[0] if scalar(@motifs) == 1; | |
2330 my $prevmotif = shift(@motifs); | |
2331 my $stopper = 0; | |
2332 for my $i (0 ... $#motifs){ | |
2333 next if $motifs[$i] eq "NA"; | |
2334 my $templet = $motifs[$i].$motifs[$i]; | |
2335 if ($templet !~ /$prevmotif/i){ | |
2336 $stopper = 1; last; | |
2337 } | |
2338 } | |
2339 return $prevmotif if $stopper == 0; | |
2340 return "NULL" if $stopper == 1; | |
2341 } | |
2342 sub summarize_microsat{ | |
2343 my $printer = 1; | |
2344 my $line = $_[0]; | |
2345 my $humseq = $_[1]; | |
2346 | |
2347 my @gaps = $line =~ /[0-9]+\t[0-9]+\t[\+\-]/g; | |
2348 my @starts = $line =~ /[0-9]+\t[\+\-]/g; | |
2349 my @ends = $line =~ /[\+\-]\t[0-9]+/g; | |
2350 # print "starts = @starts\tends = @ends\n" if $printer == 1; | |
2351 for my $i (0 ... $#gaps) {$gaps[$i] =~ s/\t[0-9]+\t[\+\-]//g;} | |
2352 for my $i (0 ... $#starts) {$starts[$i] =~ s/\t[\+\-]//g;} | |
2353 for my $i (0 ... $#ends) {$ends[$i] =~ s/[\+\-]\t//g;} | |
2354 | |
2355 my $minstart = array_smallest_number(@starts); | |
2356 my $maxend = array_largest_number(@ends); | |
2357 | |
2358 my $humupstream_st = substr($humseq, 0, $minstart); | |
2359 my $humupstream_en = substr($humseq, 0, $maxend); | |
2360 my $no_of_gaps_to_start = 0; | |
2361 my $no_of_gaps_to_end = 0; | |
2362 $no_of_gaps_to_start = ($humupstream_st =~ s/\-/x/g) if $humupstream_st=~/\-/; | |
2363 $no_of_gaps_to_end = ($humupstream_en =~ s/\-/x/g) if $humupstream_en=~/\-/; | |
2364 | |
2365 my $locusmotif = (); | |
2366 # print "IN SUB SUMMARIZE_MICROSAT $line\n" if $printer == 1; | |
2367 #return "NULL" if $line =~ /compound/; | |
2368 my $Hstart = "NA"; | |
2369 my $Hend = "NA"; | |
2370 chomp $line; | |
2371 my $match_count = ($line =~ s/>/>/g); | |
2372 #print "number of species = $match_count\n"; | |
2373 my @micros = split(/>/,$line); | |
2374 shift @micros; | |
2375 my $stopper = 0; | |
2376 | |
2377 | |
2378 foreach my $mic (@micros){ | |
2379 my @local = split(/\t/,$mic); | |
2380 if ($local[$microsatcord] =~ /N/) {$stopper =1; last;} | |
2381 } | |
2382 return "NULL" if $stopper ==1; | |
2383 | |
2384 #------------------------------------------------------ | |
2385 | |
2386 my @arranged = (); | |
2387 for my $arr (0 ... $#exacttags) {$arranged[$arr] = '0';} | |
2388 | |
2389 foreach my $micro (@micros){ | |
2390 for my $i (0 ... $#exacttags){ | |
2391 if ($micro =~ /^$exacttags[$i]/){ | |
2392 $arranged[$i] = $micro; | |
2393 last; | |
2394 } | |
2395 } | |
2396 } | |
2397 # print "arranged = @arranged \n" ; <STDIN>;; | |
2398 | |
2399 my @endstatement = (); | |
2400 my $turn = 0; | |
2401 my $species_counter = 0; | |
2402 # print scalar(@arranged),"\n"; | |
2403 | |
2404 my $species_no=0; | |
2405 | |
2406 my $orthHchr = 0; | |
2407 | |
2408 foreach my $micro (@arranged) { | |
2409 $micro =~ s/\t\t/\t \t/g; | |
2410 $micro =~ s/\t,/\t ,/g; | |
2411 $micro =~ s/,\t/, \t/g; | |
2412 # print "------------------------------------------------------------------------------------------\n" if $printer == 1; | |
2413 chomp $micro; | |
2414 if ($micro eq '0'){ | |
2415 push(@endstatement, join("\t",$exacttags[$species_counter],"NA","NA","NA","NA",0 ,"NA", "NA", 0,"NA","NA","NA", "NA" )); | |
2416 $species_counter++; | |
2417 # print join("|","ENDSTATEMENT:",@endstatement),"\n" if $printer == 1; | |
2418 next; | |
2419 } | |
2420 # print $micro,"\n"; | |
2421 # print "micro = $micro \n" if $printer == 1; | |
2422 my @fields = split(/\t/,$micro); | |
2423 my $microcopy = $fields[$microsatcord]; | |
2424 $microcopy =~ s/\[|\]|-//g; | |
2425 my $microsatlength = length($microcopy); | |
2426 # print "microsat = $fields[$microsatcord] and microsatlength = $microsatlength\n" if $printer == 1; | |
2427 # print "sp_ident = @sp_ident.. species_no=$species_no\n"; | |
2428 $micro =~ /$sp_ident[$species_no]\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)/; | |
2429 # print "$micro =~ /$sp_ident[$species_no] ([0-9a-zA-Z_]+) ([0-9]+) ([0-9]+)/\n"; | |
2430 my $sp_chr=$1; | |
2431 my $sp_start=$2 + $fields[$startcord] - $fields[$gapcord]; | |
2432 my $sp_end= $sp_start + $microsatlength - 1; | |
2433 | |
2434 $species_no++; | |
2435 | |
2436 $micro =~ /$focalspec_orig\s(\S+)\s([0-9]+)\s([0-9]+)/; | |
2437 $orthHchr=$1; | |
2438 $Hstart=$2+$minstart-$no_of_gaps_to_start; | |
2439 $Hend=$2+$maxend-$no_of_gaps_to_end; | |
2440 # print "Hstart = $Hstart = $fields[4] + $fields[$startcord] - $fields[$gapcord]\n" if $printer == 1; | |
2441 | |
2442 my $motif = $fields[$motifcord]; | |
2443 my $firstmotif = (); | |
2444 my $strand = $fields[$strandcord]; | |
2445 # print "strand = $strand\n"; | |
2446 | |
2447 | |
2448 if ($motif =~ /^\[/){ | |
2449 $motif =~ s/^\[//g; | |
2450 $motif =~ /([a-zA-Z]+)\].*/; | |
2451 $firstmotif = $1; | |
2452 } | |
2453 | |
2454 else {$firstmotif = $motif;} | |
2455 # print "firstmotif =$firstmotif : \n" if $printer == 1; | |
2456 $firstmotif = allCaps($firstmotif); | |
2457 | |
2458 if (exists $revHash{$firstmotif} && $turn == 0) { | |
2459 $turn=1 if $species_counter==0; | |
2460 $firstmotif = $revHash{$firstmotif}; | |
2461 } | |
2462 | |
2463 elsif (exists $revHash{$firstmotif} && $turn == 1) {$firstmotif = $revHash{$firstmotif}; $turn = 1;} | |
2464 # print "changed firstmotif =$firstmotif\n" if $printer == 1; | |
2465 # <STDIN>; | |
2466 $locusmotif = $firstmotif; | |
2467 | |
2468 if (scalar(@fields) > $microsatcord + 2){ | |
2469 # print "fields = @fields ... interr_poscord=$interr_poscord=$fields[$interr_poscord] .. interrcord=$interrcord=$fields[$interrcord]\n" if $printer == 1; | |
2470 | |
2471 my @interposes = (); | |
2472 @interposes = split(",",$fields[$interr_poscord]) if $fields[$interr_poscord] =~ /,/; | |
2473 $interposes[0] = $fields[$interr_poscord] if $fields[$interr_poscord] !~ /,/ ; | |
2474 # print "interposes=@interposes\n" if $printer == 1; | |
2475 my @relativeposes = (); | |
2476 my @interruptions = (); | |
2477 @interruptions = split(",",$fields[$interrcord]) if $fields[$interrcord] =~ /,/; | |
2478 $interruptions[0] = $fields[$interrcord] if $fields[$interrcord] !~ /,/; | |
2479 my @interlens = (); | |
2480 | |
2481 | |
2482 for my $i (0 ... $#interposes){ | |
2483 | |
2484 my $interpos = $interposes[$i]; | |
2485 my $nexter = 0; | |
2486 my $interruption = $interruptions[$i]; | |
2487 my $interlen = length($interruption); | |
2488 push (@interlens, $interlen); | |
2489 | |
2490 | |
2491 my $relativepos = (100 * $interpos) / $microsatlength; | |
2492 # print "relativepos = $relativepos ,interpos=$interpos, interruption=$interruption, interlen=$interlen \n" if $printer == 1; | |
2493 $relativepos = (100 * ($interpos-$interlen)) / $microsatlength if $relativepos > 50; | |
2494 # print "--> = $relativepos\n" if $printer == 1; | |
2495 $interruption = "IND" if length($interruption) < 1; | |
2496 | |
2497 if ($turn == 1){ | |
2498 $fields[$microsatcord] = switch_micro($fields[$microsatcord]); | |
2499 $interruption = switch_nucl($interruption) unless $interruption eq "IND"; | |
2500 $interpos = ($microsatlength - $interpos) - $interlen + 2; | |
2501 # print "turn interpos = $interpos for $fields[$microsatcord]\n" if $printer == 1; | |
2502 $relativepos = (100 * $interpos) / $microsatlength; | |
2503 $relativepos = (100 * ($interpos-$interlen)) / $microsatlength if $relativepos > 50; | |
2504 | |
2505 | |
2506 $strand = '+' if $strand eq '-'; | |
2507 $strand = '-' if $strand eq '+'; | |
2508 } | |
2509 # print "final relativepos = $relativepos\n" if $printer == 1; | |
2510 push(@relativeposes, $relativepos); | |
2511 } | |
2512 push(@endstatement,join("\t",($exacttags[$species_counter],$sp_chr, $sp_start, $sp_end, $firstmotif,length($firstmotif),$fields[$microsatcord],$strand,$microsatlength,join(",",@interposes),join(",",@relativeposes),join(",",@interruptions), join(",",@interlens)))); | |
2513 } | |
2514 | |
2515 else{ | |
2516 push(@endstatement, join("\t",$exacttags[$species_counter],$sp_chr, $sp_start, $sp_end, $firstmotif,length($firstmotif),$fields[$microsatcord],$strand,$microsatlength,"NA","NA","NA", "NA")); | |
2517 } | |
2518 | |
2519 $species_counter++; | |
2520 } | |
2521 | |
2522 $locusmotif = $sameHash{$locusmotif} if exists $sameHash{$locusmotif}; | |
2523 $locusmotif = $revHash{$locusmotif} if exists $revHash{$locusmotif}; | |
2524 | |
2525 my $endst = join("\t", @endstatement, $orthHchr, $Hstart, $Hend); | |
2526 # print join("\t", @endstatement, $orthHchr, $Hstart, $Hend), "\n" if $printer == 1; | |
2527 | |
2528 | |
2529 return (join("\t", @endstatement, $orthHchr, $Hstart, $Hend), $orthHchr, $Hstart, $Hend, $locusmotif, length($locusmotif)); | |
2530 | |
2531 } | |
2532 | |
2533 sub switch_nucl{ | |
2534 my @strand = split(/\s*/,$_[0]); | |
2535 for my $i (0 ... $#strand){ | |
2536 if ($strand[$i] =~ /c/i) {$strand[$i] = "G";next;} | |
2537 if ($strand[$i] =~ /a/i) {$strand[$i] = "T";next;} | |
2538 if ($strand[$i] =~ /t/i) { $strand[$i] = "A";next;} | |
2539 if ($strand[$i] =~ /g/i) {$strand[$i] = "C";next;} | |
2540 } | |
2541 return join("",@strand); | |
2542 } | |
2543 | |
2544 | |
2545 sub switch_micro{ | |
2546 my $micro = reverse($_[0]); | |
2547 my @strand = split(/\s*/,$micro); | |
2548 for my $i (0 ... $#strand){ | |
2549 if ($strand[$i] =~ /c/i) {$strand[$i] = "G";next;} | |
2550 if ($strand[$i] =~ /a/i) {$strand[$i] = "T";next;} | |
2551 if ($strand[$i] =~ /t/i) { $strand[$i] = "A";next;} | |
2552 if ($strand[$i] =~ /g/i) {$strand[$i] = "C";next;} | |
2553 if ($strand[$i] =~ /\[/i) {$strand[$i] = "]";next;} | |
2554 if ($strand[$i] =~ /\]/i) {$strand[$i] = "[";next;} | |
2555 } | |
2556 return join("",@strand); | |
2557 } | |
2558 sub decipher_history{ | |
2559 my $printer = 0; | |
2560 my ($mutations_array, $tags_string, $nodes, $branches_hash, $tree_analysis, $confirmation_string, $alivehash) = @_; | |
2561 my %mutations_hash=(); | |
2562 foreach my $mutation (@$mutations_array){ | |
2563 # print "mutation = $mutation\n" if $printer == 1; | |
2564 my %local = $mutation =~ /([\S ]+)=([\S ]+)/g; | |
2565 push @{$mutations_hash{$local{"node"}}},$mutation; | |
2566 # print "just for confirmation: $local{node} pushed as: $mutation\n" if $printer == 1; | |
2567 } | |
2568 my @nodes; | |
2569 my @birth_steps=(); | |
2570 my @death_steps=(); | |
2571 | |
2572 my @tags=split(/\s*/,$tags_string); | |
2573 my @confirmation=split(/\s+/,$confirmation_string); | |
2574 my %info=(); | |
2575 | |
2576 for my $i (0 ... $#tags){ | |
2577 $info{$tags[$i]}=$confirmation[$i]; | |
2578 # print "feeding info: $tags[$i] = $info{$tags[$i]}\n" if $printer == 1; | |
2579 } | |
2580 | |
2581 for my $keys (@$nodes) { | |
2582 foreach my $key (@$keys){ | |
2583 # print "current key = $key\n"; | |
2584 my $copykey = $key; | |
2585 $copykey =~ s/[\W ]+//g; | |
2586 my @copykeys=split(/\s*/,$copykey); | |
2587 my $states=(); | |
2588 foreach my $copy (@copykeys){ | |
2589 $states=$states.$info{$copy}; | |
2590 } | |
2591 # print "reduced key = $copykey and state = $states\n" if $printer == 1; | |
2592 | |
2593 if (exists $mutations_hash{$key}) { | |
2594 | |
2595 if ($states=~/\+/){ | |
2596 push @birth_steps, @{$mutations_hash{$key}}; | |
2597 $birth_steps[$#birth_steps] =~ s/\S+=//g; | |
2598 delete $mutations_hash{$key}; | |
2599 } | |
2600 else{ | |
2601 push @death_steps, @{$mutations_hash{$key}}; | |
2602 $death_steps[$#death_steps] =~ s/\S+=//g; | |
2603 delete $mutations_hash{$key}; | |
2604 } | |
2605 } | |
2606 } | |
2607 } | |
2608 # print "conformation = $confirmation_string\n" if $printer == 1; | |
2609 push (@birth_steps, "NULL") if scalar(@birth_steps) == 0; | |
2610 push (@death_steps, "NULL") if scalar(@death_steps) == 0; | |
2611 # print "birth steps = ",join("\n",@birth_steps)," and death steps = ",join("\n",@death_steps),"\n" if $printer == 1; | |
2612 return \@birth_steps, \@death_steps; | |
2613 } | |
2614 | |
2615 sub fillAlignmentGaps{ | |
2616 my $printer = 0; | |
2617 # print "received: @_\n" if $printer == 1; | |
2618 my ($tree, $sequences, $alignment, $tagarray, $microsathash, $nonmicrosathash, $motif, $tree_analysis, $threshold, $microsatstarts) = @_; | |
2619 # print "in fillAlignmentGaps.. tree = $tree \n" if $printer == 1; | |
2620 my %sequence_hash=(); | |
2621 | |
2622 my @phases = (); | |
2623 my $concat = $motif.$motif; | |
2624 my $motifsize = length($motif); | |
2625 | |
2626 for my $i (1 ... $motifsize){ | |
2627 push @phases, substr($concat, $i, $motifsize); | |
2628 } | |
2629 | |
2630 my $concatalignment = (); | |
2631 foreach my $tag (@tags){ | |
2632 $concatalignment = $concatalignment.$alignment->{$tag}; | |
2633 } | |
2634 # print "returningg NULL","NULL","NULL", "NULL\n" if $concatalignment !~ /-/; | |
2635 return 0, "NULL","NULL","NULL", "NULL","NULL" if $concatalignment !~ /-/; | |
2636 | |
2637 | |
2638 | |
2639 my %node_sequences_temp=(); | |
2640 my %node_alignments_temp =(); #NEW, Nov 28 2008 | |
2641 | |
2642 my @tags=(); | |
2643 my @locus_sequences=(); | |
2644 my %alivehash=(); | |
2645 | |
2646 # print "IN fillAlignmentGaps\n";# <STDIN>; | |
2647 my %fillrecord = (); | |
2648 | |
2649 my $change = 0; | |
2650 foreach my $tag (@$tagarray) { | |
2651 #print "adding: $tag\n"; | |
2652 push(@tags, $tag); | |
2653 if (exists $microsathash->{$tag}){ | |
2654 my $micro = $microsathash->{$tag}; | |
2655 my $orig_micro = $micro; | |
2656 ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases); | |
2657 $change = 1 if uc($micro) ne uc($orig_micro); | |
2658 $node_sequences_temp{$tag}=$micro if $microsathash->{$tag} ne "NULL"; | |
2659 } | |
2660 if (exists $nonmicrosathash->{$tag}){ | |
2661 my $micro = $nonmicrosathash->{$tag}; | |
2662 my $orig_micro = $micro; | |
2663 ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases); | |
2664 $change = 1 if uc($micro) ne uc($orig_micro); | |
2665 $node_sequences_temp{$tag}=$micro if $nonmicrosathash->{$tag} ne "NULL"; | |
2666 } | |
2667 | |
2668 if (exists $alignment->{$tag}){ | |
2669 my $micro = $alignment->{$tag}; | |
2670 my $orig_micro = $micro; | |
2671 ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases); | |
2672 $change = 1 if uc($micro) ne uc($orig_micro); | |
2673 $node_alignments_temp{$tag}=$micro if $alignment->{$tag} ne "NULL"; | |
2674 } | |
2675 | |
2676 #print "adding to node_sequences: $tag = ",$node_sequences_temp{$tag},"\n" if $printer == 1; | |
2677 #print "adding to node_alignments: $tag = ",$node_alignments_temp{$tag},"\n" if $printer == 1; | |
2678 } | |
2679 | |
2680 | |
2681 my %node_sequences=(); | |
2682 my %node_alignments =(); #NEW, Nov 28 2008 | |
2683 foreach my $tag (@$tagarray) { | |
2684 $node_sequences{$tag} = join ".",split(/\s*/,$node_sequences_temp{$tag}); | |
2685 $node_alignments{$tag} = join ".",split(/\s*/,$node_alignments_temp{$tag}); | |
2686 } | |
2687 # print "\n", "#" x 50, "\n" if $printer == 1; | |
2688 foreach my $tag (@tags){ | |
2689 # print "$tag: $alignment->{$tag} = $node_alignments{$tag}\n" if $printer == 1; | |
2690 } | |
2691 # print "\n", "#" x 50, "\n" if $printer == 1; | |
2692 # print "change = $change\n"; | |
2693 #<STDIN> if $concatalignment=~/\-/; | |
2694 | |
2695 # <STDIN> if $printer == 1 && $concatalignment =~ /\-/; | |
2696 | |
2697 return 0, "NULL","NULL","NULL", "NULL", "NULL" if $change == 0; | |
2698 | |
2699 my ($nodes_arr, $branches_hash) = get_nodes($tree); | |
2700 my @nodes=@$nodes_arr; | |
2701 # print "recieved nodes = @nodes\n" if $printer == 1; | |
2702 | |
2703 | |
2704 #POPULATE branches_hash WITH INFORMATION ABOUT LIVESTATUS | |
2705 foreach my $keys (@nodes){ | |
2706 my @pair = @$keys; | |
2707 my $joint = "(".join(", ",@pair).")"; | |
2708 my $copykey = join "", @pair; | |
2709 $copykey =~ s/[\W ]+//g; | |
2710 # print "for node: $keys, copykey = $copykey and joint = $joint\n" if $printer == 1; | |
2711 my $livestatus = 1; | |
2712 foreach my $copy (split(/\s*/,$copykey)){ | |
2713 $livestatus = 0 if !exists $alivehash{$copy}; | |
2714 } | |
2715 $alivehash{$joint} = $joint if !exists $alivehash{$joint} && $livestatus == 1; | |
2716 # print "alivehash = $alivehash{$joint}\n" if exists $alivehash{$joint} && $printer == 1; | |
2717 } | |
2718 | |
2719 | |
2720 | |
2721 @nodes = reverse(@nodes); #1 THIS IS IN ORDER TO GO THROUGH THE TREE FROM LEAVES TO ROOT. | |
2722 | |
2723 my @mutations_array=(); | |
2724 | |
2725 my $joint = (); | |
2726 foreach my $node (@nodes){ | |
2727 my @pair = @$node; | |
2728 # print "now in the nodes for loop, pair = @pair\n and sequences=\n" if $printer == 1; | |
2729 $joint = "(".join(", ",@pair).")"; | |
2730 # print "joint = $joint \n" if $printer == 1; | |
2731 my @pair_sequences=(); | |
2732 | |
2733 foreach my $tag (@pair){ | |
2734 # print "tag = $tag: " if $printer == 1; | |
2735 # print $node_alignments{$tag},"\n" if $printer == 1; | |
2736 push @pair_sequences, $node_alignments{$tag}; | |
2737 } | |
2738 # print "fillgap\n"; | |
2739 my ($compared, $substitutions_list) = base_by_base_simple($motif,\@pair_sequences, scalar(@pair_sequences), @pair, $joint); | |
2740 $node_alignments{$joint}=$compared; | |
2741 push( @mutations_array,split(/:/,$substitutions_list)); | |
2742 # print "newly added to node_sequences: $node_alignments{$joint} and list of mutations = @mutations_array\n" if $printer == 1; | |
2743 } | |
2744 # print "now sending for analyze_mutations: mutation_array=@mutations_array, nodes=@nodes, branches_hash=$branches_hash, alignment=$alignment, tags=@tags, alivehash=%alivehash, node_sequences=\%node_sequences, microsatstarts=$microsatstarts, motif=$motif\n" if $printer == 1; | |
2745 # <STDIN> if $printer == 1; | |
2746 | |
2747 my $analayzed_mutations = analyze_mutations(\@mutations_array, \@nodes, $branches_hash, $alignment, \@tags, \%alivehash, \%node_sequences, $microsatstarts, $motif); | |
2748 | |
2749 # print "returningt: ", $analayzed_mutations, \@nodes,"\n" if scalar @mutations_array > 0;; | |
2750 # print "returningy: NULL, NULL, NULL " if scalar @mutations_array == 0 && $printer == 1; | |
2751 # print "final node alignment after filling for $joint= " if $printer == 1; | |
2752 # print "$node_alignments{$joint}\n" if $printer == 1; | |
2753 | |
2754 | |
2755 return 1, $analayzed_mutations, \@nodes, $branches_hash, \%alivehash, $node_alignments{$joint} if scalar @mutations_array > 0 ; | |
2756 return 1, "NULL","NULL","NULL", "NULL", "NULL" if scalar @mutations_array == 0; | |
2757 } | |
2758 | |
2759 | |
2760 | |
2761 sub add_mutation{ | |
2762 my $printer = 0; | |
2763 # print "IN SUBROUTUNE add_mutation.. information received = @_\n" if $printer == 1; | |
2764 my ($i , $bite, $to, $from) = @_; | |
2765 # print "bite = $bite.. all received info = ",join("^", @_),"\n" if $printer == 1; | |
2766 # print "to=$to\n" if $printer == 1; | |
2767 # print "tis split = ",join(" and ",split(/!/,$to)),"\n" if $printer == 1; | |
2768 my @toields = split "!",$to; | |
2769 # print "toilds = @toields\n" if $printer == 1; | |
2770 my @mutations=(); | |
2771 | |
2772 foreach my $toield (@toields){ | |
2773 my @toinfo=split(":",$toield); | |
2774 # print " at toinfo=@toinfo \n" if $printer == 1; | |
2775 next if $toinfo[1] =~ /$from/i; | |
2776 my @mutation = @toinfo if $toinfo[1] !~ /$from/i; | |
2777 # print "adding to mutaton list: ", join(",", "node=$mutation[0]","type=substitution" ,"position=$i", "from=$from", "to=$mutation[1]", "insertion=", "deletion="),"\n" if $printer == 1; | |
2778 push (@mutations, join("\t", "node=$mutation[0]","type=substitution" ,"position=$i", "from=$from", "to=$mutation[1]", "insertion=", "deletion=")); | |
2779 } | |
2780 return @mutations; | |
2781 } | |
2782 | |
2783 | |
2784 sub add_bases{ | |
2785 | |
2786 my $printer = 0; | |
2787 # print "IN SUBROUTUNE add_bases.. information received = @_\n" if $printer == 1; | |
2788 my ($optional0, $optional1, $pair0, $pair1,$joint) = @_; | |
2789 my $total_list=(); | |
2790 | |
2791 my @total_list0=split(/!/,$optional0); | |
2792 my @total_list1=split(/!/,$optional1); | |
2793 my @all_list=(); | |
2794 my %total_hash0=(); | |
2795 foreach my $entry (@total_list0) { | |
2796 $entry = uc $entry; | |
2797 $entry =~ /(\S+):(\S+)/; | |
2798 $total_hash0{$2}=$1; | |
2799 push @all_list, $2; | |
2800 } | |
2801 | |
2802 my %total_hash1=(); | |
2803 foreach my $entry (@total_list1) { | |
2804 $entry = uc $entry; | |
2805 $entry =~ /(\S+):(\S+)/; | |
2806 $total_hash1{$2}=$1; | |
2807 push @all_list, $2; | |
2808 } | |
2809 | |
2810 my %alphabetical_hash=(); | |
2811 my @return_options=(); | |
2812 | |
2813 for my $i (0 ... $#all_list){ | |
2814 my $alph = $all_list[$i]; | |
2815 if (exists $total_hash0{$alph} && exists $total_hash1{$alph}){ | |
2816 push(@return_options, $joint.":".$alph); | |
2817 delete $total_hash0{$alph}; delete $total_hash1{$alph}; | |
2818 } | |
2819 if (exists $total_hash0{$alph} && !exists $total_hash1{$alph}){ | |
2820 push(@return_options, $pair0.":".$alph); | |
2821 delete $total_hash0{$alph}; | |
2822 } | |
2823 if (!exists $total_hash0{$alph} && exists $total_hash1{$alph}){ | |
2824 push(@return_options, $pair1.":".$alph); | |
2825 delete $total_hash1{$alph}; | |
2826 } | |
2827 | |
2828 } | |
2829 # print "returning ",join "!",@return_options,"\n" if $printer == 1; | |
2830 return join "!",@return_options; | |
2831 | |
2832 } | |
2833 | |
2834 | |
2835 sub fillgaps{ | |
2836 # print "IN fillgaps: @_\n"; | |
2837 my ($micro, $phasesinput) = @_; | |
2838 #print "in microsathash ,,.. micro = $micro\n"; | |
2839 return $micro if $micro !~ /\-/; | |
2840 my $orig_micro = $micro; | |
2841 my @phases = @$phasesinput; | |
2842 | |
2843 my %tested_patterns = (); | |
2844 | |
2845 foreach my $phase (@phases){ | |
2846 # print "considering phase: $phase\n"; | |
2847 my @phase_prefixes = (); | |
2848 my @prephase_left_contexts = (); | |
2849 my @prephase_right_contexts = (); | |
2850 my @pregapsize = (); | |
2851 my @prepostfilins = (); | |
2852 | |
2853 my @phase_suffixes; | |
2854 my @suffphase_left_contexts; | |
2855 my @suffphase_right_contexts; | |
2856 my @suffgapsize; | |
2857 my @suffpostfilins; | |
2858 | |
2859 my @postfilins = (); | |
2860 my $motifsize = length($phases[0]); | |
2861 | |
2862 my $change = 0; | |
2863 | |
2864 for my $u (0 ... $motifsize-1){ | |
2865 my $concat = $phase.$phase.$phase.$phase; | |
2866 my @concatarr = split(/\s*/, $concat); | |
2867 my $l = 0; | |
2868 while ($l < $u){ | |
2869 shift @concatarr; | |
2870 $l++; | |
2871 } | |
2872 $concat = join ("", @concatarr); | |
2873 | |
2874 for my $t (0 ... $motifsize-1){ | |
2875 for my $k (1 ... $motifsize-1){ | |
2876 push @phase_prefixes, substr($concat, $motifsize+$t, $k); | |
2877 push @prephase_left_contexts, substr ($concat, $t, $motifsize); | |
2878 push @prephase_right_contexts, substr ($concat, $motifsize+$t+$k+($motifsize-$k), 1); | |
2879 push @pregapsize, $k; | |
2880 push @prepostfilins, substr($concat, $motifsize+$t+$k, ($motifsize-$k)); | |
2881 # print "reading: $concat, t=$t, k=$k prefix: $prephase_left_contexts[$#prephase_left_contexts] $phase_prefixes[$#phase_prefixes] -x$pregapsize[$#pregapsize] $prephase_right_contexts[$#prephase_right_contexts]\n"; | |
2882 # print "phase_prefixes = $phase_prefixes[$#phase_prefixes]\n"; | |
2883 # print "prephase_left_contexts = $prephase_left_contexts[$#prephase_left_contexts]\n"; | |
2884 # print "prephase_right_contexts = $prephase_right_contexts[$#prephase_right_contexts]\n"; | |
2885 # print "pregapsize = $pregapsize[$#pregapsize]\n"; | |
2886 # print "prepostfilins = $prepostfilins[$#prepostfilins]\n"; | |
2887 } | |
2888 } | |
2889 } | |
2890 | |
2891 # print "looking if $micro =~ /($phase\-{$motifsize})/i || $micro =~ /^(\-{$motifsize,}$phase)/i\n"; | |
2892 if ($micro =~ /($phase\-{$motifsize,})$/i || $micro =~ /^(\-{$motifsize,}$phase)/i){ | |
2893 # print "micro: $micro needs further gap removal: $1\n"; | |
2894 while ($micro =~ /$phase(\-{$motifsize,})$/i || $micro =~ /^(\-{$motifsize,})$phase/i){ | |
2895 # print "micro: $micro needs further gap removal: $1\n"; | |
2896 | |
2897 # print "phase being considered = $phase\n"; | |
2898 my $num = (); | |
2899 $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi if $micro =~ /$phase\-{$motifsize,}/i; | |
2900 $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi if $micro =~ /\-{$motifsize,}$phase/i; | |
2901 # print "num = $num\n"; | |
2902 $change = 1 if $num == 1; | |
2903 } | |
2904 } | |
2905 | |
2906 elsif ($micro =~ /(($phase)+)\-{$motifsize,}(($phase)+)/i){ | |
2907 while ($micro =~ /(($phase)+)\-{$motifsize,}(($phase)+)/i){ | |
2908 # print "checking lengths of $1 and $3 for $micro... \n"; | |
2909 my $num = (); | |
2910 if (length($1) >= length($3)){ | |
2911 # print "$micro matches (($phase)+)\-{$motifsize,}(($phase)+) = $1, >= , $3 \n"; | |
2912 $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi ; | |
2913 } | |
2914 if (length($1) < length($3)){ | |
2915 # print "$micro matches (($phase)+)\-{$motifsize,}(($phase)+) = $1, < , $3 \n"; | |
2916 $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi ; | |
2917 } | |
2918 # print "micro changed to $micro\n"; | |
2919 } | |
2920 } | |
2921 elsif ($micro =~ /([A-Z]+)\-{$motifsize,}(($phase)+)/i){ | |
2922 while ($micro =~ /([A-Z]+)\-{$motifsize,}(($phase)+)/i){ | |
2923 # print "$micro matches ([A-Z]+)\-{$motifsize}(($phase)+) = 1=$1, - , 3=$3 \n"; | |
2924 my $num = 0; | |
2925 $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi ; | |
2926 } | |
2927 } | |
2928 elsif ($micro =~ /(($phase)+)\-{$motifsize,}([A-Z]+)/i){ | |
2929 while ($micro =~ /(($phase)+)\-{$motifsize,}([A-Z]+)/i){ | |
2930 # print "$micro matches (($phase)+)\-{$motifsize,}([A-Z]+) = 1=$1, - , 3=$3 \n"; | |
2931 my $num = 0; | |
2932 $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi ; | |
2933 } | |
2934 } | |
2935 | |
2936 # print "$orig_micro to $micro\n"; | |
2937 | |
2938 #s <STDIN>; | |
2939 | |
2940 for my $h (0 ... $#phase_prefixes){ | |
2941 # print "searching using prefix : $prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]\n"; | |
2942 my $pattern = $prephase_left_contexts[$h].$phase_prefixes[$h].$pregapsize[$h].$prephase_right_contexts[$h]; | |
2943 # print "returning orig_micro = $orig_micro, micro = $micro \n" if exists $tested_patterns{$pattern}; | |
2944 if ($micro =~ /$prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]/i){ | |
2945 return $orig_micro if exists $tested_patterns{$pattern}; | |
2946 while ($micro =~ /($prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h])/i){ | |
2947 $tested_patterns{$pattern} = $pattern; | |
2948 # print "micro: $micro needs further gap removal: $1\n"; | |
2949 | |
2950 # print "prefix being considered = $phase_prefixes[$h]\n"; | |
2951 my $num = (); | |
2952 $num = ($micro =~ s/$prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]/$prephase_left_contexts[$h]$phase_prefixes[$h]$prepostfilins[$h]$prephase_right_contexts[$h]/gi) ; | |
2953 # print "num = $num, micro = $micro\n"; | |
2954 $change = 1 if $num == 1; | |
2955 | |
2956 return $orig_micro if $num > 1; | |
2957 } | |
2958 } | |
2959 | |
2960 } | |
2961 } | |
2962 return $orig_micro if length($micro) != length($orig_micro); | |
2963 return $micro; | |
2964 } | |
2965 | |
2966 sub selectMutationArray{ | |
2967 my $printer =0; | |
2968 | |
2969 my $oldmutspt = $_[0]; | |
2970 my $newmutspt = $_[1]; | |
2971 my $tagstringpt = $_[2]; | |
2972 my $alivehashpt = $_[3]; | |
2973 my $alignmentpt = $_[4]; | |
2974 my $motif = $_[5]; | |
2975 | |
2976 my @alivehasharr=(); | |
2977 | |
2978 my @tags = @$tagstringpt; | |
2979 my $alignmentln = length($alignmentpt->{$tags[0]}); | |
2980 | |
2981 foreach my $key (keys %$alivehashpt) { push @alivehasharr, $key; } | |
2982 | |
2983 my %newside = (); | |
2984 my %oldside = (); | |
2985 my %newmuts = (); | |
2986 | |
2987 my %commons = (); | |
2988 my %olds = (); | |
2989 foreach my $old (@$oldmutspt){ | |
2990 $olds{$old} = 1; | |
2991 } | |
2992 foreach my $new (@$newmutspt){ | |
2993 $commons{$new} = 1 if exists $olds{$new};; | |
2994 } | |
2995 | |
2996 | |
2997 foreach my $pos ( 0 ... $alignmentln){ | |
2998 #print "pos = $pos\n" if $printer == 1; | |
2999 my $newyes = 0; | |
3000 foreach my $mut (@$newmutspt){ | |
3001 $newmuts{$mut} = 1; | |
3002 chomp $mut; | |
3003 $newyes++; | |
3004 $mut =~ s/=\t/= \t/g; | |
3005 $mut =~ s/=$/= /g; | |
3006 | |
3007 $mut =~ /node=([A-Z\(\), ]+)\stype=([a-zA-Z ]+)\sposition=([0-9 ]+)\sfrom=([a-zA-Z\- ]+)\sto=([a-zA-Z\- ]+)\sinsertion=([a-zA-Z\- ]+)\sdeletion=([a-zA-Z\- ]+)/; | |
3008 my $node = $1; | |
3009 next if $3 != $pos; | |
3010 # print "new mut = $mut\n" if $printer == 1; | |
3011 # print "node = $node, pos = $3 ... and alivehasharr = >@alivehasharr<\n" if $printer == 1; | |
3012 my $alivenode = 0; | |
3013 foreach my $key (@alivehasharr){ | |
3014 $alivenode = 1 if $key =~ /$node/; | |
3015 } | |
3016 # next if $alivenode == 0; | |
3017 my $indel_type = " "; | |
3018 if ($2 eq "insertion" || $2 eq "deletion"){ | |
3019 my $thisindel = (); | |
3020 $thisindel = $6 if $2 eq "insertion"; | |
3021 $thisindel = $7 if $2 eq "deletion"; | |
3022 | |
3023 $indel_type = "i".checkIndelType($node, $thisindel, $motif,$alignmentpt,$3, $2) if $2 eq "insertion"; | |
3024 $indel_type = "d".checkIndelType($node, $thisindel, $motif,$alignmentpt, $3, $2) if $2 eq "deletion"; | |
3025 $indel_type = $indel_type."f" if $indel_type =~ /mot/ && length($thisindel) >= length($motif); | |
3026 } | |
3027 # print "indeltype = $indel_type\n" if $printer == 1; | |
3028 my $added = 0; | |
3029 | |
3030 if (exists $newside{$pos} && $indel_type =~ /[a-z]+/){ | |
3031 # print "we have a preexisting one for $pos\n" if $printer == 1; | |
3032 my @preexisting = @{$newside{$pos}}; | |
3033 foreach my $pre (@preexisting){ | |
3034 # print "looking at $pre\n" if $printer == 1; | |
3035 next if $pre !~ /node=$node/; | |
3036 next if $pre !~ /indeltype=([a-z]+)/; | |
3037 my $currtype = $1; | |
3038 | |
3039 if ($currtype =~ /inon/ && $indel_type =~ /dmot/){ | |
3040 delete $newside{$pos}; | |
3041 push @{$newside{$pos}}, $pre; | |
3042 $added = 1; | |
3043 } | |
3044 if ($currtype =~ /dnon/ && $indel_type =~ /imot/){ | |
3045 delete $newside{$pos}; | |
3046 push @{$newside{$pos}}, $pre; | |
3047 $added = 1; | |
3048 } | |
3049 if ($currtype =~ /dmot/ && $indel_type =~ /inon/){ | |
3050 delete $newside{$pos}; | |
3051 push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; | |
3052 $added = 1; | |
3053 } | |
3054 if ($currtype =~ /imot/ && $indel_type =~ /dnon/){ | |
3055 delete $newside{$pos}; | |
3056 push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; | |
3057 $added = 1; | |
3058 } | |
3059 } | |
3060 } | |
3061 # print "added = $added\n" if $printer == 1; | |
3062 push @{$newside{$pos}}, $mut."\tindeltype=$indel_type" if $added == 0; | |
3063 # print "for new pos,: $pos we have: @{$newside{$pos}}\n " if $printer == 1; | |
3064 } | |
3065 } | |
3066 | |
3067 foreach my $pos ( 0 ... $alignmentln){ | |
3068 my $oldyes = 0; | |
3069 foreach my $mut (@$oldmutspt){ | |
3070 chomp $mut; | |
3071 $oldyes++; | |
3072 $mut =~ s/=\t/= \t/g; | |
3073 $mut =~ s/=$/= /g; | |
3074 $mut =~ /node=([A-Z\(\), ]+)\ttype=([a-zA-Z ]+)\tposition=([0-9 ]+)\tfrom=([a-zA-Z\- ]+)\tto=([a-zA-Z\- ]+)\tinsertion=([a-zA-Z\- ]+)\tdeletion=([a-zA-Z\- ]+)/; | |
3075 my $node = $1; | |
3076 next if $3 != $pos; | |
3077 # print "old mut = $mut\n" if $printer == 1; | |
3078 my $alivenode = 0; | |
3079 foreach my $key (@alivehasharr){ | |
3080 $alivenode = 1 if $key =~ /$node/; | |
3081 } | |
3082 #next if $alivenode == 0; | |
3083 my $indel_type = " "; | |
3084 if ($2 eq "insertion" || $2 eq "deletion"){ | |
3085 $indel_type = "i".checkIndelType($node, $6, $motif,$alignmentpt, $3, $2) if $2 eq "insertion"; | |
3086 $indel_type = "d".checkIndelType($node, $7, $motif,$alignmentpt, $3, $2) if $2 eq "deletion"; | |
3087 next if $indel_type =~/non/; | |
3088 } | |
3089 else{ next;} | |
3090 | |
3091 my $imp=0; | |
3092 $imp = 1 if $indel_type =~ /dmot/ && $alivenode == 0; | |
3093 $imp = 1 if $indel_type =~ /imot/ && $alivenode == 1; | |
3094 | |
3095 | |
3096 if (exists $newside{$pos} && $indel_type =~ /[a-z]+/){ | |
3097 my @preexisting = @{$newside{$pos}}; | |
3098 # print "we have a preexisting one for $pos: @preexisting\n" if $printer == 1; | |
3099 next if $imp == 0; | |
3100 | |
3101 if (scalar(@preexisting) == 1){ | |
3102 my $foundmut = $preexisting[0]; | |
3103 $foundmut=~ /node=([A-Z, \(\)]+)/; | |
3104 next if $1 eq $node; | |
3105 | |
3106 if (exists $oldside{$pos} || exists $commons{$foundmut}){ | |
3107 # print "not replacing, but just adding\n" if $printer == 1; | |
3108 push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; | |
3109 push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type"; | |
3110 next; | |
3111 } | |
3112 | |
3113 delete $newside{$pos}; | |
3114 push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type"; | |
3115 push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; | |
3116 # print "now new one is : @{$newside{$pos}}\n" if $printer == 1; | |
3117 } | |
3118 # print "for pos: $pos: @{$newside{$pos}}\n" if $printer == 1; | |
3119 next; | |
3120 } | |
3121 | |
3122 | |
3123 my @news = @{$newside{$pos}} if exists $newside{$pos}; | |
3124 # print "mut = $mut and news = @news\n" if $printer == 1; | |
3125 push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type"; | |
3126 push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; | |
3127 } | |
3128 } | |
3129 # print "in the end, our collected mutations = \n" if $printer == 1; | |
3130 my @returnarr = (); | |
3131 foreach my $key (keys %newside) {push @returnarr,@{$newside{$key}};} | |
3132 # print join("\n", @returnarr),"\n" if $printer == 1; | |
3133 #<STDIN>; | |
3134 return @returnarr; | |
3135 | |
3136 } | |
3137 | |
3138 | |
3139 sub checkIndelType{ | |
3140 my $printer = 0; | |
3141 my $node = $_[0]; | |
3142 my $indel = $_[1]; | |
3143 my $motif = $_[2]; | |
3144 my $alignmentpt = $_[3]; | |
3145 my $posit = $_[4]; | |
3146 my $type = $_[5]; | |
3147 my @phases =(); | |
3148 my %prephases = (); | |
3149 my %postphases = (); | |
3150 #print "motif = $motif\n"; | |
3151 # print "IN checkIndelType ... received: @_\n" if $printer == 1; | |
3152 my $concat = $motif.$motif.$motif.$motif; | |
3153 my $motiflength = length($motif); | |
3154 | |
3155 if ($motiflength > length ($indel)){ | |
3156 return "non" if $motif !~ /$indel/i; | |
3157 return checkIndelType_ComplexAnalysis($node, $indel, $motif, $alignmentpt, $posit, $type); | |
3158 } | |
3159 | |
3160 my $firstpass = 0; | |
3161 for my $y (0 ... $motiflength-1){ | |
3162 my $phase = substr($concat, $motiflength+$y, $motiflength); | |
3163 push @phases, $phase; | |
3164 $firstpass = 1 if $indel =~ /$phase/i; | |
3165 for my $k (0 ... length($motif)-1){ | |
3166 # print "at: motiflength=$motiflength , y=$y , k=$k.. for pre: $motiflength+$y-$k and post: $motiflength+$y-$k+$motiflength in $concat\n" if $printer == 1; | |
3167 my $pre = substr($concat, $motiflength+$y-$k, $k ); | |
3168 my $post = substr($concat, $motiflength+$y+$motiflength, $k); | |
3169 # print "adding to phases : $phase - $pre and $post\n" if $printer == 1; | |
3170 push @{$prephases{$phase}} , $pre; | |
3171 push @{$postphases{$phase}} , $post; | |
3172 } | |
3173 | |
3174 } | |
3175 # print "firstpass 1= $firstpass\n" if $printer == 1; | |
3176 return "non" if $firstpass ==0; | |
3177 $firstpass =0; | |
3178 | |
3179 foreach my $phase (@phases){ | |
3180 my @pres = @{$prephases{$phase}}; | |
3181 my @posts = @{$postphases{$phase}}; | |
3182 | |
3183 foreach my $pre (@pres){ | |
3184 foreach my $post (@posts){ | |
3185 | |
3186 $firstpass = 1 if $indel =~ /($pre)?($phase)+($post)?/i && length($indel) > (3 * length($motif)); | |
3187 $firstpass = 1 if $indel =~ /^($pre)?($phase)+($post)?$/i && length($indel) < (3 * length($motif)); | |
3188 # print "matched here : ($pre)?($phase)+($post)?\n" if $printer == 1; | |
3189 last if $firstpass == 1; | |
3190 } | |
3191 last if $firstpass == 1; | |
3192 } | |
3193 last if $firstpass == 1; | |
3194 } | |
3195 # print "firstpass 2= $firstpass\n" if $printer == 1; | |
3196 return "non" if $firstpass ==0; | |
3197 return "mot" if $firstpass ==1; | |
3198 } | |
3199 | |
3200 | |
3201 sub checkIndelType_ComplexAnalysis{ | |
3202 my $printer = 0; | |
3203 my $node = $_[0]; | |
3204 my $indel = $_[1]; | |
3205 my $motif = $_[2]; | |
3206 my $alignmentpt = $_[3]; | |
3207 my $pos = $_[4]; | |
3208 my $type = $_[5]; | |
3209 my @speciesinvolved = $node =~ /[A-Z]+/g; | |
3210 | |
3211 my @seqs = (); | |
3212 my $residualseq = length($motif) - length($indel); | |
3213 # print "IN COMPLEX ANALYSIS ... received: @_ .... speciesinvolved = @speciesinvolved\n" if $printer == 1; | |
3214 # print "we have position = $pos, sseq = $alignmentpt->{$speciesinvolved[0]}\n" if $printer == 1; | |
3215 # print "residualseq = $residualseq\n" if $printer == 1; | |
3216 # print "pos=$pos... got: @_\n" if $printer == 1; | |
3217 foreach my $sp (@speciesinvolved){ | |
3218 my $spseq = $alignmentpt->{$sp}; | |
3219 #print "orig spseq = $spseq\n"; | |
3220 my $subseq = (); | |
3221 | |
3222 if ($type eq "deletion"){ | |
3223 my @indelparts = split(/\s*/,$indel); | |
3224 my @seqparts = split(/\s*/,$spseq); | |
3225 | |
3226 for my $p ($pos ... $pos+length($indel)-1){ | |
3227 $seqparts[$p] = shift @indelparts; | |
3228 } | |
3229 $spseq = join("",@seqparts); | |
3230 } | |
3231 #print "mod spseq = $spseq\n"; | |
3232 # $spseq=~ s/\-//g if $type !~ /deletion/; | |
3233 # print "substr($spseq, $pos-($residualseq), length($indel)+$residualseq+$residualseq)\n" if $pos > 0 && $pos < (length($spseq) - length($motif)) && $printer == 1; | |
3234 # print "substr($spseq, 0, length($indel)+$residualseq)\n" if $pos == 0 && $printer == 1; | |
3235 # print "substr($spseq, $pos - $residualseq, length($indel)+$residualseq)\n" if $pos >= (length($spseq) - length($motif)) && $printer == 1; | |
3236 | |
3237 $subseq = substr($spseq, $pos-($residualseq), length($indel)+$residualseq+$residualseq) if $pos > 0 && $pos < (length($spseq) - length($motif)) ; | |
3238 $subseq = substr($spseq, 0, length($indel)+$residualseq) if $pos == 0; | |
3239 $subseq = substr($spseq, $pos - $residualseq, length($indel)+$residualseq) if $pos >= (length($spseq) - length($motif)) ; | |
3240 # print "spseq = $spseq . subseq=$subseq . type = $type\n" if $printer == 1; | |
3241 #<STDIN> if $subseq !~ /[a-z\-]/i; | |
3242 $subseq =~ s/\-/$indel/g if $type =~ /insertion/; | |
3243 push @seqs, $subseq; | |
3244 # print "seqs = @seqs\n" if $printer == 1; | |
3245 } | |
3246 return "non" if checkIfSeqsIdentical(@seqs) eq "NO"; | |
3247 # print "checking for $seqs[0] \n" if $printer == 1; | |
3248 | |
3249 my @phases =(); | |
3250 my %prephases = (); | |
3251 my %postphases = (); | |
3252 my $concat = $motif.$motif.$motif.$motif; | |
3253 my $motiflength = length($motif); | |
3254 | |
3255 my $firstpass = 0; | |
3256 | |
3257 for my $y (0 ... $motiflength-1){ | |
3258 my $phase = substr($concat, $motiflength+$y, $motiflength); | |
3259 push @phases, $phase; | |
3260 $firstpass = 1 if $seqs[0] =~ /$phase/i; | |
3261 for my $k (0 ... length($motif)-1){ | |
3262 my $pre = substr($concat, $motiflength+$y-$k, $k ); | |
3263 my $post = substr($concat, $motiflength+$y+$motiflength, $k); | |
3264 # print "adding to phases : $phase - $pre and $post\n" if $printer == 1; | |
3265 push @{$prephases{$phase}} , $pre; | |
3266 push @{$postphases{$phase}} , $post; | |
3267 } | |
3268 | |
3269 } | |
3270 # print "firstpass 1= $firstpass.. also, res-d = ",(length($seqs[0]))%(length($motif)),"\n" if $printer == 1; | |
3271 return "non" if $firstpass ==0; | |
3272 $firstpass =0; | |
3273 foreach my $phase (@phases){ | |
3274 | |
3275 $firstpass = 1 if $seqs[0] =~ /^($phase)+$/i && ((length($seqs[0]))%(length($motif))) == 0; | |
3276 | |
3277 if (((length($seqs[0]))%(length($motif))) != 0){ | |
3278 my @pres = @{$prephases{$phase}}; | |
3279 my @posts = @{$postphases{$phase}}; | |
3280 foreach my $pre (@pres){ | |
3281 foreach my $post (@posts){ | |
3282 next if $pre !~ /\S/ && $post !~ /\S/; | |
3283 $firstpass = 1 if ($seqs[0] =~ /^($pre)($phase)+($post)$/i || $seqs[0] =~ /^($pre)($phase)+$/i || $seqs[0] =~ /^($phase)+($post)$/i); | |
3284 # print "caught with $pre $phase $post\n" if $printer == 1; | |
3285 last if $firstpass == 1; | |
3286 } | |
3287 last if $firstpass == 1; | |
3288 } | |
3289 } | |
3290 | |
3291 last if $firstpass == 1; | |
3292 } | |
3293 | |
3294 #print "indel = $indel.. motif = $motif.. firstpass 2= mot\n" if $firstpass ==1; | |
3295 #print "indel = $indel.. motif = $motif.. firstpass 2= non\n" if $firstpass ==0; | |
3296 #<STDIN>;# if $firstpass ==1; | |
3297 return "non" if $firstpass ==0; | |
3298 return "mot" if $firstpass ==1; | |
3299 | |
3300 } | |
3301 | |
3302 sub checkIfSeqsIdentical{ | |
3303 my @seqs = @_; | |
3304 my $identical = 1; | |
3305 | |
3306 for my $j (1 ... $#seqs){ | |
3307 $identical = 0 if uc($seqs[0]) ne uc($seqs[$j]); | |
3308 } | |
3309 return "NO" if $identical == 0; | |
3310 return "YES" if $identical == 1; | |
3311 | |
3312 } | |
3313 | |
3314 sub summarizeMutations{ | |
3315 my $mutspt = $_[0]; | |
3316 my @muts = @$mutspt; | |
3317 my $tree = $_[1]; | |
3318 | |
3319 my @returnarr = (); | |
3320 | |
3321 for (1 ... 38){ | |
3322 push @returnarr, "NA"; | |
3323 } | |
3324 push @returnarr, "NULL"; | |
3325 return @returnarr if $tree eq "NULL" || scalar(@muts) < 1; | |
3326 | |
3327 | |
3328 my @bspecies = (); | |
3329 my @dspecies = (); | |
3330 my $treecopy = $tree; | |
3331 $treecopy =~ s/[\(\)]//g; | |
3332 my @treeparts = split(/[\.,]+/, $treecopy); | |
3333 | |
3334 for my $part (@treeparts){ | |
3335 if ($part =~ /\+/){ | |
3336 $part =~ s/\+//g; | |
3337 #my @sp = split(/\s*/, $part); | |
3338 #foreach my $p (@sp) {push @bspecies, $p;} | |
3339 push @bspecies, $part; | |
3340 } | |
3341 if ($part =~ /\-/){ | |
3342 $part =~ s/\-//g; | |
3343 #my @sp = split(/\s*/, $part); | |
3344 #foreach my $p (@sp) {push @dspecies, $p;} | |
3345 push @dspecies, $part; | |
3346 } | |
3347 | |
3348 } | |
3349 #print "-------------------------------------------------------\n"; | |
3350 | |
3351 my ($insertions, $deletions, $motinsertions, $motinsertionsf, $motdeletions, $motdeletionsf, $noninsertions, $nondeletions) = (0,0,0,0,0,0,0,0); | |
3352 my ($binsertions, $bdeletions, $bmotinsertions,$bmotinsertionsf, $bmotdeletions, $bmotdeletionsf, $bnoninsertions, $bnondeletions) = (0,0,0,0,0,0,0,0); | |
3353 my ($dinsertions, $ddeletions, $dmotinsertions,$dmotinsertionsf, $dmotdeletions, $dmotdeletionsf, $dnoninsertions, $dnondeletions) = (0,0,0,0,0,0,0,0); | |
3354 my ($ninsertions, $ndeletions, $nmotinsertions,$nmotinsertionsf, $nmotdeletions, $nmotdeletionsf, $nnoninsertions, $nnondeletions) = (0,0,0,0,0,0,0,0); | |
3355 my ($substitutions, $bsubstitutions, $dsubstitutions, $nsubstitutions, $indels, $subs) = (0,0,0,0,"NA","NA"); | |
3356 | |
3357 my @insertionsarr = (" "); | |
3358 my @deletionsarr = (" "); | |
3359 | |
3360 my @substitutionsarr = (" "); | |
3361 | |
3362 | |
3363 foreach my $mut (@muts){ | |
3364 # print "mut = $mut\n"; | |
3365 chomp $mut; | |
3366 $mut =~ s/=\t/= /g; | |
3367 $mut =~ s/=$/= /g; | |
3368 my %mhash = (); | |
3369 my @mields = split(/\t/,$mut); | |
3370 | |
3371 foreach my $m (@mields){ | |
3372 my @fields = split(/=/,$m); | |
3373 next if $fields[1] eq " "; | |
3374 $mhash{$fields[0]} = $fields[1]; | |
3375 } | |
3376 | |
3377 my $myutype = (); | |
3378 my $decided = 0; | |
3379 | |
3380 my $localnode = $mhash{"node"}; | |
3381 $localnode =~ s/[\(\)\. ,]//g; | |
3382 | |
3383 | |
3384 foreach my $s (@bspecies){ | |
3385 if ($localnode eq $s) { | |
3386 $decided = 1; $myutype = "b"; | |
3387 } | |
3388 } | |
3389 | |
3390 foreach my $s (@dspecies){ | |
3391 if ($localnode eq $s) { | |
3392 $decided = 1; $myutype = "d"; | |
3393 } | |
3394 } | |
3395 | |
3396 $myutype = "n" if $decided != 1; | |
3397 | |
3398 | |
3399 # print "tree=$tree, birth species=@bspecies, death species=@dspecies, node=$mhash{node} .. myutype=$myutype .. \n"; | |
3400 # <STDIN> if $mhash{"type"} eq "insertion" && $myutype eq "b"; | |
3401 | |
3402 | |
3403 if ($mhash{"type"} eq "substitution"){ | |
3404 $substitutions++; | |
3405 $bsubstitutions++ if $myutype eq "b"; | |
3406 $dsubstitutions++ if $myutype eq "d"; | |
3407 $nsubstitutions++ if $myutype eq "n"; | |
3408 # print "substitution: from= $mhash{from}, to = $mhash{to}, and type = myutype\n"; | |
3409 push @substitutionsarr, "$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "b"; | |
3410 push @substitutionsarr, "$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "d"; | |
3411 push @substitutionsarr, "$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "n"; | |
3412 # print "substitutionsarr = @substitutionsarr\n"; | |
3413 # <STDIN>; | |
3414 } | |
3415 else{ | |
3416 #print "tree=$tree, birth species=@bspecies, death species=@dspecies, node=$mhash{node} .. myutype=$myutype .. indeltype=$mhash{indeltype}\n"; | |
3417 if ($mhash{"type"} eq "deletion"){ | |
3418 $deletions++; | |
3419 | |
3420 $motdeletions++ if $mhash{"indeltype"} =~ /dmot/; | |
3421 $motdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/; | |
3422 | |
3423 $nondeletions++ if $mhash{"indeltype"} =~ /dnon/; | |
3424 | |
3425 $bdeletions++ if $myutype eq "b"; | |
3426 $ddeletions++ if $myutype eq "d"; | |
3427 $ndeletions++ if $myutype eq "n"; | |
3428 | |
3429 $bmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "b"; | |
3430 $bmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "b"; | |
3431 $bnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "b"; | |
3432 | |
3433 $dmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "d"; | |
3434 $dmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "d"; | |
3435 $dnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "d"; | |
3436 | |
3437 $nmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "n"; | |
3438 $nmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "n"; | |
3439 $nnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "n"; | |
3440 | |
3441 push @deletionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "b"; | |
3442 push @deletionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "d"; | |
3443 push @deletionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "n"; | |
3444 } | |
3445 | |
3446 if ($mhash{"type"} eq "insertion"){ | |
3447 $insertions++; | |
3448 | |
3449 $motinsertions++ if $mhash{"indeltype"} =~ /imot/; | |
3450 $motinsertionsf++ if $mhash{"indeltype"} =~ /imotf/; | |
3451 $noninsertions++ if $mhash{"indeltype"} =~ /inon/; | |
3452 | |
3453 $binsertions++ if $myutype eq "b"; | |
3454 $dinsertions++ if $myutype eq "d"; | |
3455 $ninsertions++ if $myutype eq "n"; | |
3456 | |
3457 $bmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "b"; | |
3458 $bmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "b"; | |
3459 $bnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "b"; | |
3460 | |
3461 $dmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "d"; | |
3462 $dmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "d"; | |
3463 $dnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "d"; | |
3464 | |
3465 $nmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "n"; | |
3466 $nmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "n"; | |
3467 $nnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "n"; | |
3468 | |
3469 push @insertionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "b"; | |
3470 push @insertionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "d"; | |
3471 push @insertionsarr, "$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "n"; | |
3472 | |
3473 } | |
3474 } | |
3475 } | |
3476 | |
3477 | |
3478 | |
3479 $indels = "ins=".join(",",@insertionsarr).";dels=".join(",",@deletionsarr) if scalar(@insertionsarr) > 1 || scalar(@deletionsarr) > 1 ; | |
3480 $subs = join(",",@substitutionsarr) if scalar(@substitutionsarr) > 1; | |
3481 $indels =~ s/ //g; | |
3482 $subs =~ s/ //g ; | |
3483 | |
3484 #print "indels = $indels, subs=$subs\n"; | |
3485 ##<STDIN> if $indels =~ /[a-zA-Z0-9]/ || $subs =~ /[a-zA-Z0-9]/ ; | |
3486 #print "tree = $tree, indels = $indels, subs = $subs, bspecies = @bspecies, dspecies = @dspecies \n"; | |
3487 my @returnarray = (); | |
3488 | |
3489 push (@returnarray, $indels, $subs) ; | |
3490 | |
3491 push @returnarray, $tree; | |
3492 | |
3493 my @copy = @returnarray; | |
3494 #print "\n\nreturnarray = @returnarray ... binsertions=$binsertions dinsertions=$dinsertions bsubstitutions=$bsubstitutions dsubstitutions=$dsubstitutions\n"; | |
3495 #<STDIN>; | |
3496 return (@returnarray); | |
3497 | |
3498 } | |
3499 | |
3500 sub selectBetterTree{ | |
3501 my $printer = 1; | |
3502 my $treestudy = $_[0]; | |
3503 my $alt = $_[1]; | |
3504 my $mutspt = $_[2]; | |
3505 my @muts = @$mutspt; | |
3506 my @trees = (); my @alternatetrees=(); | |
3507 | |
3508 @trees = split(/\|/,$treestudy) if $treestudy =~ /\|/; | |
3509 @alternatetrees = split(/[\|;]/,$alt) if $alt =~ /[\|;\(\)]/; | |
3510 | |
3511 $trees[0] = $treestudy if $treestudy !~ /\|/; | |
3512 $alternatetrees[0] = $alt if $alt !~ /[\|;\(\)]/; | |
3513 | |
3514 my @alltrees = (@trees, @alternatetrees); | |
3515 # push(@alltrees,@alternatetrees); | |
3516 | |
3517 my %mutspecies = (); | |
3518 # print "IN selectBetterTree..treestudy=$treestudy. alt=$alt. for: @_. trees=@trees<. alternatetrees=@alternatetrees\n" if $printer == 1; | |
3519 #<STDIN>; | |
3520 foreach my $mut (@muts){ | |
3521 # print colored ['green'],"mut = $mut\n" if $printer == 1; | |
3522 $mut =~ /node=([A-Z,\(\) ]+)/; | |
3523 my $node = $1; | |
3524 $node =~s/[,\(\) ]+//g; | |
3525 my @indivspecies = $node =~ /[A-Z]+/g; | |
3526 #print "adding node: $node\n" if $printer == 1; | |
3527 $mutspecies{$node} = $node; | |
3528 | |
3529 } | |
3530 | |
3531 my @treerecords = (); | |
3532 my $treecount = -1; | |
3533 foreach my $tree (@alltrees){ | |
3534 # print "checking with tree $tree\n" if $printer == 1; | |
3535 $treecount++; | |
3536 $treerecords[$treecount] = 0; | |
3537 my @indivspecies = ($tree =~ /[A-Z]+/g); | |
3538 # print "indivspecies=@indivspecies\n" if $printer == 1; | |
3539 foreach my $species (@indivspecies){ | |
3540 # print "checkin if exists species: $species\n" if $printer == 1; | |
3541 $treerecords[$treecount]+=2 if exists $mutspecies{$species} && $mutspecies{$species} !~ /indeltype=[a-z]mot/; | |
3542 $treerecords[$treecount]+=1.5 if exists $mutspecies{$species} && $mutspecies{$species} =~ /indeltype=[a-z]mot/; | |
3543 $treerecords[$treecount]-- if !exists $mutspecies{$species}; | |
3544 } | |
3545 # print "for tree $tree, our treecount = $treerecords[$treecount]\n" if $printer == 1; | |
3546 } | |
3547 | |
3548 my @best_tree = array_largest_number_arrayPosition(@treerecords); | |
3549 # print "treerecords = @treerecords. hence, best tree = @best_tree = $alltrees[$best_tree[0]], $treerecords[$best_tree[0]]\n" if $printer == 1; | |
3550 #<STDIN>; | |
3551 return ($alltrees[$best_tree[0]], $treerecords[$best_tree[0]]) if scalar(@best_tree) == 1; | |
3552 # print "best_tree[0] = $best_tree[0], and treerecords = $treerecords[$best_tree[0]]\n" if $printer == 1; | |
3553 return ("NULL", -1) if $treerecords[$best_tree[0]] < 1; | |
3554 my $rando = int(rand($#trees)); | |
3555 return ($alltrees[$rando], $treerecords[$rando]) if scalar(@best_tree) > 1; | |
3556 | |
3557 } | |
3558 | |
3559 | |
3560 | |
3561 | |
3562 sub load_sameHash{ | |
3563 #my $g = %$_[0]; | |
3564 $sameHash{"CAGT"}="AGTC"; | |
3565 $sameHash{"ATGA"}="AATG"; | |
3566 $sameHash{"CAAC"}="AACC"; | |
3567 $sameHash{"GGAA"}="AAGG"; | |
3568 $sameHash{"TAAG"}="AAGT"; | |
3569 $sameHash{"CGAG"}="AGCG"; | |
3570 $sameHash{"TAGG"}="AGGT"; | |
3571 $sameHash{"GCAG"}="AGGC"; | |
3572 $sameHash{"TAGA"}="ATAG"; | |
3573 $sameHash{"TGA"}="ATG"; | |
3574 $sameHash{"CAAG"}="AAGC"; | |
3575 $sameHash{"CTAA"}="AACT"; | |
3576 $sameHash{"CAAT"}="AATC"; | |
3577 $sameHash{"GTAG"}="AGGT"; | |
3578 $sameHash{"GAAG"}="AAGG"; | |
3579 $sameHash{"CGA"}="ACG"; | |
3580 $sameHash{"GTAA"}="AAGT"; | |
3581 $sameHash{"ACAA"}="AAAC"; | |
3582 $sameHash{"GCGG"}="GGGC"; | |
3583 $sameHash{"ATCA"}="AATC"; | |
3584 $sameHash{"TAAC"}="AACT"; | |
3585 $sameHash{"GGCA"}="AGGC"; | |
3586 $sameHash{"TGAG"}="AGTG"; | |
3587 $sameHash{"AACA"}="AAAC"; | |
3588 $sameHash{"GAGC"}="AGCG"; | |
3589 $sameHash{"ACCA"}="AACC"; | |
3590 $sameHash{"TGAA"}="AATG"; | |
3591 $sameHash{"ACA"}="AAC"; | |
3592 $sameHash{"GAAC"}="AACG"; | |
3593 $sameHash{"GCA"}="AGC"; | |
3594 $sameHash{"CCAC"}="ACCC"; | |
3595 $sameHash{"CATA"}="ATAC"; | |
3596 $sameHash{"CAC"}="ACC"; | |
3597 $sameHash{"TACA"}="ATAC"; | |
3598 $sameHash{"GGAC"}="ACGG"; | |
3599 $sameHash{"AGA"}="AAG"; | |
3600 $sameHash{"ATAA"}="AAAT"; | |
3601 $sameHash{"CA"}="AC"; | |
3602 $sameHash{"CCCA"}="ACCC"; | |
3603 $sameHash{"TCAA"}="AATC"; | |
3604 $sameHash{"CAGA"}="AGAC"; | |
3605 $sameHash{"AATA"}="AAAT"; | |
3606 $sameHash{"CCA"}="ACC"; | |
3607 $sameHash{"AGAA"}="AAAG"; | |
3608 $sameHash{"AGTA"}="AAGT"; | |
3609 $sameHash{"GACG"}="ACGG"; | |
3610 $sameHash{"TCAG"}="AGTC"; | |
3611 $sameHash{"ACGA"}="AACG"; | |
3612 $sameHash{"CGCA"}="ACGC"; | |
3613 $sameHash{"GAGT"}="AGTG"; | |
3614 $sameHash{"GA"}="AG"; | |
3615 $sameHash{"TA"}="AT"; | |
3616 $sameHash{"TAA"}="AAT"; | |
3617 $sameHash{"CAG"}="AGC"; | |
3618 $sameHash{"GATA"}="ATAG"; | |
3619 $sameHash{"GTA"}="AGT"; | |
3620 $sameHash{"CCAA"}="AACC"; | |
3621 $sameHash{"TAG"}="AGT"; | |
3622 $sameHash{"CAAA"}="AAAC"; | |
3623 $sameHash{"AAGA"}="AAAG"; | |
3624 $sameHash{"CACG"}="ACGC"; | |
3625 $sameHash{"GTCA"}="AGTC"; | |
3626 $sameHash{"GGA"}="AGG"; | |
3627 $sameHash{"GGAT"}="ATGG"; | |
3628 $sameHash{"CGGG"}="GGGC"; | |
3629 $sameHash{"CGGA"}="ACGG"; | |
3630 $sameHash{"AGGA"}="AAGG"; | |
3631 $sameHash{"TAAA"}="AAAT"; | |
3632 $sameHash{"GAGA"}="AGAG"; | |
3633 $sameHash{"ACTA"}="AACT"; | |
3634 $sameHash{"GCGA"}="AGCG"; | |
3635 $sameHash{"CACA"}="ACAC"; | |
3636 $sameHash{"AGAT"}="ATAG"; | |
3637 $sameHash{"GAGG"}="AGGG"; | |
3638 $sameHash{"CGAC"}="ACCG"; | |
3639 $sameHash{"GGAG"}="AGGG"; | |
3640 $sameHash{"GCCA"}="AGCC"; | |
3641 $sameHash{"CCAG"}="AGCC"; | |
3642 $sameHash{"GAAA"}="AAAG"; | |
3643 $sameHash{"CAGG"}="AGGC"; | |
3644 $sameHash{"GAC"}="ACG"; | |
3645 $sameHash{"CAA"}="AAC"; | |
3646 $sameHash{"GACC"}="ACCG"; | |
3647 $sameHash{"GGCG"}="GGGC"; | |
3648 $sameHash{"GGTA"}="AGGT"; | |
3649 $sameHash{"AGCA"}="AAGC"; | |
3650 $sameHash{"GATG"}="ATGG"; | |
3651 $sameHash{"GTGA"}="AGTG"; | |
3652 $sameHash{"ACAG"}="AGAC"; | |
3653 $sameHash{"CGG"}="GGC"; | |
3654 $sameHash{"ATA"}="AAT"; | |
3655 $sameHash{"GACA"}="AGAC"; | |
3656 $sameHash{"GCAA"}="AAGC"; | |
3657 $sameHash{"CAGC"}="AGCC"; | |
3658 $sameHash{"GGGA"}="AGGG"; | |
3659 $sameHash{"GAG"}="AGG"; | |
3660 $sameHash{"ACAT"}="ATAC"; | |
3661 $sameHash{"GAAT"}="AATG"; | |
3662 $sameHash{"CACC"}="ACCC"; | |
3663 $sameHash{"GAT"}="ATG"; | |
3664 $sameHash{"GCG"}="GGC"; | |
3665 $sameHash{"GCAC"}="ACGC"; | |
3666 $sameHash{"GAA"}="AAG"; | |
3667 $sameHash{"TGGA"}="ATGG"; | |
3668 $sameHash{"CCGA"}="ACCG"; | |
3669 $sameHash{"CGAA"}="AACG"; | |
3670 } | |
3671 | |
3672 | |
3673 | |
3674 sub load_revHash{ | |
3675 $revHash{"CTGA"}="AGTC"; | |
3676 $revHash{"TCTT"}="AAAG"; | |
3677 $revHash{"CTAG"}="AGCT"; | |
3678 $revHash{"GGTG"}="ACCC"; | |
3679 $revHash{"GCC"}="GGC"; | |
3680 $revHash{"GCTT"}="AAGC"; | |
3681 $revHash{"GCGT"}="ACGC"; | |
3682 $revHash{"GTTG"}="AACC"; | |
3683 $revHash{"CTCC"}="AGGG"; | |
3684 $revHash{"ATC"}="ATG"; | |
3685 $revHash{"CGAT"}="ATCG"; | |
3686 $revHash{"TTAA"}="AATT"; | |
3687 $revHash{"GTTC"}="AACG"; | |
3688 $revHash{"CTGC"}="AGGC"; | |
3689 $revHash{"TCGA"}="ATCG"; | |
3690 $revHash{"ATCT"}="ATAG"; | |
3691 $revHash{"GGTT"}="AACC"; | |
3692 $revHash{"CTTA"}="AAGT"; | |
3693 $revHash{"TGGC"}="AGCC"; | |
3694 $revHash{"CCG"}="GGC"; | |
3695 $revHash{"CGGC"}="GGCC"; | |
3696 $revHash{"TTAG"}="AACT"; | |
3697 $revHash{"GTG"}="ACC"; | |
3698 $revHash{"CTTT"}="AAAG"; | |
3699 $revHash{"TGCA"}="ATGC"; | |
3700 $revHash{"CGCT"}="AGCG"; | |
3701 $revHash{"TTCC"}="AAGG"; | |
3702 $revHash{"CT"}="AG"; | |
3703 $revHash{"C"}="G"; | |
3704 $revHash{"CTCT"}="AGAG"; | |
3705 $revHash{"ACTT"}="AAGT"; | |
3706 $revHash{"GGTC"}="ACCG"; | |
3707 $revHash{"ATTC"}="AATG"; | |
3708 $revHash{"GGGT"}="ACCC"; | |
3709 $revHash{"CCTA"}="AGGT"; | |
3710 $revHash{"CGCG"}="GCGC"; | |
3711 $revHash{"GTGT"}="ACAC"; | |
3712 $revHash{"GCCC"}="GGGC"; | |
3713 $revHash{"GTCG"}="ACCG"; | |
3714 $revHash{"TCCC"}="AGGG"; | |
3715 $revHash{"TTCA"}="AATG"; | |
3716 $revHash{"AGTT"}="AACT"; | |
3717 $revHash{"CCCT"}="AGGG"; | |
3718 $revHash{"CCGC"}="GGGC"; | |
3719 $revHash{"CTT"}="AAG"; | |
3720 $revHash{"TTGG"}="AACC"; | |
3721 $revHash{"ATT"}="AAT"; | |
3722 $revHash{"TAGC"}="AGCT"; | |
3723 $revHash{"ACTG"}="AGTC"; | |
3724 $revHash{"TCAC"}="AGTG"; | |
3725 $revHash{"CTGT"}="AGAC"; | |
3726 $revHash{"TGTG"}="ACAC"; | |
3727 $revHash{"ATCC"}="ATGG"; | |
3728 $revHash{"GTGG"}="ACCC"; | |
3729 $revHash{"TGGG"}="ACCC"; | |
3730 $revHash{"TCGG"}="ACCG"; | |
3731 $revHash{"CGGT"}="ACCG"; | |
3732 $revHash{"GCTC"}="AGCG"; | |
3733 $revHash{"TACG"}="ACGT"; | |
3734 $revHash{"GTTT"}="AAAC"; | |
3735 $revHash{"CAT"}="ATG"; | |
3736 $revHash{"CATG"}="ATGC"; | |
3737 $revHash{"GTTA"}="AACT"; | |
3738 $revHash{"CACT"}="AGTG"; | |
3739 $revHash{"TCAT"}="AATG"; | |
3740 $revHash{"TTA"}="AAT"; | |
3741 $revHash{"TGTA"}="ATAC"; | |
3742 $revHash{"TTTC"}="AAAG"; | |
3743 $revHash{"TACT"}="AAGT"; | |
3744 $revHash{"TGTT"}="AAAC"; | |
3745 $revHash{"CTA"}="AGT"; | |
3746 $revHash{"GACT"}="AGTC"; | |
3747 $revHash{"TTGC"}="AAGC"; | |
3748 $revHash{"TTC"}="AAG"; | |
3749 $revHash{"GCT"}="AGC"; | |
3750 $revHash{"GCAT"}="ATGC"; | |
3751 $revHash{"TGGT"}="AACC"; | |
3752 $revHash{"CCT"}="AGG"; | |
3753 $revHash{"CATC"}="ATGG"; | |
3754 $revHash{"CCAT"}="ATGG"; | |
3755 $revHash{"CCCG"}="GGGC"; | |
3756 $revHash{"TGCC"}="AGGC"; | |
3757 $revHash{"TG"}="AC"; | |
3758 $revHash{"TGCT"}="AAGC"; | |
3759 $revHash{"GCCG"}="GGCC"; | |
3760 $revHash{"TCTG"}="AGAC"; | |
3761 $revHash{"TGT"}="AAC"; | |
3762 $revHash{"TTAT"}="AAAT"; | |
3763 $revHash{"TAGT"}="AACT"; | |
3764 $revHash{"TATG"}="ATAC"; | |
3765 $revHash{"TTTA"}="AAAT"; | |
3766 $revHash{"CGTA"}="ACGT"; | |
3767 $revHash{"TA"}="AT"; | |
3768 $revHash{"TGTC"}="AGAC"; | |
3769 $revHash{"CTAT"}="ATAG"; | |
3770 $revHash{"TATA"}="ATAT"; | |
3771 $revHash{"TAC"}="AGT"; | |
3772 $revHash{"TC"}="AG"; | |
3773 $revHash{"CATT"}="AATG"; | |
3774 $revHash{"TCG"}="ACG"; | |
3775 $revHash{"ATTT"}="AAAT"; | |
3776 $revHash{"CGTG"}="ACGC"; | |
3777 $revHash{"CTG"}="AGC"; | |
3778 $revHash{"TCGT"}="AACG"; | |
3779 $revHash{"TCCG"}="ACGG"; | |
3780 $revHash{"GTT"}="AAC"; | |
3781 $revHash{"ATGT"}="ATAC"; | |
3782 $revHash{"CTTG"}="AAGC"; | |
3783 $revHash{"CCTT"}="AAGG"; | |
3784 $revHash{"GATC"}="ATCG"; | |
3785 $revHash{"CTGG"}="AGCC"; | |
3786 $revHash{"TTCT"}="AAAG"; | |
3787 $revHash{"CGTC"}="ACGG"; | |
3788 $revHash{"CG"}="GC"; | |
3789 $revHash{"TATT"}="AAAT"; | |
3790 $revHash{"CTCG"}="AGCG"; | |
3791 $revHash{"TCTC"}="AGAG"; | |
3792 $revHash{"TCCT"}="AAGG"; | |
3793 $revHash{"TGG"}="ACC"; | |
3794 $revHash{"ACTC"}="AGTG"; | |
3795 $revHash{"CTC"}="AGG"; | |
3796 $revHash{"CGC"}="GGC"; | |
3797 $revHash{"TTG"}="AAC"; | |
3798 $revHash{"ACCT"}="AGGT"; | |
3799 $revHash{"TCTA"}="ATAG"; | |
3800 $revHash{"GTAC"}="ACGT"; | |
3801 $revHash{"TTGA"}="AATC"; | |
3802 $revHash{"GTCC"}="ACGG"; | |
3803 $revHash{"GATT"}="AATC"; | |
3804 $revHash{"T"}="A"; | |
3805 $revHash{"CGTT"}="AACG"; | |
3806 $revHash{"GTC"}="ACG"; | |
3807 $revHash{"GCCT"}="AGGC"; | |
3808 $revHash{"TGC"}="AGC"; | |
3809 $revHash{"TTTG"}="AAAC"; | |
3810 $revHash{"GGCT"}="AGCC"; | |
3811 $revHash{"TCA"}="ATG"; | |
3812 $revHash{"GTGC"}="ACGC"; | |
3813 $revHash{"TGAT"}="AATC"; | |
3814 $revHash{"TAT"}="AAT"; | |
3815 $revHash{"CTAC"}="AGGT"; | |
3816 $revHash{"TGCG"}="ACGC"; | |
3817 $revHash{"CTCA"}="AGTG"; | |
3818 $revHash{"CTTC"}="AAGG"; | |
3819 $revHash{"GCTG"}="AGCC"; | |
3820 $revHash{"TATC"}="ATAG"; | |
3821 $revHash{"TAAT"}="AATT"; | |
3822 $revHash{"ACT"}="AGT"; | |
3823 $revHash{"TCGC"}="AGCG"; | |
3824 $revHash{"GGT"}="ACC"; | |
3825 $revHash{"TCC"}="AGG"; | |
3826 $revHash{"TTGT"}="AAAC"; | |
3827 $revHash{"TGAC"}="AGTC"; | |
3828 $revHash{"TTAC"}="AAGT"; | |
3829 $revHash{"CGT"}="ACG"; | |
3830 $revHash{"ATTA"}="AATT"; | |
3831 $revHash{"ATTG"}="AATC"; | |
3832 $revHash{"CCTC"}="AGGG"; | |
3833 $revHash{"CCGG"}="GGCC"; | |
3834 $revHash{"CCGT"}="ACGG"; | |
3835 $revHash{"TCCA"}="ATGG"; | |
3836 $revHash{"CGCC"}="GGGC"; | |
3837 $revHash{"GT"}="AC"; | |
3838 $revHash{"TTCG"}="AACG"; | |
3839 $revHash{"CCTG"}="AGGC"; | |
3840 $revHash{"TCT"}="AAG"; | |
3841 $revHash{"GTAT"}="ATAC"; | |
3842 $revHash{"GTCT"}="AGAC"; | |
3843 $revHash{"GCTA"}="AGCT"; | |
3844 $revHash{"TACC"}="AGGT"; | |
3845 } | |
3846 | |
3847 | |
3848 sub allCaps{ | |
3849 my $motif = $_[0]; | |
3850 $motif =~ s/a/A/g; | |
3851 $motif =~ s/c/C/g; | |
3852 $motif =~ s/t/T/g; | |
3853 $motif =~ s/g/G/g; | |
3854 return $motif; | |
3855 } | |
3856 | |
3857 | |
3858 sub all_caps{ | |
3859 my @strand = split(/\s*/,$_[0]); | |
3860 for my $i (0 ... $#strand){ | |
3861 if ($strand[$i] =~ /c/) {$strand[$i] = "C";next;} | |
3862 if ($strand[$i] =~ /a/) {$strand[$i] = "A";next;} | |
3863 if ($strand[$i] =~ /t/) { $strand[$i] = "T";next;} | |
3864 if ($strand[$i] =~ /g/) {$strand[$i] = "G";next;} | |
3865 } | |
3866 return join("",@strand); | |
3867 } | |
3868 sub array_mean{ | |
3869 return "NA" if scalar(@_) == 0; | |
3870 my $sum = 0; | |
3871 foreach my $val (@_){ | |
3872 $sum = $sum + $val; | |
3873 } | |
3874 return ($sum/scalar(@_)); | |
3875 } | |
3876 sub array_sum{ | |
3877 return "NA" if scalar(@_) == 0; | |
3878 my $sum = 0; | |
3879 foreach my $val (@_){ | |
3880 $sum = $sum + $val; | |
3881 } | |
3882 return ($sum); | |
3883 } | |
3884 | |
3885 sub variance{ | |
3886 return "NA" if scalar(@_) == 0; | |
3887 return 0 if scalar(@_) == 1; | |
3888 my $mean = array_mean(@_); | |
3889 my $num = 0; | |
3890 return 0 if scalar(@_) == 1; | |
3891 # print "mean = $mean .. array = >@_<\n"; | |
3892 foreach my $ele (@_){ | |
3893 # print "$num = $num + ($ele-$mean)*($ele-$mean)\n"; | |
3894 $num = $num + ($ele-$mean)*($ele-$mean); | |
3895 } | |
3896 my $var = $num / scalar(@_); | |
3897 return $var; | |
3898 } | |
3899 | |
3900 sub array_95confIntervals{ | |
3901 return "NA" if scalar(@_) <= 0; | |
3902 my @sorted = sort { $a <=> $b } @_; | |
3903 # print "@sorted=",scalar(@sorted), "\n"; | |
3904 my $aDeechNo = int((scalar(@sorted) * 2.5) / 100); | |
3905 my $saaDeNo = int((scalar(@sorted) * 97.5) / 100); | |
3906 | |
3907 return ($sorted[$aDeechNo], $sorted[$saaDeNo]); | |
3908 } | |
3909 | |
3910 sub array_median{ | |
3911 return "NA" if scalar(@_) == 0; | |
3912 return $_[0] if scalar(@_) == 1; | |
3913 my @sorted = sort { $a <=> $b } @_; | |
3914 my $totalno = scalar(@sorted); | |
3915 | |
3916 #print "sorted = @sorted\n"; | |
3917 | |
3918 my $pick = (); | |
3919 if ($totalno % 2 == 1){ | |
3920 #print "odd set .. totalno = $totalno\n"; | |
3921 my $mid = $totalno / 2; | |
3922 my $onehalfno = $mid - $mid % 1; | |
3923 my $secondhalfno = $onehalfno + 1; | |
3924 my $onehalf = $sorted[$onehalfno-1]; | |
3925 my $secondhalf = $sorted[$secondhalfno-1]; | |
3926 #print "onehalfno = $onehalfno and secondhalfno = $secondhalfno \n onehalf = $onehalf and secondhalf = $secondhalf\n"; | |
3927 | |
3928 $pick = $secondhalf; | |
3929 } | |
3930 else{ | |
3931 #print "even set .. totalno = $totalno\n"; | |
3932 my $mid = $totalno / 2; | |
3933 my $onehalfno = $mid; | |
3934 my $secondhalfno = $onehalfno + 1; | |
3935 my $onehalf = $sorted[$onehalfno-1]; | |
3936 my $secondhalf = $sorted[$secondhalfno-1]; | |
3937 #print "onehalfno = $onehalfno and secondhalfno = $secondhalfno \n onehalf = $onehalf and secondhalf = $secondhalf\n"; | |
3938 $pick = ($onehalf + $secondhalf )/2; | |
3939 | |
3940 } | |
3941 #print "pick = $pick..\n"; | |
3942 return $pick; | |
3943 | |
3944 } | |
3945 | |
3946 | |
3947 sub array_numerical_sort{ | |
3948 return "NA" if scalar(@_) == 0; | |
3949 my @sorted = sort { $a <=> $b } @_; | |
3950 return (@sorted); | |
3951 } | |
3952 | |
3953 sub array_smallest_number{ | |
3954 return "NA" if scalar(@_) == 0; | |
3955 return $_[0] if scalar(@_) == 1; | |
3956 my @sorted = sort { $a <=> $b } @_; | |
3957 return $sorted[0]; | |
3958 } | |
3959 | |
3960 | |
3961 sub array_largest_number{ | |
3962 return "NA" if scalar(@_) == 0; | |
3963 return $_[0] if scalar(@_) == 1; | |
3964 my @sorted = sort { $a <=> $b } @_; | |
3965 return $sorted[$#sorted]; | |
3966 } | |
3967 | |
3968 | |
3969 sub array_largest_number_arrayPosition{ | |
3970 return "NA" if scalar(@_) == 0; | |
3971 return 0 if scalar(@_) == 1; | |
3972 my $maxpos = 0; | |
3973 my @maxposes = (); | |
3974 my @maxvals = (); | |
3975 my $maxval = array_smallest_number(@_); | |
3976 for my $i (0 ... $#_){ | |
3977 if ($_[$i] > $maxval){ | |
3978 $maxval = $_[$i]; | |
3979 $maxpos = $i; | |
3980 } | |
3981 if ($_[$i] == $maxval){ | |
3982 $maxval = $_[$i]; | |
3983 if (scalar(@maxposes) == 0){ | |
3984 push @maxposes, $i; | |
3985 push @maxvals, $_[$i]; | |
3986 | |
3987 } | |
3988 elsif ($maxvals[0] == $maxval){ | |
3989 push @maxposes, $i; | |
3990 push @maxvals, $_[$i]; | |
3991 } | |
3992 else{ | |
3993 @maxposes = (); @maxvals = (); | |
3994 push @maxposes, $i; | |
3995 push @maxvals, $_[$i]; | |
3996 } | |
3997 | |
3998 } | |
3999 | |
4000 } | |
4001 return $maxpos if scalar(@maxposes) < 2; | |
4002 return (@maxposes); | |
4003 } | |
4004 | |
4005 sub array_smallest_number_arrayPosition{ | |
4006 return "NA" if scalar(@_) == 0; | |
4007 return 0 if scalar(@_) == 1; | |
4008 my $minpos = 0; | |
4009 my @minposes = (); | |
4010 my @minvals = (); | |
4011 my $minval = array_largest_number(@_); | |
4012 my $maxval = array_smallest_number(@_); | |
4013 #print "starting with $maxval, ending with $minval\n"; | |
4014 for my $i (0 ... $#_){ | |
4015 if ($_[$i] < $minval){ | |
4016 $minval = $_[$i]; | |
4017 $minpos = $i; | |
4018 } | |
4019 if ($_[$i] == $minval){ | |
4020 $minval = $_[$i]; | |
4021 if (scalar(@minposes) == 0){ | |
4022 push @minposes, $i; | |
4023 push @minvals, $_[$i]; | |
4024 | |
4025 } | |
4026 elsif ($minvals[0] == $minval){ | |
4027 push @minposes, $i; | |
4028 push @minvals, $_[$i]; | |
4029 } | |
4030 else{ | |
4031 @minposes = (); @minvals = (); | |
4032 push @minposes, $i; | |
4033 push @minvals, $_[$i]; | |
4034 } | |
4035 | |
4036 } | |
4037 | |
4038 } | |
4039 #print "minposes=@minposes\n"; | |
4040 | |
4041 return $minpos if scalar(@minposes) < 2; | |
4042 return (@minposes); | |
4043 } | |
4044 | |
4045 sub basic_stats{ | |
4046 my @arr = @_; | |
4047 # print " array_smallest_number= ", array_smallest_number(@arr)," array_largest_number= ", array_largest_number(@arr), " array_mean= ",array_mean(@arr),"\n"; | |
4048 return ":"; | |
4049 } | |
4050 #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx | |
4051 | |
4052 sub maftoAxt_multispecies { | |
4053 my $printer = 0; | |
4054 #print "in maftoAxt_multispecies : got @_\n"; | |
4055 my $fname=$_[0]; | |
4056 open(IN,"<$_[0]") or die "Cannot open $_[0]: $! \n"; | |
4057 my $treedefinition = $_[1]; | |
4058 #print "treedefinition= $treedefinition\n"; | |
4059 | |
4060 my @treedefinitions = MakeTrees($treedefinition); | |
4061 | |
4062 | |
4063 open(OUT,">$_[2]") or die "Cannot open $_[2]: $! \n"; | |
4064 my $counter = 0; | |
4065 my $exactspeciesset = $_[3]; | |
4066 my @exactspeciesset_unarranged = split(/,/,$exactspeciesset); | |
4067 | |
4068 $treedefinition=~s/[\)\(, ]/\t/g; | |
4069 my @species=split(/\t+/,$treedefinition); | |
4070 @exactspeciesset_unarranged = @species; | |
4071 # print "species=@species\n"; | |
4072 | |
4073 my @exactspeciesarr=(); | |
4074 | |
4075 | |
4076 foreach my $def (@treedefinitions){ | |
4077 $def=~s/[\)\(, ]/\t/g; | |
4078 my @specs = split(/\t+/,$def); | |
4079 my @exactspecies=(); | |
4080 foreach my $spec (@specs){ | |
4081 foreach my $espec (@exactspeciesset_unarranged){ | |
4082 # print "pushing >$spec< nd >$espec<\n" if $spec eq $espec && $espec =~ /[a-zA-Z0-9]/; | |
4083 push @exactspecies, $spec if $spec eq $espec && $espec =~ /[a-zA-Z0-9]/; | |
4084 } | |
4085 | |
4086 } | |
4087 #print "exactspecies = >@exactspecies<\n"; | |
4088 push @exactspeciesarr, [@exactspecies]; | |
4089 } | |
4090 #<STDIN>; | |
4091 #print "exactspeciesarr=@exactspeciesarr\n"; | |
4092 | |
4093 ########### | |
4094 my $select = 2; | |
4095 #select = 1 if all species need sequences to be present for each block otherwise, it is 0 | |
4096 #select = 2 only the allowed set make up the alignment. use the removeset | |
4097 # information to detect alignmenets that have other important genomes aligned. | |
4098 ########### | |
4099 my @allowedset = (); | |
4100 @allowedset = split(/;/,allowedSetOfSpecies(join("_",@species))) if $select == 0; | |
4101 @allowedset = join("_",0,@species) if $select == 1; | |
4102 #print "species = @species , allowedset =",join("\n", @allowedset) ," \n"; | |
4103 | |
4104 | |
4105 foreach my $set (@exactspeciesarr){ | |
4106 my @openset = @$set; | |
4107 push @allowedset, join("_",0,@openset) if $select == 2; | |
4108 # print "openset = >@openset<, allowedset = @allowedset and exactspecies = @exactspecies\n"; | |
4109 } | |
4110 # <STDIN>; | |
4111 my $start = 0; | |
4112 my @sequences = (); | |
4113 my @titles = (); | |
4114 my $species_counter = "0"; | |
4115 my $countermatch = 0; | |
4116 my $outsideSpecies=0; | |
4117 | |
4118 while(my $line = <IN>){ | |
4119 next if $line =~ /^#/; | |
4120 next if $line =~ /^i/; | |
4121 # print "$line .. species = @species\n"; | |
4122 chomp $line; | |
4123 my @fields = split(/\s+/,$line); | |
4124 chomp $line; | |
4125 if ($line =~ /^a /){ | |
4126 $start = 1; | |
4127 } | |
4128 | |
4129 if ($line =~ /^s /){ | |
4130 # print "fields1 = $fields[1] , start = $start\n"; | |
4131 | |
4132 foreach my $sp (@species){ | |
4133 if ($fields[1] =~ /$sp/){ | |
4134 $species_counter = $species_counter."_".$sp; | |
4135 push(@sequences, $fields[6]); | |
4136 my @sp_info = split(/\./,$fields[1]); | |
4137 my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]); | |
4138 push(@titles, $title); | |
4139 | |
4140 } | |
4141 } | |
4142 } | |
4143 | |
4144 if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start = 1)){ | |
4145 | |
4146 my $arranged = reorderSpecies($species_counter, @species); | |
4147 # print "species_counter=$species_counter .. arranged = $arranged\n"; | |
4148 | |
4149 my $stopper = 1; | |
4150 my $arrno = 0; | |
4151 foreach my $set (@allowedset){ | |
4152 # print "checking $set with $arranged\n"; | |
4153 if ($arranged eq $set){ | |
4154 # print "checked $set with $arranged\n"; | |
4155 $stopper = 0; last; | |
4156 } | |
4157 $arrno++; | |
4158 } | |
4159 | |
4160 if ($stopper == 0) { | |
4161 # print " accepted\n"; | |
4162 @titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged; | |
4163 | |
4164 @sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged; | |
4165 my $filteredseq = filter_gaps(@sequences); | |
4166 | |
4167 if ($filteredseq ne "SHORT"){ | |
4168 $counter++; | |
4169 print OUT join (" ",$counter, @titles), "\n"; | |
4170 print OUT $filteredseq, "\n"; | |
4171 print OUT "\n"; | |
4172 $countermatch++; | |
4173 } | |
4174 # my @filtered_seq = split(/\t/,filter_gaps(@sequences) ); | |
4175 } | |
4176 else{#print "\n"; | |
4177 } | |
4178 | |
4179 @sequences = (); @titles = (); $start = 0;$species_counter = "0"; | |
4180 next; | |
4181 | |
4182 } | |
4183 } | |
4184 close IN; | |
4185 close OUT; | |
4186 # print "countermatch = $countermatch\n"; | |
4187 # <STDIN>; | |
4188 } | |
4189 | |
4190 sub reorderSpecies{ | |
4191 my @inarr=@_; | |
4192 my $currSpecies = shift (@inarr); | |
4193 my $ordered_species = 0; | |
4194 my @species=@inarr; | |
4195 foreach my $order (@species){ | |
4196 $ordered_species = $ordered_species."_".$order if $currSpecies=~ /$order/; | |
4197 } | |
4198 return $ordered_species; | |
4199 | |
4200 } | |
4201 | |
4202 sub filter_gaps{ | |
4203 my @sequences = @_; | |
4204 # print "sequences sent are @sequences\n"; | |
4205 my $seq_length = length($sequences[0]); | |
4206 my $seq_no = scalar(@sequences); | |
4207 my $allgaps = (); | |
4208 for (1 ... $seq_no){ | |
4209 $allgaps = $allgaps."-"; | |
4210 } | |
4211 | |
4212 my @seq_array = (); | |
4213 my $seq_counter = 0; | |
4214 foreach my $seq (@sequences){ | |
4215 # my @sequence = split(/\s*/,$seq); | |
4216 $seq_array[$seq_counter] = [split(/\s*/,$seq)]; | |
4217 # push @seq_array, [@sequence]; | |
4218 $seq_counter++; | |
4219 } | |
4220 my $g = 0; | |
4221 while ( $g < $seq_length){ | |
4222 last if (!exists $seq_array[0][$g]); | |
4223 my $bases = (); | |
4224 for my $u (0 ... $#seq_array){ | |
4225 $bases = $bases.$seq_array[$u][$g]; | |
4226 } | |
4227 # print $bases, "\n"; | |
4228 if ($bases eq $allgaps){ | |
4229 # print "bases are $bases, position is $g \n"; | |
4230 for my $seq (@seq_array){ | |
4231 splice(@$seq , $g, 1); | |
4232 } | |
4233 } | |
4234 else { | |
4235 $g++; | |
4236 } | |
4237 } | |
4238 | |
4239 my @outs = (); | |
4240 | |
4241 foreach my $seq (@seq_array){ | |
4242 push(@outs, join("",@$seq)); | |
4243 } | |
4244 return "SHORT" if length($outs[0]) <=100; | |
4245 return (join("\n", @outs)); | |
4246 } | |
4247 | |
4248 | |
4249 sub allowedSetOfSpecies{ | |
4250 my @allowed_species = split(/_/,$_[0]); | |
4251 unshift @allowed_species, 0; | |
4252 # print "allowed set = @allowed_species \n"; | |
4253 my @output = (); | |
4254 for (0 ... scalar(@allowed_species) - 4){ | |
4255 push(@output, join("_",@allowed_species)); | |
4256 pop @allowed_species; | |
4257 } | |
4258 return join(";",reverse(@output)); | |
4259 | |
4260 } | |
4261 | |
4262 | |
4263 sub orderInfo{ | |
4264 my @info = split(/;/,$_[0]); | |
4265 # print "info = @info"; | |
4266 my @old = split(/_/,$_[1]); | |
4267 my @new = split(/_/,$_[2]); | |
4268 shift @old; shift @new; | |
4269 my @outinfo = (); | |
4270 foreach my $spe (@new){ | |
4271 for my $no (0 ... $#old){ | |
4272 if ($spe eq $old[$no]){ | |
4273 push(@outinfo, $info[$no]); | |
4274 } | |
4275 } | |
4276 } | |
4277 # print "outinfo = @outinfo \n"; | |
4278 return join(";", @outinfo); | |
4279 } | |
4280 | |
4281 #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx | |
4282 | |
4283 sub printarr { | |
4284 # print ">::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n"; | |
4285 # foreach my $line (@_) {print "$line\n";} | |
4286 # print "::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::<\n"; | |
4287 } | |
4288 | |
4289 sub oneOf{ | |
4290 my @arr = @_; | |
4291 my $element = $arr[$#arr]; | |
4292 @arr = @arr[0 ... $#arr-1]; | |
4293 my $present = 0; | |
4294 | |
4295 foreach my $el (@arr){ | |
4296 $present = 1 if $el eq $element; | |
4297 } | |
4298 return $present; | |
4299 } | |
4300 | |
4301 #xxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx | |
4302 | |
4303 sub MakeTrees{ | |
4304 my $tree = $_[0]; | |
4305 # my @parts=($tree); | |
4306 my @parts=(); | |
4307 | |
4308 # print "parts=@parts\n"; | |
4309 | |
4310 while (1){ | |
4311 $tree =~ s/^\(//g; | |
4312 $tree =~ s/\)$//g; | |
4313 my @arr = (); | |
4314 | |
4315 if ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)\)$/){ | |
4316 @arr = $tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)$/; | |
4317 push @parts, "(".$tree.")"; | |
4318 $tree = $2; | |
4319 } | |
4320 elsif ($tree =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/){ | |
4321 @arr = $tree =~ /^([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/; | |
4322 push @parts, "(".$tree.")"; | |
4323 $tree = $1; | |
4324 } | |
4325 elsif ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/){ | |
4326 last; | |
4327 } | |
4328 } | |
4329 #print "parts=@parts\n"; | |
4330 return @parts; | |
4331 } | |
4332 | |
4333 |