0
|
1 #!/usr/bin/perl -w
|
|
2 use strict;
|
|
3 use warnings;
|
|
4 use Term::ANSIColor;
|
|
5 use Pod::Checker;
|
|
6 use File::Basename;
|
|
7 use IO::Handle;
|
|
8 use Cwd;
|
|
9 use File::Path qw(make_path remove_tree);
|
|
10 use File::Temp qw/ tempfile tempdir /;
|
|
11 my $tdir = tempdir( CLEANUP => 0 );
|
|
12 chdir $tdir;
|
|
13 my $dir = getcwd;
|
|
14 #print "current dit=$dir\n";
|
|
15
|
|
16 use vars qw (%treesToReject %template $printer $interr_poscord $interrcord $no_of_interruptionscord $stringfile @tags
|
|
17 $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species
|
|
18 $gapcord %thresholdhash $tree_decipherer @sp_ident %revHash %sameHash %treesToIgnore %alternate @exactspecies @exacttags);
|
|
19 use FileHandle;
|
|
20 use IO::Handle; # 5.004 or higher
|
|
21
|
|
22 #my @ar = ("/Users/ydk/work/rhesus_microsat/results/galay/chr22_5sp.maf.txt", "/Users/ydk/work/rhesus_microsat/results/galay/dataset_11.dat",
|
|
23 #"/Users/ydk/work/rhesus_microsat/results/galay/chr22_5spec.maf.summ","hg18,panTro2,ponAbe2,rheMac2,calJac1","((((hg18, panTro2), ponAbe2), rheMac2), calJac1)","9,10,12,12",
|
|
24 #"10","0.8");
|
|
25 my @ar = @ARGV;
|
|
26 my ($maf, $orth, $summout, $species_set, $tree_definition, $thresholds, $FLANK_SUPPORT, $SIMILARITY_THRESH) = @ar;
|
|
27 $SIMILARITY_THRESH=$SIMILARITY_THRESH/100;
|
|
28 #########################
|
|
29 $SIMILARITY_THRESH = $SIMILARITY_THRESH/100;
|
|
30 my $EDGE_DISTANCE = 10;
|
|
31 my $COMPLEXITY_SUPPORT = 20;
|
|
32 load_thresholds("9_10_12_12");
|
|
33 #########################
|
|
34
|
|
35 my $complexity=int($COMPLEXITY_SUPPORT * (1/40));
|
|
36
|
|
37 #print "complexity=$complexity\n";
|
|
38 #<STDIN>;
|
|
39
|
|
40 #$printer = 1;
|
|
41
|
|
42 my $rando = int(rand(1000));
|
|
43 my $localdate = `date`;
|
|
44 $localdate =~ /([0-9]+):([0-9]+):([0-9]+)/;
|
|
45 my $info = $rando.$1.$2.$3;
|
|
46
|
|
47 #---------------------------------------------------------------------------
|
|
48 # GETTING INPUT INFORMATION AND OPENING INPUT AND OUTPUT FILES
|
|
49
|
|
50
|
|
51 my @thresharr = (0, split(/,/,$thresholds));
|
|
52 my $randno=int(rand(100000));
|
|
53 my $megamatch = $randno.".megamatch.net.axt"; #"/gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr1.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt";
|
|
54 my $megamatchlck = $megamatch.".lck";
|
|
55 unlink $megamatchlck;
|
|
56
|
|
57 #my $selected= $orth;
|
|
58 #my $eventfile = $orth;
|
|
59 #$selected = $selected."_SELECTED";
|
|
60 #$selected = $selected."_".$SIMILARITY_THRESH;
|
|
61 #my $runtime = $selected.".runtime";
|
|
62
|
|
63 my $inputtags = "H:C:O:R:M";
|
|
64 $inputtags = $ARGV[3] if exists $ARGV[3] && $ARGV[3] =~ /[A-Z]:[A-Z]/;
|
|
65
|
|
66 my @all_tags = split(/:/, $inputtags);
|
|
67 my $inputsp = "hg18:panTro2:ponAbe2:rheMac2:calJac1";
|
|
68 $inputsp = $ARGV[4] if exists $ARGV[4] && $ARGV[3] =~ /[0-9]+:/;
|
|
69 @sp_ident = split(/:/,$inputsp);
|
|
70 my $junkfile = $orth."_junk";
|
|
71
|
|
72 my $sh = load_sameHash(1);
|
|
73 my $rh = load_revHash(1);
|
|
74
|
|
75 #print "inputs are : \n"; foreach(@ARGV){print $_,"\n";}
|
|
76 #open (SELECT, ">$selected") or die "Cannot open selected file: $selected: $!";
|
|
77 open (SUMMARY, ">$summout") or die "Cannot open summout file: $summout: $!";
|
|
78 #open (RUN, ">$runtime") or die "Cannot open orth file: $runtime: $!";
|
|
79 #my $ctlfile = "baseml\.ctl"; #$ARGV[4];
|
|
80 #my $treefile = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/"; #1 THIS IS THE THE TREE UNDER CONSIDERATION, IN NEWICK
|
|
81 my %registeredTrees = ();
|
|
82 my @removalReasons =
|
|
83 ("microsatellite is compound",
|
|
84 "complex structure",
|
|
85 "if no. if micros is more than no. of species",
|
|
86 "if more than one micro per species ",
|
|
87 "if microsat contains N",
|
|
88 "different motif than required ",
|
|
89 "more than zero interruptions",
|
|
90 "microsat could not form key ",
|
|
91 "orthologous microsats of different motif size ",
|
|
92 "orthologous microsats of different motifs ",
|
|
93 "microsats belong to different alignment blocks altogether",
|
|
94 "microsat near edge",
|
|
95 "microsat in low complexity region",
|
|
96 "microsat flanks dont align well",
|
|
97 "phylogeny not informative");
|
|
98 my %allowedhash=();
|
|
99 #---------------------------------------------------------------------------
|
|
100 # WORKING ON MAKING THE MEGAMATCH FILE
|
|
101 my $chromt=int(rand(10000));
|
|
102 my $p_chr=$chromt;
|
|
103
|
|
104 $tree_definition=~s/,/, /g;
|
|
105 $tree_definition =~ s/, +/, /g;
|
|
106 my @exactspeciesset_unarranged = split(/,/,$species_set);
|
|
107 my $largesttree = "$tree_definition;";
|
|
108 $tree_definition=~s/[\)\(, ]/\t/g;
|
|
109
|
|
110 my @treespecies=split(/\t+/,$tree_definition);
|
|
111
|
|
112 foreach my $spec (@treespecies){
|
|
113 foreach my $espec (@exactspeciesset_unarranged){
|
|
114 push @exactspecies, $spec if $spec eq $espec;
|
|
115 }
|
|
116 }
|
|
117 #print "exactspecies=@exactspecies\n";
|
|
118 my $focalspec = $exactspecies[0];
|
|
119 my $arranged_species_set=join(".",@exactspecies);
|
|
120 @exacttags=@exactspecies;
|
|
121 foreach my $extag (@exacttags){
|
|
122 $extag =~ s/hg18/H/g;
|
|
123 $extag =~ s/panTro2/C/g;
|
|
124 $extag =~ s/ponAbe2/O/g;
|
|
125 $extag =~ s/rheMac2/R/g;
|
|
126 $extag =~ s/calJac1/M/g;
|
|
127 }
|
|
128 my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt");
|
|
129 #print "sending to maftoAxt_multispecies: $maf, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n";
|
|
130 maftoAxt_multispecies($maf, $tree_definition, $chr_name, $species_set);
|
|
131 my @filterseqfiles= ($chr_name);
|
|
132 $largesttree =~ s/hg18/H/g;
|
|
133 $largesttree =~ s/panTro2/C/g;
|
|
134 $largesttree =~ s/ponAbe2/O/g;
|
|
135 $largesttree =~ s/rheMac2/R/g;
|
|
136 $largesttree =~ s/calJac1/M/g;
|
|
137 #---------------------------------------------------------------------------
|
|
138
|
|
139 my ($lagestnodes, $largestbranches) = get_nodes($largesttree);
|
|
140 shift (@$lagestnodes);
|
|
141 my @extendedtitle=();
|
|
142
|
|
143 my $title = ();
|
|
144 my $parttitle = ();
|
|
145 my @titlearr = ();
|
|
146 my @firsttitle=($focalspec."chrom", $focalspec."start", $focalspec."end", $focalspec."motif", $focalspec."motifsize", $focalspec."threshold");
|
|
147
|
|
148 my @finames= qw(chr start end motif motifsize microsat mutation mutation.position mutation.from mutation.to insertion.details deletion.details);
|
|
149
|
|
150 my @fititle=();
|
|
151
|
|
152 foreach my $spec (split(",",$species_set)){
|
|
153 push @fititle, $spec;
|
|
154 foreach my $name (@finames){
|
|
155 push @fititle, $spec.".".$name;
|
|
156 }
|
|
157 }
|
|
158
|
|
159
|
|
160 my @othertitle=qw(somechr somestart somened event source);
|
|
161
|
|
162 my @fnames = ();
|
|
163 push @fnames, qw(insertions_num deletions_num motinsertions_num motinsertionsf_num motdeletions_num motdeletionsf_num noninsertions_num nondeletions_num) ;
|
|
164 push @fnames, qw(binsertions_num bdeletions_num bmotinsertions_num bmotinsertionsf_num bmotdeletions_num bmotdeletionsf_num bnoninsertions_num bnondeletions_num) ;
|
|
165 push @fnames, qw(dinsertions_num ddeletions_num dmotinsertions_num dmotinsertionsf_num dmotdeletions_num dmotdeletionsf_num dnoninsertions_num dnondeletions_num) ;
|
|
166 push @fnames, qw(ninsertions_num ndeletions_num nmotinsertions_num nmotinsertionsf_num nmotdeletions_num nmotdeletionsf_num nnoninsertions_num nnondeletions_num) ;
|
|
167 push @fnames, qw(substitutions_num bsubstitutions_num dsubstitutions_num nsubstitutions_num indels_num subs_num);
|
|
168
|
|
169 my @fullnames = ();
|
|
170
|
|
171 foreach my $lnode (@$lagestnodes){
|
|
172 my @pair = @$lnode;
|
|
173 my @nodemutarr = ();
|
|
174 for my $p (@pair){
|
|
175 # print "p = $p\n";
|
|
176 $p =~ s/[\(\), ]+//g;
|
|
177 $p =~ s/H/hg18/g;
|
|
178 $p =~ s/C/panTro2/g;
|
|
179 $p =~ s/O/ponAbe2/g;
|
|
180 $p =~ s/R/rheMac2/g;
|
|
181 $p =~ s/M/calJac1/g;
|
|
182 foreach my $n (@fnames) { push @fullnames, $p.".".$n;}
|
|
183 }
|
|
184 }
|
|
185 print SUMMARY "#",join("\t", @firsttitle, @fititle, @othertitle);
|
|
186
|
|
187 print SUMMARY "\t",join("\t", @fullnames);
|
|
188 #$title = $title."\t".join("\t", @fullnames);
|
|
189
|
|
190 print SUMMARY "\t",join("\t", @fnames);
|
|
191 #$title= $title."\t".join("\t", @fnames);
|
|
192
|
|
193 print SUMMARY "\t","tree","\t", "cleancase", "\n";
|
|
194 #$title= $title."\t"."tree"."\t"."cleancase". "\n";
|
|
195
|
|
196 #print $title; #<STDIN>;
|
|
197
|
|
198 #print "all_tags = @all_tags\n";
|
|
199
|
|
200 for my $no (3 ... $#all_tags+1){
|
|
201 # print "no=$no\n"; #<STDIN>;
|
|
202 @tags = @all_tags[0 ... $no-1];
|
|
203 #print "tags = = @tags\n" if $printer == 1;
|
|
204 %template=();
|
|
205 my @nextcounter = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
|
206 #next if scalar(@tags) < 4;
|
|
207
|
|
208 #print "now doing tags = @tags, no = $no\n";
|
|
209 open (ORTH, "<$orth") or die "Cannot open orth file: $orth: $!";
|
|
210
|
|
211 # print SUMMARY join "\t", qw (species chr start end branch motif microsat mutation position from to insertion deletion);
|
|
212
|
|
213
|
|
214 ##################### T E M P O R A R Y #####################
|
|
215 my @finaltitle=();
|
|
216 my @singletitle = qw (species chr start end motif motifsize microsat strand microsatsize col10 col11 col12 col13);
|
|
217 my $endtitle = ();
|
|
218 foreach my $tag (@tags){
|
|
219 my @tempsingle = ();
|
|
220
|
|
221 foreach my $single (@singletitle){
|
|
222 push @tempsingle, $tag.$single;
|
|
223 }
|
|
224 @finaltitle = (@finaltitle, @tempsingle);
|
|
225 }
|
|
226
|
|
227 # print SUMMARY join("\t",@finaltitle),"\n";
|
|
228
|
|
229 #############################################################
|
|
230
|
|
231 #---------------------------------------------------------------------------
|
|
232 # GET THE TREE FROM TREE FILE
|
|
233 my $tree = ();
|
|
234 $tree = "((H, C), O)" if $no == 3;
|
|
235 $tree = "(((H, C), O), R)" if $no == 4;
|
|
236 $tree = "((((H, C), O), R), M)" if $no == 5;
|
|
237 # $tree=~s/;$//g;
|
|
238 # print "our tree = $tree\n";
|
|
239 #---------------------------------------------------------------------------
|
|
240 # LOADING HASH CONTAINING ALL POSSIBLE TREES:
|
|
241 $tree_decipherer = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/tree_analysis_".join("",@tags).".txt";
|
|
242 load_allPossibleTrees($tree_decipherer, \%template, \%alternate);
|
|
243
|
|
244 #---------------------------------------------------------------------------
|
|
245 # LOADING THE TREES TO REJECT FOR BIRTH ANALYSIS
|
|
246 %treesToReject=();
|
|
247 %treesToIgnore=();
|
|
248 load_treesToReject(@tags);
|
|
249 load_treesToIgnore(@tags);
|
|
250 #---------------------------------------------------------------------------
|
|
251 # LOADING INPUT DATA INTO HASHES AND ARRAYS
|
|
252
|
|
253
|
|
254 #1 THIS IS THE POINT WHERE WE CAN FILTER OUT LARGE MICROSAT CLUSTERS
|
|
255 #2 AS WELL AS MULTIPLE-ALIGNMENT-BLOCKS-SPANNING MICROSATS (KIND OF
|
|
256 #3 IMPLICIT IN THE FIRST PART OF THE SENTENCE ITSELF IN MOST CASES).
|
|
257
|
|
258 my %orths=();
|
|
259 my $counterm = 0;
|
|
260 my $loaded = 0;
|
|
261 my %seen = ();
|
|
262 my @allowedchrs = ();
|
|
263 # print "no = $no\n"; #<STDIN>;
|
|
264
|
|
265 while (my $line = <ORTH>){
|
|
266 #print "line=$line\n";
|
|
267 $line =~ s/>hg18/>H/g;
|
|
268 $line =~ s/>panTro2/>C/g;
|
|
269 $line =~ s/>ponAbe2/>O/g;
|
|
270 $line =~ s/>rheMac2/>R/g;
|
|
271 $line =~ s/>calJac1/>M/g;
|
|
272 my @micros = split(/>/,$line); # LOADING ALL THE MICROSAT ENTRIES FROM THE CLUSTER INTO @micros
|
|
273 #print "micros=",printarr(@micros),"\n"; #<STDIN>;
|
|
274 shift @micros; # EMPTYING THE FIRST, EMTPY ELEMENT OF THE ARRAY
|
|
275 $no_of_species = adjustCoordinates($micros[0]);
|
|
276 next if $no_of_species != $no;
|
|
277 $counterm++;
|
|
278 #------------------------------------------------
|
|
279 $nextcounter[0]++ if $line =~ /compound/;
|
|
280 next if $line =~ /compound/; # GETTING RID OF COMPOUND MICROSATS
|
|
281 #------------------------------------------------
|
|
282 #next if $line =~ /[A-Za-z]>[a-zA-Z]/;
|
|
283 #------------------------------------------------
|
|
284 chomp $line;
|
|
285 my $match_count = ($line =~ s/>/>/g); # COUNTING THE NUMBER OF MICROSAT ENTRIES IN THE CLUSTER
|
|
286 #print "number of species = $match_count\n";
|
|
287 my $stopper = 0;
|
|
288 foreach my $mic (@micros){
|
|
289 my @local = split(/\t/,$mic);
|
|
290 if ($local[$typecord] =~ /\./ || exists($local[$no_of_interruptionscord+2])) {$stopper = 1; $nextcounter[1]++;
|
|
291 last; }
|
|
292 # REMOVING CLUSTERS WITH THE CYRPTIC, (UNRESOLVABLY COMPLEX) MICROSAT ENTRIES IN THEM
|
|
293 }
|
|
294 next if $stopper ==1;
|
|
295 #------------------------------------------------
|
|
296 $nextcounter[2]++ if (scalar(@micros) >$no_of_species);
|
|
297
|
|
298 next if (scalar(@micros) >$no_of_species); #1 REMOVING MICROSAT CLUSTERS WITH MORE NUMBER OF MICROSAT ENTRIES THAN THE NUMBER OF SPECIES IN THE DATASET.
|
|
299 #2 THIS IS SO BECAUSE SUCH CLUSTERS IMPLY THAT IN AT LEAST ONE SPECIES, THERE IS MORE THAN ONE MICROSAT ENTRY
|
|
300 #3 IN THE CLUSTER. THUS, HERE WE ARE GETTING RID OF MICROSATS CLUSTERS THAT INCLUDE MULTUPLE, NEIGHBORING
|
|
301 #4 MICROSATS, AND STICK TO CLEAN MICROSATS THAT DO NOT HAVE ANY MICROSATS IN NEIGHBORHOOD.
|
|
302 #5 THIS 'NEIGHBORHOOD-RANGE' HAD BEEN DECIDED PREVIOUSLY IN OUR CODE multiSpecies_orthFinder4.pl
|
|
303 my $nexter = 0;
|
|
304 foreach my $tag (@tags){
|
|
305 my $tagcount = ($line =~ s/>$tag\t/>$tag\t/g);
|
|
306 if ($tagcount > 1) { $nexter =1; #print colored ['red'],"multiple entires per species : $tagcount of $tag\n" if $printer == 1;
|
|
307 next;
|
|
308 }
|
|
309 }
|
|
310
|
|
311 if ($nexter == 1){
|
|
312 $nextcounter[3]++;
|
|
313 next;
|
|
314 }
|
|
315 #------------------------------------------------
|
|
316 foreach my $mic (@micros){ #1 REMOVING MICROSATELLITES WITH ANY 'N's IN THEM
|
|
317 my @local = split(/\t/,$mic);
|
|
318 if ($local[$microsatcord] =~ /N/) {$stopper =1; $nextcounter[4]++;
|
|
319 last;}
|
|
320 }
|
|
321 next if $stopper ==1;
|
|
322 #print "till here 1\n"; #<STDIN>;
|
|
323 #------------------------------------------------
|
|
324 my @micros_copy = @micros;
|
|
325
|
|
326 my $tempmicro = shift(@micros_copy); #1 CURRENTLY OBTAINING INFORMATION FOR THE FIRST
|
|
327 #2 MICROSAT IN THE CLUSTER.
|
|
328 my @tempfields = split(/\t/,$tempmicro);
|
|
329 my $prevtype = $tempfields[$typecord];
|
|
330 my $tempmotif = $tempfields[$motifcord];
|
|
331
|
|
332 my $tempfirstmotif = ();
|
|
333 if (scalar(@tempfields) > $microsatcord + 2){
|
|
334 if ($tempfields[$no_of_interruptionscord] >= 1) { #1 DISCARDING MICROSATS WITH MORE THAN ZERO INTERRUPTIONS
|
|
335 #2 IN THE FIRST MICROSAT OF THE CLUSTER
|
|
336 $nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1;
|
|
337 }
|
|
338 }
|
|
339 if ($nexter == 1){
|
|
340 $nextcounter[6]++;
|
|
341 next;
|
|
342 } #1 DONE OBTAINING INFORMATION REGARDING
|
|
343 #2 THE FIRST MICROSAT FROM THE CLUSTER
|
|
344
|
|
345 if ($tempmotif =~ /^\[/){
|
|
346 $tempmotif =~ s/^\[//g;
|
|
347 $tempmotif =~ /([a-zA-Z]+)\].*/;
|
|
348 $tempfirstmotif = $1; #1 OBTAINING THE FIRTS MOTIF OF MICROSAT
|
|
349 }
|
|
350 else {$tempfirstmotif = $tempmotif;}
|
|
351 my $prevmotif = $tempfirstmotif;
|
|
352
|
|
353 my $key = ();
|
|
354 if ($tempmicro =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
|
|
355 $key = join("\t",$1, $2, $4, $5);
|
|
356 }
|
|
357 else{
|
|
358 # print "counld not form a key \n" if $printer == 1;
|
|
359 $nextcounter[7]++;
|
|
360 next;
|
|
361 }
|
|
362 #----------------- #1 NOW, AFTER OBTAINING INFORMATION ABOUT
|
|
363 #2 THE FIRST MICROSAT IN THE CLUSTER, THE
|
|
364 #3 FOLLOWING LOOP GOES THROUGH THE OTHER MICROSATS
|
|
365 #4 TO SEE IF THEY SHARE THE REQUIRED FEATURES (BELOW)
|
|
366
|
|
367 foreach my $micro (@micros_copy){
|
|
368 my @fields = split(/\t/,$micro);
|
|
369 #-----------------
|
|
370 if (scalar(@fields) > $microsatcord + 2){ #1 DISCARDING MICROSATS WITH MORE THAN ONE INTERRUPTIONS
|
|
371 if ($fields[$no_of_interruptionscord] >= 1) {$nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1;
|
|
372 $nextcounter[6]++;
|
|
373 last; }
|
|
374 }
|
|
375 #-----------------
|
|
376 if (($prevtype ne "0") && ($prevtype ne $fields[$typecord])) {
|
|
377 $nexter =1; #print colored ['yellow'],"microsat of different type \n" if $printer == 1;
|
|
378 $nextcounter[8]++;
|
|
379 last; } #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG
|
|
380 #----------------- #2 TO DIFFERENT TYPES (MONOS, DIS, TRIS ETC.)
|
|
381 $prevtype = $fields[$typecord];
|
|
382
|
|
383 my $motif = $fields[$motifcord];
|
|
384 my $firstmotif = ();
|
|
385
|
|
386 if ($motif =~ /^\[/){
|
|
387 $motif =~ s/^\[//g;
|
|
388 $motif =~ /([a-zA-Z]+)\].*/;
|
|
389 $firstmotif = $1;
|
|
390 }
|
|
391 else {$firstmotif = $motif;}
|
|
392
|
|
393 my $motifpattern = $firstmotif.$firstmotif;
|
|
394 my $prevmotifpattern = $prevmotif.$prevmotif;
|
|
395
|
|
396 if (($prevmotif ne "0")&&(($motifpattern !~ /$prevmotif/i)||($prevmotifpattern !~ /$firstmotif/i)) ) {
|
|
397 $nexter =1; #print colored ['green'],"different motifs used \n$line\n" if $printer == 1;
|
|
398 $nextcounter[9]++;
|
|
399 last;
|
|
400 } #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG
|
|
401 #2 TO DIFFERENT MOTIFS
|
|
402 my $prevmotif = $firstmotif;
|
|
403 #-----------------
|
|
404
|
|
405 for my $t (0 ... $#tags){ #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSAT ENTRIES BELONG
|
|
406 #2 DIFFERENT ALIGNMENT BLOCKS
|
|
407 if ($micro =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
|
|
408 my $key2 = join("\t",$1, $2, $4, $5);
|
|
409 if ($key2 ne $key){
|
|
410 # print "microsats belong to diffferent alignment blocks altogether\n" if $printer == 1;
|
|
411 $nextcounter[10]++;
|
|
412 $nexter = 1; last;
|
|
413 }
|
|
414 }
|
|
415 else{
|
|
416 # print "counld not form a key \n" if $printer == 1;
|
|
417 $nexter = 1; last;
|
|
418 }
|
|
419 }
|
|
420
|
|
421 }
|
|
422 #####################
|
|
423 if ($nexter == 1){
|
|
424 # print "nexting\n" if $printer == 1;
|
|
425 next;
|
|
426 }
|
|
427 else{
|
|
428 # print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n$key:\n$line\nvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n" if $printer == 1;
|
|
429 push (@{$orths{$key}},$line);
|
|
430 $loaded++;
|
|
431 if ($line =~ /($focalspec)\s([a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) {
|
|
432 # print "$line\n" if $printer == 1; #if $line =~ /Contig/;
|
|
433 # print "################ ################\n" if $printer == 1;
|
|
434 push @allowedchrs, $2 if !exists $allowedhash{$2};
|
|
435 $allowedhash{$2} = 1;
|
|
436 my $key = join("\t",$1, $2, $3, $4);
|
|
437 #print "print the shit: $key\n" if $printer == 1;
|
|
438 $seen{$key} = 1;
|
|
439 }
|
|
440 else { #print "Key could not be formed in SPUT for ($org) ($title) ([0-9]+) ([0-9]+)\n";
|
|
441 }
|
|
442 }
|
|
443 }
|
|
444 close ORTH;
|
|
445
|
|
446 # print "now studying where we lost microsatellites: @nextcounter\n";
|
|
447 for my $reason (0 ... $#nextcounter){
|
|
448 # print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n";
|
|
449 }
|
|
450 # print "\ntotal number of keys formed = ", scalar(keys %orths), " = \n";
|
|
451 # print "done filtering .. counterm = $counterm and loaded = $loaded\n";
|
|
452
|
|
453 #----------------------------------------------------------------------------------------------------------------
|
|
454 # NOW GENERATING THE ALIGNMENT FILE WITH RELELEVENT ALIGNMENTS STORED ONLY.
|
|
455
|
|
456 while (1){
|
|
457 if (-e $megamatchlck){
|
|
458 # print "waiting to write into $megamatchlck\n";
|
|
459 sleep 10;
|
|
460 }
|
|
461 else{
|
|
462 open (MEGAMLCK, ">$megamatchlck") or die "Cannot open megamatchlck file $megamatchlck: $!";
|
|
463 open (MEGAM, ">$megamatch") or die "Cannot open megamatch file $megamatch: $!";
|
|
464 last;
|
|
465 }
|
|
466 }
|
|
467
|
|
468 foreach my $seqfile (@filterseqfiles){
|
|
469 my $fullpath = $seqfile;
|
|
470
|
|
471 # print "opening file: $fullpath\n";
|
|
472 open (MATCH, "<$fullpath") or die "Cannot open MATCH file $fullpath: $!";
|
|
473 my $matchlines = 0;
|
|
474
|
|
475 while (my $line = <MATCH>) {
|
|
476 if ($line =~ /($focalspec)\s([a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) {
|
|
477 my $key = join("\t",$1, $2, $3, $4);
|
|
478 if (exists $seen{$key}){
|
|
479 while (1){
|
|
480 $matchlines++;
|
|
481 print MEGAM $line;
|
|
482 $line = <MATCH>;
|
|
483 print MEGAM "\n" if $line !~/[0-9a-zA-Z]/;
|
|
484 last if $line !~/[0-9a-zA-Z]/;
|
|
485 }
|
|
486 }
|
|
487 }
|
|
488 }
|
|
489 # print "matchlines = $matchlines\n";
|
|
490 close MATCH;
|
|
491 }
|
|
492 close MEGAMLCK;
|
|
493
|
|
494 unlink $megamatchlck;
|
|
495 close MEGAM;
|
|
496 undef %seen;
|
|
497 #----------------------------------------------------------------------------------------------------------------
|
|
498
|
|
499 #---------------------------------------------------------------------------
|
|
500 # NOW, AFTER FILTERING MANY MICROSATS, AND LOADING THE FILTERED ONES INTO
|
|
501 # THE HASH %orths , WE GO THROUGH THE ALIGNMENT FILE, AND STUDY THE
|
|
502 # FLANKING SEQUENCES OF ALL THESE MICROSATS, TO FILTER THEM FURTHER
|
|
503 #$printer = 1;
|
|
504
|
|
505 my $microreadcounter=0;
|
|
506 my $contigsentered=0;
|
|
507 my $contignotrightcounter=0;
|
|
508 my $keynotformedcounter=0;
|
|
509 my $keynotfoundcounter= 0;
|
|
510 my $dotcounter = 0;
|
|
511
|
|
512 open (BO, "<$megamatch") or die "Cannot open alignment file: $megamatch: $!";
|
|
513
|
|
514 while (my $line = <BO>){
|
|
515 # print "." if $dotcounter % 100 ==0;
|
|
516 # print "\n" if $dotcounter % 5000 ==0;
|
|
517 # print "dotcounter = $dotcounter\n " if $printer == 1;
|
|
518 next if $line !~ /^[0-9]+/;
|
|
519 $dotcounter++;
|
|
520 # print colored ['green'], "~" x 60, "\n" if $printer == 1;
|
|
521 # print colored ['green'], $line;# if $printer == 1;
|
|
522 chomp $line;
|
|
523 my @fields2 = split(/\t/,$line);
|
|
524 my $key2 = ();
|
|
525 my $alignment_no = (); #1 TEMPORARY
|
|
526 if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
|
|
527 $key2 = join("\t",$1, $2, $4, $5);
|
|
528 $alignment_no=$1;
|
|
529 }
|
|
530 else {print "seq line $line incompatible\n"; $keynotformedcounter++; next;}
|
|
531
|
|
532 $no_of_species = adjustCoordinates($line);
|
|
533 $contignotrightcounter++ if $no_of_species != $no;
|
|
534 # print "contignotrightcounter=$contignotrightcounter\n";
|
|
535 # print "no_of_species=$no_of_species\n";
|
|
536 # print "no=$no\n";
|
|
537
|
|
538 next if $no_of_species != $no;
|
|
539
|
|
540 # print "key = $key2\n" if $printer == 1;
|
|
541 my @clusters = (); #1 EXTRACTING MICROSATS CORRESPONDING TO THIS
|
|
542 #2 ALIGNMENT BLOCK
|
|
543 if (exists($orths{$key2})){
|
|
544 @clusters = @{$orths{$key2}};
|
|
545 $contigsentered++;
|
|
546 delete $orths{$key2};
|
|
547 }
|
|
548 else{
|
|
549 #print "orth does not exist\n";
|
|
550 $keynotfoundcounter++;
|
|
551 next;
|
|
552 }
|
|
553
|
|
554 my %sequences=(); #1 WILL STORE SEQUENCES IN THE CURRENT ALIGNMENT BLOCK
|
|
555 my $humseq = ();
|
|
556 foreach my $tag (@tags){ #1 READING THE ALIGNMENT FILE AND CAPTURING SEQUENCES
|
|
557 my $seq = <BO>; #2 OF ALL SPECIES.
|
|
558 chomp $seq;
|
|
559 $sequences{$tag} = " ".$seq;
|
|
560 #print "sequences = $sequences{$tag}\n" if $printer == 1;
|
|
561 $humseq = $seq if $tag =~ /H/;
|
|
562 }
|
|
563
|
|
564
|
|
565 foreach my $cluster (@clusters){ #1 NOW, GOING THROUGH THE CLUSTER OF MICROSATS
|
|
566 #print "x" x 60, "\n" if $printer == 1;
|
|
567 #print colored ['red'],"cluster = $cluster\n";
|
|
568 $largesttree =~ s/hg18/H/g;
|
|
569 $largesttree =~ s/panTro2/C/g;
|
|
570 $largesttree =~ s/ponAbe2/O/g;
|
|
571 $largesttree =~ s/rheMac2/R/g;
|
|
572 $largesttree =~ s/calJac1/M/g;
|
|
573
|
|
574 $microreadcounter++;
|
|
575 my @micros = split(/>/,$cluster);
|
|
576 shift @micros;
|
|
577
|
|
578 my $edge_microsat=0; #1 THIS WILL HAVE VALUE "1" IF MICROSAT IS FOUND
|
|
579 #2 TO BE TOO CLOSE TO THE EDGES OF ALIGNMENT BLOCK
|
|
580
|
|
581 my @starts= (); my %start_hash=(); #1 STORES THE START AND END COORDINATES OF MICROSATELLITES
|
|
582 my @ends = (); my %end_hash=(); #2 SO THAT LATER, WE WILL BE ABLE TO FIND THE EXTREME
|
|
583 #3 COORDINATE VALUES OF THE ORTHOLOGOUS MIROSATELLITES.
|
|
584
|
|
585 my %microhash=();
|
|
586 my %microsathash=();
|
|
587 my %nonmicrosathash=();
|
|
588 my $motif=(); #1 BASIC MOTIF OF THE MICROSATELLITE.. THERE'S ONLY 1
|
|
589 #print "tags=@tags\n";
|
|
590 for my $i (0 ... $#tags){ #1 FINDING THE MICROSAT, AND THE ALIGNMENT SEQUENCE
|
|
591 #2 CORRESPONDING TO THE PARTICULAR SPECIES (AS PER
|
|
592 #3 THE VARIABLE $TAG;
|
|
593 my $tag = $tags[$i];
|
|
594 # print $seq;
|
|
595 my $locus="NULL"; #1 THIS WILL STORE THE MICROSAT OF THIS SPECIES.
|
|
596 #2 IF THERE IS NO MICROSAT, IT WILL REMAIN "NULL"
|
|
597
|
|
598 foreach my $micro (@micros){
|
|
599 # print "micro=$micro, tag=$tag\n";
|
|
600 if ($micro =~ /^$tag/){ #1 MICROSAT OF THIS SPECIES FOUND..
|
|
601 $locus = $micro;
|
|
602 my @fields = split(/\t/,$micro);
|
|
603 $motif = $fields[$motifcord];
|
|
604 $microsathash{$tag}=$fields[$microsatcord];
|
|
605 # print "fields=@fields, and startcord=$startcord = $fields[$startcord]\n";
|
|
606 push(@starts, $fields[$startcord]);
|
|
607 push(@ends, $fields[$endcord]);
|
|
608 $start_hash{$tag}=$fields[$startcord];
|
|
609 $end_hash{$tag}=$fields[$endcord];
|
|
610 last;
|
|
611 }
|
|
612 else{$microsathash{$tag}="NULL"}
|
|
613 }
|
|
614 $microhash{$tag}=$locus;
|
|
615
|
|
616 }
|
|
617
|
|
618
|
|
619
|
|
620 my $extreme_start = smallest_number(@starts); #1 THESE TWO ARE THE EXTREME COORDINATES OF THE
|
|
621 my $extreme_end = largest_number(@ends); #2 MICROSAT CLUSTER ACCROSS ALL THE SPECIES IN
|
|
622 #3 WHOM IT IS FOUND TO BE ORTHOLOGOUS.
|
|
623
|
|
624 #print "starts=@starts... ends=@ends\n";
|
|
625
|
|
626 my %up_flanks = (); #1 CONTAINS UPSTEAM FLANKING REGIONS FOR EACH SPECIES
|
|
627 my %down_flanks = (); #1 CONTAINS DOWNDTREAM FLANKING REGIONS FOR EACH SPECIES
|
|
628
|
|
629 my %up_largeflanks = ();
|
|
630 my %down_largeflanks = ();
|
|
631
|
|
632 my %locusandflanks = ();
|
|
633 my %locusandlargeflanks = ();
|
|
634
|
|
635 my %up_internal_flanks=(); #1 CONTAINS SEQUENCE BETWEEN THE $extreme_start and the
|
|
636 #2 ACTUAL START OF MICROSATELLITE IN THE SPECIES
|
|
637 my %down_internal_flanks=(); #1 CONTAINS SEQUENCE BETWEEN THE $extreme_end and the
|
|
638 #2 ACTUAL end OF MICROSATELLITE IN THE SPECIES
|
|
639
|
|
640 my %alignment=(); #1 CONTAINS ACTUAL ALIGNMENT SEQUENCE BETWEEN THE TWO
|
|
641 #2 EXTEME VALUES.
|
|
642
|
|
643 my %microsatstarts=(); #1 WITHIN EACH ALIGNMENT, IF THERE EXISTS A MICROSATELLITE
|
|
644 #2 THIS HASH CONTAINS THE START SITE OF THE MICROSATELLITE
|
|
645 #3 WIHIN THE ALIGNMENT
|
|
646 next if !defined $extreme_start;
|
|
647 next if !defined $extreme_end;
|
|
648 next if $extreme_start > length($sequences{$tags[0]});
|
|
649 next if $extreme_start < 0;
|
|
650 next if $extreme_end > length($sequences{$tags[0]});
|
|
651
|
|
652 for my $i (0 ... $#tags){ #1 NOW THAT WE HAVE GATHERED INFORMATION REGARDING
|
|
653 #2 SEQUENCE ALIGNMENT AND MICROSATELLITE COORDINATES
|
|
654 #3 AS WELL AS THE EXTREME COORDINATES OF THE
|
|
655 #4 MICROSAT CLUSTER, WE WILL PROCEED TO EXTRACT THE
|
|
656 #5 FLANKING SEQUENCE OF ALL ORGS, AND STUDY IT IN
|
|
657 #6 MORE DETAIL.
|
|
658 my $tag = $tags[$i];
|
|
659 # print "tag=$tag.. seqlength = ",length($sequences{$tag})," extreme_start=$extreme_start and extreme_end=$extreme_end\n";
|
|
660 my $upstream_gaps = (substr($sequences{$tag}, 0, $extreme_start) =~ s/\-/-/g); #1 NOW MEASURING THE NUMBER OF GAPS IN THE UPSTEAM
|
|
661 #2 AND DOWNSTREAM SEQUENCES OF THE MICROSATs IN THIS
|
|
662 #3 CLUSTER.
|
|
663
|
|
664 my $downstream_gaps = (substr($sequences{$tag}, $extreme_end) =~ s/\-/-/g);
|
|
665 if (($extreme_start - $upstream_gaps )< $EDGE_DISTANCE || (length($sequences{$tag}) - $extreme_end - $downstream_gaps) < $EDGE_DISTANCE){
|
|
666 $edge_microsat=1;
|
|
667
|
|
668 last;
|
|
669 }
|
|
670 else{
|
|
671 $up_flanks{$tag} = substr($sequences{$tag}, $extreme_start - $FLANK_SUPPORT, $FLANK_SUPPORT);
|
|
672 $down_flanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $FLANK_SUPPORT);
|
|
673
|
|
674 $up_largeflanks{$tag} = substr($sequences{$tag}, $extreme_start - $COMPLEXITY_SUPPORT, $COMPLEXITY_SUPPORT);
|
|
675 $down_largeflanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $COMPLEXITY_SUPPORT);
|
|
676
|
|
677
|
|
678 $alignment{$tag} = substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1);
|
|
679 $locusandflanks{$tag} = $up_flanks{$tag}."[".$alignment{$tag}."]".$down_flanks{$tag};
|
|
680 $locusandlargeflanks{$tag} = $up_largeflanks{$tag}."[".$alignment{$tag}."]".$down_largeflanks{$tag};
|
|
681
|
|
682 if ($microhash{$tag} ne "NULL"){
|
|
683 $up_internal_flanks{$tag} = substr($sequences{$tag}, $extreme_start , $start_hash{$tag}-$extreme_start);
|
|
684 $down_internal_flanks{$tag} = substr($sequences{$tag}, $end_hash{$tag} , $extreme_end-$end_hash{$tag});
|
|
685 $microsatstarts{$tag}=$start_hash{$tag}-$extreme_start;
|
|
686 # print "tag = $tag, internal flanks = $up_internal_flanks{$tag} and $down_internal_flanks{$tag} and start = $microsatstarts{$tag}\n" if $printer == 1;
|
|
687 }
|
|
688 else{
|
|
689 $nonmicrosathash{$tag}=substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1);
|
|
690
|
|
691 }
|
|
692 # print "up flank for species $tag = $up_flanks{$tag} \ndown flank for species $tag = $down_flanks{$tag} \n" if $printer == 1;
|
|
693
|
|
694 }
|
|
695
|
|
696 }
|
|
697 $nextcounter[11]++ if $edge_microsat==1;
|
|
698 next if $edge_microsat==1;
|
|
699
|
|
700
|
|
701 my $low_complexity = 0; #1 VALUE WILL BE 1 IF ANY OF THE FLANKING REGIONS
|
|
702 #2 IS FOUND TO BE OF LOW COMPLEXITY, BY USING THE
|
|
703 #3 FUNCTION sub test_complexity
|
|
704
|
|
705
|
|
706 for my $i (0 ... $#tags){
|
|
707 # print "i = $tags[$i]\n" if $printer == 1;
|
|
708 if (test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW" || test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW"){
|
|
709 # print "i = $i, low complexity regions: $up_largeflanks{$tags[$i]}: ",test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT), " and $down_largeflanks{$tags[$i]} = ",test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT),"\n" if $printer == 1;
|
|
710 $low_complexity =1; last;
|
|
711 }
|
|
712 }
|
|
713
|
|
714 $nextcounter[12]++ if $low_complexity==1;
|
|
715 next if $low_complexity == 1;
|
|
716
|
|
717
|
|
718 my $sequence_dissimilarity = 0; #1 THIS VALYE WILL BE 1 IF THE SEQUENCE SIMILARITY
|
|
719 #2 BETWEEN ANY OF THE SPECIES AGAINST THE HUMAN
|
|
720 #3 FLANKING SEQUENCES IS BELOW A CERTAIN THRESHOLD
|
|
721 #4 AS DESCRIBED IN FUNCTION sub sequence_similarity
|
|
722 my %donepair = ();
|
|
723 for my $i (0 ... $#tags){
|
|
724 # print "i = $tags[$i]\n" if $printer == 1;
|
|
725 # next if $i == 0;
|
|
726 # print colored ['magenta'],"THIS IS UP\n" if $printer == 1;
|
|
727
|
|
728 for my $b (0 ... $#tags){
|
|
729 next if $b == $i;
|
|
730 my $pair = ();
|
|
731 $pair = $i."_".$b if $i < $b;
|
|
732 $pair = $b."_".$i if $b < $i;
|
|
733 next if exists $donepair{$pair};
|
|
734 my ($up_similarity,$upnucdiffs, $upindeldiffs) = sequence_similarity($up_flanks{$tags[$i]}, $up_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info);
|
|
735 my ($down_similarity,$downnucdiffs, $downindeldiffs) = sequence_similarity($down_flanks{$tags[$i]}, $down_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info);
|
|
736 $donepair{$pair} = $up_similarity."_".$down_similarity;
|
|
737
|
|
738 # print RUN "$up_similarity $upnucdiffs $upindeldiffs $down_similarity $downnucdiffs $downindeldiffs\n";
|
|
739
|
|
740 if ( $up_similarity < $SIMILARITY_THRESH || $down_similarity < $SIMILARITY_THRESH){
|
|
741 $sequence_dissimilarity =1;
|
|
742 last;
|
|
743 }
|
|
744 }
|
|
745 }
|
|
746 $nextcounter[13]++ if $sequence_dissimilarity==1;
|
|
747
|
|
748 next if $sequence_dissimilarity == 1;
|
|
749 my ($simplified_microsat, $Hchrom, $Hstart, $Hend, $locusmotif, $locusmotifsize) = summarize_microsat($cluster, $humseq);
|
|
750 # print "simplified_microsat=$simplified_microsat\n"; <STDIN>;
|
|
751 my ($tree_analysis, $alternative_trees, $conformation) = treeStudy($simplified_microsat);
|
|
752
|
|
753 if (exists $treesToReject{$tree_analysis}){
|
|
754 $nextcounter[14]++;
|
|
755 next;
|
|
756 }
|
|
757
|
|
758 # my $adjuster=();
|
|
759 # if ($no_of_species == 4){
|
|
760 # my @sields = split(/\t/,$simplified_microsat);
|
|
761 # my $somend = pop(@sields);
|
|
762 # my $somestart = pop(@sields);
|
|
763 # my $somechr = pop(@sields);
|
|
764 # $adjuster = "NA\t" x 13 ;
|
|
765 # $simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend;
|
|
766 # }
|
|
767 # if ($no_of_species == 3){
|
|
768 # my @sields = split(/\t/,$simplified_microsat);
|
|
769 # my $somend = pop(@sields);
|
|
770 # my $somestart = pop(@sields);
|
|
771 # my $somechr = pop(@sields);
|
|
772 # $adjuster = "NA\t" x 26 ;
|
|
773 # $simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend;
|
|
774 # }
|
|
775 #
|
|
776 $registeredTrees{$tree_analysis} = 1 if !exists $registeredTrees{$tree_analysis};
|
|
777 $registeredTrees{$tree_analysis}++ if exists $registeredTrees{$tree_analysis};
|
|
778
|
|
779 if (exists $treesToIgnore{$tree_analysis}){
|
|
780 my @appendarr = ();
|
|
781
|
|
782 print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t";
|
|
783 #print "SUMMARY ",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t";
|
|
784 # print SELECT $Hchrom,"\t",$Hstart,"\t",$Hend,"\t","NOEVENT", "\t\t", $cluster,"\n";
|
|
785
|
|
786 foreach my $lnode (@$lagestnodes){
|
|
787 my @pair = @$lnode;
|
|
788 my @nodemutarr = ();
|
|
789 for my $p (@pair){
|
|
790 my @mutinfoarray1 = ();
|
|
791 for (1 ... 38){
|
|
792 push (@mutinfoarray1, "NA")
|
|
793 }
|
|
794 print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t";
|
|
795 }
|
|
796
|
|
797 }
|
|
798 for (1 ... 38){
|
|
799 push (@appendarr, "NA")
|
|
800 }
|
|
801 print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n";
|
|
802 # print "SUMMARY ",join ("\t", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>;
|
|
803 next;
|
|
804 }
|
|
805
|
|
806 my ($mutations_array, $nodes, $branches_hash, $alivehash, $primaryalignment) = peel_onion($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts);
|
|
807
|
|
808 if ($mutations_array eq "NULL"){
|
|
809 my @appendarr = ();
|
|
810
|
|
811 print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize],"\t",$simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t";
|
|
812 # print "SUMMARY ", $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize],"\t",$simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t";
|
|
813 # print SELECT $Hchrom,"\t",$Hstart,"\t",$Hend,"\t","EVENT", "\t\t", $cluster,"\n";
|
|
814
|
|
815 foreach my $lnode (@$lagestnodes){
|
|
816 my @pair = @$lnode;
|
|
817 my @nodemutarr = ();
|
|
818 for my $p (@pair){
|
|
819 my @mutinfoarray1 = ();
|
|
820 for (1 ... 38){
|
|
821 push (@mutinfoarray1, "NA")
|
|
822 }
|
|
823 print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t";
|
|
824 # print join ("\t", "SUMMARY", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t";
|
|
825 }
|
|
826
|
|
827 }
|
|
828 for (1 ... 38){
|
|
829 push (@appendarr, "NA")
|
|
830 }
|
|
831 print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n";
|
|
832 # print join ("\t","SUMMARY", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>;
|
|
833 next;
|
|
834 }
|
|
835
|
|
836
|
|
837 # print "sent: \n" if $printer == 1;
|
|
838 # print "nodes = @$nodes, branches array:\n" if $mutations_array ne "NULL" && $printer == 1;
|
|
839
|
|
840 my ($newstatus, $newmutations_array, $newnodes, $newbranches_hash, $newalivehash, $finalalignment) = fillAlignmentGaps($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts);
|
|
841 # print "newmutations_array returned = \n",join("\n",@$newmutations_array),"\n" if $newmutations_array ne "NULL" && $printer == 1;
|
|
842 my @finalmutations_array= ();
|
|
843 @finalmutations_array = selectMutationArray($mutations_array, $newmutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array ne "NULL";
|
|
844 @finalmutations_array = selectMutationArray($mutations_array, $mutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array eq "NULL";
|
|
845
|
|
846 my ($besttree, $treescore) = selectBetterTree($tree_analysis, $alternate{$tree_analysis}, \@finalmutations_array);
|
|
847 my $cleancase = "UNCLEAN";
|
|
848
|
|
849 $cleancase = checkCleanCase($besttree, $finalalignment) if $treescore > 0 && $finalalignment ne "NULL" && $finalalignment =~ /\!/;
|
|
850 $cleancase = checkCleanCase($besttree, $primaryalignment) if $treescore > 0 && $finalalignment eq "NULL" && $primaryalignment =~ /\!/ && $primaryalignment ne "NULL";
|
|
851 $cleancase = "CLEAN" if $finalalignment eq "NULL" && $primaryalignment !~ /\!/ && $primaryalignment ne "NULL";
|
|
852 $cleancase = "CLEAN" if $finalalignment ne "NULL" && $finalalignment !~ /\!/ ;
|
|
853 $besttree = "NULL" if $treescore <= 0;
|
|
854 print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize],"\t",$simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t";
|
|
855 # print "SUMMARY ", $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize],"\t",$simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t";
|
|
856
|
|
857 # print SELECT $Hchrom,"\t",$Hstart,"\t",$Hend,"\t","EVENT", "\t\t", $cluster,"\n";
|
|
858
|
|
859 my @mutinfoarray =();
|
|
860
|
|
861 foreach my $lnode (@$lagestnodes){
|
|
862 my @pair = @$lnode;
|
|
863 my $joint = "(".join(", ",@pair).")";
|
|
864 my @nodemutarr = ();
|
|
865
|
|
866 for my $p (@pair){
|
|
867 foreach my $mut (@finalmutations_array){
|
|
868 $mut =~ /node=([A-Z, \(\)]+)/;
|
|
869 push @nodemutarr, $mut if $p eq $1;
|
|
870 }
|
|
871 # print "from pair @pair, p=$p\n";
|
|
872 @mutinfoarray = summarizeMutations(\@nodemutarr, $besttree);
|
|
873 print SUMMARY join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t";
|
|
874 # print "SUMMARY ",join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t";
|
|
875 }
|
|
876
|
|
877 }
|
|
878
|
|
879 @mutinfoarray = summarizeMutations(\@finalmutations_array, $besttree);
|
|
880 print SUMMARY join ("\t", @mutinfoarray ),"\t";
|
|
881 print SUMMARY $cleancase, "\n";
|
|
882 # print "SUMMARY ",join ("\t", @mutinfoarray,$cleancase ),"\n"; #<STDIN>;
|
|
883 # print "summarized\n"; <STDIN>;
|
|
884
|
|
885 my %indelcatch = ();
|
|
886 my %substcatch = ();
|
|
887 my %typecatch = ();
|
|
888 my %nodescatch = ();
|
|
889 my $mutconcat = join("\t", @finalmutations_array)."\n";
|
|
890 my %indelposcatch = ();
|
|
891 my %subsposcatch = ();
|
|
892
|
|
893 foreach my $fmut ( @finalmutations_array){
|
|
894 # next if $fmut !~ /indeltype=[a-zA-Z]+/;
|
|
895 #print RUN $fmut, "\n";
|
|
896 $fmut =~ /node=([a-zA-Z, \(\)]+)/;
|
|
897 my $lnode = $1;
|
|
898 $nodescatch{$1}=1;
|
|
899
|
|
900 if ($fmut =~ /type=substitution/){
|
|
901 # print "fmut=$fmut\n";
|
|
902 $fmut =~ /from=([a-zA-Z\-]+)\tto=([a-zA-Z\-]+)/;
|
|
903 my $from=$1;
|
|
904 # print "from=$from\n";
|
|
905 my $to=$2;
|
|
906 # print "to=$to\n";
|
|
907 push @{$substcatch{$lnode}} , ("from:".$from." to:".$to);
|
|
908 $fmut =~ /position=([0-9]+)/;
|
|
909 push @{$subsposcatch{$lnode}}, $1;
|
|
910 }
|
|
911
|
|
912 if ($fmut =~ /insertion=[a-zA-Z\-]+/){
|
|
913 $fmut =~ /insertion=([a-zA-Z\-]+)/;
|
|
914 push @{$indelcatch{$lnode}} , $1;
|
|
915 $fmut =~ /indeltype=([a-zA-Z]+)/;
|
|
916 push @{$typecatch{$lnode}}, $1;
|
|
917 $fmut =~ /position=([0-9]+)/;
|
|
918 push @{$indelposcatch{$lnode}}, $1;
|
|
919 }
|
|
920 if ($fmut =~ /deletion=[a-zA-Z\-]+/){
|
|
921 $fmut =~ /deletion=([a-zA-Z\-]+)/;
|
|
922 push @{$indelcatch{$lnode}} , $1;
|
|
923 $fmut =~ /indeltype=([a-zA-Z]+)/;
|
|
924 push @{$typecatch{$lnode}}, $1;
|
|
925 $fmut =~ /position=([0-9]+)/;
|
|
926 push @{$indelposcatch{$lnode}}, $1;
|
|
927 }
|
|
928 }
|
|
929
|
|
930 # print $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t" if $printer == 1;
|
|
931 # print join ("<\t>", @mutinfoarray),"\n" if $printer == 1;
|
|
932 # print "where mutinfoarray = @mutinfoarray\n" if $printer == 1;
|
|
933 # #print RUN ".";
|
|
934
|
|
935 # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
|
|
936 # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
|
|
937
|
|
938 # print colored ['red'],"finalmutations_array=\n" if $printer == 1;
|
|
939 foreach (@finalmutations_array) {
|
|
940 # print colored ['red'], "$_\n" if $_ =~ /type=substitution/ && $printer == 1 ;
|
|
941 # print colored ['yellow'], "$_\n" if $_ !~ /type=substitution/ && $printer == 1 ;
|
|
942
|
|
943 }# if $line =~ /cal/;# && $line =~ /chr4/;
|
|
944
|
|
945 # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
|
|
946 # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1;
|
|
947 # print "tree analysis = $tree_analysis\n" if $printer == 1;
|
|
948
|
|
949 # my $mutations = "@$mutations_array";
|
|
950
|
|
951
|
|
952 next;
|
|
953 for my $keys (@$nodes) {foreach my $key (@$keys){
|
|
954 #print "key = $key, => $branches_hash->{$key}\n";
|
|
955 }
|
|
956 # print "x" x 50, "\n";
|
|
957 }
|
|
958 my ($birth_steps, $death_steps) = decipher_history($mutations_array,join("",@tags),$nodes,$branches_hash,$tree_analysis,$conformation, $alivehash, $simplified_microsat);
|
|
959 }
|
|
960 }
|
|
961 close BO;
|
|
962 # print "now studying where we lost microsatellites:";
|
|
963 # print "x" x 60,"\n";
|
|
964 for my $reason (0 ... $#nextcounter){
|
|
965 # print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n";
|
|
966 }
|
|
967 # print "x" x 60,"\n";
|
|
968 # print "In total we read $microreadcounter microsatellites after reading through $contigsentered contigs\n";
|
|
969 # print " we lost $keynotformedcounter contigs as they did not form the key, \n";
|
|
970 # print "$contignotrightcounter contigs as they were not of the right species configuration\n";
|
|
971 # print "$keynotfoundcounter contigs as they did not contain the microsats\n";
|
|
972 # print "... In total we went through a file that had $dotcounter contigs...\n";
|
|
973 # print join ("\n","remaining orth keys = ", (keys %orths),"");
|
|
974
|
|
975
|
|
976 # print "now printing counted trees: \n";
|
|
977 if (scalar(keys %registeredTrees) > 0){
|
|
978 foreach my $keyb ( sort (keys %registeredTrees) )
|
|
979 {
|
|
980 # print "$keyb : $registeredTrees{$keyb}\n";
|
|
981 }
|
|
982 }
|
|
983
|
|
984
|
|
985 }
|
|
986
|
|
987 my @summarizarr = ("+C=+C +R.+C -HCOR,+C",
|
|
988 "+H=+H +R.+H -HCOR,+H",
|
|
989 "-C=-C -R.-C +HCOR,-C",
|
|
990 "-H=-H -R.-H +HCOR,-H",
|
|
991 "+HC=+HC",
|
|
992 "-HC=-HC",
|
|
993 "+O=+O -HCOR,+O",
|
|
994 "-O=-O +HCOR,-O",
|
|
995 "+HCO=+HCO",
|
|
996 "-HCO=-HCO",
|
|
997 "+R=+R +R.+C +R.+H",
|
|
998 "-R=-R -R.-C -R.-H");
|
|
999
|
|
1000 foreach my $line (@summarizarr){
|
|
1001 next if $line !~ /[A-Za-z0-9]/;
|
|
1002 # print $line;
|
|
1003 chomp $line;
|
|
1004 my @fields = split(/=/,$line);
|
|
1005 # print "title = $fields[0]\n";
|
|
1006 my @parts=split(/ +/, $fields[1]);
|
|
1007 my %partshash = ();
|
|
1008 foreach my $part (@parts){$partshash{$part}=1;}
|
|
1009 my $count=0;
|
|
1010 foreach my $key ( sort keys %registeredTrees ){
|
|
1011 next if !exists $partshash{$key};
|
|
1012 # print "now adding $registeredTrees{$key} from $key\n";
|
|
1013 $count+=$registeredTrees{$key};
|
|
1014 }
|
|
1015 # print "$fields[0] : $count\n";
|
|
1016 }
|
|
1017 #--------------------------------------------------------------------------------------------------------
|
|
1018 #--------------------------------------------------------------------------------------------------------
|
|
1019 #--------------------------------------------------------------------------------------------------------
|
|
1020 #--------------------------------------------------------------------------------------------------------
|
|
1021 #--------------------------------------------------------------------------------------------------------
|
|
1022 sub largest_number{
|
|
1023 my $counter = 0;
|
|
1024 my($max) = shift(@_);
|
|
1025 foreach my $temp (@_) {
|
|
1026
|
|
1027 #print "finding largest array: $maxcounter \n";
|
|
1028 if($temp > $max){
|
|
1029 $max = $temp;
|
|
1030 }
|
|
1031 }
|
|
1032 return($max);
|
|
1033 }
|
|
1034
|
|
1035 sub smallest_number{
|
|
1036 my $counter = 0;
|
|
1037 my($min) = shift(@_);
|
|
1038 foreach my $temp (@_) {
|
|
1039 #print "finding largest array: $maxcounter \n";
|
|
1040 if($temp < $min){
|
|
1041 $min = $temp;
|
|
1042 }
|
|
1043 }
|
|
1044 return($min);
|
|
1045 }
|
|
1046 #--------------------------------------------------------------------------------------------------------
|
|
1047 #--------------------------------------------------------------------------------------------------------
|
|
1048 #--------------------------------------------------------------------------------------------------------
|
|
1049 #--------------------------------------------------------------------------------------------------------
|
|
1050 #--------------------------------------------------------------------------------------------------------
|
|
1051 sub baseml_parser{
|
|
1052 my $outputfile = $_[0];
|
|
1053 open(BOUT,"<$outputfile") or die "Cannot open output of upstream baseml $outputfile: $!";
|
|
1054 my @info = ();
|
|
1055 my @branchields = ();
|
|
1056 my @distanceields = ();
|
|
1057 my @bout = <BOUT>;
|
|
1058 #print colored ['red'], @bout ,"\n";
|
|
1059 for my $b (0 ... $#bout){
|
|
1060 my $bine=$bout[$b];
|
|
1061 #print colored ['yellow'], "sentence = ",$bine;
|
|
1062 if ($bine =~ /TREE/){
|
|
1063 $bine=$bout[$b++];
|
|
1064 $bine=$bout[$b++];
|
|
1065 $bine=$bout[$b++];
|
|
1066 #print "FOUND",$bine;
|
|
1067 chomp $bine;
|
|
1068 $bine =~ s/^\s+//g;
|
|
1069 @branchields = split(/\s+/,$bine);
|
|
1070 $bine=$bout[$b++];
|
|
1071 chomp $bine;
|
|
1072 $bine =~ s/^\s+//g;
|
|
1073 @distanceields = split(/\s+/,$bine);
|
|
1074 #print "LASTING..............\n";
|
|
1075 last;
|
|
1076 }
|
|
1077 else{
|
|
1078 }
|
|
1079 }
|
|
1080
|
|
1081 close BOUT;
|
|
1082 # print "branchfields = @branchields and distanceields = @distanceields\n" if $printer == 1;
|
|
1083 my %distance_hash=();
|
|
1084 for my $d (0 ... $#branchields){
|
|
1085 $distance_hash{$branchields[$d]} = $distanceields[$d];
|
|
1086 }
|
|
1087
|
|
1088 $info[0] = $distance_hash{"9..1"} + $distance_hash{"9..2"};
|
|
1089 $info[1] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+ $distance_hash{"8..3"};
|
|
1090 $info[2] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+$distance_hash{"7..8"}+$distance_hash{"7..4"};
|
|
1091 $info[3] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+$distance_hash{"7..8"}+$distance_hash{"6..7"}+$distance_hash{"6..5"};
|
|
1092
|
|
1093 # print "\nsending back: @info\n" if $printer == 1;
|
|
1094
|
|
1095 return join("\t",@info);
|
|
1096
|
|
1097 }
|
|
1098
|
|
1099
|
|
1100 #--------------------------------------------------------------------------------------------------------
|
|
1101 #--------------------------------------------------------------------------------------------------------
|
|
1102 #--------------------------------------------------------------------------------------------------------
|
|
1103 #--------------------------------------------------------------------------------------------------------
|
|
1104 #--------------------------------------------------------------------------------------------------------
|
|
1105 sub test_complexity{
|
|
1106 my $printer = 0;
|
|
1107 my $sequence = $_[0];
|
|
1108 my $COMPLEXITY_SUPPORT = $_[1];
|
|
1109 my $complexity=int($COMPLEXITY_SUPPORT * (1/40)); #1 THIS IS AN ARBITRARY THRESHOLD SET FOR LOW COMPLEXITY.
|
|
1110 #2 THE INSPIRATION WAS WEB MILLER'S MAIL SENT ON
|
|
1111 #3 19 Apr 2008 WHERE HE CLASSED AS HIGH COMPLEXITY
|
|
1112 #4 REGION, IF 40 BP OF SEQUENCE HAS AT LEAST 3 OF
|
|
1113 #5 EACH NUCLEOTIDE. HENCE, I NORMALIZE THIS PARAMETER
|
|
1114 #6 FOR THE ACTUAL LENGTH OF $FLANK_SUPPORT SET BY
|
|
1115 #7 THE USER.
|
|
1116 #8 WEB MILLER SENT THE MAIL TO YDK104@PSU.EDU
|
|
1117
|
|
1118
|
|
1119
|
|
1120 my $As = ($sequence=~ s/A/A/gi);
|
|
1121 my $Ts = ($sequence=~ s/T/T/gi);
|
|
1122 my $Gs = ($sequence=~ s/G/G/gi);
|
|
1123 my $Cs = ($sequence=~ s/C/C/gi);
|
|
1124 #print "seq = $sequence, As=$As, Ts=$Ts, Gs=$Gs, Cs=$Cs\n" if $printer == 1;
|
|
1125
|
|
1126 my $ans = ();
|
|
1127
|
|
1128 return "HIGH" if $As >= $complexity && $Ts >= $complexity && $Cs >= $complexity && $Gs >= $complexity;
|
|
1129
|
|
1130 my @nts = ("A","T","G","C","-");
|
|
1131
|
|
1132 my $lowcomplex = 0;
|
|
1133
|
|
1134 foreach my $nt (@nts){
|
|
1135 $lowcomplex =1 if $sequence =~ /(($nt\-*){10,})/i;
|
|
1136 # print "caught with a mono of $nt : $1 in $sequence\n" if $sequence =~ /(($nt\-*){10,})/i;
|
|
1137 $lowcomplex =1 if $sequence =~ /(($nt[A-Za-z]){10,})/i;
|
|
1138 $lowcomplex =1 if $sequence =~ /(([A-Za-z]$nt){10,})/i;
|
|
1139 # print "caught with a di with $nt : $2 in $sequence\n" if $sequence =~ /(($nt[A-Za-z]){10,})/i || $sequence =~ /(([A-Za-z]$nt){10,})/i;
|
|
1140 my $nont = ($sequence=~ s/$nt/$nt/gi);
|
|
1141
|
|
1142 }
|
|
1143 # print "leaving for now.. $sequence\n" if $printer == 1 && $lowcomplex == 0;
|
|
1144 #<STDIN>;
|
|
1145 return "HIGH" if $lowcomplex == 0;
|
|
1146 return "LOW" ;
|
|
1147 }
|
|
1148 #--------------------------------------------------------------------------------------------------------
|
|
1149 #--------------------------------------------------------------------------------------------------------
|
|
1150 #--------------------------------------------------------------------------------------------------------
|
|
1151 #--------------------------------------------------------------------------------------------------------
|
|
1152 #--------------------------------------------------------------------------------------------------------
|
|
1153 sub sequence_similarity{
|
|
1154 my $printer = 0;
|
|
1155 my @seq1 = split(/\s*/, $_[0]);
|
|
1156 my @seq2 = split(/\s*/, $_[1]);
|
|
1157 my $similarity_thresh = $_[2];
|
|
1158 my $info = $_[3];
|
|
1159 # print "input = @_\n" if $printer == 1;
|
|
1160 my $seq1str = $_[0];
|
|
1161 my $seq2str = $_[1];
|
|
1162 $seq1str=~s/\-//g; $seq2str=~s/\-//g;
|
|
1163 my $similarity=0;
|
|
1164
|
|
1165 my $nucdiffs=0;
|
|
1166 my $nucsims=0;
|
|
1167 my $indeldiffs=0;
|
|
1168
|
|
1169 for my $i (0...$#seq1){
|
|
1170 $similarity++ if $seq1[$i] =~ /$seq2[$i]/i ; #|| $seq1[$i] =~ /\-/i || $seq2[$i] =~ /\-/i ;
|
|
1171 $nucsims++ if $seq1[$i] =~ /$seq2[$i]/i && ($seq1[$i] =~ /[a-zA-Z]/i && $seq2[$i] =~ /[a-zA-Z]/i);
|
|
1172 $nucdiffs++ if $seq1[$i] !~ /$seq2[$i]/i && ($seq1[$i] =~ /[a-zA-Z]/i && $seq2[$i] =~ /[a-zA-Z]/i);
|
|
1173 $indeldiffs++ if $seq1[$i] !~ /$seq2[$i]/i && $seq1[$i] =~ /\-/i || $seq2[$i] =~ /\-/i;
|
|
1174 }
|
|
1175 my $sim = $similarity/length($_[0]);
|
|
1176 return ( $sim, $nucdiffs, $indeldiffs ); #<= $similarity_thresh;
|
|
1177 }
|
|
1178 #--------------------------------------------------------------------------------------------------------
|
|
1179 #--------------------------------------------------------------------------------------------------------
|
|
1180 #--------------------------------------------------------------------------------------------------------
|
|
1181
|
|
1182 sub load_treesToReject{
|
|
1183 my @rejectlist = ();
|
|
1184 my $alltags = join("",@_);
|
|
1185 @rejectlist = qw (-HCOR +HCOR) if $alltags eq "HCORM";
|
|
1186 @rejectlist = qw ( -HCO|+R +HCO|-R) if $alltags eq "HCOR";
|
|
1187 @rejectlist = qw ( -HC|+O +HC|-O) if $alltags eq "HCO";
|
|
1188
|
|
1189 %treesToReject=();
|
|
1190 $treesToReject{$_} = $_ foreach (@rejectlist);
|
|
1191 #print "loaded to reject for $alltags; ", $treesToReject{$_},"\n" foreach (@rejectlist); #<STDIN>;
|
|
1192 }
|
|
1193 #--------------------------------------------------------------------------------------------------------
|
|
1194 sub load_treesToIgnore{
|
|
1195 my @rejectlist = ();
|
|
1196 my $alltags = join("",@_);
|
|
1197 @rejectlist = qw (-HCOR +HCOR +HCORM -HCORM) if $alltags eq "HCORM";
|
|
1198 @rejectlist = qw ( -HCO|+R +HCO|-R +HCOR -HCOR) if $alltags eq "HCOR";
|
|
1199 @rejectlist = qw ( -HC|+O +HC|-O +HCO -HCO) if $alltags eq "HCO";
|
|
1200
|
|
1201 %treesToIgnore=();
|
|
1202 $treesToIgnore{$_} = $_ foreach (@rejectlist);
|
|
1203 #print "loaded ", $treesToIgnore{$_},"\n" foreach (@rejectlist);
|
|
1204 }
|
|
1205 #--------------------------------------------------------------------------------------------------------
|
|
1206 sub load_thresholds{
|
|
1207 my @threshold_array=split(/[,_]/,$_[0]);
|
|
1208 unshift @threshold_array, "0";
|
|
1209 for my $size (1 ... 4){
|
|
1210 $thresholdhash{$size}=$threshold_array[$size];
|
|
1211 }
|
|
1212 }
|
|
1213 #--------------------------------------------------------------------------------------------------------
|
|
1214 sub load_allPossibleTrees{
|
|
1215 #1 THIS FILE STORES ALL POSSIBLE SCENARIOS OF MICROSATELLITE
|
|
1216 #2 BIRTH AND DEATH EVENTS ON A 5-PRIMATE TREE OF H,C,O,R,M
|
|
1217 #3 IN FORM OF A TEXT FILE. THIS WILL BE USED AS A TEMPLET
|
|
1218 #4 TO COMPARE EACH MICROSATELLITE CLUSTER TO UNDERSTAND THE
|
|
1219 #5 EVOLUTION OF EACH LOCUS. WE WILL THEN DISCARD SOME
|
|
1220 #6 MICROSATS ACCRODING TO THEIR EVOLUTIONARY BEHAVIOUR ON
|
|
1221 #7 THE TREE. MOST PROBABLY WE WILL REMOVE THOSE MICROSATS
|
|
1222 #8 THAT ARE NOT SUFFICIENTLY INFORMATIVE, LIKE IN CASE OF
|
|
1223 #9 AN OUTGROUP MICROSATELLITE BEING DIFFERENT FRON ALL OTHER
|
|
1224 #10 SPECIES IN THE TREE.
|
|
1225 my $tree_list = $_[0];
|
|
1226 # print "file to be loaded: $tree_list\n";
|
|
1227
|
|
1228 my @trarr = ();
|
|
1229 @trarr = ("#H C O CONCLUSION ALTERNATE",
|
|
1230 "+ + + +HCO NA",
|
|
1231 "+ _ _ +H NA",
|
|
1232 "_ + _ +C NA",
|
|
1233 "_ _ + -HC|+O NA",
|
|
1234 "+ _ + -C +H",
|
|
1235 "_ + + -H +C",
|
|
1236 "+ + _ +HC|-O NA",
|
|
1237 "_ _ _ -HCO NA") if $tree_list =~ /_HCO\.txt/;
|
|
1238 @trarr = ("#H C O R CONCLUSION ALTERNATE",
|
|
1239 "_ _ _ _ -HCOR NA",
|
|
1240 "+ + + + +HCOR NA",
|
|
1241 "+ + + _ +HCO|-R +H.+C.+O",
|
|
1242 "+ + _ _ +HC +H.+C;-O",
|
|
1243 "+ _ _ _ +H +HC,-C",
|
|
1244 "_ + _ _ +C +HC,-H",
|
|
1245 "_ _ + _ +O -HC|-H.-C",
|
|
1246 "_ _ + + -HC -H.-C",
|
|
1247 "+ _ _ + +H|-C.-O +HC,-C",
|
|
1248 "_ + _ + +C -H.-O",
|
|
1249 "_ + + _ -H +C.+O",
|
|
1250 "_ _ _ + -HCO|+R NA",
|
|
1251 "+ _ + _ +H.+O|-C NA",
|
|
1252 "_ + + + -H -HC,+C",
|
|
1253 "+ _ + + -C -HC,+H",
|
|
1254 "+ + _ + -O +HC") if $tree_list =~ /_HCOR\.txt/;
|
|
1255
|
|
1256 @trarr = ("#H C O R M CONCLUSION ALTERNATE",
|
|
1257 "+ + + + _ +HCOR NA",
|
|
1258 "+ + + _ + -R +HCO;+HC.+O;+H.+C.+O",
|
|
1259 "+ + _ + + -O -HCO,+HC|-HCO,+HC;-HCO,(+H.+C)",
|
|
1260 "+ _ + + + -C -HC,+H;+HCO,(+H.+O)",
|
|
1261 "_ + + + + -H -HC,+C;-HCO,(+C.+O)",
|
|
1262 "_ _ _ _ + -HCOR NA",
|
|
1263 "_ _ _ + _ +R -HC.-O;-H.-C.-O",
|
|
1264 "_ _ + _ _ +O +HCO,-HC;+HCO,(-H.-C)",
|
|
1265 "_ + _ _ _ +C +HC,-H;+HCO,(-H.-O)",
|
|
1266 "+ _ _ _ _ +H +HC,-C;+HCO,(-C.-O)",
|
|
1267 "+ + + _ _ +HCO +H.+C.+O",
|
|
1268 "+ + _ + _ -O +R.+HC|-HCO,+HC;+H.+C.+R|-HCO,(+H.+C)",
|
|
1269 "+ _ + + _ -C -HC,+H;+H.+O.+R|-HCO,(+H.+O)",
|
|
1270 "_ + + + _ -H -HC,+C;+C.+O.+R|-HCO,(+C.+O)",
|
|
1271 "_ _ _ + + -HCO -HC.-O;-H.-C.-O",
|
|
1272 "_ _ + _ + +O +HCO,-HC;+HCO,(-H.-C)",
|
|
1273 "_ + _ _ + +C +HC,-H;+HCO,(-H.-O)",
|
|
1274 "+ _ _ _ + +H -HC,+H;+HCO,(-C.-O)",
|
|
1275 "+ + _ _ + +HC -R.-O|+HCO,-O|+H.+C;-HCO,+HC;-HCO,(+H.+C)",
|
|
1276 "+ _ + _ + -R.-C|+HCO,-C|+H.+O NA",
|
|
1277 "_ + + _ + -R.-H|+HCO,-H|+C.+O NA",
|
|
1278 "_ _ + + _ -HC +R.+O|-HCO,+O|+HCO,-HC",
|
|
1279 "_ + _ + _ +R.+C|-HCO,+C|-HC,+C +HCO,(-H.-O)",
|
|
1280 "+ _ _ + _ +R.+H|-C.-O +HCO,(-C.-O)",
|
|
1281 "+ _ _ + + -O.-C|-HCO,+H +R.+H;-HCO,(+R.+H)",
|
|
1282 "_ + _ + + -O.-H|-HCO,+C +R.+C;-HCO,(+R.+C)",
|
|
1283 "_ + + _ _ +HCO,-H|+O.+C NA",
|
|
1284 "+ _ + _ _ +HCO,-C|+O.+H NA",
|
|
1285 "_ _ + + + -HC -H.-C|-HCO,+O",
|
|
1286 "+ + _ _ _ +HC +H.+C|+HCO,-O|-HCO,+HC;-HCO,(+H.+C)",
|
|
1287 "+ + + + + +HCORM NA") if $tree_list =~ /_HCORM\.txt/;
|
|
1288
|
|
1289
|
|
1290 my $template_p = $_[1];
|
|
1291 my $alternate_p = $_[2];
|
|
1292 #1 THIS IS THE HASH IN WHICH INFORMATION FROM THE ABOVE FILE
|
|
1293 #2 GETS STORED, USING THE WHILE LOOP BELOW. HERE, THE KEY
|
|
1294 #3 OF EACH ROW IS THE EVOLUTIONARY CONFIGURATION OF A LOCUS
|
|
1295 #4 ON THE PRIMATE TREE, BASED ON PRESENCE/ABSENCE OF A MICROSAT
|
|
1296 #5 AT THAT LOCUS, LIKE SAY "+ + + _ _" .. EACH COLUMN BELONGS
|
|
1297 #6 TO ONE SPECIES; HERE THE COLUMN NAMES ARE "H C O R M".
|
|
1298 #7 THE VALUE FOR EACH ENTRY IS THE MEANING OF THE ABOVE
|
|
1299 #8 CONFIGURATION (I.E., CONFIGURAION OF THE KEY. HERE, THE
|
|
1300 #9 VALUE WILL BE +HCO, SIGNIFYING A BIRTH IN HUMAN-CHIMP-ORANG
|
|
1301 #10 COMMON ANCESTOR. THIS HASH HAS BEEN LOADED HERE TO BE USED
|
|
1302 #11 LATER BY THE SUBROUTINE sub treeStudy{} THAT STUDIES
|
|
1303 #12 EVOLUTIONARY CONFIGURAION OF EACH MICROSAT LOCUS, AS
|
|
1304 #13 MENTIONED ABOVE.
|
|
1305 my @keys_array=();
|
|
1306 foreach my $line (@trarr){
|
|
1307 next if $line =~ /^#/;
|
|
1308 chomp $line;
|
|
1309 my @fields = split("\t", $line);
|
|
1310 push @keys_array, $fields[0];
|
|
1311 # print "loading: $fields[0]\n";
|
|
1312 $template_p->{$fields[0]}[0] = $fields[1];
|
|
1313 $template_p->{$fields[0]}[1] = 0;
|
|
1314 $alternate_p->{$fields[1]} = $fields[2];
|
|
1315
|
|
1316 }
|
|
1317 # print "loaded the trees with keys: @keys_array\n";
|
|
1318 return $template_p, \@keys_array, $alternate_p;
|
|
1319 }
|
|
1320
|
|
1321 #--------------------------------------------------------------------------------------------------------
|
|
1322 #--------------------------------------------------------------------------------------------------------
|
|
1323 #--------------------------------------------------------------------------------------------------------
|
|
1324 #--------------------------------------------------------------------------------------------------------
|
|
1325 #--------------------------------------------------------------------------------------------------------
|
|
1326 sub checkCleanCase{
|
|
1327 my $printer = 0;
|
|
1328 my $tree = $_[0];
|
|
1329 my $finalalignment = $_[1];
|
|
1330
|
|
1331 #print "IN checkCleanCase: @_\n";
|
|
1332 #<STDIN>;
|
|
1333 my @indivspecies = $tree =~ /[A-Z]/g;
|
|
1334 $finalalignment =~ s/\./_/g;
|
|
1335 my @captured = $finalalignment =~ /[A-Za-z, \(\):]+\![:A-Za-z, \(\)]/g;
|
|
1336
|
|
1337 my $unclean = 0;
|
|
1338
|
|
1339 foreach my $sp (@indivspecies){
|
|
1340 foreach my $cap (@captured){
|
|
1341 $cap =~ s/:[A-Za-z\-]+//g;
|
|
1342 my @sps = $cap =~ /[A-Z]+/g;
|
|
1343 my $spsc = join("", @sps);
|
|
1344 # print "checking whether imp species $sp is present in $cap i.e, in $spsc\n " if $printer == 1;
|
|
1345 if ($spsc =~ /$sp/){
|
|
1346 # print "foind : $sp\n";
|
|
1347 $unclean = 1; last;
|
|
1348 }
|
|
1349 }
|
|
1350 last if $unclean == 1;
|
|
1351 }
|
|
1352 #<STDIN>;
|
|
1353 return "CLEAN" if $unclean == 0;
|
|
1354 return "UNCLEAN";
|
|
1355 }
|
|
1356
|
|
1357 #--------------------------------------------------------------------------------------------------------
|
|
1358 #--------------------------------------------------------------------------------------------------------
|
|
1359 #--------------------------------------------------------------------------------------------------------
|
|
1360 #--------------------------------------------------------------------------------------------------------
|
|
1361 #--------------------------------------------------------------------------------------------------------
|
|
1362
|
|
1363
|
|
1364 sub adjustCoordinates{
|
|
1365 my $line = $_[0];
|
|
1366 my $no_of_species = $line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)/x/g;
|
|
1367 $infocord = 2 + (4*$no_of_species) - 1;
|
|
1368 $typecord = 2 + (4*$no_of_species) + 1 - 1;
|
|
1369 $motifcord = 2 + (4*$no_of_species) + 2 - 1;
|
|
1370 $gapcord = $motifcord+1;
|
|
1371 $startcord = $gapcord+1;
|
|
1372 $strandcord = $startcord+1;
|
|
1373 $endcord = $strandcord + 1;
|
|
1374 $microsatcord = $endcord + 1;
|
|
1375 $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
|
|
1376 $interr_poscord = $microsatcord + 3;
|
|
1377 $no_of_interruptionscord = $microsatcord + 4;
|
|
1378 $interrcord = $microsatcord + 2;
|
|
1379 #print "$line\n startcord = $startcord, and endcord = $endcord and no_of_species = $no_of_species\n" if $printer == 1;
|
|
1380 return $no_of_species;
|
|
1381 }
|
|
1382
|
|
1383
|
|
1384 sub printhash{
|
|
1385 my $alivehash = $_[0];
|
|
1386 my @tags = @$_[1];
|
|
1387 # print "print hash\n";
|
|
1388 foreach my $tag (@tags){
|
|
1389 # print "$tag=",$alivehash->{$tag},"\n" if exists $alivehash->{$tag};
|
|
1390 }
|
|
1391
|
|
1392 return "\n"
|
|
1393 }
|
|
1394 sub peel_onion{
|
|
1395 my $printer = 0;
|
|
1396 # print "received: @_\n" ; #<STDIN>;
|
|
1397 $printer = 0;
|
|
1398 my ($tree, $sequences, $alignment, $tagarray, $microsathash, $nonmicrosathash, $motif, $tree_analysis, $threshold, $microsatstarts) = @_;
|
|
1399 # print "in peel onion.. tree = $tree \n" if $printer == 1;
|
|
1400 my %sequence_hash=();
|
|
1401
|
|
1402
|
|
1403 # for my $i (0 ... $#sequences){ $sequence_hash{$species[$i]}=$sequences->[$i]; }
|
|
1404
|
|
1405
|
|
1406 my %node_sequences=();
|
|
1407
|
|
1408 my %node_alignments = (); #NEW, Nov 28 2008
|
|
1409 my @tags=();
|
|
1410 my @locus_sequences=();
|
|
1411 my %alivehash=();
|
|
1412 foreach my $tag (@$tagarray) {
|
|
1413 #print "adding: $tag\n";
|
|
1414 push(@tags, $tag);
|
|
1415 $node_sequences{$tag}=join ".",split(/\s*/,$microsathash->{$tag}) if $microsathash->{$tag} ne "NULL";
|
|
1416 $alivehash{$tag}= $tag if $microsathash->{$tag} ne "NULL";
|
|
1417 $node_sequences{$tag}=join ".",split(/\s*/,$nonmicrosathash->{$tag}) if $microsathash->{$tag} eq "NULL";
|
|
1418 $node_alignments{$tag}=join ".",split(/\s*/,$alignment->{$tag}) ;
|
|
1419 push @locus_sequences, $node_sequences{$tag};
|
|
1420 #print "adding to node_seq: $tag = ",$node_alignments{$tag},"\n";
|
|
1421 }
|
|
1422
|
|
1423
|
|
1424
|
|
1425 my ($nodes_arr, $branches_hash) = get_nodes($tree);
|
|
1426 my @nodes=@$nodes_arr;
|
|
1427 # print "recieved nodes = " if $printer == 1;
|
|
1428 # foreach my $key (@nodes) {print "@$key " if $printer == 1;}
|
|
1429
|
|
1430 # print "\n" if $printer == 1;
|
|
1431
|
|
1432 #POPULATE branches_hash WITH INFORMATION ABOUT LIVESTATUS
|
|
1433 foreach my $keys (@nodes){
|
|
1434 my @pair = @$keys;
|
|
1435 my $joint = "(".join(", ",@pair).")";
|
|
1436 my $copykey = join "", @pair;
|
|
1437 $copykey =~ s/[\W ]+//g;
|
|
1438 # print "for node: $keys, copykey = $copykey and joint = $joint\n" if $printer == 1;
|
|
1439 my $livestatus = 1;
|
|
1440 foreach my $copy (split(/\s*/,$copykey)){
|
|
1441 $livestatus = 0 if !exists $alivehash{$copy};
|
|
1442 }
|
|
1443 $alivehash{$joint} = $joint if !exists $alivehash{$joint} && $livestatus == 1;
|
|
1444 # print "alivehash = $alivehash{$joint}\n" if exists $alivehash{$joint} && $printer == 1;
|
|
1445 }
|
|
1446
|
|
1447 @nodes = reverse(@nodes); #1 THIS IS IN ORDER TO GO THROUGH THE TREE FROM LEAVES TO ROOT.
|
|
1448
|
|
1449 my @mutations_array=();
|
|
1450
|
|
1451 my $joint = ();
|
|
1452 foreach my $node (@nodes){
|
|
1453 my @pair = @$node;
|
|
1454 # print "now in the nodes for loop, pair = @pair\n and sequences=\n" if $printer == 1;
|
|
1455 $joint = "(".join(", ",@pair).")";
|
|
1456 my @pair_sequences=();
|
|
1457
|
|
1458 foreach my $tag (@pair){
|
|
1459 # print "$tag: $node_alignments{$tag}\n" if $printer == 1;
|
|
1460 print $node_alignments{$tag},"\n" if $printer == 1;
|
|
1461 push @pair_sequences, $node_alignments{$tag};
|
|
1462 }
|
|
1463 # print "ppeel onion joint = $joint , pair_sequences=>@pair_sequences< , pair=>@pair<\n" if $printer == 1;
|
|
1464
|
|
1465 my ($compared, $substitutions_list) = base_by_base_simple($motif,\@pair_sequences, scalar(@pair_sequences), @pair, $joint);
|
|
1466 $node_alignments{$joint}=$compared;
|
|
1467 push( @mutations_array,split(/:/,$substitutions_list));
|
|
1468 # print "newly added to node_sequences: $node_alignments{$joint} and list of mutations =\n", join("\n",@mutations_array),"\n" if $printer == 1;
|
|
1469 }
|
|
1470
|
|
1471 # print "now sending for analyze_mutations: mutation_array=@mutations_array, nodes=@nodes, branches_hash=$branches_hash, alignment=$alignment, tags=@tags, alivehash=\%alivehash, node_sequences=\%node_sequences, microsatstarts=$microsatstarts, motif=$motif\n" if $printer == 1;
|
|
1472 ## <STDIN> if $printer == 1;
|
|
1473
|
|
1474 my $analayzed_mutations = analyze_mutations(\@mutations_array, \@nodes, $branches_hash, $alignment, \@tags, \%alivehash, \%node_sequences, $microsatstarts, $motif);
|
|
1475
|
|
1476 # print "returning: ", $analayzed_mutations, \@nodes, $branches_hash,"\n" if scalar @mutations_array > 0 && $printer == 1;
|
|
1477 # print "returning: NULL, NULL, NULL " if scalar @mutations_array == 0 && $printer == 1;
|
|
1478 # print "final node alignment = $node_alignments{$joint}\n" if $printer == 1;
|
|
1479
|
|
1480 # <STDIN> if $printer == 1;
|
|
1481
|
|
1482
|
|
1483 return ($analayzed_mutations, \@nodes, $branches_hash, \%alivehash, $node_alignments{$joint}) if scalar @mutations_array > 0;
|
|
1484 return ("NULL",\@nodes,$branches_hash, \%alivehash, "NULL") if scalar @mutations_array == 0;
|
|
1485 }
|
|
1486
|
|
1487 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
|
|
1488 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
|
|
1489
|
|
1490 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
|
|
1491 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
|
|
1492
|
|
1493 sub get_nodes{
|
|
1494 my $printer = 0;
|
|
1495
|
|
1496 my $tree=$_[0];
|
|
1497 #$tree =~ s/ +//g;
|
|
1498 $tree =~ s/\t+//g;
|
|
1499 $tree=~s/;//g;
|
|
1500 print "tree=$tree\n" if $printer == 1;
|
|
1501 my @nodes = ();
|
|
1502 my @onions=($tree);
|
|
1503 my %branches=();
|
|
1504 foreach my $bite (@onions){
|
|
1505 $bite=~ s/^\(|\)$//g;
|
|
1506 chomp $bite;
|
|
1507 # print "tree = $bite \n";
|
|
1508 # <STDIN>;
|
|
1509 $bite=~ /([ ,\(\)A-Z]+)\,\s*([ ,\(\)A-Z]+)/;
|
|
1510 #$tree =~ /(\(\(\(H, C\), O\), R\))\, (M)/;
|
|
1511 my @raw_nodes = ($1, $2);
|
|
1512 print "raw nodes = $1 and $2\n" if $printer == 1;
|
|
1513 push(@nodes, [@raw_nodes]);
|
|
1514 foreach my $node (@raw_nodes) {push (@onions, $node) if $node =~ /,/;}
|
|
1515 foreach my $node (@raw_nodes) {$branches{$node}="(".$bite.")"; print "adding to branches: $node = ($bite)\n" if $printer == 1;}
|
|
1516 print "onions = @onions\n" if $printer == 1;<STDIN> if $printer == 1;
|
|
1517 }
|
|
1518 $printer = 0;
|
|
1519 return \@nodes, \%branches;
|
|
1520 }
|
|
1521
|
|
1522
|
|
1523 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
|
|
1524 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
|
|
1525 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
|
|
1526 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
|
|
1527 sub analyze_mutations{
|
|
1528 my ($mutations_array, $nodes, $branches_hash, $alignment, $tags, $alivehash, $node_sequences, $microsatstarts, $motif) = @_;
|
|
1529 my $locuslength = length($alignment->{$tags->[0]});
|
|
1530 my $printer = 0;
|
|
1531
|
|
1532
|
|
1533 # print " IN analyzed_mutations....\n" if $printer == 1; # \n mutations array = @$mutations_array, \nAND locuslength = $locuslength\n" if $printer == 1;
|
|
1534 my %mutation_hash=();
|
|
1535 my %froms_megahash=();
|
|
1536 my %tos_megahash=();
|
|
1537 my %position_hash=();
|
|
1538 my @solutions_array=();
|
|
1539 foreach my $mutation (@$mutations_array){
|
|
1540 # print "loadin mutation: $mutation\n" if $printer == 1;
|
|
1541 my %localhash= $mutation =~ /([\S ]+)=([\S ]+)/g;
|
|
1542 $mutation_hash{$localhash{"position"}} = {%localhash};
|
|
1543 push @{$position_hash{$localhash{"position"}}},$localhash{"node"};
|
|
1544 # print "feeding position hash with $localhash{position}: $position_hash{$localhash{position}}[0]\n" if $printer == 1;
|
|
1545 $froms_megahash{$localhash{"position"}}{$localhash{"node"}}=$localhash{"from"};
|
|
1546 $tos_megahash{$localhash{"position"}}{$localhash{"node"}}=$localhash{"to"};
|
|
1547 # print "just a trial: $mutation_hash{$localhash{position}}{position}\n" if $printer == 1;
|
|
1548 # print "loadin in tos_megahash: $localhash{position} {$localhash{node} = $localhash{to}\n" if $printer == 1;
|
|
1549 # print "loadin in from: $localhash{position} {$localhash{node} = $localhash{from}\n" if $printer == 1;
|
|
1550 }
|
|
1551
|
|
1552 # print "now going through each position in loculength:\n" if $printer == 1;
|
|
1553 ## <STDIN> if $printer == 1;
|
|
1554
|
|
1555 for my $pos (0 ... $locuslength-1){
|
|
1556 # print "at position: $pos\n" if $printer == 1;
|
|
1557
|
|
1558 if (exists($mutation_hash{$pos})){
|
|
1559 my @local_nodes=@{$position_hash{$pos}};
|
|
1560 # print "found mutation: @{$position_hash{$pos}} : @local_nodes\n" if $printer == 1;
|
|
1561
|
|
1562 foreach my $local_node (@local_nodes){
|
|
1563 # print "at local node: $local_node ... from state = $froms_megahash{$pos}{$local_node}\n" if $printer == 1;
|
|
1564 my $open_insertion=();
|
|
1565 my $open_deletion=();
|
|
1566 my $open_to_substitution=();
|
|
1567 my $open_from_substitution=();
|
|
1568 if ($froms_megahash{$pos}{$local_node} eq "-"){
|
|
1569 # print "here exists a microsatellite from $local_node to $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};;
|
|
1570 # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}} && $printer == 1;
|
|
1571 #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};
|
|
1572 $open_insertion=$tos_megahash{$pos}{$local_node};
|
|
1573 for my $posnext ($pos+1 ... $locuslength-1){
|
|
1574 # print "in first if .... studying posnext: $posnext\n" if $printer == 1;
|
|
1575 last if !exists ($froms_megahash{$posnext}{$local_node});
|
|
1576 # print "for posnext: $posnext, there exists $froms_megahash{$posnext}{$local_node}.. already, open_insertion = $open_insertion.. checking is $froms_megahash{$posnext}{$local_node} matters\n" if $printer == 1;
|
|
1577 $open_insertion = $open_insertion.$tos_megahash{$posnext}{$local_node} if $froms_megahash{$posnext}{$local_node} eq "-";
|
|
1578 # print "now open_insertion=$open_insertion\n" if $printer == 1;
|
|
1579 delete $mutation_hash{$posnext} if $froms_megahash{$posnext}{$local_node} eq "-";
|
|
1580 }
|
|
1581 print "1 Feeding in: ", join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_insertion", "deletion="),"\n" if $printer == 1;
|
|
1582 push (@solutions_array, join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_insertion", "deletion="));
|
|
1583 }
|
|
1584 elsif ($tos_megahash{$pos}{$local_node} eq "-"){
|
|
1585 # print "here exists a microsatellite to $local_node from $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};;
|
|
1586 # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};
|
|
1587 #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};
|
|
1588 $open_deletion=$froms_megahash{$pos}{$local_node};
|
|
1589 for my $posnext ($pos+1 ... $locuslength-1){
|
|
1590 print "in 1st elsif studying posnext: $posnext\n" if $printer == 1;
|
|
1591 print "nexting as nextpos does not exist\n" if !exists ($tos_megahash{$posnext}{$local_node}) && $printer == 1;
|
|
1592 last if !exists ($tos_megahash{$posnext}{$local_node});
|
|
1593 print "for posnext: $posnext, there exists $tos_megahash{$posnext}{$local_node}\n" if $printer == 1;
|
|
1594 $open_deletion = $open_deletion.$froms_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} eq "-";
|
|
1595 delete $mutation_hash{$posnext} if $tos_megahash{$posnext}{$local_node} eq "-";
|
|
1596 }
|
|
1597 print "2 Feeding in:", join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_deletion"), "\n" if $printer == 1;
|
|
1598 push (@solutions_array, join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_deletion"));
|
|
1599 }
|
|
1600 elsif ($tos_megahash{$pos}{$local_node} ne "-"){
|
|
1601 # print "here exists a microsatellite from $local_node to $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};;
|
|
1602 # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};
|
|
1603 #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};
|
|
1604 # print "microsatstart = $microsatstarts->{$local_node} \n" if exists $microsatstarts->{$local_node} && $pos < $microsatstarts->{$local_node} && $printer == 1;
|
|
1605 next if exists $microsatstarts->{$local_node} && $pos < $microsatstarts->{$local_node};
|
|
1606 $open_to_substitution=$tos_megahash{$pos}{$local_node};
|
|
1607 $open_from_substitution=$froms_megahash{$pos}{$local_node};
|
|
1608 print "open from substitution: $open_from_substitution \n" if $printer == 1;
|
|
1609 for my $posnext ($pos+1 ... $locuslength-1){
|
|
1610 #print "in last elsif studying posnext: $posnext\n";
|
|
1611 last if !exists ($tos_megahash{$posnext}{$local_node});
|
|
1612 print "for posnext: $posnext, there exists $tos_megahash{$posnext}{$local_node}\n" if $printer == 1;
|
|
1613 $open_to_substitution = $open_to_substitution.$tos_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} ne "-";
|
|
1614 $open_from_substitution = $open_from_substitution.$froms_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} ne "-";
|
|
1615 delete $mutation_hash{$posnext} if $tos_megahash{$posnext}{$local_node} ne "-" && $froms_megahash{$posnext}{$local_node} ;
|
|
1616 }
|
|
1617 print "open from substitution: $open_from_substitution \n" if $printer == 1;
|
|
1618
|
|
1619 #IS THE STRETCH OF SUBSTITUTION MICROSATELLITE-LIKE?
|
|
1620 my @motif_parts=split(/\s*/,$motif);
|
|
1621 #GENERATING THE FLEXIBLE LEFT END
|
|
1622 my $left_query=();
|
|
1623 for my $k (1 ... $#motif_parts) {
|
|
1624 $left_query= $motif_parts[$k]."|)";
|
|
1625 $left_query="(".$left_query;
|
|
1626 }
|
|
1627 $left_query=$left_query."?";
|
|
1628
|
|
1629 print "left_quewry = $left_query\n" if $printer == 1;
|
|
1630 #GENERATING THE FLEXIBLE RIGHT END
|
|
1631 my $right_query=();
|
|
1632 for my $k (0 ... ($#motif_parts-1)) {
|
|
1633 $right_query= "(|".$motif_parts[$k];
|
|
1634 $right_query=$right_query.")";
|
|
1635 }
|
|
1636 $right_query=$right_query."?";
|
|
1637 print "right_query = $right_query\n" if $printer == 1;
|
|
1638 print "Hence, searching for: ^$left_query($motif)+$right_query\$\n" if $printer == 1;
|
|
1639
|
|
1640 my $motifcomb=$motif x 50;
|
|
1641 print "motifcomb = $motifcomb\n" if $printer == 1;
|
|
1642 if ( ($motifcomb =~/$open_to_substitution/i) && (length ($open_to_substitution) >= length($motif)) ){
|
|
1643 print "sequence microsat-like\n" if $printer == 1;
|
|
1644 my $all_microsat_like = 0;
|
|
1645 print "3 feeding in: ", join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_from_substitution"), "\n" if $printer == 1;
|
|
1646 push (@solutions_array, join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_from_substitution"));
|
|
1647 print "4 feeding in: ", join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_to_substitution", "deletion="), "\n" if $printer == 1;
|
|
1648 push (@solutions_array, join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_to_substitution", "deletion="));
|
|
1649
|
|
1650 }
|
|
1651 else{
|
|
1652 print "5 feeding in: ", join("\t", "node=$local_node","type=substitution" ,"position=$pos", "from=$open_from_substitution", "to=$open_to_substitution", "insertion=", "deletion="), "\n" if $printer == 1;
|
|
1653 push (@solutions_array, join("\t", "node=$local_node","type=substitution" ,"position=$pos", "from=$open_from_substitution", "to=$open_to_substitution", "insertion=", "deletion="));
|
|
1654 }
|
|
1655 #IS THE FROM-SEQUENCE MICROSATELLITE-LIKE?
|
|
1656
|
|
1657 }
|
|
1658 #<STDIN> if $printer ==1;
|
|
1659 }
|
|
1660 #<STDIN> if $printer ==1;
|
|
1661 }
|
|
1662 }
|
|
1663
|
|
1664 print "\n", "#" x 50, "\n" if $printer == 1;
|
|
1665 foreach my $tag (@$tags){
|
|
1666 print "$tag: $alignment->{$tag}\n" if $printer == 1;
|
|
1667 }
|
|
1668 print "\n", "#" x 50, "\n" if $printer == 1;
|
|
1669
|
|
1670 print "returning SOLUTIONS ARRAY : \n",join("\n", @solutions_array),"\n" if $printer == 1;
|
|
1671 #print "end\n";
|
|
1672 #<STDIN> if
|
|
1673 return \@solutions_array;
|
|
1674 }
|
|
1675 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
|
|
1676 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
|
|
1677 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
|
|
1678 #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#
|
|
1679
|
|
1680 sub base_by_base_simple{
|
|
1681 my $printer = 0;
|
|
1682 my ($motif, $locus, $no, $pair0, $pair1, $joint) = @_;
|
|
1683 my @seq_array=();
|
|
1684 print "IN SUBROUTUNE base_by_base_simple.. information received = @_\n" if $printer == 1;
|
|
1685 print "pair0 = $pair0 and pair1 = $pair1\n" if $printer == 1;
|
|
1686
|
|
1687 my @example=split(/\./,$locus->[0]);
|
|
1688 print "example, for length = @example\n" if $printer == 1;
|
|
1689 for my $i (0...$no-1){push(@seq_array, [split(/\./,$locus->[$i])]); print "for $i, from $locus->[$i], seq_array = >@{$seq_array[$i]}<\n" if $printer == 1;}
|
|
1690
|
|
1691 my @compared_sequence=();
|
|
1692 my @substitutions_list;
|
|
1693 for my $i (0...scalar(@example)-1){
|
|
1694
|
|
1695 #print "i = $i\n" if $printer == 1;
|
|
1696 #print "comparing $seq_array[0][$i] and $seq_array[1][$i] \n" ;#if $printer == 1;
|
|
1697 if ($seq_array[0][$i] =~ /!/ && $seq_array[1][$i] !~ /!/){
|
|
1698
|
|
1699 my $resolution= resolve_base($seq_array[0][$i],$seq_array[1][$i], $pair1 ,"keep" );
|
|
1700 # print "ancestral = $resolution\n" if $printer == 1;
|
|
1701
|
|
1702 if ($resolution =~ /$seq_array[1][$i]/i && $resolution !~ /!/){
|
|
1703 push @substitutions_list, add_mutation($i, $pair0, $seq_array[0][$i], $resolution );
|
|
1704 }
|
|
1705 elsif ( $resolution !~ /!/){
|
|
1706 push @substitutions_list, add_mutation($i, $pair1, $seq_array[1][$i], $resolution);
|
|
1707 }
|
|
1708 push @compared_sequence,$resolution;
|
|
1709 }
|
|
1710 elsif ($seq_array[0][$i] !~ /!/ && $seq_array[1][$i] =~ /!/){
|
|
1711
|
|
1712 my $resolution= resolve_base($seq_array[1][$i],$seq_array[0][$i], $pair0, "invert" );
|
|
1713 # print "ancestral = $resolution\n" if $printer == 1;
|
|
1714
|
|
1715 if ($resolution =~ /$seq_array[0][$i]/i && $resolution !~ /!/){
|
|
1716 push @substitutions_list, add_mutation($i, $pair1, $seq_array[1][$i], $resolution);
|
|
1717 }
|
|
1718 elsif ( $resolution !~ /!/){
|
|
1719 push @substitutions_list, add_mutation($i, $pair0, $seq_array[0][$i], $resolution);
|
|
1720 }
|
|
1721 push @compared_sequence,$resolution;
|
|
1722 }
|
|
1723 elsif($seq_array[0][$i] =~ /!/ && $seq_array[1][$i] =~ /!/){
|
|
1724 push @compared_sequence, add_bases($seq_array[0][$i],$seq_array[1][$i], $pair0, $pair1, $joint );
|
|
1725 }
|
|
1726 else{
|
|
1727 if($seq_array[0][$i] !~ /^$seq_array[1][$i]$/i){
|
|
1728 push @compared_sequence, $pair0.":".$seq_array[0][$i]."!".$pair1.":".$seq_array[1][$i];
|
|
1729 }
|
|
1730 else{
|
|
1731 # print "perfect match\n" if $printer == 1;
|
|
1732 push @compared_sequence, $seq_array[0][$i];
|
|
1733 }
|
|
1734 }
|
|
1735 }
|
|
1736 print "returning: comared = @compared_sequence \nand substitutions list =\n", join("\n",@substitutions_list),"\n" if $printer == 1;
|
|
1737 return join(".",@compared_sequence), join(":", @substitutions_list) if scalar (@substitutions_list) > 0;
|
|
1738 return join(".",@compared_sequence), "" if scalar (@substitutions_list) == 0;
|
|
1739 }
|
|
1740
|
|
1741
|
|
1742 sub resolve_base{
|
|
1743 my $printer = 0;
|
|
1744
|
|
1745 print "IN SUBROUTUNE resolve_base.. information received = @_\n" if $printer == 1;
|
|
1746 my ($optional, $single, $singlesp, $arg) = @_;
|
|
1747 my @options=split(/!/,$optional);
|
|
1748 foreach my $option(@options) {
|
|
1749 $option=~s/[A-Z\(\) ,]+://g;
|
|
1750 if ($option =~ /$single/i){
|
|
1751 print "option = $option , returning single: $single\n" if $printer == 1;
|
|
1752 return $single;
|
|
1753 }
|
|
1754 }
|
|
1755 print "returning ",$optional."!".$singlesp.":".$single. "\n" if $arg eq "keep" && $printer == 1;
|
|
1756 print "returning ",$singlesp.":".$single."!".$optional. "\n" if $arg eq "invert" && $printer == 1;
|
|
1757 return $optional."!".$singlesp.":".$single if $arg eq "keep";
|
|
1758 return $singlesp.":".$single."!".$optional if $arg eq "invert";
|
|
1759
|
|
1760 }
|
|
1761
|
|
1762 sub same_length{
|
|
1763 my $printer = 0;
|
|
1764 my @locus = @_;
|
|
1765 my $temp = shift @locus;
|
|
1766 $temp=~s/-|,//g;
|
|
1767 foreach my $l (@locus){
|
|
1768 $l=~s/-|,//g;
|
|
1769 return 0 if length($l) != length($temp);
|
|
1770 $temp = $l;
|
|
1771 }
|
|
1772 return 1;
|
|
1773 }
|
|
1774 sub treeStudy{
|
|
1775 my $printer = 0;
|
|
1776 # print "template DEFINED.. received: @_\n" if defined %template;
|
|
1777 # print "only received = @_" if !defined %template;
|
|
1778 my $stopper = 0;
|
|
1779 if (!defined %template){
|
|
1780 $stopper = 1;
|
|
1781 %template=();
|
|
1782 print "tree decipherer = $tree_decipherer\n" if $printer == 1;
|
|
1783 my ( $template_ref, $keys_array)=load_allPossibleTrees($tree_decipherer, \%template);
|
|
1784 print "return = $template_ref and @{$keys_array}\n" if $printer == 1;
|
|
1785 foreach my $key (@$keys_array){
|
|
1786 print "addding : $template_ref->{$key} for $key\n" if $printer == 1;
|
|
1787 $template{$key} = $template_ref->{$key};
|
|
1788 }
|
|
1789 }
|
|
1790
|
|
1791 for my $templet ( keys %template ) {
|
|
1792 # print "$templet => @{$template{$templet}}\n";
|
|
1793 }
|
|
1794 <STDIN> if !defined %template;
|
|
1795
|
|
1796 my $strict = 0;
|
|
1797
|
|
1798 my $H = 0;
|
|
1799 my $Hchr = 1;
|
|
1800 my $Hstart = 2;
|
|
1801 my $Hend = 3;
|
|
1802 my $Hmotif = 4;
|
|
1803 my $Hmotiflen = 5;
|
|
1804 my $Hmicro = 6;
|
|
1805 my $Hstrand = 7;
|
|
1806 my $Hmicrolen = 8;
|
|
1807 my $Hinterpos = 9;
|
|
1808 my $Hrelativepos = 10;
|
|
1809 my $Hinter = 11;
|
|
1810 my $Hinterlen = 12;
|
|
1811
|
|
1812 my $C = 13;
|
|
1813 my $Cchr = 14;
|
|
1814 my $Cstart = 15;
|
|
1815 my $Cend = 16;
|
|
1816 my $Cmotif = 17;
|
|
1817 my $Cmotiflen = 18;
|
|
1818 my $Cmicro = 19;
|
|
1819 my $Cstrand = 20;
|
|
1820 my $Cmicrolen = 21;
|
|
1821 my $Cinterpos = 22;
|
|
1822 my $Crelativepos = 23;
|
|
1823 my $Cinter = 24;
|
|
1824 my $Cinterlen = 25;
|
|
1825
|
|
1826 my $O = 26;
|
|
1827 my $Ochr = 27;
|
|
1828 my $Ostart = 28;
|
|
1829 my $Oend = 29;
|
|
1830 my $Omotif = 30;
|
|
1831 my $Omotiflen = 31;
|
|
1832 my $Omicro = 32;
|
|
1833 my $Ostrand = 33;
|
|
1834 my $Omicrolen = 34;
|
|
1835 my $Ointerpos = 35;
|
|
1836 my $Orelativepos = 36;
|
|
1837 my $Ointer = 37;
|
|
1838 my $Ointerlen = 38;
|
|
1839
|
|
1840 my $R = 39;
|
|
1841 my $Rchr = 40;
|
|
1842 my $Rstart = 41;
|
|
1843 my $Rend = 42;
|
|
1844 my $Rmotif = 43;
|
|
1845 my $Rmotiflen = 44;
|
|
1846 my $Rmicro = 45;
|
|
1847 my $Rstrand = 46;
|
|
1848 my $Rmicrolen = 47;
|
|
1849 my $Rinterpos = 48;
|
|
1850 my $Rrelativepos = 49;
|
|
1851 my $Rinter = 50;
|
|
1852 my $Rinterlen = 51;
|
|
1853
|
|
1854 my $Mchr = 52;
|
|
1855 my $Mstart = 53;
|
|
1856 my $Mend = 54;
|
|
1857 my $M = 55;
|
|
1858 my $Mmotif = 56;
|
|
1859 my $Mmotiflen = 57;
|
|
1860 my $Mmicro = 58;
|
|
1861 my $Mstrand = 59;
|
|
1862 my $Mmicrolen = 60;
|
|
1863 my $Minterpos = 61;
|
|
1864 my $Mrelativepos = 62;
|
|
1865 my $Minter = 63;
|
|
1866 my $Minterlen = 64;
|
|
1867
|
|
1868 #-------------------------------------------------------------------------------#
|
|
1869 my @analysis=();
|
|
1870
|
|
1871
|
|
1872 my %speciesOrder = ();
|
|
1873 $speciesOrder{"H"} = 0;
|
|
1874 $speciesOrder{"C"} = 1;
|
|
1875 $speciesOrder{"O"} = 2;
|
|
1876 $speciesOrder{"R"} = 3;
|
|
1877 $speciesOrder{"M"} = 4;
|
|
1878 #-------------------------------------------------------------------------------#
|
|
1879
|
|
1880 my $line = $_[0];
|
|
1881 chomp $line;
|
|
1882
|
|
1883 my @f = split(/\t/,$line);
|
|
1884 print "received array : @f.. recieved tags = @tags\n" if $printer == 1;
|
|
1885
|
|
1886 # collect all motifs
|
|
1887 my @motifs=();
|
|
1888 @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif], $f[$Mmotif]) if $tags[$#tags] =~ /M/;
|
|
1889 @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif]) if $tags[$#tags] =~ /R/;
|
|
1890 @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif]) if $tags[$#tags] =~ /O/;
|
|
1891 # print "motifs in the array = $f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif]\n" if $tags[$#tags] =~ /R/;;
|
|
1892 print "motifs = @motifs\n" if $printer == 1;
|
|
1893 my @translation = ();
|
|
1894 foreach my $motif (@motifs){
|
|
1895 push(@translation, "_") if $motif eq "NA";
|
|
1896 push(@translation, "+") if $motif ne "NA";
|
|
1897 }
|
|
1898 my $translate = join(" ", @translation);
|
|
1899 # print "translate = >$translate< and analysis = $template{$translate}[0].. on the other hand, ",$template{"- - +"}[0],"\n";
|
|
1900 my @analyses = split(/\|/,$template{$translate}[0]);
|
|
1901
|
|
1902 print "motifs = @motifs, analyses = @analyses\n" if $printer == 1;
|
|
1903
|
|
1904 if (scalar(@analyses) == 1) {
|
|
1905 #print "analysis = $analyses[0]\n";
|
|
1906 if ($analyses[0] !~ /,|\./ ){
|
|
1907 if ($analyses[0] =~ /\+/){
|
|
1908 my $analysis = $analyses[0];
|
|
1909 $analysis =~ s/\+|\-//g;
|
|
1910 my @species = split(/\s*/,$analysis);
|
|
1911 my @currentMotifs = ();
|
|
1912 foreach my $specie (@species){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); print "pushing into currentMotifs: $speciesOrder{$specie}: $motifs[$speciesOrder{$specie}]\n" if $printer == 1;}
|
|
1913 print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1;
|
|
1914 $template{$translate}[1]++ if $strict == 1 && consistency(@currentMotifs) ne "NULL";
|
|
1915 $template{$translate}[1]++ if $strict == 0;
|
|
1916 print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1;
|
|
1917 }
|
|
1918 else{
|
|
1919 my $analysis = $analyses[0];
|
|
1920 $analysis =~ s/\+|\-//g;
|
|
1921 my @species = split(/\s*/,$analysis);
|
|
1922 my @currentMotifs = ();
|
|
1923 my @complementarySpecies = ();
|
|
1924 my $allSpecies = join("",@tags);
|
|
1925 foreach my $specie (@species){ $allSpecies =~ s/$specie//g; }
|
|
1926 foreach my $specie (split(/\s*/,$allSpecies)){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); print "pushing into currentMotifs: $speciesOrder{$specie}: $motifs[$speciesOrder{$specie}]\n" if $printer == 1;;}
|
|
1927 print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1;
|
|
1928 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL";
|
|
1929 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0;
|
|
1930 print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1;
|
|
1931 }
|
|
1932 }
|
|
1933
|
|
1934 elsif ($analyses[0] =~ /,/) {
|
|
1935 my @events = split(/,/,$analyses[0]);
|
|
1936 print "events = @events \n " if $printer == 1;
|
|
1937 if ($events[0] =~ /\+/){
|
|
1938 my $analysis1 = $events[0];
|
|
1939 $analysis1 =~ s/\+|\-//g;
|
|
1940 my $analysis2 = $events[1];
|
|
1941 $analysis2 =~ s/\+|\-//g;
|
|
1942 my @nSpecies = split(/\s*/,$analysis2);
|
|
1943 print "original anslysis = $analysis1 " if $printer == 1;
|
|
1944 foreach my $specie (@nSpecies){ $analysis1=~ s/$specie//g;}
|
|
1945 print "processed anslysis = $analysis1 \n" if $printer == 1;
|
|
1946 my @currentMotifs = ();
|
|
1947 foreach my $specie (split(/\s*/,$analysis1)){push(@currentMotifs, $motifs[$speciesOrder{$specie}]); }
|
|
1948 print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1;
|
|
1949 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL";
|
|
1950 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0;
|
|
1951 print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1;
|
|
1952 }
|
|
1953 else{
|
|
1954 my $analysis1 = $events[0];
|
|
1955 $analysis1 =~ s/\+|\-//g;
|
|
1956 my $analysis2 = $events[1];
|
|
1957 $analysis2 =~ s/\+|\-//g;
|
|
1958 my @pSpecies = split(/\s*/,$analysis2);
|
|
1959 my @currentMotifs = ();
|
|
1960 foreach my $specie (@pSpecies){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); }
|
|
1961 print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1;
|
|
1962 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL";
|
|
1963 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0;
|
|
1964 print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1;
|
|
1965
|
|
1966 }
|
|
1967
|
|
1968 }
|
|
1969 elsif ($analyses[0] =~ /\./) {
|
|
1970 my @events = split(/\./,$analyses[0]);
|
|
1971 foreach my $event (@events){
|
|
1972 print "event = $event \n" if $printer == 1;
|
|
1973 if ($event =~ /\+/){
|
|
1974 my $analysis = $event;
|
|
1975 $analysis =~ s/\+|\-//g;
|
|
1976 my @species = split(/\s*/,$analysis);
|
|
1977 my @currentMotifs = ();
|
|
1978 foreach my $specie (@species){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); }
|
|
1979 #print consistency(@currentMotifs),"<- \n";
|
|
1980 print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1;
|
|
1981 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL";
|
|
1982 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0;
|
|
1983 print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1;
|
|
1984 }
|
|
1985 else{
|
|
1986 my $analysis = $event;
|
|
1987 $analysis =~ s/\+|\-//g;
|
|
1988 my @species = split(/\s*/,$analysis);
|
|
1989 my @currentMotifs = ();
|
|
1990 my @complementarySpecies = ();
|
|
1991 my $allSpecies = join("",@tags);
|
|
1992 foreach my $specie (@species){ $allSpecies =~ s/$specie//g; }
|
|
1993 foreach my $specie (split(/\s*/,$allSpecies)){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); }
|
|
1994 #print consistency(@currentMotifs),"<- \n";
|
|
1995 print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1;
|
|
1996 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL";
|
|
1997 $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0;
|
|
1998 print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1;
|
|
1999 }
|
|
2000 }
|
|
2001
|
|
2002 }
|
|
2003 }
|
|
2004 else{
|
|
2005 my $finalanalysis = ();
|
|
2006 $template{$translate}[1]++;
|
|
2007 foreach my $analysis (@analyses){ ;}
|
|
2008 }
|
|
2009 # test if motifs where microsats are present, as indeed of same the motif composition
|
|
2010
|
|
2011
|
|
2012
|
|
2013 for my $templet ( keys %template ) {
|
|
2014 # print "now returning: $templet\n";
|
|
2015 if (@{ $template{$templet} }[1] > 0){
|
|
2016 print "returning in the end: $templet and $translate\n" if $printer == 1;
|
|
2017 $template{$templet}[1] = 0;
|
|
2018 return (@{$template{$templet}}[0], $translate);
|
|
2019 }
|
|
2020 }
|
|
2021 undef %template;
|
|
2022 print "sending NULL\n" if $printer == 1;
|
|
2023 return ("NULL", $translate);
|
|
2024
|
|
2025 }
|
|
2026
|
|
2027
|
|
2028 sub consistency{
|
|
2029 my @motifs = @_;
|
|
2030 print "in consistency \n" if $printer == 1;
|
|
2031 print "motifs sent = >",join("|",@motifs),"< \n" if $printer == 1;
|
|
2032 return $motifs[0] if scalar(@motifs) == 1;
|
|
2033 my $prevmotif = shift(@motifs);
|
|
2034 my $stopper = 0;
|
|
2035 for my $i (0 ... $#motifs){
|
|
2036 next if $motifs[$i] eq "NA";
|
|
2037 my $templet = $motifs[$i].$motifs[$i];
|
|
2038 if ($templet !~ /$prevmotif/i){
|
|
2039 $stopper = 1; last;
|
|
2040 }
|
|
2041 }
|
|
2042 return $prevmotif if $stopper == 0;
|
|
2043 return "NULL" if $stopper == 1;
|
|
2044 }
|
|
2045 sub summarize_microsat{
|
|
2046 my $printer = 0;
|
|
2047 my $line = $_[0];
|
|
2048 my $humseq = $_[1];
|
|
2049
|
|
2050 my @gaps = $line =~ /[0-9]+\t[0-9]+\t[\+\-]/g;
|
|
2051 my @starts = $line =~ /[0-9]+\t[\+\-]/g;
|
|
2052 my @ends = $line =~ /[\+\-]\t[0-9]+/g;
|
|
2053 print "starts = @starts\tends = @ends\n" if $printer == 1;
|
|
2054 for my $i (0 ... $#gaps) {$gaps[$i] =~ s/\t[0-9]+\t[\+\-]//g;}
|
|
2055 for my $i (0 ... $#starts) {$starts[$i] =~ s/\t[\+\-]//g;}
|
|
2056 for my $i (0 ... $#ends) {$ends[$i] =~ s/[\+\-]\t//g;}
|
|
2057
|
|
2058 my $minstart = array_smallest_number(@starts);
|
|
2059 my $maxend = array_largest_number(@ends);
|
|
2060
|
|
2061 my $humupstream_st = substr($humseq, 0, $minstart);
|
|
2062 my $humupstream_en = substr($humseq, 0, $maxend);
|
|
2063 my $no_of_gaps_to_start = 0;
|
|
2064 my $no_of_gaps_to_end = 0;
|
|
2065 $no_of_gaps_to_start = ($humupstream_st =~ s/\-/x/g) if $humupstream_st=~/\-/;
|
|
2066 $no_of_gaps_to_end = ($humupstream_en =~ s/\-/x/g) if $humupstream_en=~/\-/;
|
|
2067
|
|
2068 my $locusmotif = ();
|
|
2069
|
|
2070
|
|
2071 print "IN SUB SUMMARIZE_MICROSAT $line\n" if $printer == 1;
|
|
2072 #return "NULL" if $line =~ /compound/;
|
|
2073 my $Hstart = "NA";
|
|
2074 my $Hend = "NA";
|
|
2075 chomp $line;
|
|
2076 my $match_count = ($line =~ s/>/>/g);
|
|
2077 #print "number of species = $match_count\n";
|
|
2078 my @micros = split(/>/,$line);
|
|
2079 shift @micros;
|
|
2080 my $stopper = 0;
|
|
2081
|
|
2082
|
|
2083 foreach my $mic (@micros){
|
|
2084 my @local = split(/\t/,$mic);
|
|
2085 if ($local[$microsatcord] =~ /N/) {$stopper =1; last;}
|
|
2086 }
|
|
2087 return "NULL" if $stopper ==1;
|
|
2088
|
|
2089 #------------------------------------------------------
|
|
2090
|
|
2091 my @arranged = ();
|
|
2092 for my $arr (0 ... $#exacttags) {$arranged[$arr] = '0';}
|
|
2093
|
|
2094 foreach my $micro (@micros){
|
|
2095 for my $i (0 ... $#exacttags){
|
|
2096 if ($micro =~ /^$exacttags[$i]/){
|
|
2097 $arranged[$i] = $micro;
|
|
2098 last;
|
|
2099 }
|
|
2100 }
|
|
2101 }
|
|
2102 # print "arranged = @arranged \n" ; <STDIN>;;
|
|
2103
|
|
2104 my @endstatement = ();
|
|
2105 my $turn = 0;
|
|
2106 my $species_counter = 0;
|
|
2107 # print scalar(@arranged),"\n";
|
|
2108
|
|
2109 my $species_no=0;
|
|
2110
|
|
2111 my $orthHchr = 0;
|
|
2112
|
|
2113 foreach my $micro (@arranged) {
|
|
2114 $micro =~ s/\t\t/\t \t/g;
|
|
2115 $micro =~ s/\t,/\t ,/g;
|
|
2116 $micro =~ s/,\t/, \t/g;
|
|
2117 print "------------------------------------------------------------------------------------------\n" if $printer == 1;
|
|
2118 chomp $micro;
|
|
2119 if ($micro eq '0'){
|
|
2120 push(@endstatement, join("\t",$exacttags[$species_counter],"NA","NA","NA","NA",0 ,"NA", "NA", 0,"NA","NA","NA", "NA" ));
|
|
2121 $species_counter++;
|
|
2122 print join("|","ENDSTATEMENT:",@endstatement),"\n" if $printer == 1;
|
|
2123 next;
|
|
2124 }
|
|
2125 # print $micro,"\n";
|
|
2126 print "micro = $micro \n" if $printer == 1;
|
|
2127 my @fields = split(/\t/,$micro);
|
|
2128 my $microcopy = $fields[$microsatcord];
|
|
2129 $microcopy =~ s/\[|\]|-//g;
|
|
2130 my $microsatlength = length($microcopy);
|
|
2131 print "microsat = $fields[$microsatcord] and microsatlength = $microsatlength\n" if $printer == 1;
|
|
2132 # print "sp_ident = @sp_ident.. species_no=$species_no\n";
|
|
2133 $micro =~ /$sp_ident[$species_no]\s(\S+)\s([0-9]+)\s([0-9]+)/;
|
|
2134
|
|
2135
|
|
2136 my $sp_chr=$1;
|
|
2137 my $sp_start=$2 + $fields[$startcord] - $fields[$gapcord];
|
|
2138 my $sp_end= $sp_start + $microsatlength - 1;
|
|
2139
|
|
2140 $species_no++;
|
|
2141
|
|
2142 $micro =~ /$focalspec\s(\S+)\s([0-9]+)\s([0-9]+)/;
|
|
2143 $orthHchr=$1;
|
|
2144 $Hstart=$2+$minstart-$no_of_gaps_to_start;
|
|
2145 $Hend=$2+$maxend-$no_of_gaps_to_end;
|
|
2146
|
|
2147 print "Hstart = $Hstart = $fields[4] + $fields[$startcord] - $fields[$gapcord]\n" if $printer == 1;
|
|
2148
|
|
2149 my $motif = $fields[$motifcord];
|
|
2150 my $firstmotif = ();
|
|
2151 my $strand = $fields[$strandcord];
|
|
2152 # print "strand = $strand\n";
|
|
2153
|
|
2154
|
|
2155 if ($motif =~ /^\[/){
|
|
2156 $motif =~ s/^\[//g;
|
|
2157 $motif =~ /([a-zA-Z]+)\].*/;
|
|
2158 $firstmotif = $1;
|
|
2159 }
|
|
2160
|
|
2161 else {$firstmotif = $motif;}
|
|
2162 print "firstmotif =$firstmotif : \n" if $printer == 1;
|
|
2163 $firstmotif = allCaps($firstmotif);
|
|
2164
|
|
2165 if (exists $revHash{$firstmotif} && $turn == 0) {
|
|
2166 $turn=1 if $species_counter==0;
|
|
2167 $firstmotif = $revHash{$firstmotif};
|
|
2168 }
|
|
2169
|
|
2170 elsif (exists $revHash{$firstmotif} && $turn == 1) {$firstmotif = $revHash{$firstmotif}; $turn = 1;}
|
|
2171 print "changed firstmotif =$firstmotif\n" if $printer == 1;
|
|
2172 # <STDIN>;
|
|
2173 $locusmotif = $firstmotif;
|
|
2174
|
|
2175 if (scalar(@fields) > $microsatcord + 2){
|
|
2176 print "fields = @fields ... interr_poscord=$interr_poscord=$fields[$interr_poscord] .. interrcord=$interrcord=$fields[$interrcord]\n" if $printer == 1;
|
|
2177
|
|
2178 my @interposes = ();
|
|
2179 @interposes = split(",",$fields[$interr_poscord]) if $fields[$interr_poscord] =~ /,/;
|
|
2180 $interposes[0] = $fields[$interr_poscord] if $fields[$interr_poscord] !~ /,/ ;
|
|
2181 print "interposes=@interposes\n" if $printer == 1;
|
|
2182 my @relativeposes = ();
|
|
2183 my @interruptions = ();
|
|
2184 @interruptions = split(",",$fields[$interrcord]) if $fields[$interrcord] =~ /,/;
|
|
2185 $interruptions[0] = $fields[$interrcord] if $fields[$interrcord] !~ /,/;
|
|
2186 my @interlens = ();
|
|
2187
|
|
2188
|
|
2189 for my $i (0 ... $#interposes){
|
|
2190
|
|
2191 my $interpos = $interposes[$i];
|
|
2192 my $nexter = 0;
|
|
2193 my $interruption = $interruptions[$i];
|
|
2194 my $interlen = length($interruption);
|
|
2195 push (@interlens, $interlen);
|
|
2196
|
|
2197
|
|
2198 my $relativepos = (100 * $interpos) / $microsatlength;
|
|
2199 print "relativepos = $relativepos ,interpos=$interpos, interruption=$interruption, interlen=$interlen \n" if $printer == 1;
|
|
2200 $relativepos = (100 * ($interpos-$interlen)) / $microsatlength if $relativepos > 50;
|
|
2201 print "--> = $relativepos\n" if $printer == 1;
|
|
2202 $interruption = "IND" if length($interruption) < 1;
|
|
2203
|
|
2204 if ($turn == 1){
|
|
2205 $fields[$microsatcord] = switch_micro($fields[$microsatcord]);
|
|
2206 $interruption = switch_nucl($interruption) unless $interruption eq "IND";
|
|
2207 $interpos = ($microsatlength - $interpos) - $interlen + 2;
|
|
2208 print "turn interpos = $interpos for $fields[$microsatcord]\n" if $printer == 1;
|
|
2209 $relativepos = (100 * $interpos) / $microsatlength;
|
|
2210 $relativepos = (100 * ($interpos-$interlen)) / $microsatlength if $relativepos > 50;
|
|
2211
|
|
2212
|
|
2213 $strand = '+' if $strand eq '-';
|
|
2214 $strand = '-' if $strand eq '+';
|
|
2215 }
|
|
2216 print "final relativepos = $relativepos\n" if $printer == 1;
|
|
2217 push(@relativeposes, $relativepos);
|
|
2218 }
|
|
2219 push(@endstatement,join("\t",($exacttags[$species_counter],$sp_chr, $sp_start, $sp_end, $firstmotif,length($firstmotif),$fields[$microsatcord],$strand,$microsatlength,join(",",@interposes),join(",",@relativeposes),join(",",@interruptions), join(",",@interlens))));
|
|
2220 }
|
|
2221
|
|
2222 else{
|
|
2223 push(@endstatement, join("\t",$exacttags[$species_counter],$sp_chr, $sp_start, $sp_end, $firstmotif,length($firstmotif),$fields[$microsatcord],$strand,$microsatlength,"NA","NA","NA", "NA"));
|
|
2224 }
|
|
2225
|
|
2226 $species_counter++;
|
|
2227 }
|
|
2228
|
|
2229 $locusmotif = $sameHash{$locusmotif} if exists $sameHash{$locusmotif};
|
|
2230 $locusmotif = $revHash{$locusmotif} if exists $revHash{$locusmotif};
|
|
2231
|
|
2232 my $endst = join("\t", @endstatement, $orthHchr, $Hstart, $Hend);
|
|
2233 print join("\t", @endstatement, $orthHchr, $Hstart, $Hend), "\n" if $printer == 1;
|
|
2234
|
|
2235
|
|
2236 return (join("\t", @endstatement, $orthHchr, $Hstart, $Hend), $orthHchr, $Hstart, $Hend, $locusmotif, length($locusmotif));
|
|
2237
|
|
2238 }
|
|
2239
|
|
2240 sub switch_nucl{
|
|
2241 my @strand = split(/\s*/,$_[0]);
|
|
2242 for my $i (0 ... $#strand){
|
|
2243 if ($strand[$i] =~ /c/i) {$strand[$i] = "G";next;}
|
|
2244 if ($strand[$i] =~ /a/i) {$strand[$i] = "T";next;}
|
|
2245 if ($strand[$i] =~ /t/i) { $strand[$i] = "A";next;}
|
|
2246 if ($strand[$i] =~ /g/i) {$strand[$i] = "C";next;}
|
|
2247 }
|
|
2248 return join("",@strand);
|
|
2249 }
|
|
2250
|
|
2251
|
|
2252 sub switch_micro{
|
|
2253 my $micro = reverse($_[0]);
|
|
2254 my @strand = split(/\s*/,$micro);
|
|
2255 for my $i (0 ... $#strand){
|
|
2256 if ($strand[$i] =~ /c/i) {$strand[$i] = "G";next;}
|
|
2257 if ($strand[$i] =~ /a/i) {$strand[$i] = "T";next;}
|
|
2258 if ($strand[$i] =~ /t/i) { $strand[$i] = "A";next;}
|
|
2259 if ($strand[$i] =~ /g/i) {$strand[$i] = "C";next;}
|
|
2260 if ($strand[$i] =~ /\[/i) {$strand[$i] = "]";next;}
|
|
2261 if ($strand[$i] =~ /\]/i) {$strand[$i] = "[";next;}
|
|
2262 }
|
|
2263 return join("",@strand);
|
|
2264 }
|
|
2265 sub decipher_history{
|
|
2266 my $printer = 0;
|
|
2267 my ($mutations_array, $tags_string, $nodes, $branches_hash, $tree_analysis, $confirmation_string, $alivehash) = @_;
|
|
2268 my %mutations_hash=();
|
|
2269 foreach my $mutation (@$mutations_array){
|
|
2270 print "mutation = $mutation\n" if $printer == 1;
|
|
2271 my %local = $mutation =~ /([\S ]+)=([\S ]+)/g;
|
|
2272 push @{$mutations_hash{$local{"node"}}},$mutation;
|
|
2273 print "just for confirmation: $local{node} pushed as: $mutation\n" if $printer == 1;
|
|
2274 }
|
|
2275 my @nodes;
|
|
2276 my @birth_steps=();
|
|
2277 my @death_steps=();
|
|
2278
|
|
2279 my @tags=split(/\s*/,$tags_string);
|
|
2280 my @confirmation=split(/\s+/,$confirmation_string);
|
|
2281 my %info=();
|
|
2282
|
|
2283 for my $i (0 ... $#tags){
|
|
2284 $info{$tags[$i]}=$confirmation[$i];
|
|
2285 print "feeding info: $tags[$i] = $info{$tags[$i]}\n" if $printer == 1;
|
|
2286 }
|
|
2287
|
|
2288 for my $keys (@$nodes) {
|
|
2289 foreach my $key (@$keys){
|
|
2290 # print "current key = $key\n";
|
|
2291 my $copykey = $key;
|
|
2292 $copykey =~ s/[\W ]+//g;
|
|
2293 my @copykeys=split(/\s*/,$copykey);
|
|
2294 my $states=();
|
|
2295 foreach my $copy (@copykeys){
|
|
2296 $states=$states.$info{$copy};
|
|
2297 }
|
|
2298
|
|
2299 print "reduced key = $copykey and state = $states\n" if $printer == 1;
|
|
2300
|
|
2301 if (exists $mutations_hash{$key}) {
|
|
2302
|
|
2303 if ($states=~/\+/){
|
|
2304 push @birth_steps, @{$mutations_hash{$key}};
|
|
2305 $birth_steps[$#birth_steps] =~ s/\S+=//g;
|
|
2306 delete $mutations_hash{$key};
|
|
2307 }
|
|
2308 else{
|
|
2309 push @death_steps, @{$mutations_hash{$key}};
|
|
2310 $death_steps[$#death_steps] =~ s/\S+=//g;
|
|
2311 delete $mutations_hash{$key};
|
|
2312 }
|
|
2313 }
|
|
2314 }
|
|
2315 }
|
|
2316 print "conformation = $confirmation_string\n" if $printer == 1;
|
|
2317 push (@birth_steps, "NULL") if scalar(@birth_steps) == 0;
|
|
2318 push (@death_steps, "NULL") if scalar(@death_steps) == 0;
|
|
2319 print "birth steps = ",join("\n",@birth_steps)," and death steps = ",join("\n",@death_steps),"\n" if $printer == 1;
|
|
2320 return \@birth_steps, \@death_steps;
|
|
2321 }
|
|
2322
|
|
2323 sub fillAlignmentGaps{
|
|
2324 my $printer = 0;
|
|
2325 print "received: @_\n" if $printer == 1;
|
|
2326 my ($tree, $sequences, $alignment, $tagarray, $microsathash, $nonmicrosathash, $motif, $tree_analysis, $threshold, $microsatstarts) = @_;
|
|
2327 print "in fillAlignmentGaps.. tree = $tree \n" if $printer == 1;
|
|
2328 my %sequence_hash=();
|
|
2329
|
|
2330 my @phases = ();
|
|
2331 my $concat = $motif.$motif;
|
|
2332 my $motifsize = length($motif);
|
|
2333
|
|
2334 for my $i (1 ... $motifsize){
|
|
2335 push @phases, substr($concat, $i, $motifsize);
|
|
2336 }
|
|
2337
|
|
2338 my $concatalignment = ();
|
|
2339 foreach my $tag (@tags){
|
|
2340 $concatalignment = $concatalignment.$alignment->{$tag};
|
|
2341 }
|
|
2342 # print "returningg NULL","NULL","NULL", "NULL\n" if $concatalignment !~ /-/;
|
|
2343 return 0, "NULL","NULL","NULL", "NULL","NULL" if $concatalignment !~ /-/;
|
|
2344
|
|
2345
|
|
2346
|
|
2347 my %node_sequences_temp=();
|
|
2348 my %node_alignments_temp =(); #NEW, Nov 28 2008
|
|
2349
|
|
2350 my @tags=();
|
|
2351 my @locus_sequences=();
|
|
2352 my %alivehash=();
|
|
2353
|
|
2354 # print "IN fillAlignmentGaps\n";# <STDIN>;
|
|
2355 my %fillrecord = ();
|
|
2356
|
|
2357 my $change = 0;
|
|
2358 foreach my $tag (@$tagarray) {
|
|
2359 #print "adding: $tag\n";
|
|
2360 push(@tags, $tag);
|
|
2361 if (exists $microsathash->{$tag}){
|
|
2362 my $micro = $microsathash->{$tag};
|
|
2363 my $orig_micro = $micro;
|
|
2364 ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases);
|
|
2365 $change = 1 if uc($micro) ne uc($orig_micro);
|
|
2366 $node_sequences_temp{$tag}=$micro if $microsathash->{$tag} ne "NULL";
|
|
2367 }
|
|
2368 if (exists $nonmicrosathash->{$tag}){
|
|
2369 my $micro = $nonmicrosathash->{$tag};
|
|
2370 my $orig_micro = $micro;
|
|
2371 ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases);
|
|
2372 $change = 1 if uc($micro) ne uc($orig_micro);
|
|
2373 $node_sequences_temp{$tag}=$micro if $nonmicrosathash->{$tag} ne "NULL";
|
|
2374 }
|
|
2375
|
|
2376 if (exists $alignment->{$tag}){
|
|
2377 my $micro = $alignment->{$tag};
|
|
2378 my $orig_micro = $micro;
|
|
2379 ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases);
|
|
2380 $change = 1 if uc($micro) ne uc($orig_micro);
|
|
2381 $node_alignments_temp{$tag}=$micro if $alignment->{$tag} ne "NULL";
|
|
2382 }
|
|
2383
|
|
2384 #print "adding to node_sequences: $tag = ",$node_sequences_temp{$tag},"\n" if $printer == 1;
|
|
2385 #print "adding to node_alignments: $tag = ",$node_alignments_temp{$tag},"\n" if $printer == 1;
|
|
2386 }
|
|
2387
|
|
2388
|
|
2389 my %node_sequences=();
|
|
2390 my %node_alignments =(); #NEW, Nov 28 2008
|
|
2391 foreach my $tag (@$tagarray) {
|
|
2392 $node_sequences{$tag} = join ".",split(/\s*/,$node_sequences_temp{$tag});
|
|
2393 $node_alignments{$tag} = join ".",split(/\s*/,$node_alignments_temp{$tag});
|
|
2394 }
|
|
2395
|
|
2396 print "\n", "#" x 50, "\n" if $printer == 1;
|
|
2397 foreach my $tag (@tags){
|
|
2398 print "$tag: $alignment->{$tag} = $node_alignments{$tag}\n" if $printer == 1;
|
|
2399 }
|
|
2400 print "\n", "#" x 50, "\n" if $printer == 1;
|
|
2401 # print "change = $change\n";
|
|
2402 #<STDIN> if $concatalignment=~/\-/;
|
|
2403
|
|
2404 # <STDIN> if $printer == 1 && $concatalignment =~ /\-/;
|
|
2405
|
|
2406 return 0, "NULL","NULL","NULL", "NULL", "NULL" if $change == 0;
|
|
2407
|
|
2408 my ($nodes_arr, $branches_hash) = get_nodes($tree);
|
|
2409 my @nodes=@$nodes_arr;
|
|
2410 print "recieved nodes = @nodes\n" if $printer == 1;
|
|
2411
|
|
2412
|
|
2413 #POPULATE branches_hash WITH INFORMATION ABOUT LIVESTATUS
|
|
2414 foreach my $keys (@nodes){
|
|
2415 my @pair = @$keys;
|
|
2416 my $joint = "(".join(", ",@pair).")";
|
|
2417 my $copykey = join "", @pair;
|
|
2418 $copykey =~ s/[\W ]+//g;
|
|
2419 print "for node: $keys, copykey = $copykey and joint = $joint\n" if $printer == 1;
|
|
2420 my $livestatus = 1;
|
|
2421 foreach my $copy (split(/\s*/,$copykey)){
|
|
2422 $livestatus = 0 if !exists $alivehash{$copy};
|
|
2423 }
|
|
2424 $alivehash{$joint} = $joint if !exists $alivehash{$joint} && $livestatus == 1;
|
|
2425 print "alivehash = $alivehash{$joint}\n" if exists $alivehash{$joint} && $printer == 1;
|
|
2426 }
|
|
2427
|
|
2428
|
|
2429
|
|
2430 @nodes = reverse(@nodes); #1 THIS IS IN ORDER TO GO THROUGH THE TREE FROM LEAVES TO ROOT.
|
|
2431
|
|
2432 my @mutations_array=();
|
|
2433
|
|
2434 my $joint = ();
|
|
2435 foreach my $node (@nodes){
|
|
2436 my @pair = @$node;
|
|
2437 print "now in the nodes for loop, pair = @pair\n and sequences=\n" if $printer == 1;
|
|
2438 $joint = "(".join(", ",@pair).")";
|
|
2439 print "joint = $joint \n" if $printer == 1;
|
|
2440 my @pair_sequences=();
|
|
2441
|
|
2442 foreach my $tag (@pair){
|
|
2443 print "tag = $tag: " if $printer == 1;
|
|
2444 print $node_alignments{$tag},"\n" if $printer == 1;
|
|
2445 push @pair_sequences, $node_alignments{$tag};
|
|
2446 }
|
|
2447 # print "fillgap\n";
|
|
2448 my ($compared, $substitutions_list) = base_by_base_simple($motif,\@pair_sequences, scalar(@pair_sequences), @pair, $joint);
|
|
2449 $node_alignments{$joint}=$compared;
|
|
2450 push( @mutations_array,split(/:/,$substitutions_list));
|
|
2451 print "newly added to node_sequences: $node_alignments{$joint} and list of mutations = @mutations_array\n" if $printer == 1;
|
|
2452 }
|
|
2453 print "now sending for analyze_mutations: mutation_array=@mutations_array, nodes=@nodes, branches_hash=$branches_hash, alignment=$alignment, tags=@tags, alivehash=%alivehash, node_sequences=\%node_sequences, microsatstarts=$microsatstarts, motif=$motif\n" if $printer == 1;
|
|
2454 # <STDIN> if $printer == 1;
|
|
2455
|
|
2456 my $analayzed_mutations = analyze_mutations(\@mutations_array, \@nodes, $branches_hash, $alignment, \@tags, \%alivehash, \%node_sequences, $microsatstarts, $motif);
|
|
2457
|
|
2458 # print "returningt: ", $analayzed_mutations, \@nodes,"\n" if scalar @mutations_array > 0;;
|
|
2459 # print "returningy: NULL, NULL, NULL " if scalar @mutations_array == 0 && $printer == 1;
|
|
2460 print "final node alignment after filling for $joint= " if $printer == 1;
|
|
2461 print "$node_alignments{$joint}\n" if $printer == 1;
|
|
2462
|
|
2463
|
|
2464 return 1, $analayzed_mutations, \@nodes, $branches_hash, \%alivehash, $node_alignments{$joint} if scalar @mutations_array > 0 ;
|
|
2465 return 1, "NULL","NULL","NULL", "NULL", "NULL" if scalar @mutations_array == 0;
|
|
2466 }
|
|
2467
|
|
2468
|
|
2469
|
|
2470 sub add_mutation{
|
|
2471 my $printer = 0;
|
|
2472 print "IN SUBROUTUNE add_mutation.. information received = @_\n" if $printer == 1;
|
|
2473 my ($i , $bite, $to, $from) = @_;
|
|
2474 print "bite = $bite.. all received info = ",join("^", @_),"\n" if $printer == 1;
|
|
2475 print "to=$to\n" if $printer == 1;
|
|
2476 print "tis split = ",join(" and ",split(/!/,$to)),"\n" if $printer == 1;
|
|
2477 my @toields = split "!",$to;
|
|
2478 print "toilds = @toields\n" if $printer == 1;
|
|
2479 my @mutations=();
|
|
2480
|
|
2481 foreach my $toield (@toields){
|
|
2482 my @toinfo=split(":",$toield);
|
|
2483 print " at toinfo=@toinfo \n" if $printer == 1;
|
|
2484 next if $toinfo[1] =~ /$from/i;
|
|
2485 my @mutation = @toinfo if $toinfo[1] !~ /$from/i;
|
|
2486 print "adding to mutaton list: ", join(",", "node=$mutation[0]","type=substitution" ,"position=$i", "from=$from", "to=$mutation[1]", "insertion=", "deletion="),"\n" if $printer == 1;
|
|
2487 push (@mutations, join("\t", "node=$mutation[0]","type=substitution" ,"position=$i", "from=$from", "to=$mutation[1]", "insertion=", "deletion="));
|
|
2488 }
|
|
2489 return @mutations;
|
|
2490 }
|
|
2491
|
|
2492
|
|
2493 sub add_bases{
|
|
2494
|
|
2495 my $printer = 0;
|
|
2496 print "IN SUBROUTUNE add_bases.. information received = @_\n" if $printer == 1;
|
|
2497 my ($optional0, $optional1, $pair0, $pair1,$joint) = @_;
|
|
2498 my $total_list=();
|
|
2499
|
|
2500 my @total_list0=split(/!/,$optional0);
|
|
2501 my @total_list1=split(/!/,$optional1);
|
|
2502 my @all_list=();
|
|
2503 my %total_hash0=();
|
|
2504 foreach my $entry (@total_list0) {
|
|
2505 $entry = uc $entry;
|
|
2506 $entry =~ /(\S+):(\S+)/;
|
|
2507 $total_hash0{$2}=$1;
|
|
2508 push @all_list, $2;
|
|
2509 }
|
|
2510
|
|
2511 my %total_hash1=();
|
|
2512 foreach my $entry (@total_list1) {
|
|
2513 $entry = uc $entry;
|
|
2514 $entry =~ /(\S+):(\S+)/;
|
|
2515 $total_hash1{$2}=$1;
|
|
2516 push @all_list, $2;
|
|
2517 }
|
|
2518
|
|
2519 my %alphabetical_hash=();
|
|
2520 my @return_options=();
|
|
2521
|
|
2522 for my $i (0 ... $#all_list){
|
|
2523 my $alph = $all_list[$i];
|
|
2524 if (exists $total_hash0{$alph} && exists $total_hash1{$alph}){
|
|
2525 push(@return_options, $joint.":".$alph);
|
|
2526 delete $total_hash0{$alph}; delete $total_hash1{$alph};
|
|
2527 }
|
|
2528 if (exists $total_hash0{$alph} && !exists $total_hash1{$alph}){
|
|
2529 push(@return_options, $pair0.":".$alph);
|
|
2530 delete $total_hash0{$alph};
|
|
2531 }
|
|
2532 if (!exists $total_hash0{$alph} && exists $total_hash1{$alph}){
|
|
2533 push(@return_options, $pair1.":".$alph);
|
|
2534 delete $total_hash1{$alph};
|
|
2535 }
|
|
2536
|
|
2537 }
|
|
2538
|
|
2539 print "returning ",join "!",@return_options,"\n" if $printer == 1;
|
|
2540 return join "!",@return_options;
|
|
2541
|
|
2542 }
|
|
2543
|
|
2544
|
|
2545 sub fillgaps{
|
|
2546 # print "IN fillgaps: @_\n";
|
|
2547 my ($micro, $phasesinput) = @_;
|
|
2548 #print "in microsathash ,,.. micro = $micro\n";
|
|
2549 return $micro if $micro !~ /\-/;
|
|
2550 my $orig_micro = $micro;
|
|
2551 my @phases = @$phasesinput;
|
|
2552
|
|
2553 my %tested_patterns = ();
|
|
2554
|
|
2555 foreach my $phase (@phases){
|
|
2556 # print "considering phase: $phase\n";
|
|
2557 my @phase_prefixes = ();
|
|
2558 my @prephase_left_contexts = ();
|
|
2559 my @prephase_right_contexts = ();
|
|
2560 my @pregapsize = ();
|
|
2561 my @prepostfilins = ();
|
|
2562
|
|
2563 my @phase_suffixes;
|
|
2564 my @suffphase_left_contexts;
|
|
2565 my @suffphase_right_contexts;
|
|
2566 my @suffgapsize;
|
|
2567 my @suffpostfilins;
|
|
2568
|
|
2569 my @postfilins = ();
|
|
2570 my $motifsize = length($phases[0]);
|
|
2571
|
|
2572 my $change = 0;
|
|
2573
|
|
2574 for my $u (0 ... $motifsize-1){
|
|
2575 my $concat = $phase.$phase.$phase.$phase;
|
|
2576 my @concatarr = split(/\s*/, $concat);
|
|
2577 my $l = 0;
|
|
2578 while ($l < $u){
|
|
2579 shift @concatarr;
|
|
2580 $l++;
|
|
2581 }
|
|
2582 $concat = join ("", @concatarr);
|
|
2583
|
|
2584 for my $t (0 ... $motifsize-1){
|
|
2585 for my $k (1 ... $motifsize-1){
|
|
2586 push @phase_prefixes, substr($concat, $motifsize+$t, $k);
|
|
2587 push @prephase_left_contexts, substr ($concat, $t, $motifsize);
|
|
2588 push @prephase_right_contexts, substr ($concat, $motifsize+$t+$k+($motifsize-$k), 1);
|
|
2589 push @pregapsize, $k;
|
|
2590 push @prepostfilins, substr($concat, $motifsize+$t+$k, ($motifsize-$k));
|
|
2591 # print "reading: $concat, t=$t, k=$k prefix: $prephase_left_contexts[$#prephase_left_contexts] $phase_prefixes[$#phase_prefixes] -x$pregapsize[$#pregapsize] $prephase_right_contexts[$#prephase_right_contexts]\n";
|
|
2592 # print "phase_prefixes = $phase_prefixes[$#phase_prefixes]\n";
|
|
2593 # print "prephase_left_contexts = $prephase_left_contexts[$#prephase_left_contexts]\n";
|
|
2594 # print "prephase_right_contexts = $prephase_right_contexts[$#prephase_right_contexts]\n";
|
|
2595 # print "pregapsize = $pregapsize[$#pregapsize]\n";
|
|
2596 # print "prepostfilins = $prepostfilins[$#prepostfilins]\n";
|
|
2597 }
|
|
2598 }
|
|
2599 }
|
|
2600
|
|
2601 # print "looking if $micro =~ /($phase\-{$motifsize})/i || $micro =~ /^(\-{$motifsize,}$phase)/i\n";
|
|
2602 if ($micro =~ /($phase\-{$motifsize,})$/i || $micro =~ /^(\-{$motifsize,}$phase)/i){
|
|
2603 # print "micro: $micro needs further gap removal: $1\n";
|
|
2604 while ($micro =~ /$phase(\-{$motifsize,})$/i || $micro =~ /^(\-{$motifsize,})$phase/i){
|
|
2605 # print "micro: $micro needs further gap removal: $1\n";
|
|
2606
|
|
2607 # print "phase being considered = $phase\n";
|
|
2608 my $num = ();
|
|
2609 $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi if $micro =~ /$phase\-{$motifsize,}/i;
|
|
2610 $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi if $micro =~ /\-{$motifsize,}$phase/i;
|
|
2611 # print "num = $num\n";
|
|
2612 $change = 1 if $num == 1;
|
|
2613 }
|
|
2614 }
|
|
2615
|
|
2616 elsif ($micro =~ /(($phase)+)\-{$motifsize,}(($phase)+)/i){
|
|
2617 while ($micro =~ /(($phase)+)\-{$motifsize,}(($phase)+)/i){
|
|
2618 # print "checking lengths of $1 and $3 for $micro... \n";
|
|
2619 my $num = ();
|
|
2620 if (length($1) >= length($3)){
|
|
2621 # print "$micro matches (($phase)+)\-{$motifsize,}(($phase)+) = $1, >= , $3 \n";
|
|
2622 $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi ;
|
|
2623 }
|
|
2624 if (length($1) < length($3)){
|
|
2625 # print "$micro matches (($phase)+)\-{$motifsize,}(($phase)+) = $1, < , $3 \n";
|
|
2626 $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi ;
|
|
2627 }
|
|
2628 # print "micro changed to $micro\n";
|
|
2629 }
|
|
2630 }
|
|
2631 elsif ($micro =~ /([A-Z]+)\-{$motifsize,}(($phase)+)/i){
|
|
2632 while ($micro =~ /([A-Z]+)\-{$motifsize,}(($phase)+)/i){
|
|
2633 # print "$micro matches ([A-Z]+)\-{$motifsize}(($phase)+) = 1=$1, - , 3=$3 \n";
|
|
2634 my $num = 0;
|
|
2635 $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi ;
|
|
2636 }
|
|
2637 }
|
|
2638 elsif ($micro =~ /(($phase)+)\-{$motifsize,}([A-Z]+)/i){
|
|
2639 while ($micro =~ /(($phase)+)\-{$motifsize,}([A-Z]+)/i){
|
|
2640 # print "$micro matches (($phase)+)\-{$motifsize,}([A-Z]+) = 1=$1, - , 3=$3 \n";
|
|
2641 my $num = 0;
|
|
2642 $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi ;
|
|
2643 }
|
|
2644 }
|
|
2645
|
|
2646 # print "$orig_micro to $micro\n";
|
|
2647
|
|
2648 #s <STDIN>;
|
|
2649
|
|
2650 for my $h (0 ... $#phase_prefixes){
|
|
2651 # print "searching using prefix : $prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]\n";
|
|
2652 my $pattern = $prephase_left_contexts[$h].$phase_prefixes[$h].$pregapsize[$h].$prephase_right_contexts[$h];
|
|
2653 # print "returning orig_micro = $orig_micro, micro = $micro \n" if exists $tested_patterns{$pattern};
|
|
2654 if ($micro =~ /$prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]/i){
|
|
2655 return $orig_micro if exists $tested_patterns{$pattern};
|
|
2656 while ($micro =~ /($prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h])/i){
|
|
2657 $tested_patterns{$pattern} = $pattern;
|
|
2658 # print "micro: $micro needs further gap removal: $1\n";
|
|
2659
|
|
2660 # print "prefix being considered = $phase_prefixes[$h]\n";
|
|
2661 my $num = ();
|
|
2662 $num = ($micro =~ s/$prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]/$prephase_left_contexts[$h]$phase_prefixes[$h]$prepostfilins[$h]$prephase_right_contexts[$h]/gi) ;
|
|
2663 # print "num = $num, micro = $micro\n";
|
|
2664 $change = 1 if $num == 1;
|
|
2665
|
|
2666 return $orig_micro if $num > 1;
|
|
2667 }
|
|
2668 }
|
|
2669
|
|
2670 }
|
|
2671 }
|
|
2672 return $orig_micro if length($micro) != length($orig_micro);
|
|
2673 return $micro;
|
|
2674 }
|
|
2675
|
|
2676 sub selectMutationArray{
|
|
2677 my $printer =0;
|
|
2678
|
|
2679 my $oldmutspt = $_[0];
|
|
2680 my $newmutspt = $_[1];
|
|
2681 my $tagstringpt = $_[2];
|
|
2682 my $alivehashpt = $_[3];
|
|
2683 my $alignmentpt = $_[4];
|
|
2684 my $motif = $_[5];
|
|
2685
|
|
2686 my @alivehasharr=();
|
|
2687
|
|
2688 my @tags = @$tagstringpt;
|
|
2689 my $alignmentln = length($alignmentpt->{$tags[0]});
|
|
2690
|
|
2691 foreach my $key (keys %$alivehashpt) { push @alivehasharr, $key; print "we have alive: $key\n" if $printer == 1;}
|
|
2692
|
|
2693 my %newside = ();
|
|
2694 my %oldside = ();
|
|
2695 my %newmuts = ();
|
|
2696
|
|
2697 my %commons = ();
|
|
2698 my %olds = ();
|
|
2699 foreach my $old (@$oldmutspt){
|
|
2700 $olds{$old} = 1;
|
|
2701 }
|
|
2702 foreach my $new (@$newmutspt){
|
|
2703 $commons{$new} = 1 if exists $olds{$new};;
|
|
2704 }
|
|
2705
|
|
2706
|
|
2707 foreach my $pos ( 0 ... $alignmentln){
|
|
2708 #print "pos = $pos\n" if $printer == 1;
|
|
2709 my $newyes = 0;
|
|
2710 foreach my $mut (@$newmutspt){
|
|
2711 $newmuts{$mut} = 1;
|
|
2712 chomp $mut;
|
|
2713 $newyes++;
|
|
2714 $mut =~ s/=\t/= \t/g;
|
|
2715 $mut =~ s/=$/= /g;
|
|
2716
|
|
2717 $mut =~ /node=([A-Z\(\), ]+)\stype=([a-zA-Z ]+)\sposition=([0-9 ]+)\sfrom=([a-zA-Z\- ]+)\sto=([a-zA-Z\- ]+)\sinsertion=([a-zA-Z\- ]+)\sdeletion=([a-zA-Z\- ]+)/;
|
|
2718 my $node = $1;
|
|
2719 next if $3 != $pos;
|
|
2720 print "new mut = $mut\n" if $printer == 1;
|
|
2721 print "node = $node, pos = $3 ... and alivehasharr = >@alivehasharr<\n" if $printer == 1;
|
|
2722 my $alivenode = 0;
|
|
2723 foreach my $key (@alivehasharr){
|
|
2724 $alivenode = 1 if $key =~ /$node/;
|
|
2725 }
|
|
2726 # next if $alivenode == 0;
|
|
2727 my $indel_type = " ";
|
|
2728 if ($2 eq "insertion" || $2 eq "deletion"){
|
|
2729 my $thisindel = ();
|
|
2730 $thisindel = $6 if $2 eq "insertion";
|
|
2731 $thisindel = $7 if $2 eq "deletion";
|
|
2732
|
|
2733 $indel_type = "i".checkIndelType($node, $thisindel, $motif,$alignmentpt,$3, $2) if $2 eq "insertion";
|
|
2734 $indel_type = "d".checkIndelType($node, $thisindel, $motif,$alignmentpt, $3, $2) if $2 eq "deletion";
|
|
2735 $indel_type = $indel_type."f" if $indel_type =~ /mot/ && length($thisindel) >= length($motif);
|
|
2736 }
|
|
2737 print "indeltype = $indel_type\n" if $printer == 1;
|
|
2738 my $added = 0;
|
|
2739
|
|
2740 if (exists $newside{$pos} && $indel_type =~ /[a-z]+/){
|
|
2741 print "we have a preexisting one for $pos\n" if $printer == 1;
|
|
2742 my @preexisting = @{$newside{$pos}};
|
|
2743 foreach my $pre (@preexisting){
|
|
2744 print "looking at $pre\n" if $printer == 1;
|
|
2745 next if $pre !~ /node=$node/;
|
|
2746 next if $pre !~ /indeltype=([a-z]+)/;
|
|
2747 my $currtype = $1;
|
|
2748
|
|
2749 if ($currtype =~ /inon/ && $indel_type =~ /dmot/){
|
|
2750 delete $newside{$pos};
|
|
2751 push @{$newside{$pos}}, $pre;
|
|
2752 $added = 1;
|
|
2753 }
|
|
2754 if ($currtype =~ /dnon/ && $indel_type =~ /imot/){
|
|
2755 delete $newside{$pos};
|
|
2756 push @{$newside{$pos}}, $pre;
|
|
2757 $added = 1;
|
|
2758 }
|
|
2759 if ($currtype =~ /dmot/ && $indel_type =~ /inon/){
|
|
2760 delete $newside{$pos};
|
|
2761 push @{$newside{$pos}}, $mut."\tindeltype=$indel_type";
|
|
2762 $added = 1;
|
|
2763 }
|
|
2764 if ($currtype =~ /imot/ && $indel_type =~ /dnon/){
|
|
2765 delete $newside{$pos};
|
|
2766 push @{$newside{$pos}}, $mut."\tindeltype=$indel_type";
|
|
2767 $added = 1;
|
|
2768 }
|
|
2769 }
|
|
2770 }
|
|
2771
|
|
2772 print "added = $added\n" if $printer == 1;
|
|
2773 push @{$newside{$pos}}, $mut."\tindeltype=$indel_type" if $added == 0;
|
|
2774
|
|
2775 print "for new pos,: $pos we have: @{$newside{$pos}}\n " if $printer == 1;
|
|
2776 }
|
|
2777 }
|
|
2778
|
|
2779 foreach my $pos ( 0 ... $alignmentln){
|
|
2780 my $oldyes = 0;
|
|
2781 foreach my $mut (@$oldmutspt){
|
|
2782 chomp $mut;
|
|
2783 $oldyes++;
|
|
2784 $mut =~ s/=\t/= \t/g;
|
|
2785 $mut =~ s/=$/= /g;
|
|
2786 $mut =~ /node=([A-Z\(\), ]+)\ttype=([a-zA-Z ]+)\tposition=([0-9 ]+)\tfrom=([a-zA-Z\- ]+)\tto=([a-zA-Z\- ]+)\tinsertion=([a-zA-Z\- ]+)\tdeletion=([a-zA-Z\- ]+)/;
|
|
2787 my $node = $1;
|
|
2788 next if $3 != $pos;
|
|
2789 print "old mut = $mut\n" if $printer == 1;
|
|
2790 my $alivenode = 0;
|
|
2791 foreach my $key (@alivehasharr){
|
|
2792 $alivenode = 1 if $key =~ /$node/;
|
|
2793 }
|
|
2794 #next if $alivenode == 0;
|
|
2795 my $indel_type = " ";
|
|
2796 if ($2 eq "insertion" || $2 eq "deletion"){
|
|
2797 $indel_type = "i".checkIndelType($node, $6, $motif,$alignmentpt, $3, $2) if $2 eq "insertion";
|
|
2798 $indel_type = "d".checkIndelType($node, $7, $motif,$alignmentpt, $3, $2) if $2 eq "deletion";
|
|
2799 next if $indel_type =~/non/;
|
|
2800 }
|
|
2801 else{ next;}
|
|
2802
|
|
2803 my $imp=0;
|
|
2804 $imp = 1 if $indel_type =~ /dmot/ && $alivenode == 0;
|
|
2805 $imp = 1 if $indel_type =~ /imot/ && $alivenode == 1;
|
|
2806
|
|
2807
|
|
2808 if (exists $newside{$pos} && $indel_type =~ /[a-z]+/){
|
|
2809 my @preexisting = @{$newside{$pos}};
|
|
2810 print "we have a preexisting one for $pos: @preexisting\n" if $printer == 1;
|
|
2811 next if $imp == 0;
|
|
2812
|
|
2813 if (scalar(@preexisting) == 1){
|
|
2814 my $foundmut = $preexisting[0];
|
|
2815 $foundmut=~ /node=([A-Z, \(\)]+)/;
|
|
2816 next if $1 eq $node;
|
|
2817
|
|
2818 if (exists $oldside{$pos} || exists $commons{$foundmut}){
|
|
2819 print "not replacing, but just adding\n" if $printer == 1;
|
|
2820 push @{$newside{$pos}}, $mut."\tindeltype=$indel_type";
|
|
2821 push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type";
|
|
2822 next;
|
|
2823 }
|
|
2824
|
|
2825 delete $newside{$pos};
|
|
2826 push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type";
|
|
2827 push @{$newside{$pos}}, $mut."\tindeltype=$indel_type";
|
|
2828 print "now new one is : @{$newside{$pos}}\n" if $printer == 1;
|
|
2829 }
|
|
2830
|
|
2831 print "for pos: $pos: @{$newside{$pos}}\n" if $printer == 1;
|
|
2832 next;
|
|
2833 }
|
|
2834
|
|
2835
|
|
2836 my @news = @{$newside{$pos}} if exists $newside{$pos};
|
|
2837
|
|
2838 print "mut = $mut and news = @news\n" if $printer == 1;
|
|
2839 push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type";
|
|
2840 push @{$newside{$pos}}, $mut."\tindeltype=$indel_type";
|
|
2841 }
|
|
2842 }
|
|
2843
|
|
2844 print "in the end, our collected mutations = \n" if $printer == 1;
|
|
2845 my @returnarr = ();
|
|
2846 foreach my $key (keys %newside) {push @returnarr,@{$newside{$key}};}
|
|
2847 print join("\n", @returnarr),"\n" if $printer == 1;
|
|
2848 #<STDIN>;
|
|
2849 return @returnarr;
|
|
2850
|
|
2851 }
|
|
2852
|
|
2853
|
|
2854 sub checkIndelType{
|
|
2855 my $printer = 0;
|
|
2856 my $node = $_[0];
|
|
2857 my $indel = $_[1];
|
|
2858 my $motif = $_[2];
|
|
2859 my $alignmentpt = $_[3];
|
|
2860 my $posit = $_[4];
|
|
2861 my $type = $_[5];
|
|
2862 my @phases =();
|
|
2863 my %prephases = ();
|
|
2864 my %postphases = ();
|
|
2865 #print "motif = $motif\n";
|
|
2866 print "IN checkIndelType ... received: @_\n" if $printer == 1;
|
|
2867 my $concat = $motif.$motif.$motif.$motif;
|
|
2868 my $motiflength = length($motif);
|
|
2869
|
|
2870 if ($motiflength > length ($indel)){
|
|
2871 return "non" if $motif !~ /$indel/i;
|
|
2872 return checkIndelType_ComplexAnalysis($node, $indel, $motif, $alignmentpt, $posit, $type);
|
|
2873 }
|
|
2874
|
|
2875 my $firstpass = 0;
|
|
2876 for my $y (0 ... $motiflength-1){
|
|
2877 my $phase = substr($concat, $motiflength+$y, $motiflength);
|
|
2878 push @phases, $phase;
|
|
2879 $firstpass = 1 if $indel =~ /$phase/i;
|
|
2880 for my $k (0 ... length($motif)-1){
|
|
2881 print "at: motiflength=$motiflength , y=$y , k=$k.. for pre: $motiflength+$y-$k and post: $motiflength+$y-$k+$motiflength in $concat\n" if $printer == 1;
|
|
2882 my $pre = substr($concat, $motiflength+$y-$k, $k );
|
|
2883 my $post = substr($concat, $motiflength+$y+$motiflength, $k);
|
|
2884 print "adding to phases : $phase - $pre and $post\n" if $printer == 1;
|
|
2885 push @{$prephases{$phase}} , $pre;
|
|
2886 push @{$postphases{$phase}} , $post;
|
|
2887 }
|
|
2888
|
|
2889 }
|
|
2890 print "firstpass 1= $firstpass\n" if $printer == 1;
|
|
2891 return "non" if $firstpass ==0;
|
|
2892 $firstpass =0;
|
|
2893
|
|
2894 foreach my $phase (@phases){
|
|
2895 my @pres = @{$prephases{$phase}};
|
|
2896 my @posts = @{$postphases{$phase}};
|
|
2897
|
|
2898 foreach my $pre (@pres){
|
|
2899 foreach my $post (@posts){
|
|
2900
|
|
2901 $firstpass = 1 if $indel =~ /($pre)?($phase)+($post)?/i && length($indel) > (3 * length($motif));
|
|
2902 $firstpass = 1 if $indel =~ /^($pre)?($phase)+($post)?$/i && length($indel) < (3 * length($motif));
|
|
2903 print "matched here : ($pre)?($phase)+($post)?\n" if $printer == 1;
|
|
2904 last if $firstpass == 1;
|
|
2905 }
|
|
2906 last if $firstpass == 1;
|
|
2907 }
|
|
2908 last if $firstpass == 1;
|
|
2909 }
|
|
2910
|
|
2911 print "firstpass 2= $firstpass\n" if $printer == 1;
|
|
2912 return "non" if $firstpass ==0;
|
|
2913 return "mot" if $firstpass ==1;
|
|
2914 }
|
|
2915
|
|
2916
|
|
2917 sub checkIndelType_ComplexAnalysis{
|
|
2918 my $printer = 0;
|
|
2919 my $node = $_[0];
|
|
2920 my $indel = $_[1];
|
|
2921 my $motif = $_[2];
|
|
2922 my $alignmentpt = $_[3];
|
|
2923 my $pos = $_[4];
|
|
2924 my $type = $_[5];
|
|
2925 my @speciesinvolved = $node =~ /[A-Z]+/g;
|
|
2926
|
|
2927 my @seqs = ();
|
|
2928 my $residualseq = length($motif) - length($indel);
|
|
2929 print "IN COMPLEX ANALYSIS ... received: @_ .... speciesinvolved = @speciesinvolved\n" if $printer == 1;
|
|
2930 print "we have position = $pos, sseq = $alignmentpt->{$speciesinvolved[0]}\n" if $printer == 1;
|
|
2931 print "residualseq = $residualseq\n" if $printer == 1;
|
|
2932 print "pos=$pos... got: @_\n" if $printer == 1;
|
|
2933 foreach my $sp (@speciesinvolved){
|
|
2934 my $spseq = $alignmentpt->{$sp};
|
|
2935 #print "orig spseq = $spseq\n";
|
|
2936 my $subseq = ();
|
|
2937
|
|
2938 if ($type eq "deletion"){
|
|
2939 my @indelparts = split(/\s*/,$indel);
|
|
2940 my @seqparts = split(/\s*/,$spseq);
|
|
2941
|
|
2942 for my $p ($pos ... $pos+length($indel)-1){
|
|
2943 $seqparts[$p] = shift @indelparts;
|
|
2944 }
|
|
2945 $spseq = join("",@seqparts);
|
|
2946 }
|
|
2947 #print "mod spseq = $spseq\n";
|
|
2948 # $spseq=~ s/\-//g if $type !~ /deletion/;
|
|
2949
|
|
2950 print "substr($spseq, $pos-($residualseq), length($indel)+$residualseq+$residualseq)\n" if $pos > 0 && $pos < (length($spseq) - length($motif)) && $printer == 1;
|
|
2951 print "substr($spseq, 0, length($indel)+$residualseq)\n" if $pos == 0 && $printer == 1;
|
|
2952 print "substr($spseq, $pos - $residualseq, length($indel)+$residualseq)\n" if $pos >= (length($spseq) - length($motif)) && $printer == 1;
|
|
2953
|
|
2954 $subseq = substr($spseq, $pos-($residualseq), length($indel)+$residualseq+$residualseq) if $pos > 0 && $pos < (length($spseq) - length($motif)) ;
|
|
2955 $subseq = substr($spseq, 0, length($indel)+$residualseq) if $pos == 0;
|
|
2956 $subseq = substr($spseq, $pos - $residualseq, length($indel)+$residualseq) if $pos >= (length($spseq) - length($motif)) ;
|
|
2957 print "spseq = $spseq . subseq=$subseq . type = $type\n" if $printer == 1;
|
|
2958 #<STDIN> if $subseq !~ /[a-z\-]/i;
|
|
2959 $subseq =~ s/\-/$indel/g if $type =~ /insertion/;
|
|
2960 push @seqs, $subseq;
|
|
2961 print "seqs = @seqs\n" if $printer == 1;
|
|
2962 }
|
|
2963 return "non" if checkIfSeqsIdentical(@seqs) eq "NO";
|
|
2964
|
|
2965 print "checking for $seqs[0] \n" if $printer == 1;
|
|
2966
|
|
2967 my @phases =();
|
|
2968 my %prephases = ();
|
|
2969 my %postphases = ();
|
|
2970 my $concat = $motif.$motif.$motif.$motif;
|
|
2971 my $motiflength = length($motif);
|
|
2972
|
|
2973 my $firstpass = 0;
|
|
2974
|
|
2975 for my $y (0 ... $motiflength-1){
|
|
2976 my $phase = substr($concat, $motiflength+$y, $motiflength);
|
|
2977 push @phases, $phase;
|
|
2978 $firstpass = 1 if $seqs[0] =~ /$phase/i;
|
|
2979 for my $k (0 ... length($motif)-1){
|
|
2980 my $pre = substr($concat, $motiflength+$y-$k, $k );
|
|
2981 my $post = substr($concat, $motiflength+$y+$motiflength, $k);
|
|
2982 print "adding to phases : $phase - $pre and $post\n" if $printer == 1;
|
|
2983 push @{$prephases{$phase}} , $pre;
|
|
2984 push @{$postphases{$phase}} , $post;
|
|
2985 }
|
|
2986
|
|
2987 }
|
|
2988 print "firstpass 1= $firstpass.. also, res-d = ",(length($seqs[0]))%(length($motif)),"\n" if $printer == 1;
|
|
2989 return "non" if $firstpass ==0;
|
|
2990 $firstpass =0;
|
|
2991 foreach my $phase (@phases){
|
|
2992
|
|
2993 $firstpass = 1 if $seqs[0] =~ /^($phase)+$/i && ((length($seqs[0]))%(length($motif))) == 0;
|
|
2994
|
|
2995 if (((length($seqs[0]))%(length($motif))) != 0){
|
|
2996 my @pres = @{$prephases{$phase}};
|
|
2997 my @posts = @{$postphases{$phase}};
|
|
2998 foreach my $pre (@pres){
|
|
2999 foreach my $post (@posts){
|
|
3000 next if $pre !~ /\S/ && $post !~ /\S/;
|
|
3001 $firstpass = 1 if ($seqs[0] =~ /^($pre)($phase)+($post)$/i || $seqs[0] =~ /^($pre)($phase)+$/i || $seqs[0] =~ /^($phase)+($post)$/i);
|
|
3002 print "caught with $pre $phase $post\n" if $printer == 1;
|
|
3003 last if $firstpass == 1;
|
|
3004 }
|
|
3005 last if $firstpass == 1;
|
|
3006 }
|
|
3007 }
|
|
3008
|
|
3009 last if $firstpass == 1;
|
|
3010 }
|
|
3011
|
|
3012 #print "indel = $indel.. motif = $motif.. firstpass 2= mot\n" if $firstpass ==1;
|
|
3013 #print "indel = $indel.. motif = $motif.. firstpass 2= non\n" if $firstpass ==0;
|
|
3014 #<STDIN>;# if $firstpass ==1;
|
|
3015 return "non" if $firstpass ==0;
|
|
3016 return "mot" if $firstpass ==1;
|
|
3017
|
|
3018 }
|
|
3019
|
|
3020 sub checkIfSeqsIdentical{
|
|
3021 my @seqs = @_;
|
|
3022 my $identical = 1;
|
|
3023
|
|
3024 for my $j (1 ... $#seqs){
|
|
3025 $identical = 0 if uc($seqs[0]) ne uc($seqs[$j]);
|
|
3026 }
|
|
3027 return "NO" if $identical == 0;
|
|
3028 return "YES" if $identical == 1;
|
|
3029
|
|
3030 }
|
|
3031
|
|
3032 sub summarizeMutations{
|
|
3033 my $mutspt = $_[0];
|
|
3034 my @muts = @$mutspt;
|
|
3035 my $tree = $_[1];
|
|
3036
|
|
3037 my @returnarr = ();
|
|
3038
|
|
3039 for (1 ... 38){
|
|
3040 push @returnarr, "NA";
|
|
3041 }
|
|
3042 push @returnarr, "NULL";
|
|
3043 return @returnarr if $tree eq "NULL" || scalar(@muts) < 1;
|
|
3044
|
|
3045
|
|
3046 my @bspecies = ();
|
|
3047 my @dspecies = ();
|
|
3048 my $treecopy = $tree;
|
|
3049 $treecopy =~ s/[\(\)]//g;
|
|
3050 my @treeparts = split(/[\.,]+/, $treecopy);
|
|
3051
|
|
3052 for my $part (@treeparts){
|
|
3053 if ($part =~ /\+/){
|
|
3054 $part =~ s/\+//g;
|
|
3055 #my @sp = split(/\s*/, $part);
|
|
3056 #foreach my $p (@sp) {push @bspecies, $p;}
|
|
3057 push @bspecies, $part;
|
|
3058 }
|
|
3059 if ($part =~ /\-/){
|
|
3060 $part =~ s/\-//g;
|
|
3061 #my @sp = split(/\s*/, $part);
|
|
3062 #foreach my $p (@sp) {push @dspecies, $p;}
|
|
3063 push @dspecies, $part;
|
|
3064 }
|
|
3065
|
|
3066 }
|
|
3067 #print "-------------------------------------------------------\n";
|
|
3068
|
|
3069 my ($insertions, $deletions, $motinsertions, $motinsertionsf, $motdeletions, $motdeletionsf, $noninsertions, $nondeletions) = (0,0,0,0,0,0,0,0);
|
|
3070 my ($binsertions, $bdeletions, $bmotinsertions,$bmotinsertionsf, $bmotdeletions, $bmotdeletionsf, $bnoninsertions, $bnondeletions) = (0,0,0,0,0,0,0,0);
|
|
3071 my ($dinsertions, $ddeletions, $dmotinsertions,$dmotinsertionsf, $dmotdeletions, $dmotdeletionsf, $dnoninsertions, $dnondeletions) = (0,0,0,0,0,0,0,0);
|
|
3072 my ($ninsertions, $ndeletions, $nmotinsertions,$nmotinsertionsf, $nmotdeletions, $nmotdeletionsf, $nnoninsertions, $nnondeletions) = (0,0,0,0,0,0,0,0);
|
|
3073 my ($substitutions, $bsubstitutions, $dsubstitutions, $nsubstitutions, $indels, $subs) = (0,0,0,0,"NA","NA");
|
|
3074
|
|
3075 my @insertionsarr = (" ");
|
|
3076 my @deletionsarr = (" ");
|
|
3077
|
|
3078 my @substitutionsarr = (" ");
|
|
3079
|
|
3080
|
|
3081 foreach my $mut (@muts){
|
|
3082 # print "mut = $mut\n";
|
|
3083 chomp $mut;
|
|
3084 $mut =~ s/=\t/= /g;
|
|
3085 $mut =~ s/=$/= /g;
|
|
3086 my %mhash = ();
|
|
3087 my @mields = split(/\t/,$mut);
|
|
3088
|
|
3089 foreach my $m (@mields){
|
|
3090 my @fields = split(/=/,$m);
|
|
3091 next if $fields[1] eq " ";
|
|
3092 $mhash{$fields[0]} = $fields[1];
|
|
3093 }
|
|
3094
|
|
3095 my $myutype = ();
|
|
3096 my $decided = 0;
|
|
3097
|
|
3098 my $localnode = $mhash{"node"};
|
|
3099 $localnode =~ s/[\(\)\. ,]//g;
|
|
3100
|
|
3101
|
|
3102 foreach my $s (@bspecies){
|
|
3103 if ($localnode eq $s) {
|
|
3104 $decided = 1; $myutype = "b";
|
|
3105 }
|
|
3106 }
|
|
3107
|
|
3108 foreach my $s (@dspecies){
|
|
3109 if ($localnode eq $s) {
|
|
3110 $decided = 1; $myutype = "d";
|
|
3111 }
|
|
3112 }
|
|
3113
|
|
3114 $myutype = "n" if $decided != 1;
|
|
3115
|
|
3116
|
|
3117 # print "tree=$tree, birth species=@bspecies, death species=@dspecies, node=$mhash{node} .. myutype=$myutype .. \n";
|
|
3118 # <STDIN> if $mhash{"type"} eq "insertion" && $myutype eq "b";
|
|
3119
|
|
3120
|
|
3121 if ($mhash{"type"} eq "substitution"){
|
|
3122 $substitutions++;
|
|
3123 $bsubstitutions++ if $myutype eq "b";
|
|
3124 $dsubstitutions++ if $myutype eq "d";
|
|
3125 $nsubstitutions++ if $myutype eq "n";
|
|
3126 # print "substitution: from= $mhash{from}, to = $mhash{to}, and type = myutype\n";
|
|
3127 push @substitutionsarr, "b:$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "b";
|
|
3128 push @substitutionsarr, "d:$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "d";
|
|
3129 push @substitutionsarr, "n:$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "n";
|
|
3130 # print "substitutionsarr = @substitutionsarr\n";
|
|
3131 # <STDIN>;
|
|
3132 }
|
|
3133 else{
|
|
3134 #print "tree=$tree, birth species=@bspecies, death species=@dspecies, node=$mhash{node} .. myutype=$myutype .. indeltype=$mhash{indeltype}\n";
|
|
3135 if ($mhash{"type"} eq "deletion"){
|
|
3136 $deletions++;
|
|
3137
|
|
3138 $motdeletions++ if $mhash{"indeltype"} =~ /dmot/;
|
|
3139 $motdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/;
|
|
3140
|
|
3141 $nondeletions++ if $mhash{"indeltype"} =~ /dnon/;
|
|
3142
|
|
3143 $bdeletions++ if $myutype eq "b";
|
|
3144 $ddeletions++ if $myutype eq "d";
|
|
3145 $ndeletions++ if $myutype eq "n";
|
|
3146
|
|
3147 $bmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "b";
|
|
3148 $bmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "b";
|
|
3149 $bnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "b";
|
|
3150
|
|
3151 $dmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "d";
|
|
3152 $dmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "d";
|
|
3153 $dnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "d";
|
|
3154
|
|
3155 $nmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "n";
|
|
3156 $nmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "n";
|
|
3157 $nnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "n";
|
|
3158
|
|
3159 push @deletionsarr, "b:$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "b";
|
|
3160 push @deletionsarr, "d:$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "d";
|
|
3161 push @deletionsarr, "n:$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "n";
|
|
3162 }
|
|
3163
|
|
3164 if ($mhash{"type"} eq "insertion"){
|
|
3165 $insertions++;
|
|
3166
|
|
3167 $motinsertions++ if $mhash{"indeltype"} =~ /imot/;
|
|
3168 $motinsertionsf++ if $mhash{"indeltype"} =~ /imotf/;
|
|
3169 $noninsertions++ if $mhash{"indeltype"} =~ /inon/;
|
|
3170
|
|
3171 $binsertions++ if $myutype eq "b";
|
|
3172 $dinsertions++ if $myutype eq "d";
|
|
3173 $ninsertions++ if $myutype eq "n";
|
|
3174
|
|
3175 $bmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "b";
|
|
3176 $bmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "b";
|
|
3177 $bnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "b";
|
|
3178
|
|
3179 $dmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "d";
|
|
3180 $dmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "d";
|
|
3181 $dnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "d";
|
|
3182
|
|
3183 $nmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "n";
|
|
3184 $nmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "n";
|
|
3185 $nnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "n";
|
|
3186
|
|
3187 push @insertionsarr, "b:$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "b";
|
|
3188 push @insertionsarr, "d:$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "d";
|
|
3189 push @insertionsarr, "n:$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "n";
|
|
3190
|
|
3191 }
|
|
3192 }
|
|
3193 }
|
|
3194
|
|
3195
|
|
3196
|
|
3197 $indels = "ins=".join(",",@insertionsarr).";dels=".join(",",@deletionsarr) if scalar(@insertionsarr) > 1 || scalar(@deletionsarr) > 1 ;
|
|
3198 $subs = join(",",@substitutionsarr) if scalar(@substitutionsarr) > 1;
|
|
3199 $indels =~ s/ //g;
|
|
3200 $subs =~ s/ //g ;
|
|
3201
|
|
3202 #print "indels = $indels, subs=$subs\n";
|
|
3203 ##<STDIN> if $indels =~ /[a-zA-Z0-9]/ || $subs =~ /[a-zA-Z0-9]/ ;
|
|
3204 #print "tree = $tree, indels = $indels, subs = $subs, bspecies = @bspecies, dspecies = @dspecies \n";
|
|
3205 my @returnarray = ();
|
|
3206
|
|
3207 push (@returnarray, $insertions, $deletions, $motinsertions, $motinsertionsf, $motdeletions, $motdeletionsf, $noninsertions, $nondeletions) ;
|
|
3208 push (@returnarray, $binsertions, $bdeletions, $bmotinsertions,$bmotinsertionsf, $bmotdeletions, $bmotdeletionsf, $bnoninsertions, $bnondeletions) ;
|
|
3209 push (@returnarray, $dinsertions, $ddeletions, $dmotinsertions,$dmotinsertionsf, $dmotdeletions, $dmotdeletionsf, $dnoninsertions, $dnondeletions) ;
|
|
3210 push (@returnarray, $ninsertions, $ndeletions, $nmotinsertions,$nmotinsertionsf, $nmotdeletions, $nmotdeletionsf, $nnoninsertions, $nnondeletions) ;
|
|
3211 push (@returnarray, $substitutions, $bsubstitutions, $dsubstitutions, $nsubstitutions, $indels, $subs) ;
|
|
3212
|
|
3213 push @returnarray, $tree;
|
|
3214
|
|
3215 my @copy = @returnarray;
|
|
3216 return (@returnarray);
|
|
3217
|
|
3218 }
|
|
3219
|
|
3220 sub selectBetterTree{
|
|
3221 my $printer = 0;
|
|
3222 my $treestudy = $_[0];
|
|
3223 my $alt = $_[1];
|
|
3224 my $mutspt = $_[2];
|
|
3225 my @muts = @$mutspt;
|
|
3226 my @trees = (); my @alternatetrees=();
|
|
3227
|
|
3228 @trees = split(/\|/,$treestudy) if $treestudy =~ /\|/;
|
|
3229 @alternatetrees = split(/[\|;]/,$alt) if $alt =~ /[\|;\(\)]/;
|
|
3230
|
|
3231 $trees[0] = $treestudy if $treestudy !~ /\|/;
|
|
3232 $alternatetrees[0] = $alt if $alt !~ /[\|;\(\)]/;
|
|
3233
|
|
3234 my @alltrees = (@trees, @alternatetrees);
|
|
3235 # push(@alltrees,@alternatetrees);
|
|
3236
|
|
3237 my %mutspecies = ();
|
|
3238
|
|
3239 print "IN selectBetterTree..treestudy=$treestudy. alt=$alt. for: @_. trees=@trees<. alternatetrees=@alternatetrees\n" if $printer == 1;
|
|
3240 #<STDIN>;
|
|
3241 foreach my $mut (@muts){
|
|
3242 print colored ['green'],"mut = $mut\n" if $printer == 1;
|
|
3243 $mut =~ /node=([A-Z,\(\) ]+)/;
|
|
3244 my $node = $1;
|
|
3245 $node =~s/[,\(\) ]+//g;
|
|
3246 my @indivspecies = $node =~ /[A-Z]+/g;
|
|
3247 #print "adding node: $node\n" if $printer == 1;
|
|
3248 $mutspecies{$node} = $node;
|
|
3249
|
|
3250 #foreach (@indivspecies) {
|
|
3251 #$mutspecies{$mut} = $_; #print "for $_ adding $mutspecies{$_}\n";
|
|
3252 #}
|
|
3253
|
|
3254 }
|
|
3255
|
|
3256 my @treerecords = ();
|
|
3257 my $treecount = -1;
|
|
3258 foreach my $tree (@alltrees){
|
|
3259 print "checking with tree $tree\n" if $printer == 1;
|
|
3260 $treecount++;
|
|
3261 $treerecords[$treecount] = 0;
|
|
3262 my @indivspecies = ($tree =~ /[A-Z]+/g);
|
|
3263 print "indivspecies=@indivspecies\n" if $printer == 1;
|
|
3264 foreach my $species (@indivspecies){
|
|
3265 print "checkin if exists species: $species\n" if $printer == 1;
|
|
3266 $treerecords[$treecount]+=2 if exists $mutspecies{$species} && $mutspecies{$species} !~ /indeltype=[a-z]mot/;
|
|
3267 $treerecords[$treecount]+=1.5 if exists $mutspecies{$species} && $mutspecies{$species} =~ /indeltype=[a-z]mot/;
|
|
3268 $treerecords[$treecount]-- if !exists $mutspecies{$species};
|
|
3269 }
|
|
3270
|
|
3271 print "for tree $tree, our treecount = $treerecords[$treecount]\n" if $printer == 1;
|
|
3272 }
|
|
3273
|
|
3274 my @best_tree = array_largest_number_arrayPosition(@treerecords);
|
|
3275 print "treerecords = @treerecords. hence, best tree = @best_tree\n" if $printer == 1;
|
|
3276
|
|
3277 return ($alltrees[$best_tree[0]], $treerecords[$best_tree[0]]) if scalar(@best_tree) == 1;
|
|
3278 print "best_tree[0] = $best_tree[0], and treerecords = $treerecords[$best_tree[0]]\n" if $printer == 1;
|
|
3279 return ("NULL", -1) if $treerecords[$best_tree[0]] < 1;
|
|
3280 my $rando = int(rand($#trees));
|
|
3281 return ($alltrees[$rando], $treerecords[$rando]) if scalar(@best_tree) > 1;
|
|
3282
|
|
3283 }
|
|
3284
|
|
3285
|
|
3286
|
|
3287
|
|
3288 sub load_sameHash{
|
|
3289 #my $g = %$_[0];
|
|
3290 $sameHash{"CAGT"}="AGTC";
|
|
3291 $sameHash{"ATGA"}="AATG";
|
|
3292 $sameHash{"CAAC"}="AACC";
|
|
3293 $sameHash{"GGAA"}="AAGG";
|
|
3294 $sameHash{"TAAG"}="AAGT";
|
|
3295 $sameHash{"CGAG"}="AGCG";
|
|
3296 $sameHash{"TAGG"}="AGGT";
|
|
3297 $sameHash{"GCAG"}="AGGC";
|
|
3298 $sameHash{"TAGA"}="ATAG";
|
|
3299 $sameHash{"TGA"}="ATG";
|
|
3300 $sameHash{"CAAG"}="AAGC";
|
|
3301 $sameHash{"CTAA"}="AACT";
|
|
3302 $sameHash{"CAAT"}="AATC";
|
|
3303 $sameHash{"GTAG"}="AGGT";
|
|
3304 $sameHash{"GAAG"}="AAGG";
|
|
3305 $sameHash{"CGA"}="ACG";
|
|
3306 $sameHash{"GTAA"}="AAGT";
|
|
3307 $sameHash{"ACAA"}="AAAC";
|
|
3308 $sameHash{"GCGG"}="GGGC";
|
|
3309 $sameHash{"ATCA"}="AATC";
|
|
3310 $sameHash{"TAAC"}="AACT";
|
|
3311 $sameHash{"GGCA"}="AGGC";
|
|
3312 $sameHash{"TGAG"}="AGTG";
|
|
3313 $sameHash{"AACA"}="AAAC";
|
|
3314 $sameHash{"GAGC"}="AGCG";
|
|
3315 $sameHash{"ACCA"}="AACC";
|
|
3316 $sameHash{"TGAA"}="AATG";
|
|
3317 $sameHash{"ACA"}="AAC";
|
|
3318 $sameHash{"GAAC"}="AACG";
|
|
3319 $sameHash{"GCA"}="AGC";
|
|
3320 $sameHash{"CCAC"}="ACCC";
|
|
3321 $sameHash{"CATA"}="ATAC";
|
|
3322 $sameHash{"CAC"}="ACC";
|
|
3323 $sameHash{"TACA"}="ATAC";
|
|
3324 $sameHash{"GGAC"}="ACGG";
|
|
3325 $sameHash{"AGA"}="AAG";
|
|
3326 $sameHash{"ATAA"}="AAAT";
|
|
3327 $sameHash{"CA"}="AC";
|
|
3328 $sameHash{"CCCA"}="ACCC";
|
|
3329 $sameHash{"TCAA"}="AATC";
|
|
3330 $sameHash{"CAGA"}="AGAC";
|
|
3331 $sameHash{"AATA"}="AAAT";
|
|
3332 $sameHash{"CCA"}="ACC";
|
|
3333 $sameHash{"AGAA"}="AAAG";
|
|
3334 $sameHash{"AGTA"}="AAGT";
|
|
3335 $sameHash{"GACG"}="ACGG";
|
|
3336 $sameHash{"TCAG"}="AGTC";
|
|
3337 $sameHash{"ACGA"}="AACG";
|
|
3338 $sameHash{"CGCA"}="ACGC";
|
|
3339 $sameHash{"GAGT"}="AGTG";
|
|
3340 $sameHash{"GA"}="AG";
|
|
3341 $sameHash{"TA"}="AT";
|
|
3342 $sameHash{"TAA"}="AAT";
|
|
3343 $sameHash{"CAG"}="AGC";
|
|
3344 $sameHash{"GATA"}="ATAG";
|
|
3345 $sameHash{"GTA"}="AGT";
|
|
3346 $sameHash{"CCAA"}="AACC";
|
|
3347 $sameHash{"TAG"}="AGT";
|
|
3348 $sameHash{"CAAA"}="AAAC";
|
|
3349 $sameHash{"AAGA"}="AAAG";
|
|
3350 $sameHash{"CACG"}="ACGC";
|
|
3351 $sameHash{"GTCA"}="AGTC";
|
|
3352 $sameHash{"GGA"}="AGG";
|
|
3353 $sameHash{"GGAT"}="ATGG";
|
|
3354 $sameHash{"CGGG"}="GGGC";
|
|
3355 $sameHash{"CGGA"}="ACGG";
|
|
3356 $sameHash{"AGGA"}="AAGG";
|
|
3357 $sameHash{"TAAA"}="AAAT";
|
|
3358 $sameHash{"GAGA"}="AGAG";
|
|
3359 $sameHash{"ACTA"}="AACT";
|
|
3360 $sameHash{"GCGA"}="AGCG";
|
|
3361 $sameHash{"CACA"}="ACAC";
|
|
3362 $sameHash{"AGAT"}="ATAG";
|
|
3363 $sameHash{"GAGG"}="AGGG";
|
|
3364 $sameHash{"CGAC"}="ACCG";
|
|
3365 $sameHash{"GGAG"}="AGGG";
|
|
3366 $sameHash{"GCCA"}="AGCC";
|
|
3367 $sameHash{"CCAG"}="AGCC";
|
|
3368 $sameHash{"GAAA"}="AAAG";
|
|
3369 $sameHash{"CAGG"}="AGGC";
|
|
3370 $sameHash{"GAC"}="ACG";
|
|
3371 $sameHash{"CAA"}="AAC";
|
|
3372 $sameHash{"GACC"}="ACCG";
|
|
3373 $sameHash{"GGCG"}="GGGC";
|
|
3374 $sameHash{"GGTA"}="AGGT";
|
|
3375 $sameHash{"AGCA"}="AAGC";
|
|
3376 $sameHash{"GATG"}="ATGG";
|
|
3377 $sameHash{"GTGA"}="AGTG";
|
|
3378 $sameHash{"ACAG"}="AGAC";
|
|
3379 $sameHash{"CGG"}="GGC";
|
|
3380 $sameHash{"ATA"}="AAT";
|
|
3381 $sameHash{"GACA"}="AGAC";
|
|
3382 $sameHash{"GCAA"}="AAGC";
|
|
3383 $sameHash{"CAGC"}="AGCC";
|
|
3384 $sameHash{"GGGA"}="AGGG";
|
|
3385 $sameHash{"GAG"}="AGG";
|
|
3386 $sameHash{"ACAT"}="ATAC";
|
|
3387 $sameHash{"GAAT"}="AATG";
|
|
3388 $sameHash{"CACC"}="ACCC";
|
|
3389 $sameHash{"GAT"}="ATG";
|
|
3390 $sameHash{"GCG"}="GGC";
|
|
3391 $sameHash{"GCAC"}="ACGC";
|
|
3392 $sameHash{"GAA"}="AAG";
|
|
3393 $sameHash{"TGGA"}="ATGG";
|
|
3394 $sameHash{"CCGA"}="ACCG";
|
|
3395 $sameHash{"CGAA"}="AACG";
|
|
3396 }
|
|
3397
|
|
3398
|
|
3399
|
|
3400 sub load_revHash{
|
|
3401 $revHash{"CTGA"}="AGTC";
|
|
3402 $revHash{"TCTT"}="AAAG";
|
|
3403 $revHash{"CTAG"}="AGCT";
|
|
3404 $revHash{"GGTG"}="ACCC";
|
|
3405 $revHash{"GCC"}="GGC";
|
|
3406 $revHash{"GCTT"}="AAGC";
|
|
3407 $revHash{"GCGT"}="ACGC";
|
|
3408 $revHash{"GTTG"}="AACC";
|
|
3409 $revHash{"CTCC"}="AGGG";
|
|
3410 $revHash{"ATC"}="ATG";
|
|
3411 $revHash{"CGAT"}="ATCG";
|
|
3412 $revHash{"TTAA"}="AATT";
|
|
3413 $revHash{"GTTC"}="AACG";
|
|
3414 $revHash{"CTGC"}="AGGC";
|
|
3415 $revHash{"TCGA"}="ATCG";
|
|
3416 $revHash{"ATCT"}="ATAG";
|
|
3417 $revHash{"GGTT"}="AACC";
|
|
3418 $revHash{"CTTA"}="AAGT";
|
|
3419 $revHash{"TGGC"}="AGCC";
|
|
3420 $revHash{"CCG"}="GGC";
|
|
3421 $revHash{"CGGC"}="GGCC";
|
|
3422 $revHash{"TTAG"}="AACT";
|
|
3423 $revHash{"GTG"}="ACC";
|
|
3424 $revHash{"CTTT"}="AAAG";
|
|
3425 $revHash{"TGCA"}="ATGC";
|
|
3426 $revHash{"CGCT"}="AGCG";
|
|
3427 $revHash{"TTCC"}="AAGG";
|
|
3428 $revHash{"CT"}="AG";
|
|
3429 $revHash{"C"}="G";
|
|
3430 $revHash{"CTCT"}="AGAG";
|
|
3431 $revHash{"ACTT"}="AAGT";
|
|
3432 $revHash{"GGTC"}="ACCG";
|
|
3433 $revHash{"ATTC"}="AATG";
|
|
3434 $revHash{"GGGT"}="ACCC";
|
|
3435 $revHash{"CCTA"}="AGGT";
|
|
3436 $revHash{"CGCG"}="GCGC";
|
|
3437 $revHash{"GTGT"}="ACAC";
|
|
3438 $revHash{"GCCC"}="GGGC";
|
|
3439 $revHash{"GTCG"}="ACCG";
|
|
3440 $revHash{"TCCC"}="AGGG";
|
|
3441 $revHash{"TTCA"}="AATG";
|
|
3442 $revHash{"AGTT"}="AACT";
|
|
3443 $revHash{"CCCT"}="AGGG";
|
|
3444 $revHash{"CCGC"}="GGGC";
|
|
3445 $revHash{"CTT"}="AAG";
|
|
3446 $revHash{"TTGG"}="AACC";
|
|
3447 $revHash{"ATT"}="AAT";
|
|
3448 $revHash{"TAGC"}="AGCT";
|
|
3449 $revHash{"ACTG"}="AGTC";
|
|
3450 $revHash{"TCAC"}="AGTG";
|
|
3451 $revHash{"CTGT"}="AGAC";
|
|
3452 $revHash{"TGTG"}="ACAC";
|
|
3453 $revHash{"ATCC"}="ATGG";
|
|
3454 $revHash{"GTGG"}="ACCC";
|
|
3455 $revHash{"TGGG"}="ACCC";
|
|
3456 $revHash{"TCGG"}="ACCG";
|
|
3457 $revHash{"CGGT"}="ACCG";
|
|
3458 $revHash{"GCTC"}="AGCG";
|
|
3459 $revHash{"TACG"}="ACGT";
|
|
3460 $revHash{"GTTT"}="AAAC";
|
|
3461 $revHash{"CAT"}="ATG";
|
|
3462 $revHash{"CATG"}="ATGC";
|
|
3463 $revHash{"GTTA"}="AACT";
|
|
3464 $revHash{"CACT"}="AGTG";
|
|
3465 $revHash{"TCAT"}="AATG";
|
|
3466 $revHash{"TTA"}="AAT";
|
|
3467 $revHash{"TGTA"}="ATAC";
|
|
3468 $revHash{"TTTC"}="AAAG";
|
|
3469 $revHash{"TACT"}="AAGT";
|
|
3470 $revHash{"TGTT"}="AAAC";
|
|
3471 $revHash{"CTA"}="AGT";
|
|
3472 $revHash{"GACT"}="AGTC";
|
|
3473 $revHash{"TTGC"}="AAGC";
|
|
3474 $revHash{"TTC"}="AAG";
|
|
3475 $revHash{"GCT"}="AGC";
|
|
3476 $revHash{"GCAT"}="ATGC";
|
|
3477 $revHash{"TGGT"}="AACC";
|
|
3478 $revHash{"CCT"}="AGG";
|
|
3479 $revHash{"CATC"}="ATGG";
|
|
3480 $revHash{"CCAT"}="ATGG";
|
|
3481 $revHash{"CCCG"}="GGGC";
|
|
3482 $revHash{"TGCC"}="AGGC";
|
|
3483 $revHash{"TG"}="AC";
|
|
3484 $revHash{"TGCT"}="AAGC";
|
|
3485 $revHash{"GCCG"}="GGCC";
|
|
3486 $revHash{"TCTG"}="AGAC";
|
|
3487 $revHash{"TGT"}="AAC";
|
|
3488 $revHash{"TTAT"}="AAAT";
|
|
3489 $revHash{"TAGT"}="AACT";
|
|
3490 $revHash{"TATG"}="ATAC";
|
|
3491 $revHash{"TTTA"}="AAAT";
|
|
3492 $revHash{"CGTA"}="ACGT";
|
|
3493 $revHash{"TA"}="AT";
|
|
3494 $revHash{"TGTC"}="AGAC";
|
|
3495 $revHash{"CTAT"}="ATAG";
|
|
3496 $revHash{"TATA"}="ATAT";
|
|
3497 $revHash{"TAC"}="AGT";
|
|
3498 $revHash{"TC"}="AG";
|
|
3499 $revHash{"CATT"}="AATG";
|
|
3500 $revHash{"TCG"}="ACG";
|
|
3501 $revHash{"ATTT"}="AAAT";
|
|
3502 $revHash{"CGTG"}="ACGC";
|
|
3503 $revHash{"CTG"}="AGC";
|
|
3504 $revHash{"TCGT"}="AACG";
|
|
3505 $revHash{"TCCG"}="ACGG";
|
|
3506 $revHash{"GTT"}="AAC";
|
|
3507 $revHash{"ATGT"}="ATAC";
|
|
3508 $revHash{"CTTG"}="AAGC";
|
|
3509 $revHash{"CCTT"}="AAGG";
|
|
3510 $revHash{"GATC"}="ATCG";
|
|
3511 $revHash{"CTGG"}="AGCC";
|
|
3512 $revHash{"TTCT"}="AAAG";
|
|
3513 $revHash{"CGTC"}="ACGG";
|
|
3514 $revHash{"CG"}="GC";
|
|
3515 $revHash{"TATT"}="AAAT";
|
|
3516 $revHash{"CTCG"}="AGCG";
|
|
3517 $revHash{"TCTC"}="AGAG";
|
|
3518 $revHash{"TCCT"}="AAGG";
|
|
3519 $revHash{"TGG"}="ACC";
|
|
3520 $revHash{"ACTC"}="AGTG";
|
|
3521 $revHash{"CTC"}="AGG";
|
|
3522 $revHash{"CGC"}="GGC";
|
|
3523 $revHash{"TTG"}="AAC";
|
|
3524 $revHash{"ACCT"}="AGGT";
|
|
3525 $revHash{"TCTA"}="ATAG";
|
|
3526 $revHash{"GTAC"}="ACGT";
|
|
3527 $revHash{"TTGA"}="AATC";
|
|
3528 $revHash{"GTCC"}="ACGG";
|
|
3529 $revHash{"GATT"}="AATC";
|
|
3530 $revHash{"T"}="A";
|
|
3531 $revHash{"CGTT"}="AACG";
|
|
3532 $revHash{"GTC"}="ACG";
|
|
3533 $revHash{"GCCT"}="AGGC";
|
|
3534 $revHash{"TGC"}="AGC";
|
|
3535 $revHash{"TTTG"}="AAAC";
|
|
3536 $revHash{"GGCT"}="AGCC";
|
|
3537 $revHash{"TCA"}="ATG";
|
|
3538 $revHash{"GTGC"}="ACGC";
|
|
3539 $revHash{"TGAT"}="AATC";
|
|
3540 $revHash{"TAT"}="AAT";
|
|
3541 $revHash{"CTAC"}="AGGT";
|
|
3542 $revHash{"TGCG"}="ACGC";
|
|
3543 $revHash{"CTCA"}="AGTG";
|
|
3544 $revHash{"CTTC"}="AAGG";
|
|
3545 $revHash{"GCTG"}="AGCC";
|
|
3546 $revHash{"TATC"}="ATAG";
|
|
3547 $revHash{"TAAT"}="AATT";
|
|
3548 $revHash{"ACT"}="AGT";
|
|
3549 $revHash{"TCGC"}="AGCG";
|
|
3550 $revHash{"GGT"}="ACC";
|
|
3551 $revHash{"TCC"}="AGG";
|
|
3552 $revHash{"TTGT"}="AAAC";
|
|
3553 $revHash{"TGAC"}="AGTC";
|
|
3554 $revHash{"TTAC"}="AAGT";
|
|
3555 $revHash{"CGT"}="ACG";
|
|
3556 $revHash{"ATTA"}="AATT";
|
|
3557 $revHash{"ATTG"}="AATC";
|
|
3558 $revHash{"CCTC"}="AGGG";
|
|
3559 $revHash{"CCGG"}="GGCC";
|
|
3560 $revHash{"CCGT"}="ACGG";
|
|
3561 $revHash{"TCCA"}="ATGG";
|
|
3562 $revHash{"CGCC"}="GGGC";
|
|
3563 $revHash{"GT"}="AC";
|
|
3564 $revHash{"TTCG"}="AACG";
|
|
3565 $revHash{"CCTG"}="AGGC";
|
|
3566 $revHash{"TCT"}="AAG";
|
|
3567 $revHash{"GTAT"}="ATAC";
|
|
3568 $revHash{"GTCT"}="AGAC";
|
|
3569 $revHash{"GCTA"}="AGCT";
|
|
3570 $revHash{"TACC"}="AGGT";
|
|
3571 }
|
|
3572
|
|
3573
|
|
3574 sub allCaps{
|
|
3575 my $motif = $_[0];
|
|
3576 $motif =~ s/a/A/g;
|
|
3577 $motif =~ s/c/C/g;
|
|
3578 $motif =~ s/t/T/g;
|
|
3579 $motif =~ s/g/G/g;
|
|
3580 return $motif;
|
|
3581 }
|
|
3582
|
|
3583
|
|
3584 sub all_caps{
|
|
3585 my @strand = split(/\s*/,$_[0]);
|
|
3586 for my $i (0 ... $#strand){
|
|
3587 if ($strand[$i] =~ /c/) {$strand[$i] = "C";next;}
|
|
3588 if ($strand[$i] =~ /a/) {$strand[$i] = "A";next;}
|
|
3589 if ($strand[$i] =~ /t/) { $strand[$i] = "T";next;}
|
|
3590 if ($strand[$i] =~ /g/) {$strand[$i] = "G";next;}
|
|
3591 }
|
|
3592 return join("",@strand);
|
|
3593 }
|
|
3594 sub array_mean{
|
|
3595 return "NA" if scalar(@_) == 0;
|
|
3596 my $sum = 0;
|
|
3597 foreach my $val (@_){
|
|
3598 $sum = $sum + $val;
|
|
3599 }
|
|
3600 return ($sum/scalar(@_));
|
|
3601 }
|
|
3602 sub array_sum{
|
|
3603 return "NA" if scalar(@_) == 0;
|
|
3604 my $sum = 0;
|
|
3605 foreach my $val (@_){
|
|
3606 $sum = $sum + $val;
|
|
3607 }
|
|
3608 return ($sum);
|
|
3609 }
|
|
3610
|
|
3611 sub variance{
|
|
3612 return "NA" if scalar(@_) == 0;
|
|
3613 return 0 if scalar(@_) == 1;
|
|
3614 my $mean = array_mean(@_);
|
|
3615 my $num = 0;
|
|
3616 return 0 if scalar(@_) == 1;
|
|
3617 # print "mean = $mean .. array = >@_<\n";
|
|
3618 foreach my $ele (@_){
|
|
3619 # print "$num = $num + ($ele-$mean)*($ele-$mean)\n";
|
|
3620 $num = $num + ($ele-$mean)*($ele-$mean);
|
|
3621 }
|
|
3622 my $var = $num / scalar(@_);
|
|
3623 return $var;
|
|
3624 }
|
|
3625
|
|
3626 sub array_95confIntervals{
|
|
3627 return "NA" if scalar(@_) <= 0;
|
|
3628 my @sorted = sort { $a <=> $b } @_;
|
|
3629 # print "@sorted=",scalar(@sorted), "\n";
|
|
3630 my $aDeechNo = int((scalar(@sorted) * 2.5) / 100);
|
|
3631 my $saaDeNo = int((scalar(@sorted) * 97.5) / 100);
|
|
3632
|
|
3633 return ($sorted[$aDeechNo], $sorted[$saaDeNo]);
|
|
3634 }
|
|
3635
|
|
3636 sub array_median{
|
|
3637 return "NA" if scalar(@_) == 0;
|
|
3638 return $_[0] if scalar(@_) == 1;
|
|
3639 my @sorted = sort { $a <=> $b } @_;
|
|
3640 my $totalno = scalar(@sorted);
|
|
3641
|
|
3642 #print "sorted = @sorted\n";
|
|
3643
|
|
3644 my $pick = ();
|
|
3645 if ($totalno % 2 == 1){
|
|
3646 #print "odd set .. totalno = $totalno\n";
|
|
3647 my $mid = $totalno / 2;
|
|
3648 my $onehalfno = $mid - $mid % 1;
|
|
3649 my $secondhalfno = $onehalfno + 1;
|
|
3650 my $onehalf = $sorted[$onehalfno-1];
|
|
3651 my $secondhalf = $sorted[$secondhalfno-1];
|
|
3652 #print "onehalfno = $onehalfno and secondhalfno = $secondhalfno \n onehalf = $onehalf and secondhalf = $secondhalf\n";
|
|
3653
|
|
3654 $pick = $secondhalf;
|
|
3655 }
|
|
3656 else{
|
|
3657 #print "even set .. totalno = $totalno\n";
|
|
3658 my $mid = $totalno / 2;
|
|
3659 my $onehalfno = $mid;
|
|
3660 my $secondhalfno = $onehalfno + 1;
|
|
3661 my $onehalf = $sorted[$onehalfno-1];
|
|
3662 my $secondhalf = $sorted[$secondhalfno-1];
|
|
3663 #print "onehalfno = $onehalfno and secondhalfno = $secondhalfno \n onehalf = $onehalf and secondhalf = $secondhalf\n";
|
|
3664 $pick = ($onehalf + $secondhalf )/2;
|
|
3665
|
|
3666 }
|
|
3667 #print "pick = $pick..\n";
|
|
3668 return $pick;
|
|
3669
|
|
3670 }
|
|
3671
|
|
3672
|
|
3673 sub array_numerical_sort{
|
|
3674 return "NA" if scalar(@_) == 0;
|
|
3675 my @sorted = sort { $a <=> $b } @_;
|
|
3676 return (@sorted);
|
|
3677 }
|
|
3678
|
|
3679 sub array_smallest_number{
|
|
3680 return "NA" if scalar(@_) == 0;
|
|
3681 return $_[0] if scalar(@_) == 1;
|
|
3682 my @sorted = sort { $a <=> $b } @_;
|
|
3683 return $sorted[0];
|
|
3684 }
|
|
3685
|
|
3686
|
|
3687 sub array_largest_number{
|
|
3688 return "NA" if scalar(@_) == 0;
|
|
3689 return $_[0] if scalar(@_) == 1;
|
|
3690 my @sorted = sort { $a <=> $b } @_;
|
|
3691 return $sorted[$#sorted];
|
|
3692 }
|
|
3693
|
|
3694
|
|
3695 sub array_largest_number_arrayPosition{
|
|
3696 return "NA" if scalar(@_) == 0;
|
|
3697 return 0 if scalar(@_) == 1;
|
|
3698 my $maxpos = 0;
|
|
3699 my @maxposes = ();
|
|
3700 my @maxvals = ();
|
|
3701 my $maxval = array_smallest_number(@_);
|
|
3702 for my $i (0 ... $#_){
|
|
3703 if ($_[$i] > $maxval){
|
|
3704 $maxval = $_[$i];
|
|
3705 $maxpos = $i;
|
|
3706 }
|
|
3707 if ($_[$i] == $maxval){
|
|
3708 $maxval = $_[$i];
|
|
3709 if (scalar(@maxposes) == 0){
|
|
3710 push @maxposes, $i;
|
|
3711 push @maxvals, $_[$i];
|
|
3712
|
|
3713 }
|
|
3714 elsif ($maxvals[0] == $maxval){
|
|
3715 push @maxposes, $i;
|
|
3716 push @maxvals, $_[$i];
|
|
3717 }
|
|
3718 else{
|
|
3719 @maxposes = (); @maxvals = ();
|
|
3720 push @maxposes, $i;
|
|
3721 push @maxvals, $_[$i];
|
|
3722 }
|
|
3723
|
|
3724 }
|
|
3725
|
|
3726 }
|
|
3727 return $maxpos if scalar(@maxposes) < 2;
|
|
3728 return (@maxposes);
|
|
3729 }
|
|
3730
|
|
3731 sub array_smallest_number_arrayPosition{
|
|
3732 return "NA" if scalar(@_) == 0;
|
|
3733 return 0 if scalar(@_) == 1;
|
|
3734 my $minpos = 0;
|
|
3735 my @minposes = ();
|
|
3736 my @minvals = ();
|
|
3737 my $minval = array_largest_number(@_);
|
|
3738 my $maxval = array_smallest_number(@_);
|
|
3739 #print "starting with $maxval, ending with $minval\n";
|
|
3740 for my $i (0 ... $#_){
|
|
3741 if ($_[$i] < $minval){
|
|
3742 $minval = $_[$i];
|
|
3743 $minpos = $i;
|
|
3744 }
|
|
3745 if ($_[$i] == $minval){
|
|
3746 $minval = $_[$i];
|
|
3747 if (scalar(@minposes) == 0){
|
|
3748 push @minposes, $i;
|
|
3749 push @minvals, $_[$i];
|
|
3750
|
|
3751 }
|
|
3752 elsif ($minvals[0] == $minval){
|
|
3753 push @minposes, $i;
|
|
3754 push @minvals, $_[$i];
|
|
3755 }
|
|
3756 else{
|
|
3757 @minposes = (); @minvals = ();
|
|
3758 push @minposes, $i;
|
|
3759 push @minvals, $_[$i];
|
|
3760 }
|
|
3761
|
|
3762 }
|
|
3763
|
|
3764 }
|
|
3765 #print "minposes=@minposes\n";
|
|
3766
|
|
3767 return $minpos if scalar(@minposes) < 2;
|
|
3768 return (@minposes);
|
|
3769 }
|
|
3770
|
|
3771 sub basic_stats{
|
|
3772 my @arr = @_;
|
|
3773 # print " array_smallest_number= ", array_smallest_number(@arr)," array_largest_number= ", array_largest_number(@arr), " array_mean= ",array_mean(@arr),"\n";
|
|
3774 return ":";
|
|
3775 }
|
|
3776 #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx
|
|
3777
|
|
3778 sub maftoAxt_multispecies {
|
|
3779 my $printer = 0;
|
|
3780 # print "in maftoAxt_multispecies : got @_\n";
|
|
3781 my $fname=$_[0];
|
|
3782 open(IN,"<$_[0]") or die "Cannot open $_[0]: $! \n";
|
|
3783 my $treedefinition = $_[1];
|
|
3784 open(OUT,">$_[2]") or die "Cannot open $_[2]: $! \n";
|
|
3785 my $counter = 0;
|
|
3786 my $exactspeciesset = $_[3];
|
|
3787 my @exactspeciesset_unarranged = split(/,/,$exactspeciesset);
|
|
3788
|
|
3789 $treedefinition=~s/[\)\(, ]/\t/g;
|
|
3790 my @species=split(/\t+/,$treedefinition);
|
|
3791 my @exactspecies=();
|
|
3792
|
|
3793 foreach my $spec (@species){
|
|
3794 foreach my $espec (@exactspeciesset_unarranged){
|
|
3795 push @exactspecies, $spec if $spec eq $espec;
|
|
3796 }
|
|
3797 }
|
|
3798 # print "exactspecies=@exactspecies\n";
|
|
3799
|
|
3800 ###########
|
|
3801 my $select = 2;
|
|
3802 #select = 1 if all species need sequences to be present for each block otherwise, it is 0
|
|
3803 #select = 2 only the allowed set make up the alignment. use the removeset
|
|
3804 # information to detect alignmenets that have other important genomes aligned.
|
|
3805 ###########
|
|
3806 my @allowedset = ();
|
|
3807 @allowedset = split(/;/,allowedSetOfSpecies(join("_",@species))) if $select == 0;
|
|
3808 @allowedset = join("_",0,@species) if $select == 1;
|
|
3809 #print "species = @species , allowedset =",join("\n", @allowedset) ," \n";
|
|
3810 @allowedset = join("_",0,@exactspecies) if $select == 2;
|
|
3811 #print "allowedset = @allowedset and exactspecies = @exactspecies\n";
|
|
3812
|
|
3813 my $start = 0;
|
|
3814 my @sequences = ();
|
|
3815 my @titles = ();
|
|
3816 my $species_counter = "0";
|
|
3817 my $countermatch = 0;
|
|
3818 my $outsideSpecies=0;
|
|
3819
|
|
3820 while(my $line = <IN>){
|
|
3821 next if $line =~ /^#/;
|
|
3822 next if $line =~ /^i/;
|
|
3823 chomp $line;
|
|
3824 #print "$line";
|
|
3825 my @fields = split(/\s+/,$line);
|
|
3826 chomp $line;
|
|
3827 if ($line =~ /^a /){
|
|
3828 $start = 1;
|
|
3829 }
|
|
3830
|
|
3831 if ($line =~ /^s /){
|
|
3832 # print "fields1 = $fields[1] , start = $start\n";
|
|
3833
|
|
3834 foreach my $sp (@species){
|
|
3835 if ($fields[1] =~ /$sp/){
|
|
3836 $species_counter = $species_counter."_".$sp;
|
|
3837 push(@sequences, $fields[6]);
|
|
3838 my @sp_info = split(/\./,$fields[1]);
|
|
3839 my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]);
|
|
3840 push(@titles, $title);
|
|
3841
|
|
3842 }
|
|
3843 }
|
|
3844 }
|
|
3845
|
|
3846 if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start = 1)){
|
|
3847
|
|
3848 my $arranged = reorderSpecies($species_counter, @species);
|
|
3849 my $stopper = 1;
|
|
3850 my $arrno = 0;
|
|
3851 foreach my $set (@allowedset){
|
|
3852 if ($arranged eq $set){
|
|
3853 # print "$arranged == $set\n";
|
|
3854 $stopper = 0; last;
|
|
3855 }
|
|
3856 $arrno++;
|
|
3857 }
|
|
3858
|
|
3859 if ($stopper == 0) {
|
|
3860 # print " accepted\n";
|
|
3861 @titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged;
|
|
3862
|
|
3863 @sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged;
|
|
3864 my $filteredseq = filter_gaps(@sequences);
|
|
3865
|
|
3866 if ($filteredseq ne "SHORT"){
|
|
3867 $counter++;
|
|
3868 print OUT join (" ",$counter, @titles), "\n";
|
|
3869 print OUT $filteredseq, "\n";
|
|
3870 print OUT "\n";
|
|
3871 $countermatch++;
|
|
3872 }
|
|
3873 # my @filtered_seq = split(/\t/,filter_gaps(@sequences) );
|
|
3874 }
|
|
3875 else{#print "\n";
|
|
3876 }
|
|
3877
|
|
3878 @sequences = (); @titles = (); $start = 0;$species_counter = "0";
|
|
3879 next;
|
|
3880
|
|
3881 }
|
|
3882 }
|
|
3883 # print "countermatch = $countermatch\n";
|
|
3884 }
|
|
3885
|
|
3886 sub reorderSpecies{
|
|
3887 my @inarr=@_;
|
|
3888 my $currSpecies = shift (@inarr);
|
|
3889 my $ordered_species = 0;
|
|
3890 my @species=@inarr;
|
|
3891 foreach my $order (@species){
|
|
3892 $ordered_species = $ordered_species."_".$order if $currSpecies=~ /$order/;
|
|
3893 }
|
|
3894 return $ordered_species;
|
|
3895
|
|
3896 }
|
|
3897
|
|
3898 sub filter_gaps{
|
|
3899 my @sequences = @_;
|
|
3900 # print "sequences sent are @sequences\n";
|
|
3901 my $seq_length = length($sequences[0]);
|
|
3902 my $seq_no = scalar(@sequences);
|
|
3903 my $allgaps = ();
|
|
3904 for (1 ... $seq_no){
|
|
3905 $allgaps = $allgaps."-";
|
|
3906 }
|
|
3907
|
|
3908 my @seq_array = ();
|
|
3909 my $seq_counter = 0;
|
|
3910 foreach my $seq (@sequences){
|
|
3911 # my @sequence = split(/\s*/,$seq);
|
|
3912 $seq_array[$seq_counter] = [split(/\s*/,$seq)];
|
|
3913 # push @seq_array, [@sequence];
|
|
3914 $seq_counter++;
|
|
3915 }
|
|
3916 my $g = 0;
|
|
3917 while ( $g < $seq_length){
|
|
3918 last if (!exists $seq_array[0][$g]);
|
|
3919 my $bases = ();
|
|
3920 for my $u (0 ... $#seq_array){
|
|
3921 $bases = $bases.$seq_array[$u][$g];
|
|
3922 }
|
|
3923 # print $bases, "\n";
|
|
3924 if ($bases eq $allgaps){
|
|
3925 # print "bases are $bases, position is $g \n";
|
|
3926 for my $seq (@seq_array){
|
|
3927 splice(@$seq , $g, 1);
|
|
3928 }
|
|
3929 }
|
|
3930 else {
|
|
3931 $g++;
|
|
3932 }
|
|
3933 }
|
|
3934
|
|
3935 my @outs = ();
|
|
3936
|
|
3937 foreach my $seq (@seq_array){
|
|
3938 push(@outs, join("",@$seq));
|
|
3939 }
|
|
3940 return "SHORT" if length($outs[0]) <=100;
|
|
3941 return (join("\n", @outs));
|
|
3942 }
|
|
3943
|
|
3944
|
|
3945 sub allowedSetOfSpecies{
|
|
3946 my @allowed_species = split(/_/,$_[0]);
|
|
3947 unshift @allowed_species, 0;
|
|
3948 # print "allowed set = @allowed_species \n";
|
|
3949 my @output = ();
|
|
3950 for (0 ... scalar(@allowed_species) - 4){
|
|
3951 push(@output, join("_",@allowed_species));
|
|
3952 pop @allowed_species;
|
|
3953 }
|
|
3954 return join(";",reverse(@output));
|
|
3955
|
|
3956 }
|
|
3957
|
|
3958
|
|
3959 sub orderInfo{
|
|
3960 my @info = split(/;/,$_[0]);
|
|
3961 # print "info = @info";
|
|
3962 my @old = split(/_/,$_[1]);
|
|
3963 my @new = split(/_/,$_[2]);
|
|
3964 shift @old; shift @new;
|
|
3965 my @outinfo = ();
|
|
3966 foreach my $spe (@new){
|
|
3967 for my $no (0 ... $#old){
|
|
3968 if ($spe eq $old[$no]){
|
|
3969 push(@outinfo, $info[$no]);
|
|
3970 }
|
|
3971 }
|
|
3972 }
|
|
3973 # print "outinfo = @outinfo \n";
|
|
3974 return join(";", @outinfo);
|
|
3975 }
|
|
3976
|
|
3977 #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx
|
|
3978
|
|
3979 sub printarr {
|
|
3980 print ">::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n";
|
|
3981 foreach my $line (@_) {print "$line\n";}
|
|
3982 print "::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::<\n";
|
|
3983 }
|
|
3984
|