0
|
1 #!/usr/bin/perl
|
|
2 # ARTS: Automated Randomization of multiple Traits for Study Design, using diploidly GA
|
|
3 # Mark Maienschein-Cline, last updated 8/19/2013
|
|
4 # mmaiensc@uic.edu
|
|
5 # Center for Research Informatics, University of Illinois at Chicago
|
|
6 #
|
|
7 # Copyright (C) 2013 Mark Maienschein-Cline
|
|
8 #
|
|
9 # This program is free software; you can redistribute it and/or modify
|
|
10 # it under the terms of the GNU General Public License as published by
|
|
11 # the Free Software Foundation; either version 2 of the License, or
|
|
12 # (at your option) any later version.
|
|
13 #
|
|
14 # This program is distributed in the hope that it will be useful,
|
|
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
17 # GNU General Public License for more details.
|
|
18 #
|
|
19 # You should have received a copy of the GNU General Public License along
|
|
20 # with this program; if not, write to the Free Software Foundation, Inc.,
|
|
21 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
22
|
|
23
|
|
24
|
|
25
|
|
26
|
|
27 use Getopt::Long qw(:config no_ignore_case);
|
|
28 #use Time::HiRes qw( clock_gettime );
|
|
29 use Math::Trig;
|
|
30 $|++;
|
|
31
|
|
32 #
|
|
33 # initialize random number parameters
|
|
34 #
|
|
35 &ran1_init();
|
|
36
|
|
37 #
|
|
38 # read command line
|
|
39 #
|
|
40 &read_command_line();
|
|
41
|
|
42 #
|
|
43 # read phenotype list: print the title lines of columns used for verbose
|
|
44 #
|
|
45 &read_data();
|
|
46 if( $verb eq "y" || $verb eq "l" ){
|
|
47 printf("Using traits:");
|
|
48 for($i=0; $i<= $#allcols; $i++){
|
|
49 print "\t$titlevals[$allcols[$i]]";
|
|
50 }
|
|
51 print "\n";
|
|
52 printf("Using trait combinations:");
|
|
53 for($i=0; $i<= $#cols; $i++){
|
|
54 printf("\t{%s", $titlevals[$cols[$i][0]]);
|
|
55 for($j=1; $j<= $#{$cols[$i]}; $j++){
|
|
56 printf(",%s", $titlevals[$cols[$i][$j]]);
|
|
57 }
|
|
58 printf("}");
|
|
59 }
|
|
60 print "\n";
|
|
61 }
|
|
62
|
|
63 #
|
|
64 # initialize GA parameters
|
|
65 #
|
|
66 &ga_init();
|
|
67
|
|
68 #
|
|
69 # if using the batchcolumn, fill in the batch
|
|
70 #
|
|
71 if( $bcolumn ne "" ){
|
|
72 if( $verb eq "y" ){
|
|
73 printf("Looking at column %i (%s) for batch assignments\n", $bcolumn+1, $titlevals[$bcolumn]);
|
|
74 }
|
|
75 # fill in batch from last column of @data
|
|
76 $numbatches = 0;
|
|
77 $foundbatchhash = {};
|
|
78 @batchsizes = ();
|
|
79 for($i=0; $i<=$#data; $i++){
|
|
80 if( $foundbatchhash->{$data[$i][$bcolumn]} eq "" ){
|
|
81 $foundbatchhash->{$data[$i][$bcolumn]} = $numbatches;
|
|
82 $numbatches++;
|
|
83 push(@batchnames, $data[$i][$bcolumn]);
|
|
84 push(@batchsizes, 0);
|
|
85 }
|
|
86 $batchsizes[$foundbatchhash->{$data[$i][$bcolumn]}]++;
|
|
87 $data[$i][$#{$data[0]}] = $data[$i][$bcolumn];
|
|
88 }
|
|
89 $mi = &mutual_info( $numbatches );
|
|
90 $bestmi = $mi;
|
|
91 }
|
|
92
|
|
93 #
|
|
94 # else do sampling: run GA
|
|
95 #
|
|
96 if( $bcolumn eq "" ){
|
|
97 &initialize_population();
|
|
98
|
|
99 $oldavg = 1;
|
|
100 $err = 0.0001;
|
|
101 for($n=0; $n< $numgen; $n++){
|
|
102 &add_immigrants();
|
|
103 @population = &permute( \@population );
|
|
104 $k = 0;
|
|
105 $k = &crossover( $k );
|
|
106 $k = &mutate( $k );
|
|
107 $k = &add_parents( $k );
|
|
108 @pool = sort{$a->{score} <=> $b->{score}} @pool;
|
|
109 $average = &fill_population();
|
|
110
|
|
111 # check if we've done enough already, and print out status
|
|
112 if( $verb eq "y" ){printf(" Generation %i of %i, average fitness %0.4f\n", $n+1, $numgen, $average );}
|
|
113 if( $oldavg >= $average && $oldavg - $average < $err ){last;}
|
|
114 $oldavg = $average;
|
|
115 }
|
|
116
|
|
117 # save the final best one
|
|
118 for($i=0; $i<= $#data; $i++){
|
|
119 &fill_assignments( \@{$population[0]->{assignments}} );
|
|
120 }
|
|
121 }
|
|
122
|
|
123 #
|
|
124 # print final log to stdout
|
|
125 #
|
|
126 if( $verb eq "y" || $verb eq "l" ){&print_info;}
|
|
127
|
|
128 #
|
|
129 # print result
|
|
130 #
|
|
131 if( $out ne "" ){
|
|
132 open(OUT,">$out");
|
|
133 print OUT "$title\t$bname\n";
|
|
134 for($i=0; $i<= $#data; $i++){
|
|
135 $orig[$i][1] = $data[$i][$#{$data[0]}];
|
|
136 printf OUT ("%s\t%i\n", $orig[$i][0], $orig[$i][1]);
|
|
137 }
|
|
138 close(OUT);
|
|
139 }
|
|
140
|
|
141 ###############
|
|
142 # SUBROUTINES #
|
|
143 ###############
|
|
144
|
|
145 # read command line options
|
|
146 sub read_command_line{
|
|
147 my $i;
|
|
148
|
|
149 #
|
|
150 # option default values
|
|
151 #
|
|
152 $in = "";
|
|
153 $out = "";
|
|
154 $bcolumn = "";
|
|
155 $batch = "";
|
|
156 $bname = "batch";
|
|
157 $phenocols = "";
|
|
158 $contcols = "";
|
|
159 $datecols = "";
|
|
160 $bins = 5;
|
|
161 @blist = ();
|
|
162 $verb = "y";
|
|
163 $mmi = 0;
|
|
164
|
|
165 $options = "
|
|
166 Usage: ./ARTS.pl <OPTIONS>
|
|
167 REQUIRED:
|
|
168 -i input traits (rectangular, tab-delimited matrix, including title line with column names)
|
|
169 -c trait columns to randomize
|
|
170 comma- and semicolon delimited list, columns indexed from 1
|
|
171 all traits indicated by commas are used in joint distributions
|
|
172 AND EITHER -b AND -o, OR -p:
|
|
173 -b batch sizes (a single number, or a comma-delimited list)
|
|
174 -o output file (formatted same as input file, with batch added as last column)
|
|
175 -or-
|
|
176 -p <batch column index>: print MI statistic for input traits using this column as batch designations
|
|
177 -p will not do any sampling
|
|
178 OTHER OPTIONS:
|
|
179 -cc continuously-valued columns (will be binned)
|
|
180 -cd date-valued columns (should be M/D/Y); should also list these as continuous (in -cc)
|
|
181 -cb number of bins to use for continuous or date columns (default: $bins for each)
|
|
182 can give 1 value, or a list of the same length as -cc; if a list, will be assigned in the same order as -cc
|
|
183 -bn batch name (title of added column, default $bname)
|
|
184 -s random number seed (large negative integer, default: $seed)
|
|
185 ";
|
|
186
|
|
187 #
|
|
188 # Secret options:
|
|
189 # -v y or l (verbose: print all, or just print status from beginning or end)
|
|
190 # -mmi force use of MMI objective function on all columns indicated by -c, over-riding any other settings from -c
|
|
191 #
|
|
192
|
|
193 GetOptions('i=s' => \$in,
|
|
194 'o=s' => \$out,
|
|
195 'p=i' => \$bcolumn,
|
|
196 'b=s' => \$batch,
|
|
197 'c=s' => \$phenocols,
|
|
198 'cc=s' => \$contcols,
|
|
199 'cd=s' => \$datecols,
|
|
200 'cb=s' => \$bins,
|
|
201 'bn=s' => \$bname,
|
|
202 's=i' => \$seed,
|
|
203 'mmi' => \$mmi,
|
|
204 'v=s' => \$verb,
|
|
205 ) || die "$options\n";
|
|
206
|
|
207 #
|
|
208 # check that required inputs exist
|
|
209 #
|
|
210 if( $in eq "" ){&exit_required("i");}
|
|
211 if( ($out eq "" || $batch eq "") && $bcolumn eq "" ){&exit_required("b and -o, or -p,");}
|
|
212 if( $phenocols eq "" || $phenocols eq "None" ){&exit_required("c");}
|
|
213
|
|
214 #
|
|
215 # check that inputs values are OK
|
|
216 #
|
|
217 if( $bcolumn ne "" ){
|
|
218 if( $bcolumn < 1 ){&exit_err("p","at least 1");}
|
|
219 $bcolumn--;
|
|
220 }
|
|
221 if( $verb ne "y" && $verb ne "n" && $verb ne "l" ){&exit_err("v","y or n or l");}
|
|
222 if( $seed > 0 ){$seed*= -1;}
|
|
223 if( $seed == 0 ){&exit_err("s","non-zero");}
|
|
224
|
|
225 #
|
|
226 # if mmi, reset phenocols value using all found columns
|
|
227 #
|
|
228 if( $mmi ){
|
|
229 @initcs = split(/[,;]/, $phenocols);
|
|
230 # remove duplicates
|
|
231 @clist = ();
|
|
232 $cinds = {};
|
|
233 for($i=0; $i<= $#initcs; $i++){
|
|
234 if( $cinds->{$initcs[$i]} eq "" ){
|
|
235 $cinds->{$initcs[$i]} = 1;
|
|
236 push(@clist, $initcs[$i]);
|
|
237 }
|
|
238 }
|
|
239 # add to new phenocols
|
|
240 $phenocols = "$clist[0]";
|
|
241 for($i=1; $i<= $#clist; $i++){
|
|
242 $phenocols = sprintf("%s,%s", $phenocols, $clist[$i]);
|
|
243 }
|
|
244 $phenocols = sprintf("%s;%s", $phenocols, $clist[0]);
|
|
245 for($i=1; $i<= $#clist; $i++){
|
|
246 $phenocols = sprintf("%s;%s", $phenocols, $clist[$i]);
|
|
247 }
|
|
248 }
|
|
249
|
|
250 #
|
|
251 # extract phenotype columns
|
|
252 #
|
|
253 @cols = ();
|
|
254 @allcols = ();
|
|
255 $alllist = {};
|
|
256 @jointlist = split(';',$phenocols);
|
|
257 for($i=0; $i<= $#jointlist; $i++){
|
|
258 @tmp = split(',',$jointlist[$i]);
|
|
259 @tmp = &fix_cols( \@tmp );
|
|
260 push(@cols, [@tmp]);
|
|
261 for($j=0; $j<= $#tmp; $j++){
|
|
262 if( $alllist->{$tmp[$j]} eq "" ){
|
|
263 $alllist->{$tmp[$j]} = 1;
|
|
264 push(@allcols, $tmp[$j]);
|
|
265 }
|
|
266 }
|
|
267 }
|
|
268
|
|
269 #
|
|
270 # extract continuous and date columns
|
|
271 # sort continuous columns so that bins correspond to them in order
|
|
272 #
|
|
273 if( $contcols ne "" && $contcols ne "None" ){
|
|
274 @conts = split(',',$contcols);
|
|
275 @conts = &fix_cols( \@conts );
|
|
276 $numconts = $#conts+1;
|
|
277 }
|
|
278 if( $datecols ne "" && $datecols ne "None" ){
|
|
279 @dates = split(',',$datecols);
|
|
280 @dates = &fix_cols( \@dates );
|
|
281 $numdates = $#dates+1;
|
|
282 # check that date columns are among continuous columns
|
|
283 for($i=0; $i<= $#dates; $i++){
|
|
284 for($j=0; $j<= $#conts; $j++){
|
|
285 if( $dates[$i] == $conts[$j] ){last;}
|
|
286 if( $j==$#conts ){
|
|
287 printf("Error: please specify date column %i as continuous\n", $dates[$i]+1 );
|
|
288 die;
|
|
289 }
|
|
290 }
|
|
291 }
|
|
292 }
|
|
293 if( $bins =~ /,/ ){
|
|
294 @blist = split(',',$bins);
|
|
295 if( $#blist+1 != $#conts + 1 ){
|
|
296 printf("Error: you input %i bins, but %i columns that need binning\n", $#blist+1, $#conts+1);
|
|
297 die;
|
|
298 }
|
|
299 }
|
|
300 else{
|
|
301 for($i=0; $i<= $#conts; $i++){
|
|
302 push(@blist, $bins);
|
|
303 }
|
|
304 }
|
|
305 }
|
|
306
|
|
307 # print error message for flag $_[0], with correct values $_[1], and print usage
|
|
308 sub exit_err{
|
|
309 printf("Error: set -%s to be %s\n%s\n", $_[0], $_[1], $options);
|
|
310 exit;
|
|
311 }
|
|
312 # print error message saying flag $_[0] is required
|
|
313 sub exit_required{
|
|
314 printf("Error: option -%s is required\n%s\n", $_[0], $options);
|
|
315 exit;
|
|
316 }
|
|
317
|
|
318 # fix all indices in array $_[0]: cast to integer, check at least 1, and subtract 1
|
|
319 sub fix_cols{
|
|
320 my @list;
|
|
321 my $i;
|
|
322 @list = @{$_[0]};
|
|
323 for($i=0; $i<= $#list; $i++){
|
|
324 $list[$i] = sprintf("%i", $list[$i]);
|
|
325 if( $list[$i] < 1 ){
|
|
326 print "Error: column indices should be at least 1\n";
|
|
327 die;
|
|
328 }
|
|
329 $list[$i]--;
|
|
330 }
|
|
331 return @list;
|
|
332 }
|
|
333
|
|
334 # print info about best randomization
|
|
335 sub print_info{
|
|
336 #
|
|
337 # get MI of each phenotype and average
|
|
338 #
|
|
339 $bestmi = &mutual_info();
|
|
340 @bestmilist = &individual_mi( $numbatches );
|
|
341 $bestavgmi = 0;
|
|
342 for($i=0; $i<= $#bestmilist; $i++){
|
|
343 $bestavgmi+= $bestmilist[$i]/($#bestmilist+1);
|
|
344 }
|
|
345
|
|
346 printf("Final MI %0.4f ; Individual trait MIs (mean %0.4f ): ", $bestmi, $bestavgmi);
|
|
347 for($i=0; $i<= $#bestmilist; $i++){
|
|
348 printf("\t%0.4f", $bestmilist[$i]);
|
|
349 }
|
|
350 print "\n-----------------------------------------------------------------\n";
|
|
351 #
|
|
352 # print the counts for each phenotype in each batch
|
|
353 #
|
|
354 # first title line: phenotype names
|
|
355 for($i=0; $i<= $#allcols; $i++){
|
|
356 printf("\t%s values", $titlevals[$allcols[$i]]);
|
|
357 for($j=1; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
|
|
358 printf("\t");
|
|
359 }
|
|
360 }
|
|
361 print "\nBatch (size)";
|
|
362 # second title line: phenotype values
|
|
363 for($i=0; $i<= $#allcols; $i++){
|
|
364 for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
|
|
365 if( $items->{$allcols[$i]}->{list}[$j] ne "" ){printf("\t%s", &name($items->{$allcols[$i]}->{list}[$j], $allcols[$i]) );}
|
|
366 else{printf("\tempty");}
|
|
367 }
|
|
368 }
|
|
369 print "\n-------";
|
|
370 # print a line of dashes to separate
|
|
371 for($i=0; $i<= $#allcols; $i++){
|
|
372 for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
|
|
373 printf("\t-------");
|
|
374 }
|
|
375 }
|
|
376 print "\n";
|
|
377 for($k=0; $k< $numbatches; $k++){
|
|
378 printf("%s (%i)", $batchnames[$k], $batchsizes[$k] );
|
|
379 for($i=0; $i<= $#allcols; $i++){
|
|
380 for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
|
|
381 printf("\t%i", &count( $#{$data[0]}, $batchnames[$k], $allcols[$i], $items->{$allcols[$i]}->{list}[$j] ) );
|
|
382 }
|
|
383 }
|
|
384 print "\n";
|
|
385 }
|
|
386 print "-------";
|
|
387 # print a line of dashes to separate
|
|
388 for($i=0; $i<= $#allcols; $i++){
|
|
389 for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
|
|
390 printf("\t-------");
|
|
391 }
|
|
392 }
|
|
393 # print totals for each type
|
|
394 print "\nTotal";
|
|
395 for($i=0; $i<= $#allcols; $i++){
|
|
396 for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
|
|
397 printf("\t%i", $items->{$allcols[$i]}->{$items->{$allcols[$i]}->{list}[$j]}[1] );
|
|
398 }
|
|
399 }
|
|
400 print "\n";
|
|
401 }
|
|
402
|
|
403 # for continuous valued columns, checked by $_[1], convert value $_[0] back to a range
|
|
404 sub name{
|
|
405 my $i;
|
|
406 my $binw;
|
|
407
|
|
408 #
|
|
409 # if there aren't any continuous columns, or $_[1] doesn't match one, just return $_[0]
|
|
410 #
|
|
411 if( $#conts < 0 ){return $_[0];}
|
|
412 for($i=0; $i<= $#conts; $i++){
|
|
413 if( $_[1] == $conts[$i] ){last;}
|
|
414 if( $i==$#conts ){return $_[0];}
|
|
415 }
|
|
416
|
|
417 #
|
|
418 # convert bin value back to continuous value
|
|
419 #
|
|
420 $binw = ($contstats[$i][2]-$contstats[$i][0])/$blist[$i];
|
|
421 $val1 = $binw*$_[0]+$contstats[$i][0];
|
|
422 $val2 = $binw*($_[0]+1)+$contstats[$i][0];
|
|
423
|
|
424 #
|
|
425 # if there aren't any date columns, or $_[1] doesn't match one, just return the range val1-val2
|
|
426 #
|
|
427 if( $#dates < 0 ){return sprintf("%s-%s", $val1, $val2);}
|
|
428 for($i=0; $i<= $#dates; $i++){
|
|
429 if( $_[1] == $dates[$i] ){last;}
|
|
430 if( $i==$#dates ){return sprintf("%s-%s", $val1, $val2);}
|
|
431 }
|
|
432 $val1 = sprintf("%i", $val1);
|
|
433 $val2 = sprintf("%i", $val2);
|
|
434 return sprintf("%s-%s", &convert_date( $val1 ), &convert_date( $val2 ));
|
|
435
|
|
436 }
|
|
437
|
|
438 # read in regular matrix from $in
|
|
439 # for continuous (including date-value) columns, make histograms
|
|
440 sub read_data{
|
|
441 my @lines;
|
|
442 my $i;
|
|
443 @data = ();
|
|
444 $items = {};
|
|
445 @orig = ();
|
|
446 @titlevals = ();
|
|
447 @batchsizes = ();
|
|
448 $numbatches = 0;
|
|
449
|
|
450 #
|
|
451 # fix newline convention
|
|
452 #
|
|
453
|
|
454
|
|
455 open(IN,"$in") || die "Error: can't open $in\n";
|
|
456 #
|
|
457 # read in all lines and check formatting
|
|
458 #
|
|
459 @lines = <IN>;
|
|
460 if( $lines[0] =~ /\r/ && $#lines == 0 ){
|
|
461 # this happens with tab-delimited text saved from excel
|
|
462 @lines = split('\r', $lines[0]);
|
|
463 }
|
|
464
|
|
465 #
|
|
466 # read title line
|
|
467 #
|
|
468 $title = $lines[0];
|
|
469 chomp($title);
|
|
470 @titlevals = split('\t',$title);
|
|
471 for($k=1; $k<= $#lines; $k++){
|
|
472 $line = $lines[$k];
|
|
473 chomp($line);
|
|
474 if( $line ne "" ){ # ignore blank lines
|
|
475 @parts = split('\t',$line);
|
|
476 for($i=$#parts+1; $i<= $#titlevals; $i++){
|
|
477 push(@parts, "");
|
|
478 }
|
|
479 if( $#parts != $#titlevals ){
|
|
480 printf("Error: not enough columns in line %i\n", $#data+2);
|
|
481 die;
|
|
482 }
|
|
483 # push 1 extra for the batch
|
|
484 push(@parts, 0);
|
|
485 push(@data, [(@parts)] );
|
|
486 push(@orig, [($line, 0)] );
|
|
487 }
|
|
488 }
|
|
489 close(IN);
|
|
490
|
|
491 #
|
|
492 # exit if no data read
|
|
493 #
|
|
494 if( $#data < 0 ){
|
|
495 printf("Error: no samples were read in\n");
|
|
496 die;
|
|
497 }
|
|
498
|
|
499 #
|
|
500 # if batch is not empty, check batches:
|
|
501 # if no commas, cast to integer and count how many are needed
|
|
502 # if there are commas, get batched on given sizes
|
|
503 # double check that we add up
|
|
504 #
|
|
505 @batchnames = ();
|
|
506 if( $batch ne "" ){
|
|
507 if( $batch !~ /,/ ){
|
|
508 $batch = sprintf("%i", $batch);
|
|
509 # fix batch size if too big
|
|
510 if( $batch > $#data + 1 ){
|
|
511 printf("Warning: you have %i samples, but asked for a batch size of %i, so there is only 1 batch\n", $#data+1, $batch);
|
|
512 $batch = $#data+1;
|
|
513 }
|
|
514 $numbatches = ($#data+1)/$batch;
|
|
515 $exactbatches = sprintf("%i", $numbatches);
|
|
516 if( $exactbatches < $numbatches ){$exactbatches++;}
|
|
517 $numbatches = $exactbatches;
|
|
518 for($i=0; $i< $numbatches-1; $i++){
|
|
519 push(@batchsizes, $batch);
|
|
520 }
|
|
521 push(@batchsizes, $batch - ($numbatches*$batch - ($#data+1)) );
|
|
522 }
|
|
523 else{
|
|
524 @batchsizes = split(',',$batch);
|
|
525 $numbatches = $#batchsizes+1;
|
|
526 }
|
|
527 $tot = 0;
|
|
528 for($i=0; $i< $numbatches; $i++){
|
|
529 push(@batchnames, $i+1);
|
|
530 $tot+= $batchsizes[$i];
|
|
531 }
|
|
532 if( $tot != $#data+1 ){
|
|
533 printf("Error: have %i spaces in all batches, but %i samples\n", $tot, $#data+1);
|
|
534 die;
|
|
535 }
|
|
536 }
|
|
537
|
|
538 #
|
|
539 # convert dates to numbers
|
|
540 #
|
|
541 for($i=0; $i<= $#data; $i++){
|
|
542 for($j=0; $j<= $#dates; $j++){
|
|
543 if( $data[$i][$dates[$j]] ne "" ){$data[$i][$dates[$j]] = &convert_date( $data[$i][$dates[$j]] );}
|
|
544 }
|
|
545 }
|
|
546
|
|
547 #
|
|
548 # for all continuous columns, compute median and fill in missing values
|
|
549 # also record max and min for binning
|
|
550 #
|
|
551 @contstats = (); # records min, median, max for each continuous column
|
|
552 for($j=0; $j<= $#conts; $j++){
|
|
553 @tmp = ();
|
|
554 for($i=0; $i<= $#data; $i++){
|
|
555 if( $data[$i][$conts[$j]] ne "" ){push(@tmp, $data[$i][$conts[$j]]);}
|
|
556 }
|
|
557 @tmp = sort{ $a <=> $b } @tmp;
|
|
558 $median = $tmp[sprintf("%i", ($#tmp+1)/2)];
|
|
559 push(@contstats, [($tmp[0], $median, $tmp[$#tmp])] );
|
|
560 # for($i=0; $i<= $#data; $i++){
|
|
561 # if( $data[$i][$conts[$j]] eq "" ){$data[$i][$conts[$j]] = $median;}
|
|
562 # }
|
|
563 }
|
|
564
|
|
565 #
|
|
566 # for all continuous columns, bin data
|
|
567 #
|
|
568 for($j=0; $j<= $#conts; $j++){
|
|
569 $binw = ($contstats[$j][2] - $contstats[$j][0])/($blist[$j]);
|
|
570 if( $binw == 0 ){
|
|
571 printf("Error: max and min of column %i are equal (max/min are %f/%f)\n", $conts[$j]+1, $contstats[$j][2], $contstats[$j][0] );
|
|
572 die;
|
|
573 }
|
|
574 for($i=0; $i<= $#data; $i++){
|
|
575 if( $data[$i][$conts[$j]] ne "" ){
|
|
576 $data[$i][$conts[$j]] = sprintf("%i", ($data[$i][$conts[$j]] - $contstats[$j][0])/$binw);
|
|
577 if( $data[$i][$conts[$j]] >= $blist[$j] ){$data[$i][$conts[$j]] = $blist[$j]-1;}
|
|
578 }
|
|
579 }
|
|
580 }
|
|
581
|
|
582 #
|
|
583 # for each column we're using, count how many item types there are
|
|
584 # empty phenotypes are considered their own, distinct phenotype
|
|
585 #
|
|
586 $items = &itemize( \@allcols );
|
|
587 }
|
|
588
|
|
589 # count how many item types of @{$_[0]} there are in @data
|
|
590 sub itemize{
|
|
591 my $i;
|
|
592 my $j;
|
|
593 my $info;
|
|
594 my @cols;
|
|
595 my $items;
|
|
596 @cols = @{$_[0]};
|
|
597
|
|
598 for($j=0; $j<= $#cols; $j++){
|
|
599 $info = {};
|
|
600 $info->{list} = ();
|
|
601 for($i=0; $i<= $#data; $i++){
|
|
602 if( $info->{$data[$i][$cols[$j]]} eq "" ){
|
|
603 $info->{$data[$i][$cols[$j]]} = [($#{$info->{list}}+1,0)];
|
|
604 push(@{$info->{list}}, $data[$i][$cols[$j]]);
|
|
605 }
|
|
606 $info->{$data[$i][$cols[$j]]}[1]++;
|
|
607 $info->{count}++;
|
|
608 }
|
|
609 $info->{num} = $#{$info->{list}}+1;
|
|
610 $items->{$cols[$j]} = $info;
|
|
611 }
|
|
612
|
|
613 for($j=0; $j<= $#cols; $j++){
|
|
614 # this set prints the number of values and counts for each phenotype
|
|
615 #printf("%i,%s:", $cols[$j], $titlevals[$cols[$j]]);
|
|
616 #for($k=0; $k<= $#{$items->{$cols[$j]}->{list}}; $k++){
|
|
617 # printf("\t%s,%i", $items->{$cols[$j]}->{list}[$k], $items->{$cols[$j]}->{$items->{$cols[$j]}->{list}[$k]}[1] );
|
|
618 #}
|
|
619 #print "\n";
|
|
620 if( $items->{$cols[$j]}->{num} > 20 ){
|
|
621 printf("Warning: column %i (%s) has %i values; should you make it continuous?\n", $cols[$j], $titlevals[$cols[$j]], $items->{$cols[$j]}->{num} );
|
|
622 }
|
|
623 }
|
|
624 return $items;
|
|
625 }
|
|
626
|
|
627 # convert date in M/D/Y to integer, or integer to M/D/Y
|
|
628 sub convert_date{
|
|
629 my $date;
|
|
630 my $month;
|
|
631 my $day;
|
|
632 my $year;
|
|
633 my $months;
|
|
634 my $i;
|
|
635 # cumulative days per month
|
|
636 $months->{0} = 0;
|
|
637 $months->{1} = 31;
|
|
638 $months->{2} = 59;
|
|
639 $months->{3} = 90;
|
|
640 $months->{4} = 120;
|
|
641 $months->{5} = 151;
|
|
642 $months->{6} = 181;
|
|
643 $months->{7} = 212;
|
|
644 $months->{8} = 243;
|
|
645 $months->{9} = 273;
|
|
646 $months->{10} = 304;
|
|
647 $months->{11} = 334;
|
|
648 $months->{12} = 365;
|
|
649 $date = $_[0];
|
|
650
|
|
651 # convert date to integer
|
|
652 if( $date =~ /\// ){
|
|
653 ($month, $day, $year) = split('/',$date);
|
|
654 $month = sprintf("%i", $month);
|
|
655 $day = sprintf("%i", $day);
|
|
656 $year = sprintf("%i", $year);
|
|
657 if( $month < 1 || $month > 12 ){
|
|
658 print "Error: found a month $month not between 1 and 12\n";
|
|
659 die;
|
|
660 }
|
|
661 if( $day < 1 || $day > 31 ){
|
|
662 print "Error: found a day $day not between 1 and 31\n";
|
|
663 die;
|
|
664 }
|
|
665
|
|
666 return $day + $months->{$month-1} + $year*$months->{12};
|
|
667 }
|
|
668 # convert integer to date
|
|
669 elsif( $date == sprintf("%i", $date) ){
|
|
670 $year = sprintf("%i", $date/($months->{12}));
|
|
671 $month = $date-$year*$months->{12};
|
|
672 for($i=1; $i<=12; $i++){
|
|
673 if( $month < $months->{$i} ){last;}
|
|
674 }
|
|
675 $day = $month - $months->{$i-1};
|
|
676 $month = $i;
|
|
677 return sprintf("%s/%s/%s", $month, $day, $year);
|
|
678 }
|
|
679 else{
|
|
680 printf("\nError: unrecognized format in convert_date(): %s\n", $date);
|
|
681 die;
|
|
682 }
|
|
683 }
|
|
684
|
|
685 # set globals used by ran1
|
|
686 sub ran1_init{
|
|
687 #
|
|
688 # random number variables
|
|
689 #
|
|
690 $iset = 0;
|
|
691 $gset = 0;
|
|
692 #$iseed = clock_gettime(CLOCK_REALTIME);
|
|
693 #($first, $second) = split('\.', $iseed);
|
|
694 #$seed = sprintf("-%i%i", $second, $first);
|
|
695 $seed = -10854829;
|
|
696 $M1 = 259200;
|
|
697 $IA1 = 7141;
|
|
698 $IC1 = 54773;
|
|
699 $RM1 = 1.0/$M1;
|
|
700 $M2 = 134456;
|
|
701 $IA2 = 8121;
|
|
702 $IC2 = 28411;
|
|
703 $RM2 = 1.0/$M2;
|
|
704 $M3 = 243000;
|
|
705 $IA3 = 4561;
|
|
706 $IC3 = 51349;
|
|
707 $iff = 0;
|
|
708 $ix1 = 0;
|
|
709 $ix2 = 0;
|
|
710 $ix3 = 0;
|
|
711 @ranarray = ();
|
|
712 for($i=0; $i< 98; $i++){
|
|
713 push(@ranarray, 0);
|
|
714 }
|
|
715 }
|
|
716
|
|
717 # uniform random number generator, seed, iff, and various capital-letter variables set in beginning
|
|
718 sub ran1{
|
|
719 my $j;
|
|
720 my $temp;
|
|
721
|
|
722 if( $seed < 0 || $iff == 0 ){
|
|
723 $iff = 1;
|
|
724 $ix1 = ($IC1 - $seed)%$M1;
|
|
725 $ix1 = ($IA1*$ix1 + $IC1)%$M1;
|
|
726 $ix2 = $ix1%$M2;
|
|
727 $ix1 = ($IA1*$ix1 + $IC1)%$M1;
|
|
728 $ix3 = $ix1%$M3;
|
|
729 for($j=1; $j<= 97; $j++){
|
|
730 $ix1 = ($IA1*$ix1 + $IC1)%$M1;
|
|
731 $ix2 = ($IA2*$ix2 + $IC2)%$M2;
|
|
732 $ranarray[$j] = ($ix1 + $ix2*$RM2)*$RM1;
|
|
733 }
|
|
734 $seed = 1;
|
|
735 }
|
|
736 $ix1 = ($IA1*$ix1 + $IC1)%$M1;
|
|
737 $ix2 = ($IA2*$ix2 + $IC2)%$M2;
|
|
738 $ix3 = ($IA3*$ix3 + $IC3)%$M3;
|
|
739
|
|
740 $j = sprintf("%i", 1 + ((97*$ix3)/$M3) );
|
|
741 if( $j> 97 || $j< 1 ){
|
|
742 printf("Error in ran1: $j outside of [1:97]\n");
|
|
743 die;
|
|
744 }
|
|
745 $temp = $ranarray[$j];
|
|
746 $ranarray[$j] = ($ix1 + $ix2*$RM2)*$RM1;
|
|
747 return $temp;
|
|
748 }
|
|
749
|
|
750 # permute array $_[0]
|
|
751 sub permute{
|
|
752 my @assignments;
|
|
753 my $i;
|
|
754 my $j;
|
|
755 my $tmp;
|
|
756
|
|
757 @assignments = @{$_[0]};
|
|
758
|
|
759 #
|
|
760 # shuffle batches randomly
|
|
761 #
|
|
762 for($i=$#assignments; $i>= 0; $i--){
|
|
763 $j = sprintf("%i", ($i+1)*&ran1() );
|
|
764 $tmp = $assignments[$j];
|
|
765 $assignments[$j] = $assignments[$i];
|
|
766 $assignments[$i] = $tmp;
|
|
767 }
|
|
768 return @assignments;
|
|
769 }
|
|
770
|
|
771 # fill data with assignments $_[0]
|
|
772 sub fill_assignments{
|
|
773 my @list;
|
|
774 my $i;
|
|
775 @list = @{$_[0]};
|
|
776 if( $#list != $#data ){
|
|
777 print "Error in fill_assignments: mismatching list lengths\n";
|
|
778 die;
|
|
779 }
|
|
780 for($i=0; $i<= $#list; $i++){
|
|
781 $data[$i][$#{$data[0]}] = $list[$i];
|
|
782 }
|
|
783 }
|
|
784
|
|
785 # compute mutual information of a batch assignment
|
|
786 sub mutual_info{
|
|
787 my $i;
|
|
788 my $s;
|
|
789 my $mi;
|
|
790 my $stot;
|
|
791
|
|
792 $mi = 0;
|
|
793 for($i=0; $i<= $#cols; $i++){
|
|
794 $mi += &this_mi( $_[0], $#{$data[0]}, \@{$cols[$i]} )/($#cols+1);
|
|
795 }
|
|
796
|
|
797 return $mi;
|
|
798 }
|
|
799
|
|
800 # compute all single-phenotype mutual information
|
|
801 sub individual_mi{
|
|
802 my $i;
|
|
803 my @list;
|
|
804 my @milist;
|
|
805
|
|
806 @milist = ();
|
|
807 for($i=0; $i<= $#allcols; $i++){
|
|
808 @list = ($allcols[$i]);
|
|
809 push(@milist, &this_mi( $_[0], $#{$data[0]}, \@list ) );
|
|
810 }
|
|
811 return @milist;
|
|
812 }
|
|
813
|
|
814 # compute mutual information of columns $_[1] ($_[0] bins) and all of @{$_[2]}
|
|
815 sub this_mi{
|
|
816 my $i;
|
|
817 my $j;
|
|
818 my $summand;
|
|
819 my @list;
|
|
820 my $jprob;
|
|
821 my $m1prob;
|
|
822 my $m2prob;
|
|
823 my $jbin;
|
|
824 my $m1bin;
|
|
825 my $m2bin;
|
|
826 my $jbinstot;
|
|
827 my $m1binstot;
|
|
828 my $m2binstot;
|
|
829 my @jbinlist;
|
|
830 my @m1binlist;
|
|
831 my @m2binlist;
|
|
832 my $mi;
|
|
833 my $s1;
|
|
834 my $s2;
|
|
835 my $s;
|
|
836 @list = @{$_[2]};
|
|
837
|
|
838 # initialize probabilities
|
|
839 $jprob = {}; # joint distribution
|
|
840 $m1prob = {}; # batch marginal dist
|
|
841 $m2prob = {}; # pheno marginal dist
|
|
842 @jbinlist = (); # phenotype combos found in joint distribution
|
|
843 @m1binlist = (); # batches found in batches distribution (1st marginal dist)
|
|
844 @m2binlist = (); # phenotype combos found in phenotypes distribution (2nd marginal dist)
|
|
845
|
|
846 #
|
|
847 # read through data and add to distributions
|
|
848 #
|
|
849 $summand = 1.0/($#data+1);
|
|
850 for($i=0; $i<= $#data; $i++){
|
|
851 #
|
|
852 # define bin names based on phenotype/batch
|
|
853 # for phenotypes p1, p2, etc., batch b:
|
|
854 # joint = p1_p2_..._pn_b
|
|
855 # 1st marginal = b
|
|
856 # 2nd marginal = p1_p2_..._pn
|
|
857 #
|
|
858 $jbin = sprintf("%s", $data[$i][$#{$data[0]}]);
|
|
859 $m1bin = sprintf("%s", $data[$i][$#{$data[0]}]);
|
|
860 $m2bin = "";
|
|
861 for($j=0; $j<= $#list; $j++){
|
|
862 # NOTE:
|
|
863 # $list[$j] is a phenotype column (e.g., gender)
|
|
864 # $data[$i][$list[$j]] is the value of that phenotype in sample $i (e.g., M or F)
|
|
865 # $items->{$list[$j]}->{$data[$i][$list[$j]]}[0] is the bin index (e.g., M->0, F->1) of that phenotype
|
|
866
|
|
867 $jbin = sprintf("%s_%i", $jbin, $items->{$list[$j]}->{$data[$i][$list[$j]]}[0]);
|
|
868 if( $j>0 ){$m2bin = sprintf("%s_", $m2bin);}
|
|
869 $m2bin = sprintf("%s%i", $m2bin, $items->{$list[$j]}->{$data[$i][$list[$j]]}[0]);
|
|
870 }
|
|
871
|
|
872 #
|
|
873 # check if we've already seen this bin, for each distribution
|
|
874 # initialize probabilities and add to list if it's the first time
|
|
875 #
|
|
876 if( $jprob->{$jbin} eq "" ){
|
|
877 $jprob->{$jbin} = 0;
|
|
878 push(@jbinlist, [($jbin, $m1bin, $m2bin)] );
|
|
879 }
|
|
880 if( $m1prob->{$m1bin} eq "" ){
|
|
881 $m1prob->{$m1bin} = 0;
|
|
882 push(@m1binlist, $m1bin);
|
|
883 }
|
|
884 if( $m2prob->{$m2bin} eq "" ){
|
|
885 $m2prob->{$m2bin} = 0;
|
|
886 push(@m2binlist, $m2bin);
|
|
887 }
|
|
888
|
|
889 #
|
|
890 # add a count to each distribution
|
|
891 #
|
|
892 $jprob->{$jbin} += $summand;
|
|
893 $m1prob->{$m1bin} += $summand;
|
|
894 $m2prob->{$m2bin} += $summand;
|
|
895 }
|
|
896
|
|
897 #
|
|
898 # compute mutual information, and entropy of m1prob and m2prob (for normalization)
|
|
899 #
|
|
900 $mi = 0;
|
|
901 $s1 = 0;
|
|
902 $s2 = 0;
|
|
903 for($i=0; $i<= $#jbinlist; $i++){
|
|
904 $mi+= ($jprob->{$jbinlist[$i][0]}) * log( ($jprob->{$jbinlist[$i][0]})/($m1prob->{$jbinlist[$i][1]} * $m2prob->{$jbinlist[$i][2]}) );
|
|
905 }
|
|
906 for($i=0; $i<= $#m1binlist; $i++){
|
|
907 $s1-= $m1prob->{$m1binlist[$i]} * log( $m1prob->{$m1binlist[$i]} );
|
|
908 }
|
|
909 for($i=0; $i<= $#m2binlist; $i++){
|
|
910 $s2-= $m2prob->{$m2binlist[$i]} * log( $m2prob->{$m2binlist[$i]} );
|
|
911 }
|
|
912 $s = sqrt($s1*$s2);
|
|
913
|
|
914 #
|
|
915 # normalize mi
|
|
916 #
|
|
917 if( $s>0 ){$mi/= $s;}
|
|
918
|
|
919 #
|
|
920 # return normalized mi (0=independent, 1=completely dependent)
|
|
921 #
|
|
922 return $mi;
|
|
923 }
|
|
924
|
|
925 # count how many of @data have column $_[0] equal $_[1] and column $_[2] equal $_[3]
|
|
926 sub count{
|
|
927 my $i;
|
|
928 my $tot;
|
|
929
|
|
930 $tot = 0;
|
|
931 for($i=0; $i<= $#data; $i++){
|
|
932 if( $data[$i][$_[0]] eq $_[1] && $data[$i][$_[2]] eq $_[3] ){$tot++;}
|
|
933 }
|
|
934 return $tot;
|
|
935 }
|
|
936
|
|
937 # initialize GA parameters and large matrices
|
|
938 sub ga_init{
|
|
939 my $i;
|
|
940 my $j;
|
|
941 my $info;
|
|
942
|
|
943 $popsize = 100;
|
|
944 $numgen = 300;
|
|
945 $nchrmuts = 2;
|
|
946 $nnewimm = 10;
|
|
947 $nkeepparents = 2;
|
|
948 $nchrpool = ($nnewimm+$popsize) + ($nnewimm+$popsize)*$nchrmuts + $nkeepparents;
|
|
949
|
|
950 #
|
|
951 # population to turn over each generation
|
|
952 #
|
|
953 @population = ();
|
|
954 for($i=0; $i< $popsize+$nnewimm; $i++){
|
|
955 $info = {};
|
|
956 $info->{score} = 0;
|
|
957 $info->{assignments} = [()];
|
|
958 for($j=0; $j<= $#data; $j++){
|
|
959 push(@{$info->{assignments}}, 0);
|
|
960 }
|
|
961 push(@population, $info);
|
|
962 }
|
|
963
|
|
964 #
|
|
965 # new individuals to fill each generation
|
|
966 #
|
|
967 @pool = ();
|
|
968 for($i=0; $i< $nchrpool; $i++){
|
|
969 $info = {};
|
|
970 $info->{score} = 0;
|
|
971 $info->{assignments} = [()];
|
|
972 for($j=0; $j<= $#data; $j++){
|
|
973 push(@{$info->{assignments}}, 0);
|
|
974 }
|
|
975 push(@pool, $info);
|
|
976 }
|
|
977
|
|
978 #
|
|
979 # array to randomize for batch assignments
|
|
980 #
|
|
981 @batched = ();
|
|
982 @bcounts = ();
|
|
983 for($i=0; $i<= $#batchsizes; $i++){
|
|
984 push(@bcounts, 0);
|
|
985 for($j=0; $j< $batchsizes[$i]; $j++){
|
|
986 push(@batched, $i+1);
|
|
987 }
|
|
988 }
|
|
989 }
|
|
990
|
|
991 # initialize the population array: randomize $popsize batches and score each one
|
|
992 sub initialize_population{
|
|
993 my $i;
|
|
994
|
|
995 for($i=0; $i< $popsize; $i++){
|
|
996 @{$population[$i]->{assignments}} = &permute( \@batched );
|
|
997 &fill_assignments( \@{$population[$i]->{assignments}} );
|
|
998 $population[$i]->{score} = &mutual_info( $numbatches );
|
|
999 }
|
|
1000 }
|
|
1001
|
|
1002 # complete the crossover step, keeping track of our index with $_[0]
|
|
1003 sub crossover{
|
|
1004 my $i;
|
|
1005 my $j;
|
|
1006 my $k;
|
|
1007
|
|
1008 $k = $_[0];
|
|
1009
|
|
1010 for($i=0; $i< $popsize + $nnewimm; $i+=2){
|
|
1011 &do_cross($i, $k);
|
|
1012 &fill_assignments( \@{$pool[$k]->{assignments}} );
|
|
1013 $pool[$k]->{score} = &mutual_info( $numbatches );
|
|
1014 $k++;
|
|
1015 &fill_assignments( \@{$pool[$k]->{assignments}} );
|
|
1016 $pool[$k]->{score} = &mutual_info( $numbatches );
|
|
1017 $k++;
|
|
1018 }
|
|
1019 return $k;
|
|
1020 }
|
|
1021
|
|
1022 # do a crossover between population members $_[0] and $_[0]+1, fill to pool members $_[1] and $_[1]+1
|
|
1023 sub do_cross{
|
|
1024 my $i;
|
|
1025 my $j;
|
|
1026 my $popmem;
|
|
1027 my $poolmem;
|
|
1028 my $index;
|
|
1029 my @swap;
|
|
1030 my @subswap;
|
|
1031 my $swapinds;
|
|
1032
|
|
1033 $popmem = $_[0];
|
|
1034 $poolmem = $_[1];
|
|
1035 $index = sprintf("%i", $#data * &ran1() );
|
|
1036
|
|
1037 #
|
|
1038 # count how many of each batch to switch
|
|
1039 #
|
|
1040 for($i=0; $i<= $#bcounts; $i++){
|
|
1041 $bcounts[$i] = 0;
|
|
1042 }
|
|
1043 for($i=0; $i<= $index; $i++){
|
|
1044 $bcounts[$population[$popmem]->{assignments}[$i] - 1]++;
|
|
1045 }
|
|
1046
|
|
1047 #
|
|
1048 # for each batch i:
|
|
1049 # record into subswap all indices with that batch from popmem+1
|
|
1050 # permute subswap
|
|
1051 # add first bcounts[$i] into swap
|
|
1052 # then sort swap
|
|
1053 #
|
|
1054 @swap = ();
|
|
1055 $swapinds = {};
|
|
1056 for($i=0; $i<= $#bcounts; $i++){
|
|
1057 @subswap = ();
|
|
1058 for($j=0; $j<= $#data; $j++){
|
|
1059 if( $population[$popmem+1]->{assignments}[$j] == $i+1 ){
|
|
1060 push(@subswap, $j);
|
|
1061 }
|
|
1062 }
|
|
1063 @subswap = &permute( \@subswap );
|
|
1064 for($j=0; $j< $bcounts[$i]; $j++){
|
|
1065 push(@swap, $subswap[$j]);
|
|
1066 $swapinds->{$subswap[$j]} = 1;
|
|
1067 }
|
|
1068 }
|
|
1069 @swap = sort{$a <=> $b} @swap;
|
|
1070
|
|
1071 #
|
|
1072 # fill start of first new chr from swap indices of second chr, and end from end of first chr
|
|
1073 #
|
|
1074 for($i=0; $i<= $index; $i++){
|
|
1075 $pool[$poolmem]->{assignments}[$i] = $population[$popmem+1]->{assignments}[$swap[$i]];
|
|
1076 }
|
|
1077 for($i=$index+1; $i<= $#data; $i++){
|
|
1078 $pool[$poolmem]->{assignments}[$i] = $population[$popmem]->{assignments}[$i];
|
|
1079 }
|
|
1080
|
|
1081 #
|
|
1082 # fill start of second chr from start of first chr, and end from remaining parts of second chr
|
|
1083 #
|
|
1084 for($i=0; $i<= $index; $i++){
|
|
1085 $pool[$poolmem+1]->{assignments}[$i] = $population[$popmem]->{assignments}[$i];
|
|
1086 }
|
|
1087 $j = $index+1;
|
|
1088 for($i=0; $i<= $#data; $i++){
|
|
1089 if( $swapinds->{$i} != 1 ){
|
|
1090 $pool[$poolmem+1]->{assignments}[$j] = $population[$popmem+1]->{assignments}[$i];
|
|
1091 $j++;
|
|
1092 }
|
|
1093 }
|
|
1094
|
|
1095
|
|
1096 #
|
|
1097 # check that batch counts are still ok
|
|
1098 #
|
|
1099 $checkbatch = 0;
|
|
1100 if( $checkbatch ){
|
|
1101 for($i=0; $i<= $#bcounts; $i++){
|
|
1102 $bcounts[$i] = 0;
|
|
1103 }
|
|
1104 for($i=0; $i<= $#data; $i++){
|
|
1105 $bcounts[$pool[$poolmem]->{assignments}[$i] - 1]++;
|
|
1106 }
|
|
1107 for($i=0; $i<= $#bcounts; $i++){
|
|
1108 if( $bcounts[$i] != $batchsizes[$i] ){
|
|
1109 print "Error in do_cross: lost some batch counts in first daughter chr\n";
|
|
1110 exit;
|
|
1111 }
|
|
1112 }
|
|
1113 for($i=0; $i<= $#bcounts; $i++){
|
|
1114 $bcounts[$i] = 0;
|
|
1115 }
|
|
1116 for($i=0; $i<= $#data; $i++){
|
|
1117 $bcounts[$pool[$poolmem+1]->{assignments}[$i] - 1]++;
|
|
1118 }
|
|
1119 for($i=0; $i<= $#bcounts; $i++){
|
|
1120 if( $bcounts[$i] != $batchsizes[$i] ){
|
|
1121 print "Error in do_cross: lost some batch counts in first daughter chr\n";
|
|
1122 exit;
|
|
1123 }
|
|
1124 }
|
|
1125 }
|
|
1126 }
|
|
1127
|
|
1128 # complete the mutation step, keeping track of our index with $_[0]
|
|
1129 sub mutate{
|
|
1130 my $i;
|
|
1131 my $j;
|
|
1132 my $k;
|
|
1133
|
|
1134 $k = $_[0];
|
|
1135
|
|
1136 for($i=0; $i< $popsize+$nnewimm; $i++){
|
|
1137 for($j=0; $j< $nchrmuts; $j++){
|
|
1138 &do_mutation($i, $k);
|
|
1139 &fill_assignments( \@{$pool[$k]->{assignments}} );
|
|
1140 $pool[$k]->{score} = &mutual_info( $numbatches );
|
|
1141 $k++;
|
|
1142 }
|
|
1143 }
|
|
1144 return $k;
|
|
1145 }
|
|
1146
|
|
1147 # do a mutation for population member $_[0], fill to pool member $_[1]
|
|
1148 sub do_mutation{
|
|
1149 my $i;
|
|
1150 my $popmem;
|
|
1151 my $poolmem;
|
|
1152 my $index1;
|
|
1153 my $index2;
|
|
1154
|
|
1155 $popmem = $_[0];
|
|
1156 $poolmem = $_[1];
|
|
1157
|
|
1158 #
|
|
1159 # fill all of poolmem
|
|
1160 #
|
|
1161 for($i=0; $i<= $#data; $i++){
|
|
1162 $pool[$poolmem]->{assignments}[$i] = $population[$popmem]->{assignments}[$i];
|
|
1163 }
|
|
1164
|
|
1165 #
|
|
1166 # switch two members
|
|
1167 #
|
|
1168 $index1 = sprintf("%i", ($#data+1) * &ran1() );
|
|
1169 $index2 = sprintf("%i", ($#data+1) * &ran1() );
|
|
1170
|
|
1171 $pool[$poolmem]->{assignments}[$index1] = $population[$popmem]->{assignments}[$index2];
|
|
1172 $pool[$poolmem]->{assignments}[$index2] = $population[$popmem]->{assignments}[$index1];
|
|
1173 }
|
|
1174
|
|
1175 # add immigrants, keeping track of our index with $_[0]
|
|
1176 sub add_immigrants{
|
|
1177 my $j;
|
|
1178
|
|
1179 for($j=0; $j< $nnewimm; $j++){
|
|
1180 @{$population[$popsize+$j]->{assignments}} = &permute( \@batched );
|
|
1181 &fill_assignments( \@{$population[$popsize+$j]->{assignments}} );
|
|
1182 $population[$popsize+$j]->{score} = &mutual_info( $numbatches );
|
|
1183 $k++;
|
|
1184 }
|
|
1185 }
|
|
1186
|
|
1187 # add top-scoring parents, keeping track of our index with $_[0]
|
|
1188 sub add_parents{
|
|
1189 my $i;
|
|
1190 my $j;
|
|
1191 my $k;
|
|
1192
|
|
1193 $k = $_[0];
|
|
1194 # sort population now
|
|
1195 @population = sort{$a->{score} <=> $b->{score}} @population;
|
|
1196
|
|
1197 for($j=0; $j< $nkeepparents; $j++){
|
|
1198 ©_parents( $j, $k );
|
|
1199 # will copy score also in copy_parents()
|
|
1200 $k++;
|
|
1201 }
|
|
1202 return $k;
|
|
1203 }
|
|
1204
|
|
1205 # add population member $_[0] to pool member $_[1]
|
|
1206 sub copy_parents{
|
|
1207 my $i;
|
|
1208 my $popmem;
|
|
1209 my $poolmem;
|
|
1210 my $index1;
|
|
1211 my $index2;
|
|
1212
|
|
1213 $popmem = $_[0];
|
|
1214 $poolmem = $_[1];
|
|
1215
|
|
1216 #
|
|
1217 # fill all of poolmem
|
|
1218 #
|
|
1219 for($i=0; $i<= $#data; $i++){
|
|
1220 $pool[$poolmem]->{assignments}[$i] = $population[$popmem]->{assignments}[$i];
|
|
1221 }
|
|
1222 $pool[$poolmem]->{score} = $population[$popmem]->{score};
|
|
1223 }
|
|
1224
|
|
1225 # copy top of pool to population (assumes pool is sorted)
|
|
1226 sub fill_population{
|
|
1227 my $i;
|
|
1228 my $j;
|
|
1229 my $m;
|
|
1230
|
|
1231 $m = 0;
|
|
1232 for($i=0; $i< $popsize; $i++){
|
|
1233 $population[$i]->{score} = $pool[$i]->{score};
|
|
1234 $m+= $population[$i]->{score};
|
|
1235 for($j=0; $j<= $#data; $j++){
|
|
1236 $population[$i]->{assignments}[$j] = $pool[$i]->{assignments}[$j];
|
|
1237 }
|
|
1238 }
|
|
1239 return $m/$popsize;
|
|
1240 }
|
|
1241
|
|
1242
|
|
1243
|
|
1244
|