Mercurial > repos > mmaiensc > arts

diff ARTS/ARTS.pl @ 0:3723b54935cb draft
Uploaded
author: mmaiensc
date: Wed, 13 Nov 2013 16:13:17 -0500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ARTS/ARTS.pl	Wed Nov 13 16:13:17 2013 -0500
@@ -0,0 +1,1244 @@
+#!/usr/bin/perl
+# ARTS: Automated Randomization of multiple Traits for Study Design, using diploidly GA
+# Mark Maienschein-Cline, last updated 8/19/2013
+# mmaiensc@uic.edu
+# Center for Research Informatics, University of Illinois at Chicago
+#
+# Copyright (C) 2013 Mark Maienschein-Cline
+#
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License along
+#    with this program; if not, write to the Free Software Foundation, Inc.,
+#    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+
+
+
+
+use Getopt::Long qw(:config no_ignore_case);
+#use Time::HiRes qw( clock_gettime );
+use Math::Trig;
+$|++;
+
+#
+# initialize random number parameters
+#
+&ran1_init();
+
+#
+# read command line
+#
+&read_command_line();
+
+#
+# read phenotype list: print the title lines of columns used for verbose
+#
+&read_data();
+if( $verb eq "y" || $verb eq "l" ){
+	printf("Using traits:");
+	for($i=0; $i<= $#allcols; $i++){
+		print "\t$titlevals[$allcols[$i]]";
+	}
+	print "\n";
+	printf("Using trait combinations:");
+	for($i=0; $i<= $#cols; $i++){
+		printf("\t{%s", $titlevals[$cols[$i][0]]);
+		for($j=1; $j<= $#{$cols[$i]}; $j++){
+			printf(",%s", $titlevals[$cols[$i][$j]]);
+		}
+		printf("}");
+	}
+	print "\n";
+}
+
+#
+# initialize GA parameters
+#
+&ga_init();
+
+#
+# if using the batchcolumn, fill in the batch
+#
+if( $bcolumn ne "" ){
+	if( $verb eq "y" ){
+		printf("Looking at column %i (%s) for batch assignments\n", $bcolumn+1, $titlevals[$bcolumn]);
+	}
+	# fill in batch from last column of @data
+	$numbatches = 0;
+	$foundbatchhash = {};
+	@batchsizes = ();
+	for($i=0; $i<=$#data; $i++){
+		if( $foundbatchhash->{$data[$i][$bcolumn]} eq "" ){
+			$foundbatchhash->{$data[$i][$bcolumn]} = $numbatches;
+			$numbatches++;
+			push(@batchnames, $data[$i][$bcolumn]);
+			push(@batchsizes, 0);
+		}
+		$batchsizes[$foundbatchhash->{$data[$i][$bcolumn]}]++;
+		$data[$i][$#{$data[0]}] = $data[$i][$bcolumn];
+	}
+	$mi = &mutual_info( $numbatches );
+	$bestmi = $mi;
+}
+
+#
+# else do sampling: run GA
+#
+if( $bcolumn eq "" ){
+	&initialize_population();
+
+	$oldavg = 1;
+	$err = 0.0001;
+	for($n=0; $n< $numgen; $n++){
+		&add_immigrants();
+		@population = &permute( \@population );
+		$k = 0;
+		$k = &crossover( $k );
+		$k = &mutate( $k );
+		$k = &add_parents( $k );
+		@pool = sort{$a->{score} <=> $b->{score}} @pool;
+		$average = &fill_population();
+
+		# check if we've done enough already, and print out status
+		if( $verb eq "y" ){printf("  Generation %i of %i, average fitness %0.4f\n", $n+1, $numgen, $average );}
+		if( $oldavg >= $average && $oldavg - $average < $err ){last;}
+		$oldavg = $average;
+	}
+
+	# save the final best one
+	for($i=0; $i<= $#data; $i++){
+		&fill_assignments( \@{$population[0]->{assignments}} );
+	}
+}
+
+#
+# print final log to stdout
+#
+if( $verb eq "y" || $verb eq "l" ){&print_info;}
+
+#
+# print result
+#
+if( $out ne "" ){
+	open(OUT,">$out");
+	print OUT "$title\t$bname\n";
+	for($i=0; $i<= $#data; $i++){
+		$orig[$i][1] = $data[$i][$#{$data[0]}];
+		printf OUT ("%s\t%i\n", $orig[$i][0], $orig[$i][1]);
+	}
+	close(OUT);
+}
+
+###############
+# SUBROUTINES #
+###############
+
+# read command line options
+sub read_command_line{
+	my $i;
+	
+	#
+	# option default values
+	#
+	$in = "";
+	$out = "";
+	$bcolumn = "";
+	$batch = "";
+	$bname = "batch";
+	$phenocols = "";
+	$contcols = "";
+	$datecols = "";
+	$bins = 5;
+	@blist = ();
+	$verb = "y";
+	$mmi = 0;
+
+	$options = "
+Usage: ./ARTS.pl <OPTIONS>
+	REQUIRED:
+	-i  input traits (rectangular, tab-delimited matrix, including title line with column names)
+	-c  trait columns to randomize
+	    comma- and semicolon delimited list, columns indexed from 1
+	    all traits indicated by commas are used in joint distributions
+	AND EITHER -b AND -o, OR -p:
+	-b  batch sizes (a single number, or a comma-delimited list)
+	-o  output file (formatted same as input file, with batch added as last column)
+	   -or-
+	-p  <batch column index>: print MI statistic for input traits using this column as batch designations
+	    -p will not do any sampling
+	OTHER OPTIONS:
+	-cc continuously-valued columns (will be binned)
+	-cd date-valued columns (should be M/D/Y); should also list these as continuous (in -cc)
+	-cb number of bins to use for continuous or date columns (default: $bins for each)
+	    can give 1 value, or a list of the same length as -cc; if a list, will be assigned in the same order as -cc
+	-bn batch name (title of added column, default $bname)
+	-s  random number seed (large negative integer, default: $seed)
+";
+
+#
+# Secret options:
+# -v y or l (verbose: print all, or just print status from beginning or end)
+# -mmi force use of MMI objective function on all columns indicated by -c, over-riding any other settings from -c
+#
+
+	GetOptions('i=s'  => \$in,
+		   'o=s'  => \$out,
+		   'p=i'  => \$bcolumn,
+		   'b=s'  => \$batch,
+		   'c=s'  => \$phenocols,
+		   'cc=s' => \$contcols,
+		   'cd=s' => \$datecols,
+		   'cb=s' => \$bins,
+		   'bn=s' => \$bname,
+		   's=i'  => \$seed,
+		   'mmi'  => \$mmi,
+		   'v=s'  => \$verb,
+		) || die "$options\n";
+
+	#
+	# check that required inputs exist
+	#
+	if( $in eq "" ){&exit_required("i");}
+	if( ($out eq "" || $batch eq "") && $bcolumn eq "" ){&exit_required("b and -o, or -p,");}
+	if( $phenocols eq "" || $phenocols eq "None" ){&exit_required("c");}
+
+	#
+	# check that inputs values are OK
+	#
+	if( $bcolumn ne "" ){
+		if( $bcolumn < 1 ){&exit_err("p","at least 1");}
+		$bcolumn--;
+	}
+	if( $verb ne "y" && $verb ne "n" && $verb ne "l" ){&exit_err("v","y or n or l");}
+	if( $seed > 0 ){$seed*= -1;}
+	if( $seed == 0 ){&exit_err("s","non-zero");}
+
+	#
+	# if mmi, reset phenocols value using all found columns
+	#
+	if( $mmi ){
+		@initcs = split(/[,;]/, $phenocols);
+		# remove duplicates
+		@clist = ();
+		$cinds = {};
+		for($i=0; $i<= $#initcs; $i++){
+			if( $cinds->{$initcs[$i]} eq "" ){
+				$cinds->{$initcs[$i]} = 1;
+				push(@clist, $initcs[$i]);
+			}
+		}
+		# add to new phenocols
+		$phenocols = "$clist[0]";
+		for($i=1; $i<= $#clist; $i++){
+			$phenocols = sprintf("%s,%s", $phenocols, $clist[$i]);
+		}
+		$phenocols = sprintf("%s;%s", $phenocols, $clist[0]);
+		for($i=1; $i<= $#clist; $i++){
+			$phenocols = sprintf("%s;%s", $phenocols, $clist[$i]);
+		}
+	}
+
+	#
+	# extract phenotype columns
+	#
+	@cols = ();
+	@allcols = ();
+	$alllist = {};
+	@jointlist = split(';',$phenocols);
+	for($i=0; $i<= $#jointlist; $i++){
+		@tmp = split(',',$jointlist[$i]);
+		@tmp = &fix_cols( \@tmp );
+		push(@cols, [@tmp]);
+		for($j=0; $j<= $#tmp; $j++){
+			if( $alllist->{$tmp[$j]} eq "" ){
+				$alllist->{$tmp[$j]} = 1;
+				push(@allcols, $tmp[$j]);
+			}
+		}
+	}
+
+	#
+	# extract continuous and date columns
+	# sort continuous columns so that bins correspond to them in order
+	#
+	if( $contcols ne "" && $contcols ne "None" ){
+		@conts = split(',',$contcols);
+		@conts = &fix_cols( \@conts );
+		$numconts = $#conts+1;
+	}
+	if( $datecols ne "" && $datecols ne "None" ){
+		@dates = split(',',$datecols);
+		@dates = &fix_cols( \@dates );
+		$numdates = $#dates+1;
+		# check that date columns are among continuous columns
+		for($i=0; $i<= $#dates; $i++){
+			for($j=0; $j<= $#conts; $j++){
+				if( $dates[$i] == $conts[$j] ){last;}
+				if( $j==$#conts ){
+					printf("Error: please specify date column %i as continuous\n", $dates[$i]+1 );
+					die;
+				}
+			}
+		}
+	}
+	if( $bins =~ /,/ ){
+		@blist = split(',',$bins);
+		if( $#blist+1 != $#conts + 1 ){
+			printf("Error: you input %i bins, but %i columns that need binning\n", $#blist+1, $#conts+1);
+			die;
+		}
+	}
+	else{
+		for($i=0; $i<= $#conts; $i++){
+			push(@blist, $bins);
+		}
+	}		
+}
+
+# print error message for flag $_[0], with correct values $_[1], and print usage
+sub exit_err{
+        printf("Error: set -%s to be %s\n%s\n", $_[0], $_[1], $options);
+        exit;
+}
+# print error message saying flag $_[0] is required
+sub exit_required{
+        printf("Error: option -%s is required\n%s\n", $_[0], $options);
+        exit;
+}
+
+# fix all indices in array $_[0]: cast to integer, check at least 1, and subtract 1
+sub fix_cols{
+	my @list;
+	my $i;
+	@list = @{$_[0]};
+	for($i=0; $i<= $#list; $i++){
+		$list[$i] = sprintf("%i", $list[$i]);
+		if( $list[$i] < 1 ){
+			print "Error: column indices should be at least 1\n";
+			die;
+		}
+		$list[$i]--;
+	}
+	return @list;
+}
+
+# print info about best randomization
+sub print_info{
+	#
+	# get MI of each phenotype and average
+	#
+	$bestmi = &mutual_info();
+	@bestmilist = &individual_mi( $numbatches );
+	$bestavgmi = 0;
+	for($i=0; $i<= $#bestmilist; $i++){
+		$bestavgmi+= $bestmilist[$i]/($#bestmilist+1);
+	}
+
+	printf("Final MI %0.4f ; Individual trait MIs (mean %0.4f ): ", $bestmi, $bestavgmi);
+	for($i=0; $i<= $#bestmilist; $i++){
+		printf("\t%0.4f", $bestmilist[$i]);
+	}
+	print "\n-----------------------------------------------------------------\n";
+	# 
+	# print the counts for each phenotype in each batch
+	# 
+	# first title line: phenotype names
+	for($i=0; $i<= $#allcols; $i++){
+		printf("\t%s values", $titlevals[$allcols[$i]]);
+		for($j=1; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
+			printf("\t");
+		}
+	}
+	print "\nBatch (size)";
+	# second title line: phenotype values
+	for($i=0; $i<= $#allcols; $i++){
+		for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
+			if( $items->{$allcols[$i]}->{list}[$j] ne "" ){printf("\t%s", &name($items->{$allcols[$i]}->{list}[$j], $allcols[$i]) );}
+			else{printf("\tempty");}
+		}
+	}
+	print "\n-------";
+	# print a line of dashes to separate
+	for($i=0; $i<= $#allcols; $i++){
+		for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
+			printf("\t-------");
+		}
+	}
+	print "\n";
+	for($k=0; $k< $numbatches; $k++){
+		printf("%s (%i)", $batchnames[$k], $batchsizes[$k] );
+		for($i=0; $i<= $#allcols; $i++){
+			for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
+				printf("\t%i", &count( $#{$data[0]}, $batchnames[$k], $allcols[$i], $items->{$allcols[$i]}->{list}[$j] ) );
+			}
+		}
+		print "\n";
+	}
+	print "-------";
+	# print a line of dashes to separate
+	for($i=0; $i<= $#allcols; $i++){
+		for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
+			printf("\t-------");
+		}
+	}
+	# print totals for each type
+	print "\nTotal";
+	for($i=0; $i<= $#allcols; $i++){
+		for($j=0; $j<= $#{$items->{$allcols[$i]}->{list}}; $j++){
+			printf("\t%i", $items->{$allcols[$i]}->{$items->{$allcols[$i]}->{list}[$j]}[1] );
+		}
+	}
+	print "\n";
+}
+
+# for continuous valued columns, checked by $_[1], convert value $_[0] back to a range
+sub name{
+	my $i;
+	my $binw;
+
+	# 
+	# if there aren't any continuous columns, or $_[1] doesn't match one, just return $_[0]
+	#
+	if( $#conts < 0 ){return $_[0];}
+	for($i=0; $i<= $#conts; $i++){
+		if( $_[1] == $conts[$i] ){last;}
+		if( $i==$#conts ){return $_[0];}
+	}
+
+	#
+	# convert bin value back to continuous value
+	#
+	$binw = ($contstats[$i][2]-$contstats[$i][0])/$blist[$i];
+	$val1 = $binw*$_[0]+$contstats[$i][0];
+	$val2 = $binw*($_[0]+1)+$contstats[$i][0];
+	
+	# 
+	# if there aren't any date columns, or $_[1] doesn't match one, just return the range val1-val2
+	#
+	if( $#dates < 0 ){return sprintf("%s-%s", $val1, $val2);}
+	for($i=0; $i<= $#dates; $i++){
+		if( $_[1] == $dates[$i] ){last;}
+		if( $i==$#dates ){return sprintf("%s-%s", $val1, $val2);}
+	}
+	$val1 = sprintf("%i", $val1);
+	$val2 = sprintf("%i", $val2);
+	return sprintf("%s-%s", &convert_date( $val1 ), &convert_date( $val2 ));
+
+}
+
+# read in regular matrix from $in
+# for continuous (including date-value) columns, make histograms
+sub read_data{
+	my @lines;
+	my $i;
+	@data = ();
+	$items = {};
+	@orig = ();
+	@titlevals = ();
+	@batchsizes = ();
+	$numbatches = 0;
+
+	#
+	# fix newline convention
+	#
+	
+	
+	open(IN,"$in") || die "Error: can't open $in\n";
+	#
+	# read in all lines and check formatting
+	#
+	@lines = <IN>;
+	if( $lines[0] =~ /\r/ && $#lines == 0 ){
+		# this happens with tab-delimited text saved from excel
+		@lines = split('\r', $lines[0]);
+	}
+
+	#
+	# read title line
+	#
+	$title = $lines[0];
+	chomp($title);
+	@titlevals = split('\t',$title);
+	for($k=1; $k<= $#lines; $k++){
+		$line = $lines[$k];
+		chomp($line);
+		if( $line ne "" ){	# ignore blank lines
+			@parts = split('\t',$line);
+			for($i=$#parts+1; $i<= $#titlevals; $i++){
+				push(@parts, "");
+			}
+			if( $#parts != $#titlevals ){
+				printf("Error: not enough columns in line %i\n", $#data+2);
+				die;
+			}
+			# push 1 extra for the batch
+			push(@parts, 0);
+			push(@data, [(@parts)] );
+			push(@orig, [($line, 0)] );
+		}
+	}
+	close(IN);
+
+	#
+	# exit if no data read
+	#
+	if( $#data < 0 ){
+		printf("Error: no samples were read in\n");
+		die;
+	}
+
+	#
+	# if batch is not empty, check batches:
+	#   if no commas, cast to integer and count how many are needed
+	#   if there are commas, get batched on given sizes
+	#   double check that we add up
+	# 
+	@batchnames = ();
+	if( $batch ne "" ){
+		if( $batch !~ /,/ ){
+			$batch = sprintf("%i", $batch);
+			# fix batch size if too big
+			if( $batch > $#data + 1 ){
+				printf("Warning: you have %i samples, but asked for a batch size of %i, so there is only 1 batch\n", $#data+1, $batch);
+				$batch = $#data+1;
+			}
+			$numbatches = ($#data+1)/$batch;
+			$exactbatches = sprintf("%i", $numbatches);
+			if( $exactbatches < $numbatches ){$exactbatches++;}
+			$numbatches = $exactbatches;
+			for($i=0; $i< $numbatches-1; $i++){
+				push(@batchsizes, $batch);
+			}
+			push(@batchsizes, $batch - ($numbatches*$batch - ($#data+1)) );
+		}
+		else{
+			@batchsizes = split(',',$batch);
+			$numbatches = $#batchsizes+1;
+		}
+		$tot = 0;
+		for($i=0; $i< $numbatches; $i++){
+			push(@batchnames, $i+1);
+			$tot+= $batchsizes[$i];
+		}
+		if( $tot != $#data+1 ){
+			printf("Error: have %i spaces in all batches, but %i samples\n", $tot, $#data+1);
+			die;
+		}
+	}
+
+	#
+	# convert dates to numbers
+	#
+	for($i=0; $i<= $#data; $i++){
+		for($j=0; $j<= $#dates; $j++){
+			if( $data[$i][$dates[$j]] ne "" ){$data[$i][$dates[$j]] = &convert_date( $data[$i][$dates[$j]] );}
+		}
+	}
+
+	# 
+	# for all continuous columns, compute median and fill in missing values
+	# also record max and min for binning
+	#
+	@contstats = ();	# records min, median, max for each continuous column
+	for($j=0; $j<= $#conts; $j++){
+		@tmp = ();
+		for($i=0; $i<= $#data; $i++){
+			if( $data[$i][$conts[$j]] ne "" ){push(@tmp, $data[$i][$conts[$j]]);}
+		}
+		@tmp = sort{ $a <=> $b } @tmp;
+		$median = $tmp[sprintf("%i", ($#tmp+1)/2)];
+		push(@contstats, [($tmp[0], $median, $tmp[$#tmp])] );
+	#	for($i=0; $i<= $#data; $i++){
+	#		if( $data[$i][$conts[$j]] eq "" ){$data[$i][$conts[$j]] = $median;}
+	#	}
+	}
+
+	#
+	# for all continuous columns, bin data
+	#
+	for($j=0; $j<= $#conts; $j++){
+		$binw = ($contstats[$j][2] - $contstats[$j][0])/($blist[$j]);
+		if( $binw == 0 ){
+			printf("Error: max and min of column %i are equal (max/min are %f/%f)\n", $conts[$j]+1, $contstats[$j][2], $contstats[$j][0] );
+			die;
+		}
+		for($i=0; $i<= $#data; $i++){
+			if( $data[$i][$conts[$j]] ne "" ){
+				$data[$i][$conts[$j]] = sprintf("%i", ($data[$i][$conts[$j]] - $contstats[$j][0])/$binw);
+				if( $data[$i][$conts[$j]] >= $blist[$j] ){$data[$i][$conts[$j]] = $blist[$j]-1;}
+			}
+		}
+	}
+
+	# 
+	# for each column we're using, count how many item types there are
+	# empty phenotypes are considered their own, distinct phenotype
+	#
+	$items = &itemize( \@allcols );
+}
+
+# count how many item types of @{$_[0]} there are in @data
+sub itemize{
+	my $i;
+	my $j;
+	my $info;
+	my @cols;
+	my $items;
+	@cols = @{$_[0]};
+
+	for($j=0; $j<= $#cols; $j++){
+		$info = {};
+		$info->{list} = ();
+		for($i=0; $i<= $#data; $i++){
+			if( $info->{$data[$i][$cols[$j]]} eq "" ){
+				$info->{$data[$i][$cols[$j]]} = [($#{$info->{list}}+1,0)];
+				push(@{$info->{list}}, $data[$i][$cols[$j]]);
+			}
+			$info->{$data[$i][$cols[$j]]}[1]++;
+			$info->{count}++;
+		}
+		$info->{num} = $#{$info->{list}}+1;
+		$items->{$cols[$j]} = $info;
+	}
+
+	for($j=0; $j<= $#cols; $j++){
+		# this set prints the number of values and counts for each phenotype
+		#printf("%i,%s:", $cols[$j], $titlevals[$cols[$j]]);
+		#for($k=0; $k<= $#{$items->{$cols[$j]}->{list}}; $k++){
+		#	printf("\t%s,%i", $items->{$cols[$j]}->{list}[$k], $items->{$cols[$j]}->{$items->{$cols[$j]}->{list}[$k]}[1]  );
+		#}
+		#print "\n";
+		if( $items->{$cols[$j]}->{num} > 20 ){
+			printf("Warning: column %i (%s) has %i values; should you make it continuous?\n", $cols[$j], $titlevals[$cols[$j]], $items->{$cols[$j]}->{num} );
+		}
+	}
+	return $items;
+}
+
+# convert date in M/D/Y to integer, or integer to M/D/Y
+sub convert_date{
+	my $date;
+	my $month;
+	my $day;
+	my $year;
+	my $months;
+	my $i;
+	# cumulative days per month
+	$months->{0} = 0;
+	$months->{1} = 31;
+	$months->{2} = 59;
+	$months->{3} = 90;
+	$months->{4} = 120;
+	$months->{5} = 151;
+	$months->{6} = 181;
+	$months->{7} = 212;
+	$months->{8} = 243;
+	$months->{9} = 273;
+	$months->{10} = 304;
+	$months->{11} = 334;
+	$months->{12} = 365;
+	$date = $_[0];
+
+	# convert date to integer
+	if( $date =~ /\// ){
+		($month, $day, $year) = split('/',$date);
+		$month = sprintf("%i", $month);
+		$day = sprintf("%i", $day);
+		$year = sprintf("%i", $year);
+		if( $month < 1 || $month > 12 ){
+			print "Error: found a month $month not between 1 and 12\n";
+			die;
+		}
+		if( $day < 1 || $day > 31 ){
+			print "Error: found a day $day not between 1 and 31\n";
+			die;
+		}
+			
+		return $day + $months->{$month-1} + $year*$months->{12};
+	}
+	# convert integer to date
+	elsif( $date == sprintf("%i", $date) ){
+		$year = sprintf("%i", $date/($months->{12}));
+		$month = $date-$year*$months->{12};
+		for($i=1; $i<=12; $i++){
+			if( $month < $months->{$i} ){last;}
+		}
+		$day = $month - $months->{$i-1};
+		$month = $i;
+		return sprintf("%s/%s/%s", $month, $day, $year);
+	}
+	else{
+		printf("\nError: unrecognized format in convert_date(): %s\n", $date);
+		die;
+	}
+}
+
+# set globals used by ran1
+sub ran1_init{
+	#
+	# random number variables
+	#
+	$iset = 0;
+	$gset = 0;
+	#$iseed = clock_gettime(CLOCK_REALTIME);
+	#($first, $second) = split('\.', $iseed);
+	#$seed = sprintf("-%i%i", $second, $first);
+	$seed = -10854829;
+	$M1 = 259200;
+	$IA1 = 7141;
+	$IC1 = 54773;
+	$RM1 = 1.0/$M1;
+	$M2 = 134456;
+	$IA2 = 8121;
+	$IC2 = 28411;
+	$RM2 = 1.0/$M2;
+	$M3 = 243000;
+	$IA3 = 4561;
+	$IC3 = 51349;
+	$iff = 0;
+	$ix1 = 0;
+	$ix2 = 0;
+	$ix3 = 0;
+	@ranarray = ();
+	for($i=0; $i< 98; $i++){
+		push(@ranarray, 0);
+	}
+}
+
+# uniform random number generator, seed, iff, and various capital-letter variables set in beginning
+sub ran1{
+	my $j;
+	my $temp;
+
+	if( $seed < 0 || $iff == 0 ){
+		$iff = 1;
+		$ix1 = ($IC1 - $seed)%$M1;
+		$ix1 = ($IA1*$ix1 + $IC1)%$M1;
+		$ix2 = $ix1%$M2;
+		$ix1 = ($IA1*$ix1 + $IC1)%$M1;
+		$ix3 = $ix1%$M3;
+		for($j=1; $j<= 97; $j++){
+			$ix1 = ($IA1*$ix1 + $IC1)%$M1;
+			$ix2 = ($IA2*$ix2 + $IC2)%$M2;
+			$ranarray[$j] = ($ix1 + $ix2*$RM2)*$RM1;
+		}
+		$seed = 1;
+	}
+	$ix1 = ($IA1*$ix1 + $IC1)%$M1;
+	$ix2 = ($IA2*$ix2 + $IC2)%$M2;
+	$ix3 = ($IA3*$ix3 + $IC3)%$M3;
+
+	$j = sprintf("%i", 1 + ((97*$ix3)/$M3) );
+	if( $j> 97 || $j< 1 ){
+		printf("Error in ran1: $j outside of [1:97]\n");
+		die;
+	}
+	$temp = $ranarray[$j];
+	$ranarray[$j] = ($ix1 + $ix2*$RM2)*$RM1;
+	return $temp;
+}
+
+# permute array $_[0]
+sub permute{
+	my @assignments;
+	my $i;
+	my $j;
+	my $tmp;
+
+	@assignments = @{$_[0]};
+
+	# 
+	# shuffle batches randomly
+	#
+	for($i=$#assignments; $i>= 0; $i--){
+		$j = sprintf("%i", ($i+1)*&ran1() );
+		$tmp = $assignments[$j];
+		$assignments[$j] = $assignments[$i];
+		$assignments[$i] = $tmp;
+	}
+	return @assignments;
+}
+
+# fill data with assignments $_[0]
+sub fill_assignments{
+	my @list;
+	my $i;
+	@list = @{$_[0]};
+	if( $#list != $#data ){
+		print "Error in fill_assignments: mismatching list lengths\n";
+		die;
+	}
+	for($i=0; $i<= $#list; $i++){
+		$data[$i][$#{$data[0]}] = $list[$i];
+	}
+}
+
+# compute mutual information of a batch assignment
+sub mutual_info{
+	my $i;
+	my $s;
+	my $mi;
+	my $stot;
+
+	$mi = 0;
+	for($i=0; $i<= $#cols; $i++){
+		$mi += &this_mi( $_[0], $#{$data[0]}, \@{$cols[$i]} )/($#cols+1);
+	}
+	
+	return $mi;
+}
+
+# compute all single-phenotype mutual information
+sub individual_mi{
+	my $i;
+	my @list;
+	my @milist;
+	
+	@milist = ();
+	for($i=0; $i<= $#allcols; $i++){
+		@list = ($allcols[$i]);
+		push(@milist, &this_mi( $_[0], $#{$data[0]}, \@list ) );
+	}
+	return @milist;
+}
+
+# compute mutual information of columns $_[1] ($_[0] bins) and all of @{$_[2]}
+sub this_mi{
+	my $i;
+	my $j;
+	my $summand;
+	my @list;
+	my $jprob;
+	my $m1prob;
+	my $m2prob;
+	my $jbin;
+	my $m1bin;
+	my $m2bin;
+	my $jbinstot;
+	my $m1binstot;
+	my $m2binstot;
+	my @jbinlist;
+	my @m1binlist;
+	my @m2binlist;
+	my $mi;
+	my $s1;
+	my $s2;
+	my $s;
+	@list = @{$_[2]};
+	
+	# initialize probabilities
+	$jprob = {};			# joint distribution
+	$m1prob = {};			# batch marginal dist
+	$m2prob = {};			# pheno marginal dist
+	@jbinlist = ();			# phenotype combos found in joint distribution
+	@m1binlist = ();		# batches found in batches distribution (1st marginal dist)
+	@m2binlist = ();		# phenotype combos found in phenotypes distribution (2nd marginal dist)
+
+	#	
+	# read through data and add to distributions
+	#
+	$summand = 1.0/($#data+1);
+	for($i=0; $i<= $#data; $i++){
+		#
+		# define bin names based on phenotype/batch
+		# for phenotypes p1, p2, etc., batch b:
+		#   joint = p1_p2_..._pn_b
+		#   1st marginal = b
+		#   2nd marginal = p1_p2_..._pn
+		#
+		$jbin = sprintf("%s", $data[$i][$#{$data[0]}]);
+		$m1bin = sprintf("%s", $data[$i][$#{$data[0]}]);
+		$m2bin = "";
+		for($j=0; $j<= $#list; $j++){
+			# NOTE:
+			# $list[$j] is a phenotype column (e.g., gender)
+			# $data[$i][$list[$j]] is the value of that phenotype in sample $i (e.g., M or F)
+			# $items->{$list[$j]}->{$data[$i][$list[$j]]}[0] is the bin index (e.g., M->0, F->1) of that phenotype
+
+			$jbin = sprintf("%s_%i", $jbin, $items->{$list[$j]}->{$data[$i][$list[$j]]}[0]);
+			if( $j>0 ){$m2bin = sprintf("%s_", $m2bin);}
+			$m2bin = sprintf("%s%i", $m2bin, $items->{$list[$j]}->{$data[$i][$list[$j]]}[0]);
+		}
+
+		# 
+		# check if we've already seen this bin, for each distribution
+		# initialize probabilities and add to list if it's the first time
+		#
+		if( $jprob->{$jbin} eq "" ){
+			$jprob->{$jbin} = 0;
+			push(@jbinlist, [($jbin, $m1bin, $m2bin)] );
+		}
+		if( $m1prob->{$m1bin} eq "" ){
+			$m1prob->{$m1bin} = 0;
+			push(@m1binlist, $m1bin);
+		}
+		if( $m2prob->{$m2bin} eq "" ){
+			$m2prob->{$m2bin} = 0;
+			push(@m2binlist, $m2bin);
+		}
+
+		#
+		# add a count to each distribution
+		#
+		$jprob->{$jbin} += $summand;
+		$m1prob->{$m1bin} += $summand;
+		$m2prob->{$m2bin} += $summand;
+	}
+
+	#
+	# compute mutual information, and entropy of m1prob and m2prob (for normalization)
+	#
+	$mi = 0;
+	$s1 = 0;
+	$s2 = 0;
+	for($i=0; $i<= $#jbinlist; $i++){
+		$mi+= ($jprob->{$jbinlist[$i][0]}) * log( ($jprob->{$jbinlist[$i][0]})/($m1prob->{$jbinlist[$i][1]} * $m2prob->{$jbinlist[$i][2]}) );
+	}
+	for($i=0; $i<= $#m1binlist; $i++){
+		$s1-= $m1prob->{$m1binlist[$i]} * log( $m1prob->{$m1binlist[$i]} );
+	}
+	for($i=0; $i<= $#m2binlist; $i++){
+		$s2-= $m2prob->{$m2binlist[$i]} * log( $m2prob->{$m2binlist[$i]} );
+	}
+	$s = sqrt($s1*$s2);
+
+	#	
+	# normalize mi
+	#
+	if( $s>0 ){$mi/= $s;}
+
+	#
+	# return normalized mi (0=independent, 1=completely dependent)
+	#
+	return $mi;
+}
+
+# count how many of @data have column $_[0] equal $_[1] and column $_[2] equal $_[3]
+sub count{
+	my $i;
+	my $tot;
+
+	$tot = 0;
+	for($i=0; $i<= $#data; $i++){
+		if( $data[$i][$_[0]] eq $_[1] && $data[$i][$_[2]] eq $_[3] ){$tot++;}
+	}
+	return $tot;
+}
+
+# initialize GA parameters and large matrices
+sub ga_init{
+	my $i;
+	my $j;
+	my $info;
+
+	$popsize = 100;
+	$numgen = 300;
+	$nchrmuts = 2;
+	$nnewimm = 10;
+	$nkeepparents = 2;
+	$nchrpool = ($nnewimm+$popsize) + ($nnewimm+$popsize)*$nchrmuts + $nkeepparents;
+
+	#
+	# population to turn over each generation
+	#
+	@population = ();
+	for($i=0; $i< $popsize+$nnewimm; $i++){
+		$info = {};
+		$info->{score} = 0;
+		$info->{assignments} = [()];
+		for($j=0; $j<= $#data; $j++){
+			push(@{$info->{assignments}}, 0);
+		}
+		push(@population, $info);
+	}
+
+	#
+	# new individuals to fill each generation
+	#
+	@pool = ();
+	for($i=0; $i< $nchrpool; $i++){
+		$info = {};
+		$info->{score} = 0;
+		$info->{assignments} = [()];
+		for($j=0; $j<= $#data; $j++){
+			push(@{$info->{assignments}}, 0);
+		}
+		push(@pool, $info);
+	}
+
+	#
+	# array to randomize for batch assignments
+	#
+	@batched = ();
+	@bcounts = ();
+	for($i=0; $i<= $#batchsizes; $i++){
+		push(@bcounts, 0);
+		for($j=0; $j< $batchsizes[$i]; $j++){
+			push(@batched, $i+1);
+		}
+	}
+}
+
+# initialize the population array: randomize $popsize batches and score each one
+sub initialize_population{
+	my $i;
+	
+	for($i=0; $i< $popsize; $i++){
+		@{$population[$i]->{assignments}} = &permute( \@batched );
+		&fill_assignments( \@{$population[$i]->{assignments}} );
+		$population[$i]->{score} = &mutual_info( $numbatches );
+	}
+}
+
+# complete the crossover step, keeping track of our index with $_[0]
+sub crossover{
+	my $i;
+	my $j;
+	my $k;
+
+	$k = $_[0];
+
+	for($i=0; $i< $popsize + $nnewimm; $i+=2){
+		&do_cross($i, $k);
+		&fill_assignments( \@{$pool[$k]->{assignments}} );
+		$pool[$k]->{score} = &mutual_info( $numbatches );
+		$k++;
+		&fill_assignments( \@{$pool[$k]->{assignments}} );
+		$pool[$k]->{score} = &mutual_info( $numbatches );
+		$k++;
+	}
+	return $k;
+}
+
+# do a crossover between population members $_[0] and $_[0]+1, fill to pool members $_[1] and $_[1]+1
+sub do_cross{
+	my $i;
+	my $j;
+	my $popmem;
+	my $poolmem;
+	my $index;
+	my @swap;
+	my @subswap;
+	my $swapinds;
+
+	$popmem = $_[0];
+	$poolmem = $_[1];
+	$index = sprintf("%i", $#data * &ran1() );
+
+	#
+	# count how many of each batch to switch
+	#
+	for($i=0; $i<= $#bcounts; $i++){
+		$bcounts[$i] = 0;
+	}
+	for($i=0; $i<= $index; $i++){
+		$bcounts[$population[$popmem]->{assignments}[$i] - 1]++;
+	}
+
+	#
+	# for each batch i:
+	# 	record into subswap all indices with that batch from popmem+1
+	#	permute subswap
+	#	add first bcounts[$i] into swap
+	# then sort swap
+	#
+	@swap = ();
+	$swapinds = {};
+	for($i=0; $i<= $#bcounts; $i++){
+		@subswap = ();
+		for($j=0; $j<= $#data; $j++){
+			if( $population[$popmem+1]->{assignments}[$j] == $i+1 ){
+				push(@subswap, $j);
+			}
+		}
+		@subswap = &permute( \@subswap );
+		for($j=0; $j< $bcounts[$i]; $j++){
+			push(@swap, $subswap[$j]);
+			$swapinds->{$subswap[$j]} = 1;
+		}
+	}
+	@swap = sort{$a <=> $b} @swap;
+
+	#
+	# fill start of first new chr from swap indices of second chr, and end from end of first chr
+	#
+	for($i=0; $i<= $index; $i++){
+		$pool[$poolmem]->{assignments}[$i] = $population[$popmem+1]->{assignments}[$swap[$i]];
+	}
+	for($i=$index+1; $i<= $#data; $i++){
+		$pool[$poolmem]->{assignments}[$i] = $population[$popmem]->{assignments}[$i];
+	}
+
+	#
+	# fill start of second chr from start of first chr, and end from remaining parts of second chr
+	#
+	for($i=0; $i<= $index; $i++){
+		$pool[$poolmem+1]->{assignments}[$i] = $population[$popmem]->{assignments}[$i];
+	}
+	$j = $index+1;
+	for($i=0; $i<= $#data; $i++){
+		if( $swapinds->{$i} != 1 ){
+			$pool[$poolmem+1]->{assignments}[$j] = $population[$popmem+1]->{assignments}[$i];
+			$j++;
+		}
+	}
+	
+
+	#
+	# check that batch counts are still ok
+	#
+	$checkbatch = 0;
+	if( $checkbatch ){
+		for($i=0; $i<= $#bcounts; $i++){
+			$bcounts[$i] = 0;
+		}
+		for($i=0; $i<= $#data; $i++){
+			$bcounts[$pool[$poolmem]->{assignments}[$i] - 1]++;
+		}
+		for($i=0; $i<= $#bcounts; $i++){
+			if( $bcounts[$i] != $batchsizes[$i] ){
+					print "Error in do_cross: lost some batch counts in first daughter chr\n";
+				exit;
+			}
+		}
+		for($i=0; $i<= $#bcounts; $i++){
+			$bcounts[$i] = 0;
+		}
+		for($i=0; $i<= $#data; $i++){
+			$bcounts[$pool[$poolmem+1]->{assignments}[$i] - 1]++;
+		}
+		for($i=0; $i<= $#bcounts; $i++){
+			if( $bcounts[$i] != $batchsizes[$i] ){
+				print "Error in do_cross: lost some batch counts in first daughter chr\n";
+				exit;
+			}
+		}
+	}
+}
+
+# complete the mutation step, keeping track of our index with $_[0]
+sub mutate{
+	my $i;
+	my $j;
+	my $k;
+
+	$k = $_[0];
+
+	for($i=0; $i< $popsize+$nnewimm; $i++){
+		for($j=0; $j< $nchrmuts; $j++){
+			&do_mutation($i, $k);
+			&fill_assignments( \@{$pool[$k]->{assignments}} );
+			$pool[$k]->{score} = &mutual_info( $numbatches );
+			$k++;
+		}
+	}
+	return $k;
+}
+
+# do a mutation for population member $_[0], fill to pool member $_[1]
+sub do_mutation{
+	my $i;
+	my $popmem;
+	my $poolmem;
+	my $index1;
+	my $index2;
+
+	$popmem = $_[0];
+	$poolmem = $_[1];
+
+	#
+	# fill all of poolmem
+	#
+	for($i=0; $i<= $#data; $i++){
+		$pool[$poolmem]->{assignments}[$i] = $population[$popmem]->{assignments}[$i];
+	}
+
+	# 
+	# switch two members
+	#
+	$index1 = sprintf("%i", ($#data+1) * &ran1() );
+	$index2 = sprintf("%i", ($#data+1) * &ran1() );
+
+	$pool[$poolmem]->{assignments}[$index1] = $population[$popmem]->{assignments}[$index2];
+	$pool[$poolmem]->{assignments}[$index2] = $population[$popmem]->{assignments}[$index1];
+}
+
+# add immigrants, keeping track of our index with $_[0]
+sub add_immigrants{
+	my $j;
+
+	for($j=0; $j< $nnewimm; $j++){
+		@{$population[$popsize+$j]->{assignments}} = &permute( \@batched );
+		&fill_assignments( \@{$population[$popsize+$j]->{assignments}} );
+		$population[$popsize+$j]->{score} = &mutual_info( $numbatches );
+		$k++;
+	}
+}
+
+# add top-scoring parents, keeping track of our index with $_[0]
+sub add_parents{
+	my $i;
+	my $j;
+	my $k;
+
+	$k = $_[0];
+	# sort population now
+	@population = sort{$a->{score} <=> $b->{score}} @population;
+
+	for($j=0; $j< $nkeepparents; $j++){
+		&copy_parents( $j, $k );
+		# will copy score also in copy_parents()
+		$k++;
+	}
+	return $k;
+}
+
+# add population member $_[0] to pool member $_[1]
+sub copy_parents{
+	my $i;
+	my $popmem;
+	my $poolmem;
+	my $index1;
+	my $index2;
+
+	$popmem = $_[0];
+	$poolmem = $_[1];
+
+	#
+	# fill all of poolmem
+	#
+	for($i=0; $i<= $#data; $i++){
+		$pool[$poolmem]->{assignments}[$i] = $population[$popmem]->{assignments}[$i];
+	}
+	$pool[$poolmem]->{score} = $population[$popmem]->{score};
+}
+
+# copy top of pool to population (assumes pool is sorted)
+sub fill_population{
+	my $i;
+	my $j;
+	my $m;
+
+	$m = 0;
+	for($i=0; $i< $popsize; $i++){
+		$population[$i]->{score} = $pool[$i]->{score};
+		$m+= $population[$i]->{score};
+		for($j=0; $j<= $#data; $j++){
+			$population[$i]->{assignments}[$j] = $pool[$i]->{assignments}[$j];
+		}
+	}
+	return $m/$popsize;
+}
+
+
+
+
author	mmaiensc
date	Wed, 13 Nov 2013 16:13:17 -0500
parents
children