changeset 1:cd8def22f313

Deleted selected files
author mmaiensc
date Wed, 29 Feb 2012 15:12:29 -0500
parents 003f802d4c7d
children 1a84b8178b45
files GALAXY_FILES/tools/EMBER/Ember_Galaxy.c
diffstat 1 files changed, 0 insertions(+), 1227 deletions(-) [+]
line wrap: on
line diff
--- a/GALAXY_FILES/tools/EMBER/Ember_Galaxy.c	Wed Feb 29 15:03:33 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1227 +0,0 @@
-/* 
- * ember.c : Expectation Maximization of Binding and Expression pRofiles
- * ----------------------------------------------------------------
- * algorithm adapted from MEME algorithm: Bailey and Elkan, Proc Int Conf Intell Syst Mol Biol, 2:28-36 (1994).
- * 	See Eqs. 3-13, plus "Implementation of MM" section for detailed description
- * changes are as follows:
- * 	background changes with column (j=1,...,numclasses), so it is a 2D histogram of the same size as the motif model; same for psuedocounts
- * 	background model is determined by full list of genes (second input); z-values of genes in first input are subtracted
- *	   update of background can be turned off by commenting out 8th line of z_update()
- *	   in this case, also comment out index_genes at end of initialize(), as it will save some time
- * 	sequences cannot overlap, since they are fixed per gene; this simplifies the enumeration of all possibilities
- * 	a ZOOPS options is included (change in code, parameter zoops); this does not alter the algorithm, 
- * 	   only adds a restriction on which genes are chosen as targets in choose_targets()
- * general notes:
- * 	the common dimensions are:
- * 	   numpeaks - number of peaks, size of peaklist, indexed by m
- * 	   peaklist[m].numgenes - number of genes matched to a peak, size of peaklist[m].genelist, indexed by n
- * 	   numclasses - number of comparisons made (i.e. motif width), indexed by j
- * 	   5 - number of classifications in each comparison, assumed to be -2,-1,0,1,2, indexed by k
- * 	   2 - number of models (motif=0 and background=1), indexed by l
- * 	   nmots - total number of motifs to find, indexed by i in main (always stored at l=0, as found motifs are erased)
- *      col_tog can be used to force the algorithm to ignore certain columns (behavior dimensions)
- *
- * things to add/consider:
- * 	- best measure of motif quality score?
- * 	- want to ensure that the same sequence can't get assigned to more than one motif?
- */
-
-#include "all.h"
-
-typedef struct {
-	char id[50];
-	char name[500];
-	char chrom[50];
-	long start;
-	long end;
-	int sense;
-	double z;
-	double erase;
-	double score;
-	int index;	// will refer to index of this id in bklist
-	int used;	// set to 0 if a non-classified gene
-	int assigned;	// set to 1 if assigned
-	int *values;
-} gene;
-
-typedef struct {
-	char info[5000];
-	char chrom[50];
-	long pkposn;
-	int numgenes;
-	int numused;
-	int assgngene;	// set to # targets picked for this gene
-	gene *genelist;
-} peak;
-
-// FREE PARAMETERS TO CHOOSE IN COMMAND LINE
-double mu;	// scale background histogram by mu for psuedocounts
-double tscale;	// scales the threshold (to make choices more or less stringent)
-int zoops;	// set to 1 to enforce ZOOPS model (Zero Or One Per Strand)
-int printall;	// set to 1 to print all genes, not just targets
-int mots;	// number of motifs to search for
-int repeat;	// number of times to wean columns and repeat search
-double rsca;	// percent of average to set relative entropy cutoff
-// END PARAMETERS TO CHOOSE
-
-char *input, *background, *pref, *togs;
-char *target_out, *list_out, *model_out, *annot_out;
-long iseedval;
-int numpeaks, numclasses, usedclasses, numbk, bkused, nmots;
-peak *peaklist;
-gene *bklist;
-double ***hist, ***oldhist, lambda, t, **beta;
-double *Beta;
-int *col_tog, col_togl, set_togs, *col_tog_cpy;
-int verbose;
-
-void read_cmd_line( int argc, char **argv );
-void print_usage();
-void read_data( char *filename );
-void read_bkgrnd( char *filename );
-void initialize( long *seed );
-void reinitialize( long *seed );
-void beta_update();
-void index_genes();
-
-void hist_update();
-void bkhist_update();
-void lambda_update();
-void toggle_update();
-void toggle_reset();
-void z_update();
-void erase_update();
-double lprob( int m, int n, int l );
-double prob( int m, int n, int l );
-void copy_hist();
-double hist_diff();
-double score( int m, int n );
-void t_update();
-double ElogL();
-double ratio_score();
-double rel_entropy();
-double col_rel_entropy( int j );
-double chisq();
-int motif_count();
-
-int assign_gene( int m, long *seed );
-void score_genes();
-void choose_targets();
-
-void print_peak( FILE *fp, int i );
-void print_gene( FILE *fp, int i, int j );
-void print_state( FILE *fp, char *filename );
-//void print_targets( FILE *fp, char *filename );
-void print_model( FILE *fp, char *filename );
-//void print_annotpeak( FILE *fp, char *filename, int motif );
-char *nice_d( double val );
-
-int main(int argc, char *argv[]){
-	int i,iter,r;
-	int pfreq;
-	double diff, conv = 1e-6;
-	long seed;
-	FILE *fout;
-        
-	if(argc == 1 ){
-		print_usage();
-	}
-
-	/*
-	 * read command line params
-	 */
-	read_cmd_line( argc, argv );
-
-	/*
-	 * allocate output names
-	 */
-//	target_out = (char *) malloc((strlen(pref)+20)*sizeof(char));
-//	list_out = (char *) malloc((strlen(pref)+20)*sizeof(char));
-//	model_out = (char *) malloc((strlen(pref)+20)*sizeof(char));
-//	annot_out = (char *) malloc((strlen(pref)+20)*sizeof(char));
-
-	/*
-	 * read in data, copy values from global to local variables
-	 */
-	if( verbose != 0 ) printf("\nReading data and allocating\n");
-	read_data( input );
-	read_bkgrnd( background );
-	nmots = mots;
-	seed = iseedval;
-	
-	/*
-	 * allocate memory for other arrays
-	 */
-	hist = (double ***) alloc3d(sizeof(double),2,numclasses,5);
-	oldhist = (double ***) alloc3d(sizeof(double),2,numclasses,5);
-	beta = (double **) alloc2d(sizeof(double),numclasses,5);
-	Beta = (double *) malloc(numclasses*sizeof(double));
-
-	/*
-	 * find nmots motif models
-	 */
-	for(i=1; i<= nmots; i++){
-		/*
-		 * initialize, fill initial parameter values from z-guesses
-		 */
-		if( verbose != 0 ) printf("Expression Pattern %i,\n   Initializing\n", i);
-		if( i==1 ) initialize( &seed );
-		else reinitialize( &seed );
-		hist_update();
-		lambda_update();
-	
-		/*
-		 * now sample for awhile, repeating repeat times
-		 * each repeat starts at the previously refined search (?)
-		 */
-		for(r=0; r<= repeat; r++){
-			if( verbose != 0 ) printf("   Sampling\n");
-			if( verbose != 0 && repeat> 0 ) printf("   Repeat %i\n", r );
-			diff = 1;
-			pfreq = 10;
-			iter=1;
-			while( diff > conv ){
-				z_update();		// E step
-				copy_hist();		
-				hist_update();		// M step
-				bkhist_update();	// M step
-				lambda_update();	// M step
-				diff = hist_diff();
-				if( iter%pfreq == 0  && verbose != 0 ) printf("      iteration %i, (normalized) ElogL %.2f, change %.2e\n", iter, ElogL(), diff);
-				iter++;
-			}
-			if( repeat > 0 && r< repeat ) toggle_update();
-		}
-	
-		/*
-		 * score and choose final targets
-		 */
-		t_update();
-		if( verbose != 0 ) printf("   Done, scoring and choosing targets on threshold %.3f\n\n", t);
-		score_genes();
-		choose_targets();
-	
-		/*
-		 * print final targets
-		 */
-//		sprintf( target_out,"%s-%i.targets", pref, i);
-//		sprintf( list_out,"%s-%i.list", pref, i);
-//		sprintf( model_out,"%s-%i.model", pref, i);
-//		sprintf( annot_out,"%s-%i.xls", pref, i);
-		print_state(fout, target_out);
-		print_model(fout, model_out);
-//		print_targets(fout, list_out);
-//		print_annotpeak( fout, annot_out, i);
-	
-		/*
-		 * erase found motifs (only if we're going around again)
-		 */
-		if( i< nmots ) erase_update();
-		if( repeat > 0 ) toggle_reset();
-	}
-
-}
-
-/*==================== FUNCTIONS ======================*/
-
-/*
- * read command line inputs and options
- */
-void read_cmd_line( int argc, char **argv ){
-	int i, j;
-	int found_reqs=0;
-	
-	// default values
-	mots = 1;
-	zoops = 0;
-	printall = 0;
-	mu = 0.1;
-	tscale = 1.0;
-	iseedval = make_seed();
-	set_togs = 0;
-	verbose = 1;
-	repeat = 0;
-	rsca = 1.0;
-
-	// parse options
-	for(i=1; i< argc; i++){
-		if( argv[i][0] != '-' ) print_usage();
-		i++;
-		if( i >= argc || argv[i][0] == '-' ) print_usage();
-		switch( argv[i-1][1] ){
-			case 'i':
-				input = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
-				strcpy(input, argv[i]);
-				found_reqs++;
-				break;
-			case 'b':
-				background = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
-				strcpy(background, argv[i]);
-				found_reqs++;
-				break;
-			case 'o': 
-				target_out = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
-				strcpy(target_out, argv[i]);
-				found_reqs++;
-				break;
-			case 'p': 
-				model_out = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
-				strcpy(model_out, argv[i]);
-				found_reqs++;
-				break;
-			case 'n':
-				mots = atoi(argv[i]);
-				if( mots <= 0 ){
-					printf("\nError: choose number of expression patterns to be at least 1\n");
-					print_usage();
-				}
-				break;
-			case 'z':
-				if( argv[i][0] == 'y' ) zoops = 1;
-				else if( argv[i][0] == 'n' ) zoops = 0;
-				else{
-					printf("\nError: if using -z option, choose y or n\n");
-					print_usage();
-				}
-				break;
-			case 'v':
-				if( argv[i][0] == 'y' ) verbose = 1;
-				else if( argv[i][0] == 'n' ) verbose = 0;
-				else{
-					printf("\nError: if using -v option, choose y or n\n");
-					print_usage();
-				}
-				break;
-			case 'm':
-				mu = atof(argv[i]);
-				if( mu<= 0 ){
-					printf("\nError: choose mu to be positive\n");
-					print_usage();
-				}
-				break;
-			case 't':
-				tscale = atof(argv[i]);
-				if( tscale<= 0 ){
-					printf("\nError: choose threshold to be positive\n");
-					print_usage();
-				}
-				break;
-			case 's':
-				iseedval = -1*atol(argv[i]);
-				if( iseedval>= 0 ){
-					printf("\nError: choose seed to be a large positive integer\n");
-					print_usage();
-				}
-				break;
-			case 'c':
-				set_togs = 1;
-				togs = (char *) malloc((strlen(argv[i])+5)*sizeof(char));
-				strcpy( togs, argv[i] );
-				break;
-			case 'a':
-				if( argv[i][0] == 'y' ) printall = 1;
-				else if( argv[i][0] == 'n' ) printall = 0;
-				else{
-					printf("\nError: if using -a option, choose y or n\n");
-					print_usage();
-				}
-				break;
-			case 'r':
-				repeat = atoi(argv[i]);
-				if( repeat< 0 ){
-					printf("\nError: choose repeat to be a non-negative integer\n");
-					print_usage();
-				}
-				break;
-			case 'R':
-				rsca = atof(argv[i]);
-				if( rsca <= 0 ){
-					printf("\nError: choose repeat scalar to be positive\n");
-					print_usage();
-				}
-				break;
-			default:
-				printf("\nError: unkown option: -%c\n", argv[i-1][1]);
-				print_usage();
-		}
-	}
-	// double check that the correct number of required inputs has been read in
-	if( found_reqs != 4 ){
-		printf("\nError: improper number of required inputs found\n");
-		print_usage();
-	}
-}
-
-/*
- * usage info
- */
-void print_usage(){
-	printf("\nUsage: -i matched_data -b background_data -o target_output_name -p model_output_name [option value]\n");
-	printf("Options:\n");
-	printf("   -n number of expression patterns to find (positive integer, default 1)\n");
-	printf("   -z enforce ZOOPS model (y or n, default n)\n");
-	printf("   -m scalar for psuedocounts (positive real number, default 0.1)\n");
-	printf("   -t scalar for threshold (positive real number, default 1.0)\n");
-	printf("   -s seed (positive long integer, default chosen from date and time)\n");
-	printf("   -c column toggle file, used to ignore certain classifications (default: all on)\n");
-	printf("   -v verboseness, turn on/off printing to stdout (y or n, default: y)\n");
-	printf("   -a print all genes (and scores), rather than only those above the threshold (y or n, default: n)\n");
-	printf("   -r repeat search: after convergence of each expression pattern, turn off columns with lower relative entropy\n");
-	printf("      than average*scalar and repeat expression pattern search (non-negative integer, default 0)\n");
-        printf("   -R relative entropy scalar: if using the -r option, multiplies the average relative entropy\n");
-	printf("      by this value to set the threshold for turning off columns (positive real number, default 1.0)\n");
-	printf("\n");
-	exit(0);
-}
-
-/*
- * function: read data from filename into the peaklist data structure
- * notes on assumed format of data:
- *    peak lines start with "PEAK:" field
- *    gene lines start with "GENE:" field
- *    gene lines with only 1 classification are ignored
- *    assume all fields are <5000 characters
- */
-void read_data( char *filename ){
-	int j,m,n;
-	char val[5000], other[5000];
-	FILE *fp;
-
-	/*
-	 * first count number of peaks, and allocate
-	 */
-	fp = fopen(filename,"rt");
-	if( fp == NULL ){
-		printf("Error: can't open file %s\n", filename);
-		exit(0);
-	}
-	numpeaks = 0;
-	while( fscanf(fp, "%s", &val ) != EOF ){
-		if( contains( val, "PEAK" ) == 1 ){
-			numpeaks++;
-		}
-	}
-	fclose(fp);
-	peaklist = (peak *) malloc(numpeaks*sizeof(peak));
-
-	/*
-	 * now count number of genes w/>1 classification, and allocate
-	 * also count number of classifications, and allocate that at the end
-	 * n tracks number of genes, j field; subtract genes that don't have full classifications
-	 * m tracks peak index; record peak info
-	 */
-	fp = fopen(filename,"rt");
-	n=-1;
-	m=-1;
-	j=0;
-	sprintf(other,"NA");
-	numclasses=0;
-	while( fscanf(fp, "%s", &val ) != EOF ){
-		if( contains( val, "PEAK" ) == 1 ){
-			if( j==7 ) ;
-			else{
-				if( j-6 > numclasses ) numclasses = j-6;
-			}
-			if( m>=0 ){
-				peaklist[m].numgenes = n;
-				peaklist[m].genelist = (gene *) malloc( peaklist[m].numgenes*sizeof(gene));
-			}
-			n=0;
-			j=0;
-			if( strcmp(other, "NA") != 0 ) strcpy(peaklist[m].info, other);
-			m++;
-			sprintf(other,"NA");
-		}
-		else if( contains( val, "GENE" ) == 1 ){
-			n++;
-			j=0;
-		}
-		else if( n==0 ){
-			if( j==0 ) strcpy(peaklist[m].chrom, val);
-			if( j==1 ) peaklist[m].pkposn = atol(val);
-			if( j> 1 ){
-				if( strcmp(other, "NA") == 0 ) strcpy(other, val);
-				else sprintf(other,"%s %s", other, val);
-			}
-			j++;
-		}
-		else{
-			j++;
-		}
-	}
-	peaklist[m].numgenes = n;
-	peaklist[m].genelist = (gene *) malloc( peaklist[m].numgenes*sizeof(gene));
-	strcpy( peaklist[m].info, other);
-	fclose(fp);
-
-	for(m=0; m< numpeaks; m++){
-		for(n=0; n< peaklist[m].numgenes; n++){
-			peaklist[m].genelist[n].values = (int *) malloc(numclasses*sizeof(int));
-		}
-	}
-
-	/*
-	 * now read in gene info
-	 */
-	fp = fopen(filename,"rt");
-	m=-1;
-	j=0;
-	while( fscanf(fp, "%s", &val ) != EOF ){
-		if( contains( val, "PEAK" ) == 1 ){
-			if( j==7 ) peaklist[m].genelist[n].used = 0;
-			if( j==6+numclasses) peaklist[m].genelist[n].used = 1;
-			n=-1;
-			j=0;
-			m++;
-		}
-		else if( contains( val, "GENE" ) == 1 ){
-			if( j==7 ) peaklist[m].genelist[n].used = 0;
-			if( j==6+numclasses ) peaklist[m].genelist[n].used = 1;
-			n++;
-			j=0;
-		}
-		else if( n==-1 ){
-		}
-		else {
-			if( j==0 ) strcpy( peaklist[m].genelist[n].id, val );
-			if( j==1 ) strcpy( peaklist[m].genelist[n].name, val );
-			if( j==2 ) strcpy( peaklist[m].genelist[n].chrom, val );
-			if( j==3 ) peaklist[m].genelist[n].start = atol(val);
-			if( j==4 ) peaklist[m].genelist[n].end = atol(val);
-			if( j==5 ) peaklist[m].genelist[n].sense = atoi(val);
-			if( j>=6 ) peaklist[m].genelist[n].values[j-6] = atoi(val);
-			j++;
-		}
-	}
-	if( j==7 ) peaklist[m].genelist[n].used = 0;
-	if( j==6+numclasses) peaklist[m].genelist[n].used = 1;
-	fclose(fp);
-
-	/*
-	 * now count how many genes are to be used for each
-	 */
-	for(m=0; m< numpeaks; m++){
-		peaklist[m].numused = 0;
-		for(n=0; n< peaklist[m].numgenes; n++){
-			peaklist[m].numused+= peaklist[m].genelist[n].used;
-		}
-	}
-}
-
-void read_bkgrnd( char *filename ){
-	int j,n;
-	char val[500];
-	FILE *fp;
-
-	/*
-	 * first count number of genes, and allocate
-	 */
-	numbk = count_lines( fp, filename );;
-	bklist = (gene *) malloc(numbk*sizeof(gene));
-
-	for(n=0; n< numbk; n++){
-		bklist[n].values = (int *) malloc(numclasses*sizeof(int));
-	}
-
-	/*
-	 * now read in gene info
-	 */
-	fp = fopen(filename,"rt");
-	n=-1;
-	j=0;
-	while( fscanf(fp, "%s", &val ) != EOF ){
-		if( contains( val, "_a" ) == 1 ){
-			if( j==7 ) bklist[n].used = 0;
-			if( j==6+numclasses ) bklist[n].used = 1;
-			j=1;
-			n++;
-			strcpy( bklist[n].id, val );
-		}
-		else{
-			if( j==1 ) strcpy( bklist[n].name, val );
-			if( j==2 ) strcpy( bklist[n].chrom, val );
-			if( j==3 ) bklist[n].start = atol(val);
-			if( j==4 ) bklist[n].end = atol(val);
-			if( j==5 ) bklist[n].sense = atoi(val);
-			if( j>=6 ) bklist[n].values[j-6] = atoi(val);
-			j++;
-		}
-	}
-	if( j==7 ) bklist[n].used = 0;
-	if( j==6+numclasses) bklist[n].used = 1;
-	fclose(fp);
-
-	/*
-	 * now count how many genes are to be used for each
-	 */
-	bkused = 0;
-	for(n=0; n< numbk; n++){
-		bkused+= bklist[n].used;
-	}
-}
-
-/*
- * make initial guesses of z-values, set erase to 1
- * read in column toggle file if provided
- * also histogram background and scale into beta
- */
-void initialize( long *seed ){
-	int j,m,n;
-	int val;
-	FILE *fp;
-
-	for(m=0; m< numpeaks; m++){
-		for(n=0; n< peaklist[m].numgenes; n++){
-			if( peaklist[m].genelist[n].used == 1 ){
-				peaklist[m].genelist[n].erase = 1.0;
-				peaklist[m].genelist[n].z = 0.0; 
-			}
-		}
-		if( peaklist[m].numused > 0 ){
-			n = assign_gene( m, seed );
-			peaklist[m].genelist[n].z = 1.0; // pick one random gene from each set to read into motif model
-		}
-	}
-
-	col_tog = (int *) malloc(numclasses*sizeof(int));
-	col_tog_cpy = (int *) malloc(numclasses*sizeof(int));
-	if( set_togs == 0 ){
-		for(j=0; j< numclasses; j++){
-			col_tog[j] = 1;
-			col_tog_cpy[j] = 1;
-		}
-		usedclasses = numclasses;
-	}
-	else{
-		// first count number of fields
-		col_togl = 0;
-		fp = fopen(togs,"rt");
-		if( fp == NULL ){
-			printf("Error: can't open file %s\n", togs);
-			exit(0);
-		}
-		j=0;
-		while( fscanf(fp,"%i", &val) != EOF ){
-			col_togl++;
-		}
-		fclose(fp);
-		if( col_togl != numclasses ){
-			printf("Error: number of fields in toggle file (%i) is different from number of classifications in data (%i)\n", col_togl, numclasses);
-			exit(0);
-		}
-		// now read in values
-		fp = fopen(togs,"rt");
-		j=0;
-		usedclasses = 0;
-		while( fscanf(fp,"%i", &val) != EOF ){
-			col_tog[j] = val;
-			col_tog_cpy[j] = val;
-			j++;
-			if( val == 1 ) usedclasses++;
-			if( val != 1 && val != 0 ){
-				printf("Error: choose the toggle values to be 0 or 1\n");
-				exit(0);
-			}
-		}
-		fclose(fp);
-	}
-
-	for(n=0; n< numbk; n++){
-		bklist[n].z = 1.0;
-	}
-	beta_update();
-	if( verbose != 0 ) printf("   Indexing genes in first pass\n");
-	index_genes();
-}
-
-/*
- * make initial guesses of z-values for a second, etc motif
- * recount number of used genes per peak to those with erase > 0.5
- */
-void reinitialize( long *seed ){
-	int m,n;
-	for(m=0; m< numpeaks; m++){
-		peaklist[m].numused = 0;
-		for(n=0; n< peaklist[m].numgenes; n++){
-			peaklist[m].genelist[n].z = 0.0;
-			if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].erase> 0.5 ){
-				peaklist[m].numused++;
-			}
-		}
-		if( peaklist[m].numused > 0 ){
-			n = assign_gene( m, seed );
-			peaklist[m].genelist[n].z = 1.0; // pick one random gene from each set to read into motif model
-		}
-	}
-	for(n=0; n< numbk; n++){
-		bklist[n].z = 1.0;
-	}
-}
-
-/*
- * histogram background into beta
- */
-void beta_update(){
-	int j,k,n;
-	double tot=0.0;
-	bkhist_update();
-	for(j=0; j< numclasses; j++){
-//		if( col_tog[j] == 1 ){
-			Beta[j] = 0.0;
-			for(k=0; k< 5; k++){
-				beta[j][k] = hist[1][j][k]*mu;
-				Beta[j]+= beta[j][k];
-			}
-//		}
-	}
-}
-
-/*
- * for all the genes in peaklist[*].genelist[*], record the index of that gene in bklist
- */
-void index_genes(){
-	int m,n,nu;
-	int found;
-	for(m=0; m< numpeaks; m++){
-		for(n=0; n< peaklist[m].numgenes; n++){
-			if( peaklist[m].genelist[n].used == 1 ){
-				found = 0;
-				nu = 0;
-				while(nu< numbk && found == 0){
-					if( strcmp(peaklist[m].genelist[n].id, bklist[nu].id) == 0 ){
-						peaklist[m].genelist[n].index = nu;
-						found = 1;
-					}
-					nu++;
-				}
-			}
-		}
-	}
-}
-
-/*
- * update motif model histogram
- */
-void hist_update(){
-	int j,k,m,n;
-	double tot=0;
-	for(j=0; j< numclasses; j++){
-//		if( col_tog[j] == 1 ){
-			for(k=0; k< 5; k++){
-				hist[0][j][k] = 0.0;
-			}
-//		}
-	}
-	for(m=0; m< numpeaks; m++){
-		for(n=0; n< peaklist[m].numgenes; n++){
-			if( peaklist[m].genelist[n].used == 1 ){
-				tot+=peaklist[m].genelist[n].erase*peaklist[m].genelist[n].z;
-				for(j=0; j< numclasses; j++){
-//					if( col_tog[j] == 1 ){
-						k = peaklist[m].genelist[n].values[j]+2;
-						hist[0][j][k]+= peaklist[m].genelist[n].erase*peaklist[m].genelist[n].z;
-//					}
-				}
-			}
-		}
-	}
-	for(j=0; j< numclasses; j++){
-//		if( col_tog[j] == 1 ){
-			for(k=0; k< 5; k++){
-				hist[0][j][k]+= beta[j][k];
-				hist[0][j][k]/= (tot+Beta[j]);;
-			}
-//		}
-	}
-}
-
-/*
- * update background hist using z values from bklist
- */
-void bkhist_update(){
-	int j,k,n;
-	double tot=0.0;
-	for(j=0; j< numclasses; j++){
-//		if( col_tog[j] == 1 ){
-			for(k=0; k< 5; k++){
-				hist[1][j][k] = 0;
-			}
-//		}
-	}
-	for(n=0; n< numbk; n++){
-		if( bklist[n].used == 1 ){
-			tot+=bklist[n].z;
-			for(j=0; j< numclasses; j++){
-//				if( col_tog[j] == 1 ){
-					k = bklist[n].values[j]+2;
-					hist[1][j][k]+= bklist[n].z;
-//				}
-			}
-		}
-	}
-	for(j=0; j< numclasses; j++){
-//		if( col_tog[j] == 1 ){
-			for(k=0; k< 5; k++){
-				hist[1][j][k]/= tot;
-			}
-//		}
-	}
-}
-
-/*
- * update lambda value
- */
-void lambda_update(){
-	int m,n;
-	double tot=0.0;
-	lambda=0;
-	for(m=0; m< numpeaks; m++){
-		for(n=0; n< peaklist[m].numgenes; n++){
-			if( peaklist[m].genelist[n].used == 1 ){
-				lambda+= peaklist[m].genelist[n].z;
-				tot++;
-			}
-		}
-	}
-	lambda/=tot;
-}
-
-/*
- * update the toggle columns based on which columns have better than average relative entropy
- */
-void toggle_update(){
-	int j;
-	double avg=0.0;
-	avg = rsca*rel_entropy();
-	usedclasses=0;
-	for(j=0; j< numclasses; j++){
-		if( col_tog[j] == 1 ){
-			if( col_rel_entropy(j) < avg ) col_tog[j] = 0;
-			if( col_tog[j] == 1 ) usedclasses++;
-		}
-	}
-}
-
-/*
- * reset col_tog values with their original ones
- */
-void toggle_reset(){
-	int j;
-	usedclasses=0;
-	for(j=0; j< numclasses; j++){
-		col_tog[j] = col_tog_cpy[j];
-		if( col_tog[j] == 1 ) usedclasses++;
-	}
-}
-
-/*
- * update z
- */
-void z_update(){
-	int j,m,n;
-	double a, c, denom;
-	for(m=0; m< numpeaks; m++){
-		for(n=0; n< peaklist[m].numgenes; n++){
-			if( peaklist[m].genelist[n].used == 1 ){
-				// new z value for motif model
-				a = lprob(m,n,0)+log(lambda);
-				c = lprob(m,n,1)+log(1-lambda);
-				if( a> c ){
-					denom = a + log(1+exp(c-a));
-				}
-				else{
-					denom = c + log(1+exp(a-c));
-				}
-				peaklist[m].genelist[n].z = exp( lprob(m,n,0)+log(lambda) - denom );
-	//			peaklist[m].genelist[n].z = prob(m,n,0)*lambda/( prob(m,n,0)*lambda + prob(m,n,1)*(1-lambda) );
-				// new z value for corresponding gene in background model
-				bklist[peaklist[m].genelist[n].index].z = 1.0-peaklist[m].genelist[n].z;
-			}
-		}
-	}
-}
-
-/*
- * update erase parameters 
- */
-void erase_update(){
-	int j,m,n;
-	for(m=0; m< numpeaks; m++){
-		for(n=0; n< peaklist[m].numgenes; n++){
-			if( peaklist[m].genelist[n].used == 1 ){
-				peaklist[m].genelist[n].erase*= (1-peaklist[m].genelist[n].z);
-			}
-		}
-	}
-}
-
-/*
- * calculate (log) (multinomial) probability of sequence m,n in model l
- */
-double lprob( int m, int n, int l ){
-	int j,k;
-	double val=0.0;
-	for(j=0; j< numclasses; j++){
-		if( col_tog[j] == 1 ){
-			k = peaklist[m].genelist[n].values[j]+2;
-			val+= log(hist[l][j][k]);
-		}
-	}
-	return val;
-}
-double prob( int m, int n, int l ){
-	int j,k;
-	double val=1.0;
-	for(j=0; j< numclasses; j++){
-		if( col_tog[j] == 1 ){
-			k = peaklist[m].genelist[n].values[j]+2;
-			val*= hist[l][j][k];
-		}
-	}
-	return val;
-}
-
-/*
- * copy hist into oldhist before updating
- */
-void copy_hist(){
-	int j,k,l;
-	for(l=0; l< 2; l++){
-		for(j=0; j< numclasses; j++){
-//			if( col_tog[j] == 1 ){
-				for(k=0; k< 5; k++){
-					oldhist[l][j][k] = hist[l][j][k];
-				}
-//			}
-		}
-	}
-}
-
-/*
- * compute change in hist
- */
-double hist_diff(){
-	int j,k,l;
-	double val=0.0;
-	for(l=0; l< 2; l++){
-		for(j=0; j< numclasses; j++){
-			if( col_tog[j] == 1 ){
-				for(k=0; k< 5; k++){
-					val+= fabs( oldhist[l][j][k]-hist[l][j][k] );
-				}
-			}
-		}
-	}
-	return val;
-}
-
-/*
- * calculate the score of a sequence
- */
-double score( int m, int n ){
-	int j,k;
-	double val=0.0;
-	for(j=0; j< numclasses; j++){
-		if( col_tog[j] == 1 ){
-			k = peaklist[m].genelist[n].values[j]+2;
-			val+= log( hist[0][j][k]/hist[1][j][k] );
-		}
-	}
-	return val;
-}
-
-/*
- * calculate value of t (threshold)
- */
-void t_update(){
-	t = tscale*log( (1-lambda)/lambda );
-}
-
-/*
- * quality scores of a motif, averaging over only those sequences picked as targets
- *   ElogL: log-likelihood of the combination of models, averaged by # genes
- *   ratio_score: log-likelihood ratio of motif vs background models
- *   rel_entropy: relative entropy of motif to background
- *   col_rel_entropy: same as rel_entropy, but for just one column (rel_entropy is average over all cols)
- *   chisq: chi-square statistic
- */
-double ElogL(){
-	int m,n;
-	double val=0.0, tot=0.0;
-	for(m=0; m< numpeaks; m++){
-		for(n=0; n< peaklist[m].numgenes; n++){
-			if( peaklist[m].genelist[n].used == 1 ){
-				val+=   peaklist[m].genelist[n].z*lprob(m, n, 0 )    \
-				      + (1-peaklist[m].genelist[n].z)*lprob(m, n, 1 )\
-				      + peaklist[m].genelist[n].z*log(lambda)       \
-				      + (1-peaklist[m].genelist[n].z)*log(1-lambda);
-				tot++;
-			}
-		}
-	}
-	return val/tot;
-}
-
-double ratio_score(){
-	int m,n;
-	double val=0.0;
-	double tot=0.0;
-	for(m=0; m< numpeaks; m++){
-		for(n=0; n< peaklist[m].numgenes; n++){
-			if( peaklist[m].genelist[n].assigned == 1 ){
-				val+= lprob(m,n,0)-lprob(m,n,1);
-				tot++;
-			}
-		}
-	}
-	return val/tot;
-}
-
-double rel_entropy(){
-	int j,k;
-	double val=0.0;
-	for(j=0; j< numclasses; j++){
-		if( col_tog[j] == 1 ){
-			val+= col_rel_entropy( j );
-		}
-	}
-	return val/usedclasses;
-}
-
-double col_rel_entropy( int j ){
-	int k;
-	double val=0.0;
-	for(k=0; k< 5; k++){
-		if( hist[1][j][j] > 0 && hist[0][j][k] > 0 ){
-			// do in log base 2 for bits
-			val+= hist[0][j][k]*log(hist[0][j][k]/hist[1][j][k])/log(2);
-		}
-	}
-	return val;
-}
-
-double chisq(){
-	int j, k;
-	double val=0.0;
-	for(j=0; j< numclasses; j++){
-		if( col_tog[j] == 1 ){
-			for(k=0; k< 5; k++){
-				val+= pow(hist[0][j][k] - hist[1][j][k],2)/hist[1][j][k];
-			}
-		}
-	}
-	return val;
-}
-	
-/*
- * number of members of a motif model
- */
-int motif_count(){
-	int m,n;
-	int tot=0;
-	for(m=0; m< numpeaks; m++){
-		for(n=0; n< peaklist[m].numgenes; n++){
-			if( peaklist[m].genelist[n].assigned == 1 ){
-				tot++;
-			}
-		}
-	}
-	return tot;
-}
-
-/*
- * assign a random used gene: one that is used and hasn't been erased
- */
-int assign_gene( int m, long *seed ){
-	int n;
-	n = floor( peaklist[m].numgenes*ran1(seed) );
-	while( peaklist[m].genelist[n].used == 0 || peaklist[m].genelist[n].erase < 0.5 ){
-		n = floor( peaklist[m].numgenes*ran1(seed) );
-	}
-	return n;
-}
-
-/*
- * score all genes using the models
- */
-void score_genes(){
-	int m,n;
-	for(m=0; m< numpeaks; m++){
-		for(n=0; n< peaklist[m].numgenes; n++){
-			if( peaklist[m].genelist[n].used == 1 ){
-				peaklist[m].genelist[n].score= score( m, n );
-			}
-		}
-	}	
-}
-
-/*
- * choose targets using the scores, and ZOOPS setting
- * zoops = 1 -> 0 or 1 target/strand
- * else -> all above threshold
- */
-void choose_targets(){
-	int m,n;
-	int best;
-	for(m=0; m< numpeaks; m++){
-		peaklist[m].assgngene = 0;
-		best=-1;
-		for(n=0; n< peaklist[m].numgenes; n++){
-			if( peaklist[m].genelist[n].used == 1 ){
-				if( peaklist[m].genelist[n].score > t ){
-					peaklist[m].genelist[n].assigned = 1;
-					peaklist[m].assgngene++;
-				}
-				else peaklist[m].genelist[n].assigned = 0;
-				if( best == -1 ) best = n;
-				else if( peaklist[m].genelist[n].score > peaklist[m].genelist[best].score ) best = n;
-			}
-		}
-		if( zoops == 1 ){
-			for(n=0; n< peaklist[m].numgenes; n++){
-				if( peaklist[m].genelist[n].used == 1 && n != best ) peaklist[m].genelist[n].assigned = 0;
-			}
-			if( peaklist[m].genelist[best].score > t ) peaklist[m].assgngene = 1;
-			else peaklist[m].assgngene = 0;
-		}
-	}
-}
-
-/*
- * functions to print to output (fp must already be opened for top two)
- */
-void print_peak( FILE *fp, int m ){
-	int n;
-	fprintf(fp,"PEAK: %s %ld %s\n", peaklist[m].chrom, peaklist[m].pkposn, peaklist[m].info);
-	for(n=0; n< peaklist[m].numgenes; n++){
-		if( printall == 0 ){
-			if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].assigned == 1 ){
-				print_gene( fp, m, n );
-			}
-		}
-		if( printall == 1 ){
-			if( peaklist[m].genelist[n].used == 1 ){
-				print_gene( fp, m, n );
-			}
-		}
-	}
-}
-
-void print_gene( FILE *fp, int m, int n ){
-	int j;
-	if( peaklist[m].genelist[n].used == 1 ){
-		fprintf(fp, "TGENE: %0.3f %s %s %s %ld %ld %i", peaklist[m].genelist[n].score, peaklist[m].genelist[n].id, peaklist[m].genelist[n].name, peaklist[m].genelist[n].chrom, peaklist[m].genelist[n].start, peaklist[m].genelist[n].end, peaklist[m].genelist[n].sense);
-		for(j=0; j< numclasses; j++){
-			if( col_tog[j] == 0 ) fprintf(fp," %i ", peaklist[m].genelist[n].values[j]);
-			else fprintf(fp," %i", peaklist[m].genelist[n].values[j]);
-		}
-		fprintf(fp,"\n");
-	}
-}
-
-void print_state(FILE *fp, char *filename){
-	int m;
-	fp = fopen(filename,"w");
-	for(m=0; m< numpeaks; m++){
-		print_peak(fp, m);
-	}
-	fclose(fp);
-}
-
-/*
- * this prints a list of <region id> <name>
- */
-/****************
-void print_targets( FILE *fp, char *filename ){
-	int m,n,nu;
-	fp = fopen(filename,"w");
-	for(m=0; m< numpeaks; m++){
-		if( peaklist[m].assgngene > 0 ){
-			fprintf(fp,"%s ", peaklist[m].regid);
-			n=0;
-			nu=0;
-			while( n< peaklist[m].numgenes ){
-				if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].assigned == 1 ){
-					if( nu< peaklist[m].assgngene-1 ) fprintf(fp,"%s,", peaklist[m].genelist[n].name);
-					else fprintf(fp,"%s\n", peaklist[m].genelist[n].name);
-					nu++;
-				}
-				n++;
-			}
-		}
-	}
-	fclose(fp);
-}
-*************/
-
-/*
- * print the motif model and its score
- */
-void print_model( FILE *fp, char *filename ){
-	int j,k;
-	fp = fopen(filename,"w");
-	fprintf(fp,"comparison\\class; avg relative entropy %0.3f, %i genes, threshold %0.3f, quality %0.1f\n", rel_entropy(), motif_count(), t, rel_entropy()*motif_count() );
-	fprintf(fp,"             ++       +       0       -      --\n");
-	for(j=0; j< numclasses; j++){
-		if( j< 10 ) fprintf(fp," ");
-		if( col_tog[j] == 0 ) fprintf(fp,"#");
-		else fprintf(fp," ");
-		fprintf(fp,"%i,%.3f",j+1, col_rel_entropy(j) );
-		for(k=4; k>=0; k--){
-			if( hist[1][j][k] > 0 ) fprintf(fp,"%s", nice_d(log(hist[0][j][k]/hist[1][j][k])) );
-			else fprintf(fp,"      NA");
-		}
-		fprintf(fp,"\n");
-	}
-	fclose(fp);
-}
-
-/*
- * prints a double with 3 digits, aligned by spaces
- * longest would be -999.000, so 8 characters
- * shortest would be 1.000, 5 characters
- * up to 3 spaces
- */
-char *nice_d( double val ){
-	char *returnval;
-	char *dval;
-	char *spaces;
-	int i, numspaces=0;
-
-	returnval = (char *) malloc(10*sizeof(char));	
-	dval = (char *) malloc(10*sizeof(char));
-	spaces = (char *) malloc(10*sizeof(char));
-	
-	sprintf(dval,"%.3f", val);
-	
-	if( val >= 0 ) numspaces++;
-	if( fabs(val) < 100 ){
-		if( fabs(val) < 10 ) numspaces+=2;
-		else numspaces++;
-	}
-	if( numspaces == 0 ) sprintf(spaces,"");
-	if( numspaces == 1 ) sprintf(spaces," ");
-	if( numspaces == 2 ) sprintf(spaces,"  ");
-	if( numspaces == 3 ) sprintf(spaces,"   ");
-	sprintf(returnval,"%s%s", spaces, dval);
-	return returnval;
-}
-
-/*
- * print the peaks only in annotation format, for use in comppeaks, subpeaks, etc
- */
-/**************
-void print_annotpeak( FILE *fp, char *filename, int motif ){
-	int m;
-	fp = fopen(filename,"w");
-	for(m=0; m< numpeaks; m++){
-		if( peaklist[m].assgngene > 0 ){
-			fprintf(fp,"%s-%i %s %s %ld %ld %ld ", peaklist[m].filename, motif, peaklist[m].regid, peaklist[m].chrom, peaklist[m].pkposn, peaklist[m].start, peaklist[m].end );
-			fprintf(fp,"%f 1.0 1.0 1 1 1.0 ", peaklist[m].height );
-			fprintf(fp,"1 1.0 1.0 1.0 1 -1.0 poop 1\n");
-		}
-	}
-	close(fp);
-}
-*************/
-