Mercurial > repos > mmaiensc > ember
changeset 1:cd8def22f313
Deleted selected files
author | mmaiensc |
---|---|
date | Wed, 29 Feb 2012 15:12:29 -0500 |
parents | 003f802d4c7d |
children | 1a84b8178b45 |
files | GALAXY_FILES/tools/EMBER/Ember_Galaxy.c |
diffstat | 1 files changed, 0 insertions(+), 1227 deletions(-) [+] |
line wrap: on
line diff
--- a/GALAXY_FILES/tools/EMBER/Ember_Galaxy.c Wed Feb 29 15:03:33 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1227 +0,0 @@ -/* - * ember.c : Expectation Maximization of Binding and Expression pRofiles - * ---------------------------------------------------------------- - * algorithm adapted from MEME algorithm: Bailey and Elkan, Proc Int Conf Intell Syst Mol Biol, 2:28-36 (1994). - * See Eqs. 3-13, plus "Implementation of MM" section for detailed description - * changes are as follows: - * background changes with column (j=1,...,numclasses), so it is a 2D histogram of the same size as the motif model; same for psuedocounts - * background model is determined by full list of genes (second input); z-values of genes in first input are subtracted - * update of background can be turned off by commenting out 8th line of z_update() - * in this case, also comment out index_genes at end of initialize(), as it will save some time - * sequences cannot overlap, since they are fixed per gene; this simplifies the enumeration of all possibilities - * a ZOOPS options is included (change in code, parameter zoops); this does not alter the algorithm, - * only adds a restriction on which genes are chosen as targets in choose_targets() - * general notes: - * the common dimensions are: - * numpeaks - number of peaks, size of peaklist, indexed by m - * peaklist[m].numgenes - number of genes matched to a peak, size of peaklist[m].genelist, indexed by n - * numclasses - number of comparisons made (i.e. motif width), indexed by j - * 5 - number of classifications in each comparison, assumed to be -2,-1,0,1,2, indexed by k - * 2 - number of models (motif=0 and background=1), indexed by l - * nmots - total number of motifs to find, indexed by i in main (always stored at l=0, as found motifs are erased) - * col_tog can be used to force the algorithm to ignore certain columns (behavior dimensions) - * - * things to add/consider: - * - best measure of motif quality score? - * - want to ensure that the same sequence can't get assigned to more than one motif? - */ - -#include "all.h" - -typedef struct { - char id[50]; - char name[500]; - char chrom[50]; - long start; - long end; - int sense; - double z; - double erase; - double score; - int index; // will refer to index of this id in bklist - int used; // set to 0 if a non-classified gene - int assigned; // set to 1 if assigned - int *values; -} gene; - -typedef struct { - char info[5000]; - char chrom[50]; - long pkposn; - int numgenes; - int numused; - int assgngene; // set to # targets picked for this gene - gene *genelist; -} peak; - -// FREE PARAMETERS TO CHOOSE IN COMMAND LINE -double mu; // scale background histogram by mu for psuedocounts -double tscale; // scales the threshold (to make choices more or less stringent) -int zoops; // set to 1 to enforce ZOOPS model (Zero Or One Per Strand) -int printall; // set to 1 to print all genes, not just targets -int mots; // number of motifs to search for -int repeat; // number of times to wean columns and repeat search -double rsca; // percent of average to set relative entropy cutoff -// END PARAMETERS TO CHOOSE - -char *input, *background, *pref, *togs; -char *target_out, *list_out, *model_out, *annot_out; -long iseedval; -int numpeaks, numclasses, usedclasses, numbk, bkused, nmots; -peak *peaklist; -gene *bklist; -double ***hist, ***oldhist, lambda, t, **beta; -double *Beta; -int *col_tog, col_togl, set_togs, *col_tog_cpy; -int verbose; - -void read_cmd_line( int argc, char **argv ); -void print_usage(); -void read_data( char *filename ); -void read_bkgrnd( char *filename ); -void initialize( long *seed ); -void reinitialize( long *seed ); -void beta_update(); -void index_genes(); - -void hist_update(); -void bkhist_update(); -void lambda_update(); -void toggle_update(); -void toggle_reset(); -void z_update(); -void erase_update(); -double lprob( int m, int n, int l ); -double prob( int m, int n, int l ); -void copy_hist(); -double hist_diff(); -double score( int m, int n ); -void t_update(); -double ElogL(); -double ratio_score(); -double rel_entropy(); -double col_rel_entropy( int j ); -double chisq(); -int motif_count(); - -int assign_gene( int m, long *seed ); -void score_genes(); -void choose_targets(); - -void print_peak( FILE *fp, int i ); -void print_gene( FILE *fp, int i, int j ); -void print_state( FILE *fp, char *filename ); -//void print_targets( FILE *fp, char *filename ); -void print_model( FILE *fp, char *filename ); -//void print_annotpeak( FILE *fp, char *filename, int motif ); -char *nice_d( double val ); - -int main(int argc, char *argv[]){ - int i,iter,r; - int pfreq; - double diff, conv = 1e-6; - long seed; - FILE *fout; - - if(argc == 1 ){ - print_usage(); - } - - /* - * read command line params - */ - read_cmd_line( argc, argv ); - - /* - * allocate output names - */ -// target_out = (char *) malloc((strlen(pref)+20)*sizeof(char)); -// list_out = (char *) malloc((strlen(pref)+20)*sizeof(char)); -// model_out = (char *) malloc((strlen(pref)+20)*sizeof(char)); -// annot_out = (char *) malloc((strlen(pref)+20)*sizeof(char)); - - /* - * read in data, copy values from global to local variables - */ - if( verbose != 0 ) printf("\nReading data and allocating\n"); - read_data( input ); - read_bkgrnd( background ); - nmots = mots; - seed = iseedval; - - /* - * allocate memory for other arrays - */ - hist = (double ***) alloc3d(sizeof(double),2,numclasses,5); - oldhist = (double ***) alloc3d(sizeof(double),2,numclasses,5); - beta = (double **) alloc2d(sizeof(double),numclasses,5); - Beta = (double *) malloc(numclasses*sizeof(double)); - - /* - * find nmots motif models - */ - for(i=1; i<= nmots; i++){ - /* - * initialize, fill initial parameter values from z-guesses - */ - if( verbose != 0 ) printf("Expression Pattern %i,\n Initializing\n", i); - if( i==1 ) initialize( &seed ); - else reinitialize( &seed ); - hist_update(); - lambda_update(); - - /* - * now sample for awhile, repeating repeat times - * each repeat starts at the previously refined search (?) - */ - for(r=0; r<= repeat; r++){ - if( verbose != 0 ) printf(" Sampling\n"); - if( verbose != 0 && repeat> 0 ) printf(" Repeat %i\n", r ); - diff = 1; - pfreq = 10; - iter=1; - while( diff > conv ){ - z_update(); // E step - copy_hist(); - hist_update(); // M step - bkhist_update(); // M step - lambda_update(); // M step - diff = hist_diff(); - if( iter%pfreq == 0 && verbose != 0 ) printf(" iteration %i, (normalized) ElogL %.2f, change %.2e\n", iter, ElogL(), diff); - iter++; - } - if( repeat > 0 && r< repeat ) toggle_update(); - } - - /* - * score and choose final targets - */ - t_update(); - if( verbose != 0 ) printf(" Done, scoring and choosing targets on threshold %.3f\n\n", t); - score_genes(); - choose_targets(); - - /* - * print final targets - */ -// sprintf( target_out,"%s-%i.targets", pref, i); -// sprintf( list_out,"%s-%i.list", pref, i); -// sprintf( model_out,"%s-%i.model", pref, i); -// sprintf( annot_out,"%s-%i.xls", pref, i); - print_state(fout, target_out); - print_model(fout, model_out); -// print_targets(fout, list_out); -// print_annotpeak( fout, annot_out, i); - - /* - * erase found motifs (only if we're going around again) - */ - if( i< nmots ) erase_update(); - if( repeat > 0 ) toggle_reset(); - } - -} - -/*==================== FUNCTIONS ======================*/ - -/* - * read command line inputs and options - */ -void read_cmd_line( int argc, char **argv ){ - int i, j; - int found_reqs=0; - - // default values - mots = 1; - zoops = 0; - printall = 0; - mu = 0.1; - tscale = 1.0; - iseedval = make_seed(); - set_togs = 0; - verbose = 1; - repeat = 0; - rsca = 1.0; - - // parse options - for(i=1; i< argc; i++){ - if( argv[i][0] != '-' ) print_usage(); - i++; - if( i >= argc || argv[i][0] == '-' ) print_usage(); - switch( argv[i-1][1] ){ - case 'i': - input = (char *) malloc((strlen(argv[i])+5)*sizeof(char)); - strcpy(input, argv[i]); - found_reqs++; - break; - case 'b': - background = (char *) malloc((strlen(argv[i])+5)*sizeof(char)); - strcpy(background, argv[i]); - found_reqs++; - break; - case 'o': - target_out = (char *) malloc((strlen(argv[i])+5)*sizeof(char)); - strcpy(target_out, argv[i]); - found_reqs++; - break; - case 'p': - model_out = (char *) malloc((strlen(argv[i])+5)*sizeof(char)); - strcpy(model_out, argv[i]); - found_reqs++; - break; - case 'n': - mots = atoi(argv[i]); - if( mots <= 0 ){ - printf("\nError: choose number of expression patterns to be at least 1\n"); - print_usage(); - } - break; - case 'z': - if( argv[i][0] == 'y' ) zoops = 1; - else if( argv[i][0] == 'n' ) zoops = 0; - else{ - printf("\nError: if using -z option, choose y or n\n"); - print_usage(); - } - break; - case 'v': - if( argv[i][0] == 'y' ) verbose = 1; - else if( argv[i][0] == 'n' ) verbose = 0; - else{ - printf("\nError: if using -v option, choose y or n\n"); - print_usage(); - } - break; - case 'm': - mu = atof(argv[i]); - if( mu<= 0 ){ - printf("\nError: choose mu to be positive\n"); - print_usage(); - } - break; - case 't': - tscale = atof(argv[i]); - if( tscale<= 0 ){ - printf("\nError: choose threshold to be positive\n"); - print_usage(); - } - break; - case 's': - iseedval = -1*atol(argv[i]); - if( iseedval>= 0 ){ - printf("\nError: choose seed to be a large positive integer\n"); - print_usage(); - } - break; - case 'c': - set_togs = 1; - togs = (char *) malloc((strlen(argv[i])+5)*sizeof(char)); - strcpy( togs, argv[i] ); - break; - case 'a': - if( argv[i][0] == 'y' ) printall = 1; - else if( argv[i][0] == 'n' ) printall = 0; - else{ - printf("\nError: if using -a option, choose y or n\n"); - print_usage(); - } - break; - case 'r': - repeat = atoi(argv[i]); - if( repeat< 0 ){ - printf("\nError: choose repeat to be a non-negative integer\n"); - print_usage(); - } - break; - case 'R': - rsca = atof(argv[i]); - if( rsca <= 0 ){ - printf("\nError: choose repeat scalar to be positive\n"); - print_usage(); - } - break; - default: - printf("\nError: unkown option: -%c\n", argv[i-1][1]); - print_usage(); - } - } - // double check that the correct number of required inputs has been read in - if( found_reqs != 4 ){ - printf("\nError: improper number of required inputs found\n"); - print_usage(); - } -} - -/* - * usage info - */ -void print_usage(){ - printf("\nUsage: -i matched_data -b background_data -o target_output_name -p model_output_name [option value]\n"); - printf("Options:\n"); - printf(" -n number of expression patterns to find (positive integer, default 1)\n"); - printf(" -z enforce ZOOPS model (y or n, default n)\n"); - printf(" -m scalar for psuedocounts (positive real number, default 0.1)\n"); - printf(" -t scalar for threshold (positive real number, default 1.0)\n"); - printf(" -s seed (positive long integer, default chosen from date and time)\n"); - printf(" -c column toggle file, used to ignore certain classifications (default: all on)\n"); - printf(" -v verboseness, turn on/off printing to stdout (y or n, default: y)\n"); - printf(" -a print all genes (and scores), rather than only those above the threshold (y or n, default: n)\n"); - printf(" -r repeat search: after convergence of each expression pattern, turn off columns with lower relative entropy\n"); - printf(" than average*scalar and repeat expression pattern search (non-negative integer, default 0)\n"); - printf(" -R relative entropy scalar: if using the -r option, multiplies the average relative entropy\n"); - printf(" by this value to set the threshold for turning off columns (positive real number, default 1.0)\n"); - printf("\n"); - exit(0); -} - -/* - * function: read data from filename into the peaklist data structure - * notes on assumed format of data: - * peak lines start with "PEAK:" field - * gene lines start with "GENE:" field - * gene lines with only 1 classification are ignored - * assume all fields are <5000 characters - */ -void read_data( char *filename ){ - int j,m,n; - char val[5000], other[5000]; - FILE *fp; - - /* - * first count number of peaks, and allocate - */ - fp = fopen(filename,"rt"); - if( fp == NULL ){ - printf("Error: can't open file %s\n", filename); - exit(0); - } - numpeaks = 0; - while( fscanf(fp, "%s", &val ) != EOF ){ - if( contains( val, "PEAK" ) == 1 ){ - numpeaks++; - } - } - fclose(fp); - peaklist = (peak *) malloc(numpeaks*sizeof(peak)); - - /* - * now count number of genes w/>1 classification, and allocate - * also count number of classifications, and allocate that at the end - * n tracks number of genes, j field; subtract genes that don't have full classifications - * m tracks peak index; record peak info - */ - fp = fopen(filename,"rt"); - n=-1; - m=-1; - j=0; - sprintf(other,"NA"); - numclasses=0; - while( fscanf(fp, "%s", &val ) != EOF ){ - if( contains( val, "PEAK" ) == 1 ){ - if( j==7 ) ; - else{ - if( j-6 > numclasses ) numclasses = j-6; - } - if( m>=0 ){ - peaklist[m].numgenes = n; - peaklist[m].genelist = (gene *) malloc( peaklist[m].numgenes*sizeof(gene)); - } - n=0; - j=0; - if( strcmp(other, "NA") != 0 ) strcpy(peaklist[m].info, other); - m++; - sprintf(other,"NA"); - } - else if( contains( val, "GENE" ) == 1 ){ - n++; - j=0; - } - else if( n==0 ){ - if( j==0 ) strcpy(peaklist[m].chrom, val); - if( j==1 ) peaklist[m].pkposn = atol(val); - if( j> 1 ){ - if( strcmp(other, "NA") == 0 ) strcpy(other, val); - else sprintf(other,"%s %s", other, val); - } - j++; - } - else{ - j++; - } - } - peaklist[m].numgenes = n; - peaklist[m].genelist = (gene *) malloc( peaklist[m].numgenes*sizeof(gene)); - strcpy( peaklist[m].info, other); - fclose(fp); - - for(m=0; m< numpeaks; m++){ - for(n=0; n< peaklist[m].numgenes; n++){ - peaklist[m].genelist[n].values = (int *) malloc(numclasses*sizeof(int)); - } - } - - /* - * now read in gene info - */ - fp = fopen(filename,"rt"); - m=-1; - j=0; - while( fscanf(fp, "%s", &val ) != EOF ){ - if( contains( val, "PEAK" ) == 1 ){ - if( j==7 ) peaklist[m].genelist[n].used = 0; - if( j==6+numclasses) peaklist[m].genelist[n].used = 1; - n=-1; - j=0; - m++; - } - else if( contains( val, "GENE" ) == 1 ){ - if( j==7 ) peaklist[m].genelist[n].used = 0; - if( j==6+numclasses ) peaklist[m].genelist[n].used = 1; - n++; - j=0; - } - else if( n==-1 ){ - } - else { - if( j==0 ) strcpy( peaklist[m].genelist[n].id, val ); - if( j==1 ) strcpy( peaklist[m].genelist[n].name, val ); - if( j==2 ) strcpy( peaklist[m].genelist[n].chrom, val ); - if( j==3 ) peaklist[m].genelist[n].start = atol(val); - if( j==4 ) peaklist[m].genelist[n].end = atol(val); - if( j==5 ) peaklist[m].genelist[n].sense = atoi(val); - if( j>=6 ) peaklist[m].genelist[n].values[j-6] = atoi(val); - j++; - } - } - if( j==7 ) peaklist[m].genelist[n].used = 0; - if( j==6+numclasses) peaklist[m].genelist[n].used = 1; - fclose(fp); - - /* - * now count how many genes are to be used for each - */ - for(m=0; m< numpeaks; m++){ - peaklist[m].numused = 0; - for(n=0; n< peaklist[m].numgenes; n++){ - peaklist[m].numused+= peaklist[m].genelist[n].used; - } - } -} - -void read_bkgrnd( char *filename ){ - int j,n; - char val[500]; - FILE *fp; - - /* - * first count number of genes, and allocate - */ - numbk = count_lines( fp, filename );; - bklist = (gene *) malloc(numbk*sizeof(gene)); - - for(n=0; n< numbk; n++){ - bklist[n].values = (int *) malloc(numclasses*sizeof(int)); - } - - /* - * now read in gene info - */ - fp = fopen(filename,"rt"); - n=-1; - j=0; - while( fscanf(fp, "%s", &val ) != EOF ){ - if( contains( val, "_a" ) == 1 ){ - if( j==7 ) bklist[n].used = 0; - if( j==6+numclasses ) bklist[n].used = 1; - j=1; - n++; - strcpy( bklist[n].id, val ); - } - else{ - if( j==1 ) strcpy( bklist[n].name, val ); - if( j==2 ) strcpy( bklist[n].chrom, val ); - if( j==3 ) bklist[n].start = atol(val); - if( j==4 ) bklist[n].end = atol(val); - if( j==5 ) bklist[n].sense = atoi(val); - if( j>=6 ) bklist[n].values[j-6] = atoi(val); - j++; - } - } - if( j==7 ) bklist[n].used = 0; - if( j==6+numclasses) bklist[n].used = 1; - fclose(fp); - - /* - * now count how many genes are to be used for each - */ - bkused = 0; - for(n=0; n< numbk; n++){ - bkused+= bklist[n].used; - } -} - -/* - * make initial guesses of z-values, set erase to 1 - * read in column toggle file if provided - * also histogram background and scale into beta - */ -void initialize( long *seed ){ - int j,m,n; - int val; - FILE *fp; - - for(m=0; m< numpeaks; m++){ - for(n=0; n< peaklist[m].numgenes; n++){ - if( peaklist[m].genelist[n].used == 1 ){ - peaklist[m].genelist[n].erase = 1.0; - peaklist[m].genelist[n].z = 0.0; - } - } - if( peaklist[m].numused > 0 ){ - n = assign_gene( m, seed ); - peaklist[m].genelist[n].z = 1.0; // pick one random gene from each set to read into motif model - } - } - - col_tog = (int *) malloc(numclasses*sizeof(int)); - col_tog_cpy = (int *) malloc(numclasses*sizeof(int)); - if( set_togs == 0 ){ - for(j=0; j< numclasses; j++){ - col_tog[j] = 1; - col_tog_cpy[j] = 1; - } - usedclasses = numclasses; - } - else{ - // first count number of fields - col_togl = 0; - fp = fopen(togs,"rt"); - if( fp == NULL ){ - printf("Error: can't open file %s\n", togs); - exit(0); - } - j=0; - while( fscanf(fp,"%i", &val) != EOF ){ - col_togl++; - } - fclose(fp); - if( col_togl != numclasses ){ - printf("Error: number of fields in toggle file (%i) is different from number of classifications in data (%i)\n", col_togl, numclasses); - exit(0); - } - // now read in values - fp = fopen(togs,"rt"); - j=0; - usedclasses = 0; - while( fscanf(fp,"%i", &val) != EOF ){ - col_tog[j] = val; - col_tog_cpy[j] = val; - j++; - if( val == 1 ) usedclasses++; - if( val != 1 && val != 0 ){ - printf("Error: choose the toggle values to be 0 or 1\n"); - exit(0); - } - } - fclose(fp); - } - - for(n=0; n< numbk; n++){ - bklist[n].z = 1.0; - } - beta_update(); - if( verbose != 0 ) printf(" Indexing genes in first pass\n"); - index_genes(); -} - -/* - * make initial guesses of z-values for a second, etc motif - * recount number of used genes per peak to those with erase > 0.5 - */ -void reinitialize( long *seed ){ - int m,n; - for(m=0; m< numpeaks; m++){ - peaklist[m].numused = 0; - for(n=0; n< peaklist[m].numgenes; n++){ - peaklist[m].genelist[n].z = 0.0; - if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].erase> 0.5 ){ - peaklist[m].numused++; - } - } - if( peaklist[m].numused > 0 ){ - n = assign_gene( m, seed ); - peaklist[m].genelist[n].z = 1.0; // pick one random gene from each set to read into motif model - } - } - for(n=0; n< numbk; n++){ - bklist[n].z = 1.0; - } -} - -/* - * histogram background into beta - */ -void beta_update(){ - int j,k,n; - double tot=0.0; - bkhist_update(); - for(j=0; j< numclasses; j++){ -// if( col_tog[j] == 1 ){ - Beta[j] = 0.0; - for(k=0; k< 5; k++){ - beta[j][k] = hist[1][j][k]*mu; - Beta[j]+= beta[j][k]; - } -// } - } -} - -/* - * for all the genes in peaklist[*].genelist[*], record the index of that gene in bklist - */ -void index_genes(){ - int m,n,nu; - int found; - for(m=0; m< numpeaks; m++){ - for(n=0; n< peaklist[m].numgenes; n++){ - if( peaklist[m].genelist[n].used == 1 ){ - found = 0; - nu = 0; - while(nu< numbk && found == 0){ - if( strcmp(peaklist[m].genelist[n].id, bklist[nu].id) == 0 ){ - peaklist[m].genelist[n].index = nu; - found = 1; - } - nu++; - } - } - } - } -} - -/* - * update motif model histogram - */ -void hist_update(){ - int j,k,m,n; - double tot=0; - for(j=0; j< numclasses; j++){ -// if( col_tog[j] == 1 ){ - for(k=0; k< 5; k++){ - hist[0][j][k] = 0.0; - } -// } - } - for(m=0; m< numpeaks; m++){ - for(n=0; n< peaklist[m].numgenes; n++){ - if( peaklist[m].genelist[n].used == 1 ){ - tot+=peaklist[m].genelist[n].erase*peaklist[m].genelist[n].z; - for(j=0; j< numclasses; j++){ -// if( col_tog[j] == 1 ){ - k = peaklist[m].genelist[n].values[j]+2; - hist[0][j][k]+= peaklist[m].genelist[n].erase*peaklist[m].genelist[n].z; -// } - } - } - } - } - for(j=0; j< numclasses; j++){ -// if( col_tog[j] == 1 ){ - for(k=0; k< 5; k++){ - hist[0][j][k]+= beta[j][k]; - hist[0][j][k]/= (tot+Beta[j]);; - } -// } - } -} - -/* - * update background hist using z values from bklist - */ -void bkhist_update(){ - int j,k,n; - double tot=0.0; - for(j=0; j< numclasses; j++){ -// if( col_tog[j] == 1 ){ - for(k=0; k< 5; k++){ - hist[1][j][k] = 0; - } -// } - } - for(n=0; n< numbk; n++){ - if( bklist[n].used == 1 ){ - tot+=bklist[n].z; - for(j=0; j< numclasses; j++){ -// if( col_tog[j] == 1 ){ - k = bklist[n].values[j]+2; - hist[1][j][k]+= bklist[n].z; -// } - } - } - } - for(j=0; j< numclasses; j++){ -// if( col_tog[j] == 1 ){ - for(k=0; k< 5; k++){ - hist[1][j][k]/= tot; - } -// } - } -} - -/* - * update lambda value - */ -void lambda_update(){ - int m,n; - double tot=0.0; - lambda=0; - for(m=0; m< numpeaks; m++){ - for(n=0; n< peaklist[m].numgenes; n++){ - if( peaklist[m].genelist[n].used == 1 ){ - lambda+= peaklist[m].genelist[n].z; - tot++; - } - } - } - lambda/=tot; -} - -/* - * update the toggle columns based on which columns have better than average relative entropy - */ -void toggle_update(){ - int j; - double avg=0.0; - avg = rsca*rel_entropy(); - usedclasses=0; - for(j=0; j< numclasses; j++){ - if( col_tog[j] == 1 ){ - if( col_rel_entropy(j) < avg ) col_tog[j] = 0; - if( col_tog[j] == 1 ) usedclasses++; - } - } -} - -/* - * reset col_tog values with their original ones - */ -void toggle_reset(){ - int j; - usedclasses=0; - for(j=0; j< numclasses; j++){ - col_tog[j] = col_tog_cpy[j]; - if( col_tog[j] == 1 ) usedclasses++; - } -} - -/* - * update z - */ -void z_update(){ - int j,m,n; - double a, c, denom; - for(m=0; m< numpeaks; m++){ - for(n=0; n< peaklist[m].numgenes; n++){ - if( peaklist[m].genelist[n].used == 1 ){ - // new z value for motif model - a = lprob(m,n,0)+log(lambda); - c = lprob(m,n,1)+log(1-lambda); - if( a> c ){ - denom = a + log(1+exp(c-a)); - } - else{ - denom = c + log(1+exp(a-c)); - } - peaklist[m].genelist[n].z = exp( lprob(m,n,0)+log(lambda) - denom ); - // peaklist[m].genelist[n].z = prob(m,n,0)*lambda/( prob(m,n,0)*lambda + prob(m,n,1)*(1-lambda) ); - // new z value for corresponding gene in background model - bklist[peaklist[m].genelist[n].index].z = 1.0-peaklist[m].genelist[n].z; - } - } - } -} - -/* - * update erase parameters - */ -void erase_update(){ - int j,m,n; - for(m=0; m< numpeaks; m++){ - for(n=0; n< peaklist[m].numgenes; n++){ - if( peaklist[m].genelist[n].used == 1 ){ - peaklist[m].genelist[n].erase*= (1-peaklist[m].genelist[n].z); - } - } - } -} - -/* - * calculate (log) (multinomial) probability of sequence m,n in model l - */ -double lprob( int m, int n, int l ){ - int j,k; - double val=0.0; - for(j=0; j< numclasses; j++){ - if( col_tog[j] == 1 ){ - k = peaklist[m].genelist[n].values[j]+2; - val+= log(hist[l][j][k]); - } - } - return val; -} -double prob( int m, int n, int l ){ - int j,k; - double val=1.0; - for(j=0; j< numclasses; j++){ - if( col_tog[j] == 1 ){ - k = peaklist[m].genelist[n].values[j]+2; - val*= hist[l][j][k]; - } - } - return val; -} - -/* - * copy hist into oldhist before updating - */ -void copy_hist(){ - int j,k,l; - for(l=0; l< 2; l++){ - for(j=0; j< numclasses; j++){ -// if( col_tog[j] == 1 ){ - for(k=0; k< 5; k++){ - oldhist[l][j][k] = hist[l][j][k]; - } -// } - } - } -} - -/* - * compute change in hist - */ -double hist_diff(){ - int j,k,l; - double val=0.0; - for(l=0; l< 2; l++){ - for(j=0; j< numclasses; j++){ - if( col_tog[j] == 1 ){ - for(k=0; k< 5; k++){ - val+= fabs( oldhist[l][j][k]-hist[l][j][k] ); - } - } - } - } - return val; -} - -/* - * calculate the score of a sequence - */ -double score( int m, int n ){ - int j,k; - double val=0.0; - for(j=0; j< numclasses; j++){ - if( col_tog[j] == 1 ){ - k = peaklist[m].genelist[n].values[j]+2; - val+= log( hist[0][j][k]/hist[1][j][k] ); - } - } - return val; -} - -/* - * calculate value of t (threshold) - */ -void t_update(){ - t = tscale*log( (1-lambda)/lambda ); -} - -/* - * quality scores of a motif, averaging over only those sequences picked as targets - * ElogL: log-likelihood of the combination of models, averaged by # genes - * ratio_score: log-likelihood ratio of motif vs background models - * rel_entropy: relative entropy of motif to background - * col_rel_entropy: same as rel_entropy, but for just one column (rel_entropy is average over all cols) - * chisq: chi-square statistic - */ -double ElogL(){ - int m,n; - double val=0.0, tot=0.0; - for(m=0; m< numpeaks; m++){ - for(n=0; n< peaklist[m].numgenes; n++){ - if( peaklist[m].genelist[n].used == 1 ){ - val+= peaklist[m].genelist[n].z*lprob(m, n, 0 ) \ - + (1-peaklist[m].genelist[n].z)*lprob(m, n, 1 )\ - + peaklist[m].genelist[n].z*log(lambda) \ - + (1-peaklist[m].genelist[n].z)*log(1-lambda); - tot++; - } - } - } - return val/tot; -} - -double ratio_score(){ - int m,n; - double val=0.0; - double tot=0.0; - for(m=0; m< numpeaks; m++){ - for(n=0; n< peaklist[m].numgenes; n++){ - if( peaklist[m].genelist[n].assigned == 1 ){ - val+= lprob(m,n,0)-lprob(m,n,1); - tot++; - } - } - } - return val/tot; -} - -double rel_entropy(){ - int j,k; - double val=0.0; - for(j=0; j< numclasses; j++){ - if( col_tog[j] == 1 ){ - val+= col_rel_entropy( j ); - } - } - return val/usedclasses; -} - -double col_rel_entropy( int j ){ - int k; - double val=0.0; - for(k=0; k< 5; k++){ - if( hist[1][j][j] > 0 && hist[0][j][k] > 0 ){ - // do in log base 2 for bits - val+= hist[0][j][k]*log(hist[0][j][k]/hist[1][j][k])/log(2); - } - } - return val; -} - -double chisq(){ - int j, k; - double val=0.0; - for(j=0; j< numclasses; j++){ - if( col_tog[j] == 1 ){ - for(k=0; k< 5; k++){ - val+= pow(hist[0][j][k] - hist[1][j][k],2)/hist[1][j][k]; - } - } - } - return val; -} - -/* - * number of members of a motif model - */ -int motif_count(){ - int m,n; - int tot=0; - for(m=0; m< numpeaks; m++){ - for(n=0; n< peaklist[m].numgenes; n++){ - if( peaklist[m].genelist[n].assigned == 1 ){ - tot++; - } - } - } - return tot; -} - -/* - * assign a random used gene: one that is used and hasn't been erased - */ -int assign_gene( int m, long *seed ){ - int n; - n = floor( peaklist[m].numgenes*ran1(seed) ); - while( peaklist[m].genelist[n].used == 0 || peaklist[m].genelist[n].erase < 0.5 ){ - n = floor( peaklist[m].numgenes*ran1(seed) ); - } - return n; -} - -/* - * score all genes using the models - */ -void score_genes(){ - int m,n; - for(m=0; m< numpeaks; m++){ - for(n=0; n< peaklist[m].numgenes; n++){ - if( peaklist[m].genelist[n].used == 1 ){ - peaklist[m].genelist[n].score= score( m, n ); - } - } - } -} - -/* - * choose targets using the scores, and ZOOPS setting - * zoops = 1 -> 0 or 1 target/strand - * else -> all above threshold - */ -void choose_targets(){ - int m,n; - int best; - for(m=0; m< numpeaks; m++){ - peaklist[m].assgngene = 0; - best=-1; - for(n=0; n< peaklist[m].numgenes; n++){ - if( peaklist[m].genelist[n].used == 1 ){ - if( peaklist[m].genelist[n].score > t ){ - peaklist[m].genelist[n].assigned = 1; - peaklist[m].assgngene++; - } - else peaklist[m].genelist[n].assigned = 0; - if( best == -1 ) best = n; - else if( peaklist[m].genelist[n].score > peaklist[m].genelist[best].score ) best = n; - } - } - if( zoops == 1 ){ - for(n=0; n< peaklist[m].numgenes; n++){ - if( peaklist[m].genelist[n].used == 1 && n != best ) peaklist[m].genelist[n].assigned = 0; - } - if( peaklist[m].genelist[best].score > t ) peaklist[m].assgngene = 1; - else peaklist[m].assgngene = 0; - } - } -} - -/* - * functions to print to output (fp must already be opened for top two) - */ -void print_peak( FILE *fp, int m ){ - int n; - fprintf(fp,"PEAK: %s %ld %s\n", peaklist[m].chrom, peaklist[m].pkposn, peaklist[m].info); - for(n=0; n< peaklist[m].numgenes; n++){ - if( printall == 0 ){ - if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].assigned == 1 ){ - print_gene( fp, m, n ); - } - } - if( printall == 1 ){ - if( peaklist[m].genelist[n].used == 1 ){ - print_gene( fp, m, n ); - } - } - } -} - -void print_gene( FILE *fp, int m, int n ){ - int j; - if( peaklist[m].genelist[n].used == 1 ){ - fprintf(fp, "TGENE: %0.3f %s %s %s %ld %ld %i", peaklist[m].genelist[n].score, peaklist[m].genelist[n].id, peaklist[m].genelist[n].name, peaklist[m].genelist[n].chrom, peaklist[m].genelist[n].start, peaklist[m].genelist[n].end, peaklist[m].genelist[n].sense); - for(j=0; j< numclasses; j++){ - if( col_tog[j] == 0 ) fprintf(fp," %i ", peaklist[m].genelist[n].values[j]); - else fprintf(fp," %i", peaklist[m].genelist[n].values[j]); - } - fprintf(fp,"\n"); - } -} - -void print_state(FILE *fp, char *filename){ - int m; - fp = fopen(filename,"w"); - for(m=0; m< numpeaks; m++){ - print_peak(fp, m); - } - fclose(fp); -} - -/* - * this prints a list of <region id> <name> - */ -/**************** -void print_targets( FILE *fp, char *filename ){ - int m,n,nu; - fp = fopen(filename,"w"); - for(m=0; m< numpeaks; m++){ - if( peaklist[m].assgngene > 0 ){ - fprintf(fp,"%s ", peaklist[m].regid); - n=0; - nu=0; - while( n< peaklist[m].numgenes ){ - if( peaklist[m].genelist[n].used == 1 && peaklist[m].genelist[n].assigned == 1 ){ - if( nu< peaklist[m].assgngene-1 ) fprintf(fp,"%s,", peaklist[m].genelist[n].name); - else fprintf(fp,"%s\n", peaklist[m].genelist[n].name); - nu++; - } - n++; - } - } - } - fclose(fp); -} -*************/ - -/* - * print the motif model and its score - */ -void print_model( FILE *fp, char *filename ){ - int j,k; - fp = fopen(filename,"w"); - fprintf(fp,"comparison\\class; avg relative entropy %0.3f, %i genes, threshold %0.3f, quality %0.1f\n", rel_entropy(), motif_count(), t, rel_entropy()*motif_count() ); - fprintf(fp," ++ + 0 - --\n"); - for(j=0; j< numclasses; j++){ - if( j< 10 ) fprintf(fp," "); - if( col_tog[j] == 0 ) fprintf(fp,"#"); - else fprintf(fp," "); - fprintf(fp,"%i,%.3f",j+1, col_rel_entropy(j) ); - for(k=4; k>=0; k--){ - if( hist[1][j][k] > 0 ) fprintf(fp,"%s", nice_d(log(hist[0][j][k]/hist[1][j][k])) ); - else fprintf(fp," NA"); - } - fprintf(fp,"\n"); - } - fclose(fp); -} - -/* - * prints a double with 3 digits, aligned by spaces - * longest would be -999.000, so 8 characters - * shortest would be 1.000, 5 characters - * up to 3 spaces - */ -char *nice_d( double val ){ - char *returnval; - char *dval; - char *spaces; - int i, numspaces=0; - - returnval = (char *) malloc(10*sizeof(char)); - dval = (char *) malloc(10*sizeof(char)); - spaces = (char *) malloc(10*sizeof(char)); - - sprintf(dval,"%.3f", val); - - if( val >= 0 ) numspaces++; - if( fabs(val) < 100 ){ - if( fabs(val) < 10 ) numspaces+=2; - else numspaces++; - } - if( numspaces == 0 ) sprintf(spaces,""); - if( numspaces == 1 ) sprintf(spaces," "); - if( numspaces == 2 ) sprintf(spaces," "); - if( numspaces == 3 ) sprintf(spaces," "); - sprintf(returnval,"%s%s", spaces, dval); - return returnval; -} - -/* - * print the peaks only in annotation format, for use in comppeaks, subpeaks, etc - */ -/************** -void print_annotpeak( FILE *fp, char *filename, int motif ){ - int m; - fp = fopen(filename,"w"); - for(m=0; m< numpeaks; m++){ - if( peaklist[m].assgngene > 0 ){ - fprintf(fp,"%s-%i %s %s %ld %ld %ld ", peaklist[m].filename, motif, peaklist[m].regid, peaklist[m].chrom, peaklist[m].pkposn, peaklist[m].start, peaklist[m].end ); - fprintf(fp,"%f 1.0 1.0 1 1 1.0 ", peaklist[m].height ); - fprintf(fp,"1 1.0 1.0 1.0 1 -1.0 poop 1\n"); - } - } - close(fp); -} -*************/ -