Mercurial > repos > immport-devteam > cross_sample
view cross_sample/src/cent_adjust.c @ 3:5f670146a9af draft
Uploaded
author | immport-devteam |
---|---|
date | Mon, 27 Feb 2017 13:29:54 -0500 |
parents | |
children |
line wrap: on
line source
///////////////////////////////////////////////////////// // Cent_adjust version number and modification history // ImmPort BISC project // Author: Yu "Max" Qian // v1.01: Oct 16, 2009 // Line 899 of the main function: // Changed kmean_term=1 to kmean_term=2 ////////////////////////////////////////////////////////// #include <time.h> #include <stdio.h> #include <stdlib.h> #include <math.h> #include <string.h> #define DEBUG 0 #define LINE_LEN 1024 #define FILE_NAME_LEN 128 #define PARA_NAME_LEN 64 #define MAX_VALUE 1000000000 #define CUBE 0 void getctrfileinfo(FILE *f_src_ctr, long *num_clust) { int ch='\n'; int prev='\n'; long num_rows=0; while ((ch = fgetc(f_src_ctr))!= EOF ) { if (ch == '\n') { ++num_rows; } prev = ch; } if (prev!='\n') ++num_rows; *num_clust=num_rows; //printf("center file has %ld rows\n", *num_clust); } /************************************* Read basic info of the source file **************************************/ void getfileinfo(FILE *f_src, long *file_Len, long *num_dm, char *name_string, int *time_ID) { char src[LINE_LEN]; char current_name[64]; char prv; long num_rows=0; long num_columns=0; int ch='\n'; int prev='\n'; long time_pos=0; long i=0; long j=0; src[0]='\0'; fgets(src, LINE_LEN, f_src); name_string[0]='\0'; current_name[0]='\0'; prv='\n'; while ((src[i]==' ') || (src[i]=='\t')) //skip space and tab characters i++; while ((src[i]!='\r') && (src[i]!='\n')) //repeat until the end of the line { current_name[j]=src[i]; if ((src[i]=='\t') && (prv!='\t')) //a complete word { current_name[j]='\0'; /* * Commented out John Campbell, June 10 2010 * We no longer want to automatically remove Time column. * This column should have been removed by column selection if (0!=strcmp(current_name,"Time")) { num_columns++; //num_columns does not inlcude the column of Time time_pos++; strcat(name_string,current_name); strcat(name_string,"\t"); } else { *time_ID=time_pos; } */ num_columns++; strcat(name_string,current_name); strcat(name_string,"\t"); current_name[0]='\0'; j=0; } if ((src[i]=='\t') && (prv=='\t')) //a duplicate tab or space { current_name[0]='\0'; j=0; } if (src[i]!='\t') j++; prv=src[i]; i++; } if (prv!='\t') //the last one hasn't been retrieved { current_name[j]='\0'; /* * Commented out John Campbell, June 10 2010 * We no longer want to automatically remove Time column. * This column should have been removed by column selection if (0!=strcmp(current_name,"Time")) { num_columns++; strcat(name_string,current_name); time_pos++; } else { *time_ID=time_pos; } */ num_columns++; strcat(name_string,current_name); } if (DEBUG==1) { printf("time_ID is %d\n",*time_ID); printf("name_string is %s\n",name_string); } // # of rows while ((ch = fgetc(f_src))!= EOF ) { if (ch == '\n') { ++num_rows; } prev = ch; } if (prev!='\n') ++num_rows; *file_Len=num_rows; *num_dm=num_columns; //printf("original file size is %ld; number of dimensions is %ld\n", *file_Len, *num_dm); } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /************************************* Read the source file into uncomp_data **************************************/ void readsource(FILE *f_src, long file_Len, long num_dm, double **uncomp_data, int time_ID) { long time_pass=0; //to mark whether the time_ID has been passed long index=0; long i=0; long j=0; long t=0; char src[LINE_LEN]; char xc[LINE_LEN/10]; src[0]='\0'; fgets(src,LINE_LEN, f_src); //skip the first line about parameter names while (!feof(f_src) && (index<file_Len)) //index = 0, 1, ..., file_Len-1 { src[0]='\0'; fgets(src,LINE_LEN,f_src); i=0; time_pass=0; if (time_ID==-1) { for (t=0;t<num_dm;t++) //there is no time_ID { xc[0]='\0'; j=0; while ((src[i]!='\r') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t')) { xc[j]=src[i]; i++; j++; } xc[j]='\0'; i++; uncomp_data[index][t]=atof(xc); } } else { for (t=0;t<=num_dm;t++) //the time column needs to be skipped, so there are num_dm+1 columns { xc[0]='\0'; j=0; while ((src[i]!='\r') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t')) { xc[j]=src[i]; i++; j++; } xc[j]='\0'; i++; if (t==time_ID) { time_pass=1; continue; } if (time_pass) uncomp_data[index][t-1]=atof(xc); else uncomp_data[index][t]=atof(xc); } } index++; //fprintf(fout_ID,"%s",src); } //end of while if (DEBUG == 1) { printf("the last line of the source data is:\n"); for (j=0;j<num_dm;j++) printf("%f ",uncomp_data[index-1][j]); printf("\n"); } } ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void readcenter(FILE *f_src_ctr, long num_clust, long num_dm, double **cluster_center, long *IDmapping) { char src[LINE_LEN]; char xc[LINE_LEN/10]; long i=0; long j=0; int m=0; int t=0; for (i=0;i<num_clust;i++) { src[0]='\0'; fgets(src,LINE_LEN, f_src_ctr); m=0; for (j=0;j<num_dm+1;j++) { xc[0]='\0'; t=0; while ((src[m]!='\r') && (src[m]!='\n') && (src[m]!=' ') && (src[m]!='\t')) { xc[t]=src[m]; m++; t++; } xc[t]='\0'; m++; if (j==0) IDmapping[i]=atoi(xc); else cluster_center[i][j-1]=atof(xc); //printf("cluster_center[%d][%d]=%f\n",i,j,cluster_center[i][j]); } } } /**************************************** Normalization ******************************************/ void tran(double **orig_data, long clean_Len, long num_dm, long norm_used, double **matrix_to_cluster) { long i=0; long j=0; double biggest=0; double smallest=MAX_VALUE; double *aver; //average of each column double *std; //standard deviation of each column aver=(double*)malloc(sizeof(double)*clean_Len); memset(aver,0,sizeof(double)*clean_Len); std=(double*)malloc(sizeof(double)*clean_Len); memset(std,0,sizeof(double)*clean_Len); if (norm_used==2) //z-score normalization { for (j=0;j<num_dm;j++) { aver[j]=0; for (i=0;i<clean_Len;i++) aver[j]=aver[j]+orig_data[i][j]; aver[j]=aver[j]/(double)clean_Len; std[j]=0; for (i=0;i<clean_Len;i++) std[j]=std[j]+(orig_data[i][j]-aver[j])*(orig_data[i][j]-aver[j]); std[j]=sqrt(std[j]/(double)clean_Len); for (i=0;i<clean_Len;i++) matrix_to_cluster[i][j]=(orig_data[i][j]-aver[j])/std[j]; //z-score normalization } } if (norm_used==1) //0-1 min-max normalization { for (j=0;j<num_dm;j++) { biggest=0; smallest=MAX_VALUE; for (i=0;i<clean_Len;i++) { if (orig_data[i][j]>biggest) biggest=orig_data[i][j]; if (orig_data[i][j]<smallest) smallest=orig_data[i][j]; } for (i=0;i<clean_Len;i++) { if (biggest==smallest) matrix_to_cluster[i][j]=biggest; else matrix_to_cluster[i][j]=(orig_data[i][j]-smallest)/(biggest-smallest); } } } if (norm_used==0) //no normalization { for (i=0;i<clean_Len;i++) for (j=0;j<num_dm;j++) matrix_to_cluster[i][j]=orig_data[i][j]; } } ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void assign_event(double **Matrix, long k, long dist_used, double kmean_term, long file_Len, long num_dm, long *shortest_id, double **center, int random_init) { long i=0; long j=0; long t=0; long random=0; long random1=0; long random2=0; long times=0; long times_allowed=0; long *num; //num[i]=t means the ith cluster has t points double vvv=1.0; // the biggest variation; double distance=0.0; double xv=0.0; double variation=0.0; double EPS=0.0; double diff=0.0; double mean_dx=0; double mean_dy=0; double sum_var=0; double dx=0; double dy=0; double sd_x=0; double sd_y=0; double *temp_center; double *shortest_distance; double **sum; temp_center = (double *)malloc(sizeof(double)*num_dm); memset(temp_center,0,sizeof(double)*num_dm); /* Choosing Centers */ if (random_init) { for (i=0;i<k;i++) { random1=rand()*rand(); //srand( (unsigned)time( NULL ) ); random2=abs((random1%5)+1); for (t=0;t<random2;t++) random2=random2*rand()+rand(); random=abs(random2%file_Len); //printf("random=%d\n",random); for (j=0;j<num_dm;j++) center[i][j]=Matrix[random][j]; } } //printf("finish random selection\n"); /* To compute the nearest center for every point */ shortest_distance = (double *)malloc(sizeof(double)*file_Len); memset(shortest_distance,0,sizeof(double)*file_Len); num = (long *)malloc(sizeof(long)*k); memset(num,0,sizeof(long)*k); sum = (double **)malloc(sizeof(double*)*k); memset(sum,0,sizeof(double*)*k); for (i=0;i<k;i++) { sum[i] = (double *)malloc(sizeof(double)*num_dm); memset(sum[i],0,sizeof(double)*num_dm); } for (i=0;i<k;i++) for (j=0;j<num_dm;j++) sum[i][j]=0.0; //sum[i][j] = k means the sum of the jth dimension of all points in the ith group is k //printf("before recursion\n"); if (kmean_term>=1) times_allowed = (long)kmean_term; else EPS = kmean_term; times=0; while (((vvv>EPS) && (kmean_term<1)) || ((times<times_allowed) && (kmean_term>=1))) { for (i=0;i<k;i++) { num[i]=0; for (j=0;j<num_dm;j++) sum[i][j]=0.0; } for (i=0;i<file_Len;i++) //for each data point i, we compute the distance between Matrix[i] and center[j] { shortest_distance[i]=MAX_VALUE; for (j=0;j<k;j++) //for each center j { distance=0.0; if (dist_used==0) //Euclidean distance { for (t=0;t<num_dm;t++) //for each dimension { diff=center[j][t]-Matrix[i][t]; if (diff<0) diff=-diff; if (CUBE) distance = distance+(diff*diff*diff); else distance = distance+(diff*diff); } } else //pearson correlation { mean_dx=0.0; mean_dy=0.0; sum_var=0.0; dx=0.0; dy=0.0; sd_x=0.0; sd_y=0.0; for (t=0;t<num_dm;t++) { mean_dx+=center[j][t]; mean_dy+=Matrix[i][t]; } mean_dx=mean_dx/(double)num_dm; mean_dy=mean_dy/(double)num_dm; //printf("mean_dx=%f\n",mean_dx); for (t=0;t<num_dm;t++) { dx=center[j][t]-mean_dx; dy=Matrix[i][t]-mean_dy; sum_var+=dx*dy; sd_x+=dx*dx; sd_y+=dy*dy; } if (sqrt(sd_x*sd_y)==0) distance = 1.0; else distance = 1.0 - (sum_var/(sqrt(sd_x*sd_y))); // distance ranges from 0 to 2; //printf("distance=%f\n",distance); } //pearson correlation ends if (distance<shortest_distance[i]) { shortest_distance[i]=distance; shortest_id[i]=j; } }//end for j num[shortest_id[i]]=num[shortest_id[i]]+1; for (t=0;t<num_dm;t++) sum[shortest_id[i]][t]=sum[shortest_id[i]][t]+Matrix[i][t]; }//end for i /* recompute the centers */ //compute_mean(group); vvv=0.0; for (j=0;j<k;j++) { memcpy(temp_center,center[j],sizeof(double)*num_dm); variation=0.0; if (num[j]!=0) { for (t=0;t<num_dm;t++) { center[j][t]=sum[j][t]/(double)num[j]; xv=(temp_center[t]-center[j][t]); variation=variation+xv*xv; } } if (variation>vvv) vvv=variation; //vvv is the biggest variation among the k clusters; } //compute_variation; times++; } //end for while free(num); for (i=0;i<k;i++) free(sum[i]); free(sum); free(temp_center); free(shortest_distance); } ////////////////////////////////////////////////////// /*************************** Show *****************************/ void show(double **Matrix, long *cluster_id, long file_Len, long k, long num_dm, char *name_string, long *IDmapping) { int situ1=0; int situ2=0; long i=0; long id=0; long j=0; long info_id=0; long nearest_id=0; long insert=0; long temp=0; long m=0; long n=0; long t=0; long *size_c; long **size_mybound_1; long **size_mybound_2; long **size_mybound_3; long **size_mybound_0; double interval=0.0; double *big; double *small; double **center; double **mybound; long **prof; //prof[i][j]=1 means population i is + at parameter j FILE *fpcnt_id; //proportion id //FILE *fcent_id; //center_id, i.e., centers of clusters within the original data FILE *fprof_id; //profile_id big=(double *)malloc(sizeof(double)*num_dm); memset(big,0,sizeof(double)*num_dm); small=(double *)malloc(sizeof(double)*num_dm); memset(small,0,sizeof(double)*num_dm); for (i=0;i<num_dm;i++) { big[i]=0.0; small[i]=(double)MAX_VALUE; } size_c=(long *)malloc(sizeof(long)*k); memset(size_c,0,sizeof(long)*k); center=(double**)malloc(sizeof(double*)*k); memset(center,0,sizeof(double*)*k); for (i=0;i<k;i++) { center[i]=(double*)malloc(sizeof(double)*num_dm); memset(center[i],0,sizeof(double)*num_dm); } mybound=(double**)malloc(sizeof(double*)*num_dm); memset(mybound,0,sizeof(double*)*num_dm); for (i=0;i<num_dm;i++) //there are 3 mybounds for 4 categories { mybound[i]=(double*)malloc(sizeof(double)*3); memset(mybound[i],0,sizeof(double)*3); } prof=(long **)malloc(sizeof(long*)*k); memset(prof,0,sizeof(long*)*k); for (i=0;i<k;i++) { prof[i]=(long *)malloc(sizeof(long)*num_dm); memset(prof[i],0,sizeof(long)*num_dm); } for (i=0;i<file_Len;i++) { id=cluster_id[i]; for (j=0;j<num_dm;j++) { center[id][j]=center[id][j]+Matrix[i][j]; if (big[j]<Matrix[i][j]) big[j]=Matrix[i][j]; if (small[j]>Matrix[i][j]) small[j]=Matrix[i][j]; } size_c[id]++; } for (i=0;i<k;i++) for (j=0;j<num_dm;j++) { if (size_c[i]!=0) center[i][j]=(center[i][j]/(double)(size_c[i])); else center[i][j]=0; } for (j=0;j<num_dm;j++) { interval=((big[j]-small[j])/4.0); //printf("interval[%d] is %f\n",j,interval); for (i=0;i<3;i++) mybound[j][i]=small[j]+((double)(i+1)*interval); } size_mybound_0=(long **)malloc(sizeof(long*)*k); memset(size_mybound_0,0,sizeof(long*)*k); for (i=0;i<k;i++) { size_mybound_0[i]=(long*)malloc(sizeof(long)*num_dm); memset(size_mybound_0[i],0,sizeof(long)*num_dm); } size_mybound_1=(long **)malloc(sizeof(long*)*k); memset(size_mybound_1,0,sizeof(long*)*k); for (i=0;i<k;i++) { size_mybound_1[i]=(long*)malloc(sizeof(long)*num_dm); memset(size_mybound_1[i],0,sizeof(long)*num_dm); } size_mybound_2=(long **)malloc(sizeof(long*)*k); memset(size_mybound_2,0,sizeof(long*)*k); for (i=0;i<k;i++) { size_mybound_2[i]=(long*)malloc(sizeof(long)*num_dm); memset(size_mybound_2[i],0,sizeof(long)*num_dm); } size_mybound_3=(long **)malloc(sizeof(long*)*k); memset(size_mybound_3,0,sizeof(long*)*k); for (i=0;i<k;i++) { size_mybound_3[i]=(long*)malloc(sizeof(long)*num_dm); memset(size_mybound_3[i],0,sizeof(long)*num_dm); } for (i=0;i<file_Len;i++) for (j=0;j<num_dm;j++) { if (Matrix[i][j]<mybound[j][0])// && ((Matrix[i][j]-small[j])>0)) //the smallest values excluded size_mybound_0[cluster_id[i]][j]++; else { if (Matrix[i][j]<mybound[j][1]) size_mybound_1[cluster_id[i]][j]++; else { if (Matrix[i][j]<mybound[j][2]) size_mybound_2[cluster_id[i]][j]++; else //if (Matrix[i][j]!=big[j]) //the biggest values excluded size_mybound_3[cluster_id[i]][j]++; } } } fprof_id=fopen("profile.txt","w"); fprintf(fprof_id,"Population_ID\t"); fprintf(fprof_id,"%s\n",name_string); for (i=0;i<k;i++) { fprintf(fprof_id,"%ld\t",IDmapping[i]); for (j=0;j<num_dm;j++) { if (size_mybound_0[i][j]>size_mybound_1[i][j]) situ1=0; else situ1=1; if (size_mybound_2[i][j]>size_mybound_3[i][j]) situ2=2; else situ2=3; if ((situ1==0) && (situ2==2)) { if (size_mybound_0[i][j]>size_mybound_2[i][j]) prof[i][j]=0; else prof[i][j]=2; } if ((situ1==0) && (situ2==3)) { if (size_mybound_0[i][j]>size_mybound_3[i][j]) prof[i][j]=0; else prof[i][j]=3; } if ((situ1==1) && (situ2==2)) { if (size_mybound_1[i][j]>size_mybound_2[i][j]) prof[i][j]=1; else prof[i][j]=2; } if ((situ1==1) && (situ2==3)) { if (size_mybound_1[i][j]>size_mybound_3[i][j]) prof[i][j]=1; else prof[i][j]=3; } //begin to output profile if (j==num_dm-1) { if (prof[i][j]==0) fprintf(fprof_id,"1\n"); if (prof[i][j]==1) fprintf(fprof_id,"2\n"); if (prof[i][j]==2) fprintf(fprof_id,"3\n"); if (prof[i][j]==3) fprintf(fprof_id,"4\n"); } else { if (prof[i][j]==0) fprintf(fprof_id,"1\t"); if (prof[i][j]==1) fprintf(fprof_id,"2\t"); if (prof[i][j]==2) fprintf(fprof_id,"3\t"); if (prof[i][j]==3) fprintf(fprof_id,"4\t"); } } } fclose(fprof_id); /////////////////////////////////////////////////////////// fpcnt_id=fopen("percentage.txt","w"); fprintf(fpcnt_id,"Population_ID\tPercentage\n"); for (t=0;t<k;t++) { fprintf(fpcnt_id,"%ld\t%.2f\n",IDmapping[t],(double)size_c[t]*100.0/(double)file_Len); } fclose(fpcnt_id); free(big); free(small); free(size_c); for (i=0;i<k;i++) { free(center[i]); free(prof[i]); free(size_mybound_0[i]); free(size_mybound_1[i]); free(size_mybound_2[i]); free(size_mybound_3[i]); } free(center); free(prof); free(size_mybound_0); free(size_mybound_1); free(size_mybound_2); free(size_mybound_3); for (i=0;i<num_dm;i++) free(mybound[i]); free(mybound); } /******************************************************** Main Function **************************************************/ int main (int argc, char **argv) { //inputs FILE *f_src; //source file pointer FILE *f_src_ctr; //source center file //outputs FILE *f_cid; //cluster-id file pointer FILE *f_mfi; //added April 16, 2009 char name_string[LINE_LEN]; //for name use int time_id=-1; long file_Len=0; long num_clust=0; long num_dm=0; long norm_used=0; long dist_used=0; long i=0; long j=0; long *cluster_id; long *IDmapping; //this is to keep the original populationID of the center.txt double kmean_term=0; double **cluster_center; double **orig_data; double **normalized_data; /* _strtime( tmpbuf ); printf( "Starting time:\t\t\t\t%s\n", tmpbuf ); _strdate( tmpbuf ); printf( "Starting date:\t\t\t\t%s\n", tmpbuf ); */ if (argc!=3) { printf("usage: cent_adjust input_center input_data_file\n"); exit(0); } f_src_ctr=fopen(argv[1],"r"); //read source data f_src=fopen(argv[2],"r"); getfileinfo(f_src, &file_Len, &num_dm, name_string, &time_id); //get the filelength, number of dimensions, and num/name of parameters rewind(f_src); //reset data file pointer orig_data = (double **)malloc(sizeof(double*)*file_Len); memset(orig_data,0,sizeof(double*)*file_Len); for (i=0;i<file_Len;i++) { orig_data[i]=(double *)malloc(sizeof(double)*num_dm); memset(orig_data[i],0,sizeof(double)*num_dm); } readsource(f_src, file_Len, num_dm, orig_data, time_id); //read the data; fclose(f_src); ///////////////////////////////////////////////////////////////////////////// getctrfileinfo(f_src_ctr, &num_clust); //get how many populations norm_used=0; dist_used=0; kmean_term=2; //modified on Oct 16, 2009: changed kmean_term=1 to kmean_term=2 rewind(f_src_ctr); //reset center file pointer //read population center cluster_center=(double **)malloc(sizeof(double*)*num_clust); memset(cluster_center,0,sizeof(double*)*num_clust); for (i=0;i<num_clust;i++) { cluster_center[i]=(double*)malloc(sizeof(double)*num_dm); memset(cluster_center[i],0,sizeof(double)*num_dm); } for (i=0;i<num_clust;i++) for (j=0;j<num_dm;j++) cluster_center[i][j]=0; IDmapping=(long *)malloc(sizeof(long)*num_clust); memset(IDmapping,0,sizeof(long)*num_clust); readcenter(f_src_ctr,num_clust,num_dm,cluster_center,IDmapping); //read population center fclose(f_src_ctr); ///////////////////////////////////////////////////////////////////////////// normalized_data=(double **)malloc(sizeof(double*)*file_Len); memset(normalized_data,0,sizeof(double*)*file_Len); for (i=0;i<file_Len;i++) { normalized_data[i]=(double *)malloc(sizeof(double)*num_dm); memset(normalized_data[i],0,sizeof(double)*num_dm); } tran(orig_data, file_Len, num_dm, norm_used, normalized_data); /************************************************* Compute number of clusters *************************************************/ cluster_id=(long*)malloc(sizeof(long)*file_Len); memset(cluster_id,0,sizeof(long)*file_Len); assign_event(normalized_data,num_clust,dist_used,kmean_term,file_Len,num_dm,cluster_id,cluster_center,0); //show(orig_data,cluster_id,file_Len,num_clust,num_dm,show_data,num_disp,name_string); show(orig_data, cluster_id, file_Len, num_clust, num_dm, name_string, IDmapping); f_cid=fopen("population_id.txt","w"); for (i=0;i<file_Len;i++) fprintf(f_cid,"%ld\n",IDmapping[cluster_id[i]]); fclose(f_cid); //added April 16, 2009 f_mfi=fopen("MFI.txt","w"); for (i=0;i<num_clust;i++) { fprintf(f_mfi,"%ld\t",IDmapping[i]); for (j=0;j<num_dm;j++) { if (j==num_dm-1) fprintf(f_mfi,"%.0f\n",cluster_center[i][j]); else fprintf(f_mfi,"%.0f\t",cluster_center[i][j]); } } fclose(f_mfi); //ended April 16, 2009 for (i=0;i<num_clust;i++) free(cluster_center[i]); free(cluster_center); /********************************************** Release memory ******************************************/ for (i=0;i<file_Len;i++) { free(orig_data[i]); free(normalized_data[i]); } free(orig_data); free(normalized_data); free(cluster_id); free(IDmapping); /* _strtime( tmpbuf ); printf( "Ending time:\t\t\t\t%s\n", tmpbuf ); _strdate( tmpbuf ); printf( "Ending date:\t\t\t\t%s\n", tmpbuf ); */ }