# HG changeset patch # User immport-devteam # Date 1488220194 18000 # Node ID 5f670146a9af50ae3143b98f95585bb860fbb4a8 # Parent 311a4f16c483502e10a51babdb7877d8e450db77 Uploaded diff -r 311a4f16c483 -r 5f670146a9af cross_sample/src/README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cross_sample/src/README Mon Feb 27 13:29:54 2017 -0500 @@ -0,0 +1,139 @@ +This package contains R code for converting and transforming a binary +FCS file, using the FCSTrans software, and the C code for running the +flock1 and flock2 population identification software. + +src - contains the C code for flock1, flock2 and cent_adjust +bin - contains the compiled C code for flock1, flock2 and cent_adjust, + plus the R code for using the FCSTrans algorithm for file + conversion and data transformation. +doc - documentation for FCSTrans and FLOCK algorithms +example - sample data and output from FCSTrans and FLOCK + +To run this software sucessfully the code assumes the software was installed +in the /usr/local/flock directory and that R, plus the Bioconductor flowCore +module has been installed. If you use the ipconvert.sh shell script, you +may need to adjust the location of the RScript executable and the location +of the FCSTrans.R code by editing the shell script. + +############################################################################# +# Overview +############################################################################# +ImmPort-FLOCK + +FLOCK (FLOw Clustering without K), an automated population discovery tool for +multidimensional FCM data was designed to specifically take into account the +unique feature of FCM data and produce objective segregation of cell +populations. + +FLOCK parameter settings can be customized by defining Bins and Density +Threshold. The number of bins is an integer specifying the number of +equal-sized regions the data will be partitioned into on each axis. Increasing +the number of bins increases the sensitivity to detect rare populations but +may also result in single populations being divided. Density Threshold is the +cut-off value to separate the dense regions from background. It is a floating +point number that helps define population centers; increasing the threshold +may help separate major populations but could cause the algorithm to overlook +rare populations. + +############################################################################# +# Compiling C code +############################################################################# +cd bin +cc -o flock1 ../src/flock1.c ../src/find_connected.c -lm +cc -o flock2 ../src/flock2.c -lm +cc -o cent_adjust ../src/cent_adjust.c -lm + +############################################################################# +# FCSTrans +############################################################################# +A shell script named ipconvert.sh is included that runs the FCSTrans R +code for converting and transforming a binary FCS file. The output consists +of one text file containing the transformed channel intensity values and +another file containing a list of the FCS parameters. + +cd bin +/usr/local/flock/bin/ipconvert.sh ../example/data/FCS2.fcs +/usr/local/flock/bin/ipconvert.sh ../example/data/FCS3.fcs + +############################################################################# +# Running flock1 or flock2 +############################################################################# +Running the FLOCK1 and FLOCK2 algorithms generate 8 output files that have +generic file names. For this reason, it is recommended that one output +directory be created for one input file to the program. + +cd example/output/FCS2 +/usr/local/flock/bin/flock1 ../../data/FCS2.txt + +cd example/output/FCS3 +/usr/local/flock/bin/flock2 ../../data/FCS3.txt + +Files created: MFI.txt, percentage.txt, population_id.txt, profile.txt, + flock_results.txt, coordinates.txt, population_center.txt + and percentage.txt + +Usage Information for FLOCK1 +---------------------------------------------------------------------------- +basic mode: flock1 fcs.txt +advanced_ mode: flock1 fcs.txt num_bin density_index max_num_pop + +Usage Information for FLOCK2 +---------------------------------------------------------------------------- +basic mode: flock data_file +advanced mode 0 (specify maximum # of pops): flock data_file max_num_pop +advanced mode 1 (without # of pops): flock data_file num_bin density_index +advanced mode 2 (specify # of pops): flock data_file num_bin density_index + number_of_pop +advanced mode 3 (specify both # of pops): flock data_file num_bin density_index + number_of_pop max_num_pop + +FLOCK Output Files +---------------------------------------------------------------------------- +coordinates.txt: + Output is the intensity values for each marker and event + +flock_results.txt: + A combination of the input file, event identifiers and population + identifiers. + +MFI.txt: + Provides the mean fluorescence intensity for each population for each + marker/parameter + +population_id.txt: + Contains population identifiers (i.e, values from [1 to n] where n is + the population assigned to the corresponding events in the input data + file, one identifier per row.) + +population_center.txt: + Contains the centroid coordinates for each identified population + +percentage.txt: + Includes the population identifiers and percentage of events within that + population (relative to the whole data file) + +profile.txt: + Displays an expression profile, where the approximate expression level + for each marker is assigned a numeric value from 1-4, for each identified + population + +fcs_properties.txt: + Contains the number of events, number of populations, and number of + markers, as well as the algorithm parameters used in the analysis + +############################################################################# +# Running cent_adjust +############################################################################# +Running the cent_adjust algorithm generates 4 output files that have +generic file names. For this reason, it is recommened that one output +directory be created for one input file to the program. + +mkdir example/output/FCS2/cent_adjust +cd example/output/FCS2/cent_adjust +/usr/local/flock/bin/cent_adjust ../population_center.txt ../coordinates.txt + +Files created: MFI.txt, percentage.txt, population_id.txt and profile.txt + +Usage Information for cent_adjust +---------------------------------------------------------------------------- +basic mode: cent_adjust input_center input_data_file diff -r 311a4f16c483 -r 5f670146a9af cross_sample/src/cent_adjust.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cross_sample/src/cent_adjust.c Mon Feb 27 13:29:54 2017 -0500 @@ -0,0 +1,1009 @@ +///////////////////////////////////////////////////////// +// Cent_adjust version number and modification history +// ImmPort BISC project +// Author: Yu "Max" Qian +// v1.01: Oct 16, 2009 +// Line 899 of the main function: +// Changed kmean_term=1 to kmean_term=2 +////////////////////////////////////////////////////////// + + +#include +#include +#include +#include +#include + +#define DEBUG 0 +#define LINE_LEN 1024 +#define FILE_NAME_LEN 128 +#define PARA_NAME_LEN 64 +#define MAX_VALUE 1000000000 +#define CUBE 0 + + +void getctrfileinfo(FILE *f_src_ctr, long *num_clust) +{ + int ch='\n'; + int prev='\n'; + long num_rows=0; + + while ((ch = fgetc(f_src_ctr))!= EOF ) + { + if (ch == '\n') + { + ++num_rows; + } + prev = ch; + } + if (prev!='\n') + ++num_rows; + + *num_clust=num_rows; + //printf("center file has %ld rows\n", *num_clust); +} + +/************************************* Read basic info of the source file **************************************/ +void getfileinfo(FILE *f_src, long *file_Len, long *num_dm, char *name_string, int *time_ID) +{ + char src[LINE_LEN]; + char current_name[64]; + char prv; + + long num_rows=0; + long num_columns=0; + int ch='\n'; + int prev='\n'; + long time_pos=0; + long i=0; + long j=0; + + + + src[0]='\0'; + fgets(src, LINE_LEN, f_src); + + name_string[0]='\0'; + current_name[0]='\0'; + prv='\n'; + + while ((src[i]==' ') || (src[i]=='\t')) //skip space and tab characters + i++; + + while ((src[i]!='\r') && (src[i]!='\n')) //repeat until the end of the line + { + current_name[j]=src[i]; + + if ((src[i]=='\t') && (prv!='\t')) //a complete word + { + current_name[j]='\0'; + + /* + * Commented out John Campbell, June 10 2010 + * We no longer want to automatically remove Time column. + * This column should have been removed by column selection + if (0!=strcmp(current_name,"Time")) + { + num_columns++; //num_columns does not inlcude the column of Time + time_pos++; + strcat(name_string,current_name); + strcat(name_string,"\t"); + } + else + { + *time_ID=time_pos; + } + */ + + num_columns++; + strcat(name_string,current_name); + strcat(name_string,"\t"); + + current_name[0]='\0'; + j=0; + } + + if ((src[i]=='\t') && (prv=='\t')) //a duplicate tab or space + { + current_name[0]='\0'; + j=0; + } + + if (src[i]!='\t') + j++; + + + prv=src[i]; + i++; + } + + if (prv!='\t') //the last one hasn't been retrieved + { + current_name[j]='\0'; + /* + * Commented out John Campbell, June 10 2010 + * We no longer want to automatically remove Time column. + * This column should have been removed by column selection + if (0!=strcmp(current_name,"Time")) + { + num_columns++; + strcat(name_string,current_name); + time_pos++; + } + else + { + *time_ID=time_pos; + } + */ + + num_columns++; + strcat(name_string,current_name); + } + + if (DEBUG==1) + { + printf("time_ID is %d\n",*time_ID); + printf("name_string is %s\n",name_string); + } + + // # of rows + + while ((ch = fgetc(f_src))!= EOF ) + { + if (ch == '\n') + { + ++num_rows; + } + prev = ch; + } + if (prev!='\n') + ++num_rows; + + *file_Len=num_rows; + *num_dm=num_columns; + + //printf("original file size is %ld; number of dimensions is %ld\n", *file_Len, *num_dm); +} + + + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/************************************* Read the source file into uncomp_data **************************************/ +void readsource(FILE *f_src, long file_Len, long num_dm, double **uncomp_data, int time_ID) +{ + long time_pass=0; //to mark whether the time_ID has been passed + long index=0; + + long i=0; + long j=0; + long t=0; + + char src[LINE_LEN]; + char xc[LINE_LEN/10]; + + src[0]='\0'; + fgets(src,LINE_LEN, f_src); //skip the first line about parameter names + + while (!feof(f_src) && (indexbiggest) + biggest=orig_data[i][j]; + if (orig_data[i][j]=1) + times_allowed = (long)kmean_term; + else + EPS = kmean_term; + + times=0; + + while (((vvv>EPS) && (kmean_term<1)) || ((times=1))) + { + for (i=0;ivvv) + vvv=variation; //vvv is the biggest variation among the k clusters; + } + //compute_variation; + times++; + } //end for while + + + + free(num); + for (i=0;iMatrix[i][j]) + small[j]=Matrix[i][j]; + } + + size_c[id]++; + } + + for (i=0;i0)) //the smallest values excluded + size_mybound_0[cluster_id[i]][j]++; + else + { + if (Matrix[i][j]size_mybound_1[i][j]) + situ1=0; + else + situ1=1; + if (size_mybound_2[i][j]>size_mybound_3[i][j]) + situ2=2; + else + situ2=3; + + if ((situ1==0) && (situ2==2)) + { + if (size_mybound_0[i][j]>size_mybound_2[i][j]) + prof[i][j]=0; + else + prof[i][j]=2; + } + if ((situ1==0) && (situ2==3)) + { + if (size_mybound_0[i][j]>size_mybound_3[i][j]) + prof[i][j]=0; + else + prof[i][j]=3; + } + if ((situ1==1) && (situ2==2)) + { + if (size_mybound_1[i][j]>size_mybound_2[i][j]) + prof[i][j]=1; + else + prof[i][j]=2; + } + if ((situ1==1) && (situ2==3)) + { + if (size_mybound_1[i][j]>size_mybound_3[i][j]) + prof[i][j]=1; + else + prof[i][j]=3; + } + + //begin to output profile + if (j==num_dm-1) + { + if (prof[i][j]==0) + fprintf(fprof_id,"1\n"); + if (prof[i][j]==1) + fprintf(fprof_id,"2\n"); + if (prof[i][j]==2) + fprintf(fprof_id,"3\n"); + if (prof[i][j]==3) + fprintf(fprof_id,"4\n"); + } + else + { + if (prof[i][j]==0) + fprintf(fprof_id,"1\t"); + if (prof[i][j]==1) + fprintf(fprof_id,"2\t"); + if (prof[i][j]==2) + fprintf(fprof_id,"3\t"); + if (prof[i][j]==3) + fprintf(fprof_id,"4\t"); + } + } + } + fclose(fprof_id); + + /////////////////////////////////////////////////////////// + + + fpcnt_id=fopen("percentage.txt","w"); + fprintf(fpcnt_id,"Population_ID\tPercentage\n"); + + for (t=0;t +#include +#include +#include + +//static const char *rcsid = "$Id: find_connected.c,v 1.1 2008/09/05 21:54:40 rpl Exp $"; + +int find_connected(int **G, int num_dense_grids, int ndim, int *grid_clusterID); +void depth_first(int startnode); + +void bail(const char *); /* exits via abort */ +static void check_clusters(int *gcID, int ndense); +static void merge_cluster(int from, int into); + + +/* Vars that will not change througout the depth-first recursion. We + store them here to avoid endless replication on the stack. */ +static int **Gr=0; +static int *gcID = 0; /* grid cluster IDs */ +static int *cluster_count=0; /* count of nodes per cluster */ +static int ndense=0; +static int ndim=0; +/* cid changes between depth-first searches, but is constant within a + single search, so it goes here. */ +static int cid=0; + +/* Find connected components in the graph of neighboring grids defined in G. + * + * Output: + * + * grid_clusterID[] -- cluster to which each dense grid was assigned + * return value -- number of clusters assigned. + */ +int find_connected(int **G, int n_dense_grids, int num_dm, int *grid_clusterID) +{ + int nclust=0; /* number of clusters found */ + int i; + int *subfac; + int subval=0,nempty=0; + int clustid=0; + + size_t sz = n_dense_grids*sizeof(int); + cluster_count = malloc(sz); + if(!cluster_count) + bail("find_connected: Unable to allocate %zd bytes.\n"); + memset(cluster_count,0,sz); + + /* set up the statics that will be used in the DFS */ + Gr=G; + gcID = grid_clusterID; + ndense = n_dense_grids; + ndim = num_dm; + + + for(i=0;i= 0) { + /* We are, so zero the count for the old cluster. */ + cluster_count[ gcID[node] ] = 0; + merge_cluster(gcID[node], cid); + return; + } + + /* Update for this node */ + gcID[node] = cid; + cluster_count[cid]++; + + /* Recursively search the child nodes */ + for(i=0; i= 0) /* This is a child node */ + depth_first(Gr[node][i]); +} + +void bail(const char *msg) +{ + fprintf(stderr,"%s",msg); + abort(); +} + + +static void check_clusters(int *gcID, int ndense) +{ + int i; + + for(i=0; imax_grid) to STDERR; + MAX_GRID changed to 50 as larger than 50 seems not useful for any file we have got + +******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include + + +#define DEBUG 0 +#define LINE_LEN 1024 +#define FILE_NAME_LEN 128 +#define PARA_NAME_LEN 64 +#define MAX_VALUE 1000000000 +#define MIN_GRID 6 +#define MAX_GRID 50 + +#define NORM_METHOD 2 //2 if z-score; 0 if no normalization; 1 if min-max +#define KMEANS_TERM 100 +//#define MAX_NUM_POP 30 + + +int find_connected(int **G, int num_dense_grids, int ndim, int *grid_clusterID); + +/************* Read basic info of the source file ****************************/ +void getfileinfo(FILE *f_src, int *file_Len, int *num_dm, char *name_string, int *time_ID) +{ + char src[LINE_LEN]; + char current_name[64]; + char prv; + + int num_rows=0; + int num_columns=0; + int ch='\n'; + int prev='\n'; + int time_pos=0; + int i=0; + int j=0; + int sw=0; + + src[0]='\0'; + fgets(src, LINE_LEN, f_src); + + if ((src[0]=='F') && (src[1]=='C') && (src[2]=='S')) + { + fprintf(stderr,"the correct input format is a tab-delimited txt file, instead of FCS file.\n"); + abort(); + } + + name_string[0]='\0'; + current_name[0]='\0'; + prv='\n'; + + // skip space and tab characters + while ((src[i]==' ') || (src[i]=='\t')) + i++; + + // repeat until the end of line is reached + while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!='\r')) + { + current_name[j]=src[i]; + + if ((src[i]=='\t') && (prv!='\t')) //a complete word + { + current_name[j]='\0'; + + if (0!=strcmp(current_name,"Time")) + { + num_columns++; //num_columns does not inlcude the column of Time + time_pos++; + if (sw) { + strcat(name_string,"\t"); + } + strcat(name_string,current_name); + sw = 1; + } + else + { + *time_ID=time_pos; + } + + + current_name[0]='\0'; + j=0; + } + + if ((src[i]=='\t') && (prv=='\t')) //a duplicate tab or space + { + current_name[0]='\0'; + j=0; + } + + if (src[i]!='\t') + j++; + + prv=src[i]; + i++; + } + + if (prv!='\t') //the last one hasn't been retrieved + { + current_name[j]='\0'; + + if (0!=strcmp(current_name,"Time")) + { + num_columns++; + strcat(name_string,"\t"); + strcat(name_string,current_name); + time_pos++; + } + else + { + *time_ID=time_pos; + } + + + } + if (DEBUG==1) + { + printf("time_ID is %d\n",*time_ID); + printf("name_string is %s\n",name_string); + } + + //start computing # of rows + + while ((ch = fgetc(f_src))!= EOF ) + { + if (ch == '\n') + { + ++num_rows; + } + prev = ch; + } + if (prev!='\n') + ++num_rows; + + //added on July 23, 2010 + if (num_rows<50) + { + fprintf(stderr,"Number of events in the input file is too few and should not be processed!\n"); //modified on July 23, 2010 + abort(); + } + + *file_Len=num_rows; + *num_dm=num_columns; + + printf("original file size is %d; number of dimensions is %d\n", *file_Len, *num_dm); +} + + + +/************************************* Read the source file into uncomp_data **************************************/ +void readsource(FILE *f_src, int file_Len, int num_dm, double **uncomp_data, int time_ID) +{ + int time_pass=0; //to mark whether the time_ID has been passed + int index=0; + + int i=0; + int j=0; + int t=0; + + char src[LINE_LEN]; + char xc[LINE_LEN/10]; + + src[0]='\0'; + fgets(src,LINE_LEN, f_src); //skip the first line about parameter names + + while (!feof(f_src) && (indexbiggest) + biggest=orig_data[i][j]; + if (orig_data[i][j] index in the original data of the ith element of the ordered list) + * grid_ID -- mapping between the original data and the "grids" (see below) found as a byproduct + * of the sorting procedure. + * num_nonempty -- the number of grids that occur in the data (= the number of distinct values assumed + * by grid_ID) + */ + +void radixsort_flock(int **position,int file_Len,int num_dm,int num_bin,int *sorted_seq,int *num_nonempty,int *grid_ID) +{ + int i=0; + int length=0; + int start=0; + int prev_ID=0; + int curr_ID=0; + + int j=0; + int t=0; + int p=0; + int loc=0; + int temp=0; + int equal=0; + + int *count; //count[i]=j means there are j numbers having value i at the processing digit + int *index; //index[i]=j means the starting position of grid i is j + int *cp; //current position + int *mark; //mark[i]=0 means it is not an ending point of a part, 1 means it is (a "part" is a group of items with identical bins for all dimensions) + int *seq; //temporary sequence + + count=(int*)malloc(sizeof(int)*num_bin); + memset(count,0,sizeof(int)*num_bin); + + cp=(int*)malloc(sizeof(int)*num_bin); + memset(cp,0,sizeof(int)*num_bin); + + index=(int*)malloc(sizeof(int)*num_bin); // initialized below + + seq=(int*)malloc(sizeof(int)*file_Len); + memset(seq,0,sizeof(int)*file_Len); + + mark=(int*)malloc(sizeof(int)*file_Len); + memset(mark,0,sizeof(int)*file_Len); + + for (i=0;i0) && (index[i+1]<=file_Len)) + { + mark[index[i+1]-1]=1; // Mark the end of the segment in the ordered list + } + else + { + printf("out of myboundary for mark at index[i+1]-1.\n"); + } + } + mark[file_Len-1]=1; + + for (i=0;i0)) + { + mark[index[t+1]-1]=1; // update the part boundaries to include the differences in the current radix. + } + + } + mark[i]=1; + + /* Update the permutation vector for the current part (i.e., from start to i). By the time we finish the loop over i + the PV will be completely updated for the partial ordering up to the current radix. */ + for (t=start;t<=i;t++) + { + loc=position[sorted_seq[t]][j];//loc=position[t][j]; + temp=index[loc]+cp[loc]; + if ((temp=0)) + { + // seq is a temporary because we have to maintain the old PV until we have finished this step. + seq[temp]=sorted_seq[t]; //sorted_seq[i]=temp is also another way to sort + cp[loc]++; + } + else + { + printf("out of myboundary for seq at temp.\n"); + } + } + + for (t=start;t<=i;t++) + { + // copy the temporary back into sorted_seq. sorted_seq is now updated for radix j up through + // entry i in the ordered list. + if ((t>=0) && (tbig[j]) + big[j]=data_in[i][j]; + + if (data_in[i][j]=big[j]) + position[i][j]=num_bin-1; + else + { + position[i][j]=(int)((data_in[i][j]-small[j])/interval[j]); //position[i][j]=t means point i is at the t grid of dimensional j + if ((position[i][j]>=num_bin) || (position[i][j]<0)) + { + //printf("position mis-computed in density analysis!\n"); + //exit(0); + fprintf(stderr,"Incorrect input file format or input parameters (number of bins overflows)!\n"); //modified on July 23, 2010 + abort(); + + } + } + } + } + + + free(small); + free(big); +} + +/********************************************** select_bin to select the number of bins **********************************/ +//num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty); +/* Determine the number of bins to use in each dimension. Additionally sort the data elements according to the binned + * values, and partition the data into "grids" with identical (binned) values. We try progressively more bins until we + * maximize a merit function, then return the results obtained using the optimal number of bins. + * + * Outputs: + * position -- binned data values + * sorted_seq -- permutation vector mapping the ordered list to the original data + * all_grid_ID -- grid to which each data element was assigned. + * num_nonempty -- number of distinct values assumed by all_grid_ID + * interval -- bin width for each data dimension + * return value -- the number of bins selected. + */ + +int select_bin(double **normalized_data, int file_Len, int num_dm, int min_grid, int max_grid, int **position, int *sorted_seq, + int *all_grid_ID, int *num_nonempty, double *interval, int user_num_bin) +{ + + int num_bin=0; + int select_num_bin=0; + int m=0; + int n=0; + + int i=0; + int bin_scope=0; + int temp_num_nonempty=0; + + int *temp_grid_ID; + int *temp_sorted_seq; + int **temp_position; + + //sorted_seq[i]=j means the event j ranks i + + double temp_index=0; + double *bin_index; + double *temp_interval; + + + + temp_grid_ID=(int *)malloc(sizeof(int)*file_Len); + memset(temp_grid_ID,0,sizeof(int)*file_Len); + + temp_sorted_seq=(int *)malloc(sizeof(int)*file_Len); + memset(temp_sorted_seq,0,sizeof(int)*file_Len); + + temp_position=(int **)malloc(sizeof(int*)*file_Len); + memset(temp_position,0,sizeof(int*)*file_Len); + for (m=0;m=(double)(file_Len)*0.95) + break; + if ((bin_index[i]max_grid)) + { + fprintf(stderr,"Number of events collected is too few in terms of number of markers used. The file should not be processed!\n"); //modified on Nov 4, 2010 + exit(0); + } + + if (temp_index==0) + { + fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010 + abort(); + } + + + free(temp_grid_ID); + free(temp_sorted_seq); + free(bin_index); + free(temp_interval); + + for (m=0;m=den_t_event) + temp_num_dense_grids++; + + if (temp_num_dense_grids<=1) + { + //modified on July 23, 2010 + //printf("a too high density threshold is set! Please decrease your density threshold.\n"); + //exit(0); + fprintf(stderr,"a too high density threshold is set! Please decrease your density threshold.\n"); //modified on July 23, 2010 + abort(); + } + + if (temp_num_dense_grids>=100000) + { + //modified on July 23, 2010 + //printf("a too low density threshold is set! Please increase your density threshold.\n"); + //exit(0); + fprintf(stderr,"a too low density threshold is set! Please increase your density threshold.\n"); //modified on July 23, 2010 + abort(); + } + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Compute dense_grid_reverse + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + temp_num_dense_grids=0; + temp_num_dense_events=0; + for (i=0;i=den_t_event) + { + dense_grid_reverse[i]=temp_num_dense_grids; //dense_grid_reverse provides a mapping from all nonempty grids to dense grids. + temp_num_dense_grids++; + temp_num_dense_events+=all_grid_vol[i]; + } + } + + num_dense_grids[0]=temp_num_dense_grids; + num_dense_events[0]=temp_num_dense_events; + + free(grid_size_index); + free(all_grid_vol); + + return den_t_event; +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Compute densegridID_To_gridevent and eventID_To_denseventID +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent); +/* + * Filter the data so that only the data belonging to dense grids is left + * + * Output: + * eventID_To_denseeventID -- mapping from original event ID to ID in list containing only events contained in dense grids. + * densegridID_To_gridevent -- mapping from dense grids to prototype members of the grids. + * + */ + +void grid_To_event(int file_Len, int *dense_grid_reverse, int *all_grid_ID, int *eventID_To_denseventID, int *densegridID_To_gridevent) +{ + int i=0; + int dense_grid_ID=0; + int dense_event_ID=0; + + for (i=0;iseq_value[i]) + return -1; + if (search_value[i]==seq_value[i]) + continue; + } + return 0; +} + +//binary search the dense_grid_seq to return the dense grid ID if it exists +int binary_search(int num_dense_grids, int num_dm, int *search_value, int **dense_grid_seq) +{ + + int low=0; + int high=0; + int mid=0; + int comp_result=0; + int match=0; + //int found=0; + + low = 0; + high = num_dense_grids-1; + + while (low <= high) + { + mid = (int)((low + high)/2); + + comp_result=compare_value(num_dm, search_value,dense_grid_seq[mid]); + + + switch(comp_result) + { + case 1: + high=mid-1; + break; + case -1: + low=mid+1; + break; + case 0: + match=1; + break; + } + if (match==1) + break; + } + + + + if (match==1) + { + return mid; + } + else + return -1; +} + + +/********************************************** Computing Centers Using IDs **********************************************/ + +void ID2Center(double **data_in, int file_Len, int num_dm, int *eventID_To_denseventID, int num_clust, int *cluster_ID, double **population_center) +{ + int i=0; + int j=0; + int ID=0; + int eventID=0; + int *size_c; + + + + size_c=(int *)malloc(sizeof(int)*num_clust); + memset(size_c,0,sizeof(int)*num_clust); + + for (i=0;imax_num_pop) || (num_population<=1)) + { + + if (num_population<=1) + merge_dist=merge_dist-0.1; + + if (num_population>max_num_pop) + merge_dist=merge_dist+0.1; + + + for (i=0;i(merge_dist*interval[t])) + merge=0; + } + + if ((merge) && (cluster_populationID[i]!=cluster_populationID[j])) + { + if (cluster_populationID[i]max_num_pop) && (num_population==1)) + break; + else + temp_num_population=num_population; + + if (num_clust<=1) + break; + } + + for (i=0;ikmean_term) && (kmean_term<1)) || ((times=1))) + { + for (i=0;ivvv) + vvv=variation; //vvv is the biggest variation among the k clusters; + } + //compute_variation; + times++; + } //end for while + + + + + free(num); + + for (i=0;iMatrix[i][j]) + small[j]=Matrix[i][j]; + } + + size_c[id]++; + } + + for (i=0;i0)) //the smallest values excluded + size_mybound_0[cluster_id[i]][j]++; + else + { + if (Matrix[i][j]size_mybound_1[i][j]) + situ1=0; + else + situ1=1; + if (size_mybound_2[i][j]>size_mybound_3[i][j]) + situ2=2; + else + situ2=3; + + if ((situ1==0) && (situ2==2)) + { + if (size_mybound_0[i][j]>size_mybound_2[i][j]) + prof[i][j]=0; + else + prof[i][j]=2; + } + if ((situ1==0) && (situ2==3)) + { + if (size_mybound_0[i][j]>size_mybound_3[i][j]) + prof[i][j]=0; + else + prof[i][j]=3; + } + if ((situ1==1) && (situ2==2)) + { + if (size_mybound_1[i][j]>size_mybound_2[i][j]) + prof[i][j]=1; + else + prof[i][j]=2; + } + if ((situ1==1) && (situ2==3)) + { + if (size_mybound_1[i][j]>size_mybound_3[i][j]) + prof[i][j]=1; + else + prof[i][j]=3; + } + + //begin to output profile + if (j==num_dm-1) + { + if (prof[i][j]==0) + fprintf(fprof_id,"1\n"); + if (prof[i][j]==1) + fprintf(fprof_id,"2\n"); + if (prof[i][j]==2) + fprintf(fprof_id,"3\n"); + if (prof[i][j]==3) + fprintf(fprof_id,"4\n"); + } + else + { + if (prof[i][j]==0) + fprintf(fprof_id,"1\t"); + if (prof[i][j]==1) + fprintf(fprof_id,"2\t"); + if (prof[i][j]==2) + fprintf(fprof_id,"3\t"); + if (prof[i][j]==3) + fprintf(fprof_id,"4\t"); + } + } + } + fclose(fprof_id); + + /////////////////////////////////////////////////////////// + + + fpcnt_id=fopen("percentage.txt","w"); + fprintf(fpcnt_id,"Population_ID\tPercentage\n"); + + for (t=0;t population_center +//data + grid_ID -> grid_Center + +/* what is the final output */ +//the final things we want are grid_Center in the original data and grid_clusterID //or population_center in the original data +//Sparse grids will not be considered when computing the two centroids (centroids of grids and centroids of clusters) + +/* what information should select_bin output */ +//the size of all IDs are unknown to function main because we only consider the events in dense grids, and also the number of dense grids +//is unknown, therefore I must use a prescreening to output +//how many bins I should use +//the number of dense grids +//total number of events in the dense grids + +/* basic procedure of main function */ +//1. read raw file and normalize the raw file +//2. select_bin +//3. allocate memory for eventID_To_denseventID, grid_clusterID, grid_ID, cluster_ID. +//4. call select_dense and merge_grids with grid_clusterID, grid_ID, cluster_ID. +//5. release normalized data; allocate memory for grid_Center and population_center +//6. output grid_Center and population_center using ID2Center together with grid_clusterID //from IDs to centers + +int main (int argc, char **argv) +{ + //inputs + FILE *f_src; //source file pointer + + FILE *f_out; //coordinates + FILE *f_cid; //population-ID of events + FILE *f_ctr; //centroids of populations + FILE *f_results; //coordinates file event and population column + FILE *f_mfi; //added April 16, 2009 for mean fluorescence intensity + FILE *f_parameters; //number of bins and density calculated by + //the algorithm. Used to update the database + FILE *f_properties; //Properties file used by Image generation software + + char para_name_string[LINE_LEN]; + + int time_ID=-1; + int num_bin=0; //the bins I will use on each dimension + + int file_Len=0; //number of events + int num_dm=0; + int num_clust=0; + int num_dense_events=0; + int num_dense_grids=0; + int num_nonempty=0; + int num_population=0; + //int temp=0; + + //below are read from configuration file + int i=0; + int j=0; + int max_num_pop=0; + + int den_t_event=0; + + int *grid_clusterID; //(dense)gridID to clusterID + int *grid_ID; //(dense)eventID to gridID + int *cluster_ID; //(dense)eventID to clusterID + int *eventID_To_denseventID; //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k + int *all_grid_ID; //gridID for all events + int *densegridID_To_gridevent; + int *sorted_seq; + int *dense_grid_reverse; + int *population_ID; //denseeventID to populationID + int *cluster_populationID; //clusterID to populationID + int *grid_populationID; //gridID to populationID + int *all_population_ID; //populationID of event + + int **position; + int **dense_grid_seq; + + double *interval; + + double **population_center; //population centroids in the raw/original data + double **cluster_center; //cluster centroids in the raw/original data + + double **input_data; + double **normalized_data; + + int min = 999999; + int max = 0; + + printf( "Starting time:\t\t\t\t"); + fflush(stdout); + system("/bin/date"); + ///////////////////////////////////////////////////////////// + + if ((argc!=2) && (argc!=4) && (argc!=5)) + { + //modified on Jul 23, 2010 + //printf("usage:\n"); + //printf("basic mode: flock data_file\n"); + //printf("advanced mode: flock data_file num_bin density_index\n"); + //exit(0); + + fprintf(stderr,"Incorrect number of input parameters!\n"); //modified on July 23, 2010 + //fprintf(stderr,"basic mode: flock data_file\n"); //modified on July 23, 2010 + //fprintf(stderr,"advanced mode1: flock data_file num_bin density_index\n"); //modified on July 23, 2010 + fprintf(stderr,"advanced mode: flock data_file num_bin density_index max_num_pop\n"); //added on Dec 16, 2010 for GenePattern + abort(); + } + + f_src=fopen(argv[1],"r"); + + if (argc==2) + { + max_num_pop=30; //default value, maximum 30 clusters + } + + if (argc==4) + { + num_bin=atoi(argv[2]); + printf("num_bin is %d\n",num_bin); + + den_t_event=atoi(argv[3]); + printf("density_index is %d\n",den_t_event); + + max_num_pop=30; + + if (((num_bin<6) && (num_bin!=0)) || (num_bin>29)) + { + fprintf(stderr,"Incorrect input range of number of bins, which should be larger than 5 and smaller than 30\n"); + abort(); + } + + if (((den_t_event<3) && (den_t_event!=0)) || (den_t_event>99)) + { + fprintf(stderr,"Incorrect input range of density threshold, which should be larger than 2 and smaller than 100\n"); + abort(); + } + } + + if (argc==5) + { + num_bin=atoi(argv[2]); + printf("num_bin is %d\n",num_bin); + + den_t_event=atoi(argv[3]); + printf("density_index is %d\n",den_t_event); + + max_num_pop=atoi(argv[4]); + printf("max number of clusters is %d\n",max_num_pop); + + if (((num_bin<6) && (num_bin!=0)) || (num_bin>29)) + { + fprintf(stderr,"Incorrect input range of number of bins, which should be larger than 5 and smaller than 30\n"); + abort(); + } + + if (((den_t_event<3) && (den_t_event!=0)) || (den_t_event>99)) + { + fprintf(stderr,"Incorrect input range of density threshold, which should be larger than 2 and smaller than 100\n"); + abort(); + } + + if ((max_num_pop<5) || (max_num_pop>999)) + { + fprintf(stderr,"Incorrect input range of maximum number of populations, which should be larger than 4 and smaller than 1000\n"); + abort(); + } + } + + + getfileinfo(f_src, &file_Len, &num_dm, para_name_string, &time_ID); //get the filelength, number of dimensions, and num/name of parameters + + /************************************************* Read the data *****************************************************/ + + rewind(f_src); //reset the file pointer + + input_data = (double **)malloc(sizeof(double*)*file_Len); + memset(input_data,0,sizeof(double*)*file_Len); + for (i=0;i=1) //num_bin has been selected by user + select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin); + else //num_bin has not been selected by user + { + num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin); + printf("selected bin number is %d\n",num_bin); + } + printf("number of non-empty grids is %d\n",num_nonempty); + + + + /* Although we return sorted_seq from select_bin(), we don't use it for anything, except possibly diagnostics */ + free(sorted_seq); + + + dense_grid_reverse=(int*)malloc(sizeof(int)*num_nonempty); + memset(dense_grid_reverse,0,sizeof(int)*num_nonempty); + + /************************************************* select_dense **********************************************/ + + if (den_t_event>=1) //den_t_event must be larger or equal to 2 if the user wants to set it + select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event); + else + { + den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event); + printf("automated selected density threshold is %d\n",den_t_event); + } + + printf("Number of dense grids is %d\n",num_dense_grids); + + densegridID_To_gridevent = (int *)malloc(sizeof(int)*num_dense_grids); + memset(densegridID_To_gridevent,0,sizeof(int)*num_dense_grids); + + for (i=0;i max) { + max = (int)input_data[i][j]; + } + if (j==num_dm-1) + { + fprintf(f_out,"%d\n",(int)input_data[i][j]); + fprintf(f_results,"%d\t",(int)input_data[i][j]); + } + else + { + fprintf(f_out,"%d\t",(int)input_data[i][j]); + fprintf(f_results,"%d\t",(int)input_data[i][j]); + } + } + //fprintf(f_results,"%d\t",i + 1); + fprintf(f_results,"%d\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009 + } + +/* + f_parameters=fopen("parameters.txt","w"); + fprintf(f_parameters,"Number_of_Bins\t%d\n",num_bin); + fprintf(f_parameters,"Density\t%d\n",den_t_event); + fprintf(f_parameters,"Min\t%d\n",min); + fprintf(f_parameters,"Max\t%d\n",max); + fclose(f_parameters); +*/ + + f_properties=fopen("fcs.properties","w"); + fprintf(f_properties,"Bins=%d\n",num_bin); + fprintf(f_properties,"Density=%d\n",den_t_event); + fprintf(f_properties,"Min=%d\n",min); + fprintf(f_properties,"Max=%d\n",max); + fprintf(f_properties,"Populations=%d\n",num_population); + fprintf(f_properties,"Events=%d\n",file_Len); + fprintf(f_properties,"Markers=%d\n",num_dm); + fclose(f_properties); + + + for (i=0;imax_grid) to STDERR +// 8. Fixed a bug for 2D data by using K=e*K +// 9. Added some header files, may not be necessary +// 10. Added a lower bound (at least two) for number of populations +/*************************************************************************************************************************************** + + FLOCK: FLOw cytometry Clustering without K (Named by: Jamie A. Lee and Richard H. Scheuermann) + + Author: (Max) Yu Qian, Ph.D. + + Copyright: Scheuermann Lab, Dept. of Pathology, UTSW + + Development: November 2005 ~ Forever + + Status: July 2010: Release 2.0 + + Usage: flock data_file + Note: the input file format must be channel values and the delimiter between two values must be a tab. + + + +****************************************************************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + + + +#define DEBUG 0 +#define LINE_LEN 1024 +#define FILE_NAME_LEN 128 +#define PARA_NAME_LEN 64 +#define MAX_VALUE 1000000000 +#define MIN_GRID 6 +#define MAX_GRID 50 +#define E_T 1.0 + +#define NORM_METHOD 2 //2 if z-score; 0 if no normalization; 1 if min-max +#define KMEANS_TERM 10 +#define MAX_POP_NUM 128 + +#ifndef max + #define max( a, b ) ( ((a) > (b)) ? (a) : (b) ) +#endif + +#ifndef min + #define min( a, b ) ( ((a) < (b)) ? (a) : (b) ) +#endif + +static long **Gr=0; +static long *gcID = 0; /* grid cluster IDs */ +static long *cluster_count=0; /* count of nodes per cluster */ +static long ndense=0; +static long ndim=0; +/* cid changes between depth-first searches, but is constant within a + single search, so it goes here. */ +static long cid=0; +/* Do a depth-first search for a single connected component in graph + * G. Start from node, tag the nodes found with cid, and record + * the tags in grid_clusterID. Also, record the node count in + * cluster_count. If we find a node that has already been assigned to + * a cluster, that means we're merging two clusters, so zero out the + * old cid's node count. + * + * Note that our graph is constructed as a DAG, so we can be + * guaranteed to terminate without checking for cycles. + * + * Note2: this function can potentially recurse to depth = ndense. + * Plan stack size accordingly. + * + * Output: + * + * grid_clusterID[] -- array where we tag the nodes. + * cluster_count[] -- count of the number of nodes per cluster. + */ +static void merge_cluster(long from, long into) +{ + int i; + + for(i=0; i= 0) { + /* We are, so zero the count for the old cluster. */ + cluster_count[ gcID[node] ] = 0; + merge_cluster(gcID[node], cid); + return; + } + + /* Update for this node */ + gcID[node] = cid; + cluster_count[cid]++; + + /* Recursively search the child nodes */ + for(i=0; i= 0) /* This is a child node */ + depth_first(Gr[node][i]); +} + +void bail(const char *msg) +{ + fprintf(stderr,"%s",msg); + exit(0); +} + + +static void check_clusters(long *gcID, int ndense) +{ + int i; + + for(i=0; ibiggest) + biggest=orig_data[i][j]; + if (orig_data[i][j] index in the original data of the ith element of the ordered list) + * grid_ID -- mapping between the original data and the "grids" (see below) found as a byproduct + * of the sorting procedure. + * num_nonempty -- the number of grids that occur in the data (= the number of distinct values assumed + * by grid_ID) + */ + +void radixsort_flock(long **position,long file_Len,long num_dm,long num_bin,long *sorted_seq,long *num_nonempty,long *grid_ID) +{ + long i=0; + long length=0; + long start=0; + long prev_ID=0; + long curr_ID=0; + + long j=0; + long t=0; + long p=0; + long loc=0; + long temp=0; + long equal=0; + + long *count; //count[i]=j means there are j numbers having value i at the processing digit + long *index; //index[i]=j means the starting position of grid i is j + long *cp; //current position + long *mark; //mark[i]=0 means it is not an ending point of a part, 1 means it is (a "part" is a group of items with identical bins for all dimensions) + long *seq; //temporary sequence + + count=(long*)malloc(sizeof(long)*num_bin); + memset(count,0,sizeof(long)*num_bin); + + cp=(long*)malloc(sizeof(long)*num_bin); + memset(cp,0,sizeof(long)*num_bin); + + index=(long*)malloc(sizeof(long)*num_bin); // initialized below + + seq=(long*)malloc(sizeof(long)*file_Len); + memset(seq,0,sizeof(long)*file_Len); + + mark=(long*)malloc(sizeof(long)*file_Len); + memset(mark,0,sizeof(long)*file_Len); + + for (i=0;i0) && (index[i+1]<=file_Len)) + { + mark[index[i+1]-1]=1; // Mark the end of the segment in the ordered list + } + else + { + printf("out of myboundary for mark at index[i+1]-1.\n"); + } + } + mark[file_Len-1]=1; + + for (i=0;i0)) + { + mark[index[t+1]-1]=1; // update the part boundaries to include the differences in the current radix. + } + + } + mark[i]=1; + + /* Update the permutation vector for the current part (i.e., from start to i). By the time we finish the loop over i + the PV will be completely updated for the partial ordering up to the current radix. */ + for (t=start;t<=i;t++) + { + loc=position[sorted_seq[t]][j];//loc=position[t][j]; + temp=index[loc]+cp[loc]; + if ((temp=0)) + { + // seq is a temporary because we have to maintain the old PV until we have finished this step. + seq[temp]=sorted_seq[t]; //sorted_seq[i]=temp is also another way to sort + cp[loc]++; + } + else + { + printf("out of myboundary for seq at temp.\n"); + } + } + + for (t=start;t<=i;t++) + { + // copy the temporary back into sorted_seq. sorted_seq is now updated for radix j up through + // entry i in the ordered list. + if ((t>=0) && (tbig[j]) + big[j]=data_in[i][j]; + + if (data_in[i][j]=big[j]) + position[i][j]=num_bin-1; + else + { + position[i][j]=(long)((data_in[i][j]-small[j])/interval[j]); //position[i][j]=t means point i is at the t grid of dimensional j + if ((position[i][j]>=num_bin) || (position[i][j]<0)) + { + //printf("position mis-computed in density analysis!\n"); + //exit(0); + fprintf(stderr,"Incorrect input file format or input parameters (number of bins overflows)!\n"); //modified on July 23, 2010 + exit(0); + } + } + } + } + + + free(small); + free(big); +} + +/********************************************** select_bin to select the number of bins **********************************/ +//num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty); +/* Determine the number of bins to use in each dimension. Additionally sort the data elements according to the binned + * values, and partition the data into "grids" with identical (binned) values. We try progressively more bins until we + * maximize a merit function, then return the results obtained using the optimal number of bins. + * + * Outputs: + * position -- binned data values + * sorted_seq -- permutation vector mapping the ordered list to the original data + * all_grid_ID -- grid to which each data element was assigned. + * num_nonempty -- number of distinct values assumed by all_grid_ID + * interval -- bin width for each data dimension + * return value -- the number of bins selected. + */ + +long select_bin(double **normalized_data, long file_Len, long num_dm, long min_grid, long max_grid, long **position, long *sorted_seq, + long *all_grid_ID, long *num_nonempty, double *interval, long user_num_bin) +{ + + long num_bin=0; + long select_num_bin=0; + long m=0; + long n=0; + + long i=0; + long bin_scope=0; + long temp_num_nonempty=0; + + long *temp_grid_ID; + long *temp_sorted_seq; + long **temp_position; + + //sorted_seq[i]=j means the event j ranks i + + double temp_index=0; + double *bin_index; + double *temp_interval; + + + temp_grid_ID=(long *)malloc(sizeof(long)*file_Len); + memset(temp_grid_ID,0,sizeof(long)*file_Len); + + temp_sorted_seq=(long *)malloc(sizeof(long)*file_Len); + memset(temp_sorted_seq,0,sizeof(long)*file_Len); + + temp_position=(long **)malloc(sizeof(long*)*file_Len); + memset(temp_position,0,sizeof(long*)*file_Len); + for (m=0;m=(double)(file_Len)*0.95) + break; + if ((bin_index[i]max_grid)) + { + fprintf(stderr,"Number of events collected is too few in terms of number of markers used. The file should not be processed!\n"); //modified on Nov 04, 2010 + exit(0); + } + + if (temp_index==0) + { + fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010 + exit(0); + } + + free(temp_grid_ID); + free(temp_sorted_seq); + free(bin_index); + free(temp_interval); + + for (m=0;m=den_t_event) + temp_num_dense_grids++; + + if (temp_num_dense_grids<=1) + { + //exit(0); + //printf("a too high density threshold is set! Please decrease your density threshold.\n"); + fprintf(stderr,"a too high density threshold! Please decrease your density threshold.\n"); //modified on July 23, 2010 + exit(0); + } + + if (temp_num_dense_grids>=100000) + { + //modified on July 23, 2010 + //printf("a too low density threshold is set! Please increase your density threshold.\n"); + //exit(0); + fprintf(stderr,"a too low density threshold! Please increase your density threshold.\n"); //modified on July 23, 2010 + exit(0); + } + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Compute dense_grid_reverse + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + temp_num_dense_grids=0; + temp_num_dense_events=0; + for (i=0;i=den_t_event) + { + dense_grid_reverse[i]=temp_num_dense_grids; //dense_grid_reverse provides a mapping from all nonempty grids to dense grids. + temp_num_dense_grids++; + temp_num_dense_events+=all_grid_vol[i]; + } + } + + num_dense_grids[0]=temp_num_dense_grids; + num_dense_events[0]=temp_num_dense_events; + + free(grid_size_index); + free(all_grid_vol); + + return den_t_event; +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Compute densegridID_To_gridevent and eventID_To_denseventID +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent); +/* + * Filter the data so that only the data belonging to dense grids is left + * + * Output: + * eventID_To_denseeventID -- mapping from original event ID to ID in list containing only events contained in dense grids. + * densegridID_To_gridevent -- mapping from dense grids to prototype members of the grids. + * + */ + +void grid_To_event(long file_Len, long *dense_grid_reverse, long *all_grid_ID, long *eventID_To_denseventID, long *densegridID_To_gridevent) +{ + long i=0; + long dense_grid_ID=0; + long dense_event_ID=0; + + for (i=0;iseq_value[i]) + return -1; + if (search_value[i]==seq_value[i]) + continue; + } + return 0; +} + +//binary search the dense_grid_seq to return the dense grid ID if it exists +long binary_search(long num_dense_grids, long num_dm, long *search_value, long **dense_grid_seq) +{ + + long low=0; + long high=0; + long mid=0; + long comp_result=0; + long match=0; + //long found=0; + + low = 0; + high = num_dense_grids-1; + + while (low <= high) + { + mid = (long)((low + high)/2); + + comp_result=compare_value(num_dm, search_value,dense_grid_seq[mid]); + + + switch(comp_result) + { + case 1: + high=mid-1; + break; + case -1: + low=mid+1; + break; + case 0: + match=1; + break; + } + if (match==1) + break; + } + + + + if (match==1) + { + return mid; + } + else + return -1; +} + + +/********************************************** Computing Centers Using IDs **********************************************/ + +void ID2Center(double **data_in, long file_Len, long num_dm, long *eventID_To_denseventID, long num_clust, long *cluster_ID, double **population_center) +{ + long i=0; + long j=0; + long ID=0; + long eventID=0; + long *size_c; + + + + size_c=(long *)malloc(sizeof(long)*num_clust); + memset(size_c,0,sizeof(long)*num_clust); + + for (i=0;imax_num_pop) && (merge_dist<5.0)) || ((num_population<=1) && (merge_dist>0.1))) + { + + if (num_population<=1) + merge_dist=merge_dist-0.1; + + if (num_population>max_num_pop) + merge_dist=merge_dist+0.1; + + + for (i=0;i(merge_dist*interval[t])) + merge=0; + } + + if ((merge) && (cluster_populationID[i]!=cluster_populationID[j])) + { + if (cluster_populationID[i]max_num_pop) && (num_population==1)) + break; + else + temp_num_population=num_population; + + if (num_clust<=1) + break; + } //end while + + for (i=0;ikmean_term) && (kmean_term<1)) || ((times=1))) + { + for (i=0;ivvv) + vvv=variation; //vvv is the biggest variation among the k clusters; + } + //compute_variation; + times++; + } //end for while + + + + + free(num); + + for (i=0;iMatrix[i][j]) + small[j]=Matrix[i][j]; + } + + size_c[id]++; + } + + for (i=0;i0)) //the smallest values excluded + size_mybound_0[cluster_id[i]][j]++; + else + { + if (Matrix[i][j]size_mybound_1[i][j]) + situ1=0; + else + situ1=1; + if (size_mybound_2[i][j]>size_mybound_3[i][j]) + situ2=2; + else + situ2=3; + + if ((situ1==0) && (situ2==2)) + { + if (size_mybound_0[i][j]>size_mybound_2[i][j]) + prof[i][j]=0; + else + prof[i][j]=2; + } + if ((situ1==0) && (situ2==3)) + { + if (size_mybound_0[i][j]>size_mybound_3[i][j]) + prof[i][j]=0; + else + prof[i][j]=3; + } + if ((situ1==1) && (situ2==2)) + { + if (size_mybound_1[i][j]>size_mybound_2[i][j]) + prof[i][j]=1; + else + prof[i][j]=2; + } + if ((situ1==1) && (situ2==3)) + { + if (size_mybound_1[i][j]>size_mybound_3[i][j]) + prof[i][j]=1; + else + prof[i][j]=3; + } + + //begin to output profile + if (j==num_dm-1) + { + if (prof[i][j]==0) + fprintf(fprof_id,"1\n"); + if (prof[i][j]==1) + fprintf(fprof_id,"2\n"); + if (prof[i][j]==2) + fprintf(fprof_id,"3\n"); + if (prof[i][j]==3) + fprintf(fprof_id,"4\n"); + } + else + { + if (prof[i][j]==0) + fprintf(fprof_id,"1\t"); + if (prof[i][j]==1) + fprintf(fprof_id,"2\t"); + if (prof[i][j]==2) + fprintf(fprof_id,"3\t"); + if (prof[i][j]==3) + fprintf(fprof_id,"4\t"); + } + } + } + fclose(fprof_id); + + /////////////////////////////////////////////////////////// + + + fpcnt_id=fopen("percentage.txt","w"); + fprintf(fpcnt_id,"Population_ID\tPercentage\n"); + + for (t=0;t0) + { + k=(long)sqrt((double)k); + if (num_dm<=3) + k=(long)(2.7183*(double)k); //e*k for low-dimensional space, added Nov. 4, 2010 + //printf("the k-nearest k is %d\n",k); + } + else + { + printf("error in k\n"); + exit(0); + } + + nearest_neighbors=(long *)malloc(sizeof(long)*k); + memset(nearest_neighbors,0,sizeof(long)*k); + + nearest_distance=(double *)malloc(sizeof(double)*k); + memset(nearest_distance,0,sizeof(double)*k); + + temp=0; + + for (i=0;ibiggest_distance) + { + biggest_distance=nearest_distance[j]; + current_biggest=j; + } + } + + if (biggest_distance>distance) + { + nearest_neighbors[current_biggest]=i; + nearest_distance[current_biggest]=distance; + } + } + } + } + + for (i=0;i population_center +//data + grid_ID -> grid_Center + +/* what is the final output */ +//the final things we want are grid_Center in the original data and grid_clusterID //or population_center in the original data +//Sparse grids will not be considered when computing the two centroids (centroids of grids and centroids of clusters) + +/* what information should select_bin output */ +//the size of all IDs are unknown to function main because we only consider the events in dense grids, and also the number of dense grids +//is unknown, therefore I must use a prescreening to output +//how many bins I should use +//the number of dense grids +//total number of events in the dense grids + +/* basic procedure of main function */ +//1. read raw file and normalize the raw file +//2. select_bin +//3. allocate memory for eventID_To_denseventID, grid_clusterID, grid_ID, cluster_ID. +//4. call select_dense and merge_grids with grid_clusterID, grid_ID, cluster_ID. +//5. release normalized data; allocate memory for grid_Center and population_center +//6. output grid_Center and population_center using ID2Center together with grid_clusterID //from IDs to centers + +int main (int argc, char **argv) +{ + //inputs + FILE *f_src; //source file pointer + + FILE *f_out; //coordinates + FILE *f_cid; //population-ID of events + FILE *f_ctr; //centroids of populations + FILE *f_results; //coordinates file event and population column + FILE *f_mfi; //added April 16, 2009 for mean fluorescence intensity + FILE *f_parameters; //number of bins and density calculated by + //the algorithm. Used to update the database + FILE *f_properties; //Properties file used by Image generation software + + //char tmpbuf[128]; + + char para_name_string[LINE_LEN]; + + long time_ID=-1; + long num_bin=0; //the bins I will use on each dimension + long num_pop=0; + long file_Len=0; //number of events + long num_dm=0; + long num_clust=0; + long num_dense_events=0; + long num_dense_grids=0; + long num_nonempty=0; + long num_population=0; + long num_real_pop=0; + long keep_merge=0; + long num_checked_range=0; + + //below are read from configuration file + long i=0; + long j=0; + long p=0; + long t=0; + long q=0; + long temp_i=0; + long temp_j=0; + long first_i=0; + long first_j=0; + long d1=0; + long d2=0; + long d3=0; + long d_d=0; + long max_num_pop=0; //upperbound of the number of populations that is equal to 2^d + long index_id=0; + long num_computed_population=0; + long tot_size=0; + + int den_t_event=0; + + double distance=0.0; + double dist=0.0; + double current_smallest_dist=0; + double max_d_dist=0.0; + double Ei=0.0; + double Ej=0.0; + double E1=0.0; + double E2=0.0; + double E3=0.0; + double Ep=0.0; + double temp=0.0; + double tmp=0.0; + + long *grid_clusterID; //(dense)gridID to clusterID + long *grid_ID; //(dense)eventID to gridID + long *cluster_ID; //(dense)eventID to clusterID + long *eventID_To_denseventID; //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k + long *all_grid_ID; //gridID for all events + long *densegridID_To_gridevent; + long *sorted_seq; + long *dense_grid_reverse; + long *population_ID; //denseeventID to populationID + long *cluster_populationID; //clusterID to populationID + long *grid_populationID; //gridID to populationID + long *all_population_ID; //populationID of event + long *size_c; + long *size_p_2; + long *partit; + long *size_p; + long *temp_size_j; + long *all_computed_pop_ID; + + + long **position; + long **dense_grid_seq; + long **temp_population_ID; + + double *interval; + double *center_1; + double *center_2; + double *center_3; + double *aver; + double *std; + double *center_p_1; + double *center_p_2; + double *center_p_3; + + double **population_center; //population centroids in the raw/original data + double **cluster_center; //cluster centroids in the raw/original data + double **new_population_center; + double **temp_population_center; + double **min_pop; + double **max_pop; + + double **input_data; + double **normalized_data; + + double **real_pop_center; + double **distance_pop; + + double ***ind_pop; + double ***ind_pop_3; + + int min = 999999; + int max = 0; + + printf( "Starting time:\t\t\t\t"); + fflush(stdout); + system("/bin/date"); + + /* + * Windows Version + _strtime( tmpbuf ); + printf( "Starting time:\t\t\t\t%s\n", tmpbuf ); + _strdate( tmpbuf ); + printf( "Starting date:\t\t\t\t%s\n", tmpbuf ); + */ + + ///////////////////////////////////////////////////////////// + + if ((argc!=2) && (argc!=3) && (argc!=4) && (argc!=5) && (argc!=6)) + { + fprintf(stderr,"Incorrect number of input parameters!\n"); + fprintf(stderr,"usage:\n"); + fprintf(stderr,"basic mode: flock data_file\n"); + fprintf(stderr,"advanced mode 0 (specify maximum # of pops): flock data_file max_num_pop\n"); + fprintf(stderr,"advanced mode 1 (without # of pops): flock data_file num_bin density_index\n"); + fprintf(stderr,"advanced mode 2 (specify # of pops): flock data_file num_bin density_index number_of_pop\n"); + fprintf(stderr,"advanced mode 3 (specify both # of pops): flock data_file num_bin density_index number_of_pop max_num_pop\n"); + exit(0); + } + + f_src=fopen(argv[1],"r"); + + if (argc==3) + { + max_num_pop=atoi(argv[2]); + printf("num of maximum pop is %ld\n",max_num_pop); + } + + if (argc==4) + { + num_bin=atoi(argv[2]); + printf("num_bin is %ld\n",num_bin); + + den_t_event=atoi(argv[3]); + printf("density_index is %d\n",den_t_event); + } + + if (argc==5) + { + num_bin=atoi(argv[2]); + printf("num_bin is %ld\n",num_bin); + + den_t_event=atoi(argv[3]); + printf("density_index is %d\n",den_t_event); + + num_pop=atoi(argv[4]); + printf("num of pop is %ld\n",num_pop); + } + + if (argc==6) + { + num_bin=atoi(argv[2]); + printf("num_bin is %ld\n",num_bin); + + den_t_event=atoi(argv[3]); + printf("density_index is %d\n",den_t_event); + + num_pop=atoi(argv[4]); + printf("num of pop is %ld\n",num_pop); + + max_num_pop=atoi(argv[5]); + printf("num of pop is %ld\n",max_num_pop); + } + + + if (num_pop==1) + { + printf("it is not allowed to specify only one population\n"); + exit(0); + } + + + + getfileinfo(f_src, &file_Len, &num_dm, para_name_string, &time_ID); //get the filelength, number of dimensions, and num/name of parameters + + + if (max_num_pop==0) + { + max_num_pop=(long)pow(2,num_dm); + if (max_num_pop>MAX_POP_NUM) + max_num_pop=MAX_POP_NUM; + } + + /************************************************* Read the data *****************************************************/ + + rewind(f_src); //reset the file pointer + + input_data = (double **)malloc(sizeof(double*)*file_Len); + memset(input_data,0,sizeof(double*)*file_Len); + for (i=0;i=1) //num_bin has been selected by user + { + select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin); + printf("user set bin number is %ld\n",num_bin); + } + else //num_bin has not been selected by user + { + num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin); + printf("selected bin number is %ld\n",num_bin); + } + printf("number of non-empty grids is %ld\n",num_nonempty); + + + + /* Although we return sorted_seq from select_bin(), we don't use it for anything, except possibly diagnostics */ + free(sorted_seq); + + + dense_grid_reverse=(long*)malloc(sizeof(long)*num_nonempty); + memset(dense_grid_reverse,0,sizeof(long)*num_nonempty); + + /************************************************* select_dense **********************************************/ + + if (den_t_event>=1) //den_t_event must be larger or equal to 2 if the user wants to set it + den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event); + else + { + den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event); + printf("automated selected density threshold is %d\n",den_t_event); + } + + printf("Number of dense grids is %ld\n",num_dense_grids); + + densegridID_To_gridevent = (long *)malloc(sizeof(long)*num_dense_grids); + memset(densegridID_To_gridevent,0,sizeof(long)*num_dense_grids); + + for (i=0;imax_d_dist) + { + max_d_dist=dist; + d_d=t; + } + } + + if (j==0) + d1=d_d; + if (j==1) + d2=d_d; + if (j==2) + d3=d_d; + } + + + for (t=0;tEi) && (Ep>Ej)) //the two parts should be partitioned + { + partit[i]=num_computed_population; + num_computed_population++; + } + + } //end for (i=0;i0) + { + j=temp_size_j[index_id]; + if (temp_population_ID[index_id][j]==1) + all_computed_pop_ID[i]=partit[index_id]; + + temp_size_j[index_id]++; + } + } + + printf("Number of groups after partitioning is %ld\n", num_computed_population); + + + free(size_p_2); + + free(aver); + free(std); + free(partit); + + free(center_p_1); + free(center_p_2); + free(center_p_3); + + + + for (i=0;i=num_computed_population) + printf("all_population_ID[%ld]=%ld\n",i,all_population_ID[i]); + } + + num_population=num_computed_population; + + free(all_computed_pop_ID); + //end partitioning + // since the num_population has changed, population_center needs to be redefined as below + + population_center=(double **)malloc(sizeof(double*)*num_population); + memset(population_center,0,sizeof(double*)*num_population); + for (i=0;inum_real_pop) + { + fprintf(stderr,"number of populations specified too large\n"); + exit(0); + } + + center_1=(double *)malloc(sizeof(double)*num_dm); + memset(center_1,0,sizeof(double)*num_dm); + + center_2=(double *)malloc(sizeof(double)*num_dm); + memset(center_2,0,sizeof(double)*num_dm); + + center_3=(double *)malloc(sizeof(double)*num_dm); + memset(center_3,0,sizeof(double)*num_dm); + + + + while (((num_real_pop>num_pop) && (num_pop!=0)) || ((keep_merge==1) && (num_pop==0) && (num_real_pop>2))) + { + + keep_merge=0; //so, if no entering merge function, while will exit when num_pop==0 + + real_pop_center=(double **)malloc(sizeof(double*)*num_real_pop); + memset(real_pop_center,0,sizeof(double*)*num_real_pop); + for (i=0;imax_pop[index_id][j]) + max_pop[index_id][j]=normalized_data[i][j]; + } + } + +/* for (i=0;i2) + num_checked_range=(long)((double)num_real_pop*(num_real_pop-1)/2.0); //to find the mergeable pair among the num_checked_range pairs of smallest distances + else + num_checked_range=1; + + size_c=(long *)malloc(sizeof(long)*num_real_pop); + memset(size_c,0,sizeof(long)*num_real_pop); + + for (i=0;imax_d_dist) + { + max_d_dist=dist; + d_d=j; + } + } + + if (i==0) + d1=d_d; + if (i==1) + d2=d_d; + if (i==2) + d3=d_d; + } + + //printf("d1=%d;d2=%d;d3=%d\n",d1,d2,d3); + + Ei=get_avg_dist(real_pop_center[temp_i], temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c); + Ej=get_avg_dist(real_pop_center[temp_j], temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c); + + for (t=0;ttemp_j) + { + all_population_ID[i]=all_population_ID[i]-1; + } + else + { + if (all_population_ID[i]==temp_j) + { + all_population_ID[i]=temp_i; + } + } + } + + num_real_pop--; + } + else + { + if ((num_pop!=0) && (num_popfirst_j) + { + all_population_ID[i]=all_population_ID[i]-1; + } + else + { + if (all_population_ID[i]==first_j) + { + all_population_ID[i]=first_i; + } + } + } + + num_real_pop--; + + + + } + } + //printf("num_real_pop is %d\n",num_real_pop); + + } //end of while + + + + free(center_1); + free(center_2); + free(center_3); + printf("Final number of populations is %ld\n",num_real_pop); + + new_population_center=(double **)malloc(sizeof(double*)*num_real_pop); + memset(new_population_center,0,sizeof(double*)*num_real_pop); + for (i=0;i max) { + max = (int)input_data[i][j]; + } + if (j==num_dm-1) + { + fprintf(f_out,"%d\n",(int)input_data[i][j]); + fprintf(f_results,"%d\t",(int)input_data[i][j]); + } + else + { + fprintf(f_out,"%d\t",(int)input_data[i][j]); + fprintf(f_results,"%d\t",(int)input_data[i][j]); + } + } + //fprintf(f_results,"%ld\t",i + 1); + fprintf(f_results,"%ld\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009 + } + +/* + f_parameters=fopen("parameters.txt","w"); + fprintf(f_parameters,"Number_of_Bins\t%ld\n",num_bin); + fprintf(f_parameters,"Density\t%d\n",den_t_event); + fprintf(f_parameters,"Min\t%d\n",min); + fprintf(f_parameters,"Max\t%d\n",max); + fclose(f_parameters); +*/ + + f_properties=fopen("fcs.properties","w"); + fprintf(f_properties,"Bins=%ld\n",num_bin); + fprintf(f_properties,"Density=%d\n",den_t_event); + fprintf(f_properties,"Min=%d\n",min); + fprintf(f_properties,"Max=%d\n",max); + fprintf(f_properties,"Populations=%ld\n",num_real_pop); + fprintf(f_properties,"Events=%ld\n",file_Len); + fprintf(f_properties,"Markers=%ld\n",num_dm); + fclose(f_properties); + + for (i=0;imax_grid) to STDERR -// 8. Fixed a bug for 2D data by using K=e*K -// 9. Added some header files, may not be necessary -// 10. Added a lower bound (at least two) for number of populations -/*************************************************************************************************************************************** - - FLOCK: FLOw cytometry Clustering without K (Named by: Jamie A. Lee and Richard H. Scheuermann) - - Author: (Max) Yu Qian, Ph.D. - - Copyright: Scheuermann Lab, Dept. of Pathology, UTSW - - Development: November 2005 ~ Forever - - Status: July 2010: Release 2.0 - - Usage: flock data_file - Note: the input file format must be channel values and the delimiter between two values must be a tab. - - - -****************************************************************************************************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include - - - -#define DEBUG 0 -#define LINE_LEN 1024 -#define FILE_NAME_LEN 128 -#define PARA_NAME_LEN 64 -#define MAX_VALUE 1000000000 -#define MIN_GRID 6 -#define MAX_GRID 50 -#define E_T 1.0 - -#define NORM_METHOD 2 //2 if z-score; 0 if no normalization; 1 if min-max -#define KMEANS_TERM 10 -#define MAX_POP_NUM 128 - -#ifndef max - #define max( a, b ) ( ((a) > (b)) ? (a) : (b) ) -#endif - -#ifndef min - #define min( a, b ) ( ((a) < (b)) ? (a) : (b) ) -#endif - -static long **Gr=0; -static long *gcID = 0; /* grid cluster IDs */ -static long *cluster_count=0; /* count of nodes per cluster */ -static long ndense=0; -static long ndim=0; -/* cid changes between depth-first searches, but is constant within a - single search, so it goes here. */ -static long cid=0; -/* Do a depth-first search for a single connected component in graph - * G. Start from node, tag the nodes found with cid, and record - * the tags in grid_clusterID. Also, record the node count in - * cluster_count. If we find a node that has already been assigned to - * a cluster, that means we're merging two clusters, so zero out the - * old cid's node count. - * - * Note that our graph is constructed as a DAG, so we can be - * guaranteed to terminate without checking for cycles. - * - * Note2: this function can potentially recurse to depth = ndense. - * Plan stack size accordingly. - * - * Output: - * - * grid_clusterID[] -- array where we tag the nodes. - * cluster_count[] -- count of the number of nodes per cluster. - */ -static void merge_cluster(long from, long into) -{ - int i; - - for(i=0; i= 0) { - /* We are, so zero the count for the old cluster. */ - cluster_count[ gcID[node] ] = 0; - merge_cluster(gcID[node], cid); - return; - } - - /* Update for this node */ - gcID[node] = cid; - cluster_count[cid]++; - - /* Recursively search the child nodes */ - for(i=0; i= 0) /* This is a child node */ - depth_first(Gr[node][i]); -} - -void bail(const char *msg) -{ - fprintf(stderr,"%s",msg); - exit(0); -} - - -static void check_clusters(long *gcID, int ndense) -{ - int i; - - for(i=0; ibiggest) - biggest=orig_data[i][j]; - if (orig_data[i][j] index in the original data of the ith element of the ordered list) - * grid_ID -- mapping between the original data and the "grids" (see below) found as a byproduct - * of the sorting procedure. - * num_nonempty -- the number of grids that occur in the data (= the number of distinct values assumed - * by grid_ID) - */ - -void radixsort_flock(long **position,long file_Len,long num_dm,long num_bin,long *sorted_seq,long *num_nonempty,long *grid_ID) -{ - long i=0; - long length=0; - long start=0; - long prev_ID=0; - long curr_ID=0; - - long j=0; - long t=0; - long p=0; - long loc=0; - long temp=0; - long equal=0; - - long *count; //count[i]=j means there are j numbers having value i at the processing digit - long *index; //index[i]=j means the starting position of grid i is j - long *cp; //current position - long *mark; //mark[i]=0 means it is not an ending point of a part, 1 means it is (a "part" is a group of items with identical bins for all dimensions) - long *seq; //temporary sequence - - count=(long*)malloc(sizeof(long)*num_bin); - memset(count,0,sizeof(long)*num_bin); - - cp=(long*)malloc(sizeof(long)*num_bin); - memset(cp,0,sizeof(long)*num_bin); - - index=(long*)malloc(sizeof(long)*num_bin); // initialized below - - seq=(long*)malloc(sizeof(long)*file_Len); - memset(seq,0,sizeof(long)*file_Len); - - mark=(long*)malloc(sizeof(long)*file_Len); - memset(mark,0,sizeof(long)*file_Len); - - for (i=0;i0) && (index[i+1]<=file_Len)) - { - mark[index[i+1]-1]=1; // Mark the end of the segment in the ordered list - } - else - { - printf("out of myboundary for mark at index[i+1]-1.\n"); - } - } - mark[file_Len-1]=1; - - for (i=0;i0)) - { - mark[index[t+1]-1]=1; // update the part boundaries to include the differences in the current radix. - } - - } - mark[i]=1; - - /* Update the permutation vector for the current part (i.e., from start to i). By the time we finish the loop over i - the PV will be completely updated for the partial ordering up to the current radix. */ - for (t=start;t<=i;t++) - { - loc=position[sorted_seq[t]][j];//loc=position[t][j]; - temp=index[loc]+cp[loc]; - if ((temp=0)) - { - // seq is a temporary because we have to maintain the old PV until we have finished this step. - seq[temp]=sorted_seq[t]; //sorted_seq[i]=temp is also another way to sort - cp[loc]++; - } - else - { - printf("out of myboundary for seq at temp.\n"); - } - } - - for (t=start;t<=i;t++) - { - // copy the temporary back into sorted_seq. sorted_seq is now updated for radix j up through - // entry i in the ordered list. - if ((t>=0) && (tbig[j]) - big[j]=data_in[i][j]; - - if (data_in[i][j]=big[j]) - position[i][j]=num_bin-1; - else - { - position[i][j]=(long)((data_in[i][j]-small[j])/interval[j]); //position[i][j]=t means point i is at the t grid of dimensional j - if ((position[i][j]>=num_bin) || (position[i][j]<0)) - { - //printf("position mis-computed in density analysis!\n"); - //exit(0); - fprintf(stderr,"Incorrect input file format or input parameters (number of bins overflows)!\n"); //modified on July 23, 2010 - exit(0); - } - } - } - } - - - free(small); - free(big); -} - -/********************************************** select_bin to select the number of bins **********************************/ -//num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty); -/* Determine the number of bins to use in each dimension. Additionally sort the data elements according to the binned - * values, and partition the data into "grids" with identical (binned) values. We try progressively more bins until we - * maximize a merit function, then return the results obtained using the optimal number of bins. - * - * Outputs: - * position -- binned data values - * sorted_seq -- permutation vector mapping the ordered list to the original data - * all_grid_ID -- grid to which each data element was assigned. - * num_nonempty -- number of distinct values assumed by all_grid_ID - * interval -- bin width for each data dimension - * return value -- the number of bins selected. - */ - -long select_bin(double **normalized_data, long file_Len, long num_dm, long min_grid, long max_grid, long **position, long *sorted_seq, - long *all_grid_ID, long *num_nonempty, double *interval, long user_num_bin) -{ - - long num_bin=0; - long select_num_bin=0; - long m=0; - long n=0; - - long i=0; - long bin_scope=0; - long temp_num_nonempty=0; - - long *temp_grid_ID; - long *temp_sorted_seq; - long **temp_position; - - //sorted_seq[i]=j means the event j ranks i - - double temp_index=0; - double *bin_index; - double *temp_interval; - - - temp_grid_ID=(long *)malloc(sizeof(long)*file_Len); - memset(temp_grid_ID,0,sizeof(long)*file_Len); - - temp_sorted_seq=(long *)malloc(sizeof(long)*file_Len); - memset(temp_sorted_seq,0,sizeof(long)*file_Len); - - temp_position=(long **)malloc(sizeof(long*)*file_Len); - memset(temp_position,0,sizeof(long*)*file_Len); - for (m=0;m=(double)(file_Len)*0.95) - break; - if ((bin_index[i]max_grid)) - { - fprintf(stderr,"Number of events collected is too few in terms of number of markers used. The file should not be processed!\n"); //modified on Nov 04, 2010 - exit(0); - } - - if (temp_index==0) - { - fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010 - exit(0); - } - - free(temp_grid_ID); - free(temp_sorted_seq); - free(bin_index); - free(temp_interval); - - for (m=0;m=den_t_event) - temp_num_dense_grids++; - - if (temp_num_dense_grids<=1) - { - //exit(0); - //printf("a too high density threshold is set! Please decrease your density threshold.\n"); - fprintf(stderr,"a too high density threshold! Please decrease your density threshold.\n"); //modified on July 23, 2010 - exit(0); - } - - if (temp_num_dense_grids>=100000) - { - //modified on July 23, 2010 - //printf("a too low density threshold is set! Please increase your density threshold.\n"); - //exit(0); - fprintf(stderr,"a too low density threshold! Please increase your density threshold.\n"); //modified on July 23, 2010 - exit(0); - } - - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Compute dense_grid_reverse - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - temp_num_dense_grids=0; - temp_num_dense_events=0; - for (i=0;i=den_t_event) - { - dense_grid_reverse[i]=temp_num_dense_grids; //dense_grid_reverse provides a mapping from all nonempty grids to dense grids. - temp_num_dense_grids++; - temp_num_dense_events+=all_grid_vol[i]; - } - } - - num_dense_grids[0]=temp_num_dense_grids; - num_dense_events[0]=temp_num_dense_events; - - free(grid_size_index); - free(all_grid_vol); - - return den_t_event; -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Compute densegridID_To_gridevent and eventID_To_denseventID -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent); -/* - * Filter the data so that only the data belonging to dense grids is left - * - * Output: - * eventID_To_denseeventID -- mapping from original event ID to ID in list containing only events contained in dense grids. - * densegridID_To_gridevent -- mapping from dense grids to prototype members of the grids. - * - */ - -void grid_To_event(long file_Len, long *dense_grid_reverse, long *all_grid_ID, long *eventID_To_denseventID, long *densegridID_To_gridevent) -{ - long i=0; - long dense_grid_ID=0; - long dense_event_ID=0; - - for (i=0;iseq_value[i]) - return -1; - if (search_value[i]==seq_value[i]) - continue; - } - return 0; -} - -//binary search the dense_grid_seq to return the dense grid ID if it exists -long binary_search(long num_dense_grids, long num_dm, long *search_value, long **dense_grid_seq) -{ - - long low=0; - long high=0; - long mid=0; - long comp_result=0; - long match=0; - //long found=0; - - low = 0; - high = num_dense_grids-1; - - while (low <= high) - { - mid = (long)((low + high)/2); - - comp_result=compare_value(num_dm, search_value,dense_grid_seq[mid]); - - - switch(comp_result) - { - case 1: - high=mid-1; - break; - case -1: - low=mid+1; - break; - case 0: - match=1; - break; - } - if (match==1) - break; - } - - - - if (match==1) - { - return mid; - } - else - return -1; -} - - -/********************************************** Computing Centers Using IDs **********************************************/ - -void ID2Center(double **data_in, long file_Len, long num_dm, long *eventID_To_denseventID, long num_clust, long *cluster_ID, double **population_center) -{ - long i=0; - long j=0; - long ID=0; - long eventID=0; - long *size_c; - - - - size_c=(long *)malloc(sizeof(long)*num_clust); - memset(size_c,0,sizeof(long)*num_clust); - - for (i=0;imax_num_pop) && (merge_dist<5.0)) || ((num_population<=1) && (merge_dist>0.1))) - { - - if (num_population<=1) - merge_dist=merge_dist-0.1; - - if (num_population>max_num_pop) - merge_dist=merge_dist+0.1; - - - for (i=0;i(merge_dist*interval[t])) - merge=0; - } - - if ((merge) && (cluster_populationID[i]!=cluster_populationID[j])) - { - if (cluster_populationID[i]max_num_pop) && (num_population==1)) - break; - else - temp_num_population=num_population; - - if (num_clust<=1) - break; - } //end while - - for (i=0;ikmean_term) && (kmean_term<1)) || ((times=1))) - { - for (i=0;ivvv) - vvv=variation; //vvv is the biggest variation among the k clusters; - } - //compute_variation; - times++; - } //end for while - - - - - free(num); - - for (i=0;iMatrix[i][j]) - small[j]=Matrix[i][j]; - } - - size_c[id]++; - } - - for (i=0;i0)) //the smallest values excluded - size_mybound_0[cluster_id[i]][j]++; - else - { - if (Matrix[i][j]size_mybound_1[i][j]) - situ1=0; - else - situ1=1; - if (size_mybound_2[i][j]>size_mybound_3[i][j]) - situ2=2; - else - situ2=3; - - if ((situ1==0) && (situ2==2)) - { - if (size_mybound_0[i][j]>size_mybound_2[i][j]) - prof[i][j]=0; - else - prof[i][j]=2; - } - if ((situ1==0) && (situ2==3)) - { - if (size_mybound_0[i][j]>size_mybound_3[i][j]) - prof[i][j]=0; - else - prof[i][j]=3; - } - if ((situ1==1) && (situ2==2)) - { - if (size_mybound_1[i][j]>size_mybound_2[i][j]) - prof[i][j]=1; - else - prof[i][j]=2; - } - if ((situ1==1) && (situ2==3)) - { - if (size_mybound_1[i][j]>size_mybound_3[i][j]) - prof[i][j]=1; - else - prof[i][j]=3; - } - - //begin to output profile - if (j==num_dm-1) - { - if (prof[i][j]==0) - fprintf(fprof_id,"1\n"); - if (prof[i][j]==1) - fprintf(fprof_id,"2\n"); - if (prof[i][j]==2) - fprintf(fprof_id,"3\n"); - if (prof[i][j]==3) - fprintf(fprof_id,"4\n"); - } - else - { - if (prof[i][j]==0) - fprintf(fprof_id,"1\t"); - if (prof[i][j]==1) - fprintf(fprof_id,"2\t"); - if (prof[i][j]==2) - fprintf(fprof_id,"3\t"); - if (prof[i][j]==3) - fprintf(fprof_id,"4\t"); - } - } - } - fclose(fprof_id); - - /////////////////////////////////////////////////////////// - - - fpcnt_id=fopen("percentage.txt","w"); - fprintf(fpcnt_id,"Population_ID\tPercentage\n"); - - for (t=0;t0) - { - k=(long)sqrt((double)k); - if (num_dm<=3) - k=(long)(2.7183*(double)k); //e*k for low-dimensional space, added Nov. 4, 2010 - //printf("the k-nearest k is %d\n",k); - } - else - { - printf("error in k\n"); - exit(0); - } - - nearest_neighbors=(long *)malloc(sizeof(long)*k); - memset(nearest_neighbors,0,sizeof(long)*k); - - nearest_distance=(double *)malloc(sizeof(double)*k); - memset(nearest_distance,0,sizeof(double)*k); - - temp=0; - - for (i=0;ibiggest_distance) - { - biggest_distance=nearest_distance[j]; - current_biggest=j; - } - } - - if (biggest_distance>distance) - { - nearest_neighbors[current_biggest]=i; - nearest_distance[current_biggest]=distance; - } - } - } - } - - for (i=0;i population_center -//data + grid_ID -> grid_Center - -/* what is the final output */ -//the final things we want are grid_Center in the original data and grid_clusterID //or population_center in the original data -//Sparse grids will not be considered when computing the two centroids (centroids of grids and centroids of clusters) - -/* what information should select_bin output */ -//the size of all IDs are unknown to function main because we only consider the events in dense grids, and also the number of dense grids -//is unknown, therefore I must use a prescreening to output -//how many bins I should use -//the number of dense grids -//total number of events in the dense grids - -/* basic procedure of main function */ -//1. read raw file and normalize the raw file -//2. select_bin -//3. allocate memory for eventID_To_denseventID, grid_clusterID, grid_ID, cluster_ID. -//4. call select_dense and merge_grids with grid_clusterID, grid_ID, cluster_ID. -//5. release normalized data; allocate memory for grid_Center and population_center -//6. output grid_Center and population_center using ID2Center together with grid_clusterID //from IDs to centers - -int main (int argc, char **argv) -{ - //inputs - FILE *f_src; //source file pointer - - FILE *f_out; //coordinates - FILE *f_cid; //population-ID of events - FILE *f_ctr; //centroids of populations - FILE *f_results; //coordinates file event and population column - FILE *f_mfi; //added April 16, 2009 for mean fluorescence intensity - FILE *f_parameters; //number of bins and density calculated by - //the algorithm. Used to update the database - FILE *f_properties; //Properties file used by Image generation software - - //char tmpbuf[128]; - - char para_name_string[LINE_LEN]; - - long time_ID=-1; - long num_bin=0; //the bins I will use on each dimension - long num_pop=0; - long file_Len=0; //number of events - long num_dm=0; - long num_clust=0; - long num_dense_events=0; - long num_dense_grids=0; - long num_nonempty=0; - long num_population=0; - long num_real_pop=0; - long keep_merge=0; - long num_checked_range=0; - - //below are read from configuration file - long i=0; - long j=0; - long p=0; - long t=0; - long q=0; - long temp_i=0; - long temp_j=0; - long first_i=0; - long first_j=0; - long d1=0; - long d2=0; - long d3=0; - long d_d=0; - long max_num_pop=0; //upperbound of the number of populations that is equal to 2^d - long index_id=0; - long num_computed_population=0; - long tot_size=0; - - int den_t_event=0; - - double distance=0.0; - double dist=0.0; - double current_smallest_dist=0; - double max_d_dist=0.0; - double Ei=0.0; - double Ej=0.0; - double E1=0.0; - double E2=0.0; - double E3=0.0; - double Ep=0.0; - double temp=0.0; - double tmp=0.0; - - long *grid_clusterID; //(dense)gridID to clusterID - long *grid_ID; //(dense)eventID to gridID - long *cluster_ID; //(dense)eventID to clusterID - long *eventID_To_denseventID; //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k - long *all_grid_ID; //gridID for all events - long *densegridID_To_gridevent; - long *sorted_seq; - long *dense_grid_reverse; - long *population_ID; //denseeventID to populationID - long *cluster_populationID; //clusterID to populationID - long *grid_populationID; //gridID to populationID - long *all_population_ID; //populationID of event - long *size_c; - long *size_p_2; - long *partit; - long *size_p; - long *temp_size_j; - long *all_computed_pop_ID; - - - long **position; - long **dense_grid_seq; - long **temp_population_ID; - - double *interval; - double *center_1; - double *center_2; - double *center_3; - double *aver; - double *std; - double *center_p_1; - double *center_p_2; - double *center_p_3; - - double **population_center; //population centroids in the raw/original data - double **cluster_center; //cluster centroids in the raw/original data - double **new_population_center; - double **temp_population_center; - double **min_pop; - double **max_pop; - - double **input_data; - double **normalized_data; - - double **real_pop_center; - double **distance_pop; - - double ***ind_pop; - double ***ind_pop_3; - - int min = 999999; - int max = 0; - - printf( "Starting time:\t\t\t\t"); - fflush(stdout); - system("/bin/date"); - - /* - * Windows Version - _strtime( tmpbuf ); - printf( "Starting time:\t\t\t\t%s\n", tmpbuf ); - _strdate( tmpbuf ); - printf( "Starting date:\t\t\t\t%s\n", tmpbuf ); - */ - - ///////////////////////////////////////////////////////////// - - if ((argc!=2) && (argc!=3) && (argc!=4) && (argc!=5) && (argc!=6)) - { - fprintf(stderr,"Incorrect number of input parameters!\n"); - fprintf(stderr,"usage:\n"); - fprintf(stderr,"basic mode: flock data_file\n"); - fprintf(stderr,"advanced mode 0 (specify maximum # of pops): flock data_file max_num_pop\n"); - fprintf(stderr,"advanced mode 1 (without # of pops): flock data_file num_bin density_index\n"); - fprintf(stderr,"advanced mode 2 (specify # of pops): flock data_file num_bin density_index number_of_pop\n"); - fprintf(stderr,"advanced mode 3 (specify both # of pops): flock data_file num_bin density_index number_of_pop max_num_pop\n"); - exit(0); - } - - f_src=fopen(argv[1],"r"); - - if (argc==3) - { - max_num_pop=atoi(argv[2]); - printf("num of maximum pop is %ld\n",max_num_pop); - } - - if (argc==4) - { - num_bin=atoi(argv[2]); - printf("num_bin is %ld\n",num_bin); - - den_t_event=atoi(argv[3]); - printf("density_index is %d\n",den_t_event); - } - - if (argc==5) - { - num_bin=atoi(argv[2]); - printf("num_bin is %ld\n",num_bin); - - den_t_event=atoi(argv[3]); - printf("density_index is %d\n",den_t_event); - - num_pop=atoi(argv[4]); - printf("num of pop is %ld\n",num_pop); - } - - if (argc==6) - { - num_bin=atoi(argv[2]); - printf("num_bin is %ld\n",num_bin); - - den_t_event=atoi(argv[3]); - printf("density_index is %d\n",den_t_event); - - num_pop=atoi(argv[4]); - printf("num of pop is %ld\n",num_pop); - - max_num_pop=atoi(argv[5]); - printf("num of pop is %ld\n",max_num_pop); - } - - - if (num_pop==1) - { - printf("it is not allowed to specify only one population\n"); - exit(0); - } - - - - getfileinfo(f_src, &file_Len, &num_dm, para_name_string, &time_ID); //get the filelength, number of dimensions, and num/name of parameters - - - if (max_num_pop==0) - { - max_num_pop=(long)pow(2,num_dm); - if (max_num_pop>MAX_POP_NUM) - max_num_pop=MAX_POP_NUM; - } - - /************************************************* Read the data *****************************************************/ - - rewind(f_src); //reset the file pointer - - input_data = (double **)malloc(sizeof(double*)*file_Len); - memset(input_data,0,sizeof(double*)*file_Len); - for (i=0;i=1) //num_bin has been selected by user - { - select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin); - printf("user set bin number is %ld\n",num_bin); - } - else //num_bin has not been selected by user - { - num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin); - printf("selected bin number is %ld\n",num_bin); - } - printf("number of non-empty grids is %ld\n",num_nonempty); - - - - /* Although we return sorted_seq from select_bin(), we don't use it for anything, except possibly diagnostics */ - free(sorted_seq); - - - dense_grid_reverse=(long*)malloc(sizeof(long)*num_nonempty); - memset(dense_grid_reverse,0,sizeof(long)*num_nonempty); - - /************************************************* select_dense **********************************************/ - - if (den_t_event>=1) //den_t_event must be larger or equal to 2 if the user wants to set it - den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event); - else - { - den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event); - printf("automated selected density threshold is %d\n",den_t_event); - } - - printf("Number of dense grids is %ld\n",num_dense_grids); - - densegridID_To_gridevent = (long *)malloc(sizeof(long)*num_dense_grids); - memset(densegridID_To_gridevent,0,sizeof(long)*num_dense_grids); - - for (i=0;imax_d_dist) - { - max_d_dist=dist; - d_d=t; - } - } - - if (j==0) - d1=d_d; - if (j==1) - d2=d_d; - if (j==2) - d3=d_d; - } - - - for (t=0;tEi) && (Ep>Ej)) //the two parts should be partitioned - { - partit[i]=num_computed_population; - num_computed_population++; - } - - } //end for (i=0;i0) - { - j=temp_size_j[index_id]; - if (temp_population_ID[index_id][j]==1) - all_computed_pop_ID[i]=partit[index_id]; - - temp_size_j[index_id]++; - } - } - - printf("Number of groups after partitioning is %ld\n", num_computed_population); - - - free(size_p_2); - - free(aver); - free(std); - free(partit); - - free(center_p_1); - free(center_p_2); - free(center_p_3); - - - - for (i=0;i=num_computed_population) - printf("all_population_ID[%ld]=%ld\n",i,all_population_ID[i]); - } - - num_population=num_computed_population; - - free(all_computed_pop_ID); - //end partitioning - // since the num_population has changed, population_center needs to be redefined as below - - population_center=(double **)malloc(sizeof(double*)*num_population); - memset(population_center,0,sizeof(double*)*num_population); - for (i=0;inum_real_pop) - { - fprintf(stderr,"number of populations specified too large\n"); - exit(0); - } - - center_1=(double *)malloc(sizeof(double)*num_dm); - memset(center_1,0,sizeof(double)*num_dm); - - center_2=(double *)malloc(sizeof(double)*num_dm); - memset(center_2,0,sizeof(double)*num_dm); - - center_3=(double *)malloc(sizeof(double)*num_dm); - memset(center_3,0,sizeof(double)*num_dm); - - - - while (((num_real_pop>num_pop) && (num_pop!=0)) || ((keep_merge==1) && (num_pop==0) && (num_real_pop>2))) - { - - keep_merge=0; //so, if no entering merge function, while will exit when num_pop==0 - - real_pop_center=(double **)malloc(sizeof(double*)*num_real_pop); - memset(real_pop_center,0,sizeof(double*)*num_real_pop); - for (i=0;imax_pop[index_id][j]) - max_pop[index_id][j]=normalized_data[i][j]; - } - } - -/* for (i=0;i2) - num_checked_range=(long)((double)num_real_pop*(num_real_pop-1)/2.0); //to find the mergeable pair among the num_checked_range pairs of smallest distances - else - num_checked_range=1; - - size_c=(long *)malloc(sizeof(long)*num_real_pop); - memset(size_c,0,sizeof(long)*num_real_pop); - - for (i=0;imax_d_dist) - { - max_d_dist=dist; - d_d=j; - } - } - - if (i==0) - d1=d_d; - if (i==1) - d2=d_d; - if (i==2) - d3=d_d; - } - - //printf("d1=%d;d2=%d;d3=%d\n",d1,d2,d3); - - Ei=get_avg_dist(real_pop_center[temp_i], temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c); - Ej=get_avg_dist(real_pop_center[temp_j], temp_i,temp_j, all_population_ID, num_real_pop,file_Len, num_dm, normalized_data, d1,d2,d3,size_c); - - for (t=0;ttemp_j) - { - all_population_ID[i]=all_population_ID[i]-1; - } - else - { - if (all_population_ID[i]==temp_j) - { - all_population_ID[i]=temp_i; - } - } - } - - num_real_pop--; - } - else - { - if ((num_pop!=0) && (num_popfirst_j) - { - all_population_ID[i]=all_population_ID[i]-1; - } - else - { - if (all_population_ID[i]==first_j) - { - all_population_ID[i]=first_i; - } - } - } - - num_real_pop--; - - - - } - } - //printf("num_real_pop is %d\n",num_real_pop); - - } //end of while - - - - free(center_1); - free(center_2); - free(center_3); - printf("Final number of populations is %ld\n",num_real_pop); - - new_population_center=(double **)malloc(sizeof(double*)*num_real_pop); - memset(new_population_center,0,sizeof(double*)*num_real_pop); - for (i=0;i max) { - max = (int)input_data[i][j]; - } - if (j==num_dm-1) - { - fprintf(f_out,"%d\n",(int)input_data[i][j]); - fprintf(f_results,"%d\t",(int)input_data[i][j]); - } - else - { - fprintf(f_out,"%d\t",(int)input_data[i][j]); - fprintf(f_results,"%d\t",(int)input_data[i][j]); - } - } - //fprintf(f_results,"%ld\t",i + 1); - fprintf(f_results,"%ld\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009 - } - -/* - f_parameters=fopen("parameters.txt","w"); - fprintf(f_parameters,"Number_of_Bins\t%ld\n",num_bin); - fprintf(f_parameters,"Density\t%d\n",den_t_event); - fprintf(f_parameters,"Min\t%d\n",min); - fprintf(f_parameters,"Max\t%d\n",max); - fclose(f_parameters); -*/ - - f_properties=fopen("fcs.properties","w"); - fprintf(f_properties,"Bins=%ld\n",num_bin); - fprintf(f_properties,"Density=%d\n",den_t_event); - fprintf(f_properties,"Min=%d\n",min); - fprintf(f_properties,"Max=%d\n",max); - fprintf(f_properties,"Populations=%ld\n",num_real_pop); - fprintf(f_properties,"Events=%ld\n",file_Len); - fprintf(f_properties,"Markers=%ld\n",num_dm); - fclose(f_properties); - - for (i=0;i