# HG changeset patch # User immport-devteam # Date 1488220063 18000 # Node ID 311a4f16c483502e10a51babdb7877d8e450db77 # Parent 7eab80f867799e58218e26cd176ba48e982ca9c6 Deleted selected files diff -r 7eab80f86779 -r 311a4f16c483 src/README --- a/src/README Mon Feb 27 13:26:09 2017 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,139 +0,0 @@ -This package contains R code for converting and transforming a binary -FCS file, using the FCSTrans software, and the C code for running the -flock1 and flock2 population identification software. - -src - contains the C code for flock1, flock2 and cent_adjust -bin - contains the compiled C code for flock1, flock2 and cent_adjust, - plus the R code for using the FCSTrans algorithm for file - conversion and data transformation. -doc - documentation for FCSTrans and FLOCK algorithms -example - sample data and output from FCSTrans and FLOCK - -To run this software sucessfully the code assumes the software was installed -in the /usr/local/flock directory and that R, plus the Bioconductor flowCore -module has been installed. If you use the ipconvert.sh shell script, you -may need to adjust the location of the RScript executable and the location -of the FCSTrans.R code by editing the shell script. - -############################################################################# -# Overview -############################################################################# -ImmPort-FLOCK - -FLOCK (FLOw Clustering without K), an automated population discovery tool for -multidimensional FCM data was designed to specifically take into account the -unique feature of FCM data and produce objective segregation of cell -populations. - -FLOCK parameter settings can be customized by defining Bins and Density -Threshold. The number of bins is an integer specifying the number of -equal-sized regions the data will be partitioned into on each axis. Increasing -the number of bins increases the sensitivity to detect rare populations but -may also result in single populations being divided. Density Threshold is the -cut-off value to separate the dense regions from background. It is a floating -point number that helps define population centers; increasing the threshold -may help separate major populations but could cause the algorithm to overlook -rare populations. - -############################################################################# -# Compiling C code -############################################################################# -cd bin -cc -o flock1 ../src/flock1.c ../src/find_connected.c -lm -cc -o flock2 ../src/flock2.c -lm -cc -o cent_adjust ../src/cent_adjust.c -lm - -############################################################################# -# FCSTrans -############################################################################# -A shell script named ipconvert.sh is included that runs the FCSTrans R -code for converting and transforming a binary FCS file. The output consists -of one text file containing the transformed channel intensity values and -another file containing a list of the FCS parameters. - -cd bin -/usr/local/flock/bin/ipconvert.sh ../example/data/FCS2.fcs -/usr/local/flock/bin/ipconvert.sh ../example/data/FCS3.fcs - -############################################################################# -# Running flock1 or flock2 -############################################################################# -Running the FLOCK1 and FLOCK2 algorithms generate 8 output files that have -generic file names. For this reason, it is recommended that one output -directory be created for one input file to the program. - -cd example/output/FCS2 -/usr/local/flock/bin/flock1 ../../data/FCS2.txt - -cd example/output/FCS3 -/usr/local/flock/bin/flock2 ../../data/FCS3.txt - -Files created: MFI.txt, percentage.txt, population_id.txt, profile.txt, - flock_results.txt, coordinates.txt, population_center.txt - and percentage.txt - -Usage Information for FLOCK1 ----------------------------------------------------------------------------- -basic mode: flock1 fcs.txt -advanced_ mode: flock1 fcs.txt num_bin density_index max_num_pop - -Usage Information for FLOCK2 ----------------------------------------------------------------------------- -basic mode: flock data_file -advanced mode 0 (specify maximum # of pops): flock data_file max_num_pop -advanced mode 1 (without # of pops): flock data_file num_bin density_index -advanced mode 2 (specify # of pops): flock data_file num_bin density_index - number_of_pop -advanced mode 3 (specify both # of pops): flock data_file num_bin density_index - number_of_pop max_num_pop - -FLOCK Output Files ----------------------------------------------------------------------------- -coordinates.txt: - Output is the intensity values for each marker and event - -flock_results.txt: - A combination of the input file, event identifiers and population - identifiers. - -MFI.txt: - Provides the mean fluorescence intensity for each population for each - marker/parameter - -population_id.txt: - Contains population identifiers (i.e, values from [1 to n] where n is - the population assigned to the corresponding events in the input data - file, one identifier per row.) - -population_center.txt: - Contains the centroid coordinates for each identified population - -percentage.txt: - Includes the population identifiers and percentage of events within that - population (relative to the whole data file) - -profile.txt: - Displays an expression profile, where the approximate expression level - for each marker is assigned a numeric value from 1-4, for each identified - population - -fcs_properties.txt: - Contains the number of events, number of populations, and number of - markers, as well as the algorithm parameters used in the analysis - -############################################################################# -# Running cent_adjust -############################################################################# -Running the cent_adjust algorithm generates 4 output files that have -generic file names. For this reason, it is recommened that one output -directory be created for one input file to the program. - -mkdir example/output/FCS2/cent_adjust -cd example/output/FCS2/cent_adjust -/usr/local/flock/bin/cent_adjust ../population_center.txt ../coordinates.txt - -Files created: MFI.txt, percentage.txt, population_id.txt and profile.txt - -Usage Information for cent_adjust ----------------------------------------------------------------------------- -basic mode: cent_adjust input_center input_data_file diff -r 7eab80f86779 -r 311a4f16c483 src/cent_adjust.c --- a/src/cent_adjust.c Mon Feb 27 13:26:09 2017 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1009 +0,0 @@ -///////////////////////////////////////////////////////// -// Cent_adjust version number and modification history -// ImmPort BISC project -// Author: Yu "Max" Qian -// v1.01: Oct 16, 2009 -// Line 899 of the main function: -// Changed kmean_term=1 to kmean_term=2 -////////////////////////////////////////////////////////// - - -#include -#include -#include -#include -#include - -#define DEBUG 0 -#define LINE_LEN 1024 -#define FILE_NAME_LEN 128 -#define PARA_NAME_LEN 64 -#define MAX_VALUE 1000000000 -#define CUBE 0 - - -void getctrfileinfo(FILE *f_src_ctr, long *num_clust) -{ - int ch='\n'; - int prev='\n'; - long num_rows=0; - - while ((ch = fgetc(f_src_ctr))!= EOF ) - { - if (ch == '\n') - { - ++num_rows; - } - prev = ch; - } - if (prev!='\n') - ++num_rows; - - *num_clust=num_rows; - //printf("center file has %ld rows\n", *num_clust); -} - -/************************************* Read basic info of the source file **************************************/ -void getfileinfo(FILE *f_src, long *file_Len, long *num_dm, char *name_string, int *time_ID) -{ - char src[LINE_LEN]; - char current_name[64]; - char prv; - - long num_rows=0; - long num_columns=0; - int ch='\n'; - int prev='\n'; - long time_pos=0; - long i=0; - long j=0; - - - - src[0]='\0'; - fgets(src, LINE_LEN, f_src); - - name_string[0]='\0'; - current_name[0]='\0'; - prv='\n'; - - while ((src[i]==' ') || (src[i]=='\t')) //skip space and tab characters - i++; - - while ((src[i]!='\r') && (src[i]!='\n')) //repeat until the end of the line - { - current_name[j]=src[i]; - - if ((src[i]=='\t') && (prv!='\t')) //a complete word - { - current_name[j]='\0'; - - /* - * Commented out John Campbell, June 10 2010 - * We no longer want to automatically remove Time column. - * This column should have been removed by column selection - if (0!=strcmp(current_name,"Time")) - { - num_columns++; //num_columns does not inlcude the column of Time - time_pos++; - strcat(name_string,current_name); - strcat(name_string,"\t"); - } - else - { - *time_ID=time_pos; - } - */ - - num_columns++; - strcat(name_string,current_name); - strcat(name_string,"\t"); - - current_name[0]='\0'; - j=0; - } - - if ((src[i]=='\t') && (prv=='\t')) //a duplicate tab or space - { - current_name[0]='\0'; - j=0; - } - - if (src[i]!='\t') - j++; - - - prv=src[i]; - i++; - } - - if (prv!='\t') //the last one hasn't been retrieved - { - current_name[j]='\0'; - /* - * Commented out John Campbell, June 10 2010 - * We no longer want to automatically remove Time column. - * This column should have been removed by column selection - if (0!=strcmp(current_name,"Time")) - { - num_columns++; - strcat(name_string,current_name); - time_pos++; - } - else - { - *time_ID=time_pos; - } - */ - - num_columns++; - strcat(name_string,current_name); - } - - if (DEBUG==1) - { - printf("time_ID is %d\n",*time_ID); - printf("name_string is %s\n",name_string); - } - - // # of rows - - while ((ch = fgetc(f_src))!= EOF ) - { - if (ch == '\n') - { - ++num_rows; - } - prev = ch; - } - if (prev!='\n') - ++num_rows; - - *file_Len=num_rows; - *num_dm=num_columns; - - //printf("original file size is %ld; number of dimensions is %ld\n", *file_Len, *num_dm); -} - - - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -/************************************* Read the source file into uncomp_data **************************************/ -void readsource(FILE *f_src, long file_Len, long num_dm, double **uncomp_data, int time_ID) -{ - long time_pass=0; //to mark whether the time_ID has been passed - long index=0; - - long i=0; - long j=0; - long t=0; - - char src[LINE_LEN]; - char xc[LINE_LEN/10]; - - src[0]='\0'; - fgets(src,LINE_LEN, f_src); //skip the first line about parameter names - - while (!feof(f_src) && (indexbiggest) - biggest=orig_data[i][j]; - if (orig_data[i][j]=1) - times_allowed = (long)kmean_term; - else - EPS = kmean_term; - - times=0; - - while (((vvv>EPS) && (kmean_term<1)) || ((times=1))) - { - for (i=0;ivvv) - vvv=variation; //vvv is the biggest variation among the k clusters; - } - //compute_variation; - times++; - } //end for while - - - - free(num); - for (i=0;iMatrix[i][j]) - small[j]=Matrix[i][j]; - } - - size_c[id]++; - } - - for (i=0;i0)) //the smallest values excluded - size_mybound_0[cluster_id[i]][j]++; - else - { - if (Matrix[i][j]size_mybound_1[i][j]) - situ1=0; - else - situ1=1; - if (size_mybound_2[i][j]>size_mybound_3[i][j]) - situ2=2; - else - situ2=3; - - if ((situ1==0) && (situ2==2)) - { - if (size_mybound_0[i][j]>size_mybound_2[i][j]) - prof[i][j]=0; - else - prof[i][j]=2; - } - if ((situ1==0) && (situ2==3)) - { - if (size_mybound_0[i][j]>size_mybound_3[i][j]) - prof[i][j]=0; - else - prof[i][j]=3; - } - if ((situ1==1) && (situ2==2)) - { - if (size_mybound_1[i][j]>size_mybound_2[i][j]) - prof[i][j]=1; - else - prof[i][j]=2; - } - if ((situ1==1) && (situ2==3)) - { - if (size_mybound_1[i][j]>size_mybound_3[i][j]) - prof[i][j]=1; - else - prof[i][j]=3; - } - - //begin to output profile - if (j==num_dm-1) - { - if (prof[i][j]==0) - fprintf(fprof_id,"1\n"); - if (prof[i][j]==1) - fprintf(fprof_id,"2\n"); - if (prof[i][j]==2) - fprintf(fprof_id,"3\n"); - if (prof[i][j]==3) - fprintf(fprof_id,"4\n"); - } - else - { - if (prof[i][j]==0) - fprintf(fprof_id,"1\t"); - if (prof[i][j]==1) - fprintf(fprof_id,"2\t"); - if (prof[i][j]==2) - fprintf(fprof_id,"3\t"); - if (prof[i][j]==3) - fprintf(fprof_id,"4\t"); - } - } - } - fclose(fprof_id); - - /////////////////////////////////////////////////////////// - - - fpcnt_id=fopen("percentage.txt","w"); - fprintf(fpcnt_id,"Population_ID\tPercentage\n"); - - for (t=0;t -#include -#include -#include - -//static const char *rcsid = "$Id: find_connected.c,v 1.1 2008/09/05 21:54:40 rpl Exp $"; - -int find_connected(int **G, int num_dense_grids, int ndim, int *grid_clusterID); -void depth_first(int startnode); - -void bail(const char *); /* exits via abort */ -static void check_clusters(int *gcID, int ndense); -static void merge_cluster(int from, int into); - - -/* Vars that will not change througout the depth-first recursion. We - store them here to avoid endless replication on the stack. */ -static int **Gr=0; -static int *gcID = 0; /* grid cluster IDs */ -static int *cluster_count=0; /* count of nodes per cluster */ -static int ndense=0; -static int ndim=0; -/* cid changes between depth-first searches, but is constant within a - single search, so it goes here. */ -static int cid=0; - -/* Find connected components in the graph of neighboring grids defined in G. - * - * Output: - * - * grid_clusterID[] -- cluster to which each dense grid was assigned - * return value -- number of clusters assigned. - */ -int find_connected(int **G, int n_dense_grids, int num_dm, int *grid_clusterID) -{ - int nclust=0; /* number of clusters found */ - int i; - int *subfac; - int subval=0,nempty=0; - int clustid=0; - - size_t sz = n_dense_grids*sizeof(int); - cluster_count = malloc(sz); - if(!cluster_count) - bail("find_connected: Unable to allocate %zd bytes.\n"); - memset(cluster_count,0,sz); - - /* set up the statics that will be used in the DFS */ - Gr=G; - gcID = grid_clusterID; - ndense = n_dense_grids; - ndim = num_dm; - - - for(i=0;i= 0) { - /* We are, so zero the count for the old cluster. */ - cluster_count[ gcID[node] ] = 0; - merge_cluster(gcID[node], cid); - return; - } - - /* Update for this node */ - gcID[node] = cid; - cluster_count[cid]++; - - /* Recursively search the child nodes */ - for(i=0; i= 0) /* This is a child node */ - depth_first(Gr[node][i]); -} - -void bail(const char *msg) -{ - fprintf(stderr,"%s",msg); - abort(); -} - - -static void check_clusters(int *gcID, int ndense) -{ - int i; - - for(i=0; imax_grid) to STDERR; - MAX_GRID changed to 50 as larger than 50 seems not useful for any file we have got - -******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include - - -#define DEBUG 0 -#define LINE_LEN 1024 -#define FILE_NAME_LEN 128 -#define PARA_NAME_LEN 64 -#define MAX_VALUE 1000000000 -#define MIN_GRID 6 -#define MAX_GRID 50 - -#define NORM_METHOD 2 //2 if z-score; 0 if no normalization; 1 if min-max -#define KMEANS_TERM 100 -//#define MAX_NUM_POP 30 - - -int find_connected(int **G, int num_dense_grids, int ndim, int *grid_clusterID); - -/************* Read basic info of the source file ****************************/ -void getfileinfo(FILE *f_src, int *file_Len, int *num_dm, char *name_string, int *time_ID) -{ - char src[LINE_LEN]; - char current_name[64]; - char prv; - - int num_rows=0; - int num_columns=0; - int ch='\n'; - int prev='\n'; - int time_pos=0; - int i=0; - int j=0; - int sw=0; - - src[0]='\0'; - fgets(src, LINE_LEN, f_src); - - if ((src[0]=='F') && (src[1]=='C') && (src[2]=='S')) - { - fprintf(stderr,"the correct input format is a tab-delimited txt file, instead of FCS file.\n"); - abort(); - } - - name_string[0]='\0'; - current_name[0]='\0'; - prv='\n'; - - // skip space and tab characters - while ((src[i]==' ') || (src[i]=='\t')) - i++; - - // repeat until the end of line is reached - while ((src[i]!='\0') && (src[i]!='\n') && (src[i]!='\r')) - { - current_name[j]=src[i]; - - if ((src[i]=='\t') && (prv!='\t')) //a complete word - { - current_name[j]='\0'; - - if (0!=strcmp(current_name,"Time")) - { - num_columns++; //num_columns does not inlcude the column of Time - time_pos++; - if (sw) { - strcat(name_string,"\t"); - } - strcat(name_string,current_name); - sw = 1; - } - else - { - *time_ID=time_pos; - } - - - current_name[0]='\0'; - j=0; - } - - if ((src[i]=='\t') && (prv=='\t')) //a duplicate tab or space - { - current_name[0]='\0'; - j=0; - } - - if (src[i]!='\t') - j++; - - prv=src[i]; - i++; - } - - if (prv!='\t') //the last one hasn't been retrieved - { - current_name[j]='\0'; - - if (0!=strcmp(current_name,"Time")) - { - num_columns++; - strcat(name_string,"\t"); - strcat(name_string,current_name); - time_pos++; - } - else - { - *time_ID=time_pos; - } - - - } - if (DEBUG==1) - { - printf("time_ID is %d\n",*time_ID); - printf("name_string is %s\n",name_string); - } - - //start computing # of rows - - while ((ch = fgetc(f_src))!= EOF ) - { - if (ch == '\n') - { - ++num_rows; - } - prev = ch; - } - if (prev!='\n') - ++num_rows; - - //added on July 23, 2010 - if (num_rows<50) - { - fprintf(stderr,"Number of events in the input file is too few and should not be processed!\n"); //modified on July 23, 2010 - abort(); - } - - *file_Len=num_rows; - *num_dm=num_columns; - - printf("original file size is %d; number of dimensions is %d\n", *file_Len, *num_dm); -} - - - -/************************************* Read the source file into uncomp_data **************************************/ -void readsource(FILE *f_src, int file_Len, int num_dm, double **uncomp_data, int time_ID) -{ - int time_pass=0; //to mark whether the time_ID has been passed - int index=0; - - int i=0; - int j=0; - int t=0; - - char src[LINE_LEN]; - char xc[LINE_LEN/10]; - - src[0]='\0'; - fgets(src,LINE_LEN, f_src); //skip the first line about parameter names - - while (!feof(f_src) && (indexbiggest) - biggest=orig_data[i][j]; - if (orig_data[i][j] index in the original data of the ith element of the ordered list) - * grid_ID -- mapping between the original data and the "grids" (see below) found as a byproduct - * of the sorting procedure. - * num_nonempty -- the number of grids that occur in the data (= the number of distinct values assumed - * by grid_ID) - */ - -void radixsort_flock(int **position,int file_Len,int num_dm,int num_bin,int *sorted_seq,int *num_nonempty,int *grid_ID) -{ - int i=0; - int length=0; - int start=0; - int prev_ID=0; - int curr_ID=0; - - int j=0; - int t=0; - int p=0; - int loc=0; - int temp=0; - int equal=0; - - int *count; //count[i]=j means there are j numbers having value i at the processing digit - int *index; //index[i]=j means the starting position of grid i is j - int *cp; //current position - int *mark; //mark[i]=0 means it is not an ending point of a part, 1 means it is (a "part" is a group of items with identical bins for all dimensions) - int *seq; //temporary sequence - - count=(int*)malloc(sizeof(int)*num_bin); - memset(count,0,sizeof(int)*num_bin); - - cp=(int*)malloc(sizeof(int)*num_bin); - memset(cp,0,sizeof(int)*num_bin); - - index=(int*)malloc(sizeof(int)*num_bin); // initialized below - - seq=(int*)malloc(sizeof(int)*file_Len); - memset(seq,0,sizeof(int)*file_Len); - - mark=(int*)malloc(sizeof(int)*file_Len); - memset(mark,0,sizeof(int)*file_Len); - - for (i=0;i0) && (index[i+1]<=file_Len)) - { - mark[index[i+1]-1]=1; // Mark the end of the segment in the ordered list - } - else - { - printf("out of myboundary for mark at index[i+1]-1.\n"); - } - } - mark[file_Len-1]=1; - - for (i=0;i0)) - { - mark[index[t+1]-1]=1; // update the part boundaries to include the differences in the current radix. - } - - } - mark[i]=1; - - /* Update the permutation vector for the current part (i.e., from start to i). By the time we finish the loop over i - the PV will be completely updated for the partial ordering up to the current radix. */ - for (t=start;t<=i;t++) - { - loc=position[sorted_seq[t]][j];//loc=position[t][j]; - temp=index[loc]+cp[loc]; - if ((temp=0)) - { - // seq is a temporary because we have to maintain the old PV until we have finished this step. - seq[temp]=sorted_seq[t]; //sorted_seq[i]=temp is also another way to sort - cp[loc]++; - } - else - { - printf("out of myboundary for seq at temp.\n"); - } - } - - for (t=start;t<=i;t++) - { - // copy the temporary back into sorted_seq. sorted_seq is now updated for radix j up through - // entry i in the ordered list. - if ((t>=0) && (tbig[j]) - big[j]=data_in[i][j]; - - if (data_in[i][j]=big[j]) - position[i][j]=num_bin-1; - else - { - position[i][j]=(int)((data_in[i][j]-small[j])/interval[j]); //position[i][j]=t means point i is at the t grid of dimensional j - if ((position[i][j]>=num_bin) || (position[i][j]<0)) - { - //printf("position mis-computed in density analysis!\n"); - //exit(0); - fprintf(stderr,"Incorrect input file format or input parameters (number of bins overflows)!\n"); //modified on July 23, 2010 - abort(); - - } - } - } - } - - - free(small); - free(big); -} - -/********************************************** select_bin to select the number of bins **********************************/ -//num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty); -/* Determine the number of bins to use in each dimension. Additionally sort the data elements according to the binned - * values, and partition the data into "grids" with identical (binned) values. We try progressively more bins until we - * maximize a merit function, then return the results obtained using the optimal number of bins. - * - * Outputs: - * position -- binned data values - * sorted_seq -- permutation vector mapping the ordered list to the original data - * all_grid_ID -- grid to which each data element was assigned. - * num_nonempty -- number of distinct values assumed by all_grid_ID - * interval -- bin width for each data dimension - * return value -- the number of bins selected. - */ - -int select_bin(double **normalized_data, int file_Len, int num_dm, int min_grid, int max_grid, int **position, int *sorted_seq, - int *all_grid_ID, int *num_nonempty, double *interval, int user_num_bin) -{ - - int num_bin=0; - int select_num_bin=0; - int m=0; - int n=0; - - int i=0; - int bin_scope=0; - int temp_num_nonempty=0; - - int *temp_grid_ID; - int *temp_sorted_seq; - int **temp_position; - - //sorted_seq[i]=j means the event j ranks i - - double temp_index=0; - double *bin_index; - double *temp_interval; - - - - temp_grid_ID=(int *)malloc(sizeof(int)*file_Len); - memset(temp_grid_ID,0,sizeof(int)*file_Len); - - temp_sorted_seq=(int *)malloc(sizeof(int)*file_Len); - memset(temp_sorted_seq,0,sizeof(int)*file_Len); - - temp_position=(int **)malloc(sizeof(int*)*file_Len); - memset(temp_position,0,sizeof(int*)*file_Len); - for (m=0;m=(double)(file_Len)*0.95) - break; - if ((bin_index[i]max_grid)) - { - fprintf(stderr,"Number of events collected is too few in terms of number of markers used. The file should not be processed!\n"); //modified on Nov 4, 2010 - exit(0); - } - - if (temp_index==0) - { - fprintf(stderr,"Too many dimensions with too few events in the input file, or a too large number of bins used.\n"); //modified on July 23, 2010 - abort(); - } - - - free(temp_grid_ID); - free(temp_sorted_seq); - free(bin_index); - free(temp_interval); - - for (m=0;m=den_t_event) - temp_num_dense_grids++; - - if (temp_num_dense_grids<=1) - { - //modified on July 23, 2010 - //printf("a too high density threshold is set! Please decrease your density threshold.\n"); - //exit(0); - fprintf(stderr,"a too high density threshold is set! Please decrease your density threshold.\n"); //modified on July 23, 2010 - abort(); - } - - if (temp_num_dense_grids>=100000) - { - //modified on July 23, 2010 - //printf("a too low density threshold is set! Please increase your density threshold.\n"); - //exit(0); - fprintf(stderr,"a too low density threshold is set! Please increase your density threshold.\n"); //modified on July 23, 2010 - abort(); - } - - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Compute dense_grid_reverse - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - temp_num_dense_grids=0; - temp_num_dense_events=0; - for (i=0;i=den_t_event) - { - dense_grid_reverse[i]=temp_num_dense_grids; //dense_grid_reverse provides a mapping from all nonempty grids to dense grids. - temp_num_dense_grids++; - temp_num_dense_events+=all_grid_vol[i]; - } - } - - num_dense_grids[0]=temp_num_dense_grids; - num_dense_events[0]=temp_num_dense_events; - - free(grid_size_index); - free(all_grid_vol); - - return den_t_event; -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Compute densegridID_To_gridevent and eventID_To_denseventID -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// grid_To_event(file_Len, dense_grid_reverse, all_grid_ID, eventID_To_denseventID, densegridID_To_gridevent); -/* - * Filter the data so that only the data belonging to dense grids is left - * - * Output: - * eventID_To_denseeventID -- mapping from original event ID to ID in list containing only events contained in dense grids. - * densegridID_To_gridevent -- mapping from dense grids to prototype members of the grids. - * - */ - -void grid_To_event(int file_Len, int *dense_grid_reverse, int *all_grid_ID, int *eventID_To_denseventID, int *densegridID_To_gridevent) -{ - int i=0; - int dense_grid_ID=0; - int dense_event_ID=0; - - for (i=0;iseq_value[i]) - return -1; - if (search_value[i]==seq_value[i]) - continue; - } - return 0; -} - -//binary search the dense_grid_seq to return the dense grid ID if it exists -int binary_search(int num_dense_grids, int num_dm, int *search_value, int **dense_grid_seq) -{ - - int low=0; - int high=0; - int mid=0; - int comp_result=0; - int match=0; - //int found=0; - - low = 0; - high = num_dense_grids-1; - - while (low <= high) - { - mid = (int)((low + high)/2); - - comp_result=compare_value(num_dm, search_value,dense_grid_seq[mid]); - - - switch(comp_result) - { - case 1: - high=mid-1; - break; - case -1: - low=mid+1; - break; - case 0: - match=1; - break; - } - if (match==1) - break; - } - - - - if (match==1) - { - return mid; - } - else - return -1; -} - - -/********************************************** Computing Centers Using IDs **********************************************/ - -void ID2Center(double **data_in, int file_Len, int num_dm, int *eventID_To_denseventID, int num_clust, int *cluster_ID, double **population_center) -{ - int i=0; - int j=0; - int ID=0; - int eventID=0; - int *size_c; - - - - size_c=(int *)malloc(sizeof(int)*num_clust); - memset(size_c,0,sizeof(int)*num_clust); - - for (i=0;imax_num_pop) || (num_population<=1)) - { - - if (num_population<=1) - merge_dist=merge_dist-0.1; - - if (num_population>max_num_pop) - merge_dist=merge_dist+0.1; - - - for (i=0;i(merge_dist*interval[t])) - merge=0; - } - - if ((merge) && (cluster_populationID[i]!=cluster_populationID[j])) - { - if (cluster_populationID[i]max_num_pop) && (num_population==1)) - break; - else - temp_num_population=num_population; - - if (num_clust<=1) - break; - } - - for (i=0;ikmean_term) && (kmean_term<1)) || ((times=1))) - { - for (i=0;ivvv) - vvv=variation; //vvv is the biggest variation among the k clusters; - } - //compute_variation; - times++; - } //end for while - - - - - free(num); - - for (i=0;iMatrix[i][j]) - small[j]=Matrix[i][j]; - } - - size_c[id]++; - } - - for (i=0;i0)) //the smallest values excluded - size_mybound_0[cluster_id[i]][j]++; - else - { - if (Matrix[i][j]size_mybound_1[i][j]) - situ1=0; - else - situ1=1; - if (size_mybound_2[i][j]>size_mybound_3[i][j]) - situ2=2; - else - situ2=3; - - if ((situ1==0) && (situ2==2)) - { - if (size_mybound_0[i][j]>size_mybound_2[i][j]) - prof[i][j]=0; - else - prof[i][j]=2; - } - if ((situ1==0) && (situ2==3)) - { - if (size_mybound_0[i][j]>size_mybound_3[i][j]) - prof[i][j]=0; - else - prof[i][j]=3; - } - if ((situ1==1) && (situ2==2)) - { - if (size_mybound_1[i][j]>size_mybound_2[i][j]) - prof[i][j]=1; - else - prof[i][j]=2; - } - if ((situ1==1) && (situ2==3)) - { - if (size_mybound_1[i][j]>size_mybound_3[i][j]) - prof[i][j]=1; - else - prof[i][j]=3; - } - - //begin to output profile - if (j==num_dm-1) - { - if (prof[i][j]==0) - fprintf(fprof_id,"1\n"); - if (prof[i][j]==1) - fprintf(fprof_id,"2\n"); - if (prof[i][j]==2) - fprintf(fprof_id,"3\n"); - if (prof[i][j]==3) - fprintf(fprof_id,"4\n"); - } - else - { - if (prof[i][j]==0) - fprintf(fprof_id,"1\t"); - if (prof[i][j]==1) - fprintf(fprof_id,"2\t"); - if (prof[i][j]==2) - fprintf(fprof_id,"3\t"); - if (prof[i][j]==3) - fprintf(fprof_id,"4\t"); - } - } - } - fclose(fprof_id); - - /////////////////////////////////////////////////////////// - - - fpcnt_id=fopen("percentage.txt","w"); - fprintf(fpcnt_id,"Population_ID\tPercentage\n"); - - for (t=0;t population_center -//data + grid_ID -> grid_Center - -/* what is the final output */ -//the final things we want are grid_Center in the original data and grid_clusterID //or population_center in the original data -//Sparse grids will not be considered when computing the two centroids (centroids of grids and centroids of clusters) - -/* what information should select_bin output */ -//the size of all IDs are unknown to function main because we only consider the events in dense grids, and also the number of dense grids -//is unknown, therefore I must use a prescreening to output -//how many bins I should use -//the number of dense grids -//total number of events in the dense grids - -/* basic procedure of main function */ -//1. read raw file and normalize the raw file -//2. select_bin -//3. allocate memory for eventID_To_denseventID, grid_clusterID, grid_ID, cluster_ID. -//4. call select_dense and merge_grids with grid_clusterID, grid_ID, cluster_ID. -//5. release normalized data; allocate memory for grid_Center and population_center -//6. output grid_Center and population_center using ID2Center together with grid_clusterID //from IDs to centers - -int main (int argc, char **argv) -{ - //inputs - FILE *f_src; //source file pointer - - FILE *f_out; //coordinates - FILE *f_cid; //population-ID of events - FILE *f_ctr; //centroids of populations - FILE *f_results; //coordinates file event and population column - FILE *f_mfi; //added April 16, 2009 for mean fluorescence intensity - FILE *f_parameters; //number of bins and density calculated by - //the algorithm. Used to update the database - FILE *f_properties; //Properties file used by Image generation software - - char para_name_string[LINE_LEN]; - - int time_ID=-1; - int num_bin=0; //the bins I will use on each dimension - - int file_Len=0; //number of events - int num_dm=0; - int num_clust=0; - int num_dense_events=0; - int num_dense_grids=0; - int num_nonempty=0; - int num_population=0; - //int temp=0; - - //below are read from configuration file - int i=0; - int j=0; - int max_num_pop=0; - - int den_t_event=0; - - int *grid_clusterID; //(dense)gridID to clusterID - int *grid_ID; //(dense)eventID to gridID - int *cluster_ID; //(dense)eventID to clusterID - int *eventID_To_denseventID; //eventID_To_denseventID[i]=k means event i is in a dense grid and its ID within dense events is k - int *all_grid_ID; //gridID for all events - int *densegridID_To_gridevent; - int *sorted_seq; - int *dense_grid_reverse; - int *population_ID; //denseeventID to populationID - int *cluster_populationID; //clusterID to populationID - int *grid_populationID; //gridID to populationID - int *all_population_ID; //populationID of event - - int **position; - int **dense_grid_seq; - - double *interval; - - double **population_center; //population centroids in the raw/original data - double **cluster_center; //cluster centroids in the raw/original data - - double **input_data; - double **normalized_data; - - int min = 999999; - int max = 0; - - printf( "Starting time:\t\t\t\t"); - fflush(stdout); - system("/bin/date"); - ///////////////////////////////////////////////////////////// - - if ((argc!=2) && (argc!=4) && (argc!=5)) - { - //modified on Jul 23, 2010 - //printf("usage:\n"); - //printf("basic mode: flock data_file\n"); - //printf("advanced mode: flock data_file num_bin density_index\n"); - //exit(0); - - fprintf(stderr,"Incorrect number of input parameters!\n"); //modified on July 23, 2010 - //fprintf(stderr,"basic mode: flock data_file\n"); //modified on July 23, 2010 - //fprintf(stderr,"advanced mode1: flock data_file num_bin density_index\n"); //modified on July 23, 2010 - fprintf(stderr,"advanced mode: flock data_file num_bin density_index max_num_pop\n"); //added on Dec 16, 2010 for GenePattern - abort(); - } - - f_src=fopen(argv[1],"r"); - - if (argc==2) - { - max_num_pop=30; //default value, maximum 30 clusters - } - - if (argc==4) - { - num_bin=atoi(argv[2]); - printf("num_bin is %d\n",num_bin); - - den_t_event=atoi(argv[3]); - printf("density_index is %d\n",den_t_event); - - max_num_pop=30; - - if (((num_bin<6) && (num_bin!=0)) || (num_bin>29)) - { - fprintf(stderr,"Incorrect input range of number of bins, which should be larger than 5 and smaller than 30\n"); - abort(); - } - - if (((den_t_event<3) && (den_t_event!=0)) || (den_t_event>99)) - { - fprintf(stderr,"Incorrect input range of density threshold, which should be larger than 2 and smaller than 100\n"); - abort(); - } - } - - if (argc==5) - { - num_bin=atoi(argv[2]); - printf("num_bin is %d\n",num_bin); - - den_t_event=atoi(argv[3]); - printf("density_index is %d\n",den_t_event); - - max_num_pop=atoi(argv[4]); - printf("max number of clusters is %d\n",max_num_pop); - - if (((num_bin<6) && (num_bin!=0)) || (num_bin>29)) - { - fprintf(stderr,"Incorrect input range of number of bins, which should be larger than 5 and smaller than 30\n"); - abort(); - } - - if (((den_t_event<3) && (den_t_event!=0)) || (den_t_event>99)) - { - fprintf(stderr,"Incorrect input range of density threshold, which should be larger than 2 and smaller than 100\n"); - abort(); - } - - if ((max_num_pop<5) || (max_num_pop>999)) - { - fprintf(stderr,"Incorrect input range of maximum number of populations, which should be larger than 4 and smaller than 1000\n"); - abort(); - } - } - - - getfileinfo(f_src, &file_Len, &num_dm, para_name_string, &time_ID); //get the filelength, number of dimensions, and num/name of parameters - - /************************************************* Read the data *****************************************************/ - - rewind(f_src); //reset the file pointer - - input_data = (double **)malloc(sizeof(double*)*file_Len); - memset(input_data,0,sizeof(double*)*file_Len); - for (i=0;i=1) //num_bin has been selected by user - select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin); - else //num_bin has not been selected by user - { - num_bin=select_bin(normalized_data, file_Len, num_dm, MIN_GRID, MAX_GRID, position, sorted_seq, all_grid_ID, &num_nonempty, interval,num_bin); - printf("selected bin number is %d\n",num_bin); - } - printf("number of non-empty grids is %d\n",num_nonempty); - - - - /* Although we return sorted_seq from select_bin(), we don't use it for anything, except possibly diagnostics */ - free(sorted_seq); - - - dense_grid_reverse=(int*)malloc(sizeof(int)*num_nonempty); - memset(dense_grid_reverse,0,sizeof(int)*num_nonempty); - - /************************************************* select_dense **********************************************/ - - if (den_t_event>=1) //den_t_event must be larger or equal to 2 if the user wants to set it - select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event); - else - { - den_t_event=select_dense(file_Len, all_grid_ID, num_nonempty, &num_dense_grids, &num_dense_events, dense_grid_reverse, den_t_event); - printf("automated selected density threshold is %d\n",den_t_event); - } - - printf("Number of dense grids is %d\n",num_dense_grids); - - densegridID_To_gridevent = (int *)malloc(sizeof(int)*num_dense_grids); - memset(densegridID_To_gridevent,0,sizeof(int)*num_dense_grids); - - for (i=0;i max) { - max = (int)input_data[i][j]; - } - if (j==num_dm-1) - { - fprintf(f_out,"%d\n",(int)input_data[i][j]); - fprintf(f_results,"%d\t",(int)input_data[i][j]); - } - else - { - fprintf(f_out,"%d\t",(int)input_data[i][j]); - fprintf(f_results,"%d\t",(int)input_data[i][j]); - } - } - //fprintf(f_results,"%d\t",i + 1); - fprintf(f_results,"%d\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009 - } - -/* - f_parameters=fopen("parameters.txt","w"); - fprintf(f_parameters,"Number_of_Bins\t%d\n",num_bin); - fprintf(f_parameters,"Density\t%d\n",den_t_event); - fprintf(f_parameters,"Min\t%d\n",min); - fprintf(f_parameters,"Max\t%d\n",max); - fclose(f_parameters); -*/ - - f_properties=fopen("fcs.properties","w"); - fprintf(f_properties,"Bins=%d\n",num_bin); - fprintf(f_properties,"Density=%d\n",den_t_event); - fprintf(f_properties,"Min=%d\n",min); - fprintf(f_properties,"Max=%d\n",max); - fprintf(f_properties,"Populations=%d\n",num_population); - fprintf(f_properties,"Events=%d\n",file_Len); - fprintf(f_properties,"Markers=%d\n",num_dm); - fclose(f_properties); - - - for (i=0;i