Previous changeset 0:8d951baf795f (2017-02-27) Next changeset 2:311a4f16c483 (2017-02-27) |
Commit message:
add FLOCK |
added:
src/README src/cent_adjust.c src/find_connected.c src/flock1.c src/flock2.c |
b |
diff -r 8d951baf795f -r 7eab80f86779 src/README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/README Mon Feb 27 13:26:09 2017 -0500 |
[ |
@@ -0,0 +1,139 @@ +This package contains R code for converting and transforming a binary +FCS file, using the FCSTrans software, and the C code for running the +flock1 and flock2 population identification software. + +src - contains the C code for flock1, flock2 and cent_adjust +bin - contains the compiled C code for flock1, flock2 and cent_adjust, + plus the R code for using the FCSTrans algorithm for file + conversion and data transformation. +doc - documentation for FCSTrans and FLOCK algorithms +example - sample data and output from FCSTrans and FLOCK + +To run this software sucessfully the code assumes the software was installed +in the /usr/local/flock directory and that R, plus the Bioconductor flowCore +module has been installed. If you use the ipconvert.sh shell script, you +may need to adjust the location of the RScript executable and the location +of the FCSTrans.R code by editing the shell script. + +############################################################################# +# Overview +############################################################################# +ImmPort-FLOCK + +FLOCK (FLOw Clustering without K), an automated population discovery tool for +multidimensional FCM data was designed to specifically take into account the +unique feature of FCM data and produce objective segregation of cell +populations. + +FLOCK parameter settings can be customized by defining Bins and Density +Threshold. The number of bins is an integer specifying the number of +equal-sized regions the data will be partitioned into on each axis. Increasing +the number of bins increases the sensitivity to detect rare populations but +may also result in single populations being divided. Density Threshold is the +cut-off value to separate the dense regions from background. It is a floating +point number that helps define population centers; increasing the threshold +may help separate major populations but could cause the algorithm to overlook +rare populations. + +############################################################################# +# Compiling C code +############################################################################# +cd bin +cc -o flock1 ../src/flock1.c ../src/find_connected.c -lm +cc -o flock2 ../src/flock2.c -lm +cc -o cent_adjust ../src/cent_adjust.c -lm + +############################################################################# +# FCSTrans +############################################################################# +A shell script named ipconvert.sh is included that runs the FCSTrans R +code for converting and transforming a binary FCS file. The output consists +of one text file containing the transformed channel intensity values and +another file containing a list of the FCS parameters. + +cd bin +/usr/local/flock/bin/ipconvert.sh ../example/data/FCS2.fcs +/usr/local/flock/bin/ipconvert.sh ../example/data/FCS3.fcs + +############################################################################# +# Running flock1 or flock2 +############################################################################# +Running the FLOCK1 and FLOCK2 algorithms generate 8 output files that have +generic file names. For this reason, it is recommended that one output +directory be created for one input file to the program. + +cd example/output/FCS2 +/usr/local/flock/bin/flock1 ../../data/FCS2.txt + +cd example/output/FCS3 +/usr/local/flock/bin/flock2 ../../data/FCS3.txt + +Files created: MFI.txt, percentage.txt, population_id.txt, profile.txt, + flock_results.txt, coordinates.txt, population_center.txt + and percentage.txt + +Usage Information for FLOCK1 +---------------------------------------------------------------------------- +basic mode: flock1 fcs.txt +advanced_ mode: flock1 fcs.txt num_bin density_index max_num_pop + +Usage Information for FLOCK2 +---------------------------------------------------------------------------- +basic mode: flock data_file +advanced mode 0 (specify maximum # of pops): flock data_file max_num_pop +advanced mode 1 (without # of pops): flock data_file num_bin density_index +advanced mode 2 (specify # of pops): flock data_file num_bin density_index + number_of_pop +advanced mode 3 (specify both # of pops): flock data_file num_bin density_index + number_of_pop max_num_pop + +FLOCK Output Files +---------------------------------------------------------------------------- +coordinates.txt: + Output is the intensity values for each marker and event + +flock_results.txt: + A combination of the input file, event identifiers and population + identifiers. + +MFI.txt: + Provides the mean fluorescence intensity for each population for each + marker/parameter + +population_id.txt: + Contains population identifiers (i.e, values from [1 to n] where n is + the population assigned to the corresponding events in the input data + file, one identifier per row.) + +population_center.txt: + Contains the centroid coordinates for each identified population + +percentage.txt: + Includes the population identifiers and percentage of events within that + population (relative to the whole data file) + +profile.txt: + Displays an expression profile, where the approximate expression level + for each marker is assigned a numeric value from 1-4, for each identified + population + +fcs_properties.txt: + Contains the number of events, number of populations, and number of + markers, as well as the algorithm parameters used in the analysis + +############################################################################# +# Running cent_adjust +############################################################################# +Running the cent_adjust algorithm generates 4 output files that have +generic file names. For this reason, it is recommened that one output +directory be created for one input file to the program. + +mkdir example/output/FCS2/cent_adjust +cd example/output/FCS2/cent_adjust +/usr/local/flock/bin/cent_adjust ../population_center.txt ../coordinates.txt + +Files created: MFI.txt, percentage.txt, population_id.txt and profile.txt + +Usage Information for cent_adjust +---------------------------------------------------------------------------- +basic mode: cent_adjust input_center input_data_file |
b |
diff -r 8d951baf795f -r 7eab80f86779 src/cent_adjust.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/cent_adjust.c Mon Feb 27 13:26:09 2017 -0500 |
[ |
b'@@ -0,0 +1,1009 @@\n+/////////////////////////////////////////////////////////\n+// Cent_adjust version number and modification history\n+// ImmPort BISC project\n+// Author: Yu "Max" Qian\n+// v1.01: Oct 16, 2009\n+// Line 899 of the main function:\n+// Changed kmean_term=1 to kmean_term=2\n+//////////////////////////////////////////////////////////\n+\n+\n+#include <time.h>\n+#include <stdio.h>\n+#include <stdlib.h>\n+#include <math.h>\n+#include <string.h>\n+\n+#define DEBUG 0\n+#define LINE_LEN 1024\n+#define FILE_NAME_LEN 128\n+#define PARA_NAME_LEN 64\n+#define MAX_VALUE 1000000000\n+#define CUBE 0\n+\n+\n+void getctrfileinfo(FILE *f_src_ctr, long *num_clust)\n+{\n+\tint ch=\'\\n\';\n+\tint prev=\'\\n\';\n+\tlong num_rows=0;\n+\n+\twhile ((ch = fgetc(f_src_ctr))!= EOF )\n+ {\n+\t\tif (ch == \'\\n\')\n+ {\n+\t\t\t++num_rows;\n+ }\n+\t\tprev = ch;\n+ }\n+\tif (prev!=\'\\n\')\n+\t\t++num_rows;\n+\t\n+\t*num_clust=num_rows;\n+\t//printf("center file has %ld rows\\n", *num_clust);\n+}\n+\n+/************************************* Read basic info of the source file **************************************/\n+void getfileinfo(FILE *f_src, long *file_Len, long *num_dm, char *name_string, int *time_ID)\n+{\n+\tchar src[LINE_LEN];\n+\tchar current_name[64];\n+\tchar prv;\n+\n+\tlong num_rows=0;\n+\tlong num_columns=0;\n+\tint ch=\'\\n\';\n+\tint prev=\'\\n\';\n+\tlong time_pos=0;\n+\tlong i=0;\n+\tlong j=0;\n+\n+\t\n+\n+\tsrc[0]=\'\\0\';\n+\tfgets(src, LINE_LEN, f_src);\n+\n+\tname_string[0]=\'\\0\';\n+\tcurrent_name[0]=\'\\0\';\n+\tprv=\'\\n\';\n+\n+\twhile ((src[i]==\' \') || (src[i]==\'\\t\')) //skip space and tab characters\n+\t\ti++;\n+\n+\twhile ((src[i]!=\'\\r\') && (src[i]!=\'\\n\')) //repeat until the end of the line\n+\t{\n+\t\tcurrent_name[j]=src[i];\n+\t\t\n+\t\tif ((src[i]==\'\\t\') && (prv!=\'\\t\')) //a complete word\n+\t\t{\n+\t\t\tcurrent_name[j]=\'\\0\';\n+\t\t\t\n+ /* \n+ * Commented out John Campbell, June 10 2010\n+ * We no longer want to automatically remove Time column.\n+ * This column should have been removed by column selection\n+ if (0!=strcmp(current_name,"Time"))\n+ {\n+ num_columns++; //num_columns does not inlcude the column of Time\n+ time_pos++;\n+ strcat(name_string,current_name); \n+ strcat(name_string,"\\t");\n+ }\n+ else\n+ {\n+ *time_ID=time_pos;\n+ }\n+ */\n+\n+ num_columns++;\n+ strcat(name_string,current_name);\n+ strcat(name_string,"\\t");\n+\n+\t\t current_name[0]=\'\\0\';\n+\t\t j=0;\t\t\t\n+\t\t}\t\t\n+\t\t\n+\t\tif ((src[i]==\'\\t\') && (prv==\'\\t\')) //a duplicate tab or space\n+\t\t{\n+\t\t\tcurrent_name[0]=\'\\0\';\n+\t\t\tj=0;\n+\t\t}\n+\t\t\n+ if (src[i]!=\'\\t\')\n+\t j++;\n+\n+\t\t\n+\t\tprv=src[i];\n+\t\ti++;\n+\t}\n+\t\n+\tif (prv!=\'\\t\') //the last one hasn\'t been retrieved\n+\t{\n+\t\tcurrent_name[j]=\'\\0\';\n+ /* \n+ * Commented out John Campbell, June 10 2010\n+ * We no longer want to automatically remove Time column.\n+ * This column should have been removed by column selection\n+ if (0!=strcmp(current_name,"Time"))\n+ {\n+ num_columns++;\n+ strcat(name_string,current_name);\n+ time_pos++;\n+ }\n+ else\n+ {\n+ *time_ID=time_pos;\n+ }\n+ */\n+\n+ num_columns++;\n+ strcat(name_string,current_name);\n+\t}\n+\n+\tif (DEBUG==1)\n+\t{\n+\t\tprintf("time_ID is %d\\n",*time_ID);\n+\t\tprintf("name_string is %s\\n",name_string);\n+\t}\n+\n+\t// # of rows\n+\n+\twhile ((ch = fgetc(f_src))!= EOF )\n+ {\n+\t\tif (ch == \'\\n\')\n+ {\n+\t\t\t++num_rows;\n+ }\n+\t\tprev = ch;\n+ }\n+\tif (prev!=\'\\n\')\n+\t\t++num_rows;\n+\t\n+\t*file_Len=num_rows;\n+\t*num_dm=num_columns; \n+\n+\t//printf("original file size is %ld; number of dimensions is %ld\\n", *file_Len, *num_dm);\n+}\n+\n+\n+\n+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n+/************************************* Read the source file into uncomp_data **************************************/\n+void readsource(FILE *f_src, lo'..b'0;\n+\tlong j=0;\n+\n+\tlong *cluster_id;\n+\tlong *IDmapping; //this is to keep the original populationID of the center.txt\n+\n+\tdouble kmean_term=0;\n+\t\n+\tdouble **cluster_center;\n+\tdouble **orig_data;\n+\tdouble **normalized_data;\n+\t\t\n+/*\n+\t_strtime( tmpbuf );\n+ printf( "Starting time:\\t\\t\\t\\t%s\\n", tmpbuf );\n+\t_strdate( tmpbuf );\n+ printf( "Starting date:\\t\\t\\t\\t%s\\n", tmpbuf );\n+*/\n+\n+\tif (argc!=3)\n+\t{\n+\t\tprintf("usage: cent_adjust input_center input_data_file\\n"); \n+\t\texit(0);\n+\t}\t\n+\t\n+\n+\tf_src_ctr=fopen(argv[1],"r");\t\n+\t\n+\t//read source data\n+\tf_src=fopen(argv[2],"r");\n+\t\n+\tgetfileinfo(f_src, &file_Len, &num_dm, name_string, &time_id); //get the filelength, number of dimensions, and num/name of parameters\n+\n+\trewind(f_src); //reset data file pointer\t\n+\n+\torig_data = (double **)malloc(sizeof(double*)*file_Len);\n+\tmemset(orig_data,0,sizeof(double*)*file_Len);\n+\tfor (i=0;i<file_Len;i++)\n+\t{\n+\t\torig_data[i]=(double *)malloc(sizeof(double)*num_dm);\n+\t\tmemset(orig_data[i],0,sizeof(double)*num_dm);\n+\t}\n+\t\n+\treadsource(f_src, file_Len, num_dm, orig_data, time_id); //read the data;\n+\t\n+\tfclose(f_src);\n+\t/////////////////////////////////////////////////////////////////////////////\n+\tgetctrfileinfo(f_src_ctr, &num_clust); //get how many populations\n+\tnorm_used=0;\n+\tdist_used=0;\n+\tkmean_term=2; //modified on Oct 16, 2009: changed kmean_term=1 to kmean_term=2\n+\n+\trewind(f_src_ctr); //reset center file pointer\n+\n+\t//read population center\n+\tcluster_center=(double **)malloc(sizeof(double*)*num_clust);\n+\tmemset(cluster_center,0,sizeof(double*)*num_clust);\n+\tfor (i=0;i<num_clust;i++)\n+\t{\n+\t\tcluster_center[i]=(double*)malloc(sizeof(double)*num_dm);\n+\t\tmemset(cluster_center[i],0,sizeof(double)*num_dm);\n+\t}\n+\tfor (i=0;i<num_clust;i++)\n+\t\tfor (j=0;j<num_dm;j++)\n+\t\t\tcluster_center[i][j]=0;\n+\n+\tIDmapping=(long *)malloc(sizeof(long)*num_clust);\n+\tmemset(IDmapping,0,sizeof(long)*num_clust);\n+\n+\treadcenter(f_src_ctr,num_clust,num_dm,cluster_center,IDmapping); //read population center\n+ fclose(f_src_ctr);\n+\n+\t/////////////////////////////////////////////////////////////////////////////\n+\tnormalized_data=(double **)malloc(sizeof(double*)*file_Len);\n+\tmemset(normalized_data,0,sizeof(double*)*file_Len);\n+\tfor (i=0;i<file_Len;i++)\n+\t{\n+\t\tnormalized_data[i]=(double *)malloc(sizeof(double)*num_dm);\n+\t\tmemset(normalized_data[i],0,sizeof(double)*num_dm);\n+\t}\n+\t\n+\ttran(orig_data, file_Len, num_dm, norm_used, normalized_data);\n+\t/************************************************* Compute number of clusters *************************************************/\n+\t\n+\tcluster_id=(long*)malloc(sizeof(long)*file_Len);\n+\tmemset(cluster_id,0,sizeof(long)*file_Len);\n+\n+\tassign_event(normalized_data,num_clust,dist_used,kmean_term,file_Len,num_dm,cluster_id,cluster_center,0);\n+\n+\t\n+\t//show(orig_data,cluster_id,file_Len,num_clust,num_dm,show_data,num_disp,name_string); \n+\tshow(orig_data, cluster_id, file_Len, num_clust, num_dm, name_string, IDmapping);\n+\n+\tf_cid=fopen("population_id.txt","w");\n+\n+\tfor (i=0;i<file_Len;i++)\n+\t\tfprintf(f_cid,"%ld\\n",IDmapping[cluster_id[i]]);\n+\t\t\n+\n+\tfclose(f_cid);\n+ \n+\t//added April 16, 2009\n+\tf_mfi=fopen("MFI.txt","w");\n+\n+\tfor (i=0;i<num_clust;i++)\n+\t{\n+\t\tfprintf(f_mfi,"%ld\\t",IDmapping[i]);\n+\n+\t\tfor (j=0;j<num_dm;j++)\n+\t\t{\n+\t\t\tif (j==num_dm-1)\n+\t\t\t\tfprintf(f_mfi,"%.0f\\n",cluster_center[i][j]);\n+\t\t\telse\n+\t\t\t\tfprintf(f_mfi,"%.0f\\t",cluster_center[i][j]);\n+\t\t}\n+\t}\n+\tfclose(f_mfi);\n+\n+\t//ended April 16, 2009\n+\n+\tfor (i=0;i<num_clust;i++)\n+\t\tfree(cluster_center[i]);\n+\tfree(cluster_center);\n+\t\t\n+\n+\t/********************************************** Release memory ******************************************/\n+ \n+\tfor (i=0;i<file_Len;i++)\n+\t{\n+\t\tfree(orig_data[i]);\t\t\n+\t\tfree(normalized_data[i]);\n+\t}\n+\t\n+\tfree(orig_data);\n+\tfree(normalized_data);\n+\tfree(cluster_id);\n+\tfree(IDmapping);\n+\n+/*\n+\t_strtime( tmpbuf );\n+ printf( "Ending time:\\t\\t\\t\\t%s\\n", tmpbuf );\n+\t_strdate( tmpbuf );\n+ printf( "Ending date:\\t\\t\\t\\t%s\\n", tmpbuf );\n+*/\n+\n+}\n' |
b |
diff -r 8d951baf795f -r 7eab80f86779 src/find_connected.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/find_connected.c Mon Feb 27 13:26:09 2017 -0500 |
[ |
@@ -0,0 +1,176 @@ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> + +//static const char *rcsid = "$Id: find_connected.c,v 1.1 2008/09/05 21:54:40 rpl Exp $"; + +int find_connected(int **G, int num_dense_grids, int ndim, int *grid_clusterID); +void depth_first(int startnode); + +void bail(const char *); /* exits via abort */ +static void check_clusters(int *gcID, int ndense); +static void merge_cluster(int from, int into); + + +/* Vars that will not change througout the depth-first recursion. We + store them here to avoid endless replication on the stack. */ +static int **Gr=0; +static int *gcID = 0; /* grid cluster IDs */ +static int *cluster_count=0; /* count of nodes per cluster */ +static int ndense=0; +static int ndim=0; +/* cid changes between depth-first searches, but is constant within a + single search, so it goes here. */ +static int cid=0; + +/* Find connected components in the graph of neighboring grids defined in G. + * + * Output: + * + * grid_clusterID[] -- cluster to which each dense grid was assigned + * return value -- number of clusters assigned. + */ +int find_connected(int **G, int n_dense_grids, int num_dm, int *grid_clusterID) +{ + int nclust=0; /* number of clusters found */ + int i; + int *subfac; + int subval=0,nempty=0; + int clustid=0; + + size_t sz = n_dense_grids*sizeof(int); + cluster_count = malloc(sz); + if(!cluster_count) + bail("find_connected: Unable to allocate %zd bytes.\n"); + memset(cluster_count,0,sz); + + /* set up the statics that will be used in the DFS */ + Gr=G; + gcID = grid_clusterID; + ndense = n_dense_grids; + ndim = num_dm; + + + for(i=0;i<ndense;++i) + grid_clusterID[i] = -1; + + for(i=0;i<ndense;++i) { + if(grid_clusterID[i] < 0) { /* grid hasn't been assigned yet */ + cid = nclust++; + depth_first(i); + } + } + +#ifndef NDEBUG + check_clusters(gcID,ndense); +#endif + + /* At this point we probably have some clusters that are empty due to merging. + We want to compact the cluster numbering to eliminate the empty clusters. */ + + subfac = malloc(sz); + if(!subfac) + bail("find_connected: Unable to allocate %zd bytes.\n"); + subval=0; + nempty=0; + + /* cluster #i needs to have its ID decremented by 1 for each empty cluster with + ID < i. Precaclulate the decrements in this loop: */ + for(i=0;i<nclust;++i) { + //clustid = grid_clusterID[i]; + if(cluster_count[i] == 0) { + subval++; + nempty++; + } + subfac[i] = subval; + } + + /* Now apply the decrements to all of the dense grids */ + for(i=0;i<ndense;++i) { + clustid = grid_clusterID[i]; + grid_clusterID[i] -= subfac[clustid]; + } + +#ifndef NDEBUG + // check_clusters(gcID,ndense); +#endif + + /* correct the number of clusters found */ + nclust -= nempty; + + return nclust; +} + + +/* Do a depth-first search for a single connected component in graph + * G. Start from node, tag the nodes found with cid, and record + * the tags in grid_clusterID. Also, record the node count in + * cluster_count. If we find a node that has already been assigned to + * a cluster, that means we're merging two clusters, so zero out the + * old cid's node count. + * + * Note that our graph is constructed as a DAG, so we can be + * guaranteed to terminate without checking for cycles. + * + * Note2: this function can potentially recurse to depth = ndense. + * Plan stack size accordingly. + * + * Output: + * + * grid_clusterID[] -- array where we tag the nodes. + * cluster_count[] -- count of the number of nodes per cluster. + */ + +void depth_first(int node) +{ + int i; + + if(gcID[node] == cid) // we're guaranteed no cycles, but it is possible to reach a node + return; // through two different paths in the same cluster. This early + // return saves us some unneeded work. + + /* Check to see if we are merging a cluster */ + if(gcID[node] >= 0) { + /* We are, so zero the count for the old cluster. */ + cluster_count[ gcID[node] ] = 0; + merge_cluster(gcID[node], cid); + return; + } + + /* Update for this node */ + gcID[node] = cid; + cluster_count[cid]++; + + /* Recursively search the child nodes */ + for(i=0; i<ndim; ++i) + if(Gr[node][i] >= 0) /* This is a child node */ + depth_first(Gr[node][i]); +} + +void bail(const char *msg) +{ + fprintf(stderr,"%s",msg); + abort(); +} + + +static void check_clusters(int *gcID, int ndense) +{ + int i; + + for(i=0; i<ndense; ++i) + if(gcID[i] < 0) { + fprintf(stderr,"faulty cluster id at i= %d\n",i); + abort(); + } +} + +static void merge_cluster(int from, int into) +{ + int i; + + for(i=0; i<ndense; ++i) + if(gcID[i] == from) + gcID[i] = into; +} |
b |
diff -r 8d951baf795f -r 7eab80f86779 src/flock1.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/flock1.c Mon Feb 27 13:26:09 2017 -0500 |
[ |
b'@@ -0,0 +1,2400 @@\n+/*****************************************************************************\n+\t\n+\tFLOCK: FLOw cytometry Clustering without K (Named by: Jamie A. Lee and Richard H. Scheuermann) \n+\t\n+\tAuthor: (Max) Yu Qian, Ph.D.\n+\t\n+\tCopyright: Scheuermann Lab, Dept. of Pathology, UTSW\n+\t\n+\tDevelopment: November 2005 ~ Forever\n+\n+\tAlgorithm Status: May 2007: Release 1.0\n+\n+\tUsage: flock data_file\n+\t\t Note: the input file format must be channel values and the delimiter between two values must be a tab.\n+\n+ Changes made July 23, 2010: made errors to STDERR\n+\tChanges made Nov 4, 2010: added one more error (select_num_bin<min_grid) || (select_num_bin>max_grid) to STDERR;\n+\t MAX_GRID changed to 50 as larger than 50 seems not useful for any file we have got\n+\t\n+******************************************************************************/\n+#include <time.h>\n+#include <stdio.h>\n+#include <stdlib.h>\n+#include <math.h>\n+#include <string.h>\n+#include <sys/stat.h>\n+#include <unistd.h>\n+#include <assert.h>\n+\n+\n+#define DEBUG 0\n+#define LINE_LEN 1024\n+#define FILE_NAME_LEN 128\n+#define PARA_NAME_LEN 64\n+#define MAX_VALUE 1000000000\n+#define MIN_GRID 6\n+#define MAX_GRID 50\n+\n+#define NORM_METHOD 2 //2 if z-score; 0 if no normalization; 1 if min-max \n+#define KMEANS_TERM 100\n+//#define MAX_NUM_POP 30\n+\n+\n+int find_connected(int **G, int num_dense_grids, int ndim, int *grid_clusterID);\n+\n+/************* Read basic info of the source file ****************************/\n+void getfileinfo(FILE *f_src, int *file_Len, int *num_dm, char *name_string, int *time_ID)\n+{\n+ char src[LINE_LEN];\n+ char current_name[64];\n+ char prv;\n+\n+ int num_rows=0;\n+ int num_columns=0;\n+ int ch=\'\\n\';\n+ int prev=\'\\n\';\n+ int time_pos=0;\n+ int i=0;\n+ int j=0;\n+ int sw=0;\n+\n+ src[0]=\'\\0\';\n+ fgets(src, LINE_LEN, f_src);\n+\n+ if ((src[0]==\'F\') && (src[1]==\'C\') && (src[2]==\'S\'))\n+\t{\n+\t\tfprintf(stderr,"the correct input format is a tab-delimited txt file, instead of FCS file.\\n");\n+\t\tabort();\n+\t}\n+\n+ name_string[0]=\'\\0\';\n+ current_name[0]=\'\\0\';\n+ prv=\'\\n\';\n+\n+ // skip space and tab characters\n+ while ((src[i]==\' \') || (src[i]==\'\\t\'))\n+ i++;\n+\n+ // repeat until the end of line is reached\n+ while ((src[i]!=\'\\0\') && (src[i]!=\'\\n\') && (src[i]!=\'\\r\'))\n+ {\n+ current_name[j]=src[i];\n+\t\t\n+ if ((src[i]==\'\\t\') && (prv!=\'\\t\')) //a complete word\n+ {\n+ current_name[j]=\'\\0\';\n+\t\t\t\n+ if (0!=strcmp(current_name,"Time"))\n+ {\n+ num_columns++; //num_columns does not inlcude the column of Time\n+ time_pos++;\n+ if (sw) {\n+ strcat(name_string,"\\t");\n+ }\n+ strcat(name_string,current_name); \n+ sw = 1;\n+ }\n+ else\n+ {\n+ *time_ID=time_pos;\n+ }\n+ \n+ \n+ current_name[0]=\'\\0\';\n+ j=0;\t\t\t\n+ }\t\t\n+\t\t\n+ if ((src[i]==\'\\t\') && (prv==\'\\t\')) //a duplicate tab or space\n+ {\n+ current_name[0]=\'\\0\';\n+ j=0;\n+ }\n+\t\t\n+ if (src[i]!=\'\\t\')\n+ j++;\n+\t\t\n+ prv=src[i];\n+ i++;\n+ }\n+\t\n+ if (prv!=\'\\t\') //the last one hasn\'t been retrieved\n+ {\n+ current_name[j]=\'\\0\';\n+ \n+ if (0!=strcmp(current_name,"Time"))\n+ {\n+ num_columns++;\n+ strcat(name_string,"\\t");\n+ strcat(name_string,current_name);\n+ time_pos++;\n+ }\n+ else\n+ {\n+ *time_ID=time_pos;\n+ }\n+ \n+ \n+ }\n+ if (DEBUG==1)\n+ {\n+ printf("time_ID is %d\\n",*time_ID);\n+ printf("name_string is %s\\n",name_string);\n+ }\n+\n+ //start computing # of rows\n+\n+ while ((ch = fgetc(f_src))!= EOF )\n+ {\n+ if (ch == \'\\n\')\n+ {\n+ ++num_rows;\n+ }\n+ prev = ch;\n+ }\n+ if (prev!=\'\\n\')\n+ ++num_rows;\n+\n+ //added on July 23, 2010\n+ if (num_rows<50)\n+ {\n+ fprintf(stderr,"Number of events '..b'lation_ID=(int*)malloc(sizeof(int)*file_Len);\n+ memset(all_population_ID,0,sizeof(int)*file_Len);\n+\n+ kmeans(normalized_data, num_population, KMEANS_TERM, file_Len, num_dm, all_population_ID, population_center);\n+ show(input_data, all_population_ID, file_Len, num_population, num_dm, para_name_string);\n+\n+ ID2Center_all(input_data,file_Len,num_dm,num_population,all_population_ID,population_center);\n+ \n+\n+ f_cid=fopen("population_id.txt","w");\n+ f_ctr=fopen("population_center.txt","w");\n+ f_out=fopen("coordinates.txt","w");\n+ f_results=fopen("flock_results.txt","w");\n+\n+/*\n+ f_parameters=fopen("parameters.txt","w");\n+ fprintf(f_parameters,"Number_of_Bins\\t%d\\n",num_bin);\n+ fprintf(f_parameters,"Density\\t%f\\n",aver_index);\n+ fclose(f_parameters);\n+*/\n+\n+ for (i=0;i<file_Len;i++)\n+\tfprintf(f_cid,"%d\\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009\n+\n+ /*\n+ * New to check for min/max to add to parameters.txt\n+ *\n+ */\n+ \n+ fprintf(f_out,"%s\\n",para_name_string);\n+ //fprintf(f_results,"%s\\tEvent\\tPopulation\\n",para_name_string);\n+ fprintf(f_results,"%s\\tPopulation\\n",para_name_string);\n+ for (i=0;i<file_Len;i++)\n+ {\n+\tfor (j=0;j<num_dm;j++)\n+\t{\n+\t\tif (input_data[i][j] < min) {\n+\t\t\tmin = (int)input_data[i][j];\n+\t\t}\n+\t\tif (input_data[i][j] > max) {\n+\t\t\tmax = (int)input_data[i][j];\n+\t\t}\n+\t\tif (j==num_dm-1)\n+\t\t{\n+\t\t\tfprintf(f_out,"%d\\n",(int)input_data[i][j]);\n+\t\t\tfprintf(f_results,"%d\\t",(int)input_data[i][j]);\n+\t\t}\n+\t\telse\n+\t\t{\n+\t\t\tfprintf(f_out,"%d\\t",(int)input_data[i][j]);\n+\t\t\tfprintf(f_results,"%d\\t",(int)input_data[i][j]);\n+\t\t}\n+\t}\n+\t//fprintf(f_results,"%d\\t",i + 1);\n+\tfprintf(f_results,"%d\\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009\n+ }\n+\n+/*\n+ f_parameters=fopen("parameters.txt","w");\n+ fprintf(f_parameters,"Number_of_Bins\\t%d\\n",num_bin);\n+ fprintf(f_parameters,"Density\\t%d\\n",den_t_event);\n+ fprintf(f_parameters,"Min\\t%d\\n",min);\n+ fprintf(f_parameters,"Max\\t%d\\n",max);\n+ fclose(f_parameters);\n+*/\n+\n+ f_properties=fopen("fcs.properties","w");\n+ fprintf(f_properties,"Bins=%d\\n",num_bin);\n+ fprintf(f_properties,"Density=%d\\n",den_t_event);\n+ fprintf(f_properties,"Min=%d\\n",min);\n+ fprintf(f_properties,"Max=%d\\n",max);\n+ fprintf(f_properties,"Populations=%d\\n",num_population);\n+ fprintf(f_properties,"Events=%d\\n",file_Len);\n+ fprintf(f_properties,"Markers=%d\\n",num_dm);\n+ fclose(f_properties);\n+\n+\n+ for (i=0;i<num_population;i++) {\n+\t/* Add if we want to include population id in the output\n+\t*/\n+\tfprintf(f_ctr,"%d\\t",i+1); //i changed to i+1 to start from 1 instead of 0: April 16, 2009\n+\n+\tfor (j=0;j<num_dm;j++) {\n+\t\tif (j==num_dm-1)\n+\t\t\tfprintf(f_ctr,"%.0f\\n",population_center[i][j]);\n+\t\telse\n+\t\t\tfprintf(f_ctr,"%.0f\\t",population_center[i][j]);\n+\t}\n+ }\n+\n+ \t//added April 16, 2009\n+\tf_mfi=fopen("MFI.txt","w");\n+\n+\tfor (i=0;i<num_population;i++)\n+\t{\n+\t\tfprintf(f_mfi,"%d\\t",i+1);\n+\n+\t\tfor (j=0;j<num_dm;j++)\n+\t\t{\n+\t\t\tif (j==num_dm-1)\n+\t\t\t\tfprintf(f_mfi,"%.0f\\n",population_center[i][j]);\n+\t\t\telse\n+\t\t\t\tfprintf(f_mfi,"%.0f\\t",population_center[i][j]);\n+\t\t}\n+\t}\n+\tfclose(f_mfi);\n+\n+\t//ended April 16, 2009\n+\t\t\t\n+ fclose(f_cid);\n+ fclose(f_ctr);\n+ fclose(f_out);\n+ fclose(f_results);\n+\n+\n+ for (i=0;i<num_population;i++)\n+ {\n+\tfree(population_center[i]);\n+ }\n+ free(population_center);\n+ \n+\n+ for (i=0;i<file_Len;i++)\n+ free(normalized_data[i]);\n+ free(normalized_data);\t\n+\t\n+ free(grid_populationID);\n+\n+ free(cluster_populationID);\n+ free(grid_clusterID);\n+ free(cluster_ID);\n+\n+ for (i=0;i<file_Len;i++)\n+ free(input_data[i]);\n+ free(input_data);\n+\n+ free(grid_ID);\n+ free(population_ID);\n+ free(all_population_ID);\n+ free(eventID_To_denseventID);\n+\t\t\n+ ///////////////////////////////////////////////////////////\n+ printf("Ending time:\\t\\t\\t\\t");\n+ fflush(stdout);\n+ system("/bin/date");\n+\n+ return 0;\n+\n+}\n' |
b |
diff -r 8d951baf795f -r 7eab80f86779 src/flock2.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/flock2.c Mon Feb 27 13:26:09 2017 -0500 |
[ |
b"@@ -0,0 +1,3405 @@\n+///////////\n+// Changes made:\n+// 1. Added another parameter: number of populations\n+// 2. Added a hierarchical merging step based on density change between centroids of two hyper-regions\n+// 3. Picked the longest dimensions between the population centroids to judge whether the two parts should be merged\n+// 4. Removed checking time parameter\n+// 5. Output error to stderr\n+// 6. Fixed the bug of density threshold always = 3\n+// 7. Added another error (select_num_bin<min_grid) || (select_num_bin>max_grid) to STDERR\n+// 8. Fixed a bug for 2D data by using K=e*K\n+// 9. Added some header files, may not be necessary\n+// 10. Added a lower bound (at least two) for number of populations\n+/***************************************************************************************************************************************\n+\t\n+\tFLOCK: FLOw cytometry Clustering without K (Named by: Jamie A. Lee and Richard H. Scheuermann) \n+\t\n+\tAuthor: (Max) Yu Qian, Ph.D.\n+\t\n+\tCopyright: Scheuermann Lab, Dept. of Pathology, UTSW\n+\t\n+\tDevelopment: November 2005 ~ Forever\n+\n+\tStatus: July 2010: Release 2.0\n+\n+\tUsage: flock data_file\n+\t\t Note: the input file format must be channel values and the delimiter between two values must be a tab.\n+\n+\n+ \t\n+****************************************************************************************************************************************/\n+\n+#include <time.h>\n+#include <stdio.h>\n+#include <stdlib.h>\n+#include <math.h>\n+#include <string.h>\n+#include <sys/stat.h>\n+#include <unistd.h>\n+#include <assert.h>\n+\n+\n+\n+#define DEBUG 0\n+#define LINE_LEN 1024\n+#define FILE_NAME_LEN 128\n+#define PARA_NAME_LEN 64\n+#define MAX_VALUE 1000000000\n+#define MIN_GRID 6\n+#define MAX_GRID 50\n+#define E_T 1.0\n+\n+#define NORM_METHOD 2 //2 if z-score; 0 if no normalization; 1 if min-max \n+#define KMEANS_TERM 10 \n+#define MAX_POP_NUM 128\n+\n+#ifndef max\n+ #define max( a, b ) ( ((a) > (b)) ? (a) : (b) )\n+#endif\n+\n+#ifndef min\n+ #define min( a, b ) ( ((a) < (b)) ? (a) : (b) )\n+#endif\n+\n+static long **Gr=0;\n+static long *gcID = 0; /* grid cluster IDs */\n+static long *cluster_count=0; /* count of nodes per cluster */\n+static long ndense=0;\n+static long ndim=0;\n+/* cid changes between depth-first searches, but is constant within a\n+ single search, so it goes here. */\n+static long cid=0;\n+/* Do a depth-first search for a single connected component in graph\n+ * G. Start from node, tag the nodes found with cid, and record\n+ * the tags in grid_clusterID. Also, record the node count in\n+ * cluster_count. If we find a node that has already been assigned to\n+ * a cluster, that means we're merging two clusters, so zero out the\n+ * old cid's node count.\n+ *\n+ * Note that our graph is constructed as a DAG, so we can be\n+ * guaranteed to terminate without checking for cycles. \n+ *\n+ * Note2: this function can potentially recurse to depth = ndense.\n+ * Plan stack size accordingly. \n+ *\n+ * Output:\n+ *\n+ * grid_clusterID[] -- array where we tag the nodes.\n+ * cluster_count[] -- count of the number of nodes per cluster.\n+ */\n+static void merge_cluster(long from, long into)\n+{\n+ int i;\n+\n+ for(i=0; i<ndense; ++i)\n+ if(gcID[i] == from)\n+ gcID[i] = into;\n+}\n+ \n+void depth_first(long node)\n+{\n+ long i;\n+\n+ if(gcID[node] == cid) // we're guaranteed no cycles, but it is possible to reach a node \n+ return; // through two different paths in the same cluster. This early\n+ // return saves us some unneeded work.\n+\n+ /* Check to see if we are merging a cluster */\n+ if(gcID[node] >= 0) {\n+ /* We are, so zero the count for the old cluster. */\n+ cluster_count[ gcID[node] ] = 0; \n+ merge_cluster(gcID[node], cid);\n+ return;\n+ }\n+\n+ /* Update for this node */\n+ gcID[node] = cid;\n+ cluster_count[cid]++;\n+\n+ /* Recursively search the child nodes */\n+ for(i=0; i<ndim; ++i)\n+ if(Gr[node][i] >= 0) /* This is a child node */\n"..b'ring);\n+\n+ ID2Center_all(input_data,file_Len,num_dm,num_real_pop,all_population_ID,new_population_center);\n+ \n+\n+ f_cid=fopen("population_id.txt","w");\n+ f_ctr=fopen("population_center.txt","w");\n+ f_out=fopen("coordinates.txt","w");\n+ f_results=fopen("flock_results.txt","w");\n+\n+/*\n+ f_parameters=fopen("parameters.txt","w");\n+ fprintf(f_parameters,"Number_of_Bins\\t%d\\n",num_bin);\n+ fprintf(f_parameters,"Density\\t%f\\n",aver_index);\n+ fclose(f_parameters);\n+*/\n+\n+ for (i=0;i<file_Len;i++)\n+\tfprintf(f_cid,"%ld\\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009\n+\n+ /*\n+ * New to check for min/max to add to parameters.txt\n+ *\n+ */\n+ \n+ fprintf(f_out,"%s\\n",para_name_string);\n+ //fprintf(f_results,"%s\\tEvent\\tPopulation\\n",para_name_string);\n+ fprintf(f_results,"%s\\tPopulation\\n",para_name_string);\n+ for (i=0;i<file_Len;i++)\n+ {\n+\tfor (j=0;j<num_dm;j++)\n+\t{\n+\t\tif (input_data[i][j] < min) {\n+\t\t\tmin = (int)input_data[i][j];\n+\t\t}\n+\t\tif (input_data[i][j] > max) {\n+\t\t\tmax = (int)input_data[i][j];\n+\t\t}\n+\t\tif (j==num_dm-1)\n+\t\t{\n+\t\t\tfprintf(f_out,"%d\\n",(int)input_data[i][j]);\n+\t\t\tfprintf(f_results,"%d\\t",(int)input_data[i][j]);\n+\t\t}\n+\t\telse\n+\t\t{\n+\t\t\tfprintf(f_out,"%d\\t",(int)input_data[i][j]);\n+\t\t\tfprintf(f_results,"%d\\t",(int)input_data[i][j]);\n+\t\t}\n+\t}\n+\t//fprintf(f_results,"%ld\\t",i + 1);\n+\tfprintf(f_results,"%ld\\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009\n+ }\n+\n+/*\n+ f_parameters=fopen("parameters.txt","w");\n+ fprintf(f_parameters,"Number_of_Bins\\t%ld\\n",num_bin);\n+ fprintf(f_parameters,"Density\\t%d\\n",den_t_event);\n+ fprintf(f_parameters,"Min\\t%d\\n",min);\n+ fprintf(f_parameters,"Max\\t%d\\n",max);\n+ fclose(f_parameters);\n+*/\n+\n+ f_properties=fopen("fcs.properties","w");\n+ fprintf(f_properties,"Bins=%ld\\n",num_bin);\n+ fprintf(f_properties,"Density=%d\\n",den_t_event);\n+ fprintf(f_properties,"Min=%d\\n",min);\n+ fprintf(f_properties,"Max=%d\\n",max);\n+ fprintf(f_properties,"Populations=%ld\\n",num_real_pop);\n+ fprintf(f_properties,"Events=%ld\\n",file_Len);\n+ fprintf(f_properties,"Markers=%ld\\n",num_dm);\n+ fclose(f_properties);\n+\n+ for (i=0;i<num_real_pop;i++) {\n+\t/* Add if we want to include population id in the output\n+\t*/\n+\tfprintf(f_ctr,"%ld\\t",i+1); //i changed to i+1 to start from 1 instead of 0: April 16, 2009\n+\n+\tfor (j=0;j<num_dm;j++) {\n+\t\tif (j==num_dm-1)\n+\t\t\tfprintf(f_ctr,"%.0f\\n",new_population_center[i][j]);\n+\t\telse\n+\t\t\tfprintf(f_ctr,"%.0f\\t",new_population_center[i][j]);\n+\t}\n+ }\n+\n+ \t//added April 16, 2009\n+\tf_mfi=fopen("MFI.txt","w");\n+\n+\tfor (i=0;i<num_real_pop;i++)\n+\t{\n+\t\tfprintf(f_mfi,"%ld\\t",i+1);\n+\n+\t\tfor (j=0;j<num_dm;j++)\n+\t\t{\n+\t\t\tif (j==num_dm-1)\n+\t\t\t\tfprintf(f_mfi,"%.0f\\n",new_population_center[i][j]);\n+\t\t\telse\n+\t\t\t\tfprintf(f_mfi,"%.0f\\t",new_population_center[i][j]);\n+\t\t}\n+\t}\n+\tfclose(f_mfi);\n+\n+\t//ended April 16, 2009\n+\t\t\t\n+ fclose(f_cid);\n+ fclose(f_ctr);\n+ fclose(f_out);\n+ fclose(f_results);\n+\n+\n+ for (i=0;i<num_population;i++)\n+ \tfree(population_center[i]);\n+ \n+ free(population_center);\n+ \n+ for (i=0;i<num_real_pop;i++)\n+\t free(new_population_center[i]);\n+ \n+ free(new_population_center);\n+\n+ for (i=0;i<file_Len;i++)\n+ free(normalized_data[i]);\n+ free(normalized_data);\t\n+\t\n+ free(grid_populationID);\n+\n+ free(cluster_populationID);\n+ free(grid_clusterID);\n+ free(cluster_ID);\n+\n+ for (i=0;i<file_Len;i++)\n+ free(input_data[i]);\n+ free(input_data);\n+\n+ free(grid_ID);\n+ free(population_ID);\n+ free(all_population_ID);\n+ free(eventID_To_denseventID);\n+\t\t\n+ ///////////////////////////////////////////////////////////\n+ printf("Ending time:\\t\\t\\t\\t");\n+ fflush(stdout);\n+ system("/bin/date");\n+\n+ /*\n+ * Windows version\n+ _strtime( tmpbuf );\n+ printf( "Ending time:\\t\\t\\t\\t%s\\n", tmpbuf );\n+ _strdate( tmpbuf );\n+ printf( "Ending date:\\t\\t\\t\\t%s\\n", tmpbuf );\n+ */\n+ \n+ return 0;\n+}\n' |