Repository 'cross_sample'
hg clone https://toolshed.g2.bx.psu.edu/repos/immport-devteam/cross_sample

Changeset 1:7eab80f86779 (2017-02-27)
Previous changeset 0:8d951baf795f (2017-02-27) Next changeset 2:311a4f16c483 (2017-02-27)
Commit message:
add FLOCK
added:
src/README
src/cent_adjust.c
src/find_connected.c
src/flock1.c
src/flock2.c
diff -r 8d951baf795f -r 7eab80f86779 src/README
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/README Mon Feb 27 13:26:09 2017 -0500
@@ -0,0 +1,139 @@
+This package contains R code for converting and transforming a binary
+FCS file, using the FCSTrans software, and the C code for running the
+flock1 and flock2 population identification software.
+
+src - contains the C code for flock1, flock2 and cent_adjust
+bin - contains the compiled C code for flock1, flock2 and cent_adjust,
+      plus the R code for using the FCSTrans algorithm for file
+      conversion and data transformation.
+doc - documentation for FCSTrans and FLOCK algorithms
+example - sample data and output from FCSTrans and FLOCK
+
+To run this software successfully, the code assumes it is installed in the
+/usr/local/flock directory and that R, along with the Bioconductor flowCore
+package, is also installed. If you use the ipconvert.sh shell script, you
+may need to adjust the location of the Rscript executable and the location
+of the FCSTrans.R code by editing the shell script.
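+
+A quick prerequisite check (the commands below are illustrative; adjust the
+paths to match your installation):
+
+which Rscript
+Rscript -e 'library(flowCore)'
+ls /usr/local/flock/bin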
+
+#############################################################################
+# Overview
+#############################################################################
+ImmPort-FLOCK
+
+FLOCK (FLOw Clustering without K), an automated population discovery tool for
+multidimensional FCM data, was designed specifically to take into account the
+unique features of FCM data and to produce an objective segregation of cell
+populations.
+
+FLOCK parameter settings can be customized by defining Bins and Density
+Threshold.  The number of bins is an integer specifying the number of
+equal-sized regions the data will be partitioned into on each axis. Increasing
+the number of bins increases the sensitivity to detect rare populations but
+may also result in single populations being divided.  Density Threshold is the
+cut-off value to separate the dense regions from background. It is a floating
+point number that helps define population centers; increasing the threshold
+may help separate major populations but could cause the algorithm to overlook
+rare populations.
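+
+As an illustration (15 bins, a density threshold of 3.0 and a maximum of 30
+populations are arbitrary example values, not recommendations), both settings
+are supplied as the advanced-mode command-line arguments described further
+below:
+
+/usr/local/flock/bin/flock1 fcs.txt 15 3.0 30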
+
+#############################################################################
+# Compiling C code
+#############################################################################
+cd bin
+cc -o flock1 ../src/flock1.c ../src/find_connected.c -lm
+cc -o flock2 ../src/flock2.c -lm
+cc -o cent_adjust ../src/cent_adjust.c -lm
+
+#############################################################################
+# FCSTrans
+#############################################################################
+A shell script named ipconvert.sh is included that runs the FCSTrans R
+code for converting and transforming a binary FCS file. The output consists
+of one text file containing the transformed channel intensity values and
+another file containing a list of the FCS parameters.
+
+cd bin
+/usr/local/flock/bin/ipconvert.sh ../example/data/FCS2.fcs
+/usr/local/flock/bin/ipconvert.sh ../example/data/FCS3.fcs
+
+#############################################################################
+# Running flock1 or flock2
+#############################################################################
+Running the FLOCK1 or FLOCK2 algorithm generates 8 output files with generic
+file names. For this reason, it is recommended that a separate output
+directory be created for each input file to the program.
+
+cd example/output/FCS2
+/usr/local/flock/bin/flock1 ../../data/FCS2.txt
+
+cd example/output/FCS3
+/usr/local/flock/bin/flock2 ../../data/FCS3.txt
+
+Files created: MFI.txt, percentage.txt, population_id.txt, profile.txt,
+               flock_results.txt, coordinates.txt, population_center.txt
+               and fcs_properties.txt
+
+Usage Information for FLOCK1
+----------------------------------------------------------------------------
+basic mode: flock1 fcs.txt
+advanced mode: flock1 fcs.txt num_bin density_index max_num_pop
+
+Usage Information for FLOCK2
+----------------------------------------------------------------------------
+basic mode: flock data_file
+advanced mode 0 (specify maximum # of pops): flock data_file max_num_pop
+advanced mode 1 (without # of pops): flock data_file num_bin density_index
+advanced mode 2 (specify # of pops): flock data_file num_bin density_index
+                                           number_of_pop
+advanced mode 3 (specify # of pops and max # of pops): flock data_file num_bin
+                                           density_index number_of_pop max_num_pop
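+
+Illustrative invocations of the modes above (the file name and the parameter
+values are examples only):
+
+/usr/local/flock/bin/flock2 FCS2.txt                # basic mode
+/usr/local/flock/bin/flock2 FCS2.txt 30             # advanced mode 0
+/usr/local/flock/bin/flock2 FCS2.txt 15 3.0         # advanced mode 1
+/usr/local/flock/bin/flock2 FCS2.txt 15 3.0 12      # advanced mode 2
+/usr/local/flock/bin/flock2 FCS2.txt 15 3.0 12 30   # advanced mode 3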
+
+FLOCK Output Files
+----------------------------------------------------------------------------
+coordinates.txt:
+    Contains the intensity values for each marker and event
+
+flock_results.txt:
+    A combination of the input file, event identifiers and population
+    identifiers.
+
+MFI.txt:
+    Provides the mean fluorescence intensity for each population for each
+    marker/parameter
+
+population_id.txt:
+    Contains population identifiers (i.e., values from 1 to n, where n is the
+    number of populations), one identifier per row, assigned to the
+    corresponding events in the input data file
+
+population_center.txt:
+    Contains the centroid coordinates for each identified population
+
+percentage.txt:
+    Includes the population identifiers and percentage of events within that
+    population (relative to the whole data file)
+
+profile.txt:
+    Displays an expression profile, where the approximate expression level
+    for each marker is assigned a numeric value from 1-4, for each identified
+    population
+
+fcs_properties.txt:
+    Contains the number of events, number of populations, and number of
+    markers, as well as the algorithm parameters used in the analysis
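+
+Assuming the population identifier is written as the last column of
+flock_results.txt, the events assigned to a single population can be pulled
+out with a simple filter (illustrative; population 3 and the output file name
+are arbitrary):
+
+awk -F'\t' 'NR==1 || $NF==3' flock_results.txt > population_3_events.txt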
+
+#############################################################################
+# Running cent_adjust 
+#############################################################################
+Running the cent_adjust algorithm generates 4 output files with generic
+file names. For this reason, it is recommended that a separate output
+directory be created for each input file to the program.
+
+mkdir example/output/FCS2/cent_adjust
+cd example/output/FCS2/cent_adjust
+/usr/local/flock/bin/cent_adjust ../population_center.txt ../coordinates.txt
+
+Files created: MFI.txt, percentage.txt, population_id.txt and profile.txt
+
+Usage Information for cent_adjust
+----------------------------------------------------------------------------
+basic mode: cent_adjust input_center input_data_file
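+
+Because cent_adjust takes the center file and the data file as separate
+arguments, centers discovered in one sample can also be applied to the events
+of another sample. An illustrative cross-sample run, assuming the example
+layout used above:
+
+mkdir example/output/FCS3/cent_adjust
+cd example/output/FCS3/cent_adjust
+/usr/local/flock/bin/cent_adjust ../../FCS2/population_center.txt ../coordinates.txt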
diff -r 8d951baf795f -r 7eab80f86779 src/cent_adjust.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/cent_adjust.c Mon Feb 27 13:26:09 2017 -0500
@@ -0,0 +1,1009 @@
+/////////////////////////////////////////////////////////
+//  Cent_adjust version number and modification history
+//  ImmPort BISC project
+//  Author: Yu "Max" Qian
+//  v1.01: Oct 16, 2009
+//         Line 899 of the main function:
+//         Changed kmean_term=1 to kmean_term=2
+//////////////////////////////////////////////////////////
[... remainder of the src/cent_adjust.c diff is truncated in this view.
cent_adjust reads a population center file and a tab-delimited data file
(usage: cent_adjust input_center input_data_file), assigns each event to its
nearest population center, and writes population_id.txt, MFI.txt and the other
outputs listed in the README.]
diff -r 8d951baf795f -r 7eab80f86779 src/find_connected.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/find_connected.c Mon Feb 27 13:26:09 2017 -0500
@@ -0,0 +1,176 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+//static const char *rcsid = "$Id: find_connected.c,v 1.1 2008/09/05 21:54:40 rpl Exp $";
+
+int find_connected(int **G, int num_dense_grids, int ndim, int *grid_clusterID);
+void depth_first(int startnode);
+
+void bail(const char *);        /* exits via abort */
+static void check_clusters(int *gcID, int ndense);
+static void merge_cluster(int from, int into);
+  
+
+/* Vars that will not change throughout the depth-first recursion.  We
+   store them here to avoid endless replication on the stack. */
+static int **Gr=0;
+static int *gcID = 0;          /* grid cluster IDs */
+static int *cluster_count=0;   /* count of nodes per cluster */
+static int ndense=0;
+static int ndim=0;
+/* cid changes between depth-first searches, but is constant within a
+   single search, so it goes here. */
+static int cid=0;
+
+/* Find connected components in the graph of neighboring grids defined in G.
+ *
+ * Output:
+ *
+ * grid_clusterID[]  -- cluster to which each dense grid was assigned
+ * return value      -- number of clusters assigned.
+ */
+int find_connected(int **G, int n_dense_grids, int num_dm, int *grid_clusterID)
+{
+  int nclust=0;                  /* number of clusters found */
+  int i;
+  int *subfac;
+  int subval=0,nempty=0;
+  int clustid=0;
+  
+  size_t sz = n_dense_grids*sizeof(int); 
+  cluster_count = malloc(sz);
+  if(!cluster_count)
+    bail("find_connected: Unable to allocate %zd bytes.\n");
+  memset(cluster_count,0,sz);
+
+  /* set up the statics that will be used in the DFS */
+  Gr=G;
+  gcID = grid_clusterID;
+  ndense = n_dense_grids;
+  ndim = num_dm;
+
+  
+  for(i=0;i<ndense;++i)
+    grid_clusterID[i] = -1;
+
+  for(i=0;i<ndense;++i) {
+    if(grid_clusterID[i] < 0) {  /* grid hasn't been assigned yet */
+      cid = nclust++;
+      depth_first(i);
+    }
+  }
+
+#ifndef NDEBUG
+  check_clusters(gcID,ndense);
+#endif 
+  
+  /* At this point we probably have some clusters that are empty due to merging.
+     We want to compact the cluster numbering to eliminate the empty clusters. */
+
+  subfac = malloc(sz);
+  if(!subfac)
+    bail("find_connected: Unable to allocate %zd bytes.\n");
+  subval=0;
+  nempty=0;
+
+  /* cluster #i needs to have its ID decremented by 1 for each empty cluster with
+     ID < i.  Precalculate the decrements in this loop: */
+  for(i=0;i<nclust;++i) {
+    //clustid = grid_clusterID[i];
+    if(cluster_count[i] == 0) {
+      subval++;
+      nempty++;
+    }
+    subfac[i] = subval;
+  }
+
+  /* Now apply the decrements to all of the dense grids */
+  for(i=0;i<ndense;++i) {
+   clustid = grid_clusterID[i];
+    grid_clusterID[i] -= subfac[clustid];
+  }
+
+#ifndef NDEBUG
+  //  check_clusters(gcID,ndense);
+#endif  
+  
+  /* correct the number of clusters found */
+  nclust -= nempty;
+
+  return nclust;
+}
+
+
+/* Do a depth-first search for a single connected component in graph
+ * G.  Start from node, tag the nodes found with cid, and record
+ * the tags in grid_clusterID.  Also, record the node count in
+ * cluster_count.  If we find a node that has already been assigned to
+ * a cluster, that means we're merging two clusters, so zero out the
+ * old cid's node count.
+ *
+ * Note that our graph is constructed as a DAG, so we can be
+ * guaranteed to terminate without checking for cycles.  
+ *
+ * Note2: this function can potentially recurse to depth = ndense.
+ * Plan stack size accordingly.  
+ *
+ * Output:
+ *
+ * grid_clusterID[] -- array where we tag the nodes.
+ * cluster_count[]  -- count of the number of nodes per cluster.
+ */
+   
+void depth_first(int node)
+{
+  int i;
+
+  if(gcID[node] == cid)         // we're guaranteed no cycles, but it is possible to reach a node 
+    return;                     // through two different paths in the same cluster.  This early
+                                // return saves us some unneeded work.
+
+  /* Check to see if we are merging a cluster */
+  if(gcID[node] >= 0) {
+    /* We are, so zero the count for the old cluster. */
+    cluster_count[ gcID[node] ] = 0;  
+    merge_cluster(gcID[node], cid);
+    return;
+  }
+
+  /* Update for this node */
+  gcID[node] = cid;
+  cluster_count[cid]++;
+
+  /* Recursively search the child nodes */
+  for(i=0; i<ndim; ++i)
+    if(Gr[node][i] >= 0)      /* This is a child node */
+      depth_first(Gr[node][i]);
+}
+
+void bail(const char *msg)
+{
+  fprintf(stderr,"%s",msg);
+  abort();
+}
+
+
+static void check_clusters(int *gcID, int ndense)
+{
+  int i;
+
+  for(i=0; i<ndense; ++i)
+    if(gcID[i] < 0) {
+      fprintf(stderr,"faulty cluster id at i= %d\n",i);
+      abort();
+    }
+}
+
+static void merge_cluster(int from, int into)
+{
+  int i;
+
+  for(i=0; i<ndense; ++i)
+    if(gcID[i] == from)
+      gcID[i] = into;
+}
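
The comments above describe the find_connected() contract; the following
stand-alone driver (not part of this commit; the graph and its values are made
up for illustration) exercises it on three dense grids in two dimensions,
where grid 0 neighbors grid 1 and grid 2 is isolated, so two clusters are
expected:

#include <stdio.h>
#include <stdlib.h>

int find_connected(int **G, int num_dense_grids, int ndim, int *grid_clusterID);

int main(void)
{
  int ndense = 3, ndim = 2, i, j, nclust;
  int **G = malloc(ndense * sizeof(int *));
  int *ids = malloc(ndense * sizeof(int));

  /* -1 marks "no neighbor in this dimension"; any value >= 0 is a child node. */
  for (i = 0; i < ndense; i++) {
    G[i] = malloc(ndim * sizeof(int));
    for (j = 0; j < ndim; j++)
      G[i][j] = -1;
  }
  G[0][0] = 1;   /* grid 0 neighbors grid 1 along dimension 0 */

  nclust = find_connected(G, ndense, ndim, ids);   /* expect 2 clusters */
  for (i = 0; i < ndense; i++)
    printf("grid %d -> cluster %d\n", i, ids[i]);

  for (i = 0; i < ndense; i++)
    free(G[i]);
  free(G);
  free(ids);
  return nclust == 2 ? 0 : 1;
}

Compile it together with the source above, e.g.: cc -o test_fc test_fc.c src/find_connected.c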
diff -r 8d951baf795f -r 7eab80f86779 src/flock1.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/flock1.c Mon Feb 27 13:26:09 2017 -0500
@@ -0,0 +1,2400 @@
+/*****************************************************************************
+
+    FLOCK: FLOw cytometry Clustering without K (Named by: Jamie A. Lee and Richard H. Scheuermann)
+
+    Author: (Max) Yu Qian, Ph.D.
+
+    Copyright: Scheuermann Lab, Dept. of Pathology, UTSW
+
+    Development: November 2005 ~ Forever
+
+    Algorithm Status: May 2007: Release 1.0
+
+    Usage: flock data_file
+           Note: the input file format must be channel values and the delimiter between two values must be a tab.
+
+    Changes made July 23, 2010: made errors to STDERR
+    Changes made Nov 4, 2010: added one more error (select_num_bin<min_grid) || (select_num_bin>max_grid) to STDERR;
+                              MAX_GRID changed to 50 as larger than 50 seems not useful for any file we have got
+
+******************************************************************************/
[... remainder of the src/flock1.c diff is truncated in this view. flock1 reads
a tab-delimited event file, bins and clusters the events, and writes
population_id.txt, population_center.txt, coordinates.txt, flock_results.txt,
MFI.txt and fcs.properties.]
diff -r 8d951baf795f -r 7eab80f86779 src/flock2.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/flock2.c Mon Feb 27 13:26:09 2017 -0500
b"@@ -0,0 +1,3405 @@\n+///////////\n+// Changes made:\n+// 1. Added another parameter: number of populations\n+// 2. Added a hierarchical merging step based on density change between centroids of two hyper-regions\n+// 3. Picked the longest dimensions between the population centroids to judge whether the two parts should be merged\n+// 4. Removed checking time parameter\n+// 5. Output error to stderr\n+// 6. Fixed the bug of density threshold always = 3\n+// 7. Added another error (select_num_bin<min_grid) || (select_num_bin>max_grid) to STDERR\n+// 8. Fixed a bug for 2D data by using K=e*K\n+// 9. Added some header files, may not be necessary\n+// 10. Added a lower bound (at least two) for number of populations\n+/***************************************************************************************************************************************\n+\t\n+\tFLOCK: FLOw cytometry Clustering without K (Named by: Jamie A. Lee and Richard H. Scheuermann) \n+\t\n+\tAuthor: (Max) Yu Qian, Ph.D.\n+\t\n+\tCopyright: Scheuermann Lab, Dept. of Pathology, UTSW\n+\t\n+\tDevelopment: November 2005 ~ Forever\n+\n+\tStatus: July 2010: Release 2.0\n+\n+\tUsage: flock data_file\n+\t\t    Note: the input file format must be channel values and the delimiter between two values must be a tab.\n+\n+\n+    \t\n+****************************************************************************************************************************************/\n+\n+#include <time.h>\n+#include <stdio.h>\n+#include <stdlib.h>\n+#include <math.h>\n+#include <string.h>\n+#include <sys/stat.h>\n+#include <unistd.h>\n+#include <assert.h>\n+\n+\n+\n+#define DEBUG 0\n+#define LINE_LEN 1024\n+#define FILE_NAME_LEN 128\n+#define PARA_NAME_LEN 64\n+#define MAX_VALUE 1000000000\n+#define MIN_GRID 6\n+#define MAX_GRID 50\n+#define E_T 1.0\n+\n+#define NORM_METHOD 2 //2 if z-score; 0 if no normalization; 1 if min-max \n+#define KMEANS_TERM 10 \n+#define MAX_POP_NUM 128\n+\n+#ifndef max\n+    #define max( a, b ) ( ((a) > (b)) ? (a) : (b) )\n+#endif\n+\n+#ifndef min\n+    #define min( a, b ) ( ((a) < (b)) ? (a) : (b) )\n+#endif\n+\n+static long **Gr=0;\n+static long *gcID = 0;          /* grid cluster IDs */\n+static long *cluster_count=0;   /* count of nodes per cluster */\n+static long ndense=0;\n+static long ndim=0;\n+/* cid changes between depth-first searches, but is constant within a\n+   single search, so it goes here. */\n+static long cid=0;\n+/* Do a depth-first search for a single connected component in graph\n+ * G.  Start from node, tag the nodes found with cid, and record\n+ * the tags in grid_clusterID.  Also, record the node count in\n+ * cluster_count.  If we find a node that has already been assigned to\n+ * a cluster, that means we're merging two clusters, so zero out the\n+ * old cid's node count.\n+ *\n+ * Note that our graph is constructed as a DAG, so we can be\n+ * guaranteed to terminate without checking for cycles.  \n+ *\n+ * Note2: this function can potentially recurse to depth = ndense.\n+ * Plan stack size accordingly.  
\n+ *\n+ * Output:\n+ *\n+ * grid_clusterID[] -- array where we tag the nodes.\n+ * cluster_count[]  -- count of the number of nodes per cluster.\n+ */\n+static void merge_cluster(long from, long into)\n+{\n+  int i;\n+\n+  for(i=0; i<ndense; ++i)\n+    if(gcID[i] == from)\n+      gcID[i] = into;\n+}\n+   \n+void depth_first(long node)\n+{\n+  long i;\n+\n+  if(gcID[node] == cid)         // we're guaranteed no cycles, but it is possible to reach a node \n+    return;                     // through two different paths in the same cluster.  This early\n+                                // return saves us some unneeded work.\n+\n+  /* Check to see if we are merging a cluster */\n+  if(gcID[node] >= 0) {\n+    /* We are, so zero the count for the old cluster. */\n+    cluster_count[ gcID[node] ] = 0;  \n+    merge_cluster(gcID[node], cid);\n+    return;\n+  }\n+\n+  /* Update for this node */\n+  gcID[node] = cid;\n+  cluster_count[cid]++;\n+\n+  /* Recursively search the child nodes */\n+  for(i=0; i<ndim; ++i)\n+    if(Gr[node][i] >= 0)      /* This is a child node */\n"..b'ring);\n+\n+  ID2Center_all(input_data,file_Len,num_dm,num_real_pop,all_population_ID,new_population_center);\n+  \n+\n+  f_cid=fopen("population_id.txt","w");\n+  f_ctr=fopen("population_center.txt","w");\n+  f_out=fopen("coordinates.txt","w");\n+  f_results=fopen("flock_results.txt","w");\n+\n+/*\n+  f_parameters=fopen("parameters.txt","w");\n+  fprintf(f_parameters,"Number_of_Bins\\t%d\\n",num_bin);\n+  fprintf(f_parameters,"Density\\t%f\\n",aver_index);\n+  fclose(f_parameters);\n+*/\n+\n+  for (i=0;i<file_Len;i++)\n+\tfprintf(f_cid,"%ld\\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009\n+\n+  /*\n+   * New to check for min/max to add to parameters.txt\n+   *\n+  */\n+  \n+  fprintf(f_out,"%s\\n",para_name_string);\n+  //fprintf(f_results,"%s\\tEvent\\tPopulation\\n",para_name_string);\n+  fprintf(f_results,"%s\\tPopulation\\n",para_name_string);\n+  for (i=0;i<file_Len;i++)\n+  {\n+\tfor (j=0;j<num_dm;j++)\n+\t{\n+\t\tif (input_data[i][j] < min) {\n+\t\t\tmin = (int)input_data[i][j];\n+\t\t}\n+\t\tif (input_data[i][j] > max) {\n+\t\t\tmax = (int)input_data[i][j];\n+\t\t}\n+\t\tif (j==num_dm-1)\n+\t\t{\n+\t\t\tfprintf(f_out,"%d\\n",(int)input_data[i][j]);\n+\t\t\tfprintf(f_results,"%d\\t",(int)input_data[i][j]);\n+\t\t}\n+\t\telse\n+\t\t{\n+\t\t\tfprintf(f_out,"%d\\t",(int)input_data[i][j]);\n+\t\t\tfprintf(f_results,"%d\\t",(int)input_data[i][j]);\n+\t\t}\n+\t}\n+\t//fprintf(f_results,"%ld\\t",i + 1);\n+\tfprintf(f_results,"%ld\\n",all_population_ID[i]+1); //all_population_ID[i] changed to all_population_ID[i]+1 to start from 1 instead of 0: April 16, 2009\n+  }\n+\n+/*\n+  f_parameters=fopen("parameters.txt","w");\n+  fprintf(f_parameters,"Number_of_Bins\\t%ld\\n",num_bin);\n+  fprintf(f_parameters,"Density\\t%d\\n",den_t_event);\n+  fprintf(f_parameters,"Min\\t%d\\n",min);\n+  fprintf(f_parameters,"Max\\t%d\\n",max);\n+  fclose(f_parameters);\n+*/\n+\n+  f_properties=fopen("fcs.properties","w");\n+  fprintf(f_properties,"Bins=%ld\\n",num_bin);\n+  fprintf(f_properties,"Density=%d\\n",den_t_event);\n+  fprintf(f_properties,"Min=%d\\n",min);\n+  fprintf(f_properties,"Max=%d\\n",max);\n+  fprintf(f_properties,"Populations=%ld\\n",num_real_pop);\n+  fprintf(f_properties,"Events=%ld\\n",file_Len);\n+  fprintf(f_properties,"Markers=%ld\\n",num_dm);\n+  fclose(f_properties);\n+\n+  for (i=0;i<num_real_pop;i++) {\n+\t/* Add if we want to 
include population id in the output\n+\t*/\n+\tfprintf(f_ctr,"%ld\\t",i+1);  //i changed to i+1 to start from 1 instead of 0: April 16, 2009\n+\n+\tfor (j=0;j<num_dm;j++) {\n+\t\tif (j==num_dm-1)\n+\t\t\tfprintf(f_ctr,"%.0f\\n",new_population_center[i][j]);\n+\t\telse\n+\t\t\tfprintf(f_ctr,"%.0f\\t",new_population_center[i][j]);\n+\t}\n+  }\n+\n+  \t//added April 16, 2009\n+\tf_mfi=fopen("MFI.txt","w");\n+\n+\tfor (i=0;i<num_real_pop;i++)\n+\t{\n+\t\tfprintf(f_mfi,"%ld\\t",i+1);\n+\n+\t\tfor (j=0;j<num_dm;j++)\n+\t\t{\n+\t\t\tif (j==num_dm-1)\n+\t\t\t\tfprintf(f_mfi,"%.0f\\n",new_population_center[i][j]);\n+\t\t\telse\n+\t\t\t\tfprintf(f_mfi,"%.0f\\t",new_population_center[i][j]);\n+\t\t}\n+\t}\n+\tfclose(f_mfi);\n+\n+\t//ended April 16, 2009\n+\t\t\t\n+  fclose(f_cid);\n+  fclose(f_ctr);\n+  fclose(f_out);\n+  fclose(f_results);\n+\n+\n+  for (i=0;i<num_population;i++)\n+  \tfree(population_center[i]);\n+  \n+  free(population_center);\n+ \n+  for (i=0;i<num_real_pop;i++)\n+\t  free(new_population_center[i]);\n+  \n+  free(new_population_center);\n+\n+  for (i=0;i<file_Len;i++)\n+    free(normalized_data[i]);\n+  free(normalized_data);\t\n+\t\n+  free(grid_populationID);\n+\n+  free(cluster_populationID);\n+  free(grid_clusterID);\n+  free(cluster_ID);\n+\n+  for (i=0;i<file_Len;i++)\n+    free(input_data[i]);\n+  free(input_data);\n+\n+  free(grid_ID);\n+  free(population_ID);\n+  free(all_population_ID);\n+  free(eventID_To_denseventID);\n+\t\t\n+  ///////////////////////////////////////////////////////////\n+  printf("Ending time:\\t\\t\\t\\t");\n+  fflush(stdout);\n+  system("/bin/date");\n+\n+  /*\n+   * Windows version\n+  _strtime( tmpbuf );\n+  printf( "Ending time:\\t\\t\\t\\t%s\\n", tmpbuf );\n+  _strdate( tmpbuf );\n+  printf( "Ending date:\\t\\t\\t\\t%s\\n", tmpbuf );\n+ */\n+  \n+  return 0;\n+}\n'