comparison cross_sample/src/cent_adjust.c @ 3:5f670146a9af draft

Uploaded
author immport-devteam
date Mon, 27 Feb 2017 13:29:54 -0500
parents
children
comparison
equal deleted inserted replaced
2:311a4f16c483 3:5f670146a9af
1 /////////////////////////////////////////////////////////
2 // Cent_adjust version number and modification history
3 // ImmPort BISC project
4 // Author: Yu "Max" Qian
5 // v1.01: Oct 16, 2009
6 // Line 899 of the main function:
7 // Changed kmean_term=1 to kmean_term=2
8 //////////////////////////////////////////////////////////
9
10
11 #include <time.h>
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <math.h>
15 #include <string.h>
16
17 #define DEBUG 0
18 #define LINE_LEN 1024
19 #define FILE_NAME_LEN 128
20 #define PARA_NAME_LEN 64
21 #define MAX_VALUE 1000000000
22 #define CUBE 0
23
24
25 void getctrfileinfo(FILE *f_src_ctr, long *num_clust)
26 {
27 int ch='\n';
28 int prev='\n';
29 long num_rows=0;
30
31 while ((ch = fgetc(f_src_ctr))!= EOF )
32 {
33 if (ch == '\n')
34 {
35 ++num_rows;
36 }
37 prev = ch;
38 }
39 if (prev!='\n')
40 ++num_rows;
41
42 *num_clust=num_rows;
43 //printf("center file has %ld rows\n", *num_clust);
44 }
45
46 /************************************* Read basic info of the source file **************************************/
47 void getfileinfo(FILE *f_src, long *file_Len, long *num_dm, char *name_string, int *time_ID)
48 {
49 char src[LINE_LEN];
50 char current_name[64];
51 char prv;
52
53 long num_rows=0;
54 long num_columns=0;
55 int ch='\n';
56 int prev='\n';
57 long time_pos=0;
58 long i=0;
59 long j=0;
60
61
62
63 src[0]='\0';
64 fgets(src, LINE_LEN, f_src);
65
66 name_string[0]='\0';
67 current_name[0]='\0';
68 prv='\n';
69
70 while ((src[i]==' ') || (src[i]=='\t')) //skip space and tab characters
71 i++;
72
73 while ((src[i]!='\r') && (src[i]!='\n')) //repeat until the end of the line
74 {
75 current_name[j]=src[i];
76
77 if ((src[i]=='\t') && (prv!='\t')) //a complete word
78 {
79 current_name[j]='\0';
80
81 /*
82 * Commented out John Campbell, June 10 2010
83 * We no longer want to automatically remove Time column.
84 * This column should have been removed by column selection
85 if (0!=strcmp(current_name,"Time"))
86 {
87 num_columns++; //num_columns does not inlcude the column of Time
88 time_pos++;
89 strcat(name_string,current_name);
90 strcat(name_string,"\t");
91 }
92 else
93 {
94 *time_ID=time_pos;
95 }
96 */
97
98 num_columns++;
99 strcat(name_string,current_name);
100 strcat(name_string,"\t");
101
102 current_name[0]='\0';
103 j=0;
104 }
105
106 if ((src[i]=='\t') && (prv=='\t')) //a duplicate tab or space
107 {
108 current_name[0]='\0';
109 j=0;
110 }
111
112 if (src[i]!='\t')
113 j++;
114
115
116 prv=src[i];
117 i++;
118 }
119
120 if (prv!='\t') //the last one hasn't been retrieved
121 {
122 current_name[j]='\0';
123 /*
124 * Commented out John Campbell, June 10 2010
125 * We no longer want to automatically remove Time column.
126 * This column should have been removed by column selection
127 if (0!=strcmp(current_name,"Time"))
128 {
129 num_columns++;
130 strcat(name_string,current_name);
131 time_pos++;
132 }
133 else
134 {
135 *time_ID=time_pos;
136 }
137 */
138
139 num_columns++;
140 strcat(name_string,current_name);
141 }
142
143 if (DEBUG==1)
144 {
145 printf("time_ID is %d\n",*time_ID);
146 printf("name_string is %s\n",name_string);
147 }
148
149 // # of rows
150
151 while ((ch = fgetc(f_src))!= EOF )
152 {
153 if (ch == '\n')
154 {
155 ++num_rows;
156 }
157 prev = ch;
158 }
159 if (prev!='\n')
160 ++num_rows;
161
162 *file_Len=num_rows;
163 *num_dm=num_columns;
164
165 //printf("original file size is %ld; number of dimensions is %ld\n", *file_Len, *num_dm);
166 }
167
168
169
170 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
171 /************************************* Read the source file into uncomp_data **************************************/
172 void readsource(FILE *f_src, long file_Len, long num_dm, double **uncomp_data, int time_ID)
173 {
174 long time_pass=0; //to mark whether the time_ID has been passed
175 long index=0;
176
177 long i=0;
178 long j=0;
179 long t=0;
180
181 char src[LINE_LEN];
182 char xc[LINE_LEN/10];
183
184 src[0]='\0';
185 fgets(src,LINE_LEN, f_src); //skip the first line about parameter names
186
187 while (!feof(f_src) && (index<file_Len)) //index = 0, 1, ..., file_Len-1
188 {
189 src[0]='\0';
190 fgets(src,LINE_LEN,f_src);
191 i=0;
192 time_pass=0;
193
194 if (time_ID==-1)
195 {
196 for (t=0;t<num_dm;t++) //there is no time_ID
197 {
198 xc[0]='\0';
199 j=0;
200 while ((src[i]!='\r') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t'))
201 {
202 xc[j]=src[i];
203 i++;
204 j++;
205 }
206
207 xc[j]='\0';
208 i++;
209
210 uncomp_data[index][t]=atof(xc);
211 }
212 }
213 else
214 {
215 for (t=0;t<=num_dm;t++) //the time column needs to be skipped, so there are num_dm+1 columns
216 {
217 xc[0]='\0';
218 j=0;
219 while ((src[i]!='\r') && (src[i]!='\n') && (src[i]!=' ') && (src[i]!='\t'))
220 {
221 xc[j]=src[i];
222 i++;
223 j++;
224 }
225
226 xc[j]='\0';
227 i++;
228
229 if (t==time_ID)
230 {
231 time_pass=1;
232 continue;
233 }
234
235 if (time_pass)
236 uncomp_data[index][t-1]=atof(xc);
237 else
238 uncomp_data[index][t]=atof(xc);
239 }
240 }
241 index++;
242 //fprintf(fout_ID,"%s",src);
243 } //end of while
244
245 if (DEBUG == 1)
246 {
247 printf("the last line of the source data is:\n");
248 for (j=0;j<num_dm;j++)
249 printf("%f ",uncomp_data[index-1][j]);
250 printf("\n");
251 }
252 }
253
254 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
255 void readcenter(FILE *f_src_ctr, long num_clust, long num_dm, double **cluster_center, long *IDmapping)
256 {
257 char src[LINE_LEN];
258 char xc[LINE_LEN/10];
259
260 long i=0;
261 long j=0;
262 int m=0;
263 int t=0;
264
265 for (i=0;i<num_clust;i++)
266 {
267 src[0]='\0';
268 fgets(src,LINE_LEN, f_src_ctr);
269 m=0;
270 for (j=0;j<num_dm+1;j++)
271 {
272 xc[0]='\0';
273 t=0;
274 while ((src[m]!='\r') && (src[m]!='\n') && (src[m]!=' ') && (src[m]!='\t'))
275 {
276 xc[t]=src[m];
277 m++;
278 t++;
279 }
280 xc[t]='\0';
281 m++;
282 if (j==0)
283 IDmapping[i]=atoi(xc);
284 else
285 cluster_center[i][j-1]=atof(xc);
286 //printf("cluster_center[%d][%d]=%f\n",i,j,cluster_center[i][j]);
287 }
288 }
289 }
290
291
292 /**************************************** Normalization ******************************************/
293 void tran(double **orig_data, long clean_Len, long num_dm, long norm_used, double **matrix_to_cluster)
294 {
295 long i=0;
296 long j=0;
297
298 double biggest=0;
299 double smallest=MAX_VALUE;
300
301 double *aver; //average of each column
302 double *std; //standard deviation of each column
303
304 aver=(double*)malloc(sizeof(double)*clean_Len);
305 memset(aver,0,sizeof(double)*clean_Len);
306
307 std=(double*)malloc(sizeof(double)*clean_Len);
308 memset(std,0,sizeof(double)*clean_Len);
309
310 if (norm_used==2) //z-score normalization
311 {
312 for (j=0;j<num_dm;j++)
313 {
314 aver[j]=0;
315 for (i=0;i<clean_Len;i++)
316 aver[j]=aver[j]+orig_data[i][j];
317 aver[j]=aver[j]/(double)clean_Len;
318
319 std[j]=0;
320 for (i=0;i<clean_Len;i++)
321 std[j]=std[j]+(orig_data[i][j]-aver[j])*(orig_data[i][j]-aver[j]);
322 std[j]=sqrt(std[j]/(double)clean_Len);
323
324 for (i=0;i<clean_Len;i++)
325 matrix_to_cluster[i][j]=(orig_data[i][j]-aver[j])/std[j]; //z-score normalization
326 }
327 }
328
329 if (norm_used==1) //0-1 min-max normalization
330 {
331 for (j=0;j<num_dm;j++)
332 {
333 biggest=0;
334 smallest=MAX_VALUE;
335 for (i=0;i<clean_Len;i++)
336 {
337 if (orig_data[i][j]>biggest)
338 biggest=orig_data[i][j];
339 if (orig_data[i][j]<smallest)
340 smallest=orig_data[i][j];
341 }
342
343 for (i=0;i<clean_Len;i++)
344 {
345 if (biggest==smallest)
346 matrix_to_cluster[i][j]=biggest;
347 else
348 matrix_to_cluster[i][j]=(orig_data[i][j]-smallest)/(biggest-smallest);
349 }
350 }
351 }
352
353 if (norm_used==0) //no normalization
354 {
355 for (i=0;i<clean_Len;i++)
356 for (j=0;j<num_dm;j++)
357 matrix_to_cluster[i][j]=orig_data[i][j];
358 }
359
360
361 }
362
363 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
364 void assign_event(double **Matrix, long k, long dist_used, double kmean_term, long file_Len, long num_dm, long *shortest_id, double **center, int random_init)
365 {
366
367 long i=0;
368
369 long j=0;
370 long t=0;
371 long random=0;
372 long random1=0;
373 long random2=0;
374 long times=0;
375 long times_allowed=0;
376
377 long *num; //num[i]=t means the ith cluster has t points
378
379 double vvv=1.0; // the biggest variation;
380 double distance=0.0;
381 double xv=0.0;
382 double variation=0.0;
383 double EPS=0.0;
384 double diff=0.0;
385 double mean_dx=0;
386 double mean_dy=0;
387 double sum_var=0;
388 double dx=0;
389 double dy=0;
390 double sd_x=0;
391 double sd_y=0;
392
393 double *temp_center;
394 double *shortest_distance;
395
396 double **sum;
397
398 temp_center = (double *)malloc(sizeof(double)*num_dm);
399 memset(temp_center,0,sizeof(double)*num_dm);
400
401 /* Choosing Centers */
402 if (random_init)
403 {
404 for (i=0;i<k;i++)
405 {
406 random1=rand()*rand();
407 //srand( (unsigned)time( NULL ) );
408 random2=abs((random1%5)+1);
409 for (t=0;t<random2;t++)
410 random2=random2*rand()+rand();
411
412 random=abs(random2%file_Len);
413 //printf("random=%d\n",random);
414 for (j=0;j<num_dm;j++)
415 center[i][j]=Matrix[random][j];
416
417 }
418 }
419
420 //printf("finish random selection\n");
421 /* To compute the nearest center for every point */
422
423 shortest_distance = (double *)malloc(sizeof(double)*file_Len);
424 memset(shortest_distance,0,sizeof(double)*file_Len);
425
426 num = (long *)malloc(sizeof(long)*k);
427 memset(num,0,sizeof(long)*k);
428
429 sum = (double **)malloc(sizeof(double*)*k);
430 memset(sum,0,sizeof(double*)*k);
431 for (i=0;i<k;i++)
432 {
433 sum[i] = (double *)malloc(sizeof(double)*num_dm);
434 memset(sum[i],0,sizeof(double)*num_dm);
435 }
436
437 for (i=0;i<k;i++)
438 for (j=0;j<num_dm;j++)
439 sum[i][j]=0.0; //sum[i][j] = k means the sum of the jth dimension of all points in the ith group is k
440
441 //printf("before recursion\n");
442 if (kmean_term>=1)
443 times_allowed = (long)kmean_term;
444 else
445 EPS = kmean_term;
446
447 times=0;
448
449 while (((vvv>EPS) && (kmean_term<1)) || ((times<times_allowed) && (kmean_term>=1)))
450 {
451 for (i=0;i<k;i++)
452 {
453 num[i]=0;
454 for (j=0;j<num_dm;j++)
455 sum[i][j]=0.0;
456 }
457
458 for (i=0;i<file_Len;i++) //for each data point i, we compute the distance between Matrix[i] and center[j]
459 {
460 shortest_distance[i]=MAX_VALUE;
461 for (j=0;j<k;j++) //for each center j
462 {
463
464 distance=0.0;
465
466 if (dist_used==0) //Euclidean distance
467 {
468 for (t=0;t<num_dm;t++) //for each dimension
469 {
470 diff=center[j][t]-Matrix[i][t];
471 if (diff<0)
472 diff=-diff;
473
474 if (CUBE)
475 distance = distance+(diff*diff*diff);
476 else
477 distance = distance+(diff*diff);
478 }
479 }
480 else //pearson correlation
481 {
482 mean_dx=0.0;
483 mean_dy=0.0;
484 sum_var=0.0;
485 dx=0.0;
486 dy=0.0;
487 sd_x=0.0;
488 sd_y=0.0;
489 for (t=0;t<num_dm;t++)
490 {
491 mean_dx+=center[j][t];
492 mean_dy+=Matrix[i][t];
493 }
494 mean_dx=mean_dx/(double)num_dm;
495 mean_dy=mean_dy/(double)num_dm;
496 //printf("mean_dx=%f\n",mean_dx);
497
498 for (t=0;t<num_dm;t++)
499 {
500 dx=center[j][t]-mean_dx;
501 dy=Matrix[i][t]-mean_dy;
502 sum_var+=dx*dy;
503 sd_x+=dx*dx;
504 sd_y+=dy*dy;
505 }
506 if (sqrt(sd_x*sd_y)==0)
507 distance = 1.0;
508 else
509 distance = 1.0 - (sum_var/(sqrt(sd_x*sd_y))); // distance ranges from 0 to 2;
510 //printf("distance=%f\n",distance);
511 } //pearson correlation ends
512
513
514 if (distance<shortest_distance[i])
515 {
516 shortest_distance[i]=distance;
517 shortest_id[i]=j;
518 }
519 }//end for j
520 num[shortest_id[i]]=num[shortest_id[i]]+1;
521 for (t=0;t<num_dm;t++)
522 sum[shortest_id[i]][t]=sum[shortest_id[i]][t]+Matrix[i][t];
523 }//end for i
524 /* recompute the centers */
525 //compute_mean(group);
526 vvv=0.0;
527 for (j=0;j<k;j++)
528 {
529 memcpy(temp_center,center[j],sizeof(double)*num_dm);
530 variation=0.0;
531 if (num[j]!=0)
532 {
533 for (t=0;t<num_dm;t++)
534 {
535 center[j][t]=sum[j][t]/(double)num[j];
536 xv=(temp_center[t]-center[j][t]);
537 variation=variation+xv*xv;
538 }
539 }
540
541 if (variation>vvv)
542 vvv=variation; //vvv is the biggest variation among the k clusters;
543 }
544 //compute_variation;
545 times++;
546 } //end for while
547
548
549
550 free(num);
551 for (i=0;i<k;i++)
552 free(sum[i]);
553 free(sum);
554 free(temp_center);
555 free(shortest_distance);
556
557 }
558
559 //////////////////////////////////////////////////////
560 /*************************** Show *****************************/
561 void show(double **Matrix, long *cluster_id, long file_Len, long k, long num_dm, char *name_string, long *IDmapping)
562 {
563 int situ1=0;
564 int situ2=0;
565
566 long i=0;
567 long id=0;
568 long j=0;
569 long info_id=0;
570 long nearest_id=0;
571 long insert=0;
572 long temp=0;
573 long m=0;
574 long n=0;
575 long t=0;
576
577 long *size_c;
578
579
580
581 long **size_mybound_1;
582 long **size_mybound_2;
583 long **size_mybound_3;
584 long **size_mybound_0;
585
586 double interval=0.0;
587
588 double *big;
589 double *small;
590
591
592 double **center;
593 double **mybound;
594
595 long **prof; //prof[i][j]=1 means population i is + at parameter j
596
597 FILE *fpcnt_id; //proportion id
598 //FILE *fcent_id; //center_id, i.e., centers of clusters within the original data
599 FILE *fprof_id; //profile_id
600
601 big=(double *)malloc(sizeof(double)*num_dm);
602 memset(big,0,sizeof(double)*num_dm);
603
604 small=(double *)malloc(sizeof(double)*num_dm);
605 memset(small,0,sizeof(double)*num_dm);
606
607 for (i=0;i<num_dm;i++)
608 {
609 big[i]=0.0;
610 small[i]=(double)MAX_VALUE;
611 }
612
613
614 size_c=(long *)malloc(sizeof(long)*k);
615 memset(size_c,0,sizeof(long)*k);
616
617 center=(double**)malloc(sizeof(double*)*k);
618 memset(center,0,sizeof(double*)*k);
619 for (i=0;i<k;i++)
620 {
621 center[i]=(double*)malloc(sizeof(double)*num_dm);
622 memset(center[i],0,sizeof(double)*num_dm);
623 }
624
625 mybound=(double**)malloc(sizeof(double*)*num_dm);
626 memset(mybound,0,sizeof(double*)*num_dm);
627 for (i=0;i<num_dm;i++) //there are 3 mybounds for 4 categories
628 {
629 mybound[i]=(double*)malloc(sizeof(double)*3);
630 memset(mybound[i],0,sizeof(double)*3);
631 }
632
633 prof=(long **)malloc(sizeof(long*)*k);
634 memset(prof,0,sizeof(long*)*k);
635 for (i=0;i<k;i++)
636 {
637 prof[i]=(long *)malloc(sizeof(long)*num_dm);
638 memset(prof[i],0,sizeof(long)*num_dm);
639 }
640
641
642 for (i=0;i<file_Len;i++)
643 {
644 id=cluster_id[i];
645 for (j=0;j<num_dm;j++)
646 {
647 center[id][j]=center[id][j]+Matrix[i][j];
648 if (big[j]<Matrix[i][j])
649 big[j]=Matrix[i][j];
650 if (small[j]>Matrix[i][j])
651 small[j]=Matrix[i][j];
652 }
653
654 size_c[id]++;
655 }
656
657 for (i=0;i<k;i++)
658 for (j=0;j<num_dm;j++)
659 {
660 if (size_c[i]!=0)
661 center[i][j]=(center[i][j]/(double)(size_c[i]));
662 else
663 center[i][j]=0;
664 }
665
666 for (j=0;j<num_dm;j++)
667 {
668 interval=((big[j]-small[j])/4.0);
669 //printf("interval[%d] is %f\n",j,interval);
670 for (i=0;i<3;i++)
671 mybound[j][i]=small[j]+((double)(i+1)*interval);
672 }
673
674
675 size_mybound_0=(long **)malloc(sizeof(long*)*k);
676 memset(size_mybound_0,0,sizeof(long*)*k);
677
678 for (i=0;i<k;i++)
679 {
680 size_mybound_0[i]=(long*)malloc(sizeof(long)*num_dm);
681 memset(size_mybound_0[i],0,sizeof(long)*num_dm);
682 }
683
684 size_mybound_1=(long **)malloc(sizeof(long*)*k);
685 memset(size_mybound_1,0,sizeof(long*)*k);
686
687 for (i=0;i<k;i++)
688 {
689 size_mybound_1[i]=(long*)malloc(sizeof(long)*num_dm);
690 memset(size_mybound_1[i],0,sizeof(long)*num_dm);
691 }
692
693 size_mybound_2=(long **)malloc(sizeof(long*)*k);
694 memset(size_mybound_2,0,sizeof(long*)*k);
695
696 for (i=0;i<k;i++)
697 {
698 size_mybound_2[i]=(long*)malloc(sizeof(long)*num_dm);
699 memset(size_mybound_2[i],0,sizeof(long)*num_dm);
700 }
701
702 size_mybound_3=(long **)malloc(sizeof(long*)*k);
703 memset(size_mybound_3,0,sizeof(long*)*k);
704
705 for (i=0;i<k;i++)
706 {
707 size_mybound_3[i]=(long*)malloc(sizeof(long)*num_dm);
708 memset(size_mybound_3[i],0,sizeof(long)*num_dm);
709 }
710
711 for (i=0;i<file_Len;i++)
712 for (j=0;j<num_dm;j++)
713 {
714 if (Matrix[i][j]<mybound[j][0])// && ((Matrix[i][j]-small[j])>0)) //the smallest values excluded
715 size_mybound_0[cluster_id[i]][j]++;
716 else
717 {
718 if (Matrix[i][j]<mybound[j][1])
719 size_mybound_1[cluster_id[i]][j]++;
720 else
721 {
722 if (Matrix[i][j]<mybound[j][2])
723 size_mybound_2[cluster_id[i]][j]++;
724 else
725 //if (Matrix[i][j]!=big[j]) //the biggest values excluded
726 size_mybound_3[cluster_id[i]][j]++;
727 }
728
729 }
730 }
731
732 fprof_id=fopen("profile.txt","w");
733 fprintf(fprof_id,"Population_ID\t");
734 fprintf(fprof_id,"%s\n",name_string);
735
736 for (i=0;i<k;i++)
737 {
738 fprintf(fprof_id,"%ld\t",IDmapping[i]);
739 for (j=0;j<num_dm;j++)
740 {
741
742 if (size_mybound_0[i][j]>size_mybound_1[i][j])
743 situ1=0;
744 else
745 situ1=1;
746 if (size_mybound_2[i][j]>size_mybound_3[i][j])
747 situ2=2;
748 else
749 situ2=3;
750
751 if ((situ1==0) && (situ2==2))
752 {
753 if (size_mybound_0[i][j]>size_mybound_2[i][j])
754 prof[i][j]=0;
755 else
756 prof[i][j]=2;
757 }
758 if ((situ1==0) && (situ2==3))
759 {
760 if (size_mybound_0[i][j]>size_mybound_3[i][j])
761 prof[i][j]=0;
762 else
763 prof[i][j]=3;
764 }
765 if ((situ1==1) && (situ2==2))
766 {
767 if (size_mybound_1[i][j]>size_mybound_2[i][j])
768 prof[i][j]=1;
769 else
770 prof[i][j]=2;
771 }
772 if ((situ1==1) && (situ2==3))
773 {
774 if (size_mybound_1[i][j]>size_mybound_3[i][j])
775 prof[i][j]=1;
776 else
777 prof[i][j]=3;
778 }
779
780 //begin to output profile
781 if (j==num_dm-1)
782 {
783 if (prof[i][j]==0)
784 fprintf(fprof_id,"1\n");
785 if (prof[i][j]==1)
786 fprintf(fprof_id,"2\n");
787 if (prof[i][j]==2)
788 fprintf(fprof_id,"3\n");
789 if (prof[i][j]==3)
790 fprintf(fprof_id,"4\n");
791 }
792 else
793 {
794 if (prof[i][j]==0)
795 fprintf(fprof_id,"1\t");
796 if (prof[i][j]==1)
797 fprintf(fprof_id,"2\t");
798 if (prof[i][j]==2)
799 fprintf(fprof_id,"3\t");
800 if (prof[i][j]==3)
801 fprintf(fprof_id,"4\t");
802 }
803 }
804 }
805 fclose(fprof_id);
806
807 ///////////////////////////////////////////////////////////
808
809
810 fpcnt_id=fopen("percentage.txt","w");
811 fprintf(fpcnt_id,"Population_ID\tPercentage\n");
812
813 for (t=0;t<k;t++)
814 {
815 fprintf(fpcnt_id,"%ld\t%.2f\n",IDmapping[t],(double)size_c[t]*100.0/(double)file_Len);
816 }
817 fclose(fpcnt_id);
818
819 free(big);
820 free(small);
821 free(size_c);
822
823 for (i=0;i<k;i++)
824 {
825 free(center[i]);
826 free(prof[i]);
827 free(size_mybound_0[i]);
828 free(size_mybound_1[i]);
829 free(size_mybound_2[i]);
830 free(size_mybound_3[i]);
831 }
832 free(center);
833 free(prof);
834 free(size_mybound_0);
835 free(size_mybound_1);
836 free(size_mybound_2);
837 free(size_mybound_3);
838
839 for (i=0;i<num_dm;i++)
840 free(mybound[i]);
841 free(mybound);
842
843 }
844 /******************************************************** Main Function **************************************************/
845 int main (int argc, char **argv)
846 {
847 //inputs
848 FILE *f_src; //source file pointer
849 FILE *f_src_ctr; //source center file
850 //outputs
851 FILE *f_cid; //cluster-id file pointer
852 FILE *f_mfi; //added April 16, 2009
853
854
855 char name_string[LINE_LEN]; //for name use
856
857 int time_id=-1;
858
859 long file_Len=0;
860 long num_clust=0;
861 long num_dm=0;
862 long norm_used=0;
863 long dist_used=0;
864 long i=0;
865 long j=0;
866
867 long *cluster_id;
868 long *IDmapping; //this is to keep the original populationID of the center.txt
869
870 double kmean_term=0;
871
872 double **cluster_center;
873 double **orig_data;
874 double **normalized_data;
875
876 /*
877 _strtime( tmpbuf );
878 printf( "Starting time:\t\t\t\t%s\n", tmpbuf );
879 _strdate( tmpbuf );
880 printf( "Starting date:\t\t\t\t%s\n", tmpbuf );
881 */
882
883 if (argc!=3)
884 {
885 printf("usage: cent_adjust input_center input_data_file\n");
886 exit(0);
887 }
888
889
890 f_src_ctr=fopen(argv[1],"r");
891
892 //read source data
893 f_src=fopen(argv[2],"r");
894
895 getfileinfo(f_src, &file_Len, &num_dm, name_string, &time_id); //get the filelength, number of dimensions, and num/name of parameters
896
897 rewind(f_src); //reset data file pointer
898
899 orig_data = (double **)malloc(sizeof(double*)*file_Len);
900 memset(orig_data,0,sizeof(double*)*file_Len);
901 for (i=0;i<file_Len;i++)
902 {
903 orig_data[i]=(double *)malloc(sizeof(double)*num_dm);
904 memset(orig_data[i],0,sizeof(double)*num_dm);
905 }
906
907 readsource(f_src, file_Len, num_dm, orig_data, time_id); //read the data;
908
909 fclose(f_src);
910 /////////////////////////////////////////////////////////////////////////////
911 getctrfileinfo(f_src_ctr, &num_clust); //get how many populations
912 norm_used=0;
913 dist_used=0;
914 kmean_term=2; //modified on Oct 16, 2009: changed kmean_term=1 to kmean_term=2
915
916 rewind(f_src_ctr); //reset center file pointer
917
918 //read population center
919 cluster_center=(double **)malloc(sizeof(double*)*num_clust);
920 memset(cluster_center,0,sizeof(double*)*num_clust);
921 for (i=0;i<num_clust;i++)
922 {
923 cluster_center[i]=(double*)malloc(sizeof(double)*num_dm);
924 memset(cluster_center[i],0,sizeof(double)*num_dm);
925 }
926 for (i=0;i<num_clust;i++)
927 for (j=0;j<num_dm;j++)
928 cluster_center[i][j]=0;
929
930 IDmapping=(long *)malloc(sizeof(long)*num_clust);
931 memset(IDmapping,0,sizeof(long)*num_clust);
932
933 readcenter(f_src_ctr,num_clust,num_dm,cluster_center,IDmapping); //read population center
934 fclose(f_src_ctr);
935
936 /////////////////////////////////////////////////////////////////////////////
937 normalized_data=(double **)malloc(sizeof(double*)*file_Len);
938 memset(normalized_data,0,sizeof(double*)*file_Len);
939 for (i=0;i<file_Len;i++)
940 {
941 normalized_data[i]=(double *)malloc(sizeof(double)*num_dm);
942 memset(normalized_data[i],0,sizeof(double)*num_dm);
943 }
944
945 tran(orig_data, file_Len, num_dm, norm_used, normalized_data);
946 /************************************************* Compute number of clusters *************************************************/
947
948 cluster_id=(long*)malloc(sizeof(long)*file_Len);
949 memset(cluster_id,0,sizeof(long)*file_Len);
950
951 assign_event(normalized_data,num_clust,dist_used,kmean_term,file_Len,num_dm,cluster_id,cluster_center,0);
952
953
954 //show(orig_data,cluster_id,file_Len,num_clust,num_dm,show_data,num_disp,name_string);
955 show(orig_data, cluster_id, file_Len, num_clust, num_dm, name_string, IDmapping);
956
957 f_cid=fopen("population_id.txt","w");
958
959 for (i=0;i<file_Len;i++)
960 fprintf(f_cid,"%ld\n",IDmapping[cluster_id[i]]);
961
962
963 fclose(f_cid);
964
965 //added April 16, 2009
966 f_mfi=fopen("MFI.txt","w");
967
968 for (i=0;i<num_clust;i++)
969 {
970 fprintf(f_mfi,"%ld\t",IDmapping[i]);
971
972 for (j=0;j<num_dm;j++)
973 {
974 if (j==num_dm-1)
975 fprintf(f_mfi,"%.0f\n",cluster_center[i][j]);
976 else
977 fprintf(f_mfi,"%.0f\t",cluster_center[i][j]);
978 }
979 }
980 fclose(f_mfi);
981
982 //ended April 16, 2009
983
984 for (i=0;i<num_clust;i++)
985 free(cluster_center[i]);
986 free(cluster_center);
987
988
989 /********************************************** Release memory ******************************************/
990
991 for (i=0;i<file_Len;i++)
992 {
993 free(orig_data[i]);
994 free(normalized_data[i]);
995 }
996
997 free(orig_data);
998 free(normalized_data);
999 free(cluster_id);
1000 free(IDmapping);
1001
1002 /*
1003 _strtime( tmpbuf );
1004 printf( "Ending time:\t\t\t\t%s\n", tmpbuf );
1005 _strdate( tmpbuf );
1006 printf( "Ending date:\t\t\t\t%s\n", tmpbuf );
1007 */
1008
1009 }