comparison alignCustomAmplicon/utils.cpp @ 0:d32bddcff685 draft

Uploaded
author fcaramia
date Wed, 09 Jan 2013 00:24:09 -0500
parents
children 0aaf65fbb48a
comparison
equal deleted inserted replaced
-1:000000000000 0:d32bddcff685
1 #include <stdio.h>
2 #include <string>
3 #include <vector>
4
5 #define max(a,b,c) a > b ? ( a > c ? a : c ) : ( b > c ? b : c ) ;
6 #define min(a,b,c) a < b ? ( a < c ? a : c ) : ( b < c ? b : c ) ;
7 using namespace std;
8
9 void printMatrix (double* m, int rows, int cols)
10 {
11 for(int i = 0; i < rows; i++)
12 {
13 for(int j = 0; j < cols; j++)
14 {
15 printf("%.2f ", *(m + i*cols +j));
16 }
17 printf("\n");
18 }
19
20 }
21
22 int levenshteinDistance(string s, string t)
23 {
24 //Calculate Edit distance of 2 strings
25 if(s == t)
26 return 0;
27 int size1 = s.length()+1;
28 int size2 = t.length()+1;
29 int d[size1][size2];
30
31 for (int i=0; i<size1; i++)d[i][0] = i;
32 for (int i=0; i<size2; i++)d[0][i] = i;
33
34 for (int i=1; i<size1; i++)
35 for (int j=1; j<size2; j++)
36 {
37 if(s[i-1] == t[j-1] )
38 d[i][j] = d[i-1][j-1]; //if equeal no operation
39 else
40 d[i][j] = min(d[i-1][j] + 1,d[i][j-1] + 1,d[i-1][j-1] + 1 ); //Addition, deletion or substitution
41
42 }
43
44 return d[size1-1][size2-1]; // return edit distance
45 }
46
47 double nw_align2 (string &seq1, string &seq2, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool noFrontGapPenalty, bool noEndGapPenalty, bool debug)
48 {
49 //Do NW alignment of 2 strings...
50 int size1 = seq1.length();
51 int size2 = seq2.length();
52
53 double scores[size1+1][size2+1];
54 char dir[size1+1][size2+1];
55 char gaps[size1+1][size2+1];
56
57 for(int i = 0; i <= size1; i++)
58 {
59 scores[i][0] = (double)(i-1)*gapPenExt + gapPenOpen;
60 dir[i][0] = 'U';
61 gaps[i][0] = 'O';
62 }
63 for(int j = 0; j <= size2; j++)
64 {
65 scores[0][j] = (double)(j-1)*gapPenExt + gapPenOpen;
66 dir[0][j] = 'L';
67 gaps[0][j] = 'O';
68 }
69
70 scores[0][0] = 0;
71 gaps[0][0] = 'N';
72
73 if (noFrontGapPenalty)
74 {
75 for(int i = 0; i <= size1; i++) scores[i][0] = 0;
76 for(int j = 0; j <= size2; j++) scores[0][j] = 0;
77 }
78
79 double match;
80 double del;
81 double insert;
82
83 for(int i = 1; i <= size1; i++)
84 for(int j = 1; j <= size2; j++)
85 {
86 if (seq1[i-1] == 'N' || seq2[j-1] == 'N')
87 {
88 match = scores[i-1][j-1];
89 }
90 else
91 {
92 match = scores[i-1][j-1] + (double)(seq1[i-1] == seq2[j-1]? matchScore : missmatch_penalty); //1.9f clustalw
93 }
94 del = scores[i-1][j] + (gaps[i-1][j] == 'N'? gapPenOpen:gapPenExt);
95 insert = scores[i][j-1] + (gaps[i][j-1] == 'N'? gapPenOpen:gapPenExt);
96 //debug
97 //printf("I:%d J:%d match = %f del %f insert:%f\n",i,j,match,del,insert);
98
99 if (match>del && match > insert)
100 {
101 scores[i][j] = match;
102 dir[i][j] = 'D';
103 gaps[i][j] = 'N';
104 }
105 else if(del > insert)
106 {
107 scores[i][j] = del;
108 dir[i][j] = 'U';
109 gaps[i][j] = 'O';
110 }
111 else
112 {
113 scores[i][j] = insert;
114 dir[i][j] = 'L';
115 gaps[i][j] = 'O';
116 }
117 }
118
119 //debug
120 //if (debug)
121 // printMatrix(*scores, size1+1, size2+1);
122 //printMatrix(*dir, size1+1, size2+1);
123
124 string alignment1="";
125 string alignment2="";
126
127
128 int cont1 = size1;
129 int cont2 = size2;
130 int max1 = size1;
131 int max2 = size2;
132 double num = scores[size1][size2];;
133
134 if (noEndGapPenalty)
135 {
136 //Search Maxes
137
138 for(int i = 1; i <= size1; i++)
139 {
140 if(num<scores[i][size2])
141 {
142 num = scores[i][size2];
143 max1 = i;
144 max2 = size2;
145 }
146 }
147 for(int j = 1; j <= size2; j++)
148 {
149 if(num<scores[size1][j])
150 {
151 num = scores[size1][j];
152 max1 = size1;
153 max2 = j;
154 }
155 }
156 cont1 = max1;
157 cont2 = max2;
158 }
159
160 while (cont1 > 0 && cont2 > 0)
161 {
162 if(dir[cont1][cont2] == 'D')
163 {
164 alignment1+= seq1[cont1-1];
165 alignment2+= seq2[cont2-1];
166 cont1--;
167 cont2--;
168
169 }
170 else if(dir[cont1][cont2] == 'L')
171 {
172 alignment1+= '-';
173 alignment2+= seq2[cont2-1];
174 cont2--;
175 }
176 else
177 {
178 alignment1+= seq1[cont1-1];
179 alignment2+= '-';
180 cont1--;
181
182 }
183 }
184
185 while (cont1 > 0)
186 {
187 alignment1+= seq1[cont1-1];
188 alignment2+= '-';
189 cont1--;
190 }
191
192 while (cont2 > 0)
193 {
194 alignment1+= '-';
195 alignment2+= seq2[cont2-1];
196 cont2--;
197 }
198
199 alignment1 = string (alignment1.rbegin(),alignment1.rend());
200 alignment2 = string (alignment2.rbegin(),alignment2.rend());
201
202
203 if (noEndGapPenalty)
204 {
205 //Need to fill out rest of alignment...
206 if (max1 != size1)
207 {
208 for (int i=max1; i<size1; ++i )
209 {
210 alignment1+= seq1 [i];
211 alignment2+= '-';
212
213 }
214 }
215
216 if (max2 != size2)
217 {
218
219 for (int i=max2; i<size2; ++i )
220 {
221 alignment2+= seq2 [i];
222 alignment1+= '-';
223
224 }
225 }
226
227 }
228 if (debug)
229 {
230 printf("Seq1: %s\n", alignment1.c_str());
231 printf("Seq2: %s\n\n", alignment2.c_str());
232
233 }
234
235 //Returns
236 seq1 = alignment1;
237 seq2 = alignment2;
238
239 return num;
240
241 }
242
243 vector <string> nw_alignAlignments (vector<string> a1, vector<string> a2, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool noFrontGapPenalty, bool noEndGapPenalty, bool debug)
244 {
245 //Do NW alignment of 2 strings...
246 int size1 = a1[0].length();
247 int size2 = a2[0].length();
248 int v1 = a1.size();
249 int v2 = a2.size();
250 int combs = v1 * v2;
251
252 vector<string> msa;
253 msa.clear();
254 vector<string> alignment1, alignment2;
255
256 for (int i=0;i<v1;++i)
257 alignment1.push_back(string());
258 for (int i=0;i<v2;++i)
259 alignment2.push_back(string());
260
261
262
263 double scores[size1+1][size2+1];
264 char dir[size1+1][size2+1];
265 char gaps[size1+1][size2+1];
266
267 for(int i = 0; i <= size1; i++)
268 {
269 scores[i][0] = (double)(i-1)*gapPenExt + gapPenOpen;
270 dir[i][0] = 'U';
271 gaps[i][0] = 'O';
272 }
273 for(int j = 0; j <= size2; j++)
274 {
275 scores[0][j] = (double)(j-1)*gapPenExt + gapPenOpen;
276 dir[0][j] = 'L';
277 gaps[0][j] = 'O';
278 }
279
280 scores[0][0] = 0;
281 gaps[0][0] = 'N';
282
283 if (noFrontGapPenalty)
284 {
285 for(int i = 0; i <= size1; i++) scores[i][0] = 0;
286 for(int j = 0; j <= size2; j++) scores[0][j] = 0;
287 }
288
289 double match;
290 double del;
291 double insert;
292
293 for(int i = 1; i <= size1; i++)
294 for(int j = 1; j <= size2; j++)
295 {
296
297 match = del = insert = 0.0f;
298 for(int z=0;z<v1;++z)
299 {
300 for(int w=0;w<v2;++w)
301 {
302 if (a1[z][i-1] == 'N' || a2[w][j-1] == 'N')
303 {
304 match += scores[i-1][j-1];
305 }
306 else
307 {
308 match += scores[i-1][j-1] + (double)(a1[z][i-1] == a2[w][j-1]? matchScore:missmatch_penalty);
309 }
310 del += scores[i-1][j] + (gaps[i-1][j] == 'N'? gapPenOpen:gapPenExt);
311 insert += scores[i][j-1] + (gaps[i][j-1] == 'N'? gapPenOpen:gapPenExt);
312 }
313 }
314
315 match /= combs;
316 del /= combs;
317 insert /= combs;
318
319 if (match > del && match > insert)
320 {
321 scores[i][j] = match;
322 dir[i][j] = 'D';
323 gaps[i][j] = 'N';
324 }
325 else if(del > insert)
326 {
327 scores[i][j] = del;
328 dir[i][j] = 'U';
329 gaps[i][j] = 'O';
330 }
331 else
332 {
333 scores[i][j] = insert;
334 dir[i][j] = 'L';
335 gaps[i][j] = 'O';
336 }
337 }
338
339 //debug
340 // if (debug)
341 // printMatrix(*scores, size1+1, size2+1);
342 //printMatrix(*dir, size1+1, size2+1);
343
344
345 int cont1 = size1;
346 int cont2 = size2;
347 int max1 = size1;
348 int max2 = size2;
349 double num = scores[size1][size2];
350
351 if (noEndGapPenalty)
352 {
353 //Search Maxes
354
355 for(int i = 1; i <= size1; i++)
356 {
357 if(num<scores[i][size2])
358 {
359 num = scores[i][size2];
360 max1 = i;
361 max2 = size2;
362 }
363 }
364 for(int j = 1; j <= size2; j++)
365 {
366 if(num<scores[size1][j])
367 {
368 num = scores[size1][j];
369 max1 = size1;
370 max2 = j;
371 }
372 }
373 cont1 = max1;
374 cont2 = max2;
375 }
376
377 while (cont1 > 0 && cont2 > 0)
378 {
379 if(dir[cont1][cont2] == 'D')
380 {
381
382 for(int z=0;z<v1;++z)
383 alignment1[z]+= a1[z][cont1-1];
384
385 for(int w=0;w<v2;++w)
386 alignment2[w]+= a2[w][cont2-1];
387
388 cont1--;
389 cont2--;
390
391 }
392 else if(dir[cont1][cont2] == 'L')
393 {
394 for(int z=0;z<v1;++z)
395 alignment1[z]+= '-';
396 for(int w=0;w<v2;++w)
397 alignment2[w]+= a2[w][cont2-1];
398
399 cont2--;
400 }
401 else
402 {
403 for(int z=0;z<v1;++z)
404 alignment1[z]+= a1[z][cont1-1];
405
406 for(int w=0;w<v2;++w)
407 alignment2[w]+= '-';
408
409 cont1--;
410
411 }
412 }
413
414 while (cont1 > 0)
415 {
416 for(int z=0;z<v1;++z)
417 alignment1[z]+= a1[z][cont1-1];
418 for(int w=0;w<v2;++w)
419 alignment2[w]+= '-';
420
421 cont1--;
422 }
423
424 while (cont2 > 0)
425 {
426 for(int z=0;z<v1;++z)
427 alignment1[z]+= '-';
428 for(int w=0;w<v2;++w)
429 alignment2[w]+= a2[w][cont2-1];
430
431 cont2--;
432 }
433
434
435
436
437 for(int z=0;z<v1;++z)
438 {
439 alignment1[z] = string (alignment1[z].rbegin(),alignment1[z].rend());
440 }
441 for(int w=0;w<v2;++w)
442 {
443 alignment2[w] = string (alignment2[w].rbegin(),alignment2[w].rend());
444 }
445
446
447
448 if (noEndGapPenalty)
449 {
450 //Need to fill out rest of alignment...
451 if (max1 != size1)
452 {
453 for (int i=max1; i<size1; ++i )
454 {
455 for(int z=0;z<v1;++z)
456 alignment1[z]+= a1[z][i];
457 for(int w=0;w<v2;++w)
458 alignment2[w]+= '-';
459
460 }
461 }
462
463 if (max2 != size2)
464 {
465
466 for (int i=max2; i<size2; ++i )
467 {
468 for(int z=0;z<v1;++z)
469 alignment1[z]+= '-';
470 for(int w=0;w<v2;++w)
471 alignment2[w]+= a2[w][i];
472
473 }
474 }
475
476 }
477
478 //returns
479 for(int z=0;z<v1;++z)
480 msa.push_back(alignment1[z]);
481 for(int w=0;w<v2;++w)
482 msa.push_back(alignment2[w]);
483
484 if (debug)
485 {
486 for(int z=0;z<v1;++z)
487 printf("%s\n",alignment1[z].c_str());
488 for(int w=0;w<v2;++w)
489 printf("%s\n",alignment2[w].c_str());
490
491 }
492
493 return msa;
494
495 }
496
497
498