Mercurial > repos > dawe > srf2fastq
comparison srf2fastq/io_lib-1.12.2/progs/makeSCF.c @ 0:d901c9f41a6a default tip
Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author | dawe |
---|---|
date | Tue, 07 Jun 2011 17:48:05 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d901c9f41a6a |
---|---|
1 /* | |
2 * Copyright (c) Medical Research Council 1994-1998. All rights reserved. | |
3 * | |
4 * Permission to use, copy, modify and distribute this software and its | |
5 * documentation for any purpose is hereby granted without fee, provided that | |
6 * this copyright and notice appears in all copies. | |
7 * | |
8 * This file was written by James Bonfield, Simon Dear, Rodger Staden, | |
9 * as part of the Staden Package at the MRC Laboratory of Molecular | |
10 * Biology, Hills Road, Cambridge, CB2 2QH, United Kingdom. | |
11 * | |
12 * MRC disclaims all warranties with regard to this software. | |
13 */ | |
14 | |
15 | |
16 /* | |
17 * makeSCF v3.06, 17/04/2001 | |
18 * | |
19 * Derived from the older makeSCF; this one has been rewritten to use the new | |
20 * IO libraries and no longer performs quality clipping itself. It also writes | |
21 * in a new format that is more easily compressed. | |
22 */ | |
23 | |
24 #include <stdio.h> | |
25 #include <strings.h> | |
26 #include <io_lib/Read.h> | |
27 #include <io_lib/traceType.h> | |
28 #include <io_lib/xalloc.h> | |
29 | |
30 /* | |
31 * Add our comments. | |
32 * 1. Work out the comment length - simply add 1K to the current length. | |
33 * 2. Copy the old comments to our new. | |
34 * 3. Replace '\n' with ' '. | |
35 * 4. Add our comments and switch. | |
36 */ | |
37 void add_comments(Read *r, char *name, int format) { | |
38 Comments *cc; | |
39 int clen; | |
40 char *cp; | |
41 char buf[1024]; | |
42 | |
43 /* 1. */ | |
44 if (r->info) { | |
45 clen = strlen(r->info) + 1024; | |
46 } else { | |
47 clen = 1024; | |
48 } | |
49 | |
50 /* 2. */ | |
51 if (NULL == (cc = (char *)xmalloc(clen))) | |
52 return; | |
53 | |
54 if (r->info) | |
55 strcpy(cc, r->info); | |
56 else | |
57 *cc = 0; | |
58 | |
59 /* 3. */ | |
60 cp = cc; | |
61 /* | |
62 while (*cp) { | |
63 if (*cp == '\n') | |
64 *cp = ' '; | |
65 | |
66 cp++; | |
67 } | |
68 *cp++ = '\n'; | |
69 */ | |
70 | |
71 /* 4. */ | |
72 sprintf(buf, "CONV=makeSCF V3.06\nDATF=%s\nDATN=%s\n", | |
73 trace_type_int2str(format), name); | |
74 | |
75 strcat(cp, buf); | |
76 | |
77 if (r->info) | |
78 xfree(r->info); | |
79 | |
80 r->info = cc; | |
81 } | |
82 | |
83 void scale_trace8(Read *r) { | |
84 double s; | |
85 int i; | |
86 | |
87 if (r->maxTraceVal <= 255) | |
88 return; | |
89 | |
90 s = ((double)255)/r->maxTraceVal; | |
91 | |
92 for (i = 0; i < r->NPoints; i++) { | |
93 r->traceA[i] *= s; | |
94 r->traceC[i] *= s; | |
95 r->traceG[i] *= s; | |
96 r->traceT[i] *= s; | |
97 } | |
98 | |
99 r->maxTraceVal = 255; | |
100 } | |
101 | |
102 /* | |
103 * Here we just take the minimum trace value and subtract this from all others. | |
104 * The assumption is that the signal will always be 'base line' on at least | |
105 * one of the four channels. | |
106 */ | |
107 void subtract_background(Read *r) { | |
108 int i, min; | |
109 for (i = 0; i < r->NPoints; i++) { | |
110 min = 999999; | |
111 if (r->traceA[i] < min) min = r->traceA[i]; | |
112 if (r->traceC[i] < min) min = r->traceC[i]; | |
113 if (r->traceG[i] < min) min = r->traceG[i]; | |
114 if (r->traceT[i] < min) min = r->traceT[i]; | |
115 r->traceA[i] -= min; | |
116 r->traceC[i] -= min; | |
117 r->traceG[i] -= min; | |
118 r->traceT[i] -= min; | |
119 } | |
120 } | |
121 | |
122 /* | |
123 * Find the average background level of a trace, and subtract this from the | |
124 * peak heights. | |
125 * | |
126 * NB. That method is flawed. For now take the minimum instead of average, but | |
127 * this also has horrid flaws. See above method. | |
128 */ | |
129 void subtract_background_old(Read *r) { | |
130 int i, j, min, bg, max = 0; | |
131 int win_len = 501, win_len2 = win_len/2; | |
132 int *background; | |
133 | |
134 if (NULL == (background = (int *)xmalloc((r->NPoints + 2 * win_len) | |
135 * sizeof(*background)))) | |
136 return; | |
137 | |
138 if (r->NPoints < win_len) | |
139 win_len = r->NPoints; | |
140 | |
141 /* Find minimum trace levels at each point */ | |
142 for (i = 0, j = win_len2; i < r->NPoints; i++, j++) { | |
143 min = INT_MAX; | |
144 if (r->traceA[i] < min) | |
145 min = r->traceA[i]; | |
146 if (r->traceC[i] < min) | |
147 min = r->traceC[i]; | |
148 if (r->traceG[i] < min) | |
149 min = r->traceG[i]; | |
150 if (r->traceT[i] < min) | |
151 min = r->traceT[i]; | |
152 background[j] = min; | |
153 } | |
154 for (i = 0; i < win_len2; i++) { | |
155 background[i] = background[i + win_len2]; | |
156 background[i + r->NPoints] = background[i + r->NPoints - win_len2]; | |
157 } | |
158 | |
159 /* Take lowest background over win_len and subtract it */ | |
160 for (i = 0; i < r->NPoints; i++) { | |
161 /* Could optimise this considerably */ | |
162 bg = INT_MAX; | |
163 for (j = 0; j < win_len; j++) { | |
164 if (background[i + j] < bg) | |
165 bg = background[i + j]; | |
166 } | |
167 | |
168 r->traceA[i] -= bg; | |
169 r->traceC[i] -= bg; | |
170 r->traceG[i] -= bg; | |
171 r->traceT[i] -= bg; | |
172 | |
173 if (r->traceA[i] > max) max = r->traceA[i]; | |
174 if (r->traceC[i] > max) max = r->traceC[i]; | |
175 if (r->traceG[i] > max) max = r->traceG[i]; | |
176 if (r->traceT[i] > max) max = r->traceT[i]; | |
177 } | |
178 | |
179 r->maxTraceVal = max; | |
180 | |
181 xfree(background); | |
182 } | |
183 | |
184 /* | |
185 * Find the maximum height of traces at the called bases. Use this to clip any | |
186 * other bases. | |
187 */ | |
188 void reset_max_called_height(Read *r) { | |
189 int i, max = 0; | |
190 | |
191 /* Find max */ | |
192 for (i=0; i < r->NBases; i++) { | |
193 switch(r->base[i]) { | |
194 case 'a': | |
195 case 'A': | |
196 if (r->traceA[r->basePos[i]] > max) | |
197 max = r->traceA[r->basePos[i]]; | |
198 break; | |
199 | |
200 case 'c': | |
201 case 'C': | |
202 if (r->traceC[r->basePos[i]] > max) | |
203 max = r->traceC[r->basePos[i]]; | |
204 break; | |
205 | |
206 case 'g': | |
207 case 'G': | |
208 if (r->traceG[r->basePos[i]] > max) | |
209 max = r->traceG[r->basePos[i]]; | |
210 break; | |
211 | |
212 case 't': | |
213 case 'T': | |
214 if (r->traceT[r->basePos[i]] > max) | |
215 max = r->traceT[r->basePos[i]]; | |
216 break; | |
217 } | |
218 } | |
219 | |
220 /* Clip to max */ | |
221 for (i = 0; i < r->NPoints; i++) { | |
222 if (r->traceA[i] > max) | |
223 r->traceA[i] = max; | |
224 if (r->traceC[i] > max) | |
225 r->traceC[i] = max; | |
226 if (r->traceG[i] > max) | |
227 r->traceG[i] = max; | |
228 if (r->traceT[i] > max) | |
229 r->traceT[i] = max; | |
230 } | |
231 if (r->maxTraceVal > max) | |
232 r->maxTraceVal = max; | |
233 } | |
234 | |
235 void rescale_heights(Read *r) { | |
236 int win_len = 1000; | |
237 int total = 0; | |
238 int max, max2; | |
239 int i, j, k; | |
240 double max_val = 0, rescale = 1.0; | |
241 TRACE *ta, *tc, *tg, *tt; | |
242 | |
243 ta = r->traceA; | |
244 tc = r->traceC; | |
245 tg = r->traceG; | |
246 tt = r->traceT; | |
247 | |
248 if (r->NPoints < 2*win_len + 1) | |
249 return; | |
250 | |
251 for (k = 0; k < 2; k++) { | |
252 max2 = win_len * r->maxTraceVal; | |
253 | |
254 for (i = 0; i < win_len; i++) { | |
255 max = 0; | |
256 if (ta[i] > max) max = ta[i]; | |
257 if (tc[i] > max) max = tc[i]; | |
258 if (tg[i] > max) max = tg[i]; | |
259 if (tt[i] > max) max = tt[i]; | |
260 total += max; | |
261 } | |
262 | |
263 for (j = 0; i < r->NPoints; i++, j++) { | |
264 max = 0; | |
265 if (ta[j] > max) max = ta[j]; | |
266 if (tc[j] > max) max = tc[j]; | |
267 if (tg[j] > max) max = tg[j]; | |
268 if (tt[j] > max) max = tt[j]; | |
269 total -= max; | |
270 | |
271 max = 0; | |
272 if (ta[i] > max) max = ta[i]; | |
273 if (tc[i] > max) max = tc[i]; | |
274 if (tg[i] > max) max = tg[i]; | |
275 if (tt[i] > max) max = tt[i]; | |
276 total += max; | |
277 | |
278 if (k == 0) { | |
279 if (r->traceA[j] * ((double)max2 / total) > max_val) | |
280 max_val = r->traceA[j] * ((double)max2 / total); | |
281 if (r->traceC[j] * ((double)max2 / total) > max_val) | |
282 max_val = r->traceC[j] * ((double)max2 / total); | |
283 if (r->traceG[j] * ((double)max2 / total) > max_val) | |
284 max_val = r->traceG[j] * ((double)max2 / total); | |
285 if (r->traceT[j] * ((double)max2 / total) > max_val) | |
286 max_val = r->traceT[j] * ((double)max2 / total); | |
287 } else { | |
288 r->traceA[j] *= (double)max2 / total * rescale; | |
289 r->traceC[j] *= (double)max2 / total * rescale; | |
290 r->traceG[j] *= (double)max2 / total * rescale; | |
291 r->traceT[j] *= (double)max2 / total * rescale; | |
292 } | |
293 } | |
294 | |
295 for (; j < r->NPoints; j++) { | |
296 if (k == 0) { | |
297 if (r->traceA[j] * ((double)max2 / total) > max_val) | |
298 max_val = r->traceA[j] * ((double)max2 / total); | |
299 if (r->traceC[j] * ((double)max2 / total) > max_val) | |
300 max_val = r->traceC[j] * ((double)max2 / total); | |
301 if (r->traceG[j] * ((double)max2 / total) > max_val) | |
302 max_val = r->traceG[j] * ((double)max2 / total); | |
303 if (r->traceT[j] * ((double)max2 / total) > max_val) | |
304 max_val = r->traceT[j] * ((double)max2 / total); | |
305 } else { | |
306 r->traceA[j] *= (double)max2 / total; | |
307 r->traceC[j] *= (double)max2 / total; | |
308 r->traceG[j] *= (double)max2 / total; | |
309 r->traceT[j] *= (double)max2 / total; | |
310 } | |
311 } | |
312 | |
313 if (max_val > 65535) | |
314 rescale = 65535 / max_val; | |
315 else | |
316 rescale = 1.0; | |
317 } | |
318 } | |
319 | |
320 static int convert(char *in, mFILE *ofp, char *out, int format, int prec, | |
321 int comp, int normalise) { | |
322 Read *r; | |
323 | |
324 if (NULL == (r = read_reading(in, format))) { | |
325 fprintf(stderr, "%s: failed to read\n", in); | |
326 return 1; | |
327 } | |
328 | |
329 if (normalise) { | |
330 subtract_background(r); | |
331 reset_max_called_height(r); | |
332 rescale_heights(r); | |
333 } | |
334 | |
335 add_comments(r, in, format); | |
336 if (prec == 1) | |
337 scale_trace8(r); | |
338 | |
339 if (comp != -1) | |
340 set_compression_method(comp); | |
341 if (0 != (mfwrite_reading(ofp, r, TT_SCF))) { | |
342 fprintf(stderr, "%s: failed to write\n", out); | |
343 read_deallocate(r); | |
344 return 1; | |
345 } | |
346 | |
347 read_deallocate(r); | |
348 return 0; | |
349 } | |
350 | |
351 | |
352 void usage(void) { | |
353 fprintf(stderr, | |
354 "makeSCF [-8] [-2] [-3] [-s] [-compress mode] [-normalise]\n" | |
355 " -(abi|alf|scf|any) input_name [-output output_name]\n" | |
356 " or\n" | |
357 "makeSCF [-8] [-2] [-3] [-s] [-compress mode] [-normalise]\n" | |
358 " [-(abi|alf|scf|any)] input_name1 output_name1 ... " | |
359 "input_nameN output_nameN \n"); | |
360 | |
361 exit(1); | |
362 } | |
363 | |
364 int main(int argc, char **argv) { | |
365 int format = TT_ANY, r, prec = 0, version = 3, silent = 0; | |
366 int compress_mode = -1; | |
367 char *inf = NULL; | |
368 char *outf = NULL; | |
369 mFILE *ofp = mstdout(); | |
370 int normalise = 0; | |
371 | |
372 for (argc--, argv++; argc > 0; argc--, argv++) { | |
373 if (strcmp(*argv, "-8") == 0) { | |
374 prec = 1; | |
375 } else if (strcmp(*argv, "-2") == 0) { | |
376 version = 2; | |
377 } else if (strcmp(*argv, "-3") == 0) { | |
378 version = 3; | |
379 } else if (strcmp(*argv, "-normalise") == 0) { | |
380 normalise = 1; | |
381 } else if (strcmp(*argv, "-s") == 0) { | |
382 silent = 1; | |
383 } else if (strcasecmp(*argv, "-abi") == 0) { | |
384 format = TT_ABI; | |
385 inf = *++argv; | |
386 argc--; | |
387 } else if (strcasecmp(*argv, "-alf") == 0) { | |
388 format = TT_ALF; | |
389 inf = *++argv; | |
390 argc--; | |
391 } else if (strcasecmp(*argv, "-scf") == 0) { | |
392 format = TT_SCF; | |
393 inf = *++argv; | |
394 argc--; | |
395 } else if (strcasecmp(*argv, "-ztr") == 0) { | |
396 format = TT_ZTR; | |
397 inf = *++argv; | |
398 argc--; | |
399 } else if (strcasecmp(*argv, "-any") == 0) { | |
400 format = TT_ANY; | |
401 inf = *++argv; | |
402 argc--; | |
403 } else if (strcasecmp(*argv, "-output") == 0) { | |
404 outf = *++argv; | |
405 argc--; | |
406 } else if (strcasecmp(*argv, "-compress") == 0) { | |
407 compress_mode = compress_str2int(*++argv); | |
408 argc--; | |
409 } else { | |
410 break; | |
411 } | |
412 } | |
413 | |
414 /* if no args left than input file must have been specified */ | |
415 if (!argc && !inf) | |
416 usage(); | |
417 | |
418 /* if outfile set, then using original syntax, so don't expect | |
419 any extra args */ | |
420 if (argc && outf) | |
421 usage(); | |
422 | |
423 if (!silent) { | |
424 printf("makeSCF v3.06\n"); | |
425 printf("Copyright (c) MRC Laboratory of Molecular Biology, 2001. All rights reserved.\n"); | |
426 } | |
427 | |
428 set_scf_version(version); | |
429 | |
430 | |
431 if(!argc) { | |
432 /* original calling syntax */ | |
433 if (outf) { | |
434 ofp = mfopen(outf, "wb+"); | |
435 if (NULL == ofp) { | |
436 perror(outf); | |
437 return 1; | |
438 } | |
439 } | |
440 | |
441 r = convert(inf, ofp, outf, format, prec, compress_mode, normalise); | |
442 mfclose(ofp); | |
443 | |
444 return r; | |
445 | |
446 } | |
447 | |
448 /* else */ { | |
449 /* new calling syntax, handling multiple files */ | |
450 int result=0; | |
451 | |
452 for (; argc > 0; argc--, argv++) { | |
453 if (inf) { | |
454 /* got infile, so get outfile and process */ | |
455 outf= *argv; | |
456 ofp = mfopen(outf, "wb+"); | |
457 | |
458 if (NULL == ofp) { | |
459 perror(outf); | |
460 if(!result) result=1; | |
461 continue; | |
462 } | |
463 r = convert(inf, ofp, outf, format, prec, compress_mode, | |
464 normalise); | |
465 mfclose(ofp); | |
466 if(!result) /* keep track of the first error */ | |
467 result=r; | |
468 | |
469 /* now need to get another infile */ | |
470 inf=NULL; | |
471 } else { | |
472 /* need infile */ | |
473 inf= *argv; | |
474 } | |
475 } | |
476 | |
477 return result; | |
478 } | |
479 } | |
480 |