comparison srf2fastq/io_lib-1.12.2/progs/makeSCF.c @ 0:d901c9f41a6a default tip

Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author dawe
date Tue, 07 Jun 2011 17:48:05 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d901c9f41a6a
1 /*
2 * Copyright (c) Medical Research Council 1994-1998. All rights reserved.
3 *
4 * Permission to use, copy, modify and distribute this software and its
5 * documentation for any purpose is hereby granted without fee, provided that
6 * this copyright and notice appears in all copies.
7 *
8 * This file was written by James Bonfield, Simon Dear, Rodger Staden,
9 * as part of the Staden Package at the MRC Laboratory of Molecular
10 * Biology, Hills Road, Cambridge, CB2 2QH, United Kingdom.
11 *
12 * MRC disclaims all warranties with regard to this software.
13 */
14
15
16 /*
17 * makeSCF v3.06, 17/04/2001
18 *
19 * Derived from the older makeSCF; this one has been rewritten to use the new
20 * IO libraries and no longer performs quality clipping itself. It also writes
21 * in a new format that is more easily compressed.
22 */
23
24 #include <stdio.h>
25 #include <strings.h>
26 #include <io_lib/Read.h>
27 #include <io_lib/traceType.h>
28 #include <io_lib/xalloc.h>
29
30 /*
31 * Add our comments.
32 * 1. Work out the comment length - simply add 1K to the current length.
33 * 2. Copy the old comments to our new.
34 * 3. Replace '\n' with ' '.
35 * 4. Add our comments and switch.
36 */
37 void add_comments(Read *r, char *name, int format) {
38 Comments *cc;
39 int clen;
40 char *cp;
41 char buf[1024];
42
43 /* 1. */
44 if (r->info) {
45 clen = strlen(r->info) + 1024;
46 } else {
47 clen = 1024;
48 }
49
50 /* 2. */
51 if (NULL == (cc = (char *)xmalloc(clen)))
52 return;
53
54 if (r->info)
55 strcpy(cc, r->info);
56 else
57 *cc = 0;
58
59 /* 3. */
60 cp = cc;
61 /*
62 while (*cp) {
63 if (*cp == '\n')
64 *cp = ' ';
65
66 cp++;
67 }
68 *cp++ = '\n';
69 */
70
71 /* 4. */
72 sprintf(buf, "CONV=makeSCF V3.06\nDATF=%s\nDATN=%s\n",
73 trace_type_int2str(format), name);
74
75 strcat(cp, buf);
76
77 if (r->info)
78 xfree(r->info);
79
80 r->info = cc;
81 }
82
83 void scale_trace8(Read *r) {
84 double s;
85 int i;
86
87 if (r->maxTraceVal <= 255)
88 return;
89
90 s = ((double)255)/r->maxTraceVal;
91
92 for (i = 0; i < r->NPoints; i++) {
93 r->traceA[i] *= s;
94 r->traceC[i] *= s;
95 r->traceG[i] *= s;
96 r->traceT[i] *= s;
97 }
98
99 r->maxTraceVal = 255;
100 }
101
102 /*
103 * Here we just take the minimum trace value and subtract this from all others.
104 * The assumption is that the signal will always be 'base line' on at least
105 * one of the four channels.
106 */
107 void subtract_background(Read *r) {
108 int i, min;
109 for (i = 0; i < r->NPoints; i++) {
110 min = 999999;
111 if (r->traceA[i] < min) min = r->traceA[i];
112 if (r->traceC[i] < min) min = r->traceC[i];
113 if (r->traceG[i] < min) min = r->traceG[i];
114 if (r->traceT[i] < min) min = r->traceT[i];
115 r->traceA[i] -= min;
116 r->traceC[i] -= min;
117 r->traceG[i] -= min;
118 r->traceT[i] -= min;
119 }
120 }
121
122 /*
123 * Find the average background level of a trace, and subtract this from the
124 * peak heights.
125 *
126 * NB. That method is flawed. For now take the minimum instead of average, but
127 * this also has horrid flaws. See above method.
128 */
129 void subtract_background_old(Read *r) {
130 int i, j, min, bg, max = 0;
131 int win_len = 501, win_len2 = win_len/2;
132 int *background;
133
134 if (NULL == (background = (int *)xmalloc((r->NPoints + 2 * win_len)
135 * sizeof(*background))))
136 return;
137
138 if (r->NPoints < win_len)
139 win_len = r->NPoints;
140
141 /* Find minimum trace levels at each point */
142 for (i = 0, j = win_len2; i < r->NPoints; i++, j++) {
143 min = INT_MAX;
144 if (r->traceA[i] < min)
145 min = r->traceA[i];
146 if (r->traceC[i] < min)
147 min = r->traceC[i];
148 if (r->traceG[i] < min)
149 min = r->traceG[i];
150 if (r->traceT[i] < min)
151 min = r->traceT[i];
152 background[j] = min;
153 }
154 for (i = 0; i < win_len2; i++) {
155 background[i] = background[i + win_len2];
156 background[i + r->NPoints] = background[i + r->NPoints - win_len2];
157 }
158
159 /* Take lowest background over win_len and subtract it */
160 for (i = 0; i < r->NPoints; i++) {
161 /* Could optimise this considerably */
162 bg = INT_MAX;
163 for (j = 0; j < win_len; j++) {
164 if (background[i + j] < bg)
165 bg = background[i + j];
166 }
167
168 r->traceA[i] -= bg;
169 r->traceC[i] -= bg;
170 r->traceG[i] -= bg;
171 r->traceT[i] -= bg;
172
173 if (r->traceA[i] > max) max = r->traceA[i];
174 if (r->traceC[i] > max) max = r->traceC[i];
175 if (r->traceG[i] > max) max = r->traceG[i];
176 if (r->traceT[i] > max) max = r->traceT[i];
177 }
178
179 r->maxTraceVal = max;
180
181 xfree(background);
182 }
183
184 /*
185 * Find the maximum height of traces at the called bases. Use this to clip any
186 * other bases.
187 */
188 void reset_max_called_height(Read *r) {
189 int i, max = 0;
190
191 /* Find max */
192 for (i=0; i < r->NBases; i++) {
193 switch(r->base[i]) {
194 case 'a':
195 case 'A':
196 if (r->traceA[r->basePos[i]] > max)
197 max = r->traceA[r->basePos[i]];
198 break;
199
200 case 'c':
201 case 'C':
202 if (r->traceC[r->basePos[i]] > max)
203 max = r->traceC[r->basePos[i]];
204 break;
205
206 case 'g':
207 case 'G':
208 if (r->traceG[r->basePos[i]] > max)
209 max = r->traceG[r->basePos[i]];
210 break;
211
212 case 't':
213 case 'T':
214 if (r->traceT[r->basePos[i]] > max)
215 max = r->traceT[r->basePos[i]];
216 break;
217 }
218 }
219
220 /* Clip to max */
221 for (i = 0; i < r->NPoints; i++) {
222 if (r->traceA[i] > max)
223 r->traceA[i] = max;
224 if (r->traceC[i] > max)
225 r->traceC[i] = max;
226 if (r->traceG[i] > max)
227 r->traceG[i] = max;
228 if (r->traceT[i] > max)
229 r->traceT[i] = max;
230 }
231 if (r->maxTraceVal > max)
232 r->maxTraceVal = max;
233 }
234
235 void rescale_heights(Read *r) {
236 int win_len = 1000;
237 int total = 0;
238 int max, max2;
239 int i, j, k;
240 double max_val = 0, rescale = 1.0;
241 TRACE *ta, *tc, *tg, *tt;
242
243 ta = r->traceA;
244 tc = r->traceC;
245 tg = r->traceG;
246 tt = r->traceT;
247
248 if (r->NPoints < 2*win_len + 1)
249 return;
250
251 for (k = 0; k < 2; k++) {
252 max2 = win_len * r->maxTraceVal;
253
254 for (i = 0; i < win_len; i++) {
255 max = 0;
256 if (ta[i] > max) max = ta[i];
257 if (tc[i] > max) max = tc[i];
258 if (tg[i] > max) max = tg[i];
259 if (tt[i] > max) max = tt[i];
260 total += max;
261 }
262
263 for (j = 0; i < r->NPoints; i++, j++) {
264 max = 0;
265 if (ta[j] > max) max = ta[j];
266 if (tc[j] > max) max = tc[j];
267 if (tg[j] > max) max = tg[j];
268 if (tt[j] > max) max = tt[j];
269 total -= max;
270
271 max = 0;
272 if (ta[i] > max) max = ta[i];
273 if (tc[i] > max) max = tc[i];
274 if (tg[i] > max) max = tg[i];
275 if (tt[i] > max) max = tt[i];
276 total += max;
277
278 if (k == 0) {
279 if (r->traceA[j] * ((double)max2 / total) > max_val)
280 max_val = r->traceA[j] * ((double)max2 / total);
281 if (r->traceC[j] * ((double)max2 / total) > max_val)
282 max_val = r->traceC[j] * ((double)max2 / total);
283 if (r->traceG[j] * ((double)max2 / total) > max_val)
284 max_val = r->traceG[j] * ((double)max2 / total);
285 if (r->traceT[j] * ((double)max2 / total) > max_val)
286 max_val = r->traceT[j] * ((double)max2 / total);
287 } else {
288 r->traceA[j] *= (double)max2 / total * rescale;
289 r->traceC[j] *= (double)max2 / total * rescale;
290 r->traceG[j] *= (double)max2 / total * rescale;
291 r->traceT[j] *= (double)max2 / total * rescale;
292 }
293 }
294
295 for (; j < r->NPoints; j++) {
296 if (k == 0) {
297 if (r->traceA[j] * ((double)max2 / total) > max_val)
298 max_val = r->traceA[j] * ((double)max2 / total);
299 if (r->traceC[j] * ((double)max2 / total) > max_val)
300 max_val = r->traceC[j] * ((double)max2 / total);
301 if (r->traceG[j] * ((double)max2 / total) > max_val)
302 max_val = r->traceG[j] * ((double)max2 / total);
303 if (r->traceT[j] * ((double)max2 / total) > max_val)
304 max_val = r->traceT[j] * ((double)max2 / total);
305 } else {
306 r->traceA[j] *= (double)max2 / total;
307 r->traceC[j] *= (double)max2 / total;
308 r->traceG[j] *= (double)max2 / total;
309 r->traceT[j] *= (double)max2 / total;
310 }
311 }
312
313 if (max_val > 65535)
314 rescale = 65535 / max_val;
315 else
316 rescale = 1.0;
317 }
318 }
319
320 static int convert(char *in, mFILE *ofp, char *out, int format, int prec,
321 int comp, int normalise) {
322 Read *r;
323
324 if (NULL == (r = read_reading(in, format))) {
325 fprintf(stderr, "%s: failed to read\n", in);
326 return 1;
327 }
328
329 if (normalise) {
330 subtract_background(r);
331 reset_max_called_height(r);
332 rescale_heights(r);
333 }
334
335 add_comments(r, in, format);
336 if (prec == 1)
337 scale_trace8(r);
338
339 if (comp != -1)
340 set_compression_method(comp);
341 if (0 != (mfwrite_reading(ofp, r, TT_SCF))) {
342 fprintf(stderr, "%s: failed to write\n", out);
343 read_deallocate(r);
344 return 1;
345 }
346
347 read_deallocate(r);
348 return 0;
349 }
350
351
352 void usage(void) {
353 fprintf(stderr,
354 "makeSCF [-8] [-2] [-3] [-s] [-compress mode] [-normalise]\n"
355 " -(abi|alf|scf|any) input_name [-output output_name]\n"
356 " or\n"
357 "makeSCF [-8] [-2] [-3] [-s] [-compress mode] [-normalise]\n"
358 " [-(abi|alf|scf|any)] input_name1 output_name1 ... "
359 "input_nameN output_nameN \n");
360
361 exit(1);
362 }
363
364 int main(int argc, char **argv) {
365 int format = TT_ANY, r, prec = 0, version = 3, silent = 0;
366 int compress_mode = -1;
367 char *inf = NULL;
368 char *outf = NULL;
369 mFILE *ofp = mstdout();
370 int normalise = 0;
371
372 for (argc--, argv++; argc > 0; argc--, argv++) {
373 if (strcmp(*argv, "-8") == 0) {
374 prec = 1;
375 } else if (strcmp(*argv, "-2") == 0) {
376 version = 2;
377 } else if (strcmp(*argv, "-3") == 0) {
378 version = 3;
379 } else if (strcmp(*argv, "-normalise") == 0) {
380 normalise = 1;
381 } else if (strcmp(*argv, "-s") == 0) {
382 silent = 1;
383 } else if (strcasecmp(*argv, "-abi") == 0) {
384 format = TT_ABI;
385 inf = *++argv;
386 argc--;
387 } else if (strcasecmp(*argv, "-alf") == 0) {
388 format = TT_ALF;
389 inf = *++argv;
390 argc--;
391 } else if (strcasecmp(*argv, "-scf") == 0) {
392 format = TT_SCF;
393 inf = *++argv;
394 argc--;
395 } else if (strcasecmp(*argv, "-ztr") == 0) {
396 format = TT_ZTR;
397 inf = *++argv;
398 argc--;
399 } else if (strcasecmp(*argv, "-any") == 0) {
400 format = TT_ANY;
401 inf = *++argv;
402 argc--;
403 } else if (strcasecmp(*argv, "-output") == 0) {
404 outf = *++argv;
405 argc--;
406 } else if (strcasecmp(*argv, "-compress") == 0) {
407 compress_mode = compress_str2int(*++argv);
408 argc--;
409 } else {
410 break;
411 }
412 }
413
414 /* if no args left than input file must have been specified */
415 if (!argc && !inf)
416 usage();
417
418 /* if outfile set, then using original syntax, so don't expect
419 any extra args */
420 if (argc && outf)
421 usage();
422
423 if (!silent) {
424 printf("makeSCF v3.06\n");
425 printf("Copyright (c) MRC Laboratory of Molecular Biology, 2001. All rights reserved.\n");
426 }
427
428 set_scf_version(version);
429
430
431 if(!argc) {
432 /* original calling syntax */
433 if (outf) {
434 ofp = mfopen(outf, "wb+");
435 if (NULL == ofp) {
436 perror(outf);
437 return 1;
438 }
439 }
440
441 r = convert(inf, ofp, outf, format, prec, compress_mode, normalise);
442 mfclose(ofp);
443
444 return r;
445
446 }
447
448 /* else */ {
449 /* new calling syntax, handling multiple files */
450 int result=0;
451
452 for (; argc > 0; argc--, argv++) {
453 if (inf) {
454 /* got infile, so get outfile and process */
455 outf= *argv;
456 ofp = mfopen(outf, "wb+");
457
458 if (NULL == ofp) {
459 perror(outf);
460 if(!result) result=1;
461 continue;
462 }
463 r = convert(inf, ofp, outf, format, prec, compress_mode,
464 normalise);
465 mfclose(ofp);
466 if(!result) /* keep track of the first error */
467 result=r;
468
469 /* now need to get another infile */
470 inf=NULL;
471 } else {
472 /* need infile */
473 inf= *argv;
474 }
475 }
476
477 return result;
478 }
479 }
480