comparison srf2fastq/io_lib-1.12.2/io_lib/translate.c @ 0:d901c9f41a6a default tip

Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author dawe
date Tue, 07 Jun 2011 17:48:05 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d901c9f41a6a
1 /*
2 * Copyright (c) Medical Research Council 1994. All rights reserved.
3 *
4 * Permission to use, copy, modify and distribute this software and its
5 * documentation for any purpose is hereby granted without fee, provided that
6 * this copyright and notice appears in all copies.
7 *
8 * This file was written by James Bonfield, Simon Dear, Rodger Staden,
9 * as part of the Staden Package at the MRC Laboratory of Molecular
10 * Biology, Hills Road, Cambridge, CB2 2QH, United Kingdom.
11 *
12 * MRC disclaims all warranties with regard to this software.
13 */
14
15 /*
16 * File: translate.c
17 * Purpose: Translates between different reading structures.
18 * Last update: 01/09/94
19 */
20
21 #include <stdio.h>
22 #ifndef NDEBUG
23 # define NDEBUG /* disable assertions */
24 #endif
25 #include <assert.h>
26
27 #include "io_lib/stdio_hack.h"
28 #include "io_lib/misc.h"
29 #include "io_lib/scf.h"
30 #include "io_lib/Read.h"
31 #include "io_lib/expFileIO.h"
32 #include "io_lib/traceType.h"
33 #include "io_lib/translate.h"
34 #include "io_lib/open_trace_file.h"
35 #ifdef USE_BIOLIMS
36 #include "spBiolims.h"
37 #endif
38
39
40
41 #ifdef IOLIB_SCF
42 /*
43 * Translates an Scf structure into a Read structure.
44 * The Scf structure is left unchanged.
45 *
46 * Returns:
47 * A pointer to an allocated Read structure upon success.
48 * NULLRead upon failure.
49 */
50 Read *scf2read(Scf *scf) {
51 Read *read;
52 register int i, i_end;
53 TRACE max_val = 0;
54 int sections = read_sections(0);
55 int nsamples = 0;
56 int nbases = 0;
57
58 /* allocate */
59 if (sections & READ_SAMPLES)
60 nsamples = scf->header.samples;
61 if (sections & READ_BASES)
62 nbases = scf->header.bases;
63 read = read_allocate(nsamples, nbases);
64
65 if (NULLRead == read)
66 return NULLRead;
67
68 if (sections & READ_SAMPLES) {
69 /* copy the samples */
70 i_end = scf->header.samples;
71 read->NPoints = i_end;
72
73 if (scf->header.sample_size == 1) {
74 for (i = 0; i < i_end; i++) {
75 read->traceA[i] = scf->samples.samples1[i].sample_A;
76 read->traceC[i] = scf->samples.samples1[i].sample_C;
77 read->traceG[i] = scf->samples.samples1[i].sample_G;
78 read->traceT[i] = scf->samples.samples1[i].sample_T;
79
80 if (read->traceA[i] > max_val) max_val = read->traceA[i];
81 if (read->traceC[i] > max_val) max_val = read->traceC[i];
82 if (read->traceG[i] > max_val) max_val = read->traceG[i];
83 if (read->traceT[i] > max_val) max_val = read->traceT[i];
84 }
85 } else { /* sample_size == 2 */
86 for (i = 0; i < i_end; i++) {
87 read->traceA[i] = scf->samples.samples2[i].sample_A;
88 read->traceC[i] = scf->samples.samples2[i].sample_C;
89 read->traceG[i] = scf->samples.samples2[i].sample_G;
90 read->traceT[i] = scf->samples.samples2[i].sample_T;
91
92 if (read->traceA[i] > max_val) max_val = read->traceA[i];
93 if (read->traceC[i] > max_val) max_val = read->traceC[i];
94 if (read->traceG[i] > max_val) max_val = read->traceG[i];
95 if (read->traceT[i] > max_val) max_val = read->traceT[i];
96 }
97 }
98
99 read->maxTraceVal = max_val;
100 }
101
102 if (sections & READ_BASES) {
103 /* copy the bases */
104 i_end = scf->header.bases;
105 read->NBases = i_end;
106
107 for (i = 0; i < i_end; i++) {
108 read->basePos[i] = scf->bases[i].peak_index;
109 read->prob_A[i] = scf->bases[i].prob_A;
110 read->prob_C[i] = scf->bases[i].prob_C;
111 read->prob_G[i] = scf->bases[i].prob_G;
112 read->prob_T[i] = scf->bases[i].prob_T;
113 read->base[i] = scf->bases[i].base;
114 }
115 read->base[i] = 0;
116 }
117
118 if (sections & READ_COMMENTS) {
119 /* allocate and copy the comments */
120 if (scf->header.comments_size > 0 && scf->comments) {
121 read->info = (char *)xmalloc(scf->header.comments_size+1);
122 if (NULL == read->info) {
123 read_deallocate(read);
124 return NULLRead;
125 }
126
127 memcpy(read->info, scf->comments, scf->header.comments_size);
128 read->info[scf->header.comments_size] = '\0';
129 }
130 }
131
132 /* other bits and pieces */
133 read->leftCutoff = scf->header.bases_left_clip;
134 read->rightCutoff = read->NBases - scf->header.bases_right_clip + 1;
135 read->format = TT_SCF;
136
137 if (scf->private_data) {
138 read->private_data = xmalloc(scf->header.private_size);
139 memcpy(read->private_data,scf->private_data, scf->header.private_size);
140 }
141
142 return read;
143 }
144
145 /*
146 * Translates a Read structure into a Scf structure.
147 * The Read structure is left unchanged.
148 *
149 * Returns:
150 * A pointer to an allocated Scf structure upon success.
151 * NULL upon failure.
152 */
153 Scf *read2scf(Read *read) {
154 Scf *scf;
155 register int i, i_end;
156 int sample_size;
157
158 /* allocate */
159 sample_size = read->maxTraceVal >= 0x100 ? 2 : 1;
160 scf = scf_allocate(read->NPoints, sample_size, read->NBases, 0, 0);
161 if (NULL == scf)
162 return NULL;
163
164 /* copy the samples */
165 i_end = read->NPoints;
166 scf->header.samples = i_end;
167
168 if (sample_size == 1) {
169 scf->header.sample_size = 1;
170 for (i = 0; i < i_end; i++) {
171 scf->samples.samples1[i].sample_A = (uint_1)read->traceA[i];
172 scf->samples.samples1[i].sample_C = (uint_1)read->traceC[i];
173 scf->samples.samples1[i].sample_G = (uint_1)read->traceG[i];
174 scf->samples.samples1[i].sample_T = (uint_1)read->traceT[i];
175 }
176 } else {
177 scf->header.sample_size = 2;
178 for (i = 0; i < i_end; i++) {
179 scf->samples.samples2[i].sample_A = read->traceA[i];
180 scf->samples.samples2[i].sample_C = read->traceC[i];
181 scf->samples.samples2[i].sample_G = read->traceG[i];
182 scf->samples.samples2[i].sample_T = read->traceT[i];
183 }
184 }
185
186 /* copy the bases */
187 i_end = read->NBases;
188 scf->header.bases = i_end;
189
190 for (i = 0; i < i_end; i++) {
191 scf->bases[i].peak_index = read->basePos ? read->basePos[i] : i;
192 scf->bases[i].prob_A = read->prob_A ? read->prob_A[i] : 0;
193 scf->bases[i].prob_C = read->prob_A ? read->prob_C[i] : 0;
194 scf->bases[i].prob_G = read->prob_A ? read->prob_G[i] : 0;
195 scf->bases[i].prob_T = read->prob_A ? read->prob_T[i] : 0;
196 scf->bases[i].base = read->base ? read->base[i] : '-';
197 }
198
199 /* allocate and copy the comments */
200 if (read->info) {
201 scf->header.comments_size = strlen(read->info) + 1;
202 scf->comments = (char *)xmalloc(scf->header.comments_size);
203 if (NULL == scf->comments) {
204 scf_deallocate(scf);
205 return NULL;
206 }
207
208 memcpy(scf->comments, read->info, scf->header.comments_size - 1);
209
210 /* just to make sure */
211 scf->comments[scf->header.comments_size-1] = '\0';
212 }
213
214 /* other bits and pieces */
215 scf->header.bases_left_clip = read->leftCutoff;
216 scf->header.bases_right_clip = read->NBases - read->rightCutoff + 1;
217 scf->header.code_set = CSET_DEFAULT;
218 memcpy(scf->header.version, scf_version_float2str(SCF_VERSION), 4);
219
220 return scf;
221 }
222 #endif /* IOLIB_SCF */
223
224
225 #ifdef IOLIB_EXP
226
227 #define extend(e, entry, len) \
228 do { \
229 (void)ArrayRef(e->entries[entry],e->Nentries[entry]++); \
230 if (NULL == (exp_get_entry(e, entry) = (char *)xmalloc(len))) \
231 return NULL; \
232 } while (0)
233
234 /*
235 * Translates a Read structure and an Experiment file.
236 * The Read structure is left unchanged.
237 *
238 * Returns:
239 * A pointer to an allocated Exp_info structure upon success.
240 * NULL upon failure.
241 */
242 Exp_info *read2exp(Read *read, char *EN) {
243 Exp_info *e;
244 char *t = trace_type_int2str(read->format), *p;
245 int l = strlen(EN)+1;
246 char *sq;
247 int i;
248 static char valid_bases[256];
249 static int valid_setup = 0;
250
251 if (!valid_setup) {
252 for (i = 0; i < 256; i++)
253 valid_bases[i] = '-';
254 /* IUBC codes */
255 for (sq = "acgturymkswbdhvnACGTURYMKSWBDHVN"; *sq; sq++)
256 valid_bases[(unsigned)*sq] = *sq;
257 valid_setup = 1;
258 }
259
260 if (NULL == (e = exp_create_info()))
261 return NULL;
262
263 /* Copy original exp file if present */
264 if (read->orig_trace && read->orig_trace_format == TT_EXP) {
265 int i, j, k;
266 Exp_info *re = (Exp_info *)read->orig_trace;
267
268 for (i = 0; i < MAXIMUM_EFLTS; i++) {
269 if (EFLT_SQ == i ||
270 EFLT_QL == i ||
271 EFLT_QR == i)
272 continue;
273
274 if (0 == (k = exp_Nentries(re, i)))
275 continue;
276
277 e->Nentries[i] = k;
278 ArrayRef(e->entries[i], e->Nentries[i]);
279 for (j = 0; j < k; j++) {
280 arr(char *, e->entries[i], j) =
281 strdup(arr(char *, re->entries[i], j));
282 }
283 }
284
285 /* Otherwise create our EN, ID, LN and LT lines */
286 } else {
287 /* Entry name and ID lines */
288 if (p = strrchr(EN, '/'))
289 EN = p+1;
290 extend(e, EFLT_EN, l);
291 sprintf(exp_get_entry(e, EFLT_EN), "%s", EN);
292 extend(e, EFLT_ID, l);
293 sprintf(exp_get_entry(e, EFLT_ID), "%s", EN);
294
295 /* Trace file & type */
296 if (read->trace_name) {
297 char *cp;
298 if (cp = strrchr(read->trace_name, '/'))
299 cp++;
300 else
301 cp = read->trace_name;
302 extend(e, EFLT_LN, strlen(cp)+1);
303 strcpy(exp_get_entry(e, EFLT_LN), cp);
304 }
305
306 if (read->format != TT_ANY && read->format != TT_ANYTR) {
307 extend(e, EFLT_LT, strlen(t)+1);
308 strcpy(exp_get_entry(e, EFLT_LT), t);
309 }
310 }
311
312 /* Output SQ, QL and QR lines */
313
314 /* Cutoffs */
315 if (read->leftCutoff) {
316 extend(e, EFLT_QL, 15);
317 sprintf(exp_get_entry(e, EFLT_QL), "%d", read->leftCutoff);
318 }
319
320 if (read->rightCutoff && read->rightCutoff != read->NBases+1) {
321 extend(e, EFLT_QR, 15);
322 sprintf(exp_get_entry(e, EFLT_QR), "%d", read->rightCutoff);
323 }
324
325 /* Bases */
326 extend(e, EFLT_SQ, read->NBases+1);
327 sq = exp_get_entry(e, EFLT_SQ);
328 for (i = 0; i < read->NBases; i++) {
329 sq[i] = valid_bases[(unsigned)read->base[i]];
330 }
331 sq[i] = 0;
332
333 #ifdef USE_BIOLIMS
334 /*
335 * Johnt:
336 * - Added tags below to allow for biolims update
337 * - This is all a very big bodge to allow BioLIMS
338 * attributes to be passed through the Read structure
339 * to the Experiment file
340 * - Any changes to this should also be mirrored in ../biolims/Exp.cpp
341 */
342 {
343 int1 *qa; /* quality array */
344 int i;
345 char tmp[1024];
346 char *line; /* current line from info */
347 char *end; /* end of current line */
348
349 /* AV only supports a single prob value for now */
350 qa = (int1 *)malloc((read->NBases+1)*sizeof(int1));
351
352 /* need max 4 bytes per value - see conf2str */
353 extend(e,EFLT_AV,(read->NBases+1)*5);
354
355 /* merge into single quality array */
356 for(i=0;i<read->NBases;i++){
357 switch(read->base[i]){
358 case 'a':
359 case 'A':
360 qa[i] = read->prob_A[i];
361 break;
362 case 'c':
363 case 'C':
364 qa[i] = read->prob_C[i];
365 break;
366 case 'g':
367 case 'G':
368 qa[i] = read->prob_G[i];
369 break;
370 case 't':
371 case 'T':
372 qa[i] = read->prob_T[i];
373 break;
374 default:
375 qa[i] = 0;
376 }
377 }
378 conf2str(qa,read->NBases,exp_get_entry(e, EFLT_AV));
379 free(qa);
380
381 /* Parse the read notes for everything else */
382 if( read->info) {
383 for(line=read->info;end=strchr(line,'\n');line=end+1){
384 *end='\0'; /* * put back */
385
386 /* look for known tags */
387 if(!strncmp(line,EXP_CHEM,EXP_TAGLEN)){
388 /* CH */
389 int chem=0;
390 extend(e, EFLT_CH, 15);
391 if( !strcmp(CH_types[1],line+EXP_TAGLEN))
392 chem=1;
393 sprintf(exp_get_entry(e, EFLT_CH), "%d",chem );
394
395 } else if(!strncmp(line,EXP_PRMR,EXP_TAGLEN)){
396 /* PR */
397 int primer=0;
398 extend(e,EFLT_PR,15);
399 for(primer=1;primer<nPR_types;primer++)
400 if(!strcmp(PR_types[i],line+EXP_TAGLEN))
401 break;
402 if(primer>=nPR_types)
403 primer=1;
404 sprintf(exp_get_entry(e, EFLT_PR), "%d",primer);
405
406 } else if(!strncmp(line,EXP_VECT,EXP_TAGLEN)){
407 /* SV */
408 extend(e,EFLT_SV,strlen(line)-EXP_TAGLEN+1);
409 strcpy(exp_get_entry(e, EFLT_SV),line+EXP_TAGLEN);
410
411 } else if(!strncmp(line,EXP_CLOV,EXP_TAGLEN)){
412 /* CV */
413 extend(e,EFLT_CV,strlen(line)-EXP_TAGLEN+1);
414 strcpy(exp_get_entry(e, EFLT_CV),line+EXP_TAGLEN);
415
416 } else if(!strncmp(line,EXP_CLON,EXP_TAGLEN)){
417 /* CN */
418 extend(e,EFLT_CN,strlen(line)-EXP_TAGLEN+1);
419 strcpy(exp_get_entry(e, EFLT_CN),line+EXP_TAGLEN);
420
421 } else if(!strncmp(line,EXP_FEAT,EXP_TAGLEN)){
422 /* FEAT=start stop key\r\rcomment */
423 /* key and comment have \n encoded as \r */
424 int start,stop,i;
425 char *key; /* biolims feature key */
426 char *comment; /* biolims feature comment */
427 line+=EXP_TAGLEN;
428 start=atoi(line);
429 line=strchr(line,' ')+1;
430 stop=atoi(line);
431 key=strchr(line,' ')+1;
432 comment=strstr(key,"\r\r");
433 *comment='\0'; /* * put back */
434 comment+=2;
435 /* replace \r with \n in key and comment */
436 for(i=0;key[i];i++)
437 if(key[i]=='\r') key[i]='\n';
438 for(i=0;comment[i];i++)
439 if(comment[i]=='\r') comment[i]='\n';
440 /* could possibly be one of a number of EXP tags excoded
441 as a BioLIMS feature */
442 if(!strncmp(key,featCLON,STADEN_FKEY_LEN)){
443 /* CS */
444 extend(e, EFLT_CS, 32);
445 exp_create_range(exp_get_entry(e, EFLT_CS),start,stop);
446 } else if(!strncmp(key,featVECI,STADEN_FKEY_LEN)){
447 /* SI */
448 extend(e, EFLT_SI, 32);
449 exp_create_range(exp_get_entry(e, EFLT_SI),start,stop);
450 } else if(!strncmp(key,featTEMP,STADEN_FKEY_LEN)){
451 /* TN */
452 extend(e, EFLT_TN, strlen(comment)+1);
453 strcpy(exp_get_entry(e, EFLT_TN),comment);
454 } else if(!strncmp(key,featSTRD,STADEN_FKEY_LEN)){
455 /* ST */
456 extend(e, EFLT_ST, strlen(comment)+1);
457 strcpy(exp_get_entry(e, EFLT_ST),comment);
458 } else if( !strncmp(key,featVECT,STADEN_FKEY_LEN)){
459 /* SL and SR */
460 extend(e,EFLT_SL,15);
461 extend(e,EFLT_SR,15);
462 sprintf(exp_get_entry(e, EFLT_SL),"%d",start);
463 sprintf(exp_get_entry(e, EFLT_SR),"%d",stop);
464 } else if( !strncmp(key,featGELR,STADEN_FKEY_LEN)){
465 /* TG */
466 char tag[5]; /* staden note tag (always 4 chars) */
467 char strand=*comment; /* first char of comment */
468 /* key has format STADEN_GELR:XXXX -
469 where XXXX is staden note tag */
470 strncpy(tag,key+STADEN_FKEY_LEN+1,4);
471 tag[4]='\0';
472 comment+=2; /* skip over strand */
473 sprintf(tmp,"%s %c %d..%d\n%s",
474 key,strand,start,stop,comment);
475 extend(e,EFLT_TG,strlen(tmp)+1);
476 strcpy(exp_get_entry(e, EFLT_TG),tmp);
477
478 } else if( !strncmp(key,featCONS,STADEN_FKEY_LEN)){
479 /* TC */
480 char tag[5]; /* staden note tag (always 4 chars) */
481 /* comment has the format Srstart-rend\ncomment
482 S is a single character strand indicator
483 rstart and rend are the real start and end of the
484 tag
485 */
486 char strand=*comment; /* first char of comment */
487 char *rangestart = comment+2; /*skip over strand*/
488 char *rangeend;
489 char *emptystring="";
490
491 /* key has format STADEN_CONS:XXXX -
492 where XXXX is staden note tag */
493 strncpy(tag,key+STADEN_FKEY_LEN+1,4);
494 tag[4]='\0';
495 /* the REAL range might actually be outside the bases,
496 so this is recorded in the first line of
497 the comment. This is merged with the
498 feature range to allow for complimenting */
499 comment=strchr(rangestart,'\n');
500 if( comment )
501 *(comment++)='\0'; /* *** put back */
502 else
503 comment=emptystring;
504
505 /* now special processing of bounds */
506 rangeend=strchr(rangestart,'-');
507 if( rangeend ){
508 long rstart,rend;
509 *(rangeend++)='\0'; /* *** put back */
510 rstart=atol(rangestart);
511 rend=atol(rangeend);
512 *(rangeend-1)='-';
513
514 /* if start is the same as rstart, just
515 need to extend the stop bounds to the
516 real end If not there has been some
517 complimenting happening so need to
518 extend the start bounds, as its now
519 backwards. */
520 if( start==rstart)
521 stop=start+rend-rstart;
522 else
523 start=stop-rend+rstart;
524 } else {
525 /* don't have a range so just use the
526 feature size */
527 }
528
529 sprintf(tmp,"%s %c %d..%d\n%s",
530 key,strand,start,stop,comment);
531 extend(e,EFLT_TC,strlen(tmp)+1);
532 strcpy(exp_get_entry(e, EFLT_TC),tmp);
533 if( comment!=emptystring)
534 *(comment-1)='\n';
535
536 } else {
537 /* TG */
538 /* a biolims feature, encode this with tag
539 BIOL, and the Biolims Feature key in
540 the first line of the comment*/
541 sprintf(tmp,"%s = %d..%d\n%s%s\n%s",/* use strand = */
542 tagBIOL,start,stop,featKey,key,comment);
543 extend(e,EFLT_TG,strlen(tmp)+1);
544 strcpy(exp_get_entry(e, EFLT_TG),tmp);
545 }
546 *(comment-2)='\r';
547 }
548 /* else unused value */
549
550 *end='\n';
551 }
552 }
553 }
554 #else /* USE_BIOLIMS */
555 /* Confidence values */
556 if (read->prob_A && read->prob_C && read->prob_G && read->prob_T &&
557 read->NBases > 0) {
558 /* We have some, but are they non zero values? */
559 for (i = 0; i < read->NBases; i++) {
560 if (read->prob_A[i] || read->prob_C[i] ||
561 read->prob_G[i] || read->prob_T[i])
562 break;
563 }
564 if (i != read->NBases) {
565 int1 *conf = (int1 *)xmalloc(read->NBases);
566 char *cstr = (char *)xmalloc(read->NBases * 5 + 2);
567
568 for (i = 0; i < read->NBases; i++) {
569 switch (read->base[i]) {
570 case 'a':
571 case 'A':
572 conf[i] = read->prob_A[i];
573 break;
574 case 'c':
575 case 'C':
576 conf[i] = read->prob_C[i];
577 break;
578 case 'g':
579 case 'G':
580 conf[i] = read->prob_G[i];
581 break;
582 case 't':
583 case 'T':
584 conf[i] = read->prob_T[i];
585 break;
586 default:
587 conf[i] = (read->prob_A[i] + read->prob_C[i] +
588 read->prob_G[i] + read->prob_T[i]) / 4;
589 }
590 }
591
592 conf2str(conf, read->NBases, cstr);
593 extend(e, EFLT_AV, strlen(cstr)+1);
594 sprintf(exp_get_entry(e, EFLT_AV), "%s", cstr);
595 xfree(conf);
596 xfree(cstr);
597 }
598 }
599 #endif /* if USE_BIOLIMS else ... */
600
601 return e;
602 }
603
604
605 /*
606 * Controls the use of the SQ and ON lines when loading an experiment file.
607 * The default (value&1 == 1) is to load these into the Read structure.
608 * With value&1 == 0 we load the sequence directly from the trace file
609 * (LT line).
610 * value&2 controls whether to use the SL/SR fields when setting the cutoff.
611 * value&2 == 0 implies to do so, and value&2 == 2 implies to not.
612 *
613 * The default use is to use the SQ and ON lines. Returns the old value.
614 */
615 static int use_experiment_sequence = 1;
616 int read_experiment_redirect(int value) {
617 int old = use_experiment_sequence;
618
619 use_experiment_sequence = value;
620 return old;
621 }
622
623
624 /*
625 * Translates an experiment file to a Read structure.
626 * The Exp_info structure is left unchanged.
627 *
628 * Returns:
629 * A pointer to an allocated Read structure upon success.
630 * NULLRead upon failure.
631 */
632 Read *exp2read(Exp_info *e, char *fn) {
633 Read *r;
634 int q, s, ttype, err = 0;
635 char *str;
636 int use_exp = use_experiment_sequence;
637 FILE *fp;
638
639 if (!exp_Nentries(e, EFLT_LN)) {
640 err = 1;
641 } else {
642 /* Read the trace component of the experiment file */
643 ttype = exp_Nentries(e,EFLT_LT)
644 ? trace_type_str2int(exp_get_entry(e, EFLT_LT)) : TT_ANYTR;
645
646 if (fp = open_trace_file(exp_get_entry(e, EFLT_LN), fn)) {
647 if (NULLRead == (r = fread_reading(fp, NULL, ttype)))
648 err = 1;
649 } else {
650 err = 1;
651 }
652 }
653
654 if (err) {
655 use_exp = 1;
656 r = read_allocate(0, 1);
657 }
658
659 /* Set the left cutoff (QL / SL) */
660 q=-1;
661 if (exp_Nentries(e, EFLT_QL))
662 q = atoi(exp_get_entry(e, EFLT_QL));
663 if ((use_exp&2) != 2) {
664 s=-1;
665 if (exp_Nentries(e, EFLT_SL))
666 s = atoi(exp_get_entry(e, EFLT_SL));
667 if (q != -1 || s != -1)
668 r->leftCutoff = MAX(q, s);
669 } else {
670 r->leftCutoff = q != -1 ? q : 0;
671 }
672
673 /* Set the right cutoff (QR / SR) */
674 q = INT_MAX;
675 if (exp_Nentries(e, EFLT_QR))
676 q = atoi(exp_get_entry(e, EFLT_QR));
677 if ((use_exp&2) != 2) {
678 s = INT_MAX;
679 if (exp_Nentries(e, EFLT_SR))
680 s = atoi(exp_get_entry(e, EFLT_SR));
681 if (q != INT_MAX || s != INT_MAX)
682 r->rightCutoff = MIN(q, s);
683 } else {
684 r->rightCutoff = q != INT_MAX ? q : 0;
685 }
686
687 if (r->rightCutoff && r->rightCutoff <= r->leftCutoff)
688 r->rightCutoff = r->leftCutoff+1;
689
690 /* Bases and base positions, if desired */
691 if (use_exp&1) {
692 if (exp_Nentries(e, EFLT_SQ) && (str = exp_get_entry(e, EFLT_SQ))) {
693 int slen = strlen(str);
694
695 if (NULL == (r->base = (char *)xrealloc(r->base, slen+1)))
696 return NULLRead;
697 if (NULL == (r->prob_A = (char *)xrealloc(r->prob_A, slen+1)))
698 return NULLRead;
699 if (NULL == (r->prob_C = (char *)xrealloc(r->prob_C, slen+1)))
700 return NULLRead;
701 if (NULL == (r->prob_G = (char *)xrealloc(r->prob_G, slen+1)))
702 return NULLRead;
703 if (NULL == (r->prob_T = (char *)xrealloc(r->prob_T, slen+1)))
704 return NULLRead;
705 if (r->basePos) {
706 xfree(r->basePos);
707 r->basePos = NULL;
708 }
709
710 /* Clear them */
711 memset(r->prob_A, 0, slen);
712 memset(r->prob_C, 0, slen);
713 memset(r->prob_G, 0, slen);
714 memset(r->prob_T, 0, slen);
715
716 strcpy(r->base, str);
717 r->NBases = slen;
718
719 /* Copy AV values into prob_* arrays */
720 if (exp_Nentries(e, EFLT_AV) &&
721 (str = exp_get_entry(e, EFLT_AV))) {
722 int1 *conf = (int1 *)xmalloc((slen+1)*sizeof(*conf));
723 int i;
724
725 str2conf(conf, slen, str);
726 for (i = 0; i < slen ; i++) {
727 switch(r->base[i]) {
728 case 'a':
729 case 'A':
730 r->prob_A[i] = conf[i];
731 r->prob_C[i] = 0;
732 r->prob_G[i] = 0;
733 r->prob_T[i] = 0;
734 break;
735 case 'c':
736 case 'C':
737 r->prob_A[i] = 0;
738 r->prob_C[i] = conf[i];
739 r->prob_G[i] = 0;
740 r->prob_T[i] = 0;
741 break;
742 case 'g':
743 case 'G':
744 r->prob_A[i] = 0;
745 r->prob_C[i] = 0;
746 r->prob_G[i] = conf[i];
747 r->prob_T[i] = 0;
748 break;
749 case 't':
750 case 'T':
751 r->prob_A[i] = 0;
752 r->prob_C[i] = 0;
753 r->prob_G[i] = 0;
754 r->prob_T[i] = conf[i];
755 break;
756 default:
757 r->prob_A[i] = conf[i];
758 r->prob_C[i] = conf[i];
759 r->prob_G[i] = conf[i];
760 r->prob_T[i] = conf[i];
761 break;
762 }
763 }
764
765 xfree(conf);
766 } else {
767 memset(r->prob_A, 0, slen * sizeof(r->prob_A[0]));
768 memset(r->prob_C, 0, slen * sizeof(r->prob_C[0]));
769 memset(r->prob_G, 0, slen * sizeof(r->prob_G[0]));
770 memset(r->prob_T, 0, slen * sizeof(r->prob_T[0]));
771 }
772 }
773
774 r->format = TT_EXP;
775 }
776
777 r->orig_trace = e;
778 r->orig_trace_format = TT_EXP;
779 r->orig_trace_free = (void (*)(void *))exp_destroy_info;
780
781 return r;
782 }
783 #endif /* IOLIB_EXP */
784
785
786
787 /*
788 * Takes an original read structure and a set of edit change arrays and
789 * produces a new base position array incorporating all the edits. For
790 * insertions, interpolation is used to derive a suitable sample position.
791 *
792 * INPUTS:
793 *
794 * Read *r = The original unedited read structure
795 * int Comp = 0=Normal sequence, 1=Complemented sequence
796 * int Ned = Length of edited arrays to follow
797 * char *edBases = Sequence of base characters incorporating ins/del edits
798 * uint_2 *edPos = Corresponding original base numbers, 0 indicates an
799 * insertion. Base numbers start at 1.
800 *
801 * OUTPUTS:
802 *
803 * This array is assumed to be empty with an allocated length of Ned elements.
804 *
805 * uint_2* basePos = Base positions in samples
806 */
807
808 void read_update_base_positions( Read *r, int Comp, int Ned, char *edBases,
809 int_2 *edPos, uint_2 *basePos )
810 {
811 int i, j;
812 int gap;
813 int delta;
814 int o_N;
815 int o_NPoints;
816 uint_2* o_basePos;
817 int start;
818 int end;
819
820
821
822 /* Check input */
823 assert(r);
824 assert(edBases);
825 assert(edPos);
826 assert(basePos);
827 assert(Ned>0);
828 if( (Ned<=0) || !r || !edBases || !edPos || !basePos )
829 return;
830
831
832
833 /* Original sequence data */
834 o_N = r->NBases;
835 o_NPoints = r->NPoints;
836 o_basePos = r->basePos;
837
838
839
840 /* Copy original base positions */
841 for( i=0; i<Ned; i++ )
842 {
843 /* If inserted base, set position to zero */
844 if( edPos[i] == 0 )
845 {
846 basePos[i] = 0;
847 }
848 else
849 {
850 /* Get original position taking into account complementation */
851 basePos[i] = o_basePos[ Comp ? o_N-edPos[i]+1-1 : edPos[i]-1 ];
852 }
853 }
854
855
856
857 /* Do linear interpolation to create positions for inserted bases */
858 for( i=0; i<Ned; i++ )
859 {
860 /* Search for start */
861 while( (basePos[i]!=0) && (i<Ned) )
862 i++;
863 start = (i==0) ? 0 : basePos[i-1];
864
865
866
867 /* Search for end */
868 gap = 0;
869 while( (basePos[i]==0) && (i<Ned) )
870 {
871 gap++;
872 i++;
873 }
874 if( i == Ned )
875 {
876 if( gap == 0 )
877 return;
878 end = o_NPoints;
879 }
880 else
881 {
882 end = basePos[i];
883 }
884
885
886
887 /* Do the interpolation */
888 delta = (end - start) / (gap + 1);
889 for( j=i-gap; j<i; j++ )
890 {
891 /* Watch out for insertions at start (j==0) */
892 if( j==0 )
893 basePos[j] = delta;
894 else
895 basePos[j] = basePos[j-1] + delta;
896 }
897 }
898 }
899
900
901
902 /*
903 * Takes a set of edit change arrays and produces a new set of confidence
904 * arrays incorporating all the edits.
905 *
906 * INPUTS:
907 *
908 * int Ned = Length of edited arrays to follow
909 * char* edBases = Sequence of base characters incorporating ins/del edits
910 * int1* edConf = Corresponding confidence values, 100 for insertions
911 *
912 *
913 * OUTPUTS:
914 *
915 * These output arrays are assumed to be empty with an allocated length
916 * of Ned elements each. The names and types are identical to the same
917 * elements in the Read structure.
918 *
919 * char* prob_A = Base confidence A
920 * char* prob_C = Base confidence C
921 * char* prob_G = Base confidence G
922 * char* prob_T = Base confidence T
923 *
924 */
925
926 void read_update_confidence_values( int Ned, char* edBases, int1* edConf,
927 char* prob_A, char* prob_C, char* prob_G, char* prob_T )
928 {
929 int i;
930 char c;
931
932
933
934 /* Check input */
935 assert(Ned>0);
936 assert(edBases);
937 assert(edConf);
938 assert(prob_A);
939 assert(prob_C);
940 assert(prob_G);
941 assert(prob_T);
942 if( (Ned<=0) || !edBases || !edConf || !prob_A || !prob_C || !prob_G || !prob_T )
943 return;
944
945
946
947 /* Copy over confidence values */
948 for( i=0; i<Ned; i++ )
949 {
950 c = (char) edConf[i];
951 switch( edBases[i] )
952 {
953 case 'A':
954 case 'a':
955 prob_A[i] = c;
956 prob_C[i] = 0;
957 prob_G[i] = 0;
958 prob_T[i] = 0;
959 break;
960
961
962 case 'C':
963 case 'c':
964 prob_A[i] = 0;
965 prob_C[i] = c;
966 prob_G[i] = 0;
967 prob_T[i] = 0;
968 break;
969
970 case 'G':
971 case 'g':
972 prob_A[i] = 0;
973 prob_C[i] = 0;
974 prob_G[i] = c;
975 prob_T[i] = 0;
976 break;
977
978 case 'T':
979 case 't':
980 prob_A[i] = 0;
981 prob_C[i] = 0;
982 prob_G[i] = 0;
983 prob_T[i] = c;
984 break;
985
986 default:
987 prob_A[i] = c;
988 prob_C[i] = c;
989 prob_G[i] = c;
990 prob_T[i] = c;
991 break;
992 }
993 }
994 }
995
996
997
998 int read_sections(int new_sec) {
999 static int sections = READ_ALL;
1000
1001 if (new_sec)
1002 sections = new_sec;
1003
1004 return sections;
1005 }