Mercurial > repos > dawe > srf2fastq
comparison srf2fastq/io_lib-1.12.2/io_lib/translate.c @ 0:d901c9f41a6a default tip
Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author | dawe |
---|---|
date | Tue, 07 Jun 2011 17:48:05 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d901c9f41a6a |
---|---|
1 /* | |
2 * Copyright (c) Medical Research Council 1994. All rights reserved. | |
3 * | |
4 * Permission to use, copy, modify and distribute this software and its | |
5 * documentation for any purpose is hereby granted without fee, provided that | |
6 * this copyright and notice appears in all copies. | |
7 * | |
8 * This file was written by James Bonfield, Simon Dear, Rodger Staden, | |
9 * as part of the Staden Package at the MRC Laboratory of Molecular | |
10 * Biology, Hills Road, Cambridge, CB2 2QH, United Kingdom. | |
11 * | |
12 * MRC disclaims all warranties with regard to this software. | |
13 */ | |
14 | |
15 /* | |
16 * File: translate.c | |
17 * Purpose: Translates between different reading structures. | |
18 * Last update: 01/09/94 | |
19 */ | |
20 | |
21 #include <stdio.h> | |
22 #ifndef NDEBUG | |
23 # define NDEBUG /* disable assertions */ | |
24 #endif | |
25 #include <assert.h> | |
26 | |
27 #include "io_lib/stdio_hack.h" | |
28 #include "io_lib/misc.h" | |
29 #include "io_lib/scf.h" | |
30 #include "io_lib/Read.h" | |
31 #include "io_lib/expFileIO.h" | |
32 #include "io_lib/traceType.h" | |
33 #include "io_lib/translate.h" | |
34 #include "io_lib/open_trace_file.h" | |
35 #ifdef USE_BIOLIMS | |
36 #include "spBiolims.h" | |
37 #endif | |
38 | |
39 | |
40 | |
41 #ifdef IOLIB_SCF | |
42 /* | |
43 * Translates an Scf structure into a Read structure. | |
44 * The Scf structure is left unchanged. | |
45 * | |
46 * Returns: | |
47 * A pointer to an allocated Read structure upon success. | |
48 * NULLRead upon failure. | |
49 */ | |
50 Read *scf2read(Scf *scf) { | |
51 Read *read; | |
52 register int i, i_end; | |
53 TRACE max_val = 0; | |
54 int sections = read_sections(0); | |
55 int nsamples = 0; | |
56 int nbases = 0; | |
57 | |
58 /* allocate */ | |
59 if (sections & READ_SAMPLES) | |
60 nsamples = scf->header.samples; | |
61 if (sections & READ_BASES) | |
62 nbases = scf->header.bases; | |
63 read = read_allocate(nsamples, nbases); | |
64 | |
65 if (NULLRead == read) | |
66 return NULLRead; | |
67 | |
68 if (sections & READ_SAMPLES) { | |
69 /* copy the samples */ | |
70 i_end = scf->header.samples; | |
71 read->NPoints = i_end; | |
72 | |
73 if (scf->header.sample_size == 1) { | |
74 for (i = 0; i < i_end; i++) { | |
75 read->traceA[i] = scf->samples.samples1[i].sample_A; | |
76 read->traceC[i] = scf->samples.samples1[i].sample_C; | |
77 read->traceG[i] = scf->samples.samples1[i].sample_G; | |
78 read->traceT[i] = scf->samples.samples1[i].sample_T; | |
79 | |
80 if (read->traceA[i] > max_val) max_val = read->traceA[i]; | |
81 if (read->traceC[i] > max_val) max_val = read->traceC[i]; | |
82 if (read->traceG[i] > max_val) max_val = read->traceG[i]; | |
83 if (read->traceT[i] > max_val) max_val = read->traceT[i]; | |
84 } | |
85 } else { /* sample_size == 2 */ | |
86 for (i = 0; i < i_end; i++) { | |
87 read->traceA[i] = scf->samples.samples2[i].sample_A; | |
88 read->traceC[i] = scf->samples.samples2[i].sample_C; | |
89 read->traceG[i] = scf->samples.samples2[i].sample_G; | |
90 read->traceT[i] = scf->samples.samples2[i].sample_T; | |
91 | |
92 if (read->traceA[i] > max_val) max_val = read->traceA[i]; | |
93 if (read->traceC[i] > max_val) max_val = read->traceC[i]; | |
94 if (read->traceG[i] > max_val) max_val = read->traceG[i]; | |
95 if (read->traceT[i] > max_val) max_val = read->traceT[i]; | |
96 } | |
97 } | |
98 | |
99 read->maxTraceVal = max_val; | |
100 } | |
101 | |
102 if (sections & READ_BASES) { | |
103 /* copy the bases */ | |
104 i_end = scf->header.bases; | |
105 read->NBases = i_end; | |
106 | |
107 for (i = 0; i < i_end; i++) { | |
108 read->basePos[i] = scf->bases[i].peak_index; | |
109 read->prob_A[i] = scf->bases[i].prob_A; | |
110 read->prob_C[i] = scf->bases[i].prob_C; | |
111 read->prob_G[i] = scf->bases[i].prob_G; | |
112 read->prob_T[i] = scf->bases[i].prob_T; | |
113 read->base[i] = scf->bases[i].base; | |
114 } | |
115 read->base[i] = 0; | |
116 } | |
117 | |
118 if (sections & READ_COMMENTS) { | |
119 /* allocate and copy the comments */ | |
120 if (scf->header.comments_size > 0 && scf->comments) { | |
121 read->info = (char *)xmalloc(scf->header.comments_size+1); | |
122 if (NULL == read->info) { | |
123 read_deallocate(read); | |
124 return NULLRead; | |
125 } | |
126 | |
127 memcpy(read->info, scf->comments, scf->header.comments_size); | |
128 read->info[scf->header.comments_size] = '\0'; | |
129 } | |
130 } | |
131 | |
132 /* other bits and pieces */ | |
133 read->leftCutoff = scf->header.bases_left_clip; | |
134 read->rightCutoff = read->NBases - scf->header.bases_right_clip + 1; | |
135 read->format = TT_SCF; | |
136 | |
137 if (scf->private_data) { | |
138 read->private_data = xmalloc(scf->header.private_size); | |
139 memcpy(read->private_data,scf->private_data, scf->header.private_size); | |
140 } | |
141 | |
142 return read; | |
143 } | |
144 | |
145 /* | |
146 * Translates a Read structure into a Scf structure. | |
147 * The Read structure is left unchanged. | |
148 * | |
149 * Returns: | |
150 * A pointer to an allocated Scf structure upon success. | |
151 * NULL upon failure. | |
152 */ | |
153 Scf *read2scf(Read *read) { | |
154 Scf *scf; | |
155 register int i, i_end; | |
156 int sample_size; | |
157 | |
158 /* allocate */ | |
159 sample_size = read->maxTraceVal >= 0x100 ? 2 : 1; | |
160 scf = scf_allocate(read->NPoints, sample_size, read->NBases, 0, 0); | |
161 if (NULL == scf) | |
162 return NULL; | |
163 | |
164 /* copy the samples */ | |
165 i_end = read->NPoints; | |
166 scf->header.samples = i_end; | |
167 | |
168 if (sample_size == 1) { | |
169 scf->header.sample_size = 1; | |
170 for (i = 0; i < i_end; i++) { | |
171 scf->samples.samples1[i].sample_A = (uint_1)read->traceA[i]; | |
172 scf->samples.samples1[i].sample_C = (uint_1)read->traceC[i]; | |
173 scf->samples.samples1[i].sample_G = (uint_1)read->traceG[i]; | |
174 scf->samples.samples1[i].sample_T = (uint_1)read->traceT[i]; | |
175 } | |
176 } else { | |
177 scf->header.sample_size = 2; | |
178 for (i = 0; i < i_end; i++) { | |
179 scf->samples.samples2[i].sample_A = read->traceA[i]; | |
180 scf->samples.samples2[i].sample_C = read->traceC[i]; | |
181 scf->samples.samples2[i].sample_G = read->traceG[i]; | |
182 scf->samples.samples2[i].sample_T = read->traceT[i]; | |
183 } | |
184 } | |
185 | |
186 /* copy the bases */ | |
187 i_end = read->NBases; | |
188 scf->header.bases = i_end; | |
189 | |
190 for (i = 0; i < i_end; i++) { | |
191 scf->bases[i].peak_index = read->basePos ? read->basePos[i] : i; | |
192 scf->bases[i].prob_A = read->prob_A ? read->prob_A[i] : 0; | |
193 scf->bases[i].prob_C = read->prob_A ? read->prob_C[i] : 0; | |
194 scf->bases[i].prob_G = read->prob_A ? read->prob_G[i] : 0; | |
195 scf->bases[i].prob_T = read->prob_A ? read->prob_T[i] : 0; | |
196 scf->bases[i].base = read->base ? read->base[i] : '-'; | |
197 } | |
198 | |
199 /* allocate and copy the comments */ | |
200 if (read->info) { | |
201 scf->header.comments_size = strlen(read->info) + 1; | |
202 scf->comments = (char *)xmalloc(scf->header.comments_size); | |
203 if (NULL == scf->comments) { | |
204 scf_deallocate(scf); | |
205 return NULL; | |
206 } | |
207 | |
208 memcpy(scf->comments, read->info, scf->header.comments_size - 1); | |
209 | |
210 /* just to make sure */ | |
211 scf->comments[scf->header.comments_size-1] = '\0'; | |
212 } | |
213 | |
214 /* other bits and pieces */ | |
215 scf->header.bases_left_clip = read->leftCutoff; | |
216 scf->header.bases_right_clip = read->NBases - read->rightCutoff + 1; | |
217 scf->header.code_set = CSET_DEFAULT; | |
218 memcpy(scf->header.version, scf_version_float2str(SCF_VERSION), 4); | |
219 | |
220 return scf; | |
221 } | |
222 #endif /* IOLIB_SCF */ | |
223 | |
224 | |
225 #ifdef IOLIB_EXP | |
226 | |
227 #define extend(e, entry, len) \ | |
228 do { \ | |
229 (void)ArrayRef(e->entries[entry],e->Nentries[entry]++); \ | |
230 if (NULL == (exp_get_entry(e, entry) = (char *)xmalloc(len))) \ | |
231 return NULL; \ | |
232 } while (0) | |
233 | |
234 /* | |
235 * Translates a Read structure and an Experiment file. | |
236 * The Read structure is left unchanged. | |
237 * | |
238 * Returns: | |
239 * A pointer to an allocated Exp_info structure upon success. | |
240 * NULL upon failure. | |
241 */ | |
242 Exp_info *read2exp(Read *read, char *EN) { | |
243 Exp_info *e; | |
244 char *t = trace_type_int2str(read->format), *p; | |
245 int l = strlen(EN)+1; | |
246 char *sq; | |
247 int i; | |
248 static char valid_bases[256]; | |
249 static int valid_setup = 0; | |
250 | |
251 if (!valid_setup) { | |
252 for (i = 0; i < 256; i++) | |
253 valid_bases[i] = '-'; | |
254 /* IUBC codes */ | |
255 for (sq = "acgturymkswbdhvnACGTURYMKSWBDHVN"; *sq; sq++) | |
256 valid_bases[(unsigned)*sq] = *sq; | |
257 valid_setup = 1; | |
258 } | |
259 | |
260 if (NULL == (e = exp_create_info())) | |
261 return NULL; | |
262 | |
263 /* Copy original exp file if present */ | |
264 if (read->orig_trace && read->orig_trace_format == TT_EXP) { | |
265 int i, j, k; | |
266 Exp_info *re = (Exp_info *)read->orig_trace; | |
267 | |
268 for (i = 0; i < MAXIMUM_EFLTS; i++) { | |
269 if (EFLT_SQ == i || | |
270 EFLT_QL == i || | |
271 EFLT_QR == i) | |
272 continue; | |
273 | |
274 if (0 == (k = exp_Nentries(re, i))) | |
275 continue; | |
276 | |
277 e->Nentries[i] = k; | |
278 ArrayRef(e->entries[i], e->Nentries[i]); | |
279 for (j = 0; j < k; j++) { | |
280 arr(char *, e->entries[i], j) = | |
281 strdup(arr(char *, re->entries[i], j)); | |
282 } | |
283 } | |
284 | |
285 /* Otherwise create our EN, ID, LN and LT lines */ | |
286 } else { | |
287 /* Entry name and ID lines */ | |
288 if (p = strrchr(EN, '/')) | |
289 EN = p+1; | |
290 extend(e, EFLT_EN, l); | |
291 sprintf(exp_get_entry(e, EFLT_EN), "%s", EN); | |
292 extend(e, EFLT_ID, l); | |
293 sprintf(exp_get_entry(e, EFLT_ID), "%s", EN); | |
294 | |
295 /* Trace file & type */ | |
296 if (read->trace_name) { | |
297 char *cp; | |
298 if (cp = strrchr(read->trace_name, '/')) | |
299 cp++; | |
300 else | |
301 cp = read->trace_name; | |
302 extend(e, EFLT_LN, strlen(cp)+1); | |
303 strcpy(exp_get_entry(e, EFLT_LN), cp); | |
304 } | |
305 | |
306 if (read->format != TT_ANY && read->format != TT_ANYTR) { | |
307 extend(e, EFLT_LT, strlen(t)+1); | |
308 strcpy(exp_get_entry(e, EFLT_LT), t); | |
309 } | |
310 } | |
311 | |
312 /* Output SQ, QL and QR lines */ | |
313 | |
314 /* Cutoffs */ | |
315 if (read->leftCutoff) { | |
316 extend(e, EFLT_QL, 15); | |
317 sprintf(exp_get_entry(e, EFLT_QL), "%d", read->leftCutoff); | |
318 } | |
319 | |
320 if (read->rightCutoff && read->rightCutoff != read->NBases+1) { | |
321 extend(e, EFLT_QR, 15); | |
322 sprintf(exp_get_entry(e, EFLT_QR), "%d", read->rightCutoff); | |
323 } | |
324 | |
325 /* Bases */ | |
326 extend(e, EFLT_SQ, read->NBases+1); | |
327 sq = exp_get_entry(e, EFLT_SQ); | |
328 for (i = 0; i < read->NBases; i++) { | |
329 sq[i] = valid_bases[(unsigned)read->base[i]]; | |
330 } | |
331 sq[i] = 0; | |
332 | |
333 #ifdef USE_BIOLIMS | |
334 /* | |
335 * Johnt: | |
336 * - Added tags below to allow for biolims update | |
337 * - This is all a very big bodge to allow BioLIMS | |
338 * attributes to be passed through the Read structure | |
339 * to the Experiment file | |
340 * - Any changes to this should also be mirrored in ../biolims/Exp.cpp | |
341 */ | |
342 { | |
343 int1 *qa; /* quality array */ | |
344 int i; | |
345 char tmp[1024]; | |
346 char *line; /* current line from info */ | |
347 char *end; /* end of current line */ | |
348 | |
349 /* AV only supports a single prob value for now */ | |
350 qa = (int1 *)malloc((read->NBases+1)*sizeof(int1)); | |
351 | |
352 /* need max 4 bytes per value - see conf2str */ | |
353 extend(e,EFLT_AV,(read->NBases+1)*5); | |
354 | |
355 /* merge into single quality array */ | |
356 for(i=0;i<read->NBases;i++){ | |
357 switch(read->base[i]){ | |
358 case 'a': | |
359 case 'A': | |
360 qa[i] = read->prob_A[i]; | |
361 break; | |
362 case 'c': | |
363 case 'C': | |
364 qa[i] = read->prob_C[i]; | |
365 break; | |
366 case 'g': | |
367 case 'G': | |
368 qa[i] = read->prob_G[i]; | |
369 break; | |
370 case 't': | |
371 case 'T': | |
372 qa[i] = read->prob_T[i]; | |
373 break; | |
374 default: | |
375 qa[i] = 0; | |
376 } | |
377 } | |
378 conf2str(qa,read->NBases,exp_get_entry(e, EFLT_AV)); | |
379 free(qa); | |
380 | |
381 /* Parse the read notes for everything else */ | |
382 if( read->info) { | |
383 for(line=read->info;end=strchr(line,'\n');line=end+1){ | |
384 *end='\0'; /* * put back */ | |
385 | |
386 /* look for known tags */ | |
387 if(!strncmp(line,EXP_CHEM,EXP_TAGLEN)){ | |
388 /* CH */ | |
389 int chem=0; | |
390 extend(e, EFLT_CH, 15); | |
391 if( !strcmp(CH_types[1],line+EXP_TAGLEN)) | |
392 chem=1; | |
393 sprintf(exp_get_entry(e, EFLT_CH), "%d",chem ); | |
394 | |
395 } else if(!strncmp(line,EXP_PRMR,EXP_TAGLEN)){ | |
396 /* PR */ | |
397 int primer=0; | |
398 extend(e,EFLT_PR,15); | |
399 for(primer=1;primer<nPR_types;primer++) | |
400 if(!strcmp(PR_types[i],line+EXP_TAGLEN)) | |
401 break; | |
402 if(primer>=nPR_types) | |
403 primer=1; | |
404 sprintf(exp_get_entry(e, EFLT_PR), "%d",primer); | |
405 | |
406 } else if(!strncmp(line,EXP_VECT,EXP_TAGLEN)){ | |
407 /* SV */ | |
408 extend(e,EFLT_SV,strlen(line)-EXP_TAGLEN+1); | |
409 strcpy(exp_get_entry(e, EFLT_SV),line+EXP_TAGLEN); | |
410 | |
411 } else if(!strncmp(line,EXP_CLOV,EXP_TAGLEN)){ | |
412 /* CV */ | |
413 extend(e,EFLT_CV,strlen(line)-EXP_TAGLEN+1); | |
414 strcpy(exp_get_entry(e, EFLT_CV),line+EXP_TAGLEN); | |
415 | |
416 } else if(!strncmp(line,EXP_CLON,EXP_TAGLEN)){ | |
417 /* CN */ | |
418 extend(e,EFLT_CN,strlen(line)-EXP_TAGLEN+1); | |
419 strcpy(exp_get_entry(e, EFLT_CN),line+EXP_TAGLEN); | |
420 | |
421 } else if(!strncmp(line,EXP_FEAT,EXP_TAGLEN)){ | |
422 /* FEAT=start stop key\r\rcomment */ | |
423 /* key and comment have \n encoded as \r */ | |
424 int start,stop,i; | |
425 char *key; /* biolims feature key */ | |
426 char *comment; /* biolims feature comment */ | |
427 line+=EXP_TAGLEN; | |
428 start=atoi(line); | |
429 line=strchr(line,' ')+1; | |
430 stop=atoi(line); | |
431 key=strchr(line,' ')+1; | |
432 comment=strstr(key,"\r\r"); | |
433 *comment='\0'; /* * put back */ | |
434 comment+=2; | |
435 /* replace \r with \n in key and comment */ | |
436 for(i=0;key[i];i++) | |
437 if(key[i]=='\r') key[i]='\n'; | |
438 for(i=0;comment[i];i++) | |
439 if(comment[i]=='\r') comment[i]='\n'; | |
440 /* could possibly be one of a number of EXP tags excoded | |
441 as a BioLIMS feature */ | |
442 if(!strncmp(key,featCLON,STADEN_FKEY_LEN)){ | |
443 /* CS */ | |
444 extend(e, EFLT_CS, 32); | |
445 exp_create_range(exp_get_entry(e, EFLT_CS),start,stop); | |
446 } else if(!strncmp(key,featVECI,STADEN_FKEY_LEN)){ | |
447 /* SI */ | |
448 extend(e, EFLT_SI, 32); | |
449 exp_create_range(exp_get_entry(e, EFLT_SI),start,stop); | |
450 } else if(!strncmp(key,featTEMP,STADEN_FKEY_LEN)){ | |
451 /* TN */ | |
452 extend(e, EFLT_TN, strlen(comment)+1); | |
453 strcpy(exp_get_entry(e, EFLT_TN),comment); | |
454 } else if(!strncmp(key,featSTRD,STADEN_FKEY_LEN)){ | |
455 /* ST */ | |
456 extend(e, EFLT_ST, strlen(comment)+1); | |
457 strcpy(exp_get_entry(e, EFLT_ST),comment); | |
458 } else if( !strncmp(key,featVECT,STADEN_FKEY_LEN)){ | |
459 /* SL and SR */ | |
460 extend(e,EFLT_SL,15); | |
461 extend(e,EFLT_SR,15); | |
462 sprintf(exp_get_entry(e, EFLT_SL),"%d",start); | |
463 sprintf(exp_get_entry(e, EFLT_SR),"%d",stop); | |
464 } else if( !strncmp(key,featGELR,STADEN_FKEY_LEN)){ | |
465 /* TG */ | |
466 char tag[5]; /* staden note tag (always 4 chars) */ | |
467 char strand=*comment; /* first char of comment */ | |
468 /* key has format STADEN_GELR:XXXX - | |
469 where XXXX is staden note tag */ | |
470 strncpy(tag,key+STADEN_FKEY_LEN+1,4); | |
471 tag[4]='\0'; | |
472 comment+=2; /* skip over strand */ | |
473 sprintf(tmp,"%s %c %d..%d\n%s", | |
474 key,strand,start,stop,comment); | |
475 extend(e,EFLT_TG,strlen(tmp)+1); | |
476 strcpy(exp_get_entry(e, EFLT_TG),tmp); | |
477 | |
478 } else if( !strncmp(key,featCONS,STADEN_FKEY_LEN)){ | |
479 /* TC */ | |
480 char tag[5]; /* staden note tag (always 4 chars) */ | |
481 /* comment has the format Srstart-rend\ncomment | |
482 S is a single character strand indicator | |
483 rstart and rend are the real start and end of the | |
484 tag | |
485 */ | |
486 char strand=*comment; /* first char of comment */ | |
487 char *rangestart = comment+2; /*skip over strand*/ | |
488 char *rangeend; | |
489 char *emptystring=""; | |
490 | |
491 /* key has format STADEN_CONS:XXXX - | |
492 where XXXX is staden note tag */ | |
493 strncpy(tag,key+STADEN_FKEY_LEN+1,4); | |
494 tag[4]='\0'; | |
495 /* the REAL range might actually be outside the bases, | |
496 so this is recorded in the first line of | |
497 the comment. This is merged with the | |
498 feature range to allow for complimenting */ | |
499 comment=strchr(rangestart,'\n'); | |
500 if( comment ) | |
501 *(comment++)='\0'; /* *** put back */ | |
502 else | |
503 comment=emptystring; | |
504 | |
505 /* now special processing of bounds */ | |
506 rangeend=strchr(rangestart,'-'); | |
507 if( rangeend ){ | |
508 long rstart,rend; | |
509 *(rangeend++)='\0'; /* *** put back */ | |
510 rstart=atol(rangestart); | |
511 rend=atol(rangeend); | |
512 *(rangeend-1)='-'; | |
513 | |
514 /* if start is the same as rstart, just | |
515 need to extend the stop bounds to the | |
516 real end If not there has been some | |
517 complimenting happening so need to | |
518 extend the start bounds, as its now | |
519 backwards. */ | |
520 if( start==rstart) | |
521 stop=start+rend-rstart; | |
522 else | |
523 start=stop-rend+rstart; | |
524 } else { | |
525 /* don't have a range so just use the | |
526 feature size */ | |
527 } | |
528 | |
529 sprintf(tmp,"%s %c %d..%d\n%s", | |
530 key,strand,start,stop,comment); | |
531 extend(e,EFLT_TC,strlen(tmp)+1); | |
532 strcpy(exp_get_entry(e, EFLT_TC),tmp); | |
533 if( comment!=emptystring) | |
534 *(comment-1)='\n'; | |
535 | |
536 } else { | |
537 /* TG */ | |
538 /* a biolims feature, encode this with tag | |
539 BIOL, and the Biolims Feature key in | |
540 the first line of the comment*/ | |
541 sprintf(tmp,"%s = %d..%d\n%s%s\n%s",/* use strand = */ | |
542 tagBIOL,start,stop,featKey,key,comment); | |
543 extend(e,EFLT_TG,strlen(tmp)+1); | |
544 strcpy(exp_get_entry(e, EFLT_TG),tmp); | |
545 } | |
546 *(comment-2)='\r'; | |
547 } | |
548 /* else unused value */ | |
549 | |
550 *end='\n'; | |
551 } | |
552 } | |
553 } | |
554 #else /* USE_BIOLIMS */ | |
555 /* Confidence values */ | |
556 if (read->prob_A && read->prob_C && read->prob_G && read->prob_T && | |
557 read->NBases > 0) { | |
558 /* We have some, but are they non zero values? */ | |
559 for (i = 0; i < read->NBases; i++) { | |
560 if (read->prob_A[i] || read->prob_C[i] || | |
561 read->prob_G[i] || read->prob_T[i]) | |
562 break; | |
563 } | |
564 if (i != read->NBases) { | |
565 int1 *conf = (int1 *)xmalloc(read->NBases); | |
566 char *cstr = (char *)xmalloc(read->NBases * 5 + 2); | |
567 | |
568 for (i = 0; i < read->NBases; i++) { | |
569 switch (read->base[i]) { | |
570 case 'a': | |
571 case 'A': | |
572 conf[i] = read->prob_A[i]; | |
573 break; | |
574 case 'c': | |
575 case 'C': | |
576 conf[i] = read->prob_C[i]; | |
577 break; | |
578 case 'g': | |
579 case 'G': | |
580 conf[i] = read->prob_G[i]; | |
581 break; | |
582 case 't': | |
583 case 'T': | |
584 conf[i] = read->prob_T[i]; | |
585 break; | |
586 default: | |
587 conf[i] = (read->prob_A[i] + read->prob_C[i] + | |
588 read->prob_G[i] + read->prob_T[i]) / 4; | |
589 } | |
590 } | |
591 | |
592 conf2str(conf, read->NBases, cstr); | |
593 extend(e, EFLT_AV, strlen(cstr)+1); | |
594 sprintf(exp_get_entry(e, EFLT_AV), "%s", cstr); | |
595 xfree(conf); | |
596 xfree(cstr); | |
597 } | |
598 } | |
599 #endif /* if USE_BIOLIMS else ... */ | |
600 | |
601 return e; | |
602 } | |
603 | |
604 | |
605 /* | |
606 * Controls the use of the SQ and ON lines when loading an experiment file. | |
607 * The default (value&1 == 1) is to load these into the Read structure. | |
608 * With value&1 == 0 we load the sequence directly from the trace file | |
609 * (LT line). | |
610 * value&2 controls whether to use the SL/SR fields when setting the cutoff. | |
611 * value&2 == 0 implies to do so, and value&2 == 2 implies to not. | |
612 * | |
613 * The default use is to use the SQ and ON lines. Returns the old value. | |
614 */ | |
615 static int use_experiment_sequence = 1; | |
616 int read_experiment_redirect(int value) { | |
617 int old = use_experiment_sequence; | |
618 | |
619 use_experiment_sequence = value; | |
620 return old; | |
621 } | |
622 | |
623 | |
624 /* | |
625 * Translates an experiment file to a Read structure. | |
626 * The Exp_info structure is left unchanged. | |
627 * | |
628 * Returns: | |
629 * A pointer to an allocated Read structure upon success. | |
630 * NULLRead upon failure. | |
631 */ | |
632 Read *exp2read(Exp_info *e, char *fn) { | |
633 Read *r; | |
634 int q, s, ttype, err = 0; | |
635 char *str; | |
636 int use_exp = use_experiment_sequence; | |
637 FILE *fp; | |
638 | |
639 if (!exp_Nentries(e, EFLT_LN)) { | |
640 err = 1; | |
641 } else { | |
642 /* Read the trace component of the experiment file */ | |
643 ttype = exp_Nentries(e,EFLT_LT) | |
644 ? trace_type_str2int(exp_get_entry(e, EFLT_LT)) : TT_ANYTR; | |
645 | |
646 if (fp = open_trace_file(exp_get_entry(e, EFLT_LN), fn)) { | |
647 if (NULLRead == (r = fread_reading(fp, NULL, ttype))) | |
648 err = 1; | |
649 } else { | |
650 err = 1; | |
651 } | |
652 } | |
653 | |
654 if (err) { | |
655 use_exp = 1; | |
656 r = read_allocate(0, 1); | |
657 } | |
658 | |
659 /* Set the left cutoff (QL / SL) */ | |
660 q=-1; | |
661 if (exp_Nentries(e, EFLT_QL)) | |
662 q = atoi(exp_get_entry(e, EFLT_QL)); | |
663 if ((use_exp&2) != 2) { | |
664 s=-1; | |
665 if (exp_Nentries(e, EFLT_SL)) | |
666 s = atoi(exp_get_entry(e, EFLT_SL)); | |
667 if (q != -1 || s != -1) | |
668 r->leftCutoff = MAX(q, s); | |
669 } else { | |
670 r->leftCutoff = q != -1 ? q : 0; | |
671 } | |
672 | |
673 /* Set the right cutoff (QR / SR) */ | |
674 q = INT_MAX; | |
675 if (exp_Nentries(e, EFLT_QR)) | |
676 q = atoi(exp_get_entry(e, EFLT_QR)); | |
677 if ((use_exp&2) != 2) { | |
678 s = INT_MAX; | |
679 if (exp_Nentries(e, EFLT_SR)) | |
680 s = atoi(exp_get_entry(e, EFLT_SR)); | |
681 if (q != INT_MAX || s != INT_MAX) | |
682 r->rightCutoff = MIN(q, s); | |
683 } else { | |
684 r->rightCutoff = q != INT_MAX ? q : 0; | |
685 } | |
686 | |
687 if (r->rightCutoff && r->rightCutoff <= r->leftCutoff) | |
688 r->rightCutoff = r->leftCutoff+1; | |
689 | |
690 /* Bases and base positions, if desired */ | |
691 if (use_exp&1) { | |
692 if (exp_Nentries(e, EFLT_SQ) && (str = exp_get_entry(e, EFLT_SQ))) { | |
693 int slen = strlen(str); | |
694 | |
695 if (NULL == (r->base = (char *)xrealloc(r->base, slen+1))) | |
696 return NULLRead; | |
697 if (NULL == (r->prob_A = (char *)xrealloc(r->prob_A, slen+1))) | |
698 return NULLRead; | |
699 if (NULL == (r->prob_C = (char *)xrealloc(r->prob_C, slen+1))) | |
700 return NULLRead; | |
701 if (NULL == (r->prob_G = (char *)xrealloc(r->prob_G, slen+1))) | |
702 return NULLRead; | |
703 if (NULL == (r->prob_T = (char *)xrealloc(r->prob_T, slen+1))) | |
704 return NULLRead; | |
705 if (r->basePos) { | |
706 xfree(r->basePos); | |
707 r->basePos = NULL; | |
708 } | |
709 | |
710 /* Clear them */ | |
711 memset(r->prob_A, 0, slen); | |
712 memset(r->prob_C, 0, slen); | |
713 memset(r->prob_G, 0, slen); | |
714 memset(r->prob_T, 0, slen); | |
715 | |
716 strcpy(r->base, str); | |
717 r->NBases = slen; | |
718 | |
719 /* Copy AV values into prob_* arrays */ | |
720 if (exp_Nentries(e, EFLT_AV) && | |
721 (str = exp_get_entry(e, EFLT_AV))) { | |
722 int1 *conf = (int1 *)xmalloc((slen+1)*sizeof(*conf)); | |
723 int i; | |
724 | |
725 str2conf(conf, slen, str); | |
726 for (i = 0; i < slen ; i++) { | |
727 switch(r->base[i]) { | |
728 case 'a': | |
729 case 'A': | |
730 r->prob_A[i] = conf[i]; | |
731 r->prob_C[i] = 0; | |
732 r->prob_G[i] = 0; | |
733 r->prob_T[i] = 0; | |
734 break; | |
735 case 'c': | |
736 case 'C': | |
737 r->prob_A[i] = 0; | |
738 r->prob_C[i] = conf[i]; | |
739 r->prob_G[i] = 0; | |
740 r->prob_T[i] = 0; | |
741 break; | |
742 case 'g': | |
743 case 'G': | |
744 r->prob_A[i] = 0; | |
745 r->prob_C[i] = 0; | |
746 r->prob_G[i] = conf[i]; | |
747 r->prob_T[i] = 0; | |
748 break; | |
749 case 't': | |
750 case 'T': | |
751 r->prob_A[i] = 0; | |
752 r->prob_C[i] = 0; | |
753 r->prob_G[i] = 0; | |
754 r->prob_T[i] = conf[i]; | |
755 break; | |
756 default: | |
757 r->prob_A[i] = conf[i]; | |
758 r->prob_C[i] = conf[i]; | |
759 r->prob_G[i] = conf[i]; | |
760 r->prob_T[i] = conf[i]; | |
761 break; | |
762 } | |
763 } | |
764 | |
765 xfree(conf); | |
766 } else { | |
767 memset(r->prob_A, 0, slen * sizeof(r->prob_A[0])); | |
768 memset(r->prob_C, 0, slen * sizeof(r->prob_C[0])); | |
769 memset(r->prob_G, 0, slen * sizeof(r->prob_G[0])); | |
770 memset(r->prob_T, 0, slen * sizeof(r->prob_T[0])); | |
771 } | |
772 } | |
773 | |
774 r->format = TT_EXP; | |
775 } | |
776 | |
777 r->orig_trace = e; | |
778 r->orig_trace_format = TT_EXP; | |
779 r->orig_trace_free = (void (*)(void *))exp_destroy_info; | |
780 | |
781 return r; | |
782 } | |
783 #endif /* IOLIB_EXP */ | |
784 | |
785 | |
786 | |
787 /* | |
788 * Takes an original read structure and a set of edit change arrays and | |
789 * produces a new base position array incorporating all the edits. For | |
790 * insertions, interpolation is used to derive a suitable sample position. | |
791 * | |
792 * INPUTS: | |
793 * | |
794 * Read *r = The original unedited read structure | |
795 * int Comp = 0=Normal sequence, 1=Complemented sequence | |
796 * int Ned = Length of edited arrays to follow | |
797 * char *edBases = Sequence of base characters incorporating ins/del edits | |
798 * uint_2 *edPos = Corresponding original base numbers, 0 indicates an | |
799 * insertion. Base numbers start at 1. | |
800 * | |
801 * OUTPUTS: | |
802 * | |
803 * This array is assumed to be empty with an allocated length of Ned elements. | |
804 * | |
805 * uint_2* basePos = Base positions in samples | |
806 */ | |
807 | |
808 void read_update_base_positions( Read *r, int Comp, int Ned, char *edBases, | |
809 int_2 *edPos, uint_2 *basePos ) | |
810 { | |
811 int i, j; | |
812 int gap; | |
813 int delta; | |
814 int o_N; | |
815 int o_NPoints; | |
816 uint_2* o_basePos; | |
817 int start; | |
818 int end; | |
819 | |
820 | |
821 | |
822 /* Check input */ | |
823 assert(r); | |
824 assert(edBases); | |
825 assert(edPos); | |
826 assert(basePos); | |
827 assert(Ned>0); | |
828 if( (Ned<=0) || !r || !edBases || !edPos || !basePos ) | |
829 return; | |
830 | |
831 | |
832 | |
833 /* Original sequence data */ | |
834 o_N = r->NBases; | |
835 o_NPoints = r->NPoints; | |
836 o_basePos = r->basePos; | |
837 | |
838 | |
839 | |
840 /* Copy original base positions */ | |
841 for( i=0; i<Ned; i++ ) | |
842 { | |
843 /* If inserted base, set position to zero */ | |
844 if( edPos[i] == 0 ) | |
845 { | |
846 basePos[i] = 0; | |
847 } | |
848 else | |
849 { | |
850 /* Get original position taking into account complementation */ | |
851 basePos[i] = o_basePos[ Comp ? o_N-edPos[i]+1-1 : edPos[i]-1 ]; | |
852 } | |
853 } | |
854 | |
855 | |
856 | |
857 /* Do linear interpolation to create positions for inserted bases */ | |
858 for( i=0; i<Ned; i++ ) | |
859 { | |
860 /* Search for start */ | |
861 while( (basePos[i]!=0) && (i<Ned) ) | |
862 i++; | |
863 start = (i==0) ? 0 : basePos[i-1]; | |
864 | |
865 | |
866 | |
867 /* Search for end */ | |
868 gap = 0; | |
869 while( (basePos[i]==0) && (i<Ned) ) | |
870 { | |
871 gap++; | |
872 i++; | |
873 } | |
874 if( i == Ned ) | |
875 { | |
876 if( gap == 0 ) | |
877 return; | |
878 end = o_NPoints; | |
879 } | |
880 else | |
881 { | |
882 end = basePos[i]; | |
883 } | |
884 | |
885 | |
886 | |
887 /* Do the interpolation */ | |
888 delta = (end - start) / (gap + 1); | |
889 for( j=i-gap; j<i; j++ ) | |
890 { | |
891 /* Watch out for insertions at start (j==0) */ | |
892 if( j==0 ) | |
893 basePos[j] = delta; | |
894 else | |
895 basePos[j] = basePos[j-1] + delta; | |
896 } | |
897 } | |
898 } | |
899 | |
900 | |
901 | |
902 /* | |
903 * Takes a set of edit change arrays and produces a new set of confidence | |
904 * arrays incorporating all the edits. | |
905 * | |
906 * INPUTS: | |
907 * | |
908 * int Ned = Length of edited arrays to follow | |
909 * char* edBases = Sequence of base characters incorporating ins/del edits | |
910 * int1* edConf = Corresponding confidence values, 100 for insertions | |
911 * | |
912 * | |
913 * OUTPUTS: | |
914 * | |
915 * These output arrays are assumed to be empty with an allocated length | |
916 * of Ned elements each. The names and types are identical to the same | |
917 * elements in the Read structure. | |
918 * | |
919 * char* prob_A = Base confidence A | |
920 * char* prob_C = Base confidence C | |
921 * char* prob_G = Base confidence G | |
922 * char* prob_T = Base confidence T | |
923 * | |
924 */ | |
925 | |
926 void read_update_confidence_values( int Ned, char* edBases, int1* edConf, | |
927 char* prob_A, char* prob_C, char* prob_G, char* prob_T ) | |
928 { | |
929 int i; | |
930 char c; | |
931 | |
932 | |
933 | |
934 /* Check input */ | |
935 assert(Ned>0); | |
936 assert(edBases); | |
937 assert(edConf); | |
938 assert(prob_A); | |
939 assert(prob_C); | |
940 assert(prob_G); | |
941 assert(prob_T); | |
942 if( (Ned<=0) || !edBases || !edConf || !prob_A || !prob_C || !prob_G || !prob_T ) | |
943 return; | |
944 | |
945 | |
946 | |
947 /* Copy over confidence values */ | |
948 for( i=0; i<Ned; i++ ) | |
949 { | |
950 c = (char) edConf[i]; | |
951 switch( edBases[i] ) | |
952 { | |
953 case 'A': | |
954 case 'a': | |
955 prob_A[i] = c; | |
956 prob_C[i] = 0; | |
957 prob_G[i] = 0; | |
958 prob_T[i] = 0; | |
959 break; | |
960 | |
961 | |
962 case 'C': | |
963 case 'c': | |
964 prob_A[i] = 0; | |
965 prob_C[i] = c; | |
966 prob_G[i] = 0; | |
967 prob_T[i] = 0; | |
968 break; | |
969 | |
970 case 'G': | |
971 case 'g': | |
972 prob_A[i] = 0; | |
973 prob_C[i] = 0; | |
974 prob_G[i] = c; | |
975 prob_T[i] = 0; | |
976 break; | |
977 | |
978 case 'T': | |
979 case 't': | |
980 prob_A[i] = 0; | |
981 prob_C[i] = 0; | |
982 prob_G[i] = 0; | |
983 prob_T[i] = c; | |
984 break; | |
985 | |
986 default: | |
987 prob_A[i] = c; | |
988 prob_C[i] = c; | |
989 prob_G[i] = c; | |
990 prob_T[i] = c; | |
991 break; | |
992 } | |
993 } | |
994 } | |
995 | |
996 | |
997 | |
998 int read_sections(int new_sec) { | |
999 static int sections = READ_ALL; | |
1000 | |
1001 if (new_sec) | |
1002 sections = new_sec; | |
1003 | |
1004 return sections; | |
1005 } |