comparison srf2fastq/io_lib-1.12.2/io_lib/scf_extras.c @ 0:d901c9f41a6a default tip

Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author dawe
date Tue, 07 Jun 2011 17:48:05 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d901c9f41a6a
1 /*
2 * Copyright (c) Medical Research Council 1998. All rights reserved.
3 *
4 * Permission to use, copy, modify and distribute this software and its
5 * documentation for any purpose is hereby granted without fee, provided that
6 * this copyright and notice appears in all copies and that credit is given
7 * where due.
8 *
9 * This file was written by James Bonfield, Simon Dear, Rodger Staden,
10 * Kathryn Beal, as part of the Staden Package at the MRC Laboratory of
11 * Molecular Biology, Hills Road, Cambridge, CB2 2QH, United Kingdom.
12 *
13 * MRC disclaims all warranties with regard to this software.
14 */
15
16 /*
17 * This file contains the necessary code for reading the quality values from
18 * an SCF file. It supports both V2 and V3 SCF formats.
19 * It's done in an efficient manner by extracting only the relevant SCF
20 * components.
21 * This file is derived from the Gap4 source file scf_extras.c.
22 */
23
24 #include <stdlib.h>
25
26 #include "io_lib/stdio_hack.h"
27 #include "io_lib/compress.h"
28 #include "io_lib/misc.h"
29 #include "io_lib/scf.h"
30 #include "io_lib/expFileIO.h"
31 #include "io_lib/traceType.h"
32 #include "io_lib/open_trace_file.h"
33 #include "io_lib/scf_extras.h"
34
35 /*
36 * ---------------------------------------------------------------------------
37 * Loads confidence values from the trace file and averages them.
38 * 'opos' is optional - if not known then set to NULL.
39 *
40 * Returns 0 for success
41 * -1 for failure
42 */
43 int get_read_conf(Exp_info *e, int length, int2 *opos, int1 *conf) {
44 int ttype, i;
45 FILE *fp;
46 uint_1 *prob_A, *prob_C, *prob_G, *prob_T;
47 char *seq;
48 float scf_version;
49 int nbases = 0;
50
51 /* Sanity check */
52 if (!(exp_Nentries(e,EFLT_LT) && exp_Nentries(e,EFLT_LN)))
53 return -1;
54
55 /* Find and load trace file */
56 ttype = trace_type_str2int(exp_get_entry(e, EFLT_LT));
57
58 if (ttype != TT_SCF &&
59 ttype != TT_CTF &&
60 ttype != TT_ZTR)
61 return -1;
62
63 /*
64 * We only support direct reading accuracy values from SCF files.
65 * Otherwise we have to take a slower approach.
66 */
67 if (ttype != TT_SCF) {
68 Read *r;
69 int sec = read_sections(0);
70 read_sections(READ_BASES);
71
72 if (NULL == (r = read_reading(exp_get_entry(e,EFLT_LN), TT_ANYTR))) {
73 read_sections(sec);
74 return -1;
75 }
76
77 prob_A = (int1 *)xmalloc(r->NBases);
78 prob_C = (int1 *)xmalloc(r->NBases);
79 prob_G = (int1 *)xmalloc(r->NBases);
80 prob_T = (int1 *)xmalloc(r->NBases);
81 seq = (char *)xmalloc(r->NBases);
82
83 memcpy(prob_A, r->prob_A, r->NBases);
84 memcpy(prob_C, r->prob_C, r->NBases);
85 memcpy(prob_G, r->prob_G, r->NBases);
86 memcpy(prob_T, r->prob_T, r->NBases);
87 memcpy(seq, r->base, r->NBases);
88
89 nbases = r->NBases;
90
91 read_deallocate(r);
92 read_sections(sec);
93
94 } else {
95 Header h;
96 /* For SCF files we read directly - the above code would also do. */
97
98 if (NULL == (fp = open_trace_file(exp_get_entry(e,EFLT_LN), NULL)))
99 return -1;
100
101 /* Read the SCF header */
102 if (-1 == read_scf_header(fp, &h))
103 return -1;
104 scf_version = scf_version_str2float(h.version);
105 nbases = h.bases;
106
107 /* Alloc memory */
108 prob_A = (uint_1 *)xmalloc(h.bases * sizeof(*prob_A));
109 prob_C = (uint_1 *)xmalloc(h.bases * sizeof(*prob_A));
110 prob_G = (uint_1 *)xmalloc(h.bases * sizeof(*prob_A));
111 prob_T = (uint_1 *)xmalloc(h.bases * sizeof(*prob_A));
112 seq = (char *)xmalloc(h.bases * sizeof(*seq));
113 if (NULL == prob_A ||
114 NULL == prob_C ||
115 NULL == prob_G ||
116 NULL == prob_T ||
117 NULL == seq)
118 return -1;
119
120 /* Load base scores */
121 if (scf_version >= 3.0) {
122 /*
123 * Version 3 base format:
124 * num_bases * 4byte peak index
125 * num_bases * prob_A
126 * num_bases * prob_C
127 * num_bases * prob_G
128 * num_bases * prob_T
129 * num_bases * base
130 * num_bases * spare (x3)
131 */
132 fseek(fp, (off_t)h.bases_offset + 4 * h.bases, SEEK_SET);
133 if (h.bases != fread(prob_A, 1, h.bases, fp))
134 return -1;
135 if (h.bases != fread(prob_C, 1, h.bases, fp))
136 return -1;
137 if (h.bases != fread(prob_G, 1, h.bases, fp))
138 return -1;
139 if (h.bases != fread(prob_T, 1, h.bases, fp))
140 return -1;
141 if (h.bases != fread(seq, 1, h.bases, fp))
142 return -1;
143 } else {
144 int i;
145 uint_1 buf[12];
146
147 /*
148 * Version 2 base format
149 * num_bases * base_struct, where base_struct is 12 bytes:
150 * 0-3 peak_index
151 * 4-7 prob_A/C/G/T
152 * 8 base
153 * 9- spare
154 */
155 fseek(fp, (off_t)h.bases_offset, SEEK_SET);
156
157 for (i = 0; (unsigned)i < h.bases; i++) {
158 if (1 != fread(buf, 12, 1, fp))
159 return -1;
160 prob_A[i] = buf[4];
161 prob_C[i] = buf[5];
162 prob_G[i] = buf[6];
163 prob_T[i] = buf[7];
164 seq[i] = buf[8];
165 }
166 }
167
168 fclose(fp);
169 }
170
171 /* Determine confidence values */
172 if (opos) {
173 for (i=0; i<length; i++) {
174 if (opos[i] == 0) {
175 /* Inserted base, change to 0% */
176 conf[i] = 0;
177 } else {
178 switch(seq[opos[i]-1]) {
179 case 'a':
180 case 'A':
181 conf[i] = prob_A[opos[i]-1];
182 break;
183 case 'c':
184 case 'C':
185 conf[i] = prob_C[opos[i]-1];
186 break;
187 case 'g':
188 case 'G':
189 conf[i] = prob_G[opos[i]-1];
190 break;
191 case 't':
192 case 'T':
193 conf[i] = prob_T[opos[i]-1];
194 break;
195 default:
196 conf[i] = 2;
197 }
198 }
199 }
200 } else {
201 int mlength = MIN(length, nbases);
202
203 for (i=0; i < mlength; i++) {
204 switch(seq[i]) {
205 case 'a':
206 case 'A':
207 conf[i] = prob_A[i];
208 break;
209 case 'c':
210 case 'C':
211 conf[i] = prob_C[i];
212 break;
213 case 'g':
214 case 'G':
215 conf[i] = prob_G[i];
216 break;
217 case 't':
218 case 'T':
219 conf[i] = prob_T[i];
220 break;
221 case 'n':
222 case 'N':
223 case '-':
224 conf[i] = (prob_A[i] + prob_C[i] + prob_G[i] + prob_T[i]) / 4;
225 break;
226 default:
227 conf[i] = 2;
228 }
229 }
230 for (; i < length; i++)
231 conf[i] = 2;
232 }
233
234 xfree(prob_A);
235 xfree(prob_C);
236 xfree(prob_G);
237 xfree(prob_T);
238 xfree(seq);
239
240 return 0;
241 }