Mercurial > repos > dawe > srf2fastq
comparison srf2fastq/io_lib-1.12.2/io_lib/scf_extras.c @ 0:d901c9f41a6a default tip
Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author | dawe |
---|---|
date | Tue, 07 Jun 2011 17:48:05 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d901c9f41a6a |
---|---|
1 /* | |
2 * Copyright (c) Medical Research Council 1998. All rights reserved. | |
3 * | |
4 * Permission to use, copy, modify and distribute this software and its | |
5 * documentation for any purpose is hereby granted without fee, provided that | |
6 * this copyright and notice appears in all copies and that credit is given | |
7 * where due. | |
8 * | |
9 * This file was written by James Bonfield, Simon Dear, Rodger Staden, | |
10 * Kathryn Beal, as part of the Staden Package at the MRC Laboratory of | |
11 * Molecular Biology, Hills Road, Cambridge, CB2 2QH, United Kingdom. | |
12 * | |
13 * MRC disclaims all warranties with regard to this software. | |
14 */ | |
15 | |
16 /* | |
17 * This file contains the necessary code for reading the quality values from | |
18 * an SCF file. It supports both V2 and V3 SCF formats. | |
19 * It's done in an efficient manner by extracting only the relevant SCF | |
20 * components. | |
21 * This file is derived from the Gap4 source file scf_extras.c. | |
22 */ | |
23 | |
24 #include <stdlib.h> | |
25 | |
26 #include "io_lib/stdio_hack.h" | |
27 #include "io_lib/compress.h" | |
28 #include "io_lib/misc.h" | |
29 #include "io_lib/scf.h" | |
30 #include "io_lib/expFileIO.h" | |
31 #include "io_lib/traceType.h" | |
32 #include "io_lib/open_trace_file.h" | |
33 #include "io_lib/scf_extras.h" | |
34 | |
35 /* | |
36 * --------------------------------------------------------------------------- | |
37 * Loads confidence values from the trace file and averages them. | |
38 * 'opos' is optional - if not known then set to NULL. | |
39 * | |
40 * Returns 0 for success | |
41 * -1 for failure | |
42 */ | |
43 int get_read_conf(Exp_info *e, int length, int2 *opos, int1 *conf) { | |
44 int ttype, i; | |
45 FILE *fp; | |
46 uint_1 *prob_A, *prob_C, *prob_G, *prob_T; | |
47 char *seq; | |
48 float scf_version; | |
49 int nbases = 0; | |
50 | |
51 /* Sanity check */ | |
52 if (!(exp_Nentries(e,EFLT_LT) && exp_Nentries(e,EFLT_LN))) | |
53 return -1; | |
54 | |
55 /* Find and load trace file */ | |
56 ttype = trace_type_str2int(exp_get_entry(e, EFLT_LT)); | |
57 | |
58 if (ttype != TT_SCF && | |
59 ttype != TT_CTF && | |
60 ttype != TT_ZTR) | |
61 return -1; | |
62 | |
63 /* | |
64 * We only support direct reading accuracy values from SCF files. | |
65 * Otherwise we have to take a slower approach. | |
66 */ | |
67 if (ttype != TT_SCF) { | |
68 Read *r; | |
69 int sec = read_sections(0); | |
70 read_sections(READ_BASES); | |
71 | |
72 if (NULL == (r = read_reading(exp_get_entry(e,EFLT_LN), TT_ANYTR))) { | |
73 read_sections(sec); | |
74 return -1; | |
75 } | |
76 | |
77 prob_A = (int1 *)xmalloc(r->NBases); | |
78 prob_C = (int1 *)xmalloc(r->NBases); | |
79 prob_G = (int1 *)xmalloc(r->NBases); | |
80 prob_T = (int1 *)xmalloc(r->NBases); | |
81 seq = (char *)xmalloc(r->NBases); | |
82 | |
83 memcpy(prob_A, r->prob_A, r->NBases); | |
84 memcpy(prob_C, r->prob_C, r->NBases); | |
85 memcpy(prob_G, r->prob_G, r->NBases); | |
86 memcpy(prob_T, r->prob_T, r->NBases); | |
87 memcpy(seq, r->base, r->NBases); | |
88 | |
89 nbases = r->NBases; | |
90 | |
91 read_deallocate(r); | |
92 read_sections(sec); | |
93 | |
94 } else { | |
95 Header h; | |
96 /* For SCF files we read directly - the above code would also do. */ | |
97 | |
98 if (NULL == (fp = open_trace_file(exp_get_entry(e,EFLT_LN), NULL))) | |
99 return -1; | |
100 | |
101 /* Read the SCF header */ | |
102 if (-1 == read_scf_header(fp, &h)) | |
103 return -1; | |
104 scf_version = scf_version_str2float(h.version); | |
105 nbases = h.bases; | |
106 | |
107 /* Alloc memory */ | |
108 prob_A = (uint_1 *)xmalloc(h.bases * sizeof(*prob_A)); | |
109 prob_C = (uint_1 *)xmalloc(h.bases * sizeof(*prob_A)); | |
110 prob_G = (uint_1 *)xmalloc(h.bases * sizeof(*prob_A)); | |
111 prob_T = (uint_1 *)xmalloc(h.bases * sizeof(*prob_A)); | |
112 seq = (char *)xmalloc(h.bases * sizeof(*seq)); | |
113 if (NULL == prob_A || | |
114 NULL == prob_C || | |
115 NULL == prob_G || | |
116 NULL == prob_T || | |
117 NULL == seq) | |
118 return -1; | |
119 | |
120 /* Load base scores */ | |
121 if (scf_version >= 3.0) { | |
122 /* | |
123 * Version 3 base format: | |
124 * num_bases * 4byte peak index | |
125 * num_bases * prob_A | |
126 * num_bases * prob_C | |
127 * num_bases * prob_G | |
128 * num_bases * prob_T | |
129 * num_bases * base | |
130 * num_bases * spare (x3) | |
131 */ | |
132 fseek(fp, (off_t)h.bases_offset + 4 * h.bases, SEEK_SET); | |
133 if (h.bases != fread(prob_A, 1, h.bases, fp)) | |
134 return -1; | |
135 if (h.bases != fread(prob_C, 1, h.bases, fp)) | |
136 return -1; | |
137 if (h.bases != fread(prob_G, 1, h.bases, fp)) | |
138 return -1; | |
139 if (h.bases != fread(prob_T, 1, h.bases, fp)) | |
140 return -1; | |
141 if (h.bases != fread(seq, 1, h.bases, fp)) | |
142 return -1; | |
143 } else { | |
144 int i; | |
145 uint_1 buf[12]; | |
146 | |
147 /* | |
148 * Version 2 base format | |
149 * num_bases * base_struct, where base_struct is 12 bytes: | |
150 * 0-3 peak_index | |
151 * 4-7 prob_A/C/G/T | |
152 * 8 base | |
153 * 9- spare | |
154 */ | |
155 fseek(fp, (off_t)h.bases_offset, SEEK_SET); | |
156 | |
157 for (i = 0; (unsigned)i < h.bases; i++) { | |
158 if (1 != fread(buf, 12, 1, fp)) | |
159 return -1; | |
160 prob_A[i] = buf[4]; | |
161 prob_C[i] = buf[5]; | |
162 prob_G[i] = buf[6]; | |
163 prob_T[i] = buf[7]; | |
164 seq[i] = buf[8]; | |
165 } | |
166 } | |
167 | |
168 fclose(fp); | |
169 } | |
170 | |
171 /* Determine confidence values */ | |
172 if (opos) { | |
173 for (i=0; i<length; i++) { | |
174 if (opos[i] == 0) { | |
175 /* Inserted base, change to 0% */ | |
176 conf[i] = 0; | |
177 } else { | |
178 switch(seq[opos[i]-1]) { | |
179 case 'a': | |
180 case 'A': | |
181 conf[i] = prob_A[opos[i]-1]; | |
182 break; | |
183 case 'c': | |
184 case 'C': | |
185 conf[i] = prob_C[opos[i]-1]; | |
186 break; | |
187 case 'g': | |
188 case 'G': | |
189 conf[i] = prob_G[opos[i]-1]; | |
190 break; | |
191 case 't': | |
192 case 'T': | |
193 conf[i] = prob_T[opos[i]-1]; | |
194 break; | |
195 default: | |
196 conf[i] = 2; | |
197 } | |
198 } | |
199 } | |
200 } else { | |
201 int mlength = MIN(length, nbases); | |
202 | |
203 for (i=0; i < mlength; i++) { | |
204 switch(seq[i]) { | |
205 case 'a': | |
206 case 'A': | |
207 conf[i] = prob_A[i]; | |
208 break; | |
209 case 'c': | |
210 case 'C': | |
211 conf[i] = prob_C[i]; | |
212 break; | |
213 case 'g': | |
214 case 'G': | |
215 conf[i] = prob_G[i]; | |
216 break; | |
217 case 't': | |
218 case 'T': | |
219 conf[i] = prob_T[i]; | |
220 break; | |
221 case 'n': | |
222 case 'N': | |
223 case '-': | |
224 conf[i] = (prob_A[i] + prob_C[i] + prob_G[i] + prob_T[i]) / 4; | |
225 break; | |
226 default: | |
227 conf[i] = 2; | |
228 } | |
229 } | |
230 for (; i < length; i++) | |
231 conf[i] = 2; | |
232 } | |
233 | |
234 xfree(prob_A); | |
235 xfree(prob_C); | |
236 xfree(prob_G); | |
237 xfree(prob_T); | |
238 xfree(seq); | |
239 | |
240 return 0; | |
241 } |