Mercurial > repos > dawe > srf2fastq
comparison srf2fastq/io_lib-1.12.2/io_lib/seqIOALF.c @ 0:d901c9f41a6a default tip
Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author | dawe |
---|---|
date | Tue, 07 Jun 2011 17:48:05 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d901c9f41a6a |
---|---|
1 /* | |
2 * Copyright (c) Medical Research Council 1994. All rights reserved. | |
3 * | |
4 * Permission to use, copy, modify and distribute this software and its | |
5 * documentation for any purpose is hereby granted without fee, provided that | |
6 * this copyright and notice appears in all copies. | |
7 * | |
8 * This file was written by James Bonfield, Simon Dear, Rodger Staden, | |
9 * as part of the Staden Package at the MRC Laboratory of Molecular | |
10 * Biology, Hills Road, Cambridge, CB2 2QH, United Kingdom. | |
11 * | |
12 * MRC disclaims all warranties with regard to this software. | |
13 */ | |
14 | |
15 /* | |
16 * Title: seqIOALF | |
17 * | |
18 * File: seqIOALF.c | |
19 * Purpose: IO of ALF sequences | |
20 * Last update: 9th September 1994 | |
21 */ | |
22 | |
23 /* | |
24 * Change Log :- | |
25 * 14.01.91 SD | |
26 * when complimenting the sequence with an odd number of bases, | |
27 * the middle base position was not adjusted. | |
28 * 15.01.91 SD Put StLouis stuff on compilation flag | |
29 * 15.01.91 SD New include file (opp.h) | |
30 * 02.08.91 SD Changes the mapping of uncertainty codes so that we | |
31 * now only generate A C G T and - | |
32 * Previously... bug in interpreting ALF integer fields. | |
33 * We now treat them as unsigned. | |
34 * 17.09.91 LFW changed STLOUIS compilation flag to SAVE_EDITS | |
35 * and AUTO_CLIP | |
36 * 25.10.91 SD Machine independant I/O...removed BIGENDIAN flag | |
37 * 25.11.91 SD There was a hard limit (of 1024) for allocation of | |
38 * space for number of bases, yet program would | |
39 * read in more if there were any, causing nasties to happen. | |
40 * | |
41 * 11.11.92 LFW added section to actually check that the trace it | |
42 * is trying to open is an ALF file using traceType sub | |
43 * | |
44 * 10.11.92 SD SCF comments now stored in seq data structure | |
45 * 09.09.94 JKB Update to use Read instead of Seq library. | |
46 * 04.03.98 JKB Look for "Raw data" when "Processed data" is not found. | |
47 */ | |
48 | |
49 /* RMD I made substantial changes to this file 12/28/90 so as to | |
50 * read sequence data more freely (necessary when reading data from | |
51 * multiple trace files). | |
52 * The affected area is indicated by comments starting RMD, like | |
53 * this one. | |
54 */ | |
55 | |
56 /* This file was adapted by LFW from seqIOABI.c. | |
57 * The ALF results file is a concatenation of many files with an | |
58 * index structure at the beginning, consisting of a 512 byte | |
59 * block that we ignore, followed by 128 byte blocks describing | |
60 * each file. All files, including the header region, are rounded | |
61 * up to a multiple of 512 bytes long. | |
62 * The getIndexEntry routines identify the 128 byte index component | |
63 * of interest by matching 4 chars of its ASCII label, then extract | |
64 * the field of choice from that entry. | |
65 * | |
66 * Note that the SUN and PC are of opposite endian-ness, so that | |
67 * we have to provide special routines to read words and longwords | |
68 * from the results file. Luckily the floating point numbers are | |
69 * written out in ASCII. | |
70 */ | |
71 | |
72 | |
73 /* ---- Imports ---- */ | |
74 | |
75 | |
76 #include <ctype.h> | |
77 #include <stdio.h> | |
78 #include <string.h> | |
79 | |
80 #include "io_lib/stdio_hack.h" | |
81 | |
82 #include "io_lib/Read.h" | |
83 #include "io_lib/mach-io.h" | |
84 #include "io_lib/xalloc.h" | |
85 | |
86 /* ---- Constants ---- */ | |
87 | |
88 #define BasesPerLine 50 /* For output formatting */ | |
89 | |
90 #define IndexEntryLength ((off_t)128) | |
91 | |
92 | |
93 /* | |
94 * Here are some labels we will be looking for, four chars packed | |
95 * into a long word. | |
96 */ | |
97 #define EntryLabel ((uint_4) ((((('A'<<8)+'L')<<8)+'F')<<8)+' ') | |
98 #define BaseEntryLabel ((uint_4) ((((('S'<<8)+'e')<<8)+'q')<<8)+'u') | |
99 #define DataEntryLabel ((uint_4) ((((('P'<<8)+'r')<<8)+'o')<<8)+'c') | |
100 #define RawDataEntryLabel ((uint_4) ((((('R'<<8)+'a')<<8)+'w')<<8)+' ') | |
101 | |
102 /* RMD make enough space for bases - hard limit */ | |
103 #define BASELIMIT 4096 | |
104 | |
105 | |
106 /* ---- Internal functions ---- */ | |
107 | |
108 /* | |
109 * From the ALF results file connected to `fp' whose index starts | |
110 * at byte offset `indexO', return in `val' the `lw'th long word | |
111 * from the entry labelled `label'. | |
112 * The result is 0 for failure, 1 for success. | |
113 */ | |
114 static int getIndexEntryLW(FILE *fp, off_t indexO, | |
115 uint_4 label, int lw, | |
116 uint_4 *val) { | |
117 off_t entryNum=-1; | |
118 int i; | |
119 uint_4 entryLabel; | |
120 | |
121 do { | |
122 entryNum++; | |
123 if (fseek(fp, indexO+(entryNum*IndexEntryLength), 0) != 0) | |
124 return 0; | |
125 | |
126 if (!be_read_int_4(fp, &entryLabel)) | |
127 return 0; | |
128 } while (!(entryLabel == label)); | |
129 | |
130 for(i=2; i<lw; i++) | |
131 if (!be_read_int_4(fp, val)) | |
132 return 0; | |
133 | |
134 | |
135 /* when i = lw read in the 4 bytes backwards */ | |
136 if (!le_read_int_4(fp,val)) | |
137 return 0; | |
138 | |
139 return 1; | |
140 } | |
141 | |
142 /* | |
143 * From the ALF results file connected to `fp' whose index starts | |
144 * at byte offset `indexO', return in `val' the `lw'th word (int2) | |
145 * from the entry labelled `label'. | |
146 * The result is 0 for failure, 1 for success. | |
147 */ | |
148 static int getIndexEntryW(FILE *fp, off_t indexO, | |
149 uint_4 label, int lw, | |
150 uint_2 *val) { | |
151 off_t entryNum=-1; | |
152 int i; | |
153 uint_4 entryLabel; | |
154 uint_4 jval; | |
155 | |
156 do { | |
157 entryNum++; | |
158 if (fseek(fp, indexO+(entryNum*IndexEntryLength), 0) != 0) | |
159 return 0; | |
160 | |
161 if (!be_read_int_4(fp, &entryLabel)) | |
162 return 0; | |
163 } while (!(entryLabel == label)); | |
164 | |
165 | |
166 for(i=2; i<lw; i++) | |
167 if (!be_read_int_4(fp, &jval)) | |
168 return 0; | |
169 | |
170 if (!le_read_int_2(fp, val)) | |
171 return 0; | |
172 | |
173 return 1; | |
174 } | |
175 | |
176 | |
177 /* ---- Exports ---- */ | |
178 | |
179 | |
180 /* | |
181 * Read the ALF format sequence from FILE *fp into a Read structure. | |
182 * All printing characters (as defined by ANSII C `isprint') | |
183 * are accepted, but `N's are translated to `-'s. In this respect we | |
184 * are adhering (more or less) to the CSET_DEFAULT uncertainty code set. | |
185 * | |
186 * Returns: | |
187 * Read * - Success, the Read structure read. | |
188 * NULLRead - Failure. | |
189 */ | |
190 Read *fread_alf(FILE *fp) { | |
191 Read *read = NULLRead; | |
192 int i; | |
193 int numPoints; | |
194 int sections = read_sections(0); | |
195 | |
196 uint_4 data_size; | |
197 uint_4 dataO; | |
198 uint_4 header_size=396; /* size of the header of the processed data | |
199 section */ | |
200 uint_2 actBaseDataSize; /* actual number of bytes of data of information | |
201 containing the base and basePos information */ | |
202 int num_points; /* keeps track of the actual number of points, | |
203 rather than the early guess of numPoints */ | |
204 | |
205 off_t indexO; /* File offset where the index is */ | |
206 uint_4 baseO; /* File offset where the bases are stored */ | |
207 | |
208 | |
209 /* | |
210 * RMD lots of changes below here until end of data reading section | |
211 * Some are cosmetic. | |
212 * getIndexEntry calls in front of where they were needed, and made | |
213 * There is a substantive change to the inner loop of the sequence | |
214 * reading section. This now uses fscanf - much less rigid than the | |
215 * previous scheme. Note that it reads bp as a float. This is because | |
216 * it is a float in multiple trace data files! (bizarre Pharmacia | |
217 * programming!). | |
218 */ | |
219 | |
220 | |
221 /************************************************************* | |
222 * Read the various file offsets | |
223 *************************************************************/ | |
224 | |
225 /* indexO is the offset of the index. | |
226 * Or I could look for the first label, starting 'ALF' | |
227 * if I used 512 then none of the entries are on long | |
228 * word boundaries | |
229 */ | |
230 indexO = 522; | |
231 | |
232 /* offset in file of first base of sequence */ | |
233 if (! (getIndexEntryLW(fp,indexO,BaseEntryLabel,12,&baseO)) ) | |
234 goto bail_out; | |
235 | |
236 /* actual size of region containing this data */ | |
237 if (! (getIndexEntryW(fp,indexO,BaseEntryLabel,10,&actBaseDataSize)) ) | |
238 goto bail_out; | |
239 | |
240 /* Look for Processed data first. If we fail to find it, then look for | |
241 * the Raw data (same format). | |
242 */ | |
243 | |
244 /* offset in file to start of processed data segment - there | |
245 * is then a header of size header_size (currently 396) | |
246 */ | |
247 if (! (getIndexEntryLW(fp,indexO,DataEntryLabel,12,&dataO)) ) { | |
248 if (! (getIndexEntryLW(fp,indexO,RawDataEntryLabel,12,&dataO)) ) | |
249 goto bail_out; | |
250 | |
251 /* actual size of region containing this data */ | |
252 if (! (getIndexEntryLW(fp,indexO,RawDataEntryLabel,10,&data_size)) ) | |
253 goto bail_out; | |
254 } else { | |
255 /* actual size of region containing this data */ | |
256 if (! (getIndexEntryLW(fp,indexO,DataEntryLabel,10,&data_size)) ) | |
257 goto bail_out; | |
258 } | |
259 | |
260 /* Because each trace value is stored in a 2 byte | |
261 * integer, thus to store A C G T information | |
262 * it takes 8 bytes. So subtract off the header and | |
263 * divide by 8 | |
264 */ | |
265 numPoints = (int)((data_size - header_size)/ 8); | |
266 | |
267 /* Allocate the sequence */ | |
268 if (NULLRead == (read = read_allocate(numPoints, BASELIMIT))) | |
269 goto bail_out; | |
270 | |
271 /************************************************************* | |
272 * Read the bases information | |
273 *************************************************************/ | |
274 if (sections & READ_BASES) { | |
275 /* new locals introduced by LFW and/or RMD for the ALF */ | |
276 int numBases; /* number of nucleotides read in */ | |
277 float bp; | |
278 char ch; | |
279 | |
280 if (!(fseek(fp, (off_t)baseO, 0) == 0)) | |
281 goto bail_out; | |
282 | |
283 for (numBases = 0; (unsigned)ftell(fp) < baseO+(unsigned short)actBaseDataSize | |
284 && numBases<BASELIMIT;) { | |
285 char line[200]; | |
286 | |
287 fgets(line, (int)sizeof(line), fp); | |
288 sscanf(line, "%c %*d %f", &ch, &bp); | |
289 | |
290 /* we convert ch to Staden format here */ | |
291 switch (ch) { | |
292 case 'A': | |
293 case 'C': | |
294 case 'G': | |
295 case 'T': | |
296 break; | |
297 default: | |
298 ch = '-'; | |
299 /* | |
300 if (isupper(ch)) | |
301 ch = '-'; | |
302 else | |
303 ch = '\0'; | |
304 */ | |
305 } | |
306 | |
307 if (ch) { | |
308 read->base[numBases] = ch; | |
309 read->prob_A[numBases] = 0; | |
310 read->prob_C[numBases] = 0; | |
311 read->prob_G[numBases] = 0; | |
312 read->prob_T[numBases] = 0; | |
313 read->basePos[numBases] = bp; | |
314 ++numBases; | |
315 } | |
316 } | |
317 read->base[numBases] = 0; | |
318 | |
319 read->NBases = numBases; | |
320 } | |
321 | |
322 /************************************************************* | |
323 * Read the trace information | |
324 *************************************************************/ | |
325 | |
326 if (sections & READ_SAMPLES) { | |
327 | |
328 /* | |
329 * Traces are stored as 2 byte integers in records in the order of | |
330 * A C G T A C G T ... | |
331 */ | |
332 | |
333 if (fseek(fp, (off_t)(dataO+header_size), 0) != 0) | |
334 goto bail_out; | |
335 | |
336 num_points = 0; | |
337 | |
338 for (i=0; i < read->NPoints; i++) { | |
339 if (!le_read_int_2(fp, &(read->traceA[i]))) | |
340 goto bail_out; | |
341 if (read->maxTraceVal < read->traceA[i]) | |
342 read->maxTraceVal = read->traceA[i]; | |
343 | |
344 if (!le_read_int_2(fp, &(read->traceC[i]))) | |
345 goto bail_out; | |
346 if (read->maxTraceVal < read->traceC[i]) | |
347 read->maxTraceVal = read->traceC[i]; | |
348 | |
349 if (!le_read_int_2(fp, &(read->traceG[i]))) | |
350 goto bail_out; | |
351 if (read->maxTraceVal < read->traceG[i]) | |
352 read->maxTraceVal = read->traceG[i]; | |
353 | |
354 if (!le_read_int_2(fp, &(read->traceT[i]))) | |
355 goto bail_out; | |
356 if (read->maxTraceVal < read->traceT[i]) | |
357 read->maxTraceVal = read->traceT[i]; | |
358 | |
359 if (read->traceA[i]==0 && read->traceT[i]==0 && | |
360 read->traceC[i]==0 && read->traceG[i]==0 && | |
361 i > (numPoints-64)) | |
362 break; | |
363 | |
364 num_points++; | |
365 } | |
366 } | |
367 | |
368 /* SUCCESS */ | |
369 | |
370 read->format = TT_ALF; | |
371 return(read); | |
372 | |
373 /* FAILURE */ | |
374 bail_out: | |
375 if (read) | |
376 read_deallocate(read); | |
377 | |
378 return NULLRead; | |
379 } | |
380 | |
381 /* | |
382 * Read the ALF format sequence with name `fn' into a Read structure. | |
383 * All printing characters (as defined by ANSII C `isprint') | |
384 * are accepted, but `N's are translated to `-'s. In this respect we | |
385 * are adhering (more or less) to the CSET_DEFAULT uncertainty code set. | |
386 * | |
387 * Returns: | |
388 * Read * - Success, the Read structure read. | |
389 * NULLRead - Failure. | |
390 */ | |
391 Read *read_alf(char *fn) { | |
392 FILE *fp; | |
393 Read *read; | |
394 | |
395 /* Open file */ | |
396 if ((fp = fopen(fn, "rb")) == NULL) | |
397 return NULLRead; | |
398 | |
399 read = fread_alf(fp); | |
400 fclose(fp); | |
401 | |
402 if (read && (read->trace_name = (char *)xmalloc(strlen(fn)+1))) | |
403 strcpy(read->trace_name, fn); | |
404 | |
405 return read; | |
406 } | |
407 | |
408 /* | |
409 * Write to an ALF file - unsupported. | |
410 */ | |
411 /* ARGSUSED */ | |
412 int write_alf(char *fn, Read *read) { | |
413 fprintf(stderr, "ALF write support is unavailable\n"); | |
414 return -1; | |
415 } | |
416 | |
417 /* | |
418 * Write to an ALF file - unsupported. | |
419 */ | |
420 /* ARGSUSED */ | |
421 int fwrite_alf(FILE *fp, Read *read) { | |
422 fprintf(stderr, "ALF write support is unavailable\n"); | |
423 return -1; | |
424 } |