comparison srf2fastq/io_lib-1.12.2/io_lib/seqIOALF.c @ 0:d901c9f41a6a default tip

Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author dawe
date Tue, 07 Jun 2011 17:48:05 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d901c9f41a6a
1 /*
2 * Copyright (c) Medical Research Council 1994. All rights reserved.
3 *
4 * Permission to use, copy, modify and distribute this software and its
5 * documentation for any purpose is hereby granted without fee, provided that
6 * this copyright and notice appears in all copies.
7 *
8 * This file was written by James Bonfield, Simon Dear, Rodger Staden,
9 * as part of the Staden Package at the MRC Laboratory of Molecular
10 * Biology, Hills Road, Cambridge, CB2 2QH, United Kingdom.
11 *
12 * MRC disclaims all warranties with regard to this software.
13 */
14
15 /*
16 * Title: seqIOALF
17 *
18 * File: seqIOALF.c
19 * Purpose: IO of ALF sequences
20 * Last update: 9th September 1994
21 */
22
23 /*
24 * Change Log :-
25 * 14.01.91 SD
26 * when complimenting the sequence with an odd number of bases,
27 * the middle base position was not adjusted.
28 * 15.01.91 SD Put StLouis stuff on compilation flag
29 * 15.01.91 SD New include file (opp.h)
30 * 02.08.91 SD Changes the mapping of uncertainty codes so that we
31 * now only generate A C G T and -
32 * Previously... bug in interpreting ALF integer fields.
33 * We now treat them as unsigned.
34 * 17.09.91 LFW changed STLOUIS compilation flag to SAVE_EDITS
35 * and AUTO_CLIP
36 * 25.10.91 SD Machine independant I/O...removed BIGENDIAN flag
37 * 25.11.91 SD There was a hard limit (of 1024) for allocation of
38 * space for number of bases, yet program would
39 * read in more if there were any, causing nasties to happen.
40 *
41 * 11.11.92 LFW added section to actually check that the trace it
42 * is trying to open is an ALF file using traceType sub
43 *
44 * 10.11.92 SD SCF comments now stored in seq data structure
45 * 09.09.94 JKB Update to use Read instead of Seq library.
46 * 04.03.98 JKB Look for "Raw data" when "Processed data" is not found.
47 */
48
49 /* RMD I made substantial changes to this file 12/28/90 so as to
50 * read sequence data more freely (necessary when reading data from
51 * multiple trace files).
52 * The affected area is indicated by comments starting RMD, like
53 * this one.
54 */
55
56 /* This file was adapted by LFW from seqIOABI.c.
57 * The ALF results file is a concatenation of many files with an
58 * index structure at the beginning, consisting of a 512 byte
59 * block that we ignore, followed by 128 byte blocks describing
60 * each file. All files, including the header region, are rounded
61 * up to a multiple of 512 bytes long.
62 * The getIndexEntry routines identify the 128 byte index component
63 * of interest by matching 4 chars of its ASCII label, then extract
64 * the field of choice from that entry.
65 *
66 * Note that the SUN and PC are of opposite endian-ness, so that
67 * we have to provide special routines to read words and longwords
68 * from the results file. Luckily the floating point numbers are
69 * written out in ASCII.
70 */
71
72
73 /* ---- Imports ---- */
74
75
76 #include <ctype.h>
77 #include <stdio.h>
78 #include <string.h>
79
80 #include "io_lib/stdio_hack.h"
81
82 #include "io_lib/Read.h"
83 #include "io_lib/mach-io.h"
84 #include "io_lib/xalloc.h"
85
86 /* ---- Constants ---- */
87
88 #define BasesPerLine 50 /* For output formatting */
89
90 #define IndexEntryLength ((off_t)128)
91
92
93 /*
94 * Here are some labels we will be looking for, four chars packed
95 * into a long word.
96 */
97 #define EntryLabel ((uint_4) ((((('A'<<8)+'L')<<8)+'F')<<8)+' ')
98 #define BaseEntryLabel ((uint_4) ((((('S'<<8)+'e')<<8)+'q')<<8)+'u')
99 #define DataEntryLabel ((uint_4) ((((('P'<<8)+'r')<<8)+'o')<<8)+'c')
100 #define RawDataEntryLabel ((uint_4) ((((('R'<<8)+'a')<<8)+'w')<<8)+' ')
101
102 /* RMD make enough space for bases - hard limit */
103 #define BASELIMIT 4096
104
105
106 /* ---- Internal functions ---- */
107
108 /*
109 * From the ALF results file connected to `fp' whose index starts
110 * at byte offset `indexO', return in `val' the `lw'th long word
111 * from the entry labelled `label'.
112 * The result is 0 for failure, 1 for success.
113 */
114 static int getIndexEntryLW(FILE *fp, off_t indexO,
115 uint_4 label, int lw,
116 uint_4 *val) {
117 off_t entryNum=-1;
118 int i;
119 uint_4 entryLabel;
120
121 do {
122 entryNum++;
123 if (fseek(fp, indexO+(entryNum*IndexEntryLength), 0) != 0)
124 return 0;
125
126 if (!be_read_int_4(fp, &entryLabel))
127 return 0;
128 } while (!(entryLabel == label));
129
130 for(i=2; i<lw; i++)
131 if (!be_read_int_4(fp, val))
132 return 0;
133
134
135 /* when i = lw read in the 4 bytes backwards */
136 if (!le_read_int_4(fp,val))
137 return 0;
138
139 return 1;
140 }
141
142 /*
143 * From the ALF results file connected to `fp' whose index starts
144 * at byte offset `indexO', return in `val' the `lw'th word (int2)
145 * from the entry labelled `label'.
146 * The result is 0 for failure, 1 for success.
147 */
148 static int getIndexEntryW(FILE *fp, off_t indexO,
149 uint_4 label, int lw,
150 uint_2 *val) {
151 off_t entryNum=-1;
152 int i;
153 uint_4 entryLabel;
154 uint_4 jval;
155
156 do {
157 entryNum++;
158 if (fseek(fp, indexO+(entryNum*IndexEntryLength), 0) != 0)
159 return 0;
160
161 if (!be_read_int_4(fp, &entryLabel))
162 return 0;
163 } while (!(entryLabel == label));
164
165
166 for(i=2; i<lw; i++)
167 if (!be_read_int_4(fp, &jval))
168 return 0;
169
170 if (!le_read_int_2(fp, val))
171 return 0;
172
173 return 1;
174 }
175
176
177 /* ---- Exports ---- */
178
179
180 /*
181 * Read the ALF format sequence from FILE *fp into a Read structure.
182 * All printing characters (as defined by ANSII C `isprint')
183 * are accepted, but `N's are translated to `-'s. In this respect we
184 * are adhering (more or less) to the CSET_DEFAULT uncertainty code set.
185 *
186 * Returns:
187 * Read * - Success, the Read structure read.
188 * NULLRead - Failure.
189 */
190 Read *fread_alf(FILE *fp) {
191 Read *read = NULLRead;
192 int i;
193 int numPoints;
194 int sections = read_sections(0);
195
196 uint_4 data_size;
197 uint_4 dataO;
198 uint_4 header_size=396; /* size of the header of the processed data
199 section */
200 uint_2 actBaseDataSize; /* actual number of bytes of data of information
201 containing the base and basePos information */
202 int num_points; /* keeps track of the actual number of points,
203 rather than the early guess of numPoints */
204
205 off_t indexO; /* File offset where the index is */
206 uint_4 baseO; /* File offset where the bases are stored */
207
208
209 /*
210 * RMD lots of changes below here until end of data reading section
211 * Some are cosmetic.
212 * getIndexEntry calls in front of where they were needed, and made
213 * There is a substantive change to the inner loop of the sequence
214 * reading section. This now uses fscanf - much less rigid than the
215 * previous scheme. Note that it reads bp as a float. This is because
216 * it is a float in multiple trace data files! (bizarre Pharmacia
217 * programming!).
218 */
219
220
221 /*************************************************************
222 * Read the various file offsets
223 *************************************************************/
224
225 /* indexO is the offset of the index.
226 * Or I could look for the first label, starting 'ALF'
227 * if I used 512 then none of the entries are on long
228 * word boundaries
229 */
230 indexO = 522;
231
232 /* offset in file of first base of sequence */
233 if (! (getIndexEntryLW(fp,indexO,BaseEntryLabel,12,&baseO)) )
234 goto bail_out;
235
236 /* actual size of region containing this data */
237 if (! (getIndexEntryW(fp,indexO,BaseEntryLabel,10,&actBaseDataSize)) )
238 goto bail_out;
239
240 /* Look for Processed data first. If we fail to find it, then look for
241 * the Raw data (same format).
242 */
243
244 /* offset in file to start of processed data segment - there
245 * is then a header of size header_size (currently 396)
246 */
247 if (! (getIndexEntryLW(fp,indexO,DataEntryLabel,12,&dataO)) ) {
248 if (! (getIndexEntryLW(fp,indexO,RawDataEntryLabel,12,&dataO)) )
249 goto bail_out;
250
251 /* actual size of region containing this data */
252 if (! (getIndexEntryLW(fp,indexO,RawDataEntryLabel,10,&data_size)) )
253 goto bail_out;
254 } else {
255 /* actual size of region containing this data */
256 if (! (getIndexEntryLW(fp,indexO,DataEntryLabel,10,&data_size)) )
257 goto bail_out;
258 }
259
260 /* Because each trace value is stored in a 2 byte
261 * integer, thus to store A C G T information
262 * it takes 8 bytes. So subtract off the header and
263 * divide by 8
264 */
265 numPoints = (int)((data_size - header_size)/ 8);
266
267 /* Allocate the sequence */
268 if (NULLRead == (read = read_allocate(numPoints, BASELIMIT)))
269 goto bail_out;
270
271 /*************************************************************
272 * Read the bases information
273 *************************************************************/
274 if (sections & READ_BASES) {
275 /* new locals introduced by LFW and/or RMD for the ALF */
276 int numBases; /* number of nucleotides read in */
277 float bp;
278 char ch;
279
280 if (!(fseek(fp, (off_t)baseO, 0) == 0))
281 goto bail_out;
282
283 for (numBases = 0; (unsigned)ftell(fp) < baseO+(unsigned short)actBaseDataSize
284 && numBases<BASELIMIT;) {
285 char line[200];
286
287 fgets(line, (int)sizeof(line), fp);
288 sscanf(line, "%c %*d %f", &ch, &bp);
289
290 /* we convert ch to Staden format here */
291 switch (ch) {
292 case 'A':
293 case 'C':
294 case 'G':
295 case 'T':
296 break;
297 default:
298 ch = '-';
299 /*
300 if (isupper(ch))
301 ch = '-';
302 else
303 ch = '\0';
304 */
305 }
306
307 if (ch) {
308 read->base[numBases] = ch;
309 read->prob_A[numBases] = 0;
310 read->prob_C[numBases] = 0;
311 read->prob_G[numBases] = 0;
312 read->prob_T[numBases] = 0;
313 read->basePos[numBases] = bp;
314 ++numBases;
315 }
316 }
317 read->base[numBases] = 0;
318
319 read->NBases = numBases;
320 }
321
322 /*************************************************************
323 * Read the trace information
324 *************************************************************/
325
326 if (sections & READ_SAMPLES) {
327
328 /*
329 * Traces are stored as 2 byte integers in records in the order of
330 * A C G T A C G T ...
331 */
332
333 if (fseek(fp, (off_t)(dataO+header_size), 0) != 0)
334 goto bail_out;
335
336 num_points = 0;
337
338 for (i=0; i < read->NPoints; i++) {
339 if (!le_read_int_2(fp, &(read->traceA[i])))
340 goto bail_out;
341 if (read->maxTraceVal < read->traceA[i])
342 read->maxTraceVal = read->traceA[i];
343
344 if (!le_read_int_2(fp, &(read->traceC[i])))
345 goto bail_out;
346 if (read->maxTraceVal < read->traceC[i])
347 read->maxTraceVal = read->traceC[i];
348
349 if (!le_read_int_2(fp, &(read->traceG[i])))
350 goto bail_out;
351 if (read->maxTraceVal < read->traceG[i])
352 read->maxTraceVal = read->traceG[i];
353
354 if (!le_read_int_2(fp, &(read->traceT[i])))
355 goto bail_out;
356 if (read->maxTraceVal < read->traceT[i])
357 read->maxTraceVal = read->traceT[i];
358
359 if (read->traceA[i]==0 && read->traceT[i]==0 &&
360 read->traceC[i]==0 && read->traceG[i]==0 &&
361 i > (numPoints-64))
362 break;
363
364 num_points++;
365 }
366 }
367
368 /* SUCCESS */
369
370 read->format = TT_ALF;
371 return(read);
372
373 /* FAILURE */
374 bail_out:
375 if (read)
376 read_deallocate(read);
377
378 return NULLRead;
379 }
380
381 /*
382 * Read the ALF format sequence with name `fn' into a Read structure.
383 * All printing characters (as defined by ANSII C `isprint')
384 * are accepted, but `N's are translated to `-'s. In this respect we
385 * are adhering (more or less) to the CSET_DEFAULT uncertainty code set.
386 *
387 * Returns:
388 * Read * - Success, the Read structure read.
389 * NULLRead - Failure.
390 */
391 Read *read_alf(char *fn) {
392 FILE *fp;
393 Read *read;
394
395 /* Open file */
396 if ((fp = fopen(fn, "rb")) == NULL)
397 return NULLRead;
398
399 read = fread_alf(fp);
400 fclose(fp);
401
402 if (read && (read->trace_name = (char *)xmalloc(strlen(fn)+1)))
403 strcpy(read->trace_name, fn);
404
405 return read;
406 }
407
408 /*
409 * Write to an ALF file - unsupported.
410 */
411 /* ARGSUSED */
412 int write_alf(char *fn, Read *read) {
413 fprintf(stderr, "ALF write support is unavailable\n");
414 return -1;
415 }
416
417 /*
418 * Write to an ALF file - unsupported.
419 */
420 /* ARGSUSED */
421 int fwrite_alf(FILE *fp, Read *read) {
422 fprintf(stderr, "ALF write support is unavailable\n");
423 return -1;
424 }