1
|
1 /*****************************************************************
|
|
2 * SQUID - a library of functions for biological sequence analysis
|
|
3 * Copyright (C) 1992-2002 Washington University School of Medicine
|
|
4 *
|
|
5 * This source code is freely distributed under the terms of the
|
|
6 * GNU General Public License. See the files COPYRIGHT and LICENSE
|
|
7 * for details.
|
|
8 *****************************************************************/
|
|
9
|
|
10 /* stockholm.c
|
|
11 * SRE, Fri May 28 15:46:41 1999
|
|
12 *
|
|
13 * Reading/writing of Stockholm format multiple sequence alignments.
|
|
14 *
|
|
15 * example of API:
|
|
16 *
|
|
17 * MSA *msa;
|
|
18 * FILE *fp; -- opened for write with fopen()
|
|
19 * MSAFILE *afp; -- opened for read with MSAFileOpen()
|
|
20 *
|
|
21 * while ((msa = ReadStockholm(afp)) != NULL)
|
|
22 * {
|
|
23 * WriteStockholm(fp, msa);
|
|
24 * MSAFree(msa);
|
|
25 * }
|
|
26 *
|
|
27 * RCS $Id: stockholm.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: stockholm.c,v 1.7 2002/10/12 04:40:36 eddy Exp)
|
|
28 */
|
|
29 #include <stdio.h>
|
|
30 #include <string.h>
|
|
31 #include "squid.h"
|
|
32 #include "msa.h"
|
|
33
|
|
34 static int parse_gf(MSA *msa, char *buf);
|
|
35 static int parse_gs(MSA *msa, char *buf);
|
|
36 static int parse_gc(MSA *msa, char *buf);
|
|
37 static int parse_gr(MSA *msa, char *buf);
|
|
38 static int parse_comment(MSA *msa, char *buf);
|
|
39 static int parse_sequence(MSA *msa, char *buf);
|
|
40 static void actually_write_stockholm(FILE *fp, MSA *msa, int cpl);
|
|
41
|
|
42 #ifdef TESTDRIVE_STOCKHOLM
|
|
43 /*****************************************************************
|
|
44 * stockholm.c test driver:
|
|
45 * cc -DTESTDRIVE_STOCKHOLM -g -O2 -Wall -o test stockholm.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c -lm
|
|
46 *
|
|
47 */
|
|
48 int
|
|
49 main(int argc, char **argv)
|
|
50 {
|
|
51 MSAFILE *afp;
|
|
52 MSA *msa;
|
|
53 char *file;
|
|
54
|
|
55 file = argv[1];
|
|
56
|
|
57 if ((afp = MSAFileOpen(file, MSAFILE_STOCKHOLM, NULL)) == NULL)
|
|
58 Die("Couldn't open %s\n", file);
|
|
59
|
|
60 while ((msa = ReadStockholm(afp)) != NULL)
|
|
61 {
|
|
62 WriteStockholm(stdout, msa);
|
|
63 MSAFree(msa);
|
|
64 }
|
|
65
|
|
66 MSAFileClose(afp);
|
|
67 exit(0);
|
|
68 }
|
|
69 /******************************************************************/
|
|
70 #endif /* testdriver */
|
|
71
|
|
72
|
|
73 /* Function: ReadStockholm()
|
|
74 * Date: SRE, Fri May 21 17:33:10 1999 [St. Louis]
|
|
75 *
|
|
76 * Purpose: Parse the next alignment from an open Stockholm
|
|
77 * format alignment file. Return the alignment, or
|
|
78 * NULL if there are no more alignments in the file.
|
|
79 *
|
|
80 * Args: afp - open alignment file
|
|
81 *
|
|
82 * Returns: MSA * - an alignment object.
|
|
83 * caller responsible for an MSAFree()
|
|
84 * NULL if no more alignments
|
|
85 *
|
|
86 * Diagnostics:
|
|
87 * Will Die() here with a (potentially) useful message
|
|
88 * if a parsing error occurs
|
|
89 */
|
|
90 MSA *
|
|
91 ReadStockholm(MSAFILE *afp)
|
|
92 {
|
|
93 MSA *msa;
|
|
94 char *s;
|
|
95 int status;
|
|
96
|
|
97 if (feof(afp->f)) return NULL;
|
|
98
|
|
99 /* Initialize allocation of the MSA.
|
|
100 */
|
|
101 msa = MSAAlloc(10, 0);
|
|
102
|
|
103 /* Check the magic Stockholm header line.
|
|
104 * We have to skip blank lines here, else we perceive
|
|
105 * trailing blank lines in a file as a format error when
|
|
106 * reading in multi-record mode.
|
|
107 */
|
|
108 do {
|
|
109 if ((s = MSAFileGetLine(afp)) == NULL) {
|
|
110 MSAFree(msa);
|
|
111 return NULL;
|
|
112 }
|
|
113 } while (IsBlankline(s));
|
|
114
|
|
115 if (strncmp(s, "# STOCKHOLM 1.", 14) != 0)
|
|
116 Die("\
|
|
117 File %s doesn't appear to be in Stockholm format.\n\
|
|
118 Assuming there isn't some other problem with your file (it is an\n\
|
|
119 alignment file, right?), please either:\n\
|
|
120 a) use the Babelfish format autotranslator option (-B, usually);\n\
|
|
121 b) specify the file's format with the --informat option; or\n\
|
|
122 a) reformat the alignment to Stockholm format.\n",
|
|
123 afp->fname);
|
|
124
|
|
125 /* Read the alignment file one line at a time.
|
|
126 */
|
|
127 while ((s = MSAFileGetLine(afp)) != NULL)
|
|
128 {
|
|
129 while (*s == ' ' || *s == '\t') s++; /* skip leading whitespace */
|
|
130
|
|
131 if (*s == '#') {
|
|
132 if (strncmp(s, "#=GF", 4) == 0) status = parse_gf(msa, s);
|
|
133 else if (strncmp(s, "#=GS", 4) == 0) status = parse_gs(msa, s);
|
|
134 else if (strncmp(s, "#=GC", 4) == 0) status = parse_gc(msa, s);
|
|
135 else if (strncmp(s, "#=GR", 4) == 0) status = parse_gr(msa, s);
|
|
136 else status = parse_comment(msa, s);
|
|
137 }
|
|
138 else if (strncmp(s, "//", 2) == 0) break;
|
|
139 else if (*s == '\n') continue;
|
|
140 else status = parse_sequence(msa, s);
|
|
141
|
|
142 if (status == 0)
|
|
143 Die("Stockholm format parse error: line %d of file %s while reading alignment %s",
|
|
144 afp->linenumber, afp->fname, msa->name == NULL? "" : msa->name);
|
|
145 }
|
|
146
|
|
147 if (s == NULL && msa->nseq != 0)
|
|
148 Die ("Didn't find // at end of alignment %s", msa->name == NULL ? "" : msa->name);
|
|
149
|
|
150 if (s == NULL && msa->nseq == 0) {
|
|
151 /* probably just some junk at end of file */
|
|
152 MSAFree(msa);
|
|
153 return NULL;
|
|
154 }
|
|
155
|
|
156 MSAVerifyParse(msa);
|
|
157 return msa;
|
|
158 }
|
|
159
|
|
160
|
|
161 /* Function: WriteStockholm()
|
|
162 * Date: SRE, Mon May 31 19:15:22 1999 [St. Louis]
|
|
163 *
|
|
164 * Purpose: Write an alignment in standard multi-block
|
|
165 * Stockholm format to an open file. A wrapper
|
|
166 * for actually_write_stockholm().
|
|
167 *
|
|
168 * Args: fp - file that's open for writing
|
|
169 * msa - alignment to write
|
|
170 *
|
|
171 * Returns: (void)
|
|
172 */
|
|
173 void
|
|
174 WriteStockholm(FILE *fp, MSA *msa)
|
|
175 {
|
|
176 actually_write_stockholm(fp, msa, 50); /* 50 char per block */
|
|
177 }
|
|
178
|
|
179 /* Function: WriteStockholmOneBlock()
|
|
180 * Date: SRE, Mon May 31 19:15:22 1999 [St. Louis]
|
|
181 *
|
|
182 * Purpose: Write an alignment in Pfam's single-block
|
|
183 * Stockholm format to an open file. A wrapper
|
|
184 * for actually_write_stockholm().
|
|
185 *
|
|
186 * Args: fp - file that's open for writing
|
|
187 * msa - alignment to write
|
|
188 *
|
|
189 * Returns: (void)
|
|
190 */
|
|
191 void
|
|
192 WriteStockholmOneBlock(FILE *fp, MSA *msa)
|
|
193 {
|
|
194 actually_write_stockholm(fp, msa, msa->alen); /* one big block */
|
|
195 }
|
|
196
|
|
197
|
|
198 /* Function: actually_write_stockholm()
|
|
199 * Date: SRE, Fri May 21 17:39:22 1999 [St. Louis]
|
|
200 *
|
|
201 * Purpose: Write an alignment in Stockholm format to
|
|
202 * an open file. This is the function that actually
|
|
203 * does the work. The API's WriteStockholm()
|
|
204 * and WriteStockholmOneBlock() are wrappers.
|
|
205 *
|
|
206 * Args: fp - file that's open for writing
|
|
207 * msa - alignment to write
|
|
208 * cpl - characters to write per line in alignment block
|
|
209 *
|
|
210 * Returns: (void)
|
|
211 */
|
|
212 static void
|
|
213 actually_write_stockholm(FILE *fp, MSA *msa, int cpl)
|
|
214 {
|
|
215 int i, j;
|
|
216 int len = 0;
|
|
217 int namewidth;
|
|
218 int typewidth = 0; /* markup tags are up to 5 chars long */
|
|
219 int markupwidth = 0; /* #=GR, #=GC are four char wide + 1 space */
|
|
220 char *buf;
|
|
221 int currpos;
|
|
222 char *s, *tok;
|
|
223
|
|
224 /* Figure out how much space we need for name + markup
|
|
225 * to keep the alignment in register. Required by Stockholm
|
|
226 * spec, even though our Stockholm parser doesn't care (Erik's does).
|
|
227 */
|
|
228 namewidth = 0;
|
|
229 for (i = 0; i < msa->nseq; i++)
|
|
230 if ((len = strlen(msa->sqname[i])) > namewidth)
|
|
231 namewidth = len;
|
|
232
|
|
233 /* Figure out how much space we need for markup tags
|
|
234 * markupwidth = always 4 if we're doing markup: strlen("#=GR")
|
|
235 * typewidth = longest markup tag
|
|
236 */
|
|
237 if (msa->ss != NULL) { markupwidth = 4; typewidth = 2; }
|
|
238 if (msa->sa != NULL) { markupwidth = 4; typewidth = 2; }
|
|
239 for (i = 0; i < msa->ngr; i++)
|
|
240 if ((len = strlen(msa->gr_tag[i])) > typewidth) typewidth = len;
|
|
241
|
|
242 if (msa->rf != NULL) { markupwidth = 4; if (typewidth < 2) typewidth = 2; }
|
|
243 if (msa->ss_cons != NULL) { markupwidth = 4; if (typewidth < 7) typewidth = 7; }
|
|
244 if (msa->sa_cons != NULL) { markupwidth = 4; if (typewidth < 7) typewidth = 7; }
|
|
245 for (i = 0; i < msa->ngc; i++)
|
|
246 if ((len = strlen(msa->gc_tag[i])) > typewidth) typewidth = len;
|
|
247
|
|
248 buf = MallocOrDie(sizeof(char) * (cpl+namewidth+typewidth+markupwidth+61));
|
|
249
|
|
250 /* Magic Stockholm header
|
|
251 */
|
|
252 fprintf(fp, "# STOCKHOLM 1.0\n");
|
|
253
|
|
254 /* Free text comments
|
|
255 */
|
|
256 for (i = 0; i < msa->ncomment; i++)
|
|
257 fprintf(fp, "# %s\n", msa->comment[i]);
|
|
258 if (msa->ncomment > 0) fprintf(fp, "\n");
|
|
259
|
|
260 /* GF section: per-file annotation
|
|
261 */
|
|
262 if (msa->name != NULL) fprintf(fp, "#=GF ID %s\n", msa->name);
|
|
263 if (msa->acc != NULL) fprintf(fp, "#=GF AC %s\n", msa->acc);
|
|
264 if (msa->desc != NULL) fprintf(fp, "#=GF DE %s\n", msa->desc);
|
|
265 if (msa->au != NULL) fprintf(fp, "#=GF AU %s\n", msa->au);
|
|
266
|
|
267 /* Thresholds are hacky. Pfam has two. Rfam has one.
|
|
268 */
|
|
269 if (msa->cutoff_is_set[MSA_CUTOFF_GA1] && msa->cutoff_is_set[MSA_CUTOFF_GA2])
|
|
270 fprintf(fp, "#=GF GA %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_GA1], msa->cutoff[MSA_CUTOFF_GA2]);
|
|
271 else if (msa->cutoff_is_set[MSA_CUTOFF_GA1])
|
|
272 fprintf(fp, "#=GF GA %.1f\n", msa->cutoff[MSA_CUTOFF_GA1]);
|
|
273 if (msa->cutoff_is_set[MSA_CUTOFF_NC1] && msa->cutoff_is_set[MSA_CUTOFF_NC2])
|
|
274 fprintf(fp, "#=GF NC %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_NC1], msa->cutoff[MSA_CUTOFF_NC2]);
|
|
275 else if (msa->cutoff_is_set[MSA_CUTOFF_NC1])
|
|
276 fprintf(fp, "#=GF NC %.1f\n", msa->cutoff[MSA_CUTOFF_NC1]);
|
|
277 if (msa->cutoff_is_set[MSA_CUTOFF_TC1] && msa->cutoff_is_set[MSA_CUTOFF_TC2])
|
|
278 fprintf(fp, "#=GF TC %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_TC1], msa->cutoff[MSA_CUTOFF_TC2]);
|
|
279 else if (msa->cutoff_is_set[MSA_CUTOFF_TC1])
|
|
280 fprintf(fp, "#=GF TC %.1f\n", msa->cutoff[MSA_CUTOFF_TC1]);
|
|
281
|
|
282 for (i = 0; i < msa->ngf; i++)
|
|
283 fprintf(fp, "#=GF %-5s %s\n", msa->gf_tag[i], msa->gf[i]);
|
|
284 fprintf(fp, "\n");
|
|
285
|
|
286
|
|
287 /* GS section: per-sequence annotation
|
|
288 */
|
|
289 if (msa->flags & MSA_SET_WGT)
|
|
290 {
|
|
291 for (i = 0; i < msa->nseq; i++)
|
|
292 fprintf(fp, "#=GS %-*.*s WT %.2f\n", namewidth, namewidth, msa->sqname[i], msa->wgt[i]);
|
|
293 fprintf(fp, "\n");
|
|
294 }
|
|
295 if (msa->sqacc != NULL)
|
|
296 {
|
|
297 for (i = 0; i < msa->nseq; i++)
|
|
298 if (msa->sqacc[i] != NULL)
|
|
299 fprintf(fp, "#=GS %-*.*s AC %s\n", namewidth, namewidth, msa->sqname[i], msa->sqacc[i]);
|
|
300 fprintf(fp, "\n");
|
|
301 }
|
|
302 if (msa->sqdesc != NULL)
|
|
303 {
|
|
304 for (i = 0; i < msa->nseq; i++)
|
|
305 if (msa->sqdesc[i] != NULL)
|
|
306 fprintf(fp, "#=GS %*.*s DE %s\n", namewidth, namewidth, msa->sqname[i], msa->sqdesc[i]);
|
|
307 fprintf(fp, "\n");
|
|
308 }
|
|
309 for (i = 0; i < msa->ngs; i++)
|
|
310 {
|
|
311 /* Multiannotated GS tags are possible; for example,
|
|
312 * #=GS foo DR PDB; 1xxx;
|
|
313 * #=GS foo DR PDB; 2yyy;
|
|
314 * These are stored, for example, as:
|
|
315 * msa->gs[0][0] = "PDB; 1xxx;\nPDB; 2yyy;"
|
|
316 * and must be decomposed.
|
|
317 */
|
|
318 for (j = 0; j < msa->nseq; j++)
|
|
319 if (msa->gs[i][j] != NULL)
|
|
320 {
|
|
321 s = msa->gs[i][j];
|
|
322 while ((tok = sre_strtok(&s, "\n", NULL)) != NULL)
|
|
323 fprintf(fp, "#=GS %*.*s %5s %s\n", namewidth, namewidth,
|
|
324 msa->sqname[j], msa->gs_tag[i], tok);
|
|
325 }
|
|
326 fprintf(fp, "\n");
|
|
327 }
|
|
328
|
|
329 /* Alignment section:
|
|
330 * contains aligned sequence, #=GR annotation, and #=GC annotation
|
|
331 */
|
|
332 for (currpos = 0; currpos < msa->alen; currpos += cpl)
|
|
333 {
|
|
334 if (currpos > 0) fprintf(fp, "\n");
|
|
335 for (i = 0; i < msa->nseq; i++)
|
|
336 {
|
|
337 strncpy(buf, msa->aseq[i] + currpos, cpl);
|
|
338 buf[cpl] = '\0';
|
|
339 fprintf(fp, "%-*.*s %s\n", namewidth+typewidth+markupwidth, namewidth+typewidth+markupwidth,
|
|
340 msa->sqname[i], buf);
|
|
341
|
|
342 if (msa->ss != NULL && msa->ss[i] != NULL) {
|
|
343 strncpy(buf, msa->ss[i] + currpos, cpl);
|
|
344 buf[cpl] = '\0';
|
|
345 fprintf(fp, "#=GR %-*.*s SS %s\n", namewidth, namewidth, msa->sqname[i], buf);
|
|
346 }
|
|
347 if (msa->sa != NULL && msa->sa[i] != NULL) {
|
|
348 strncpy(buf, msa->sa[i] + currpos, cpl);
|
|
349 buf[cpl] = '\0';
|
|
350 fprintf(fp, "#=GR %-*.*s SA %s\n", namewidth, namewidth, msa->sqname[i], buf);
|
|
351 }
|
|
352 for (j = 0; j < msa->ngr; j++)
|
|
353 if (msa->gr[j][i] != NULL) {
|
|
354 strncpy(buf, msa->gr[j][i] + currpos, cpl);
|
|
355 buf[cpl] = '\0';
|
|
356 fprintf(fp, "#=GR %-*.*s %5s %s\n",
|
|
357 namewidth, namewidth, msa->sqname[i], msa->gr_tag[j], buf);
|
|
358 }
|
|
359 }
|
|
360 if (msa->ss_cons != NULL) {
|
|
361 strncpy(buf, msa->ss_cons + currpos, cpl);
|
|
362 buf[cpl] = '\0';
|
|
363 fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "SS_cons", buf);
|
|
364 }
|
|
365
|
|
366 if (msa->sa_cons != NULL) {
|
|
367 strncpy(buf, msa->sa_cons + currpos, cpl);
|
|
368 buf[cpl] = '\0';
|
|
369 fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "SA_cons", buf);
|
|
370 }
|
|
371
|
|
372 if (msa->rf != NULL) {
|
|
373 strncpy(buf, msa->rf + currpos, cpl);
|
|
374 buf[cpl] = '\0';
|
|
375 fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "RF", buf);
|
|
376 }
|
|
377 for (j = 0; j < msa->ngc; j++) {
|
|
378 strncpy(buf, msa->gc[j] + currpos, cpl);
|
|
379 buf[cpl] = '\0';
|
|
380 fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth,
|
|
381 msa->gc_tag[j], buf);
|
|
382 }
|
|
383 }
|
|
384 fprintf(fp, "//\n");
|
|
385 free(buf);
|
|
386 }
|
|
387
|
|
388
|
|
389
|
|
390
|
|
391
|
|
392 /* Format of a GF line:
|
|
393 * #=GF <featurename> <text>
|
|
394 */
|
|
395 static int
|
|
396 parse_gf(MSA *msa, char *buf)
|
|
397 {
|
|
398 char *gf;
|
|
399 char *featurename;
|
|
400 char *text;
|
|
401 char *s;
|
|
402
|
|
403 s = buf;
|
|
404 if ((gf = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
|
|
405 if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
|
|
406 if ((text = sre_strtok(&s, "\n", NULL)) == NULL) return 0;
|
|
407 while (*text && (*text == ' ' || *text == '\t')) text++;
|
|
408
|
|
409 if (strcmp(featurename, "ID") == 0)
|
|
410 msa->name = sre_strdup(text, -1);
|
|
411 else if (strcmp(featurename, "AC") == 0)
|
|
412 msa->acc = sre_strdup(text, -1);
|
|
413 else if (strcmp(featurename, "DE") == 0)
|
|
414 msa->desc = sre_strdup(text, -1);
|
|
415 else if (strcmp(featurename, "AU") == 0)
|
|
416 msa->au = sre_strdup(text, -1);
|
|
417 else if (strcmp(featurename, "GA") == 0)
|
|
418 { /* Pfam has GA1, GA2. Rfam just has GA1. */
|
|
419 s = text;
|
|
420 if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
|
|
421 msa->cutoff[MSA_CUTOFF_GA1] = atof(text);
|
|
422 msa->cutoff_is_set[MSA_CUTOFF_GA1] = TRUE;
|
|
423 if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
|
|
424 msa->cutoff[MSA_CUTOFF_GA2] = atof(text);
|
|
425 msa->cutoff_is_set[MSA_CUTOFF_GA2] = TRUE;
|
|
426 }
|
|
427 }
|
|
428 else if (strcmp(featurename, "NC") == 0)
|
|
429 {
|
|
430 s = text;
|
|
431 if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
|
|
432 msa->cutoff[MSA_CUTOFF_NC1] = atof(text);
|
|
433 msa->cutoff_is_set[MSA_CUTOFF_NC1] = TRUE;
|
|
434 if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
|
|
435 msa->cutoff[MSA_CUTOFF_NC2] = atof(text);
|
|
436 msa->cutoff_is_set[MSA_CUTOFF_NC2] = TRUE;
|
|
437 }
|
|
438 }
|
|
439 else if (strcmp(featurename, "TC") == 0)
|
|
440 {
|
|
441 s = text;
|
|
442 if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
|
|
443 msa->cutoff[MSA_CUTOFF_TC1] = atof(text);
|
|
444 msa->cutoff_is_set[MSA_CUTOFF_TC1] = TRUE;
|
|
445 if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
|
|
446 msa->cutoff[MSA_CUTOFF_TC2] = atof(text);
|
|
447 msa->cutoff_is_set[MSA_CUTOFF_TC2] = TRUE;
|
|
448 }
|
|
449 }
|
|
450 else
|
|
451 MSAAddGF(msa, featurename, text);
|
|
452
|
|
453 return 1;
|
|
454 }
|
|
455
|
|
456
|
|
457 /* Format of a GS line:
|
|
458 * #=GS <seqname> <featurename> <text>
|
|
459 */
|
|
460 static int
|
|
461 parse_gs(MSA *msa, char *buf)
|
|
462 {
|
|
463 char *gs;
|
|
464 char *seqname;
|
|
465 char *featurename;
|
|
466 char *text;
|
|
467 int seqidx;
|
|
468 char *s;
|
|
469
|
|
470 s = buf;
|
|
471 if ((gs = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
|
|
472 if ((seqname = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
|
|
473 if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
|
|
474 if ((text = sre_strtok(&s, "\n", NULL)) == NULL) return 0;
|
|
475 while (*text && (*text == ' ' || *text == '\t')) text++;
|
|
476
|
|
477 /* GS usually follows another GS; guess lastidx+1
|
|
478 */
|
|
479 seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx+1);
|
|
480 msa->lastidx = seqidx;
|
|
481
|
|
482 if (strcmp(featurename, "WT") == 0)
|
|
483 {
|
|
484 msa->wgt[seqidx] = atof(text);
|
|
485 msa->flags |= MSA_SET_WGT;
|
|
486 }
|
|
487
|
|
488 else if (strcmp(featurename, "AC") == 0)
|
|
489 MSASetSeqAccession(msa, seqidx, text);
|
|
490
|
|
491 else if (strcmp(featurename, "DE") == 0)
|
|
492 MSASetSeqDescription(msa, seqidx, text);
|
|
493
|
|
494 else
|
|
495 MSAAddGS(msa, featurename, seqidx, text);
|
|
496
|
|
497 return 1;
|
|
498 }
|
|
499
|
|
500 /* Format of a GC line:
|
|
501 * #=GC <featurename> <text>
|
|
502 */
|
|
503 static int
|
|
504 parse_gc(MSA *msa, char *buf)
|
|
505 {
|
|
506 char *gc;
|
|
507 char *featurename;
|
|
508 char *text;
|
|
509 char *s;
|
|
510 int len;
|
|
511
|
|
512 s = buf;
|
|
513 if ((gc = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
|
|
514 if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
|
|
515 if ((text = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
|
|
516
|
|
517 if (strcmp(featurename, "SS_cons") == 0)
|
|
518 sre_strcat(&(msa->ss_cons), -1, text, len);
|
|
519 else if (strcmp(featurename, "SA_cons") == 0)
|
|
520 sre_strcat(&(msa->sa_cons), -1, text, len);
|
|
521 else if (strcmp(featurename, "RF") == 0)
|
|
522 sre_strcat(&(msa->rf), -1, text, len);
|
|
523 else
|
|
524 MSAAppendGC(msa, featurename, text);
|
|
525
|
|
526 return 1;
|
|
527 }
|
|
528
|
|
529 /* Format of a GR line:
|
|
530 * #=GR <seqname> <featurename> <text>
|
|
531 */
|
|
532 static int
|
|
533 parse_gr(MSA *msa, char *buf)
|
|
534 {
|
|
535 char *gr;
|
|
536 char *seqname;
|
|
537 char *featurename;
|
|
538 char *text;
|
|
539 int seqidx;
|
|
540 int len;
|
|
541 int j;
|
|
542 char *s;
|
|
543
|
|
544 s = buf;
|
|
545 if ((gr = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
|
|
546 if ((seqname = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
|
|
547 if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
|
|
548 if ((text = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
|
|
549
|
|
550 /* GR usually follows sequence it refers to; guess msa->lastidx */
|
|
551 seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx);
|
|
552 msa->lastidx = seqidx;
|
|
553
|
|
554 if (strcmp(featurename, "SS") == 0)
|
|
555 {
|
|
556 if (msa->ss == NULL)
|
|
557 {
|
|
558 msa->ss = MallocOrDie(sizeof(char *) * msa->nseqalloc);
|
|
559 msa->sslen = MallocOrDie(sizeof(int) * msa->nseqalloc);
|
|
560 for (j = 0; j < msa->nseqalloc; j++)
|
|
561 {
|
|
562 msa->ss[j] = NULL;
|
|
563 msa->sslen[j] = 0;
|
|
564 }
|
|
565 }
|
|
566 msa->sslen[seqidx] = sre_strcat(&(msa->ss[seqidx]), msa->sslen[seqidx], text, len);
|
|
567 }
|
|
568 else if (strcmp(featurename, "SA") == 0)
|
|
569 {
|
|
570 if (msa->sa == NULL)
|
|
571 {
|
|
572 msa->sa = MallocOrDie(sizeof(char *) * msa->nseqalloc);
|
|
573 msa->salen = MallocOrDie(sizeof(int) * msa->nseqalloc);
|
|
574 for (j = 0; j < msa->nseqalloc; j++)
|
|
575 {
|
|
576 msa->sa[j] = NULL;
|
|
577 msa->salen[j] = 0;
|
|
578 }
|
|
579 }
|
|
580 msa->salen[seqidx] = sre_strcat(&(msa->sa[seqidx]), msa->salen[seqidx], text, len);
|
|
581 }
|
|
582 else
|
|
583 MSAAppendGR(msa, featurename, seqidx, text);
|
|
584
|
|
585 return 1;
|
|
586 }
|
|
587
|
|
588
|
|
589 /* comments are simply stored verbatim, not parsed
|
|
590 */
|
|
591 static int
|
|
592 parse_comment(MSA *msa, char *buf)
|
|
593 {
|
|
594 char *s;
|
|
595 char *comment;
|
|
596
|
|
597 s = buf + 1; /* skip leading '#' */
|
|
598 if (*s == '\n') { *s = '\0'; comment = s; } /* deal with blank comment */
|
|
599 else if ((comment = sre_strtok(&s, "\n", NULL)) == NULL) return 0;
|
|
600
|
|
601 MSAAddComment(msa, comment);
|
|
602 return 1;
|
|
603 }
|
|
604
|
|
605 static int
|
|
606 parse_sequence(MSA *msa, char *buf)
|
|
607 {
|
|
608 char *s;
|
|
609 char *seqname;
|
|
610 char *text;
|
|
611 int seqidx;
|
|
612 int len;
|
|
613
|
|
614 s = buf;
|
|
615 if ((seqname = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
|
|
616 if ((text = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
|
|
617
|
|
618 /* seq usually follows another seq; guess msa->lastidx +1 */
|
|
619 seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx+1);
|
|
620 msa->lastidx = seqidx;
|
|
621
|
|
622 msa->sqlen[seqidx] = sre_strcat(&(msa->aseq[seqidx]), msa->sqlen[seqidx], text, len);
|
|
623 return 1;
|
|
624 }
|
|
625
|
|
626
|
|
627
|