Mercurial > repos > clustalomega > clustalomega
comparison clustalomega/clustal-omega-1.0.2/src/squid/sqio.c @ 1:bc707542e5de
Uploaded
author | clustalomega |
---|---|
date | Thu, 21 Jul 2011 13:35:08 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:ff1768533a07 | 1:bc707542e5de |
---|---|
1 /***************************************************************** | |
2 * SQUID - a library of functions for biological sequence analysis | |
3 * Copyright (C) 1992-2002 Washington University School of Medicine | |
4 * | |
5 * This source code is freely distributed under the terms of the | |
6 * GNU General Public License. See the files COPYRIGHT and LICENSE | |
7 * for details. | |
8 *****************************************************************/ | |
9 | |
10 /* File: sqio.c | |
11 * From: ureadseq.c in Don Gilbert's sequence i/o package | |
12 * | |
13 * Reads and writes nucleic/protein sequence in various | |
14 * formats. Data files may have multiple sequences. | |
15 * | |
16 * Heavily modified from READSEQ package | |
17 * Copyright (C) 1990 by D.G. Gilbert | |
18 * Biology Dept., Indiana University, Bloomington, IN 47405 | |
19 * email: gilbertd@bio.indiana.edu | |
20 * Thanks Don! | |
21 * | |
22 * SRE: Modifications as noted. Fri Jul 3 09:44:54 1992 | |
23 * Packaged for squid, Thu Oct 1 10:07:11 1992 | |
24 * ANSI conversion in full swing, Mon Jul 12 12:22:21 1993 | |
25 * | |
26 * CVS $Id: sqio.c,v 1.29 2002/08/26 23:10:52 eddy Exp) | |
27 * | |
28 ***************************************************************** | |
29 * Basic API for single sequence reading: | |
30 * | |
31 * SQFILE *sqfp; | |
32 * char *seqfile; | |
33 * int format; - see squid.h for formats; example: SQFILE_FASTA | |
34 * char *seq; | |
35 * SQINFO sqinfo; | |
36 * | |
37 * if ((sqfp = SeqfileOpen(seqfile, format, "BLASTDB")) == NULL) | |
38 * Die("Failed to open sequence database file %s\n%s\n", seqfile, usage); | |
39 * while (ReadSeq(sqfp, sqfp->format, &seq, &sqinfo)) { | |
40 * do_stuff; | |
41 * FreeSequence(seq, &sqinfo); | |
42 * } | |
43 * SeqfileClose(sqfp); | |
44 * | |
45 ***************************************************************** | |
46 */ | |
47 | |
48 #include <stdio.h> | |
49 #include <stdlib.h> | |
50 #include <string.h> | |
51 #include <ctype.h> | |
52 | |
53 #ifndef SEEK_SET | |
54 #include <unistd.h> | |
55 #endif | |
56 | |
57 #include "squid.h" | |
58 #include "msa.h" | |
59 #include "ssi.h" | |
60 | |
61 static void SeqfileGetLine(SQFILE *V); | |
62 | |
63 #define kStartLength 500 | |
64 | |
65 static char *aminos = "ABCDEFGHIKLMNPQRSTVWXYZ*"; | |
66 static char *primenuc = "ACGTUN"; | |
67 static char *protonly = "EFIPQZ"; | |
68 | |
69 static SQFILE *seqfile_open(char *filename, int format, char *env, int ssimode); | |
70 | |
71 /* Function: SeqfileOpen() | |
72 * | |
73 * Purpose : Open a sequence database file and prepare for reading | |
74 * sequentially. | |
75 * | |
76 * Args: filename - name of file to open | |
77 * format - format of file | |
78 * env - environment variable for path (e.g. BLASTDB) | |
79 * ssimode - -1, SSI_OFFSET_I32, or SSI_OFFSET_I64 | |
80 * | |
81 * Returns opened SQFILE ptr, or NULL on failure. | |
82 */ | |
83 SQFILE * | |
84 SeqfileOpen(char *filename, int format, char *env) | |
85 { | |
86 return seqfile_open(filename, format, env, -1); | |
87 } | |
88 SQFILE * | |
89 SeqfileOpenForIndexing(char *filename, int format, char *env, int ssimode) | |
90 { | |
91 return seqfile_open(filename, format, env, ssimode); | |
92 } | |
93 static SQFILE * | |
94 seqfile_open(char *filename, int format, char *env, int ssimode) | |
95 { | |
96 SQFILE *dbfp; | |
97 | |
98 dbfp = (SQFILE *) MallocOrDie (sizeof(SQFILE)); | |
99 | |
100 dbfp->ssimode = ssimode; | |
101 dbfp->rpl = -1; /* flag meaning "unset" */ | |
102 dbfp->lastrpl = 0; | |
103 dbfp->maxrpl = 0; | |
104 dbfp->bpl = -1; /* flag meaning "unset" */ | |
105 dbfp->lastbpl = 0; | |
106 dbfp->maxbpl = 0; | |
107 | |
108 /* Open our file handle. | |
109 * Three possibilities: | |
110 * 1. normal file open | |
111 * 2. filename = "-"; read from stdin | |
112 * 3. filename = "*.gz"; read thru pipe from gzip | |
113 * If we're reading from stdin or a pipe, we can't reliably | |
114 * back up, so we can't do two-pass parsers like the interleaved alignment | |
115 * formats. | |
116 */ | |
117 if (strcmp(filename, "-") == 0) | |
118 { | |
119 dbfp->f = stdin; | |
120 dbfp->do_stdin = TRUE; | |
121 dbfp->do_gzip = FALSE; | |
122 dbfp->fname = sre_strdup("[STDIN]", -1); | |
123 } | |
124 #ifndef SRE_STRICT_ANSI | |
125 /* popen(), pclose() aren't portable to non-POSIX systems; disable */ | |
126 else if (Strparse("^.*\\.gz$", filename, 0)) | |
127 { | |
128 char cmd[256]; | |
129 | |
130 /* Note that popen() will return "successfully" | |
131 * if file doesn't exist, because gzip works fine | |
132 * and prints an error! So we have to check for | |
133 * existence of file ourself. | |
134 */ | |
135 if (! FileExists(filename)) | |
136 Die("%s: file does not exist", filename); | |
137 | |
138 if (strlen(filename) + strlen("gzip -dc ") >= 256) | |
139 Die("filename > 255 char in SeqfileOpen()"); | |
140 sprintf(cmd, "gzip -dc %s", filename); | |
141 if ((dbfp->f = popen(cmd, "r")) == NULL) | |
142 return NULL; | |
143 | |
144 dbfp->do_stdin = FALSE; | |
145 dbfp->do_gzip = TRUE; | |
146 dbfp->fname = sre_strdup(filename, -1); | |
147 } | |
148 #endif /*SRE_STRICT_ANSI*/ | |
149 else | |
150 { | |
151 if ((dbfp->f = fopen(filename, "r")) == NULL && | |
152 (dbfp->f = EnvFileOpen(filename, env, NULL)) == NULL) | |
153 return NULL; | |
154 | |
155 dbfp->do_stdin = FALSE; | |
156 dbfp->do_gzip = FALSE; | |
157 dbfp->fname = sre_strdup(filename, -1); | |
158 } | |
159 | |
160 | |
161 /* Invoke autodetection if we haven't already been told what | |
162 * to expect. | |
163 */ | |
164 if (format == SQFILE_UNKNOWN) | |
165 { | |
166 if (dbfp->do_stdin == TRUE || dbfp->do_gzip) | |
167 Die("Can't autodetect sequence file format from a stdin or gzip pipe"); | |
168 format = SeqfileFormat(dbfp->f); | |
169 if (format == SQFILE_UNKNOWN) | |
170 Die("Can't determine format of sequence file %s", dbfp->fname); | |
171 } | |
172 | |
173 /* The hack for sequential access of an interleaved alignment file: | |
174 * read the alignment in, we'll copy sequences out one at a time. | |
175 */ | |
176 dbfp->msa = NULL; | |
177 dbfp->afp = NULL; | |
178 dbfp->format = format; | |
179 dbfp->linenumber = 0; | |
180 dbfp->buf = NULL; | |
181 dbfp->buflen = 0; | |
182 if (IsAlignmentFormat(format)) | |
183 { | |
184 /* We'll be reading from the MSA interface. Copy our data | |
185 * to the MSA afp's structure. | |
186 */ | |
187 dbfp->afp = MallocOrDie(sizeof(MSAFILE)); | |
188 dbfp->afp->f = dbfp->f; /* just a ptr, don't close */ | |
189 dbfp->afp->do_stdin = dbfp->do_stdin; | |
190 dbfp->afp->do_gzip = dbfp->do_gzip; | |
191 dbfp->afp->fname = dbfp->fname; /* just a ptr, don't free */ | |
192 dbfp->afp->format = dbfp->format; /* e.g. format */ | |
193 dbfp->afp->linenumber = dbfp->linenumber; /* e.g. 0 */ | |
194 dbfp->afp->buf = NULL; | |
195 dbfp->afp->buflen = 0; | |
196 | |
197 if ((dbfp->msa = MSAFileRead(dbfp->afp)) == NULL) | |
198 Die("Failed to read any alignment data from file %s", dbfp->fname); | |
199 /* hack: overload/reuse msa->lastidx; indicates | |
200 next seq to return upon a ReadSeq() call */ | |
201 dbfp->msa->lastidx = 0; | |
202 | |
203 return dbfp; | |
204 } | |
205 | |
206 /* Load the first line. | |
207 */ | |
208 SeqfileGetLine(dbfp); | |
209 return dbfp; | |
210 } | |
211 | |
212 /* Function: SeqfilePosition() | |
213 * | |
214 * Purpose: Move to a particular offset in a seqfile. | |
215 * Will not work on alignment files. | |
216 */ | |
217 void | |
218 SeqfilePosition(SQFILE *sqfp, SSIOFFSET *offset) | |
219 { | |
220 if (sqfp->do_stdin || sqfp->do_gzip || IsAlignmentFormat(sqfp->format)) | |
221 Die("SeqfilePosition() failed: in a nonrewindable data file or stream"); | |
222 | |
223 if (SSISetFilePosition(sqfp->f, offset) != 0) | |
224 Die("SSISetFilePosition failed, but that shouldn't happen."); | |
225 SeqfileGetLine(sqfp); | |
226 } | |
227 | |
228 | |
229 /* Function: SeqfileRewind() | |
230 * | |
231 * Purpose: Set a sequence file back to the first sequence. | |
232 * | |
233 * Won't work on alignment files. Although it would | |
234 * seem that it could (just set msa->lastidx back to 0), | |
235 * that'll fail on "multiple multiple" alignment file formats | |
236 * (e.g. Stockholm). | |
237 */ | |
238 void | |
239 SeqfileRewind(SQFILE *sqfp) | |
240 { | |
241 if (sqfp->do_stdin || sqfp->do_gzip) | |
242 Die("SeqfileRewind() failed: in a nonrewindable data file or stream"); | |
243 | |
244 rewind(sqfp->f); | |
245 SeqfileGetLine(sqfp); | |
246 } | |
247 | |
248 /* Function: SeqfileLineParameters() | |
249 * Date: SRE, Thu Feb 15 17:00:41 2001 [St. Louis] | |
250 * | |
251 * Purpose: After all the sequences have been read from the file, | |
252 * but before closing it, retrieve overall bytes-per-line and | |
253 * residues-per-line info. If non-zero, these mean that | |
254 * the file contains homogeneous sequence line lengths (except | |
255 * the last line in each record). | |
256 * | |
257 * If either of bpl or rpl is determined to be inhomogeneous, | |
258 * both are returned as 0. | |
259 * | |
260 * Args: *sqfp - an open but fully read sequence file | |
261 * ret_bpl - RETURN: bytes per line, or 0 if inhomogeneous | |
262 * ret_rpl - RETURN: residues per line, or 0 if inhomogenous. | |
263 * | |
264 * Returns: void | |
265 */ | |
266 void | |
267 SeqfileLineParameters(SQFILE *V, int *ret_bpl, int *ret_rpl) | |
268 { | |
269 if (V->rpl > 0 && V->maxrpl == V->rpl && | |
270 V->bpl > 0 && V->maxbpl == V->bpl) { | |
271 *ret_bpl = V->bpl; | |
272 *ret_rpl = V->rpl; | |
273 } else { | |
274 *ret_bpl = 0; | |
275 *ret_rpl = 0; | |
276 } | |
277 } | |
278 | |
279 | |
280 void | |
281 SeqfileClose(SQFILE *sqfp) | |
282 { | |
283 /* note: don't test for sqfp->msa being NULL. Now that | |
284 * we're holding afp open and allowing access to multi-MSA | |
285 * databases (e.g. Stockholm format, Pfam), msa ends | |
286 * up being NULL when we run out of alignments. | |
287 */ | |
288 if (sqfp->afp != NULL) { | |
289 if (sqfp->msa != NULL) MSAFree(sqfp->msa); | |
290 if (sqfp->afp->buf != NULL) free(sqfp->afp->buf); | |
291 free(sqfp->afp); | |
292 } | |
293 #ifndef SRE_STRICT_ANSI /* gunzip functionality only on POSIX systems */ | |
294 if (sqfp->do_gzip) pclose(sqfp->f); | |
295 #endif | |
296 else if (! sqfp->do_stdin) fclose(sqfp->f); | |
297 if (sqfp->buf != NULL) free(sqfp->buf); | |
298 if (sqfp->fname != NULL) free(sqfp->fname); | |
299 free(sqfp); | |
300 } | |
301 | |
302 | |
303 /* Function: SeqfileGetLine() | |
304 * Date: SRE, Tue Jun 22 09:15:49 1999 [Sanger Centre] | |
305 * | |
306 * Purpose: read a line from a sequence file into V->buf | |
307 * If the fgets() is NULL, sets V->buf[0] to '\0'. | |
308 * | |
309 * Args: V | |
310 * | |
311 * Returns: void | |
312 */ | |
313 static void | |
314 SeqfileGetLine(SQFILE *V) | |
315 { | |
316 if (V->ssimode >= 0) | |
317 if (0 != SSIGetFilePosition(V->f, V->ssimode, &(V->ssioffset))) | |
318 Die("SSIGetFilePosition() failed"); | |
319 if (sre_fgets(&(V->buf), &(V->buflen), V->f) == NULL) | |
320 *(V->buf) = '\0'; | |
321 V->linenumber++; | |
322 } | |
323 | |
324 | |
325 void | |
326 FreeSequence(char *seq, SQINFO *sqinfo) | |
327 { | |
328 if (seq != NULL) free(seq); /* FS, r244, here is potential problem in profile/profile */ | |
329 if (sqinfo->flags & SQINFO_SS){ | |
330 if (NULL != sqinfo->ss){ /* FS, r244 -> r245 */ | |
331 free(sqinfo->ss); | |
332 } | |
333 } | |
334 if (sqinfo->flags & SQINFO_SA){ | |
335 if (NULL != sqinfo->sa){ /* FS, r244 -> r245 */ | |
336 free(sqinfo->sa); | |
337 } | |
338 } | |
339 } | |
340 | |
341 int | |
342 SetSeqinfoString(SQINFO *sqinfo, char *sptr, int flag) | |
343 { | |
344 int len; | |
345 int pos; | |
346 | |
347 /* silently ignore NULL. */ | |
348 if (sptr == NULL) return 1; | |
349 | |
350 while (*sptr == ' ') sptr++; /* ignore leading whitespace */ | |
351 for (pos = strlen(sptr)-1; pos >= 0; pos--) | |
352 if (! isspace((int) sptr[pos])) break; | |
353 sptr[pos+1] = '\0'; /* ignore trailing whitespace */ | |
354 | |
355 switch (flag) { | |
356 case SQINFO_NAME: | |
357 if (*sptr != '-') | |
358 { | |
359 strncpy(sqinfo->name, sptr, SQINFO_NAMELEN-1); | |
360 sqinfo->name[SQINFO_NAMELEN-1] = '\0'; | |
361 sqinfo->flags |= SQINFO_NAME; | |
362 } | |
363 break; | |
364 | |
365 case SQINFO_ID: | |
366 if (*sptr != '-') | |
367 { | |
368 strncpy(sqinfo->id, sptr, SQINFO_NAMELEN-1); | |
369 sqinfo->id[SQINFO_NAMELEN-1] = '\0'; | |
370 sqinfo->flags |= SQINFO_ID; | |
371 } | |
372 break; | |
373 | |
374 case SQINFO_ACC: | |
375 if (*sptr != '-') | |
376 { | |
377 strncpy(sqinfo->acc, sptr, SQINFO_NAMELEN-1); | |
378 sqinfo->acc[SQINFO_NAMELEN-1] = '\0'; | |
379 sqinfo->flags |= SQINFO_ACC; | |
380 } | |
381 break; | |
382 | |
383 case SQINFO_DESC: | |
384 if (*sptr != '-') | |
385 { | |
386 if (sqinfo->flags & SQINFO_DESC) /* append? */ | |
387 { | |
388 len = strlen(sqinfo->desc); | |
389 if (len < SQINFO_DESCLEN-2) /* is there room? */ | |
390 { | |
391 strncat(sqinfo->desc, " ", SQINFO_DESCLEN-1-len); len++; | |
392 strncat(sqinfo->desc, sptr, SQINFO_DESCLEN-1-len); | |
393 } | |
394 } | |
395 else /* else copy */ | |
396 strncpy(sqinfo->desc, sptr, SQINFO_DESCLEN-1); | |
397 sqinfo->desc[SQINFO_DESCLEN-1] = '\0'; | |
398 sqinfo->flags |= SQINFO_DESC; | |
399 } | |
400 break; | |
401 | |
402 case SQINFO_START: | |
403 if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; } | |
404 sqinfo->start = atoi(sptr); | |
405 if (sqinfo->start != 0) sqinfo->flags |= SQINFO_START; | |
406 break; | |
407 | |
408 case SQINFO_STOP: | |
409 if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; } | |
410 sqinfo->stop = atoi(sptr); | |
411 if (sqinfo->stop != 0) sqinfo->flags |= SQINFO_STOP; | |
412 break; | |
413 | |
414 case SQINFO_OLEN: | |
415 if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; } | |
416 sqinfo->olen = atoi(sptr); | |
417 if (sqinfo->olen != 0) sqinfo->flags |= SQINFO_OLEN; | |
418 break; | |
419 | |
420 default: | |
421 Die("Invalid flag %d to SetSeqinfoString()", flag); | |
422 } | |
423 return 1; | |
424 } | |
425 | |
426 void | |
427 SeqinfoCopy(SQINFO *sq1, SQINFO *sq2) | |
428 { | |
429 sq1->flags = sq2->flags; | |
430 if (sq2->flags & SQINFO_NAME) strcpy(sq1->name, sq2->name); | |
431 if (sq2->flags & SQINFO_ID) strcpy(sq1->id, sq2->id); | |
432 if (sq2->flags & SQINFO_ACC) strcpy(sq1->acc, sq2->acc); | |
433 if (sq2->flags & SQINFO_DESC) strcpy(sq1->desc, sq2->desc); | |
434 if (sq2->flags & SQINFO_LEN) sq1->len = sq2->len; | |
435 if (sq2->flags & SQINFO_START) sq1->start = sq2->start; | |
436 if (sq2->flags & SQINFO_STOP) sq1->stop = sq2->stop; | |
437 if (sq2->flags & SQINFO_OLEN) sq1->olen = sq2->olen; | |
438 if (sq2->flags & SQINFO_TYPE) sq1->type = sq2->type; | |
439 if (sq2->flags & SQINFO_SS) sq1->ss = Strdup(sq2->ss); | |
440 if (sq2->flags & SQINFO_SA) sq1->sa = Strdup(sq2->sa); | |
441 } | |
442 | |
443 /* Function: ToDNA() | |
444 * | |
445 * Purpose: Convert a sequence to DNA. | |
446 * U --> T | |
447 */ | |
448 void | |
449 ToDNA(char *seq) | |
450 { | |
451 for (; *seq != '\0'; seq++) | |
452 { | |
453 if (*seq == 'U') *seq = 'T'; | |
454 else if (*seq == 'u') *seq = 't'; | |
455 } | |
456 } | |
457 | |
458 /* Function: ToRNA() | |
459 * | |
460 * Purpose: Convert a sequence to RNA. | |
461 * T --> U | |
462 */ | |
463 void | |
464 ToRNA(char *seq) | |
465 { | |
466 for (; *seq != '\0'; seq++) | |
467 { | |
468 if (*seq == 'T') *seq = 'U'; | |
469 else if (*seq == 't') *seq = 'u'; | |
470 } | |
471 } | |
472 | |
473 | |
474 /* Function: ToIUPAC() | |
475 * | |
476 * Purpose: Convert X's, o's, other junk in a nucleic acid sequence to N's, | |
477 * to comply with IUPAC code. If is_aseq is TRUE, will allow gap | |
478 * characters though, so we can call ToIUPAC() on aligned seqs. | |
479 * | |
480 * NUCLEOTIDES is defined in squid.h as: | |
481 * "ACGTUNRYMKSWHBVDacgtunrymkswhbvd" | |
482 * gap chars allowed by isgap() are defined in squid.h as: | |
483 * " ._-~" | |
484 * | |
485 * WU-BLAST's pressdb will | |
486 * choke on X's, for instance, necessitating conversion | |
487 * of certain genome centers' data. | |
488 */ | |
489 void | |
490 ToIUPAC(char *seq, int is_aseq) | |
491 { | |
492 if (is_aseq) { | |
493 for (; *seq != '\0'; seq++) | |
494 if (strchr(NUCLEOTIDES, *seq) == NULL && ! isgap(*seq)) *seq = 'N'; | |
495 } else { | |
496 for (; *seq != '\0'; seq++) | |
497 if (strchr(NUCLEOTIDES, *seq) == NULL) *seq = 'N'; | |
498 } | |
499 } | |
500 | |
501 | |
502 /* Function: addseq() | |
503 * | |
504 * Purpose: Add a line of sequence to the growing string in V. | |
505 * | |
506 * In the seven supported unaligned formats, all sequence | |
507 * lines may contain whitespace that must be filtered out; | |
508 * four formats (PIR, EMBL, Genbank, GCG) include coordinates | |
509 * that must be filtered out. Thus an (!isdigit && !isspace) | |
510 * test on each character before we accept it. | |
511 */ | |
512 static void | |
513 addseq(char *s, struct ReadSeqVars *V) | |
514 { | |
515 char *s0; | |
516 char *sq; | |
517 int rpl; /* valid residues per line */ | |
518 int bpl; /* characters per line */ | |
519 | |
520 if (V->ssimode == -1) | |
521 { /* Normal mode: keeping the seq */ | |
522 /* Make sure we have enough room. We know that s is <= buflen, | |
523 * so just make sure we've got room for a whole new buflen worth | |
524 * of sequence. | |
525 */ | |
526 if (V->seqlen + V->buflen > V->maxseq) { | |
527 V->maxseq += MAX(V->buflen, kStartLength); | |
528 V->seq = ReallocOrDie (V->seq, V->maxseq+1); | |
529 } | |
530 | |
531 sq = V->seq + V->seqlen; | |
532 while (*s != 0) { | |
533 #ifdef CLUSTALO | |
534 if (! isdigit((int) *s) && ! isspace((int) *s) && isprint((int) *s)) { | |
535 #else | |
536 if (! isdigit((int) *s) && ! isspace((int) *s)) { | |
537 #endif | |
538 *sq = *s; | |
539 sq++; | |
540 } | |
541 s++; | |
542 } | |
543 V->seqlen = sq - V->seq; | |
544 } | |
545 else /* else: indexing mode, discard the seq */ | |
546 { | |
547 s0 = s; | |
548 rpl = 0; | |
549 while (*s != 0) { | |
550 if (! isdigit((int) *s) && ! isspace((int) *s)) { | |
551 rpl++; | |
552 } | |
553 s++; | |
554 } | |
555 V->seqlen += rpl; | |
556 bpl = s - s0; | |
557 | |
558 /* Keep track of the global rpl, bpl for the file. | |
559 * This is overly complicated because we have to | |
560 * allow the last line of each record (e.g. the last addseq() call | |
561 * on each sequence) to have a different length - and sometimes | |
562 * we'll have one-line sequence records, too. Thus we only | |
563 * do something with the global V->rpl when we have *passed over* | |
564 * a line - we keep the last line's rpl in last_rpl. And because | |
565 * a file might consist entirely of single-line records, we keep | |
566 * a third guy, maxrpl, that tells us the maximum rpl of any line | |
567 * in the file. If we reach the end of file and rpl is still unset, | |
568 * we'll set it to maxrpl. If we reach eof and rpl is set, but is | |
569 * less than maxrpl, that's a weird case where a last line in some | |
570 * record is longer than every other line. | |
571 */ | |
572 if (V->rpl != 0) { /* 0 means we already know rpl is invalid */ | |
573 if (V->lastrpl > 0) { /* we're on something that's not the first line */ | |
574 if (V->rpl > 0 && V->lastrpl != V->rpl) V->rpl = 0; | |
575 else if (V->rpl == -1) V->rpl = V->lastrpl; | |
576 } | |
577 V->lastrpl = rpl; | |
578 if (rpl > V->maxrpl) V->maxrpl = rpl; /* make sure we check max length of final lines */ | |
579 } | |
580 if (V->bpl != 0) { /* 0 means we already know bpl is invalid */ | |
581 if (V->lastbpl > 0) { /* we're on something that's not the first line */ | |
582 if (V->bpl > 0 && V->lastbpl != V->bpl) V->bpl = 0; | |
583 else if (V->bpl == -1) V->bpl = V->lastbpl; | |
584 } | |
585 V->lastbpl = bpl; | |
586 if (bpl > V->maxbpl) V->maxbpl = bpl; /* make sure we check max length of final lines */ | |
587 } | |
588 } /* end of indexing mode of addseq(). */ | |
589 | |
590 } | |
591 | |
592 static void | |
593 readLoop(int addfirst, int (*endTest)(char *,int *), struct ReadSeqVars *V) | |
594 { | |
595 int addend = 0; | |
596 int done = 0; | |
597 | |
598 V->seqlen = 0; | |
599 V->lastrpl = V->lastbpl = 0; | |
600 if (addfirst) { | |
601 if (V->ssimode >= 0) V->d_off = V->ssioffset; | |
602 addseq(V->buf, V); | |
603 } else if (V->ssimode >= 0) | |
604 if (0 != SSIGetFilePosition(V->f, V->ssimode, &(V->d_off))) | |
605 Die("SSIGetFilePosition() failed"); | |
606 | |
607 do { | |
608 SeqfileGetLine(V); | |
609 /* feof() alone is a bug; files not necessarily \n terminated */ | |
610 if (*(V->buf) == '\0' && feof(V->f)) | |
611 done = TRUE; | |
612 done |= (*endTest)(V->buf, &addend); | |
613 if (addend || !done) | |
614 addseq(V->buf, V); | |
615 } while (!done); | |
616 } | |
617 | |
618 | |
619 static int | |
620 endPIR(char *s, int *addend) | |
621 { | |
622 *addend = 0; | |
623 if ((strncmp(s, "///", 3) == 0) || | |
624 (strncmp(s, "ENTRY", 5) == 0)) | |
625 return 1; | |
626 else | |
627 return 0; | |
628 } | |
629 | |
630 static void | |
631 readPIR(struct ReadSeqVars *V) | |
632 { | |
633 char *sptr; | |
634 /* load first line of entry */ | |
635 while (!feof(V->f) && strncmp(V->buf, "ENTRY", 5) != 0) { | |
636 SeqfileGetLine(V); | |
637 } | |
638 if (feof(V->f)) return; | |
639 if (V->ssimode >= 0) V->r_off = V->ssioffset; | |
640 | |
641 if ((sptr = strtok(V->buf + 15, "\n\t ")) != NULL) | |
642 { | |
643 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); | |
644 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); | |
645 } | |
646 do { | |
647 SeqfileGetLine(V); | |
648 if (!feof(V->f) && strncmp(V->buf, "TITLE", 5) == 0) | |
649 SetSeqinfoString(V->sqinfo, V->buf+15, SQINFO_DESC); | |
650 else if (!feof(V->f) && strncmp(V->buf, "ACCESSION", 9) == 0) | |
651 { | |
652 if ((sptr = strtok(V->buf+15, " \t\n")) != NULL) | |
653 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); | |
654 } | |
655 } while (! feof(V->f) && (strncmp(V->buf,"SEQUENCE", 8) != 0)); | |
656 SeqfileGetLine(V); /* skip next line, coords */ | |
657 | |
658 readLoop(0, endPIR, V); | |
659 | |
660 /* reading a real PIR-CODATA database file, we keep the source coords | |
661 */ | |
662 V->sqinfo->start = 1; | |
663 V->sqinfo->stop = V->seqlen; | |
664 V->sqinfo->olen = V->seqlen; | |
665 V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN; | |
666 | |
667 /* get next line | |
668 */ | |
669 while (!feof(V->f) && strncmp(V->buf, "ENTRY", 5) != 0) { | |
670 SeqfileGetLine(V); | |
671 } | |
672 } | |
673 | |
674 | |
675 | |
676 static int | |
677 endIG(char *s, int *addend) | |
678 { | |
679 *addend = 1; /* 1 or 2 occur in line w/ bases */ | |
680 return((strchr(s,'1')!=NULL) || (strchr(s,'2')!=NULL)); | |
681 } | |
682 | |
683 static void | |
684 readIG(struct ReadSeqVars *V) | |
685 { | |
686 char *nm; | |
687 /* position past ';' comments */ | |
688 do { | |
689 SeqfileGetLine(V); | |
690 } while (! (feof(V->f) || ((*V->buf != 0) && (*V->buf != ';')) )); | |
691 | |
692 if (!feof(V->f)) | |
693 { | |
694 if ((nm = strtok(V->buf, "\n\t ")) != NULL) | |
695 SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME); | |
696 | |
697 readLoop(0, endIG, V); | |
698 } | |
699 | |
700 while (!(feof(V->f) || ((*V->buf != '\0') && (*V->buf == ';')))) | |
701 SeqfileGetLine(V); | |
702 } | |
703 | |
704 static int | |
705 endStrider(char *s, int *addend) | |
706 { | |
707 *addend = 0; | |
708 return (strstr( s, "//") != NULL); | |
709 } | |
710 | |
711 static void | |
712 readStrider(struct ReadSeqVars *V) | |
713 { | |
714 char *nm; | |
715 | |
716 while ((!feof(V->f)) && (*V->buf == ';')) | |
717 { | |
718 if (strncmp(V->buf,"; DNA sequence", 14) == 0) | |
719 { | |
720 if ((nm = strtok(V->buf+16, ",\n\t ")) != NULL) | |
721 SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME); | |
722 } | |
723 SeqfileGetLine(V); | |
724 } | |
725 | |
726 if (! feof(V->f)) | |
727 readLoop(1, endStrider, V); | |
728 | |
729 /* load next line | |
730 */ | |
731 while ((!feof(V->f)) && (*V->buf != ';')) | |
732 SeqfileGetLine(V); | |
733 } | |
734 | |
735 | |
736 static int | |
737 endGB(char *s, int *addend) | |
738 { | |
739 *addend = 0; | |
740 return ((strstr(s,"//") != NULL) || (strstr(s,"LOCUS") == s)); | |
741 } | |
742 | |
743 static void | |
744 readGenBank(struct ReadSeqVars *V) | |
745 { | |
746 char *sptr; | |
747 int in_definition; | |
748 | |
749 /* We'll map three genbank identifiers onto names: | |
750 * LOCUS -> sqinfo.name | |
751 * ACCESSION -> sqinfo.acc [primary accession only] | |
752 * VERSION -> sqinfo.id | |
753 * We don't currently store the GI number, or secondary accessions. | |
754 */ | |
755 while (strncmp(V->buf, "LOCUS", 5) != 0) { | |
756 SeqfileGetLine(V); | |
757 } | |
758 if (V->ssimode >= 0) V->r_off = V->ssioffset; | |
759 | |
760 if ((sptr = strtok(V->buf+12, "\n\t ")) != NULL) | |
761 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); | |
762 | |
763 in_definition = FALSE; | |
764 while (! feof(V->f)) | |
765 { | |
766 SeqfileGetLine(V); | |
767 if (! feof(V->f) && strstr(V->buf, "DEFINITION") == V->buf) | |
768 { | |
769 if ((sptr = strtok(V->buf+12, "\n")) != NULL) | |
770 SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); | |
771 in_definition = TRUE; | |
772 } | |
773 else if (! feof(V->f) && strstr(V->buf, "ACCESSION") == V->buf) | |
774 { | |
775 if ((sptr = strtok(V->buf+12, "\n\t ")) != NULL) | |
776 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); | |
777 in_definition = FALSE; | |
778 } | |
779 else if (! feof(V->f) && strstr(V->buf, "VERSION") == V->buf) | |
780 { | |
781 if ((sptr = strtok(V->buf+12, "\n\t ")) != NULL) | |
782 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); | |
783 in_definition = FALSE; | |
784 } | |
785 else if (strncmp(V->buf,"ORIGIN", 6) != 0) | |
786 { | |
787 if (in_definition) | |
788 SetSeqinfoString(V->sqinfo, V->buf, SQINFO_DESC); | |
789 } | |
790 else | |
791 break; | |
792 } | |
793 | |
794 readLoop(0, endGB, V); | |
795 | |
796 /* reading a real GenBank database file, we keep the source coords | |
797 */ | |
798 V->sqinfo->start = 1; | |
799 V->sqinfo->stop = V->seqlen; | |
800 V->sqinfo->olen = V->seqlen; | |
801 V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN; | |
802 | |
803 | |
804 while (!(feof(V->f) || ((*V->buf!=0) && (strstr(V->buf,"LOCUS") == V->buf)))) | |
805 SeqfileGetLine(V); | |
806 /* SRE: V->s now holds "//", so sequential | |
807 reads are wedged: fixed Tue Jul 13 1993 */ | |
808 while (!feof(V->f) && strstr(V->buf, "LOCUS ") != V->buf) | |
809 SeqfileGetLine(V); | |
810 } | |
811 | |
812 static int | |
813 endGCGdata(char *s, int *addend) | |
814 { | |
815 *addend = 0; | |
816 return (*s == '>'); | |
817 } | |
818 | |
819 static void | |
820 readGCGdata(struct ReadSeqVars *V) | |
821 { | |
822 int binary = FALSE; /* whether data are binary or not */ | |
823 int blen = 0; /* length of binary sequence */ | |
824 | |
825 /* first line contains ">>>>" followed by name */ | |
826 if (Strparse(">>>>([^ ]+) .+2BIT +Len: ([0-9]+)", V->buf, 2)) | |
827 { | |
828 binary = TRUE; | |
829 SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME); | |
830 blen = atoi(sqd_parse[2]); | |
831 } | |
832 else if (Strparse(">>>>([^ ]+) .+ASCII +Len: [0-9]+", V->buf, 1)) | |
833 SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME); | |
834 else | |
835 Die("bogus GCGdata format? %s", V->buf); | |
836 | |
837 /* second line contains free text description */ | |
838 SeqfileGetLine(V); | |
839 SetSeqinfoString(V->sqinfo, V->buf, SQINFO_DESC); | |
840 | |
841 if (binary) { | |
842 /* allocate for blen characters +3... (allow for 3 bytes of slop) */ | |
843 if (blen >= V->maxseq) { | |
844 V->maxseq = blen; | |
845 if ((V->seq = (char *) realloc (V->seq, sizeof(char)*(V->maxseq+4)))==NULL) | |
846 Die("malloc failed"); | |
847 } | |
848 /* read (blen+3)/4 bytes from file */ | |
849 if (fread(V->seq, sizeof(char), (blen+3)/4, V->f) < (size_t) ((blen+3)/4)) | |
850 Die("fread failed"); | |
851 V->seqlen = blen; | |
852 /* convert binary code to seq */ | |
853 GCGBinaryToSequence(V->seq, blen); | |
854 } | |
855 else readLoop(0, endGCGdata, V); | |
856 | |
857 while (!(feof(V->f) || ((*V->buf != 0) && (*V->buf == '>')))) | |
858 SeqfileGetLine(V); | |
859 } | |
860 | |
861 static int | |
862 endPearson(char *s, int *addend) | |
863 { | |
864 *addend = 0; | |
865 return(*s == '>'); | |
866 } | |
867 | |
868 static void | |
869 readPearson(struct ReadSeqVars *V) | |
870 { | |
871 char *sptr; | |
872 | |
873 if (V->ssimode >= 0) V->r_off = V->ssioffset; | |
874 | |
875 if (*V->buf != '>') | |
876 Die("\ | |
877 File %s does not appear to be in FASTA format at line %d.\n\ | |
878 You may want to specify the file format on the command line.\n\ | |
879 Usually this is done with an option --informat <fmt>.\n", | |
880 V->fname, V->linenumber); | |
881 | |
882 if ((sptr = strtok(V->buf+1, "\n\t ")) != NULL) | |
883 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); | |
884 if ((sptr = strtok(NULL, "\n")) != NULL) | |
885 SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); | |
886 | |
887 readLoop(0, endPearson, V); | |
888 | |
889 while (!(feof(V->f) || ((*V->buf != 0) && (*V->buf == '>')))) { | |
890 SeqfileGetLine(V); | |
891 } | |
892 } | |
893 | |
894 | |
895 static int | |
896 endEMBL(char *s, int *addend) | |
897 { | |
898 *addend = 0; | |
899 /* Some people (Berlin 5S rRNA database, f'r instance) use | |
900 * an extended EMBL format that attaches extra data after | |
901 * the sequence -- watch out for that. We use the fact that | |
902 * real EMBL sequence lines begin with five spaces. | |
903 * | |
904 * We can use this as the sole end test because readEMBL() will | |
905 * advance to the next ID line before starting to read again. | |
906 */ | |
907 return (strncmp(s," ",5) != 0); | |
908 /* return ((strstr(s,"//") != NULL) || (strstr(s,"ID ") == s)); */ | |
909 } | |
910 | |
911 static void | |
912 readEMBL(struct ReadSeqVars *V) | |
913 { | |
914 char *sptr; | |
915 | |
916 /* make sure we have first line */ | |
917 while (!feof(V->f) && strncmp(V->buf, "ID ", 4) != 0) { | |
918 SeqfileGetLine(V); | |
919 } | |
920 if (V->ssimode >= 0) V->r_off = V->ssioffset; | |
921 | |
922 if ((sptr = strtok(V->buf+5, "\n\t ")) != NULL) | |
923 { | |
924 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); | |
925 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); | |
926 } | |
927 | |
928 do { | |
929 SeqfileGetLine(V); | |
930 if (!feof(V->f) && strstr(V->buf, "AC ") == V->buf) | |
931 { | |
932 if ((sptr = strtok(V->buf+5, "; \t\n")) != NULL) | |
933 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); | |
934 } | |
935 else if (!feof(V->f) && strstr(V->buf, "DE ") == V->buf) | |
936 { | |
937 if ((sptr = strtok(V->buf+5, "\n")) != NULL) | |
938 SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); | |
939 } | |
940 } while (! feof(V->f) && strncmp(V->buf,"SQ",2) != 0); | |
941 | |
942 readLoop(0, endEMBL, V); | |
943 | |
944 /* Hack for Staden experiment files: convert - to N | |
945 */ | |
946 if (V->ssimode == -1) /* if we're in ssi mode, we're not keeping the seq */ | |
947 for (sptr = V->seq; *sptr != '\0'; sptr++) | |
948 if (*sptr == '-') *sptr = 'N'; | |
949 | |
950 /* reading a real EMBL database file, we keep the source coords | |
951 */ | |
952 V->sqinfo->start = 1; | |
953 V->sqinfo->stop = V->seqlen; | |
954 V->sqinfo->olen = V->seqlen; | |
955 V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN; | |
956 | |
957 /* load next record's ID line */ | |
958 while (!feof(V->f) && strncmp(V->buf, "ID ", 4) != 0) { | |
959 SeqfileGetLine(V); | |
960 } | |
961 | |
962 } | |
963 | |
964 | |
965 static int | |
966 endZuker(char *s, int *addend) | |
967 { | |
968 *addend = 0; | |
969 return( *s == '(' ); | |
970 } | |
971 | |
972 static void | |
973 readZuker(struct ReadSeqVars *V) | |
974 { | |
975 char *sptr; | |
976 | |
977 SeqfileGetLine(V); /*s == "seqLen seqid string..."*/ | |
978 | |
979 if ((sptr = strtok(V->buf+6, " \t\n")) != NULL) | |
980 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); | |
981 | |
982 if ((sptr = strtok(NULL, "\n")) != NULL) | |
983 SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); | |
984 | |
985 readLoop(0, endZuker, V); | |
986 | |
987 while (!(feof(V->f) | ((*V->buf != '\0') & (*V->buf == '(')))) | |
988 SeqfileGetLine(V); | |
989 } | |
990 | |
991 static void | |
992 readUWGCG(struct ReadSeqVars *V) | |
993 { | |
994 char *si; | |
995 char *sptr; | |
996 int done; | |
997 | |
998 V->seqlen = 0; | |
999 | |
1000 /*writeseq: " %s Length: %d (today) Check: %d ..\n" */ | |
1001 /*drop above or ".." from id*/ | |
1002 if ((si = strstr(V->buf," Length: ")) != NULL) *si = 0; | |
1003 else if ((si = strstr(V->buf,"..")) != NULL) *si = 0; | |
1004 | |
1005 if ((sptr = strtok(V->buf, "\n\t ")) != NULL) | |
1006 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); | |
1007 | |
1008 do { | |
1009 done = feof(V->f); | |
1010 SeqfileGetLine(V); | |
1011 if (! done) addseq(V->buf, V); | |
1012 } while (!done); | |
1013 } | |
1014 | |
1015 | |
1016 /* Function: ReadSeq() | |
1017 * | |
1018 * Purpose: Read next sequence from an open database file. | |
1019 * Return the sequence and associated info. | |
1020 * | |
1021 * Args: fp - open sequence database file pointer | |
1022 * format - format of the file (previously determined | |
1023 * by call to SeqfileFormat()). | |
1024 * Currently unused, since we carry it in V. | |
1025 * ret_seq - RETURN: sequence | |
1026 * sqinfo - RETURN: filled in w/ other information | |
1027 * | |
1028 * Limitations: uses squid_errno, so it's not threadsafe. | |
1029 * | |
1030 * Return: 1 on success, 0 on failure. | |
1031 * ret_seq and some field of sqinfo are allocated here, | |
1032 * The preferred call mechanism to properly free the memory is: | |
1033 * | |
1034 * SQINFO sqinfo; | |
1035 * char *seq; | |
1036 * | |
1037 * ReadSeq(fp, format, &seq, &sqinfo); | |
1038 * ... do something... | |
1039 * FreeSequence(seq, &sqinfo); | |
1040 */ | |
1041 int | |
1042 ReadSeq(SQFILE *V, int format, char **ret_seq, SQINFO *sqinfo) | |
1043 { | |
1044 int gotuw; | |
1045 | |
1046 squid_errno = SQERR_OK; | |
1047 | |
1048 /* Here's the hack for sequential access of sequences from | |
1049 * the multiple sequence alignment formats | |
1050 */ | |
1051 if (IsAlignmentFormat(V->format)) | |
1052 { | |
1053 if (V->msa->lastidx >= V->msa->nseq) | |
1054 { /* out of data. try to read another alignment */ | |
1055 MSAFree(V->msa); | |
1056 if ((V->msa = MSAFileRead(V->afp)) == NULL) | |
1057 return 0; | |
1058 V->msa->lastidx = 0; | |
1059 } | |
1060 /* copy and dealign the appropriate aligned seq */ | |
1061 /* AW: stopping squid from dealigning sequences and corresponding info */ | |
1062 #ifdef CLUSTALO | |
1063 V->seq = sre_strdup(V->msa->aseq[V->msa->lastidx], V->msa->alen); | |
1064 #else | |
1065 MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen, | |
1066 V->msa->aseq[V->msa->lastidx], &(V->seq)); | |
1067 #endif | |
1068 V->seqlen = strlen(V->seq); | |
1069 | |
1070 /* Extract sqinfo stuff for this sequence from the msa. | |
1071 * Tedious; code that should be cleaned. | |
1072 */ | |
1073 sqinfo->flags = 0; | |
1074 if (V->msa->sqname[V->msa->lastidx] != NULL) | |
1075 SetSeqinfoString(sqinfo, V->msa->sqname[V->msa->lastidx], SQINFO_NAME); | |
1076 if (V->msa->sqacc != NULL && V->msa->sqacc[V->msa->lastidx] != NULL) | |
1077 SetSeqinfoString(sqinfo, V->msa->sqacc[V->msa->lastidx], SQINFO_ACC); | |
1078 if (V->msa->sqdesc != NULL && V->msa->sqdesc[V->msa->lastidx] != NULL) | |
1079 SetSeqinfoString(sqinfo, V->msa->sqdesc[V->msa->lastidx], SQINFO_DESC); | |
1080 if (V->msa->ss != NULL && V->msa->ss[V->msa->lastidx] != NULL) { | |
1081 /* AW: stopping squid from dealigning sequences and corresponding info */ | |
1082 #ifdef CLUSTALO | |
1083 sqinfo->ss = sre_strdup(V->msa->ss[V->msa->lastidx], V->msa->alen); | |
1084 #else | |
1085 MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen, | |
1086 V->msa->ss[V->msa->lastidx], &(sqinfo->ss)); | |
1087 #endif | |
1088 sqinfo->flags |= SQINFO_SS; | |
1089 } | |
1090 if (V->msa->sa != NULL && V->msa->sa[V->msa->lastidx] != NULL) { | |
1091 /* AW: stopping squid from dealigning sequences and corresponding info */ | |
1092 #ifdef CLUSTALO | |
1093 sqinfo->sa = sre_strdup(V->msa->sa[V->msa->lastidx], V->msa->alen); | |
1094 #else | |
1095 MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen, | |
1096 V->msa->sa[V->msa->lastidx], &(sqinfo->sa)); | |
1097 #endif | |
1098 sqinfo->flags |= SQINFO_SA; | |
1099 } | |
1100 V->msa->lastidx++; | |
1101 } | |
1102 else { | |
1103 if (feof(V->f)) return 0; | |
1104 | |
1105 if (V->ssimode == -1) { /* normal mode */ | |
1106 V->seq = (char*) calloc (kStartLength+1, sizeof(char)); | |
1107 V->maxseq = kStartLength; | |
1108 } else { /* index mode: discarding seq */ | |
1109 V->seq = NULL; | |
1110 V->maxseq = 0; | |
1111 } | |
1112 V->seqlen = 0; | |
1113 V->sqinfo = sqinfo; | |
1114 V->sqinfo->flags = 0; | |
1115 | |
1116 switch (V->format) { | |
1117 case SQFILE_IG : readIG(V); break; | |
1118 case SQFILE_STRIDER : readStrider(V); break; | |
1119 case SQFILE_GENBANK : readGenBank(V); break; | |
1120 case SQFILE_FASTA : readPearson(V); break; | |
1121 case SQFILE_EMBL : readEMBL(V); break; | |
1122 case SQFILE_ZUKER : readZuker(V); break; | |
1123 case SQFILE_PIR : readPIR(V); break; | |
1124 case SQFILE_GCGDATA : readGCGdata(V); break; | |
1125 | |
1126 case SQFILE_GCG : | |
1127 do { /* skip leading comments on GCG file */ | |
1128 gotuw = (strstr(V->buf,"..") != NULL); | |
1129 if (gotuw) readUWGCG(V); | |
1130 SeqfileGetLine(V); | |
1131 } while (! feof(V->f)); | |
1132 break; | |
1133 | |
1134 case SQFILE_IDRAW: /* SRE: no attempt to read idraw postscript */ | |
1135 default: | |
1136 squid_errno = SQERR_FORMAT; | |
1137 free(V->seq); | |
1138 return 0; | |
1139 } | |
1140 if (V->seq != NULL) /* (it can be NULL in indexing mode) */ | |
1141 V->seq[V->seqlen] = 0; /* stick a string terminator on it */ | |
1142 } | |
1143 | |
1144 /* Cleanup | |
1145 */ | |
1146 sqinfo->len = V->seqlen; | |
1147 sqinfo->flags |= SQINFO_LEN; | |
1148 *ret_seq = V->seq; | |
1149 if (squid_errno == SQERR_OK) return 1; else return 0; | |
1150 } | |
1151 | |
1152 /* Function: SeqfileFormat() | |
1153 * Date: SRE, Tue Jun 22 10:58:58 1999 [Sanger Centre] | |
1154 * | |
1155 * Purpose: Determine format of an open file. | |
1156 * Returns format code. | |
1157 * Rewinds the file. | |
1158 * | |
1159 * Autodetects the following unaligned formats: | |
1160 * SQFILE_FASTA | |
1161 * SQFILE_GENBANK | |
1162 * SQFILE_EMBL | |
1163 * SQFILE_GCG | |
1164 * SQFILE_GCGDATA | |
1165 * SQFILE_PIR | |
1166 * Also autodetects the following alignment formats: | |
1167 * MSAFILE_STOCKHOLM | |
1168 * MSAFILE_MSF | |
1169 * MSAFILE_CLUSTAL | |
1170 * MSAFILE_SELEX | |
1171 * MSAFILE_PHYLIP | |
1172 * | |
1173 * Can't autodetect MSAFILE_A2M, calls it SQFILE_FASTA. | |
1174 * MSAFileFormat() does the opposite. | |
1175 * | |
1176 * Args: sfp - open SQFILE | |
1177 * | |
1178 * Return: format code, or SQFILE_UNKNOWN if unrecognized | |
1179 */ | |
1180 int | |
1181 SeqfileFormat(FILE *fp) | |
1182 { | |
1183 char *buf; | |
1184 int len; | |
1185 int fmt = SQFILE_UNKNOWN; | |
1186 int ndataline; | |
1187 char *bufcpy, *s, *s1, *s2; | |
1188 int has_junk; | |
1189 | |
1190 buf = NULL; | |
1191 len = 0; | |
1192 ndataline = 0; | |
1193 has_junk = FALSE; | |
1194 while (sre_fgets(&buf, &len, fp) != NULL) | |
1195 { | |
1196 if (IsBlankline(buf)) continue; | |
1197 | |
1198 /* Well-behaved formats identify themselves in first nonblank line. | |
1199 */ | |
1200 if (ndataline == 0) | |
1201 { | |
1202 if (strncmp(buf, ">>>>", 4) == 0 && strstr(buf, "Len: ")) | |
1203 { fmt = SQFILE_GCGDATA; goto DONE; } | |
1204 | |
1205 if (buf[0] == '>') | |
1206 { fmt = SQFILE_FASTA; goto DONE; } | |
1207 | |
1208 if (strncmp(buf, "!!AA_SEQUENCE", 13) == 0 || | |
1209 strncmp(buf, "!!NA_SEQUENCE", 13) == 0) | |
1210 { fmt = SQFILE_GCG; goto DONE; } | |
1211 | |
1212 if (strncmp(buf, "# STOCKHOLM 1.", 14) == 0) | |
1213 { fmt = MSAFILE_STOCKHOLM; goto DONE; } | |
1214 | |
1215 if (strncmp(buf, "CLUSTAL", 7) == 0 && | |
1216 strstr(buf, "multiple sequence alignment") != NULL) | |
1217 { fmt = MSAFILE_CLUSTAL; goto DONE; } | |
1218 | |
1219 if (strncmp(buf, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0 || | |
1220 strncmp(buf, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0) | |
1221 { fmt = MSAFILE_MSF; goto DONE; } | |
1222 | |
1223 /* PHYLIP id: also just a good bet */ | |
1224 bufcpy = sre_strdup(buf, -1); | |
1225 s = bufcpy; | |
1226 if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) != NULL && | |
1227 (s2 = sre_strtok(&s, WHITESPACE, NULL)) != NULL && | |
1228 IsInt(s1) && | |
1229 IsInt(s2)) | |
1230 { free(bufcpy); fmt = MSAFILE_PHYLIP; goto DONE; } | |
1231 free(bufcpy); | |
1232 } | |
1233 | |
1234 /* We trust that other formats identify themselves soon. | |
1235 */ | |
1236 /* dead giveaways for extended SELEX */ | |
1237 if (strncmp(buf, "#=AU", 4) == 0 || | |
1238 strncmp(buf, "#=ID", 4) == 0 || | |
1239 strncmp(buf, "#=AC", 4) == 0 || | |
1240 strncmp(buf, "#=DE", 4) == 0 || | |
1241 strncmp(buf, "#=GA", 4) == 0 || | |
1242 strncmp(buf, "#=TC", 4) == 0 || | |
1243 strncmp(buf, "#=NC", 4) == 0 || | |
1244 strncmp(buf, "#=SQ", 4) == 0 || | |
1245 strncmp(buf, "#=SS", 4) == 0 || | |
1246 strncmp(buf, "#=CS", 4) == 0 || | |
1247 strncmp(buf, "#=RF", 4) == 0) | |
1248 { fmt = MSAFILE_SELEX; goto DONE; } | |
1249 | |
1250 if (strncmp(buf, "///", 3) == 0 || strncmp(buf, "ENTRY ", 6) == 0) | |
1251 { fmt = SQFILE_PIR; goto DONE; } | |
1252 | |
1253 /* a ha, diagnostic of an (old) MSF file */ | |
1254 if ((strstr(buf, "..") != NULL) && | |
1255 (strstr(buf, "MSF:") != NULL) && | |
1256 (strstr(buf, "Check:")!= NULL)) | |
1257 { fmt = MSAFILE_MSF; goto DONE; } | |
1258 | |
1259 /* unaligned GCG (must follow MSF test!) */ | |
1260 if (strstr(buf, " Check: ") != NULL && strstr(buf, "..") != NULL) | |
1261 { fmt = SQFILE_GCG; goto DONE; } | |
1262 | |
1263 if (strncmp(buf,"LOCUS ",6) == 0 || strncmp(buf,"ORIGIN ",6) == 0) | |
1264 { fmt = SQFILE_GENBANK; goto DONE; } | |
1265 | |
1266 if (strncmp(buf,"ID ",5) == 0 || strncmp(buf,"SQ ",5) == 0) | |
1267 { fmt = SQFILE_EMBL; goto DONE; } | |
1268 | |
1269 /* But past here, we're being desperate. A simple SELEX file is | |
1270 * very difficult to detect; we can only try to disprove it. | |
1271 */ | |
1272 s = buf; | |
1273 if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue; /* skip blank lines */ | |
1274 if (strchr("#%", *s1) != NULL) continue; /* skip comment lines */ | |
1275 | |
1276 /* Disproof 1. Noncomment, nonblank lines in a SELEX file | |
1277 * must have at least two space-delimited fields (name/seq) | |
1278 */ | |
1279 if ((s2 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) | |
1280 has_junk = TRUE; | |
1281 | |
1282 /* Disproof 2. | |
1283 * The sequence field should look like a sequence. | |
1284 */ | |
1285 if (s2 != NULL && Seqtype(s2) == kOtherSeq) | |
1286 has_junk = TRUE; | |
1287 | |
1288 ndataline++; | |
1289 if (ndataline == 300) break; /* only look at first 300 lines */ | |
1290 } | |
1291 | |
1292 if (ndataline == 0) | |
1293 Die("Sequence file contains no data"); | |
1294 | |
1295 /* If we've made it this far, we've run out of data, but there | |
1296 * was at least one line of it; check if we've | |
1297 * disproven SELEX. If not, cross our fingers, pray, and guess SELEX. | |
1298 */ | |
1299 if (has_junk == TRUE) fmt = SQFILE_UNKNOWN; | |
1300 else fmt = MSAFILE_SELEX; | |
1301 | |
1302 DONE: | |
1303 if (buf != NULL) free(buf); | |
1304 rewind(fp); | |
1305 return fmt; | |
1306 } | |
1307 | |
1308 /* Function: GCGBinaryToSequence() | |
1309 * | |
1310 * Purpose: Convert a GCG 2BIT binary string to DNA sequence. | |
1311 * 0 = C 1 = T 2 = A 3 = G | |
1312 * 4 nts/byte | |
1313 * | |
1314 * Args: seq - binary sequence. Converted in place to DNA. | |
1315 * len - length of DNA. binary is (len+3)/4 bytes | |
1316 */ | |
1317 int | |
1318 GCGBinaryToSequence(char *seq, int len) | |
1319 { | |
1320 int bpos; /* position in binary */ | |
1321 int spos; /* position in sequence */ | |
1322 char twobit; | |
1323 int i; | |
1324 | |
1325 for (bpos = (len-1)/4; bpos >= 0; bpos--) | |
1326 { | |
1327 twobit = seq[bpos]; | |
1328 spos = bpos*4; | |
1329 | |
1330 for (i = 3; i >= 0; i--) | |
1331 { | |
1332 switch (twobit & 0x3) { | |
1333 case 0: seq[spos+i] = 'C'; break; | |
1334 case 1: seq[spos+i] = 'T'; break; | |
1335 case 2: seq[spos+i] = 'A'; break; | |
1336 case 3: seq[spos+i] = 'G'; break; | |
1337 } | |
1338 twobit = twobit >> 2; | |
1339 } | |
1340 } | |
1341 seq[len] = '\0'; | |
1342 return 1; | |
1343 } | |
1344 | |
1345 | |
1346 /* Function: GCGchecksum() | |
1347 * Date: SRE, Mon May 31 11:13:21 1999 [St. Louis] | |
1348 * | |
1349 * Purpose: Calculate a GCG checksum for a sequence. | |
1350 * Code provided by Steve Smith of Genetics | |
1351 * Computer Group. | |
1352 * | |
1353 * Args: seq - sequence to calculate checksum for. | |
1354 * may contain gap symbols. | |
1355 * len - length of sequence (usually known, | |
1356 * so save a strlen() call) | |
1357 * | |
1358 * Returns: GCG checksum. | |
1359 */ | |
1360 int | |
1361 GCGchecksum(char *seq, int len) | |
1362 { | |
1363 int i; /* position in sequence */ | |
1364 int chk = 0; /* calculated checksum */ | |
1365 | |
1366 for (i = 0; i < len; i++) | |
1367 chk = (chk + (i % 57 + 1) * (sre_toupper((int) seq[i]))) % 10000; | |
1368 return chk; | |
1369 } | |
1370 | |
1371 | |
1372 /* Function: GCGMultchecksum() | |
1373 * | |
1374 * Purpose: GCG checksum for a multiple alignment: sum of | |
1375 * individual sequence checksums (including their | |
1376 * gap characters) modulo 10000. | |
1377 * | |
1378 * Implemented using spec provided by Steve Smith of | |
1379 * Genetics Computer Group. | |
1380 * | |
1381 * Args: seqs - sequences to be checksummed; aligned or not | |
1382 * nseq - number of sequences | |
1383 * | |
1384 * Return: the checksum, a number between 0 and 9999 | |
1385 */ | |
1386 int | |
1387 GCGMultchecksum(char **seqs, int nseq) | |
1388 { | |
1389 int chk = 0; | |
1390 int idx; | |
1391 | |
1392 for (idx = 0; idx < nseq; idx++) | |
1393 chk = (chk + GCGchecksum(seqs[idx], strlen(seqs[idx]))) % 10000; | |
1394 return chk; | |
1395 } | |
1396 | |
1397 | |
1398 | |
1399 | |
1400 /* Function: Seqtype() | |
1401 * | |
1402 * Purpose: Returns a (very good) guess about type of sequence: | |
1403 * kDNA, kRNA, kAmino, or kOtherSeq. | |
1404 * | |
1405 * Modified from, and replaces, Gilbert getseqtype(). | |
1406 */ | |
1407 int | |
1408 Seqtype(char *seq) | |
1409 { | |
1410 int saw; /* how many non-gap characters I saw */ | |
1411 char c; | |
1412 int po = 0; /* count of protein-only */ | |
1413 int nt = 0; /* count of t's */ | |
1414 int nu = 0; /* count of u's */ | |
1415 int na = 0; /* count of nucleotides */ | |
1416 int aa = 0; /* count of amino acids */ | |
1417 int no = 0; /* count of others */ | |
1418 | |
1419 /* Look at the first 300 non-gap characters | |
1420 */ | |
1421 | |
1422 #ifdef CLUSTALO | |
1423 /* VGGNGDDYLSGGTGNDTL is recognized as unknown using squid's default | |
1424 * approach. | |
1425 * We change it to the following: | |
1426 | |
1427 * 1. counting: ignore gaps and not alpha characters. if protein-only then | |
1428 * count as such (po). otherwise decide if amino-acid (aa) or nucleic-acid | |
1429 * (na) or unknown (no) | |
1430 * | |
1431 * 2. determine type: if we saw more unknown than aa or na, return unknown. | |
1432 * if encountered protein-only return protein-only. otherwise decide based | |
1433 * on majority. (if aa==na return na) | |
1434 */ | |
1435 for (saw = 0; *seq != '\0' && saw < 300; seq++) { | |
1436 c = sre_toupper((int) *seq); | |
1437 int unknown = 1; | |
1438 | |
1439 if (isgap(c) || ! isalpha((int) c)) { | |
1440 continue; | |
1441 } | |
1442 | |
1443 if (strchr(protonly, c)) { | |
1444 po++; | |
1445 unknown = 0; | |
1446 } | |
1447 | |
1448 if (strchr(aminos,c)) { | |
1449 aa++; | |
1450 unknown = 0; | |
1451 } | |
1452 | |
1453 if (strchr(primenuc,c)) { | |
1454 na++; | |
1455 unknown = 0; | |
1456 | |
1457 if (c == 'T') | |
1458 nt++; | |
1459 else if (c == 'U') | |
1460 nu++; | |
1461 } | |
1462 | |
1463 if (unknown) { | |
1464 no ++; | |
1465 } | |
1466 | |
1467 saw++; | |
1468 } | |
1469 | |
1470 if (no > aa && no > na) | |
1471 return kOtherSeq; | |
1472 | |
1473 if (po > 0 || aa>na) | |
1474 return kAmino; | |
1475 | |
1476 if (na >= aa) { | |
1477 if (nu > nt) | |
1478 return kRNA; | |
1479 else | |
1480 return kDNA; | |
1481 } | |
1482 | |
1483 return kOtherSeq; | |
1484 | |
1485 | |
1486 #else | |
1487 for (saw = 0; *seq != '\0' && saw < 300; seq++) | |
1488 { | |
1489 c = sre_toupper((int) *seq); | |
1490 if (! isgap(c)) | |
1491 { | |
1492 if (strchr(protonly, c)) po++; | |
1493 else if (strchr(primenuc,c)) { | |
1494 na++; | |
1495 if (c == 'T') nt++; | |
1496 else if (c == 'U') nu++; | |
1497 } | |
1498 else if (strchr(aminos,c)) aa++; | |
1499 else if (isalpha((int) c)) no++; | |
1500 saw++; | |
1501 } | |
1502 } | |
1503 | |
1504 if (no > 0) return kOtherSeq; | |
1505 else if (po > 0) return kAmino; | |
1506 else if (na > aa) { | |
1507 if (nu > nt) return kRNA; | |
1508 else return kDNA; | |
1509 } | |
1510 else return kAmino; /* ooooh. risky. */ | |
1511 #endif | |
1512 | |
1513 } | |
1514 | |
1515 | |
1516 /* Function: GuessAlignmentSeqtype() | |
1517 * Date: SRE, Wed Jul 7 09:42:34 1999 [St. Louis] | |
1518 * | |
1519 * Purpose: Try to guess whether an alignment is protein | |
1520 * or nucleic acid; return a code for the | |
1521 * type (kRNA, kDNA, or kAmino). | |
1522 * | |
1523 * Args: aseq - array of aligned sequences. (Could also | |
1524 * be an rseq unaligned sequence array) | |
1525 * nseq - number of aseqs | |
1526 * | |
1527 * Returns: kRNA, kDNA, kAmino; | |
1528 * kOtherSeq if inconsistency is detected. | |
1529 */ | |
1530 int | |
1531 GuessAlignmentSeqtype(char **aseq, int nseq) | |
1532 { | |
1533 int idx; | |
1534 int nrna = 0; | |
1535 int ndna = 0; | |
1536 int namino = 0; | |
1537 int nother = 0; | |
1538 | |
1539 for (idx = 0; idx < nseq; idx++) | |
1540 switch (Seqtype(aseq[idx])) { | |
1541 case kRNA: nrna++; break; | |
1542 case kDNA: ndna++; break; | |
1543 case kAmino: namino++; break; | |
1544 default: nother++; | |
1545 } | |
1546 | |
1547 /* Unambiguous decisions: | |
1548 */ | |
1549 if (nother) return kOtherSeq; | |
1550 if (namino == nseq) return kAmino; | |
1551 if (ndna == nseq) return kDNA; | |
1552 if (nrna == nseq) return kRNA; | |
1553 | |
1554 /* Ambiguous decisions: | |
1555 */ | |
1556 if (namino == 0) return kRNA; /* it's nucleic acid, but seems mixed RNA/DNA */ | |
1557 return kAmino; /* some amino acid seen; others probably short seqs, some | |
1558 of which may be entirely ACGT (ala,cys,gly,thr). We | |
1559 could be a little more sophisticated: U would be a giveaway | |
1560 that we're not in protein seqs */ | |
1561 } | |
1562 | |
1563 /* Function: WriteSimpleFASTA() | |
1564 * Date: SRE, Tue Nov 16 18:06:00 1999 [St. Louis] | |
1565 * | |
1566 * Purpose: Just write a FASTA format sequence to a file; | |
1567 * minimal interface, mostly for quick and dirty programs. | |
1568 * | |
1569 * Args: fp - open file handle (stdout, possibly) | |
1570 * seq - sequence to output | |
1571 * name - name for the sequence | |
1572 * desc - optional description line, or NULL. | |
1573 * | |
1574 * Returns: void | |
1575 */ | |
1576 void | |
1577 WriteSimpleFASTA(FILE *fp, char *seq, char *name, char *desc) | |
1578 { | |
1579 char buf[61]; | |
1580 int len; | |
1581 int pos; | |
1582 | |
1583 len = strlen(seq); | |
1584 buf[60] = '\0'; | |
1585 fprintf(fp, ">%s %s\n", name, desc != NULL ? desc : ""); | |
1586 for (pos = 0; pos < len; pos += 60) | |
1587 { | |
1588 strncpy(buf, seq+pos, 60); | |
1589 fprintf(fp, "%s\n", buf); | |
1590 } | |
1591 } | |
1592 | |
1593 int | |
1594 WriteSeq(FILE *outf, int outform, char *seq, SQINFO *sqinfo) | |
1595 { | |
1596 int numline = 0; | |
1597 int lines = 0, spacer = 0, width = 50, tab = 0; | |
1598 int i, j, l, l1, ibase; | |
1599 char endstr[10]; | |
1600 char s[100]; /* buffer for sequence */ | |
1601 char ss[100]; /* buffer for structure */ | |
1602 int checksum = 0; | |
1603 int seqlen; | |
1604 int which_case; /* 0 = do nothing. 1 = upper case. 2 = lower case */ | |
1605 int dostruc; /* TRUE to print structure lines*/ | |
1606 | |
1607 which_case = 0; | |
1608 dostruc = FALSE; | |
1609 seqlen = (sqinfo->flags & SQINFO_LEN) ? sqinfo->len : strlen(seq); | |
1610 | |
1611 if (IsAlignmentFormat(outform)) | |
1612 Die("Tried to write an aligned format with WriteSeq() -- bad, bad."); | |
1613 | |
1614 | |
1615 strcpy( endstr,""); | |
1616 l1 = 0; | |
1617 checksum = GCGchecksum(seq, seqlen); | |
1618 | |
1619 switch (outform) { | |
1620 case SQFILE_UNKNOWN: /* no header, just sequence */ | |
1621 strcpy(endstr,"\n"); /* end w/ extra blank line */ | |
1622 break; | |
1623 | |
1624 case SQFILE_GENBANK: | |
1625 fprintf(outf,"LOCUS %s %d bp\n", | |
1626 sqinfo->name, seqlen); | |
1627 fprintf(outf,"ACCESSION %s\n", | |
1628 (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "."); | |
1629 fprintf(outf,"DEFINITION %s\n", | |
1630 (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "."); | |
1631 fprintf(outf,"VERSION %s\n", | |
1632 (sqinfo->flags & SQINFO_ID) ? sqinfo->id : "."); | |
1633 fprintf(outf,"ORIGIN \n"); | |
1634 spacer = 11; | |
1635 numline = 1; | |
1636 strcpy(endstr, "\n//"); | |
1637 break; | |
1638 | |
1639 case SQFILE_GCGDATA: | |
1640 fprintf(outf, ">>>>%s 9/95 ASCII Len: %d\n", sqinfo->name, seqlen); | |
1641 fprintf(outf, "%s\n", (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-"); | |
1642 break; | |
1643 | |
1644 case SQFILE_PIR: | |
1645 fprintf(outf, "ENTRY %s\n", | |
1646 (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name); | |
1647 fprintf(outf, "TITLE %s\n", | |
1648 (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-"); | |
1649 fprintf(outf, "ACCESSION %s\n", | |
1650 (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-"); | |
1651 fprintf(outf, "SUMMARY #Length %d #Checksum %d\n", | |
1652 sqinfo->len, checksum); | |
1653 fprintf(outf, "SEQUENCE\n"); | |
1654 fprintf(outf, " 5 10 15 20 25 30\n"); | |
1655 spacer = 2; /* spaces after every residue */ | |
1656 numline = 1; /* number lines w/ coords */ | |
1657 width = 30; /* 30 aa per line */ | |
1658 strcpy(endstr, "\n///"); | |
1659 break; | |
1660 | |
1661 case SQFILE_SQUID: | |
1662 fprintf(outf, "NAM %s\n", sqinfo->name); | |
1663 if (sqinfo->flags & (SQINFO_ID | SQINFO_ACC | SQINFO_START | SQINFO_STOP | SQINFO_OLEN)) | |
1664 fprintf(outf, "SRC %s %s %d..%d::%d\n", | |
1665 (sqinfo->flags & SQINFO_ID) ? sqinfo->id : "-", | |
1666 (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-", | |
1667 (sqinfo->flags & SQINFO_START) ? sqinfo->start : 0, | |
1668 (sqinfo->flags & SQINFO_STOP) ? sqinfo->stop : 0, | |
1669 (sqinfo->flags & SQINFO_OLEN) ? sqinfo->olen : 0); | |
1670 if (sqinfo->flags & SQINFO_DESC) | |
1671 fprintf(outf, "DES %s\n", sqinfo->desc); | |
1672 if (sqinfo->flags & SQINFO_SS) | |
1673 { | |
1674 fprintf(outf, "SEQ +SS\n"); | |
1675 dostruc = TRUE; /* print structure lines too */ | |
1676 } | |
1677 else | |
1678 fprintf(outf, "SEQ\n"); | |
1679 numline = 1; /* number seq lines w/ coords */ | |
1680 strcpy(endstr, "\n++"); | |
1681 break; | |
1682 | |
1683 case SQFILE_EMBL: | |
1684 fprintf(outf,"ID %s\n", | |
1685 (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name); | |
1686 fprintf(outf,"AC %s\n", | |
1687 (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-"); | |
1688 fprintf(outf,"DE %s\n", | |
1689 (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-"); | |
1690 fprintf(outf,"SQ %d BP\n", seqlen); | |
1691 strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/ | |
1692 tab = 5; /** added 31jan91 */ | |
1693 spacer = 11; /** added 31jan91 */ | |
1694 break; | |
1695 | |
1696 case SQFILE_GCG: | |
1697 fprintf(outf,"%s\n", sqinfo->name); | |
1698 if (sqinfo->flags & SQINFO_ACC) | |
1699 fprintf(outf,"ACCESSION %s\n", sqinfo->acc); | |
1700 if (sqinfo->flags & SQINFO_DESC) | |
1701 fprintf(outf,"DEFINITION %s\n", sqinfo->desc); | |
1702 fprintf(outf," %s Length: %d (today) Check: %d ..\n", | |
1703 sqinfo->name, seqlen, checksum); | |
1704 spacer = 11; | |
1705 numline = 1; | |
1706 strcpy(endstr, "\n"); /* this is insurance to help prevent misreads at eof */ | |
1707 break; | |
1708 | |
1709 case SQFILE_STRIDER: /* ?? map ?*/ | |
1710 fprintf(outf,"; ### from DNA Strider ;-)\n"); | |
1711 fprintf(outf,"; DNA sequence %s, %d bases, %d checksum.\n;\n", | |
1712 sqinfo->name, seqlen, checksum); | |
1713 strcpy(endstr, "\n//"); | |
1714 break; | |
1715 | |
1716 /* SRE: Don had Zuker default to Pearson, which is not | |
1717 intuitive or helpful, since Zuker's MFOLD can't read | |
1718 Pearson format. More useful to use kIG */ | |
1719 case SQFILE_ZUKER: | |
1720 which_case = 1; /* MFOLD requires upper case. */ | |
1721 /*FALLTHRU*/ | |
1722 case SQFILE_IG: | |
1723 fprintf(outf,";%s %s\n", | |
1724 sqinfo->name, | |
1725 (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : ""); | |
1726 fprintf(outf,"%s\n", sqinfo->name); | |
1727 strcpy(endstr,"1"); /* == linear dna */ | |
1728 break; | |
1729 | |
1730 case SQFILE_RAW: /* Raw: no header at all. */ | |
1731 break; | |
1732 | |
1733 default : | |
1734 case SQFILE_FASTA: | |
1735 fprintf(outf,">%s %s\n", sqinfo->name, | |
1736 (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : ""); | |
1737 break; | |
1738 } | |
1739 | |
1740 if (which_case == 1) s2upper(seq); | |
1741 if (which_case == 2) s2lower(seq); | |
1742 | |
1743 | |
1744 width = MIN(width,100); | |
1745 for (i=0, l=0, ibase = 1, lines = 0; i < seqlen; ) { | |
1746 if (l1 < 0) l1 = 0; | |
1747 else if (l1 == 0) { | |
1748 if (numline) fprintf(outf,"%8d ",ibase); | |
1749 for (j=0; j<tab; j++) fputc(' ',outf); | |
1750 } | |
1751 if ((spacer != 0) && ((l+1) % spacer == 1)) | |
1752 { s[l] = ' '; ss[l] = ' '; l++; } | |
1753 s[l] = seq[i]; | |
1754 ss[l] = (sqinfo->flags & SQINFO_SS) ? sqinfo->ss[i] : '.'; | |
1755 l++; i++; | |
1756 l1++; /* don't count spaces for width*/ | |
1757 if (l1 == width || i == seqlen) { | |
1758 s[l] = ss[l] = '\0'; | |
1759 l = 0; l1 = 0; | |
1760 if (dostruc) | |
1761 { | |
1762 fprintf(outf, "%s\n", s); | |
1763 if (numline) fprintf(outf," "); | |
1764 for (j=0; j<tab; j++) fputc(' ',outf); | |
1765 if (i == seqlen) fprintf(outf,"%s%s\n",ss,endstr); | |
1766 else fprintf(outf,"%s\n",ss); | |
1767 } | |
1768 else | |
1769 { | |
1770 if (i == seqlen) fprintf(outf,"%s%s\n",s,endstr); | |
1771 else fprintf(outf,"%s\n",s); | |
1772 } | |
1773 lines++; | |
1774 ibase = i+1; | |
1775 } | |
1776 } | |
1777 return lines; | |
1778 } | |
1779 | |
1780 | |
1781 /* Function: ReadMultipleRseqs() | |
1782 * | |
1783 * Purpose: Open a data file and | |
1784 * parse it into an array of rseqs (raw, unaligned | |
1785 * sequences). | |
1786 * | |
1787 * Caller is responsible for free'ing memory allocated | |
1788 * to ret_rseqs, ret_weights, and ret_names. | |
1789 * | |
1790 * Weights are currently only supported for MSF format. | |
1791 * Sequences read from all other formats will be assigned | |
1792 * weights of 1.0. If the caller isn't interested in | |
1793 * weights, it passes NULL as ret_weights. | |
1794 * | |
1795 * Returns 1 on success. Returns 0 on failure and sets | |
1796 * squid_errno to indicate the cause. | |
1797 */ | |
1798 int | |
1799 ReadMultipleRseqs(char *seqfile, | |
1800 int fformat, | |
1801 char ***ret_rseqs, | |
1802 SQINFO **ret_sqinfo, | |
1803 int *ret_num) | |
1804 { | |
1805 SQINFO *sqinfo; /* array of sequence optional info */ | |
1806 SQFILE *dbfp; /* open ptr for sequential access of file */ | |
1807 char **rseqs; /* sequence array */ | |
1808 int numalloced; /* num of seqs currently alloced for */ | |
1809 int num; | |
1810 | |
1811 | |
1812 num = 0; | |
1813 numalloced = 16; | |
1814 rseqs = (char **) MallocOrDie (numalloced * sizeof(char *)); | |
1815 sqinfo = (SQINFO *) MallocOrDie (numalloced * sizeof(SQINFO)); | |
1816 if ((dbfp = SeqfileOpen(seqfile, fformat, NULL)) == NULL) return 0; | |
1817 | |
1818 while (ReadSeq(dbfp, dbfp->format, &rseqs[num], &(sqinfo[num]))) | |
1819 { | |
1820 num++; | |
1821 if (num == numalloced) /* more seqs coming, alloc more room */ | |
1822 { | |
1823 numalloced += 16; | |
1824 rseqs = (char **) ReallocOrDie (rseqs, numalloced*sizeof(char *)); | |
1825 sqinfo = (SQINFO *) ReallocOrDie (sqinfo, numalloced * sizeof(SQINFO)); | |
1826 } | |
1827 } | |
1828 SeqfileClose(dbfp); | |
1829 | |
1830 *ret_rseqs = rseqs; | |
1831 *ret_sqinfo = sqinfo; | |
1832 *ret_num = num; | |
1833 return 1; | |
1834 } | |
1835 | |
1836 | |
1837 /* Function: String2SeqfileFormat() | |
1838 * Date: SRE, Sun Jun 27 15:25:54 1999 [TW 723 over Canadian Shield] | |
1839 * | |
1840 * Purpose: Convert a string (e.g. from command line option arg) | |
1841 * to a format code. Case insensitive. Return | |
1842 * MSAFILE_UNKNOWN/SQFILE_UNKNOWN if string is bad. | |
1843 * Uses codes defined in squid.h (unaligned formats) and | |
1844 * msa.h (aligned formats). | |
1845 * | |
1846 * Args: s - string to convert; e.g. "stockholm" | |
1847 * | |
1848 * Returns: format code; e.g. MSAFILE_STOCKHOLM | |
1849 */ | |
1850 int | |
1851 String2SeqfileFormat(char *s) | |
1852 { | |
1853 char *s2; | |
1854 int code = SQFILE_UNKNOWN; | |
1855 | |
1856 if (s == NULL) return SQFILE_UNKNOWN; | |
1857 s2 = sre_strdup(s, -1); | |
1858 s2upper(s2); | |
1859 | |
1860 if (strcmp(s2, "FASTA") == 0) code = SQFILE_FASTA; | |
1861 #ifdef CLUSTALO | |
1862 if (strcmp(s2, "FA") == 0) code = SQFILE_FASTA; | |
1863 else if (strcmp(s2, "VIENNA") == 0) code = SQFILE_VIENNA; | |
1864 else if (strcmp(s2, "VIE") == 0) code = SQFILE_VIENNA; | |
1865 #endif | |
1866 else if (strcmp(s2, "GENBANK") == 0) code = SQFILE_GENBANK; | |
1867 #ifdef CLUSTALO | |
1868 else if (strcmp(s2, "GB") == 0) code = SQFILE_GENBANK; | |
1869 #endif | |
1870 else if (strcmp(s2, "EMBL") == 0) code = SQFILE_EMBL; | |
1871 else if (strcmp(s2, "GCG") == 0) code = SQFILE_GCG; | |
1872 else if (strcmp(s2, "GCGDATA") == 0) code = SQFILE_GCGDATA; | |
1873 else if (strcmp(s2, "RAW") == 0) code = SQFILE_RAW; | |
1874 else if (strcmp(s2, "IG") == 0) code = SQFILE_IG; | |
1875 else if (strcmp(s2, "STRIDER") == 0) code = SQFILE_STRIDER; | |
1876 else if (strcmp(s2, "IDRAW") == 0) code = SQFILE_IDRAW; | |
1877 else if (strcmp(s2, "ZUKER") == 0) code = SQFILE_ZUKER; | |
1878 else if (strcmp(s2, "PIR") == 0) code = SQFILE_PIR; | |
1879 else if (strcmp(s2, "SQUID") == 0) code = SQFILE_SQUID; | |
1880 else if (strcmp(s2, "STOCKHOLM") == 0) code = MSAFILE_STOCKHOLM; | |
1881 #ifdef CLUSTALO | |
1882 else if (strcmp(s2, "ST") == 0) code = MSAFILE_STOCKHOLM; | |
1883 else if (strcmp(s2, "STK") == 0) code = MSAFILE_STOCKHOLM; | |
1884 #endif | |
1885 else if (strcmp(s2, "SELEX") == 0) code = MSAFILE_SELEX; | |
1886 else if (strcmp(s2, "MSF") == 0) code = MSAFILE_MSF; | |
1887 else if (strcmp(s2, "CLUSTAL") == 0) code = MSAFILE_CLUSTAL; | |
1888 #ifdef CLUSTALO | |
1889 else if (strcmp(s2, "CLU") == 0) code = MSAFILE_CLUSTAL; | |
1890 #endif | |
1891 else if (strcmp(s2, "A2M") == 0) code = MSAFILE_A2M; | |
1892 else if (strcmp(s2, "PHYLIP") == 0) code = MSAFILE_PHYLIP; | |
1893 #ifdef CLUSTALO | |
1894 else if (strcmp(s2, "PHY") == 0) code = MSAFILE_PHYLIP; | |
1895 #endif | |
1896 else if (strcmp(s2, "EPS") == 0) code = MSAFILE_EPS; | |
1897 #ifdef CLUSTALO | |
1898 else code = SQFILE_UNKNOWN; | |
1899 #endif | |
1900 free(s2); | |
1901 return code; | |
1902 } | |
1903 char * | |
1904 SeqfileFormat2String(int code) | |
1905 { | |
1906 switch (code) { | |
1907 case SQFILE_UNKNOWN: return "unknown"; | |
1908 case SQFILE_FASTA: return "FASTA"; | |
1909 #ifdef CLUSTALO | |
1910 case SQFILE_VIENNA: return "Vienna"; | |
1911 #endif | |
1912 case SQFILE_GENBANK: return "Genbank"; | |
1913 case SQFILE_EMBL: return "EMBL"; | |
1914 case SQFILE_GCG: return "GCG"; | |
1915 case SQFILE_GCGDATA: return "GCG data library"; | |
1916 case SQFILE_RAW: return "raw"; | |
1917 case SQFILE_IG: return "Intelligenetics"; | |
1918 case SQFILE_STRIDER: return "MacStrider"; | |
1919 case SQFILE_IDRAW: return "Idraw Postscript"; | |
1920 case SQFILE_ZUKER: return "Zuker"; | |
1921 case SQFILE_PIR: return "PIR"; | |
1922 case SQFILE_SQUID: return "SQUID"; | |
1923 case MSAFILE_STOCKHOLM: return "Stockholm"; | |
1924 case MSAFILE_SELEX: return "SELEX"; | |
1925 case MSAFILE_MSF: return "MSF"; | |
1926 case MSAFILE_CLUSTAL: return "Clustal"; | |
1927 case MSAFILE_A2M: return "a2m"; | |
1928 case MSAFILE_PHYLIP: return "Phylip"; | |
1929 case MSAFILE_EPS: return "EPS"; | |
1930 default: | |
1931 Die("Bad code passed to MSAFormat2String()"); | |
1932 } | |
1933 /*NOTREACHED*/ | |
1934 return NULL; | |
1935 } | |
1936 | |
1937 | |
1938 /* Function: MSAToSqinfo() | |
1939 * Date: SRE, Tue Jul 20 14:36:56 1999 [St. Louis] | |
1940 * | |
1941 * Purpose: Take an MSA and generate a SQINFO array suitable | |
1942 * for use in annotating the unaligned sequences. | |
1943 * Return the array. | |
1944 * | |
1945 * Permanent temporary code. sqinfo was poorly designed. | |
1946 * it must eventually be replaced, but the odds | |
1947 * of this happening soon are nil, so I have to deal. | |
1948 * | |
1949 * Args: msa - the alignment | |
1950 * | |
1951 * Returns: ptr to allocated sqinfo array. | |
1952 * Freeing is ghastly: free in each individual sqinfo[i] | |
1953 * with FreeSequence(NULL, &(sqinfo[i])), then | |
1954 * free(sqinfo). | |
1955 */ | |
1956 SQINFO * | |
1957 MSAToSqinfo(MSA *msa) | |
1958 { | |
1959 int idx; | |
1960 SQINFO *sqinfo; | |
1961 | |
1962 sqinfo = MallocOrDie(sizeof(SQINFO) * msa->nseq); | |
1963 | |
1964 for (idx = 0; idx < msa->nseq; idx++) | |
1965 { | |
1966 sqinfo[idx].flags = 0; | |
1967 SetSeqinfoString(&(sqinfo[idx]), | |
1968 msa->sqname[idx], SQINFO_NAME); | |
1969 SetSeqinfoString(&(sqinfo[idx]), | |
1970 MSAGetSeqAccession(msa, idx), SQINFO_ACC); | |
1971 SetSeqinfoString(&(sqinfo[idx]), | |
1972 MSAGetSeqDescription(msa, idx), SQINFO_DESC); | |
1973 | |
1974 if (msa->ss != NULL && msa->ss[idx] != NULL) { | |
1975 MakeDealignedString(msa->aseq[idx], msa->alen, | |
1976 msa->ss[idx], &(sqinfo[idx].ss)); | |
1977 sqinfo[idx].flags |= SQINFO_SS; | |
1978 } | |
1979 | |
1980 if (msa->sa != NULL && msa->sa[idx] != NULL) { | |
1981 MakeDealignedString(msa->aseq[idx], msa->alen, | |
1982 msa->sa[idx], &(sqinfo[idx].sa)); | |
1983 sqinfo[idx].flags |= SQINFO_SA; | |
1984 } | |
1985 | |
1986 sqinfo[idx].len = DealignedLength(msa->aseq[idx]); | |
1987 sqinfo[idx].flags |= SQINFO_LEN; | |
1988 } | |
1989 return sqinfo; | |
1990 } | |
1991 | |
1992 | |
1993 | |
1994 /* cc -o sqio_test -DA_QUIET_DAY -L. sqio.c -lsquid */ | |
1995 #ifdef A_QUIET_DAY | |
1996 #include "ssi.h" | |
1997 int | |
1998 main(int argc, char **argv) | |
1999 { | |
2000 FILE *fp; | |
2001 char *filename; | |
2002 char *buf; | |
2003 int len; | |
2004 int mode = 3; | |
2005 SSIOFFSET off; | |
2006 | |
2007 filename = argv[1]; | |
2008 | |
2009 if (mode == 1) { | |
2010 buf = malloc(sizeof(char) * 256); | |
2011 if ((fp = fopen(filename, "r")) == NULL) | |
2012 Die("open of %s failed", filename); | |
2013 while (fgets(buf, 255, fp) != NULL) | |
2014 ; | |
2015 fclose(fp); | |
2016 free(buf); | |
2017 } else if (mode == 2) { | |
2018 if ((fp = fopen(filename, "r")) == NULL) | |
2019 Die("open of %s failed", filename); | |
2020 buf = NULL; len = 0; | |
2021 while (sre_fgets(&buf, &len, fp) != NULL) | |
2022 SSIGetFilePosition(fp, SSI_OFFSET_I32, &off); | |
2023 fclose(fp); | |
2024 free(buf); | |
2025 } else if (mode == 3) { | |
2026 SQFILE *dbfp; | |
2027 SQINFO info; | |
2028 | |
2029 if ((dbfp = SeqfileOpen(filename, SQFILE_FASTA, NULL)) == NULL) | |
2030 Die("open of %s failed", filename); | |
2031 while (ReadSeq(dbfp, dbfp->format, &buf, &info)) { | |
2032 SSIGetFilePosition(dbfp->f, SSI_OFFSET_I32, &off); | |
2033 FreeSequence(buf, &info); | |
2034 } | |
2035 SeqfileClose(dbfp); | |
2036 } | |
2037 | |
2038 } | |
2039 | |
2040 | |
2041 #endif |