1
|
1 /*****************************************************************
|
|
2 * SQUID - a library of functions for biological sequence analysis
|
|
3 * Copyright (C) 1992-2002 Washington University School of Medicine
|
|
4 *
|
|
5 * This source code is freely distributed under the terms of the
|
|
6 * GNU General Public License. See the files COPYRIGHT and LICENSE
|
|
7 * for details.
|
|
8 *****************************************************************/
|
|
9
|
|
10 /* phylip.c
|
|
11 * SRE, Mon Jun 14 14:08:33 1999 [St. Louis]
|
|
12 *
|
|
13 * Import/export of PHYLIP interleaved multiple sequence alignment
|
|
14 * format files.
|
|
15 *
|
|
16 * RCS $Id: phylip.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: phylip.c,v 1.1 1999/07/15 22:29:20 eddy Exp)
|
|
17 */
|
|
18
|
|
19 #include <stdio.h>
|
|
20 #include <stdlib.h>
|
|
21 #include <string.h>
|
|
22 #include <ctype.h>
|
|
23 #include "squid.h"
|
|
24 #include "msa.h"
|
|
25
|
|
#ifdef TESTDRIVE_PHYLIP
/*****************************************************************
 * phylip.c test driver:
 *
 * Usage: <program> <alignment file>
 *
 * Opens the named alignment file, prints the format code recorded
 * in the open MSAFILE, then reads alignments with ReadPhylip()
 * and echoes each one to stdout with WritePhylip().
 */
int
main(int argc, char **argv)
{
  MSAFILE *afp;
  MSA     *msa;
  char    *file;

  /* Without this guard argv[1] is NULL when no file is given, and
   * that NULL would be handed to MSAFileOpen() and to the %s below.
   */
  if (argc < 2)
    Die("Usage: %s <alignment file>\n",
        (argc > 0 && argv[0] != NULL) ? argv[0] : "phylip_test");

  file = argv[1];

  if ((afp = MSAFileOpen(file, MSAFILE_UNKNOWN, NULL)) == NULL)
    Die("Couldn't open %s\n", file);

  printf("format %d\n", afp->format);

  while ((msa = ReadPhylip(afp)) != NULL)
    {
      WritePhylip(stdout, msa);
      MSAFree(msa);
    }

  MSAFileClose(afp);
  exit(0);
}
/******************************************************************/
#endif /* testdrive_phylip */
|
|
56
|
|
57
|
|
58
|
|
59 /* Function: ReadPhylip()
|
|
60 * Date: SRE, Fri Jun 18 12:59:37 1999 [Sanger Centre]
|
|
61 *
|
|
62 * Purpose: Parse an alignment from an open Phylip format
|
|
63 * alignment file. Phylip is a single-alignment format.
|
|
64 * Return the alignment, or NULL if we have no data.
|
|
65 *
|
|
66 * Args: afp - open alignment file
|
|
67 *
|
|
68 * Returns: MSA * - an alignment object
|
|
69 * Caller responsible for an MSAFree()
|
|
70 * NULL if no more alignments
|
|
71 */
|
|
72 MSA *
|
|
73 ReadPhylip(MSAFILE *afp)
|
|
74 {
|
|
75 MSA *msa;
|
|
76 char *s, *s1, *s2;
|
|
77 char name[11]; /* seq name max len = 10 char */
|
|
78 int nseq, alen;
|
|
79 int idx; /* index of current sequence */
|
|
80 int slen;
|
|
81 int nblock;
|
|
82
|
|
83 if (feof(afp->f)) return NULL;
|
|
84
|
|
85 /* Skip until we see a nonblank line; it's the header,
|
|
86 * containing nseq/alen
|
|
87 */
|
|
88 nseq = 0; alen = 0;
|
|
89 while ((s = MSAFileGetLine(afp)) != NULL)
|
|
90 {
|
|
91 if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue;
|
|
92 if ((s2 = sre_strtok(&s, WHITESPACE, NULL)) == NULL)
|
|
93 Die("Failed to parse nseq/alen from first line of PHYLIP file %s\n", afp->fname);
|
|
94 if (! IsInt(s1) || ! IsInt(s2))
|
|
95 Die("nseq and/or alen not an integer in first line of PHYLIP file %s\n", afp->fname);
|
|
96 nseq = atoi(s1);
|
|
97 alen = atoi(s2);
|
|
98 break;
|
|
99 }
|
|
100
|
|
101 msa = MSAAlloc(nseq, 0);
|
|
102 idx = 0;
|
|
103 nblock = 0;
|
|
104 while ((s = MSAFileGetLine(afp)) != NULL)
|
|
105 {
|
|
106 /* ignore blank lines. nonblank lines start w/ nonblank char */
|
|
107 if (isspace(*s)) continue;
|
|
108 /* First block has seq names */
|
|
109 if (nblock == 0) {
|
|
110 strncpy(name, s, 10);
|
|
111 name[10] = '\0';
|
|
112 GKIStoreKey(msa->index, name);
|
|
113 msa->sqname[idx] = sre_strdup(name, -1);
|
|
114 s += 10;
|
|
115 }
|
|
116 /* be careful of trailing whitespace on lines */
|
|
117 if ((s1 = sre_strtok(&s, WHITESPACE, &slen)) == NULL)
|
|
118 Die("Failed to parse sequence at line %d of PHYLIP file %s\n",
|
|
119 afp->linenumber, afp->fname);
|
|
120 msa->sqlen[idx] = sre_strcat(&(msa->aseq[idx]), msa->sqlen[idx], s1, slen);
|
|
121
|
|
122 idx++;
|
|
123 if (idx == nseq) { idx = 0; nblock++; }
|
|
124 }
|
|
125 msa->nseq = nseq;
|
|
126 MSAVerifyParse(msa); /* verifies; sets alen, wgt; frees sqlen[] */
|
|
127 return msa;
|
|
128 }
|
|
129
|
|
130
|
|
131
|
|
132 /* Function: WritePhylip()
|
|
133 * Date: SRE, Fri Jun 18 12:07:41 1999 [Sanger Centre]
|
|
134 *
|
|
135 * Purpose: Write an alignment in Phylip format to an open file.
|
|
136 *
|
|
137 * Args: fp - file that's open for writing.
|
|
138 * msa - alignment to write.
|
|
139 *
|
|
140 * Returns: (void)
|
|
141 */
|
|
142 void
|
|
143 WritePhylip(FILE *fp, MSA *msa)
|
|
144 {
|
|
145 int idx; /* counter for sequences */
|
|
146 int cpl = 50; /* 50 seq char per line */
|
|
147 char buf[51]; /* buffer for writing seq */
|
|
148 int pos;
|
|
149
|
|
150 /* First line has nseq, alen
|
|
151 */
|
|
152 fprintf(fp, " %d %d\n", msa->nseq, msa->alen);
|
|
153
|
|
154 /* Alignment section.
|
|
155 * PHYLIP is a multiblock format, blocks (optionally) separated
|
|
156 * by blanks; names only attached to first block. Names are
|
|
157 * restricted to ten char; we achieve this by simple truncation (!).
|
|
158 * (Do we need to convert gap characters from our ./- convention?)
|
|
159 */
|
|
160 for (pos = 0; pos < msa->alen; pos += cpl)
|
|
161 {
|
|
162 if (pos > 0) fprintf(fp, "\n");
|
|
163
|
|
164 for (idx = 0; idx < msa->nseq; idx++)
|
|
165 {
|
|
166 strncpy(buf, msa->aseq[idx] + pos, cpl);
|
|
167 buf[cpl] = '\0';
|
|
168 if (pos > 0) fprintf(fp, "%s\n", buf);
|
|
169 else fprintf(fp, "%-10.10s%s\n", msa->sqname[idx], buf);
|
|
170 }
|
|
171 }
|
|
172 return;
|
|
173 }
|