1
|
1 /*****************************************************************
|
|
2 * SQUID - a library of functions for biological sequence analysis
|
|
3 * Copyright (C) 1992-2002 Washington University School of Medicine
|
|
4 *
|
|
5 * This source code is freely distributed under the terms of the
|
|
6 * GNU General Public License. See the files COPYRIGHT and LICENSE
|
|
7 * for details.
|
|
8 *****************************************************************/
|
|
9
|
|
10 /*****************************************************************
|
|
11 * This code is an altered version of Henry Spencer's
|
|
12 * regex library. Alterations are limited to minor streamlining,
|
|
13 * and some name changes to protect the SQUID namespace.
|
|
14 * Henry's copyright notice appears below.
|
|
15 * You can obtain the original from
|
|
16 * ftp://ftp.zoo.toronto.edu/pub/bookregex.tar.Z
|
|
17 * Thanks, Henry!
|
|
18 *
|
|
19 * The magic word for compiling a testdriver: NBA_TEAM_IN_STL
|
|
20 * gcc -o test -g -DNBA_TEAM_IN_STL -L. hsregex.c -lsquid -lm
|
|
21 *
|
|
22 * Usage:
|
|
23 * test <pattern> <ntok> <string>
|
|
24 *
|
|
25 * SRE, Fri Aug 28 11:10:17 1998
|
|
26 * CVS $Id: hsregex.c,v 1.7 2001/08/09 17:50:17 eddy Exp)
|
|
27 *****************************************************************/
|
|
28
|
|
29 #include <stdio.h>
|
|
30 #include <stdlib.h>
|
|
31 #include <string.h>
|
|
32 #include <ctype.h>
|
|
33 #include "squid.h"
|
|
34
|
|
35 /* global sqd_parse[] are managed by Strparse().
|
|
36 * WARNING: TODO: this code is not threadsafe, and needs to be revised.
|
|
37 */
|
|
38 char *sqd_parse[10];
|
|
39
|
|
40 /* Function: Strparse()
|
|
41 *
|
|
42 * Purpose: Match a regexp to a string. Returns 1 if pattern matches,
|
|
43 * else 0.
|
|
44 *
|
|
45 * Much like Perl, Strparse() makes copies of the matching
|
|
46 * substrings available via globals, sqd_parse[].
|
|
47 * sqd_parse[0] contains a copy of the complete matched
|
|
48 * text. sqd_parse[1-9] contain copies of up to nine
|
|
49 * different substrings matched within parentheses.
|
|
50 * The memory for these strings is internally managed and
|
|
51 * volatile; the next call to Strparse() may destroy them.
|
|
52 * If the caller needs the matched substrings to persist
|
|
53 * beyond a new Strparse() call, it must make its own
|
|
54 * copies.
|
|
55 *
|
|
56 * A minor drawback of the memory management is that
|
|
57 * there will be a small amount of unfree'd memory being
|
|
58 * managed by Strparse() when a program exits; this may
|
|
59 * confuse memory debugging (Purify, dbmalloc). The
|
|
60 * general cleanup function SqdClean() is provided;
|
|
61 * you can call this before exiting.
|
|
62 *
|
|
63 * Uses an extended POSIX regular expression interface.
|
|
64 * A copylefted GNU implementation is included in the squid
|
|
65 * implementation (gnuregex.c) for use on non-POSIX compliant
|
|
66 * systems. POSIX 1003.2-compliant systems (all UNIX,
|
|
67 * some WinNT, I believe) can omit the GNU code if necessary.
|
|
68 *
|
|
69 * I built this for ease of use, not speed nor efficiency.
|
|
70 *
|
|
71 * Example: Strparse("foo-...-baz", "foo-bar-baz") returns 0
|
|
72 * Strparse("foo-(...)-baz", "foo-bar-baz")
|
|
73 * returns 0; sqd_parse[0] is "foo-bar-baz";
|
|
74 * sqd_parse[1] is "bar".
|
|
75 *
|
|
76 * A real example:
|
|
77 * s = ">gnl|ti|3 G10P69425RH2.T0 {SUB 81..737} /len=657"
|
|
78 * pat = "SUB ([0-9]+)"
|
|
79 * Strparse(pat, s, 1)
|
|
80 * returns 1; sqd_parse[1] is "81".
|
|
81 *
|
|
82 * Args: rexp - regular expression, extended POSIX form
|
|
83 * s - string to match against
|
|
84 * ntok - number of () substrings we will save (maximum NSUBEXP-1)
|
|
85 *
|
|
86 * Return: 1 on match, 0 if no match
|
|
87 */
|
|
88 int
|
|
89 Strparse(char *rexp, char *s, int ntok)
|
|
90 {
|
|
91 sqd_regexp *pat;
|
|
92 int code;
|
|
93 int len;
|
|
94 int i;
|
|
95 /* sanity check */
|
|
96 if (ntok >= NSUBEXP ) Die("Strparse(): ntok must be <= %d", NSUBEXP-1);
|
|
97
|
|
98 /* Free previous global substring buffers
|
|
99 */
|
|
100 for (i = 0; i <= ntok; i++)
|
|
101 if (sqd_parse[i] != NULL)
|
|
102 {
|
|
103 free(sqd_parse[i]);
|
|
104 sqd_parse[i] = NULL;
|
|
105 }
|
|
106
|
|
107 /* Compile and match the pattern, using our modified
|
|
108 * copy of Henry Spencer's regexp library
|
|
109 */
|
|
110 if ((pat = sqd_regcomp(rexp)) == NULL)
|
|
111 Die("regexp compilation failed.");
|
|
112 code = sqd_regexec(pat, s);
|
|
113
|
|
114 /* Fill the global substring buffers
|
|
115 */
|
|
116 if (code == 1)
|
|
117 for (i = 0; i <= ntok; i++)
|
|
118 if (pat->startp[i] != NULL && pat->endp[i] != NULL)
|
|
119 {
|
|
120 len = pat->endp[i] - pat->startp[i];
|
|
121 sqd_parse[i] = (char *) MallocOrDie(sizeof(char) * (len+1));
|
|
122 strncpy(sqd_parse[i], pat->startp[i], len);
|
|
123 sqd_parse[i][len] = '\0';
|
|
124 }
|
|
125
|
|
126 free(pat);
|
|
127 return code;
|
|
128 }
|
|
129
|
|
130 /* Function: SqdClean()
|
|
131 * Date: SRE, Wed Oct 29 12:52:08 1997 [TWA 721]
|
|
132 *
|
|
133 * Purpose: Clean up any squid library allocations before exiting
|
|
134 * a program, so we don't leave unfree'd memory around
|
|
135 * and confuse a malloc debugger like Purify or dbmalloc.
|
|
136 */
|
|
137 void
|
|
138 SqdClean(void)
|
|
139 {
|
|
140 int i;
|
|
141
|
|
142 /* Free global substring buffers that Strparse() uses
|
|
143 */
|
|
144 for (i = 0; i <= 9; i++)
|
|
145 if (sqd_parse[i] != NULL) {
|
|
146 free(sqd_parse[i]);
|
|
147 sqd_parse[i] = NULL;
|
|
148 }
|
|
149 }
|
|
150
|
|
151
|
|
152
|
|
153 /* all code below is:
|
|
154 * Copyright (c) 1986, 1993, 1995 by University of Toronto.
|
|
155 * Written by Henry Spencer. Not derived from licensed software.
|
|
156 *
|
|
157 * Permission is granted to anyone to use this software for any
|
|
158 * purpose on any computer system, and to redistribute it in any way,
|
|
159 * subject to the following restrictions:
|
|
160 *
|
|
161 * 1. The author is not responsible for the consequences of use of
|
|
162 * this software, no matter how awful, even if they arise
|
|
163 * from defects in it.
|
|
164 *
|
|
165 * 2. The origin of this software must not be misrepresented, either
|
|
166 * by explicit claim or by omission.
|
|
167 *
|
|
168 * 3. Altered versions must be plainly marked as such, and must not
|
|
169 * be misrepresented (by explicit claim or omission) as being
|
|
170 * the original software.
|
|
171 *
|
|
172 * 4. This notice must not be removed or altered.
|
|
173 */
|
|
174
|
|
175 /*
|
|
176 * sqd_regcomp and sqd_regexec -- sqd_regsub and sqd_regerror are elsewhere
|
|
177 */
|
|
178
|
|
179 /*
|
|
180 * The first byte of the regexp internal "program" is actually this magic
|
|
181 * number; the start node begins in the second byte.
|
|
182 */
|
|
183 #define SQD_REGMAGIC 0234
|
|
184
|
|
185 /*
|
|
186 * The "internal use only" fields in regexp.h are present to pass info from
|
|
187 * compile to execute that permits the execute phase to run lots faster on
|
|
188 * simple cases. They are:
|
|
189 *
|
|
190 * regstart char that must begin a match; '\0' if none obvious
|
|
191 * reganch is the match anchored (at beginning-of-line only)?
|
|
192 * regmust string (pointer into program) that match must include, or NULL
|
|
193 * regmlen length of regmust string
|
|
194 *
|
|
195 * Regstart and reganch permit very fast decisions on suitable starting points
|
|
196 * for a match, cutting down the work a lot. Regmust permits fast rejection
|
|
197 * of lines that cannot possibly match. The regmust tests are costly enough
|
|
198 * that sqd_regcomp() supplies a regmust only if the r.e. contains something
|
|
199 * potentially expensive (at present, the only such thing detected is * or +
|
|
200 * at the start of the r.e., which can involve a lot of backup). Regmlen is
|
|
201 * supplied because the test in sqd_regexec() needs it and sqd_regcomp() is computing
|
|
202 * it anyway.
|
|
203 */
|
|
204
|
|
205 /*
|
|
206 * Structure for regexp "program". This is essentially a linear encoding
|
|
207 * of a nondeterministic finite-state machine (aka syntax charts or
|
|
208 * "railroad normal form" in parsing technology). Each node is an opcode
|
|
209 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
|
|
210 * all nodes except BRANCH implement concatenation; a "next" pointer with
|
|
211 * a BRANCH on both ends of it is connecting two alternatives. (Here we
|
|
212 * have one of the subtle syntax dependencies: an individual BRANCH (as
|
|
213 * opposed to a collection of them) is never concatenated with anything
|
|
214 * because of operator precedence.) The operand of some types of node is
|
|
215 * a literal string; for others, it is a node leading into a sub-FSM. In
|
|
216 * particular, the operand of a BRANCH node is the first node of the branch.
|
|
217 * (NB this is *not* a tree structure: the tail of the branch connects
|
|
218 * to the thing following the set of BRANCHes.) The opcodes are:
|
|
219 */
|
|
220
|
|
221 /* definition number opnd? meaning */
|
|
222 #define END 0 /* no End of program. */
|
|
223 #define BOL 1 /* no Match beginning of line. */
|
|
224 #define EOL 2 /* no Match end of line. */
|
|
225 #define ANY 3 /* no Match any character. */
|
|
226 #define ANYOF 4 /* str Match any of these. */
|
|
227 #define ANYBUT 5 /* str Match any but one of these. */
|
|
228 #define BRANCH 6 /* node Match this, or the next..\&. */
|
|
229 #define BACK 7 /* no "next" ptr points backward. */
|
|
230 #define EXACTLY 8 /* str Match this string. */
|
|
231 #define NOTHING 9 /* no Match empty string. */
|
|
232 #define STAR 10 /* node Match this 0 or more times. */
|
|
233 #define PLUS 11 /* node Match this 1 or more times. */
|
|
234 #define OPEN 20 /* no Sub-RE starts here. */
|
|
235 /* OPEN+1 is number 1, etc. */
|
|
236 #define CLOSE 30 /* no Analogous to OPEN. */
|
|
237
|
|
238 /*
|
|
239 * Opcode notes:
|
|
240 *
|
|
241 * BRANCH The set of branches constituting a single choice are hooked
|
|
242 * together with their "next" pointers, since precedence prevents
|
|
243 * anything being concatenated to any individual branch. The
|
|
244 * "next" pointer of the last BRANCH in a choice points to the
|
|
245 * thing following the whole choice. This is also where the
|
|
246 * final "next" pointer of each individual branch points; each
|
|
247 * branch starts with the operand node of a BRANCH node.
|
|
248 *
|
|
249 * BACK Normal "next" pointers all implicitly point forward; BACK
|
|
250 * exists to make loop structures possible.
|
|
251 *
|
|
252 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
|
|
253 * BRANCH structures using BACK. Simple cases (one character
|
|
254 * per match) are implemented with STAR and PLUS for speed
|
|
255 * and to minimize recursive plunges.
|
|
256 *
|
|
257 * OPEN,CLOSE ...are numbered at compile time.
|
|
258 */
|
|
259
|
|
260 /*
|
|
261 * A node is one char of opcode followed by two chars of "next" pointer.
|
|
262 * "Next" pointers are stored as two 8-bit pieces, high order first. The
|
|
263 * value is a positive offset from the opcode of the node containing it.
|
|
264 * An operand, if any, simply follows the node. (Note that much of the
|
|
265 * code generation knows about this implicit relationship.)
|
|
266 *
|
|
267 * Using two bytes for the "next" pointer is vast overkill for most things,
|
|
268 * but allows patterns to get big without disasters.
|
|
269 */
|
|
270 #define OP(p) (*(p))
|
|
271 #define NEXT(p) (((*((p)+1)&0177)<<8) + (*((p)+2)&0377))
|
|
272 #define OPERAND(p) ((p) + 3)
|
|
273
|
|
274 /*
|
|
275 * Utility definitions.
|
|
276 */
|
|
277 #define FAIL(m) { sqd_regerror(m); return(NULL); }
|
|
278 #define ISREPN(c) ((c) == '*' || (c) == '+' || (c) == '?')
|
|
279 #define META "^$.[()|?+*\\"
|
|
280
|
|
281 /*
|
|
282 * Flags to be passed up and down.
|
|
283 */
|
|
284 #define HASWIDTH 01 /* Known never to match null string. */
|
|
285 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
|
|
286 #define SPSTART 04 /* Starts with * or +. */
|
|
287 #define WORST 0 /* Worst case. */
|
|
288
|
|
289 /*
|
|
290 * Work-variable struct for sqd_regcomp().
|
|
291 */
|
|
292 struct comp {
|
|
293 char *regparse; /* Input-scan pointer. */
|
|
294 int regnpar; /* () count. */
|
|
295 char *regcode; /* Code-emit pointer; ®dummy = don't. */
|
|
296 char regdummy[3]; /* NOTHING, 0 next ptr */
|
|
297 long regsize; /* Code size. */
|
|
298 };
|
|
299 #define EMITTING(cp) ((cp)->regcode != (cp)->regdummy)
|
|
300
|
|
301 /*
|
|
302 * Forward declarations for sqd_regcomp()'s friends.
|
|
303 */
|
|
304 static char *reg(struct comp *cp, int paren, int *flagp);
|
|
305 static char *regbranch(struct comp *cp, int *flagp);
|
|
306 static char *regpiece(struct comp *cp, int *flagp);
|
|
307 static char *regatom(struct comp *cp, int *flagp);
|
|
308 static char *regnode(struct comp *cp, int op);
|
|
309 static char *regnext(char *node);
|
|
310 static void regc(struct comp *cp, int c);
|
|
311 static void reginsert(struct comp *cp, int op, char *opnd);
|
|
312 static void regtail(struct comp *cp, char *p, char *val);
|
|
313 static void regoptail(struct comp *cp, char *p, char *val);
|
|
314
|
|
315 /*
|
|
316 - sqd_regcomp - compile a regular expression into internal code
|
|
317 *
|
|
318 * We can't allocate space until we know how big the compiled form will be,
|
|
319 * but we can't compile it (and thus know how big it is) until we've got a
|
|
320 * place to put the code. So we cheat: we compile it twice, once with code
|
|
321 * generation turned off and size counting turned on, and once "for real".
|
|
322 * This also means that we don't allocate space until we are sure that the
|
|
323 * thing really will compile successfully, and we never have to move the
|
|
324 * code and thus invalidate pointers into it. (Note that it has to be in
|
|
325 * one piece because free() must be able to free it all.)
|
|
326 *
|
|
327 * Beware that the optimization-preparation code in here knows about some
|
|
328 * of the structure of the compiled regexp.
|
|
329 */
|
|
330 sqd_regexp *
|
|
331 sqd_regcomp(exp)
|
|
332 const char *exp;
|
|
333 {
|
|
334 register sqd_regexp *r;
|
|
335 register char *scan;
|
|
336 int flags;
|
|
337 struct comp co;
|
|
338
|
|
339 if (exp == NULL)
|
|
340 FAIL("NULL argument to sqd_regcomp");
|
|
341
|
|
342 /* First pass: determine size, legality. */
|
|
343 co.regparse = (char *)exp;
|
|
344 co.regnpar = 1;
|
|
345 co.regsize = 0L;
|
|
346 co.regdummy[0] = NOTHING;
|
|
347 co.regdummy[1] = co.regdummy[2] = 0;
|
|
348 co.regcode = co.regdummy;
|
|
349 regc(&co, SQD_REGMAGIC);
|
|
350 if (reg(&co, 0, &flags) == NULL)
|
|
351 return(NULL);
|
|
352
|
|
353 /* Small enough for pointer-storage convention? */
|
|
354 if (co.regsize >= 0x7fffL) /* Probably could be 0xffffL. */
|
|
355 FAIL("regexp too big");
|
|
356
|
|
357 /* Allocate space. */
|
|
358 r = (sqd_regexp *)malloc(sizeof(sqd_regexp) + (size_t)co.regsize);
|
|
359 if (r == NULL)
|
|
360 FAIL("out of space");
|
|
361
|
|
362 /* Second pass: emit code. */
|
|
363 co.regparse = (char *)exp;
|
|
364 co.regnpar = 1;
|
|
365 co.regcode = r->program;
|
|
366 regc(&co, SQD_REGMAGIC);
|
|
367 if (reg(&co, 0, &flags) == NULL)
|
|
368 return(NULL);
|
|
369
|
|
370 /* Dig out information for optimizations. */
|
|
371 r->regstart = '\0'; /* Worst-case defaults. */
|
|
372 r->reganch = 0;
|
|
373 r->regmust = NULL;
|
|
374 r->regmlen = 0;
|
|
375 scan = r->program+1; /* First BRANCH. */
|
|
376 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
|
|
377 scan = OPERAND(scan);
|
|
378
|
|
379 /* Starting-point info. */
|
|
380 if (OP(scan) == EXACTLY)
|
|
381 r->regstart = *OPERAND(scan);
|
|
382 else if (OP(scan) == BOL)
|
|
383 r->reganch = 1;
|
|
384
|
|
385 /*
|
|
386 * If there's something expensive in the r.e., find the
|
|
387 * longest literal string that must appear and make it the
|
|
388 * regmust. Resolve ties in favor of later strings, since
|
|
389 * the regstart check works with the beginning of the r.e.
|
|
390 * and avoiding duplication strengthens checking. Not a
|
|
391 * strong reason, but sufficient in the absence of others.
|
|
392 */
|
|
393 if (flags&SPSTART) {
|
|
394 register char *longest = NULL;
|
|
395 register size_t len = 0;
|
|
396
|
|
397 for (; scan != NULL; scan = regnext(scan))
|
|
398 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
|
|
399 longest = OPERAND(scan);
|
|
400 len = strlen(OPERAND(scan));
|
|
401 }
|
|
402 r->regmust = longest;
|
|
403 r->regmlen = (int)len;
|
|
404 }
|
|
405 }
|
|
406
|
|
407 return(r);
|
|
408 }
|
|
409
|
|
410 /*
|
|
411 - reg - regular expression, i.e. main body or parenthesized thing
|
|
412 *
|
|
413 * Caller must absorb opening parenthesis.
|
|
414 *
|
|
415 * Combining parenthesis handling with the base level of regular expression
|
|
416 * is a trifle forced, but the need to tie the tails of the branches to what
|
|
417 * follows makes it hard to avoid.
|
|
418 */
|
|
419 static char *
|
|
420 reg(cp, paren, flagp)
|
|
421 register struct comp *cp;
|
|
422 int paren; /* Parenthesized? */
|
|
423 int *flagp;
|
|
424 {
|
|
425 register char *ret = NULL; /* SRE: NULL init added to silence gcc */
|
|
426 register char *br;
|
|
427 register char *ender;
|
|
428 register int parno = 0; /* SRE: init added to silence gcc */
|
|
429 int flags;
|
|
430
|
|
431 *flagp = HASWIDTH; /* Tentatively. */
|
|
432
|
|
433 if (paren) {
|
|
434 /* Make an OPEN node. */
|
|
435 if (cp->regnpar >= NSUBEXP)
|
|
436 FAIL("too many ()");
|
|
437 parno = cp->regnpar;
|
|
438 cp->regnpar++;
|
|
439 ret = regnode(cp, OPEN+parno);
|
|
440 }
|
|
441
|
|
442 /* Pick up the branches, linking them together. */
|
|
443 br = regbranch(cp, &flags);
|
|
444 if (br == NULL)
|
|
445 return(NULL);
|
|
446 if (paren)
|
|
447 regtail(cp, ret, br); /* OPEN -> first. */
|
|
448 else
|
|
449 ret = br;
|
|
450 *flagp &= ~(~flags&HASWIDTH); /* Clear bit if bit 0. */
|
|
451 *flagp |= flags&SPSTART;
|
|
452 while (*cp->regparse == '|') {
|
|
453 cp->regparse++;
|
|
454 br = regbranch(cp, &flags);
|
|
455 if (br == NULL)
|
|
456 return(NULL);
|
|
457 regtail(cp, ret, br); /* BRANCH -> BRANCH. */
|
|
458 *flagp &= ~(~flags&HASWIDTH);
|
|
459 *flagp |= flags&SPSTART;
|
|
460 }
|
|
461
|
|
462 /* Make a closing node, and hook it on the end. */
|
|
463 ender = regnode(cp, (paren) ? CLOSE+parno : END);
|
|
464 regtail(cp, ret, ender);
|
|
465
|
|
466 /* Hook the tails of the branches to the closing node. */
|
|
467 for (br = ret; br != NULL; br = regnext(br))
|
|
468 regoptail(cp, br, ender);
|
|
469
|
|
470 /* Check for proper termination. */
|
|
471 if (paren && *cp->regparse++ != ')') {
|
|
472 FAIL("unterminated ()");
|
|
473 } else if (!paren && *cp->regparse != '\0') {
|
|
474 if (*cp->regparse == ')') {
|
|
475 FAIL("unmatched ()");
|
|
476 } else
|
|
477 FAIL("internal error: junk on end");
|
|
478 /* NOTREACHED */
|
|
479 }
|
|
480
|
|
481 return(ret);
|
|
482 }
|
|
483
|
|
484 /*
|
|
485 - regbranch - one alternative of an | operator
|
|
486 *
|
|
487 * Implements the concatenation operator.
|
|
488 */
|
|
489 static char *
|
|
490 regbranch(cp, flagp)
|
|
491 register struct comp *cp;
|
|
492 int *flagp;
|
|
493 {
|
|
494 register char *ret;
|
|
495 register char *chain;
|
|
496 register char *latest;
|
|
497 int flags;
|
|
498 register int c;
|
|
499
|
|
500 *flagp = WORST; /* Tentatively. */
|
|
501
|
|
502 ret = regnode(cp, BRANCH);
|
|
503 chain = NULL;
|
|
504 while ((c = *cp->regparse) != '\0' && c != '|' && c != ')') {
|
|
505 latest = regpiece(cp, &flags);
|
|
506 if (latest == NULL)
|
|
507 return(NULL);
|
|
508 *flagp |= flags&HASWIDTH;
|
|
509 if (chain == NULL) /* First piece. */
|
|
510 *flagp |= flags&SPSTART;
|
|
511 else
|
|
512 regtail(cp, chain, latest);
|
|
513 chain = latest;
|
|
514 }
|
|
515 if (chain == NULL) /* Loop ran zero times. */
|
|
516 (void) regnode(cp, NOTHING);
|
|
517
|
|
518 return(ret);
|
|
519 }
|
|
520
|
|
521 /*
|
|
522 - regpiece - something followed by possible [*+?]
|
|
523 *
|
|
524 * Note that the branching code sequences used for ? and the general cases
|
|
525 * of * and + are somewhat optimized: they use the same NOTHING node as
|
|
526 * both the endmarker for their branch list and the body of the last branch.
|
|
527 * It might seem that this node could be dispensed with entirely, but the
|
|
528 * endmarker role is not redundant.
|
|
529 */
|
|
530 static char *
|
|
531 regpiece(cp, flagp)
|
|
532 register struct comp *cp;
|
|
533 int *flagp;
|
|
534 {
|
|
535 register char *ret;
|
|
536 register char op;
|
|
537 register char *next;
|
|
538 int flags;
|
|
539
|
|
540 ret = regatom(cp, &flags);
|
|
541 if (ret == NULL)
|
|
542 return(NULL);
|
|
543
|
|
544 op = *cp->regparse;
|
|
545 if (!ISREPN(op)) {
|
|
546 *flagp = flags;
|
|
547 return(ret);
|
|
548 }
|
|
549
|
|
550 if (!(flags&HASWIDTH) && op != '?')
|
|
551 FAIL("*+ operand could be empty");
|
|
552 switch (op) {
|
|
553 case '*': *flagp = WORST|SPSTART; break;
|
|
554 case '+': *flagp = WORST|SPSTART|HASWIDTH; break;
|
|
555 case '?': *flagp = WORST; break;
|
|
556 }
|
|
557
|
|
558 if (op == '*' && (flags&SIMPLE))
|
|
559 reginsert(cp, STAR, ret);
|
|
560 else if (op == '*') {
|
|
561 /* Emit x* as (x&|), where & means "self". */
|
|
562 reginsert(cp, BRANCH, ret); /* Either x */
|
|
563 regoptail(cp, ret, regnode(cp, BACK)); /* and loop */
|
|
564 regoptail(cp, ret, ret); /* back */
|
|
565 regtail(cp, ret, regnode(cp, BRANCH)); /* or */
|
|
566 regtail(cp, ret, regnode(cp, NOTHING)); /* null. */
|
|
567 } else if (op == '+' && (flags&SIMPLE))
|
|
568 reginsert(cp, PLUS, ret);
|
|
569 else if (op == '+') {
|
|
570 /* Emit x+ as x(&|), where & means "self". */
|
|
571 next = regnode(cp, BRANCH); /* Either */
|
|
572 regtail(cp, ret, next);
|
|
573 regtail(cp, regnode(cp, BACK), ret); /* loop back */
|
|
574 regtail(cp, next, regnode(cp, BRANCH)); /* or */
|
|
575 regtail(cp, ret, regnode(cp, NOTHING)); /* null. */
|
|
576 } else if (op == '?') {
|
|
577 /* Emit x? as (x|) */
|
|
578 reginsert(cp, BRANCH, ret); /* Either x */
|
|
579 regtail(cp, ret, regnode(cp, BRANCH)); /* or */
|
|
580 next = regnode(cp, NOTHING); /* null. */
|
|
581 regtail(cp, ret, next);
|
|
582 regoptail(cp, ret, next);
|
|
583 }
|
|
584 cp->regparse++;
|
|
585 if (ISREPN(*cp->regparse))
|
|
586 FAIL("nested *?+");
|
|
587
|
|
588 return(ret);
|
|
589 }
|
|
590
|
|
591 /*
|
|
592 - regatom - the lowest level
|
|
593 *
|
|
594 * Optimization: gobbles an entire sequence of ordinary characters so that
|
|
595 * it can turn them into a single node, which is smaller to store and
|
|
596 * faster to run. Backslashed characters are exceptions, each becoming a
|
|
597 * separate node; the code is simpler that way and it's not worth fixing.
|
|
598 */
|
|
599 static char *
|
|
600 regatom(cp, flagp)
|
|
601 register struct comp *cp;
|
|
602 int *flagp;
|
|
603 {
|
|
604 register char *ret;
|
|
605 int flags;
|
|
606
|
|
607 *flagp = WORST; /* Tentatively. */
|
|
608
|
|
609 switch (*cp->regparse++) {
|
|
610 case '^':
|
|
611 ret = regnode(cp, BOL);
|
|
612 break;
|
|
613 case '$':
|
|
614 ret = regnode(cp, EOL);
|
|
615 break;
|
|
616 case '.':
|
|
617 ret = regnode(cp, ANY);
|
|
618 *flagp |= HASWIDTH|SIMPLE;
|
|
619 break;
|
|
620 case '[': {
|
|
621 register int range;
|
|
622 register int rangeend;
|
|
623 register int c;
|
|
624
|
|
625 if (*cp->regparse == '^') { /* Complement of range. */
|
|
626 ret = regnode(cp, ANYBUT);
|
|
627 cp->regparse++;
|
|
628 } else
|
|
629 ret = regnode(cp, ANYOF);
|
|
630 if ((c = *cp->regparse) == ']' || c == '-') {
|
|
631 regc(cp, c);
|
|
632 cp->regparse++;
|
|
633 }
|
|
634 while ((c = *cp->regparse++) != '\0' && c != ']') {
|
|
635 if (c != '-')
|
|
636 regc(cp, c);
|
|
637 else if ((c = *cp->regparse) == ']' || c == '\0')
|
|
638 regc(cp, '-');
|
|
639 else {
|
|
640 range = (unsigned char)*(cp->regparse-2);
|
|
641 rangeend = (unsigned char)c;
|
|
642 if (range > rangeend)
|
|
643 FAIL("invalid [] range");
|
|
644 for (range++; range <= rangeend; range++)
|
|
645 regc(cp, range);
|
|
646 cp->regparse++;
|
|
647 }
|
|
648 }
|
|
649 regc(cp, '\0');
|
|
650 if (c != ']')
|
|
651 FAIL("unmatched []");
|
|
652 *flagp |= HASWIDTH|SIMPLE;
|
|
653 break;
|
|
654 }
|
|
655 case '(':
|
|
656 ret = reg(cp, 1, &flags);
|
|
657 if (ret == NULL)
|
|
658 return(NULL);
|
|
659 *flagp |= flags&(HASWIDTH|SPSTART);
|
|
660 break;
|
|
661 case '\0':
|
|
662 case '|':
|
|
663 case ')':
|
|
664 /* supposed to be caught earlier */
|
|
665 FAIL("internal error: \\0|) unexpected");
|
|
666 break;
|
|
667 case '?':
|
|
668 case '+':
|
|
669 case '*':
|
|
670 FAIL("?+* follows nothing");
|
|
671 break;
|
|
672 case '\\':
|
|
673 if (*cp->regparse == '\0')
|
|
674 FAIL("trailing \\");
|
|
675 ret = regnode(cp, EXACTLY);
|
|
676 regc(cp, *cp->regparse++);
|
|
677 regc(cp, '\0');
|
|
678 *flagp |= HASWIDTH|SIMPLE;
|
|
679 break;
|
|
680 default: {
|
|
681 register size_t len;
|
|
682 register char ender;
|
|
683
|
|
684 cp->regparse--;
|
|
685 len = strcspn(cp->regparse, META);
|
|
686 if (len == 0)
|
|
687 FAIL("internal error: strcspn 0");
|
|
688 ender = *(cp->regparse+len);
|
|
689 if (len > 1 && ISREPN(ender))
|
|
690 len--; /* Back off clear of ?+* operand. */
|
|
691 *flagp |= HASWIDTH;
|
|
692 if (len == 1)
|
|
693 *flagp |= SIMPLE;
|
|
694 ret = regnode(cp, EXACTLY);
|
|
695 for (; len > 0; len--)
|
|
696 regc(cp, *cp->regparse++);
|
|
697 regc(cp, '\0');
|
|
698 break;
|
|
699 }
|
|
700 }
|
|
701
|
|
702 return(ret);
|
|
703 }
|
|
704
|
|
705 /*
|
|
706 - regnode - emit a node
|
|
707 */
|
|
708 static char * /* Location. */
|
|
709 regnode(cp, op)
|
|
710 register struct comp *cp;
|
|
711 char op;
|
|
712 {
|
|
713 register char *const ret = cp->regcode;
|
|
714 register char *ptr;
|
|
715
|
|
716 if (!EMITTING(cp)) {
|
|
717 cp->regsize += 3;
|
|
718 return(ret);
|
|
719 }
|
|
720
|
|
721 ptr = ret;
|
|
722 *ptr++ = op;
|
|
723 *ptr++ = '\0'; /* Null next pointer. */
|
|
724 *ptr++ = '\0';
|
|
725 cp->regcode = ptr;
|
|
726
|
|
727 return(ret);
|
|
728 }
|
|
729
|
|
730 /*
|
|
731 - regc - emit (if appropriate) a byte of code
|
|
732 */
|
|
733 static void
|
|
734 regc(cp, b)
|
|
735 register struct comp *cp;
|
|
736 char b;
|
|
737 {
|
|
738 if (EMITTING(cp))
|
|
739 *cp->regcode++ = b;
|
|
740 else
|
|
741 cp->regsize++;
|
|
742 }
|
|
743
|
|
744 /*
|
|
745 - reginsert - insert an operator in front of already-emitted operand
|
|
746 *
|
|
747 * Means relocating the operand.
|
|
748 */
|
|
749 static void
|
|
750 reginsert(cp, op, opnd)
|
|
751 register struct comp *cp;
|
|
752 char op;
|
|
753 char *opnd;
|
|
754 {
|
|
755 register char *place;
|
|
756
|
|
757 if (!EMITTING(cp)) {
|
|
758 cp->regsize += 3;
|
|
759 return;
|
|
760 }
|
|
761
|
|
762 (void) memmove(opnd+3, opnd, (size_t)(cp->regcode - opnd));
|
|
763 cp->regcode += 3;
|
|
764
|
|
765 place = opnd; /* Op node, where operand used to be. */
|
|
766 *place++ = op;
|
|
767 *place++ = '\0';
|
|
768 *place++ = '\0';
|
|
769 }
|
|
770
|
|
771 /*
|
|
772 - regtail - set the next-pointer at the end of a node chain
|
|
773 */
|
|
774 static void
|
|
775 regtail(cp, p, val)
|
|
776 register struct comp *cp;
|
|
777 char *p;
|
|
778 char *val;
|
|
779 {
|
|
780 register char *scan;
|
|
781 register char *temp;
|
|
782 register int offset;
|
|
783
|
|
784 if (!EMITTING(cp))
|
|
785 return;
|
|
786
|
|
787 /* Find last node. */
|
|
788 for (scan = p; (temp = regnext(scan)) != NULL; scan = temp)
|
|
789 continue;
|
|
790
|
|
791 offset = (OP(scan) == BACK) ? scan - val : val - scan;
|
|
792 *(scan+1) = (offset>>8)&0177;
|
|
793 *(scan+2) = offset&0377;
|
|
794 }
|
|
795
|
|
796 /*
|
|
797 - regoptail - regtail on operand of first argument; nop if operandless
|
|
798 */
|
|
799 static void
|
|
800 regoptail(cp, p, val)
|
|
801 register struct comp *cp;
|
|
802 char *p;
|
|
803 char *val;
|
|
804 {
|
|
805 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
|
|
806 if (!EMITTING(cp) || OP(p) != BRANCH)
|
|
807 return;
|
|
808 regtail(cp, OPERAND(p), val);
|
|
809 }
|
|
810
|
|
811 /*
|
|
812 * sqd_regexec and friends
|
|
813 */
|
|
814
|
|
815 /*
|
|
816 * Work-variable struct for sqd_regexec().
|
|
817 */
|
|
818 struct exec {
|
|
819 char *reginput; /* String-input pointer. */
|
|
820 char *regbol; /* Beginning of input, for ^ check. */
|
|
821 char **regstartp; /* Pointer to startp array. */
|
|
822 char **regendp; /* Ditto for endp. */
|
|
823 };
|
|
824
|
|
825 /*
|
|
826 * Forwards.
|
|
827 */
|
|
828 static int regtry(struct exec *ep, sqd_regexp *rp, char *string);
|
|
829 static int regmatch(struct exec *ep, char *prog);
|
|
830 static size_t regrepeat(struct exec *ep, char *node);
|
|
831
|
|
832 #ifdef DEBUG
|
|
833 int regnarrate = 0;
|
|
834 void regdump();
|
|
835 static char *regprop();
|
|
836 #endif
|
|
837
|
|
838 /*
|
|
839 - sqd_regexec - match a regexp against a string
|
|
840 */
|
|
841 int
|
|
842 sqd_regexec(prog, str)
|
|
843 register sqd_regexp *prog;
|
|
844 const char *str;
|
|
845 {
|
|
846 register char *string = (char *)str; /* avert const poisoning */
|
|
847 register char *s;
|
|
848 struct exec ex;
|
|
849
|
|
850 /* Be paranoid. */
|
|
851 if (prog == NULL || string == NULL) {
|
|
852 sqd_regerror("NULL argument to sqd_regexec");
|
|
853 return(0);
|
|
854 }
|
|
855
|
|
856 /* Check validity of program. */
|
|
857 if ((unsigned char)*prog->program != SQD_REGMAGIC) {
|
|
858 sqd_regerror("corrupted regexp");
|
|
859 return(0);
|
|
860 }
|
|
861
|
|
862 /* If there is a "must appear" string, look for it. */
|
|
863 if (prog->regmust != NULL && strstr(string, prog->regmust) == NULL)
|
|
864 return(0);
|
|
865
|
|
866 /* Mark beginning of line for ^ . */
|
|
867 ex.regbol = string;
|
|
868 ex.regstartp = prog->startp;
|
|
869 ex.regendp = prog->endp;
|
|
870
|
|
871 /* Simplest case: anchored match need be tried only once. */
|
|
872 if (prog->reganch)
|
|
873 return(regtry(&ex, prog, string));
|
|
874
|
|
875 /* Messy cases: unanchored match. */
|
|
876 if (prog->regstart != '\0') {
|
|
877 /* We know what char it must start with. */
|
|
878 for (s = string; s != NULL; s = strchr(s+1, prog->regstart))
|
|
879 if (regtry(&ex, prog, s))
|
|
880 return(1);
|
|
881 return(0);
|
|
882 } else {
|
|
883 /* We don't -- general case. */
|
|
884 for (s = string; !regtry(&ex, prog, s); s++)
|
|
885 if (*s == '\0')
|
|
886 return(0);
|
|
887 return(1);
|
|
888 }
|
|
889 /* NOTREACHED */
|
|
890 }
|
|
891
|
|
892 /*
|
|
893 - regtry - try match at specific point
|
|
894 */
|
|
895 static int /* 0 failure, 1 success */
|
|
896 regtry(ep, prog, string)
|
|
897 register struct exec *ep;
|
|
898 sqd_regexp *prog;
|
|
899 char *string;
|
|
900 {
|
|
901 register int i;
|
|
902 register char **stp;
|
|
903 register char **enp;
|
|
904
|
|
905 ep->reginput = string;
|
|
906
|
|
907 stp = prog->startp;
|
|
908 enp = prog->endp;
|
|
909 for (i = NSUBEXP; i > 0; i--) {
|
|
910 *stp++ = NULL;
|
|
911 *enp++ = NULL;
|
|
912 }
|
|
913 if (regmatch(ep, prog->program + 1)) {
|
|
914 prog->startp[0] = string;
|
|
915 prog->endp[0] = ep->reginput;
|
|
916 return(1);
|
|
917 } else
|
|
918 return(0);
|
|
919 }
|
|
920
|
|
921 /*
|
|
922 - regmatch - main matching routine
|
|
923 *
|
|
924 * Conceptually the strategy is simple: check to see whether the current
|
|
925 * node matches, call self recursively to see whether the rest matches,
|
|
926 * and then act accordingly. In practice we make some effort to avoid
|
|
927 * recursion, in particular by going through "ordinary" nodes (that don't
|
|
928 * need to know whether the rest of the match failed) by a loop instead of
|
|
929 * by recursion.
|
|
930 */
|
|
931 static int /* 0 failure, 1 success */
|
|
932 regmatch(ep, prog)
|
|
933 register struct exec *ep;
|
|
934 char *prog;
|
|
935 {
|
|
936 register char *scan; /* Current node. */
|
|
937 char *next; /* Next node. */
|
|
938
|
|
939 #ifdef DEBUG
|
|
940 if (prog != NULL && regnarrate)
|
|
941 fprintf(stderr, "%s(\n", regprop(prog));
|
|
942 #endif
|
|
943 for (scan = prog; scan != NULL; scan = next) {
|
|
944 #ifdef DEBUG
|
|
945 if (regnarrate)
|
|
946 fprintf(stderr, "%s...\n", regprop(scan));
|
|
947 #endif
|
|
948 next = regnext(scan);
|
|
949
|
|
950 switch (OP(scan)) {
|
|
951 case BOL:
|
|
952 if (ep->reginput != ep->regbol)
|
|
953 return(0);
|
|
954 break;
|
|
955 case EOL:
|
|
956 if (*ep->reginput != '\0')
|
|
957 return(0);
|
|
958 break;
|
|
959 case ANY:
|
|
960 if (*ep->reginput == '\0')
|
|
961 return(0);
|
|
962 ep->reginput++;
|
|
963 break;
|
|
964 case EXACTLY: {
|
|
965 register size_t len;
|
|
966 register char *const opnd = OPERAND(scan);
|
|
967
|
|
968 /* Inline the first character, for speed. */
|
|
969 if (*opnd != *ep->reginput)
|
|
970 return(0);
|
|
971 len = strlen(opnd);
|
|
972 if (len > 1 && strncmp(opnd, ep->reginput, len) != 0)
|
|
973 return(0);
|
|
974 ep->reginput += len;
|
|
975 break;
|
|
976 }
|
|
977 case ANYOF:
|
|
978 if (*ep->reginput == '\0' ||
|
|
979 strchr(OPERAND(scan), *ep->reginput) == NULL)
|
|
980 return(0);
|
|
981 ep->reginput++;
|
|
982 break;
|
|
983 case ANYBUT:
|
|
984 if (*ep->reginput == '\0' ||
|
|
985 strchr(OPERAND(scan), *ep->reginput) != NULL)
|
|
986 return(0);
|
|
987 ep->reginput++;
|
|
988 break;
|
|
989 case NOTHING:
|
|
990 break;
|
|
991 case BACK:
|
|
992 break;
|
|
993 case OPEN+1: case OPEN+2: case OPEN+3:
|
|
994 case OPEN+4: case OPEN+5: case OPEN+6:
|
|
995 case OPEN+7: case OPEN+8: case OPEN+9: {
|
|
996 register const int no = OP(scan) - OPEN;
|
|
997 register char *const input = ep->reginput;
|
|
998
|
|
999 if (regmatch(ep, next)) {
|
|
1000 /*
|
|
1001 * Don't set startp if some later
|
|
1002 * invocation of the same parentheses
|
|
1003 * already has.
|
|
1004 */
|
|
1005 if (ep->regstartp[no] == NULL)
|
|
1006 ep->regstartp[no] = input;
|
|
1007 return(1);
|
|
1008 } else
|
|
1009 return(0);
|
|
1010 break;
|
|
1011 }
|
|
1012 case CLOSE+1: case CLOSE+2: case CLOSE+3:
|
|
1013 case CLOSE+4: case CLOSE+5: case CLOSE+6:
|
|
1014 case CLOSE+7: case CLOSE+8: case CLOSE+9: {
|
|
1015 register const int no = OP(scan) - CLOSE;
|
|
1016 register char *const input = ep->reginput;
|
|
1017
|
|
1018 if (regmatch(ep, next)) {
|
|
1019 /*
|
|
1020 * Don't set endp if some later
|
|
1021 * invocation of the same parentheses
|
|
1022 * already has.
|
|
1023 */
|
|
1024 if (ep->regendp[no] == NULL)
|
|
1025 ep->regendp[no] = input;
|
|
1026 return(1);
|
|
1027 } else
|
|
1028 return(0);
|
|
1029 break;
|
|
1030 }
|
|
1031 case BRANCH: {
|
|
1032 register char *const save = ep->reginput;
|
|
1033
|
|
1034 if (OP(next) != BRANCH) /* No choice. */
|
|
1035 next = OPERAND(scan); /* Avoid recursion. */
|
|
1036 else {
|
|
1037 while (OP(scan) == BRANCH) {
|
|
1038 if (regmatch(ep, OPERAND(scan)))
|
|
1039 return(1);
|
|
1040 ep->reginput = save;
|
|
1041 scan = regnext(scan);
|
|
1042 }
|
|
1043 return(0);
|
|
1044 /* NOTREACHED */
|
|
1045 }
|
|
1046 break;
|
|
1047 }
|
|
1048 case STAR: case PLUS: {
|
|
1049 register const char nextch =
|
|
1050 (OP(next) == EXACTLY) ? *OPERAND(next) : '\0';
|
|
1051 register size_t no;
|
|
1052 register char *const save = ep->reginput;
|
|
1053 register const size_t min = (OP(scan) == STAR) ? 0 : 1;
|
|
1054
|
|
1055 for (no = regrepeat(ep, OPERAND(scan)) + 1; no > min; no--) {
|
|
1056 ep->reginput = save + no - 1;
|
|
1057 /* If it could work, try it. */
|
|
1058 if (nextch == '\0' || *ep->reginput == nextch)
|
|
1059 if (regmatch(ep, next))
|
|
1060 return(1);
|
|
1061 }
|
|
1062 return(0);
|
|
1063 break;
|
|
1064 }
|
|
1065 case END:
|
|
1066 return(1); /* Success! */
|
|
1067 break;
|
|
1068 default:
|
|
1069 sqd_regerror("regexp corruption");
|
|
1070 return(0);
|
|
1071 break;
|
|
1072 }
|
|
1073 }
|
|
1074
|
|
1075 /*
|
|
1076 * We get here only if there's trouble -- normally "case END" is
|
|
1077 * the terminating point.
|
|
1078 */
|
|
1079 sqd_regerror("corrupted pointers");
|
|
1080 return(0);
|
|
1081 }
|
|
1082
|
|
1083 /*
|
|
1084 - regrepeat - report how many times something simple would match
|
|
1085 */
|
|
1086 static size_t
|
|
1087 regrepeat(ep, node)
|
|
1088 register struct exec *ep;
|
|
1089 char *node;
|
|
1090 {
|
|
1091 register size_t count;
|
|
1092 register char *scan;
|
|
1093 register char ch;
|
|
1094
|
|
1095 switch (OP(node)) {
|
|
1096 case ANY:
|
|
1097 return(strlen(ep->reginput));
|
|
1098 break;
|
|
1099 case EXACTLY:
|
|
1100 ch = *OPERAND(node);
|
|
1101 count = 0;
|
|
1102 for (scan = ep->reginput; *scan == ch; scan++)
|
|
1103 count++;
|
|
1104 return(count);
|
|
1105 break;
|
|
1106 case ANYOF:
|
|
1107 return(strspn(ep->reginput, OPERAND(node)));
|
|
1108 break;
|
|
1109 case ANYBUT:
|
|
1110 return(strcspn(ep->reginput, OPERAND(node)));
|
|
1111 break;
|
|
1112 default: /* Oh dear. Called inappropriately. */
|
|
1113 sqd_regerror("internal error: bad call of regrepeat");
|
|
1114 return(0); /* Best compromise. */
|
|
1115 break;
|
|
1116 }
|
|
1117 /* NOTREACHED */
|
|
1118 }
|
|
1119
|
|
1120 /*
|
|
1121 - regnext - dig the "next" pointer out of a node
|
|
1122 */
|
|
1123 static char *
|
|
1124 regnext(p)
|
|
1125 register char *p;
|
|
1126 {
|
|
1127 register const int offset = NEXT(p);
|
|
1128
|
|
1129 if (offset == 0)
|
|
1130 return(NULL);
|
|
1131
|
|
1132 return((OP(p) == BACK) ? p-offset : p+offset);
|
|
1133 }
|
|
1134
|
|
1135 #ifdef DEBUG
|
|
1136
|
|
1137 static char *regprop();
|
|
1138
|
|
1139 /*
|
|
1140 - regdump - dump a regexp onto stdout in vaguely comprehensible form
|
|
1141 */
|
|
1142 void
|
|
1143 regdump(r)
|
|
1144 sqd_regexp *r;
|
|
1145 {
|
|
1146 register char *s;
|
|
1147 register char op = EXACTLY; /* Arbitrary non-END op. */
|
|
1148 register char *next;
|
|
1149
|
|
1150
|
|
1151 s = r->program + 1;
|
|
1152 while (op != END) { /* While that wasn't END last time... */
|
|
1153 op = OP(s);
|
|
1154 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
|
|
1155 next = regnext(s);
|
|
1156 if (next == NULL) /* Next ptr. */
|
|
1157 printf("(0)");
|
|
1158 else
|
|
1159 printf("(%d)", (s-r->program)+(next-s));
|
|
1160 s += 3;
|
|
1161 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
|
|
1162 /* Literal string, where present. */
|
|
1163 while (*s != '\0') {
|
|
1164 putchar(*s);
|
|
1165 s++;
|
|
1166 }
|
|
1167 s++;
|
|
1168 }
|
|
1169 putchar('\n');
|
|
1170 }
|
|
1171
|
|
1172 /* Header fields of interest. */
|
|
1173 if (r->regstart != '\0')
|
|
1174 printf("start `%c' ", r->regstart);
|
|
1175 if (r->reganch)
|
|
1176 printf("anchored ");
|
|
1177 if (r->regmust != NULL)
|
|
1178 printf("must have \"%s\"", r->regmust);
|
|
1179 printf("\n");
|
|
1180 }
|
|
1181
|
|
1182 /*
|
|
1183 - regprop - printable representation of opcode
|
|
1184 */
|
|
1185 static char *
|
|
1186 regprop(op)
|
|
1187 char *op;
|
|
1188 {
|
|
1189 register char *p;
|
|
1190 static char buf[50];
|
|
1191
|
|
1192 (void) strcpy(buf, ":");
|
|
1193
|
|
1194 switch (OP(op)) {
|
|
1195 case BOL:
|
|
1196 p = "BOL";
|
|
1197 break;
|
|
1198 case EOL:
|
|
1199 p = "EOL";
|
|
1200 break;
|
|
1201 case ANY:
|
|
1202 p = "ANY";
|
|
1203 break;
|
|
1204 case ANYOF:
|
|
1205 p = "ANYOF";
|
|
1206 break;
|
|
1207 case ANYBUT:
|
|
1208 p = "ANYBUT";
|
|
1209 break;
|
|
1210 case BRANCH:
|
|
1211 p = "BRANCH";
|
|
1212 break;
|
|
1213 case EXACTLY:
|
|
1214 p = "EXACTLY";
|
|
1215 break;
|
|
1216 case NOTHING:
|
|
1217 p = "NOTHING";
|
|
1218 break;
|
|
1219 case BACK:
|
|
1220 p = "BACK";
|
|
1221 break;
|
|
1222 case END:
|
|
1223 p = "END";
|
|
1224 break;
|
|
1225 case OPEN+1:
|
|
1226 case OPEN+2:
|
|
1227 case OPEN+3:
|
|
1228 case OPEN+4:
|
|
1229 case OPEN+5:
|
|
1230 case OPEN+6:
|
|
1231 case OPEN+7:
|
|
1232 case OPEN+8:
|
|
1233 case OPEN+9:
|
|
1234 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
|
|
1235 p = NULL;
|
|
1236 break;
|
|
1237 case CLOSE+1:
|
|
1238 case CLOSE+2:
|
|
1239 case CLOSE+3:
|
|
1240 case CLOSE+4:
|
|
1241 case CLOSE+5:
|
|
1242 case CLOSE+6:
|
|
1243 case CLOSE+7:
|
|
1244 case CLOSE+8:
|
|
1245 case CLOSE+9:
|
|
1246 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
|
|
1247 p = NULL;
|
|
1248 break;
|
|
1249 case STAR:
|
|
1250 p = "STAR";
|
|
1251 break;
|
|
1252 case PLUS:
|
|
1253 p = "PLUS";
|
|
1254 break;
|
|
1255 default:
|
|
1256 sqd_regerror("corrupted opcode");
|
|
1257 break;
|
|
1258 }
|
|
1259 if (p != NULL)
|
|
1260 (void) strcat(buf, p);
|
|
1261 return(buf);
|
|
1262 }
|
|
1263 #endif
|
|
1264
|
|
1265
|
|
1266 /*
|
|
1267 - sqd_regsub - perform substitutions after a regexp match
|
|
1268 */
|
|
1269 void
|
|
1270 sqd_regsub(rp, source, dest)
|
|
1271 const sqd_regexp *rp;
|
|
1272 const char *source;
|
|
1273 char *dest;
|
|
1274 {
|
|
1275 register sqd_regexp * const prog = (sqd_regexp *)rp;
|
|
1276 register char *src = (char *)source;
|
|
1277 register char *dst = dest;
|
|
1278 register char c;
|
|
1279 register int no;
|
|
1280 register size_t len;
|
|
1281
|
|
1282 if (prog == NULL || source == NULL || dest == NULL) {
|
|
1283 sqd_regerror("NULL parameter to sqd_regsub");
|
|
1284 return;
|
|
1285 }
|
|
1286 if ((unsigned char)*(prog->program) != SQD_REGMAGIC) {
|
|
1287 sqd_regerror("damaged regexp");
|
|
1288 return;
|
|
1289 }
|
|
1290
|
|
1291 while ((c = *src++) != '\0') {
|
|
1292 if (c == '&')
|
|
1293 no = 0;
|
|
1294 else if (c == '\\' && isdigit((int) (*src)))
|
|
1295 no = *src++ - '0';
|
|
1296 else
|
|
1297 no = -1;
|
|
1298
|
|
1299 if (no < 0) { /* Ordinary character. */
|
|
1300 if (c == '\\' && (*src == '\\' || *src == '&'))
|
|
1301 c = *src++;
|
|
1302 *dst++ = c;
|
|
1303 } else if (prog->startp[no] != NULL && prog->endp[no] != NULL &&
|
|
1304 prog->endp[no] > prog->startp[no]) {
|
|
1305 len = prog->endp[no] - prog->startp[no];
|
|
1306 (void) strncpy(dst, prog->startp[no], len);
|
|
1307 dst += len;
|
|
1308 if (*(dst-1) == '\0') { /* strncpy hit NUL. */
|
|
1309 sqd_regerror("damaged match string");
|
|
1310 return;
|
|
1311 }
|
|
1312 }
|
|
1313 }
|
|
1314 *dst++ = '\0';
|
|
1315 }
|
|
1316
|
|
1317
|
|
1318 void
|
|
1319 sqd_regerror(s)
|
|
1320 char *s;
|
|
1321 {
|
|
1322 fprintf(stderr, "regexp(3): %s\n", s);
|
|
1323 exit(EXIT_FAILURE);
|
|
1324 /* NOTREACHED */
|
|
1325 }
|
|
1326
|
|
1327 #ifdef NBA_TEAM_IN_STL
|
|
1328 int
|
|
1329 main(int argc, char **argv)
|
|
1330 {
|
|
1331 char *pat;
|
|
1332 int ntok;
|
|
1333 char *s;
|
|
1334 int status;
|
|
1335
|
|
1336 pat = argv[1];
|
|
1337 ntok = atoi(argv[2]);
|
|
1338 s = argv[3];
|
|
1339
|
|
1340 status = Strparse(pat, s, ntok);
|
|
1341 if (status == 0) {
|
|
1342 printf("no match\n");
|
|
1343 } else {
|
|
1344 int i;
|
|
1345 printf("MATCH.\n");
|
|
1346 for (i = 1; i <= ntok; i++)
|
|
1347 printf("matched token %1d: %s\n", i, sqd_parse[i]);
|
|
1348 }
|
|
1349 }
|
|
1350 #endif /*NBA_TEAM_IN_STL*/
|