comparison clustalomega/clustal-omega-1.0.2/src/squid/hsregex.c @ 1:bc707542e5de

Uploaded
author clustalomega
date Thu, 21 Jul 2011 13:35:08 -0400
parents
children
comparison
equal deleted inserted replaced
0:ff1768533a07 1:bc707542e5de
1 /*****************************************************************
2 * SQUID - a library of functions for biological sequence analysis
3 * Copyright (C) 1992-2002 Washington University School of Medicine
4 *
5 * This source code is freely distributed under the terms of the
6 * GNU General Public License. See the files COPYRIGHT and LICENSE
7 * for details.
8 *****************************************************************/
9
10 /*****************************************************************
11 * This code is an altered version of Henry Spencer's
12 * regex library. Alterations are limited to minor streamlining,
13 * and some name changes to protect the SQUID namespace.
14 * Henry's copyright notice appears below.
15 * You can obtain the original from
16 * ftp://ftp.zoo.toronto.edu/pub/bookregex.tar.Z
17 * Thanks, Henry!
18 *
19 * The magic word for compiling a testdriver: NBA_TEAM_IN_STL
20 * gcc -o test -g -DNBA_TEAM_IN_STL -L. hsregex.c -lsquid -lm
21 *
22 * Usage:
23 * test <pattern> <ntok> <string>
24 *
25 * SRE, Fri Aug 28 11:10:17 1998
26 * CVS $Id: hsregex.c,v 1.7 2001/08/09 17:50:17 eddy Exp)
27 *****************************************************************/
28
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <ctype.h>
33 #include "squid.h"
34
35 /* global sqd_parse[] are managed by Strparse().
36 * WARNING: TODO: this code is not threadsafe, and needs to be revised.
37 */
38 char *sqd_parse[10];
39
40 /* Function: Strparse()
41 *
42 * Purpose: Match a regexp to a string. Returns 1 if pattern matches,
43 * else 0.
44 *
45 * Much like Perl, Strparse() makes copies of the matching
46 * substrings available via globals, sqd_parse[].
47 * sqd_parse[0] contains a copy of the complete matched
48 * text. sqd_parse[1-9] contain copies of up to nine
49 * different substrings matched within parentheses.
50 * The memory for these strings is internally managed and
51 * volatile; the next call to Strparse() may destroy them.
52 * If the caller needs the matched substrings to persist
53 * beyond a new Strparse() call, it must make its own
54 * copies.
55 *
56 * A minor drawback of the memory management is that
57 * there will be a small amount of unfree'd memory being
58 * managed by Strparse() when a program exits; this may
59 * confuse memory debugging (Purify, dbmalloc). The
60 * general cleanup function SqdClean() is provided;
61 * you can call this before exiting.
62 *
63 * Uses an extended POSIX regular expression interface.
64 * A copylefted GNU implementation is included in the squid
65 * implementation (gnuregex.c) for use on non-POSIX compliant
66 * systems. POSIX 1003.2-compliant systems (all UNIX,
67 * some WinNT, I believe) can omit the GNU code if necessary.
68 *
69 * I built this for ease of use, not speed nor efficiency.
70 *
71 * Example: Strparse("foo-...-baz", "foo-bar-baz") returns 0
72 * Strparse("foo-(...)-baz", "foo-bar-baz")
73 * returns 0; sqd_parse[0] is "foo-bar-baz";
74 * sqd_parse[1] is "bar".
75 *
76 * A real example:
77 * s = ">gnl|ti|3 G10P69425RH2.T0 {SUB 81..737} /len=657"
78 * pat = "SUB ([0-9]+)"
79 * Strparse(pat, s, 1)
80 * returns 1; sqd_parse[1] is "81".
81 *
82 * Args: rexp - regular expression, extended POSIX form
83 * s - string to match against
84 * ntok - number of () substrings we will save (maximum NSUBEXP-1)
85 *
86 * Return: 1 on match, 0 if no match
87 */
88 int
89 Strparse(char *rexp, char *s, int ntok)
90 {
91 sqd_regexp *pat;
92 int code;
93 int len;
94 int i;
95 /* sanity check */
96 if (ntok >= NSUBEXP ) Die("Strparse(): ntok must be <= %d", NSUBEXP-1);
97
98 /* Free previous global substring buffers
99 */
100 for (i = 0; i <= ntok; i++)
101 if (sqd_parse[i] != NULL)
102 {
103 free(sqd_parse[i]);
104 sqd_parse[i] = NULL;
105 }
106
107 /* Compile and match the pattern, using our modified
108 * copy of Henry Spencer's regexp library
109 */
110 if ((pat = sqd_regcomp(rexp)) == NULL)
111 Die("regexp compilation failed.");
112 code = sqd_regexec(pat, s);
113
114 /* Fill the global substring buffers
115 */
116 if (code == 1)
117 for (i = 0; i <= ntok; i++)
118 if (pat->startp[i] != NULL && pat->endp[i] != NULL)
119 {
120 len = pat->endp[i] - pat->startp[i];
121 sqd_parse[i] = (char *) MallocOrDie(sizeof(char) * (len+1));
122 strncpy(sqd_parse[i], pat->startp[i], len);
123 sqd_parse[i][len] = '\0';
124 }
125
126 free(pat);
127 return code;
128 }
129
130 /* Function: SqdClean()
131 * Date: SRE, Wed Oct 29 12:52:08 1997 [TWA 721]
132 *
133 * Purpose: Clean up any squid library allocations before exiting
134 * a program, so we don't leave unfree'd memory around
135 * and confuse a malloc debugger like Purify or dbmalloc.
136 */
137 void
138 SqdClean(void)
139 {
140 int i;
141
142 /* Free global substring buffers that Strparse() uses
143 */
144 for (i = 0; i <= 9; i++)
145 if (sqd_parse[i] != NULL) {
146 free(sqd_parse[i]);
147 sqd_parse[i] = NULL;
148 }
149 }
150
151
152
153 /* all code below is:
154 * Copyright (c) 1986, 1993, 1995 by University of Toronto.
155 * Written by Henry Spencer. Not derived from licensed software.
156 *
157 * Permission is granted to anyone to use this software for any
158 * purpose on any computer system, and to redistribute it in any way,
159 * subject to the following restrictions:
160 *
161 * 1. The author is not responsible for the consequences of use of
162 * this software, no matter how awful, even if they arise
163 * from defects in it.
164 *
165 * 2. The origin of this software must not be misrepresented, either
166 * by explicit claim or by omission.
167 *
168 * 3. Altered versions must be plainly marked as such, and must not
169 * be misrepresented (by explicit claim or omission) as being
170 * the original software.
171 *
172 * 4. This notice must not be removed or altered.
173 */
174
175 /*
176 * sqd_regcomp and sqd_regexec -- sqd_regsub and sqd_regerror are elsewhere
177 */
178
179 /*
180 * The first byte of the regexp internal "program" is actually this magic
181 * number; the start node begins in the second byte.
182 */
183 #define SQD_REGMAGIC 0234
184
185 /*
186 * The "internal use only" fields in regexp.h are present to pass info from
187 * compile to execute that permits the execute phase to run lots faster on
188 * simple cases. They are:
189 *
190 * regstart char that must begin a match; '\0' if none obvious
191 * reganch is the match anchored (at beginning-of-line only)?
192 * regmust string (pointer into program) that match must include, or NULL
193 * regmlen length of regmust string
194 *
195 * Regstart and reganch permit very fast decisions on suitable starting points
196 * for a match, cutting down the work a lot. Regmust permits fast rejection
197 * of lines that cannot possibly match. The regmust tests are costly enough
198 * that sqd_regcomp() supplies a regmust only if the r.e. contains something
199 * potentially expensive (at present, the only such thing detected is * or +
200 * at the start of the r.e., which can involve a lot of backup). Regmlen is
201 * supplied because the test in sqd_regexec() needs it and sqd_regcomp() is computing
202 * it anyway.
203 */
204
205 /*
206 * Structure for regexp "program". This is essentially a linear encoding
207 * of a nondeterministic finite-state machine (aka syntax charts or
208 * "railroad normal form" in parsing technology). Each node is an opcode
209 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
210 * all nodes except BRANCH implement concatenation; a "next" pointer with
211 * a BRANCH on both ends of it is connecting two alternatives. (Here we
212 * have one of the subtle syntax dependencies: an individual BRANCH (as
213 * opposed to a collection of them) is never concatenated with anything
214 * because of operator precedence.) The operand of some types of node is
215 * a literal string; for others, it is a node leading into a sub-FSM. In
216 * particular, the operand of a BRANCH node is the first node of the branch.
217 * (NB this is *not* a tree structure: the tail of the branch connects
218 * to the thing following the set of BRANCHes.) The opcodes are:
219 */
220
221 /* definition number opnd? meaning */
222 #define END 0 /* no End of program. */
223 #define BOL 1 /* no Match beginning of line. */
224 #define EOL 2 /* no Match end of line. */
225 #define ANY 3 /* no Match any character. */
226 #define ANYOF 4 /* str Match any of these. */
227 #define ANYBUT 5 /* str Match any but one of these. */
228 #define BRANCH 6 /* node Match this, or the next..\&. */
229 #define BACK 7 /* no "next" ptr points backward. */
230 #define EXACTLY 8 /* str Match this string. */
231 #define NOTHING 9 /* no Match empty string. */
232 #define STAR 10 /* node Match this 0 or more times. */
233 #define PLUS 11 /* node Match this 1 or more times. */
234 #define OPEN 20 /* no Sub-RE starts here. */
235 /* OPEN+1 is number 1, etc. */
236 #define CLOSE 30 /* no Analogous to OPEN. */
237
238 /*
239 * Opcode notes:
240 *
241 * BRANCH The set of branches constituting a single choice are hooked
242 * together with their "next" pointers, since precedence prevents
243 * anything being concatenated to any individual branch. The
244 * "next" pointer of the last BRANCH in a choice points to the
245 * thing following the whole choice. This is also where the
246 * final "next" pointer of each individual branch points; each
247 * branch starts with the operand node of a BRANCH node.
248 *
249 * BACK Normal "next" pointers all implicitly point forward; BACK
250 * exists to make loop structures possible.
251 *
252 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
253 * BRANCH structures using BACK. Simple cases (one character
254 * per match) are implemented with STAR and PLUS for speed
255 * and to minimize recursive plunges.
256 *
257 * OPEN,CLOSE ...are numbered at compile time.
258 */
259
260 /*
261 * A node is one char of opcode followed by two chars of "next" pointer.
262 * "Next" pointers are stored as two 8-bit pieces, high order first. The
263 * value is a positive offset from the opcode of the node containing it.
264 * An operand, if any, simply follows the node. (Note that much of the
265 * code generation knows about this implicit relationship.)
266 *
267 * Using two bytes for the "next" pointer is vast overkill for most things,
268 * but allows patterns to get big without disasters.
269 */
270 #define OP(p) (*(p))
271 #define NEXT(p) (((*((p)+1)&0177)<<8) + (*((p)+2)&0377))
272 #define OPERAND(p) ((p) + 3)
273
274 /*
275 * Utility definitions.
276 */
277 #define FAIL(m) { sqd_regerror(m); return(NULL); }
278 #define ISREPN(c) ((c) == '*' || (c) == '+' || (c) == '?')
279 #define META "^$.[()|?+*\\"
280
281 /*
282 * Flags to be passed up and down.
283 */
284 #define HASWIDTH 01 /* Known never to match null string. */
285 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
286 #define SPSTART 04 /* Starts with * or +. */
287 #define WORST 0 /* Worst case. */
288
289 /*
290 * Work-variable struct for sqd_regcomp().
291 */
292 struct comp {
293 char *regparse; /* Input-scan pointer. */
294 int regnpar; /* () count. */
295 char *regcode; /* Code-emit pointer; &regdummy = don't. */
296 char regdummy[3]; /* NOTHING, 0 next ptr */
297 long regsize; /* Code size. */
298 };
299 #define EMITTING(cp) ((cp)->regcode != (cp)->regdummy)
300
301 /*
302 * Forward declarations for sqd_regcomp()'s friends.
303 */
304 static char *reg(struct comp *cp, int paren, int *flagp);
305 static char *regbranch(struct comp *cp, int *flagp);
306 static char *regpiece(struct comp *cp, int *flagp);
307 static char *regatom(struct comp *cp, int *flagp);
308 static char *regnode(struct comp *cp, int op);
309 static char *regnext(char *node);
310 static void regc(struct comp *cp, int c);
311 static void reginsert(struct comp *cp, int op, char *opnd);
312 static void regtail(struct comp *cp, char *p, char *val);
313 static void regoptail(struct comp *cp, char *p, char *val);
314
315 /*
316 - sqd_regcomp - compile a regular expression into internal code
317 *
318 * We can't allocate space until we know how big the compiled form will be,
319 * but we can't compile it (and thus know how big it is) until we've got a
320 * place to put the code. So we cheat: we compile it twice, once with code
321 * generation turned off and size counting turned on, and once "for real".
322 * This also means that we don't allocate space until we are sure that the
323 * thing really will compile successfully, and we never have to move the
324 * code and thus invalidate pointers into it. (Note that it has to be in
325 * one piece because free() must be able to free it all.)
326 *
327 * Beware that the optimization-preparation code in here knows about some
328 * of the structure of the compiled regexp.
329 */
330 sqd_regexp *
331 sqd_regcomp(exp)
332 const char *exp;
333 {
334 register sqd_regexp *r;
335 register char *scan;
336 int flags;
337 struct comp co;
338
339 if (exp == NULL)
340 FAIL("NULL argument to sqd_regcomp");
341
342 /* First pass: determine size, legality. */
343 co.regparse = (char *)exp;
344 co.regnpar = 1;
345 co.regsize = 0L;
346 co.regdummy[0] = NOTHING;
347 co.regdummy[1] = co.regdummy[2] = 0;
348 co.regcode = co.regdummy;
349 regc(&co, SQD_REGMAGIC);
350 if (reg(&co, 0, &flags) == NULL)
351 return(NULL);
352
353 /* Small enough for pointer-storage convention? */
354 if (co.regsize >= 0x7fffL) /* Probably could be 0xffffL. */
355 FAIL("regexp too big");
356
357 /* Allocate space. */
358 r = (sqd_regexp *)malloc(sizeof(sqd_regexp) + (size_t)co.regsize);
359 if (r == NULL)
360 FAIL("out of space");
361
362 /* Second pass: emit code. */
363 co.regparse = (char *)exp;
364 co.regnpar = 1;
365 co.regcode = r->program;
366 regc(&co, SQD_REGMAGIC);
367 if (reg(&co, 0, &flags) == NULL)
368 return(NULL);
369
370 /* Dig out information for optimizations. */
371 r->regstart = '\0'; /* Worst-case defaults. */
372 r->reganch = 0;
373 r->regmust = NULL;
374 r->regmlen = 0;
375 scan = r->program+1; /* First BRANCH. */
376 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
377 scan = OPERAND(scan);
378
379 /* Starting-point info. */
380 if (OP(scan) == EXACTLY)
381 r->regstart = *OPERAND(scan);
382 else if (OP(scan) == BOL)
383 r->reganch = 1;
384
385 /*
386 * If there's something expensive in the r.e., find the
387 * longest literal string that must appear and make it the
388 * regmust. Resolve ties in favor of later strings, since
389 * the regstart check works with the beginning of the r.e.
390 * and avoiding duplication strengthens checking. Not a
391 * strong reason, but sufficient in the absence of others.
392 */
393 if (flags&SPSTART) {
394 register char *longest = NULL;
395 register size_t len = 0;
396
397 for (; scan != NULL; scan = regnext(scan))
398 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
399 longest = OPERAND(scan);
400 len = strlen(OPERAND(scan));
401 }
402 r->regmust = longest;
403 r->regmlen = (int)len;
404 }
405 }
406
407 return(r);
408 }
409
410 /*
411 - reg - regular expression, i.e. main body or parenthesized thing
412 *
413 * Caller must absorb opening parenthesis.
414 *
415 * Combining parenthesis handling with the base level of regular expression
416 * is a trifle forced, but the need to tie the tails of the branches to what
417 * follows makes it hard to avoid.
418 */
419 static char *
420 reg(cp, paren, flagp)
421 register struct comp *cp;
422 int paren; /* Parenthesized? */
423 int *flagp;
424 {
425 register char *ret = NULL; /* SRE: NULL init added to silence gcc */
426 register char *br;
427 register char *ender;
428 register int parno = 0; /* SRE: init added to silence gcc */
429 int flags;
430
431 *flagp = HASWIDTH; /* Tentatively. */
432
433 if (paren) {
434 /* Make an OPEN node. */
435 if (cp->regnpar >= NSUBEXP)
436 FAIL("too many ()");
437 parno = cp->regnpar;
438 cp->regnpar++;
439 ret = regnode(cp, OPEN+parno);
440 }
441
442 /* Pick up the branches, linking them together. */
443 br = regbranch(cp, &flags);
444 if (br == NULL)
445 return(NULL);
446 if (paren)
447 regtail(cp, ret, br); /* OPEN -> first. */
448 else
449 ret = br;
450 *flagp &= ~(~flags&HASWIDTH); /* Clear bit if bit 0. */
451 *flagp |= flags&SPSTART;
452 while (*cp->regparse == '|') {
453 cp->regparse++;
454 br = regbranch(cp, &flags);
455 if (br == NULL)
456 return(NULL);
457 regtail(cp, ret, br); /* BRANCH -> BRANCH. */
458 *flagp &= ~(~flags&HASWIDTH);
459 *flagp |= flags&SPSTART;
460 }
461
462 /* Make a closing node, and hook it on the end. */
463 ender = regnode(cp, (paren) ? CLOSE+parno : END);
464 regtail(cp, ret, ender);
465
466 /* Hook the tails of the branches to the closing node. */
467 for (br = ret; br != NULL; br = regnext(br))
468 regoptail(cp, br, ender);
469
470 /* Check for proper termination. */
471 if (paren && *cp->regparse++ != ')') {
472 FAIL("unterminated ()");
473 } else if (!paren && *cp->regparse != '\0') {
474 if (*cp->regparse == ')') {
475 FAIL("unmatched ()");
476 } else
477 FAIL("internal error: junk on end");
478 /* NOTREACHED */
479 }
480
481 return(ret);
482 }
483
484 /*
485 - regbranch - one alternative of an | operator
486 *
487 * Implements the concatenation operator.
488 */
489 static char *
490 regbranch(cp, flagp)
491 register struct comp *cp;
492 int *flagp;
493 {
494 register char *ret;
495 register char *chain;
496 register char *latest;
497 int flags;
498 register int c;
499
500 *flagp = WORST; /* Tentatively. */
501
502 ret = regnode(cp, BRANCH);
503 chain = NULL;
504 while ((c = *cp->regparse) != '\0' && c != '|' && c != ')') {
505 latest = regpiece(cp, &flags);
506 if (latest == NULL)
507 return(NULL);
508 *flagp |= flags&HASWIDTH;
509 if (chain == NULL) /* First piece. */
510 *flagp |= flags&SPSTART;
511 else
512 regtail(cp, chain, latest);
513 chain = latest;
514 }
515 if (chain == NULL) /* Loop ran zero times. */
516 (void) regnode(cp, NOTHING);
517
518 return(ret);
519 }
520
521 /*
522 - regpiece - something followed by possible [*+?]
523 *
524 * Note that the branching code sequences used for ? and the general cases
525 * of * and + are somewhat optimized: they use the same NOTHING node as
526 * both the endmarker for their branch list and the body of the last branch.
527 * It might seem that this node could be dispensed with entirely, but the
528 * endmarker role is not redundant.
529 */
530 static char *
531 regpiece(cp, flagp)
532 register struct comp *cp;
533 int *flagp;
534 {
535 register char *ret;
536 register char op;
537 register char *next;
538 int flags;
539
540 ret = regatom(cp, &flags);
541 if (ret == NULL)
542 return(NULL);
543
544 op = *cp->regparse;
545 if (!ISREPN(op)) {
546 *flagp = flags;
547 return(ret);
548 }
549
550 if (!(flags&HASWIDTH) && op != '?')
551 FAIL("*+ operand could be empty");
552 switch (op) {
553 case '*': *flagp = WORST|SPSTART; break;
554 case '+': *flagp = WORST|SPSTART|HASWIDTH; break;
555 case '?': *flagp = WORST; break;
556 }
557
558 if (op == '*' && (flags&SIMPLE))
559 reginsert(cp, STAR, ret);
560 else if (op == '*') {
561 /* Emit x* as (x&|), where & means "self". */
562 reginsert(cp, BRANCH, ret); /* Either x */
563 regoptail(cp, ret, regnode(cp, BACK)); /* and loop */
564 regoptail(cp, ret, ret); /* back */
565 regtail(cp, ret, regnode(cp, BRANCH)); /* or */
566 regtail(cp, ret, regnode(cp, NOTHING)); /* null. */
567 } else if (op == '+' && (flags&SIMPLE))
568 reginsert(cp, PLUS, ret);
569 else if (op == '+') {
570 /* Emit x+ as x(&|), where & means "self". */
571 next = regnode(cp, BRANCH); /* Either */
572 regtail(cp, ret, next);
573 regtail(cp, regnode(cp, BACK), ret); /* loop back */
574 regtail(cp, next, regnode(cp, BRANCH)); /* or */
575 regtail(cp, ret, regnode(cp, NOTHING)); /* null. */
576 } else if (op == '?') {
577 /* Emit x? as (x|) */
578 reginsert(cp, BRANCH, ret); /* Either x */
579 regtail(cp, ret, regnode(cp, BRANCH)); /* or */
580 next = regnode(cp, NOTHING); /* null. */
581 regtail(cp, ret, next);
582 regoptail(cp, ret, next);
583 }
584 cp->regparse++;
585 if (ISREPN(*cp->regparse))
586 FAIL("nested *?+");
587
588 return(ret);
589 }
590
591 /*
592 - regatom - the lowest level
593 *
594 * Optimization: gobbles an entire sequence of ordinary characters so that
595 * it can turn them into a single node, which is smaller to store and
596 * faster to run. Backslashed characters are exceptions, each becoming a
597 * separate node; the code is simpler that way and it's not worth fixing.
598 */
599 static char *
600 regatom(cp, flagp)
601 register struct comp *cp;
602 int *flagp;
603 {
604 register char *ret;
605 int flags;
606
607 *flagp = WORST; /* Tentatively. */
608
609 switch (*cp->regparse++) {
610 case '^':
611 ret = regnode(cp, BOL);
612 break;
613 case '$':
614 ret = regnode(cp, EOL);
615 break;
616 case '.':
617 ret = regnode(cp, ANY);
618 *flagp |= HASWIDTH|SIMPLE;
619 break;
620 case '[': {
621 register int range;
622 register int rangeend;
623 register int c;
624
625 if (*cp->regparse == '^') { /* Complement of range. */
626 ret = regnode(cp, ANYBUT);
627 cp->regparse++;
628 } else
629 ret = regnode(cp, ANYOF);
630 if ((c = *cp->regparse) == ']' || c == '-') {
631 regc(cp, c);
632 cp->regparse++;
633 }
634 while ((c = *cp->regparse++) != '\0' && c != ']') {
635 if (c != '-')
636 regc(cp, c);
637 else if ((c = *cp->regparse) == ']' || c == '\0')
638 regc(cp, '-');
639 else {
640 range = (unsigned char)*(cp->regparse-2);
641 rangeend = (unsigned char)c;
642 if (range > rangeend)
643 FAIL("invalid [] range");
644 for (range++; range <= rangeend; range++)
645 regc(cp, range);
646 cp->regparse++;
647 }
648 }
649 regc(cp, '\0');
650 if (c != ']')
651 FAIL("unmatched []");
652 *flagp |= HASWIDTH|SIMPLE;
653 break;
654 }
655 case '(':
656 ret = reg(cp, 1, &flags);
657 if (ret == NULL)
658 return(NULL);
659 *flagp |= flags&(HASWIDTH|SPSTART);
660 break;
661 case '\0':
662 case '|':
663 case ')':
664 /* supposed to be caught earlier */
665 FAIL("internal error: \\0|) unexpected");
666 break;
667 case '?':
668 case '+':
669 case '*':
670 FAIL("?+* follows nothing");
671 break;
672 case '\\':
673 if (*cp->regparse == '\0')
674 FAIL("trailing \\");
675 ret = regnode(cp, EXACTLY);
676 regc(cp, *cp->regparse++);
677 regc(cp, '\0');
678 *flagp |= HASWIDTH|SIMPLE;
679 break;
680 default: {
681 register size_t len;
682 register char ender;
683
684 cp->regparse--;
685 len = strcspn(cp->regparse, META);
686 if (len == 0)
687 FAIL("internal error: strcspn 0");
688 ender = *(cp->regparse+len);
689 if (len > 1 && ISREPN(ender))
690 len--; /* Back off clear of ?+* operand. */
691 *flagp |= HASWIDTH;
692 if (len == 1)
693 *flagp |= SIMPLE;
694 ret = regnode(cp, EXACTLY);
695 for (; len > 0; len--)
696 regc(cp, *cp->regparse++);
697 regc(cp, '\0');
698 break;
699 }
700 }
701
702 return(ret);
703 }
704
705 /*
706 - regnode - emit a node
707 */
708 static char * /* Location. */
709 regnode(cp, op)
710 register struct comp *cp;
711 char op;
712 {
713 register char *const ret = cp->regcode;
714 register char *ptr;
715
716 if (!EMITTING(cp)) {
717 cp->regsize += 3;
718 return(ret);
719 }
720
721 ptr = ret;
722 *ptr++ = op;
723 *ptr++ = '\0'; /* Null next pointer. */
724 *ptr++ = '\0';
725 cp->regcode = ptr;
726
727 return(ret);
728 }
729
730 /*
731 - regc - emit (if appropriate) a byte of code
732 */
733 static void
734 regc(cp, b)
735 register struct comp *cp;
736 char b;
737 {
738 if (EMITTING(cp))
739 *cp->regcode++ = b;
740 else
741 cp->regsize++;
742 }
743
744 /*
745 - reginsert - insert an operator in front of already-emitted operand
746 *
747 * Means relocating the operand.
748 */
749 static void
750 reginsert(cp, op, opnd)
751 register struct comp *cp;
752 char op;
753 char *opnd;
754 {
755 register char *place;
756
757 if (!EMITTING(cp)) {
758 cp->regsize += 3;
759 return;
760 }
761
762 (void) memmove(opnd+3, opnd, (size_t)(cp->regcode - opnd));
763 cp->regcode += 3;
764
765 place = opnd; /* Op node, where operand used to be. */
766 *place++ = op;
767 *place++ = '\0';
768 *place++ = '\0';
769 }
770
771 /*
772 - regtail - set the next-pointer at the end of a node chain
773 */
774 static void
775 regtail(cp, p, val)
776 register struct comp *cp;
777 char *p;
778 char *val;
779 {
780 register char *scan;
781 register char *temp;
782 register int offset;
783
784 if (!EMITTING(cp))
785 return;
786
787 /* Find last node. */
788 for (scan = p; (temp = regnext(scan)) != NULL; scan = temp)
789 continue;
790
791 offset = (OP(scan) == BACK) ? scan - val : val - scan;
792 *(scan+1) = (offset>>8)&0177;
793 *(scan+2) = offset&0377;
794 }
795
796 /*
797 - regoptail - regtail on operand of first argument; nop if operandless
798 */
799 static void
800 regoptail(cp, p, val)
801 register struct comp *cp;
802 char *p;
803 char *val;
804 {
805 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
806 if (!EMITTING(cp) || OP(p) != BRANCH)
807 return;
808 regtail(cp, OPERAND(p), val);
809 }
810
811 /*
812 * sqd_regexec and friends
813 */
814
815 /*
816 * Work-variable struct for sqd_regexec().
817 */
818 struct exec {
819 char *reginput; /* String-input pointer. */
820 char *regbol; /* Beginning of input, for ^ check. */
821 char **regstartp; /* Pointer to startp array. */
822 char **regendp; /* Ditto for endp. */
823 };
824
825 /*
826 * Forwards.
827 */
828 static int regtry(struct exec *ep, sqd_regexp *rp, char *string);
829 static int regmatch(struct exec *ep, char *prog);
830 static size_t regrepeat(struct exec *ep, char *node);
831
832 #ifdef DEBUG
833 int regnarrate = 0;
834 void regdump();
835 static char *regprop();
836 #endif
837
838 /*
839 - sqd_regexec - match a regexp against a string
840 */
841 int
842 sqd_regexec(prog, str)
843 register sqd_regexp *prog;
844 const char *str;
845 {
846 register char *string = (char *)str; /* avert const poisoning */
847 register char *s;
848 struct exec ex;
849
850 /* Be paranoid. */
851 if (prog == NULL || string == NULL) {
852 sqd_regerror("NULL argument to sqd_regexec");
853 return(0);
854 }
855
856 /* Check validity of program. */
857 if ((unsigned char)*prog->program != SQD_REGMAGIC) {
858 sqd_regerror("corrupted regexp");
859 return(0);
860 }
861
862 /* If there is a "must appear" string, look for it. */
863 if (prog->regmust != NULL && strstr(string, prog->regmust) == NULL)
864 return(0);
865
866 /* Mark beginning of line for ^ . */
867 ex.regbol = string;
868 ex.regstartp = prog->startp;
869 ex.regendp = prog->endp;
870
871 /* Simplest case: anchored match need be tried only once. */
872 if (prog->reganch)
873 return(regtry(&ex, prog, string));
874
875 /* Messy cases: unanchored match. */
876 if (prog->regstart != '\0') {
877 /* We know what char it must start with. */
878 for (s = string; s != NULL; s = strchr(s+1, prog->regstart))
879 if (regtry(&ex, prog, s))
880 return(1);
881 return(0);
882 } else {
883 /* We don't -- general case. */
884 for (s = string; !regtry(&ex, prog, s); s++)
885 if (*s == '\0')
886 return(0);
887 return(1);
888 }
889 /* NOTREACHED */
890 }
891
892 /*
893 - regtry - try match at specific point
894 */
895 static int /* 0 failure, 1 success */
896 regtry(ep, prog, string)
897 register struct exec *ep;
898 sqd_regexp *prog;
899 char *string;
900 {
901 register int i;
902 register char **stp;
903 register char **enp;
904
905 ep->reginput = string;
906
907 stp = prog->startp;
908 enp = prog->endp;
909 for (i = NSUBEXP; i > 0; i--) {
910 *stp++ = NULL;
911 *enp++ = NULL;
912 }
913 if (regmatch(ep, prog->program + 1)) {
914 prog->startp[0] = string;
915 prog->endp[0] = ep->reginput;
916 return(1);
917 } else
918 return(0);
919 }
920
921 /*
922 - regmatch - main matching routine
923 *
924 * Conceptually the strategy is simple: check to see whether the current
925 * node matches, call self recursively to see whether the rest matches,
926 * and then act accordingly. In practice we make some effort to avoid
927 * recursion, in particular by going through "ordinary" nodes (that don't
928 * need to know whether the rest of the match failed) by a loop instead of
929 * by recursion.
930 */
931 static int /* 0 failure, 1 success */
932 regmatch(ep, prog)
933 register struct exec *ep;
934 char *prog;
935 {
936 register char *scan; /* Current node. */
937 char *next; /* Next node. */
938
939 #ifdef DEBUG
940 if (prog != NULL && regnarrate)
941 fprintf(stderr, "%s(\n", regprop(prog));
942 #endif
943 for (scan = prog; scan != NULL; scan = next) {
944 #ifdef DEBUG
945 if (regnarrate)
946 fprintf(stderr, "%s...\n", regprop(scan));
947 #endif
948 next = regnext(scan);
949
950 switch (OP(scan)) {
951 case BOL:
952 if (ep->reginput != ep->regbol)
953 return(0);
954 break;
955 case EOL:
956 if (*ep->reginput != '\0')
957 return(0);
958 break;
959 case ANY:
960 if (*ep->reginput == '\0')
961 return(0);
962 ep->reginput++;
963 break;
964 case EXACTLY: {
965 register size_t len;
966 register char *const opnd = OPERAND(scan);
967
968 /* Inline the first character, for speed. */
969 if (*opnd != *ep->reginput)
970 return(0);
971 len = strlen(opnd);
972 if (len > 1 && strncmp(opnd, ep->reginput, len) != 0)
973 return(0);
974 ep->reginput += len;
975 break;
976 }
977 case ANYOF:
978 if (*ep->reginput == '\0' ||
979 strchr(OPERAND(scan), *ep->reginput) == NULL)
980 return(0);
981 ep->reginput++;
982 break;
983 case ANYBUT:
984 if (*ep->reginput == '\0' ||
985 strchr(OPERAND(scan), *ep->reginput) != NULL)
986 return(0);
987 ep->reginput++;
988 break;
989 case NOTHING:
990 break;
991 case BACK:
992 break;
993 case OPEN+1: case OPEN+2: case OPEN+3:
994 case OPEN+4: case OPEN+5: case OPEN+6:
995 case OPEN+7: case OPEN+8: case OPEN+9: {
996 register const int no = OP(scan) - OPEN;
997 register char *const input = ep->reginput;
998
999 if (regmatch(ep, next)) {
1000 /*
1001 * Don't set startp if some later
1002 * invocation of the same parentheses
1003 * already has.
1004 */
1005 if (ep->regstartp[no] == NULL)
1006 ep->regstartp[no] = input;
1007 return(1);
1008 } else
1009 return(0);
1010 break;
1011 }
1012 case CLOSE+1: case CLOSE+2: case CLOSE+3:
1013 case CLOSE+4: case CLOSE+5: case CLOSE+6:
1014 case CLOSE+7: case CLOSE+8: case CLOSE+9: {
1015 register const int no = OP(scan) - CLOSE;
1016 register char *const input = ep->reginput;
1017
1018 if (regmatch(ep, next)) {
1019 /*
1020 * Don't set endp if some later
1021 * invocation of the same parentheses
1022 * already has.
1023 */
1024 if (ep->regendp[no] == NULL)
1025 ep->regendp[no] = input;
1026 return(1);
1027 } else
1028 return(0);
1029 break;
1030 }
1031 case BRANCH: {
1032 register char *const save = ep->reginput;
1033
1034 if (OP(next) != BRANCH) /* No choice. */
1035 next = OPERAND(scan); /* Avoid recursion. */
1036 else {
1037 while (OP(scan) == BRANCH) {
1038 if (regmatch(ep, OPERAND(scan)))
1039 return(1);
1040 ep->reginput = save;
1041 scan = regnext(scan);
1042 }
1043 return(0);
1044 /* NOTREACHED */
1045 }
1046 break;
1047 }
1048 case STAR: case PLUS: {
1049 register const char nextch =
1050 (OP(next) == EXACTLY) ? *OPERAND(next) : '\0';
1051 register size_t no;
1052 register char *const save = ep->reginput;
1053 register const size_t min = (OP(scan) == STAR) ? 0 : 1;
1054
1055 for (no = regrepeat(ep, OPERAND(scan)) + 1; no > min; no--) {
1056 ep->reginput = save + no - 1;
1057 /* If it could work, try it. */
1058 if (nextch == '\0' || *ep->reginput == nextch)
1059 if (regmatch(ep, next))
1060 return(1);
1061 }
1062 return(0);
1063 break;
1064 }
1065 case END:
1066 return(1); /* Success! */
1067 break;
1068 default:
1069 sqd_regerror("regexp corruption");
1070 return(0);
1071 break;
1072 }
1073 }
1074
1075 /*
1076 * We get here only if there's trouble -- normally "case END" is
1077 * the terminating point.
1078 */
1079 sqd_regerror("corrupted pointers");
1080 return(0);
1081 }
1082
1083 /*
1084 - regrepeat - report how many times something simple would match
1085 */
1086 static size_t
1087 regrepeat(ep, node)
1088 register struct exec *ep;
1089 char *node;
1090 {
1091 register size_t count;
1092 register char *scan;
1093 register char ch;
1094
1095 switch (OP(node)) {
1096 case ANY:
1097 return(strlen(ep->reginput));
1098 break;
1099 case EXACTLY:
1100 ch = *OPERAND(node);
1101 count = 0;
1102 for (scan = ep->reginput; *scan == ch; scan++)
1103 count++;
1104 return(count);
1105 break;
1106 case ANYOF:
1107 return(strspn(ep->reginput, OPERAND(node)));
1108 break;
1109 case ANYBUT:
1110 return(strcspn(ep->reginput, OPERAND(node)));
1111 break;
1112 default: /* Oh dear. Called inappropriately. */
1113 sqd_regerror("internal error: bad call of regrepeat");
1114 return(0); /* Best compromise. */
1115 break;
1116 }
1117 /* NOTREACHED */
1118 }
1119
1120 /*
1121 - regnext - dig the "next" pointer out of a node
1122 */
1123 static char *
1124 regnext(p)
1125 register char *p;
1126 {
1127 register const int offset = NEXT(p);
1128
1129 if (offset == 0)
1130 return(NULL);
1131
1132 return((OP(p) == BACK) ? p-offset : p+offset);
1133 }
1134
1135 #ifdef DEBUG
1136
1137 static char *regprop();
1138
1139 /*
1140 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1141 */
1142 void
1143 regdump(r)
1144 sqd_regexp *r;
1145 {
1146 register char *s;
1147 register char op = EXACTLY; /* Arbitrary non-END op. */
1148 register char *next;
1149
1150
1151 s = r->program + 1;
1152 while (op != END) { /* While that wasn't END last time... */
1153 op = OP(s);
1154 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1155 next = regnext(s);
1156 if (next == NULL) /* Next ptr. */
1157 printf("(0)");
1158 else
1159 printf("(%d)", (s-r->program)+(next-s));
1160 s += 3;
1161 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1162 /* Literal string, where present. */
1163 while (*s != '\0') {
1164 putchar(*s);
1165 s++;
1166 }
1167 s++;
1168 }
1169 putchar('\n');
1170 }
1171
1172 /* Header fields of interest. */
1173 if (r->regstart != '\0')
1174 printf("start `%c' ", r->regstart);
1175 if (r->reganch)
1176 printf("anchored ");
1177 if (r->regmust != NULL)
1178 printf("must have \"%s\"", r->regmust);
1179 printf("\n");
1180 }
1181
1182 /*
1183 - regprop - printable representation of opcode
1184 */
1185 static char *
1186 regprop(op)
1187 char *op;
1188 {
1189 register char *p;
1190 static char buf[50];
1191
1192 (void) strcpy(buf, ":");
1193
1194 switch (OP(op)) {
1195 case BOL:
1196 p = "BOL";
1197 break;
1198 case EOL:
1199 p = "EOL";
1200 break;
1201 case ANY:
1202 p = "ANY";
1203 break;
1204 case ANYOF:
1205 p = "ANYOF";
1206 break;
1207 case ANYBUT:
1208 p = "ANYBUT";
1209 break;
1210 case BRANCH:
1211 p = "BRANCH";
1212 break;
1213 case EXACTLY:
1214 p = "EXACTLY";
1215 break;
1216 case NOTHING:
1217 p = "NOTHING";
1218 break;
1219 case BACK:
1220 p = "BACK";
1221 break;
1222 case END:
1223 p = "END";
1224 break;
1225 case OPEN+1:
1226 case OPEN+2:
1227 case OPEN+3:
1228 case OPEN+4:
1229 case OPEN+5:
1230 case OPEN+6:
1231 case OPEN+7:
1232 case OPEN+8:
1233 case OPEN+9:
1234 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1235 p = NULL;
1236 break;
1237 case CLOSE+1:
1238 case CLOSE+2:
1239 case CLOSE+3:
1240 case CLOSE+4:
1241 case CLOSE+5:
1242 case CLOSE+6:
1243 case CLOSE+7:
1244 case CLOSE+8:
1245 case CLOSE+9:
1246 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1247 p = NULL;
1248 break;
1249 case STAR:
1250 p = "STAR";
1251 break;
1252 case PLUS:
1253 p = "PLUS";
1254 break;
1255 default:
1256 sqd_regerror("corrupted opcode");
1257 break;
1258 }
1259 if (p != NULL)
1260 (void) strcat(buf, p);
1261 return(buf);
1262 }
1263 #endif
1264
1265
1266 /*
1267 - sqd_regsub - perform substitutions after a regexp match
1268 */
1269 void
1270 sqd_regsub(rp, source, dest)
1271 const sqd_regexp *rp;
1272 const char *source;
1273 char *dest;
1274 {
1275 register sqd_regexp * const prog = (sqd_regexp *)rp;
1276 register char *src = (char *)source;
1277 register char *dst = dest;
1278 register char c;
1279 register int no;
1280 register size_t len;
1281
1282 if (prog == NULL || source == NULL || dest == NULL) {
1283 sqd_regerror("NULL parameter to sqd_regsub");
1284 return;
1285 }
1286 if ((unsigned char)*(prog->program) != SQD_REGMAGIC) {
1287 sqd_regerror("damaged regexp");
1288 return;
1289 }
1290
1291 while ((c = *src++) != '\0') {
1292 if (c == '&')
1293 no = 0;
1294 else if (c == '\\' && isdigit((int) (*src)))
1295 no = *src++ - '0';
1296 else
1297 no = -1;
1298
1299 if (no < 0) { /* Ordinary character. */
1300 if (c == '\\' && (*src == '\\' || *src == '&'))
1301 c = *src++;
1302 *dst++ = c;
1303 } else if (prog->startp[no] != NULL && prog->endp[no] != NULL &&
1304 prog->endp[no] > prog->startp[no]) {
1305 len = prog->endp[no] - prog->startp[no];
1306 (void) strncpy(dst, prog->startp[no], len);
1307 dst += len;
1308 if (*(dst-1) == '\0') { /* strncpy hit NUL. */
1309 sqd_regerror("damaged match string");
1310 return;
1311 }
1312 }
1313 }
1314 *dst++ = '\0';
1315 }
1316
1317
1318 void
1319 sqd_regerror(s)
1320 char *s;
1321 {
1322 fprintf(stderr, "regexp(3): %s\n", s);
1323 exit(EXIT_FAILURE);
1324 /* NOTREACHED */
1325 }
1326
1327 #ifdef NBA_TEAM_IN_STL
1328 int
1329 main(int argc, char **argv)
1330 {
1331 char *pat;
1332 int ntok;
1333 char *s;
1334 int status;
1335
1336 pat = argv[1];
1337 ntok = atoi(argv[2]);
1338 s = argv[3];
1339
1340 status = Strparse(pat, s, ntok);
1341 if (status == 0) {
1342 printf("no match\n");
1343 } else {
1344 int i;
1345 printf("MATCH.\n");
1346 for (i = 1; i <= ntok; i++)
1347 printf("matched token %1d: %s\n", i, sqd_parse[i]);
1348 }
1349 }
1350 #endif /*NBA_TEAM_IN_STL*/