/* * Copyright 2000, International Business Machines Corporation and others. * All Rights Reserved. * * This software has been released under the terms of the IBM Public * License. For details, see the LICENSE file in the top-level source * directory or online at http://www.openafs.org/dl/license10.html */ /* * All Rights Reserved */ /* * HISTORY * Revision 1.1.7.2 1994/08/25 17:34:40 * Added s12y calls * [1994/08/25 17:23:12] * * Revision 1.1.7.1 1994/06/09 13:50:35 * fixed copyright in src/file * [1994/06/08 21:23:56] * * Revision 1.1.5.3 1993/01/18 19:51:30 * Embedded copyright notice * [1993/01/18 19:26:02 * * Revision 1.1.5.2 1992/11/24 15:37:53 * Change include file install directory from .../afs to .../dcedfs. * [1992/11/22 16:05:26] * * Revision 1.1.3.3 1992/01/24 01:48:02 * Merging dfs6.3 * [1992/01/24 00:02:28] * * Revision 1.1.3.2 1992/01/23 19:03:28 * Moving files onto the branch for dfs6.3 merge * [1992/01/23 18:19:02] * Revision 1.1.1.2 1992/01/22 19:29:25 * Bring in dfs6.3 sources * * Revision 1.1 1992/01/19 02:56:43 * Initial revision * * $EndLog$ */ /* * regex.c -- regular expression patter matching functions * */ #include /* * routines to do regular expression matching * * Entry points: * * re_comp(s) * char *s; * ... returns 0 if the string s was compiled successfully, * a pointer to an error message otherwise. * If passed 0 or a null string returns without changing * the currently compiled re (see note 11 below). * * re_exec(s) * char *s; * ... returns 1 if the string s matches the last compiled regular * expression, * 0 if the string s failed to match the last compiled * regular expression, and * -1 if the compiled regular expression was invalid * (indicating an internal error). * * The strings passed to both re_comp and re_exec may have trailing or * embedded newline characters; they are terminated by nulls. * * The identity of the author of these routines is lost in antiquity; * this is essentially the same as the re code in the original V6 ed. * * The regular expressions recognized are described below. This description * is essentially the same as that for ed. * * A regular expression specifies a set of strings of characters. * A member of this set of strings is said to be matched by * the regular expression. In the following specification for * regular expressions the word `character' means any character but NUL. * * 1. Any character except a special character matches itself. * Special characters are the regular expression delimiter plus * \ [ . and sometimes ^ * $. * 2. A . matches any character. * 3. A \ followed by any character except a digit or ( ) * matches that character. * 4. A nonempty string s bracketed [s] (or [^s]) matches any * character in (or not in) s. In s, \ has no special meaning, * and ] may only appear as the first letter. A substring * a-b, with a and b in ascending ASCII order, stands for * the inclusive range of ASCII characters. * 5. A regular expression of form 1-4 followed by * matches a * sequence of 0 or more matches of the regular expression. * 6. A regular expression, x, of form 1-8, bracketed \(x\) * matches what x matches. * 7. A \ followed by a digit n matches a copy of the string that the * bracketed regular expression beginning with the nth \( matched. * 8. A regular expression of form 1-8, x, followed by a regular * expression of form 1-7, y matches a match for x followed by * a match for y, with the x match being as long as possible * while still permitting a y match. * 9. A regular expression of form 1-8 preceded by ^ (or followed * by $), is constrained to matches that begin at the left * (or end at the right) end of a line. * 10. A regular expression of form 1-9 picks out the longest among * the leftmost matches in a line. * 11. An empty regular expression stands for a copy of the last * regular expression encountered. */ /* * constants for re's */ #define CBRA 1 #define CCHR 2 #define CDOT 4 #define CCL 6 #define NCCL 8 #define CDOL 10 #define CEOF 11 #define CKET 12 #define CBACK 18 #define CSTAR 01 #define ESIZE 512 #define NBRA 9 static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA]; static char circf; /* forward defs */ static int advance(); static int backref(); static int cclass(); /* * compile the regular expression argument into a dfa */ char * re_comp(register char *sp) { register int c; register char *ep = expbuf; int cclcnt, numbra = 0; char *lastep = 0; char bracket[NBRA]; char *bracketp = &bracket[0]; static char *retoolong = "Regular expression too long"; #define comperr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } if (sp == 0 || *sp == '\0') { if (*ep == 0) return("No previous regular expression"); return(0); } if (*sp == '^') { circf = 1; sp++; } else circf = 0; for (;;) { if (ep >= &expbuf[ESIZE]) comperr(retoolong); if ((c = *sp++) == '\0') { if (bracketp != bracket) comperr("unmatched \\("); *ep++ = CEOF; *ep++ = 0; return(0); } if (c != '*') lastep = ep; switch (c) { case '.': *ep++ = CDOT; continue; case '*': if (lastep == 0 || *lastep == CBRA || *lastep == CKET) goto defchar; *lastep |= CSTAR; continue; case '$': if (*sp != '\0') goto defchar; *ep++ = CDOL; continue; case '[': *ep++ = CCL; *ep++ = 0; cclcnt = 1; if ((c = *sp++) == '^') { c = *sp++; ep[-2] = NCCL; } do { if (c == '\0') comperr("missing ]"); if (c == '-' && ep [-1] != 0) { if ((c = *sp++) == ']') { *ep++ = '-'; cclcnt++; break; } while (ep[-1] < c) { *ep = ep[-1] + 1; ep++; cclcnt++; if (ep >= &expbuf[ESIZE]) comperr(retoolong); } } *ep++ = c; cclcnt++; if (ep >= &expbuf[ESIZE]) comperr(retoolong); } while ((c = *sp++) != ']'); lastep[1] = cclcnt; continue; case '\\': if ((c = *sp++) == '(') { if (numbra >= NBRA) comperr("too many \\(\\) pairs"); *bracketp++ = numbra; *ep++ = CBRA; *ep++ = numbra++; continue; } if (c == ')') { if (bracketp <= bracket) comperr("unmatched \\)"); *ep++ = CKET; *ep++ = *--bracketp; continue; } if (c >= '1' && c < ('1' + NBRA)) { *ep++ = CBACK; *ep++ = c - '1'; continue; } *ep++ = CCHR; *ep++ = c; continue; defchar: default: *ep++ = CCHR; *ep++ = c; } } } /* * match the argument string against the compiled re */ int re_exec(register char *p1) { register char *p2 = expbuf; register int c; int rv; for (c = 0; c < NBRA; c++) { braslist[c] = 0; braelist[c] = 0; } if (circf) return((advance(p1, p2))); /* * fast check for first character */ if (*p2 == CCHR) { c = p2[1]; do { if (*p1 != c) continue; if (rv = advance(p1, p2)) return(rv); } while (*p1++); return(0); } /* * regular algorithm */ do if (rv = advance(p1, p2)) return(rv); while (*p1++); return(0); } /* * try to match the next thing in the dfa */ static int advance(lp, ep) register char *lp, *ep; { register char *curlp; int ct, i; int rv; for (;;) switch (*ep++) { case CCHR: if (*ep++ == *lp++) continue; return(0); case CDOT: if (*lp++) continue; return(0); case CDOL: if (*lp == '\0') continue; return(0); case CEOF: return(1); case CCL: if (cclass(ep, *lp++, 1)) { ep += *ep; continue; } return(0); case NCCL: if (cclass(ep, *lp++, 0)) { ep += *ep; continue; } return(0); case CBRA: braslist[*ep++] = lp; continue; case CKET: braelist[*ep++] = lp; continue; case CBACK: if (braelist[i = *ep++] == 0) return(-1); if (backref(i, lp)) { lp += braelist[i] - braslist[i]; continue; } return(0); case CBACK|CSTAR: if (braelist[i = *ep++] == 0) return(-1); curlp = lp; ct = braelist[i] - braslist[i]; while (backref(i, lp)) lp += ct; while (lp >= curlp) { if (rv = advance(lp, ep)) return(rv); lp -= ct; } continue; case CDOT|CSTAR: curlp = lp; while (*lp++) ; goto star; case CCHR|CSTAR: curlp = lp; while (*lp++ == *ep) ; ep++; goto star; case CCL|CSTAR: case NCCL|CSTAR: curlp = lp; while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) ; ep += *ep; goto star; star: do { lp--; if (rv = advance(lp, ep)) return(rv); } while (lp > curlp); return(0); default: return(-1); } } static int backref(i, lp) register int i; register char *lp; { register char *bp; bp = braslist[i]; while (*bp++ == *lp++) if (bp >= braelist[i]) return(1); return(0); } static int cclass(set, c, af) register char *set, c; int af; { register int n; if (c == 0) return(0); n = *set++; while (--n) if (*set++ == c) return(af); return(! af); }