/* * Copyright 2000, International Business Machines Corporation and others. * All Rights Reserved. * * This software has been released under the terms of the IBM Public * License. For details, see the LICENSE file in the top-level source * directory or online at http://www.openafs.org/dl/license10.html */ /* * regex.c -- regular expression patter matching functions * */ #include #include RCSID ("$Header$"); /* * routines to do regular expression matching * * Entry points: * * re_comp(s) * char *s; * ... returns 0 if the string s was compiled successfully, * a pointer to an error message otherwise. * If passed 0 or a null string returns without changing * the currently compiled re (see note 11 below). * * re_exec(s) * char *s; * ... returns 1 if the string s matches the last compiled regular * expression, * 0 if the string s failed to match the last compiled * regular expression, and * -1 if the compiled regular expression was invalid * (indicating an internal error). * * The strings passed to both re_comp and re_exec may have trailing or * embedded newline characters; they are terminated by nulls. * * The identity of the author of these routines is lost in antiquity; * this is essentially the same as the re code in the original V6 ed. * * The regular expressions recognized are described below. This description * is essentially the same as that for ed. * * A regular expression specifies a set of strings of characters. * A member of this set of strings is said to be matched by * the regular expression. In the following specification for * regular expressions the word `character' means any character but NUL. * * 1. Any character except a special character matches itself. * Special characters are the regular expression delimiter plus * \ [ . and sometimes ^ * $. * 2. A . matches any character. * 3. A \ followed by any character except a digit or ( ) * matches that character. * 4. A nonempty string s bracketed [s] (or [^s]) matches any * character in (or not in) s. In s, \ has no special meaning, * and ] may only appear as the first letter. A substring * a-b, with a and b in ascending ASCII order, stands for * the inclusive range of ASCII characters. * 5. A regular expression of form 1-4 followed by * matches a * sequence of 0 or more matches of the regular expression. * 6. A regular expression, x, of form 1-8, bracketed \(x\) * matches what x matches. * 7. A \ followed by a digit n matches a copy of the string that the * bracketed regular expression beginning with the nth \( matched. * 8. A regular expression of form 1-8, x, followed by a regular * expression of form 1-7, y matches a match for x followed by * a match for y, with the x match being as long as possible * while still permitting a y match. * 9. A regular expression of form 1-8 preceded by ^ (or followed * by $), is constrained to matches that begin at the left * (or end at the right) end of a line. * 10. A regular expression of form 1-9 picks out the longest among * the leftmost matches in a line. * 11. An empty regular expression stands for a copy of the last * regular expression encountered. */ /* * constants for re's */ #define CBRA 1 #define CCHR 2 #define CDOT 4 #define CCL 6 #define NCCL 8 #define CDOL 10 #define CEOF 11 #define CKET 12 #define CBACK 18 #define CSTAR 01 #define ESIZE 512 #define NBRA 9 static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA]; static char circf; /* forward defs */ static int advance(register char *lp, register char *ep); static int backref(register int i, register char *lp); static int cclass(register char *set, register char c, int af); /* * compile the regular expression argument into a dfa */ char * re_comp(register char *sp) { register int c; register char *ep = expbuf; int cclcnt, numbra = 0; char *lastep = 0; char bracket[NBRA]; char *bracketp = &bracket[0]; static char *retoolong = "Regular expression too long"; #define comperr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } if (sp == 0 || *sp == '\0') { if (*ep == 0) return ("No previous regular expression"); return (0); } if (*sp == '^') { circf = 1; sp++; } else circf = 0; for (;;) { if (ep >= &expbuf[ESIZE - 10 /* fudge factor */]) comperr(retoolong); if ((c = *sp++) == '\0') { if (bracketp != bracket) comperr("unmatched \\("); *ep++ = CEOF; *ep++ = 0; return (0); } if (c != '*') lastep = ep; switch (c) { case '.': *ep++ = CDOT; continue; case '*': if (lastep == 0 || *lastep == CBRA || *lastep == CKET) goto defchar; *lastep |= CSTAR; continue; case '$': if (*sp != '\0') goto defchar; *ep++ = CDOL; continue; case '[': *ep++ = CCL; *ep++ = 0; cclcnt = 1; if ((c = *sp++) == '^') { c = *sp++; ep[-2] = NCCL; } do { if (c == '\0') comperr("missing ]"); if (c == '-' && ep[-1] != 0) { if ((c = *sp++) == ']') { *ep++ = '-'; cclcnt++; break; } while (ep[-1] < c) { *ep = ep[-1] + 1; ep++; cclcnt++; if (ep >= &expbuf[ESIZE - 10 /* fudge factor */]) comperr(retoolong); } } *ep++ = c; cclcnt++; if (ep >= &expbuf[ESIZE - 10 /* fudge factor */]) comperr(retoolong); } while ((c = *sp++) != ']'); lastep[1] = cclcnt; continue; case '\\': if ((c = *sp++) == '(') { if (numbra >= NBRA) comperr("too many \\(\\) pairs"); *bracketp++ = numbra; *ep++ = CBRA; *ep++ = numbra++; continue; } if (c == ')') { if (bracketp <= bracket) comperr("unmatched \\)"); *ep++ = CKET; *ep++ = *--bracketp; continue; } if (c >= '1' && c < ('1' + NBRA)) { *ep++ = CBACK; *ep++ = c - '1'; continue; } *ep++ = CCHR; *ep++ = c; continue; defchar: default: *ep++ = CCHR; *ep++ = c; } } } /* * match the argument string against the compiled re */ int re_exec(register char *p1) { register char *p2 = expbuf; register int c; int rv; for (c = 0; c < NBRA; c++) { braslist[c] = 0; braelist[c] = 0; } if (circf) return ((advance(p1, p2))); /* * fast check for first character */ if (*p2 == CCHR) { c = p2[1]; do { if (*p1 != c) continue; if (rv = advance(p1, p2)) return (rv); } while (*p1++); return (0); } /* * regular algorithm */ do if (rv = advance(p1, p2)) return (rv); while (*p1++); return (0); } /* * try to match the next thing in the dfa */ static int advance(register char *lp, register char *ep) { register char *curlp; int ct, i; int rv; for (;;) switch (*ep++) { case CCHR: if (*ep++ == *lp++) continue; return (0); case CDOT: if (*lp++) continue; return (0); case CDOL: if (*lp == '\0') continue; return (0); case CEOF: return (1); case CCL: if (cclass(ep, *lp++, 1)) { ep += *ep; continue; } return (0); case NCCL: if (cclass(ep, *lp++, 0)) { ep += *ep; continue; } return (0); case CBRA: braslist[*ep++] = lp; continue; case CKET: braelist[*ep++] = lp; continue; case CBACK: if (braelist[i = *ep++] == 0) return (-1); if (backref(i, lp)) { lp += braelist[i] - braslist[i]; continue; } return (0); case CBACK | CSTAR: if (braelist[i = *ep++] == 0) return (-1); curlp = lp; ct = braelist[i] - braslist[i]; while (backref(i, lp)) lp += ct; while (lp >= curlp) { if (rv = advance(lp, ep)) return (rv); lp -= ct; } continue; case CDOT | CSTAR: curlp = lp; while (*lp++); goto star; case CCHR | CSTAR: curlp = lp; while (*lp++ == *ep); ep++; goto star; case CCL | CSTAR: case NCCL | CSTAR: curlp = lp; while (cclass(ep, *lp++, ep[-1] == (CCL | CSTAR))); ep += *ep; goto star; star: do { lp--; if (rv = advance(lp, ep)) return (rv); } while (lp > curlp); return (0); default: return (-1); } } static int backref(register int i, register char *lp) { register char *bp; bp = braslist[i]; while (*bp++ == *lp++) if (bp >= braelist[i]) return (1); return (0); } static int cclass(register char *set, register char c, int af) { register int n; if (c == 0) return (0); n = *set++; while (--n) if (*set++ == c) return (af); return (!af); }