2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
11 * regex.c -- regular expression patter matching functions
15 #include <afsconfig.h>
16 #include <afs/param.h>
20 * routines to do regular expression matching
26 * ... returns 0 if the string s was compiled successfully,
27 * a pointer to an error message otherwise.
28 * If passed 0 or a null string returns without changing
29 * the currently compiled re (see note 11 below).
33 * ... returns 1 if the string s matches the last compiled regular
35 * 0 if the string s failed to match the last compiled
36 * regular expression, and
37 * -1 if the compiled regular expression was invalid
38 * (indicating an internal error).
40 * The strings passed to both re_comp and re_exec may have trailing or
41 * embedded newline characters; they are terminated by nulls.
43 * The identity of the author of these routines is lost in antiquity;
44 * this is essentially the same as the re code in the original V6 ed.
46 * The regular expressions recognized are described below. This description
47 * is essentially the same as that for ed.
49 * A regular expression specifies a set of strings of characters.
50 * A member of this set of strings is said to be matched by
51 * the regular expression. In the following specification for
52 * regular expressions the word `character' means any character but NUL.
54 * 1. Any character except a special character matches itself.
55 * Special characters are the regular expression delimiter plus
56 * \ [ . and sometimes ^ * $.
57 * 2. A . matches any character.
58 * 3. A \ followed by any character except a digit or ( )
59 * matches that character.
60 * 4. A nonempty string s bracketed [s] (or [^s]) matches any
61 * character in (or not in) s. In s, \ has no special meaning,
62 * and ] may only appear as the first letter. A substring
63 * a-b, with a and b in ascending ASCII order, stands for
64 * the inclusive range of ASCII characters.
65 * 5. A regular expression of form 1-4 followed by * matches a
66 * sequence of 0 or more matches of the regular expression.
67 * 6. A regular expression, x, of form 1-8, bracketed \(x\)
68 * matches what x matches.
69 * 7. A \ followed by a digit n matches a copy of the string that the
70 * bracketed regular expression beginning with the nth \( matched.
71 * 8. A regular expression of form 1-8, x, followed by a regular
72 * expression of form 1-7, y matches a match for x followed by
73 * a match for y, with the x match being as long as possible
74 * while still permitting a y match.
75 * 9. A regular expression of form 1-8 preceded by ^ (or followed
76 * by $), is constrained to matches that begin at the left
77 * (or end at the right) end of a line.
78 * 10. A regular expression of form 1-9 picks out the longest among
79 * the leftmost matches in a line.
80 * 11. An empty regular expression stands for a copy of the last
81 * regular expression encountered.
102 static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
108 static int advance(register char *lp, register char *ep);
109 static int backref(register int i, register char *lp);
110 static int cclass(register char *set, register char c, int af);
114 * compile the regular expression argument into a dfa
117 re_comp(register char *sp)
120 register char *ep = expbuf;
121 int cclcnt, numbra = 0;
124 char *bracketp = &bracket[0];
125 static char *retoolong = "Regular expression too long";
127 #define comperr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
129 if (sp == 0 || *sp == '\0') {
131 return ("No previous regular expression");
140 if (ep >= &expbuf[ESIZE - 10 /* fudge factor */])
142 if ((c = *sp++) == '\0') {
143 if (bracketp != bracket)
144 comperr("unmatched \\(");
158 if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
173 if ((c = *sp++) == '^') {
179 comperr("missing ]");
180 if (c == '-' && ep[-1] != 0) {
181 if ((c = *sp++) == ']') {
190 if (ep >= &expbuf[ESIZE - 10 /* fudge factor */])
196 if (ep >= &expbuf[ESIZE - 10 /* fudge factor */])
198 } while ((c = *sp++) != ']');
203 if ((c = *sp++) == '(') {
205 comperr("too many \\(\\) pairs");
206 *bracketp++ = numbra;
212 if (bracketp <= bracket)
213 comperr("unmatched \\)");
218 if (c >= '1' && c < ('1' + NBRA)) {
236 * match the argument string against the compiled re
239 re_exec(register char *p1)
241 register char *p2 = expbuf;
245 for (c = 0; c < NBRA; c++) {
250 return ((advance(p1, p2)));
252 * fast check for first character
259 if ((rv = advance(p1, p2)))
268 if ((rv = advance(p1, p2)))
275 * try to match the next thing in the dfa
278 advance(register char *lp, register char *ep)
280 register char *curlp;
306 if (cclass(ep, *lp++, 1)) {
313 if (cclass(ep, *lp++, 0)) {
320 braslist[*ep++] = lp;
324 braelist[*ep++] = lp;
328 if (braelist[i = *ep++] == 0)
330 if (backref(i, lp)) {
331 lp += braelist[i] - braslist[i];
337 if (braelist[i = *ep++] == 0)
340 ct = (int)(braelist[i] - braslist[i]);
341 while (backref(i, lp))
343 while (lp >= curlp) {
344 if ((rv = advance(lp, ep)))
357 while (*lp++ == *ep);
364 while (cclass(ep, *lp++, ep[-1] == (CCL | CSTAR)));
371 if ((rv = advance(lp, ep)))
373 } while (lp > curlp);
382 backref(register int i, register char *lp)
387 while (*bp++ == *lp++)
388 if (bp >= braelist[i])
394 cclass(register char *set, register char c, int af)