Standardize License information
[openafs.git] / src / bucoord / regex.c
1 /*
2  * Copyright 2000, International Business Machines Corporation and others.
3  * All Rights Reserved.
4  * 
5  * This software has been released under the terms of the IBM Public
6  * License.  For details, see the LICENSE file in the top-level source
7  * directory or online at http://www.openafs.org/dl/license10.html
8  */
9
10 #ifndef lint
11 static  char sccsid[] = "@(#)regex.c 1.1 86/09/24 SMI"; /* from UCB 4.1 80/12/21 */
12 #endif
13
14 /*
15  * routines to do regular expression matching
16  *
17  * Entry points:
18  *
19  *      re_comp(s)
20  *              char *s;
21  *       ... returns 0 if the string s was compiled successfully,
22  *                   a pointer to an error message otherwise.
23  *           If passed 0 or a null string returns without changing
24  *           the currently compiled re (see note 11 below).
25  *
26  *      re_exec(s)
27  *              char *s;
28  *       ... returns 1 if the string s matches the last compiled regular
29  *                     expression, 
30  *                   0 if the string s failed to match the last compiled
31  *                     regular expression, and
32  *                  -1 if the compiled regular expression was invalid 
33  *                     (indicating an internal error).
34  *
35  * The strings passed to both re_comp and re_exec may have trailing or
36  * embedded newline characters; they are terminated by nulls.
37  *
38  * The identity of the author of these routines is lost in antiquity;
39  * this is essentially the same as the re code in the original V6 ed.
40  *
41  * The regular expressions recognized are described below. This description
42  * is essentially the same as that for ed.
43  *
44  *      A regular expression specifies a set of strings of characters.
45  *      A member of this set of strings is said to be matched by
46  *      the regular expression.  In the following specification for
47  *      regular expressions the word `character' means any character but NUL.
48  *
49  *      1.  Any character except a special character matches itself.
50  *          Special characters are the regular expression delimiter plus
51  *          \ [ . and sometimes ^ * $.
52  *      2.  A . matches any character.
53  *      3.  A \ followed by any character except a digit or ( )
54  *          matches that character.
55  *      4.  A nonempty string s bracketed [s] (or [^s]) matches any
56  *          character in (or not in) s. In s, \ has no special meaning,
57  *          and ] may only appear as the first letter. A substring 
58  *          a-b, with a and b in ascending ASCII order, stands for
59  *          the inclusive range of ASCII characters.
60  *      5.  A regular expression of form 1-4 followed by * matches a
61  *          sequence of 0 or more matches of the regular expression.
62  *      6.  A regular expression, x, of form 1-8, bracketed \(x\)
63  *          matches what x matches.
64  *      7.  A \ followed by a digit n matches a copy of the string that the
65  *          bracketed regular expression beginning with the nth \( matched.
66  *      8.  A regular expression of form 1-8, x, followed by a regular
67  *          expression of form 1-7, y matches a match for x followed by
68  *          a match for y, with the x match being as long as possible
69  *          while still permitting a y match.
70  *      9.  A regular expression of form 1-8 preceded by ^ (or followed
71  *          by $), is constrained to matches that begin at the left
72  *          (or end at the right) end of a line.
73  *      10. A regular expression of form 1-9 picks out the longest among
74  *          the leftmost matches in a line.
75  *      11. An empty regular expression stands for a copy of the last
76  *          regular expression encountered.
77  */
78
79 /*
80  * constants for re's
81  */
82 #define CBRA    1
83 #define CCHR    2
84 #define CDOT    4
85 #define CCL     6
86 #define NCCL    8
87 #define CDOL    10
88 #define CEOF    11
89 #define CKET    12
90 #define CBACK   18
91
92 #define CSTAR   01
93
94 #define ESIZE   512
95 #define NBRA    9
96
97 static  char    expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
98 static  char    circf;
99
100 /*
101  * compile the regular expression argument into a dfa
102  */
103 char *
104 re_comp(sp)
105         register char   *sp;
106 {
107         register int    c;
108         register char   *ep = expbuf;
109         int     cclcnt, numbra = 0;
110         char    *lastep = 0;
111         char    bracket[NBRA];
112         char    *bracketp = &bracket[0];
113         static  char    *retoolong = "Regular expression too long";
114
115 #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
116
117         if (sp == 0 || *sp == '\0') {
118                 if (*ep == 0)
119                         return("No previous regular expression");
120                 return(0);
121         }
122         if (*sp == '^') {
123                 circf = 1;
124                 sp++;
125         }
126         else
127                 circf = 0;
128         for (;;) {
129                 if (ep >= &expbuf[ESIZE])
130                         comerr(retoolong);
131                 if ((c = *sp++) == '\0') {
132                         if (bracketp != bracket)
133                                 comerr("unmatched \\(");
134                         *ep++ = CEOF;
135                         *ep++ = 0;
136                         return(0);
137                 }
138                 if (c != '*')
139                         lastep = ep;
140                 switch (c) {
141
142                 case '.':
143                         *ep++ = CDOT;
144                         continue;
145
146                 case '*':
147                         if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
148                                 goto defchar;
149                         *lastep |= CSTAR;
150                         continue;
151
152                 case '$':
153                         if (*sp != '\0')
154                                 goto defchar;
155                         *ep++ = CDOL;
156                         continue;
157
158                 case '[':
159                         *ep++ = CCL;
160                         *ep++ = 0;
161                         cclcnt = 1;
162                         if ((c = *sp++) == '^') {
163                                 c = *sp++;
164                                 ep[-2] = NCCL;
165                         }
166                         do {
167                                 if (c == '\0')
168                                         comerr("missing ]");
169                                 if (c == '-' && ep [-1] != 0) {
170                                         if ((c = *sp++) == ']') {
171                                                 *ep++ = '-';
172                                                 cclcnt++;
173                                                 break;
174                                         }
175                                         while (ep[-1] < c) {
176                                                 *ep = ep[-1] + 1;
177                                                 ep++;
178                                                 cclcnt++;
179                                                 if (ep >= &expbuf[ESIZE])
180                                                         comerr(retoolong);
181                                         }
182                                 }
183                                 *ep++ = c;
184                                 cclcnt++;
185                                 if (ep >= &expbuf[ESIZE])
186                                         comerr(retoolong);
187                         } while ((c = *sp++) != ']');
188                         lastep[1] = cclcnt;
189                         continue;
190
191                 case '\\':
192                         if ((c = *sp++) == '(') {
193                                 if (numbra >= NBRA)
194                                         comerr("too many \\(\\) pairs");
195                                 *bracketp++ = numbra;
196                                 *ep++ = CBRA;
197                                 *ep++ = numbra++;
198                                 continue;
199                         }
200                         if (c == ')') {
201                                 if (bracketp <= bracket)
202                                         comerr("unmatched \\)");
203                                 *ep++ = CKET;
204                                 *ep++ = *--bracketp;
205                                 continue;
206                         }
207                         if (c >= '1' && c < ('1' + NBRA)) {
208                                 *ep++ = CBACK;
209                                 *ep++ = c - '1';
210                                 continue;
211                         }
212                         *ep++ = CCHR;
213                         *ep++ = c;
214                         continue;
215
216                 defchar:
217                 default:
218                         *ep++ = CCHR;
219                         *ep++ = c;
220                 }
221         }
222 }
223
224 /* 
225  * match the argument string against the compiled re
226  */
227 int
228 re_exec(p1)
229         register char   *p1;
230 {
231         register char   *p2 = expbuf;
232         register int    c;
233         int     rv;
234
235         for (c = 0; c < NBRA; c++) {
236                 braslist[c] = 0;
237                 braelist[c] = 0;
238         }
239         if (circf)
240                 return((advance(p1, p2)));
241         /*
242          * fast check for first character
243          */
244         if (*p2 == CCHR) {
245                 c = p2[1];
246                 do {
247                         if (*p1 != c)
248                                 continue;
249                         if (rv = advance(p1, p2))
250                                 return(rv);
251                 } while (*p1++);
252                 return(0);
253         }
254         /*
255          * regular algorithm
256          */
257         do
258                 if (rv = advance(p1, p2))
259                         return(rv);
260         while (*p1++);
261         return(0);
262 }
263
264 /* 
265  * try to match the next thing in the dfa
266  */
267 static  int
268 advance(lp, ep)
269         register char   *lp, *ep;
270 {
271         register char   *curlp;
272         int     ct, i;
273         int     rv;
274
275         for (;;)
276                 switch (*ep++) {
277
278                 case CCHR:
279                         if (*ep++ == *lp++)
280                                 continue;
281                         return(0);
282
283                 case CDOT:
284                         if (*lp++)
285                                 continue;
286                         return(0);
287
288                 case CDOL:
289                         if (*lp == '\0')
290                                 continue;
291                         return(0);
292
293                 case CEOF:
294                         return(1);
295
296                 case CCL:
297                         if (cclass(ep, *lp++, 1)) {
298                                 ep += *ep;
299                                 continue;
300                         }
301                         return(0);
302
303                 case NCCL:
304                         if (cclass(ep, *lp++, 0)) {
305                                 ep += *ep;
306                                 continue;
307                         }
308                         return(0);
309
310                 case CBRA:
311                         braslist[*ep++] = lp;
312                         continue;
313
314                 case CKET:
315                         braelist[*ep++] = lp;
316                         continue;
317
318                 case CBACK:
319                         if (braelist[i = *ep++] == 0)
320                                 return(-1);
321                         if (backref(i, lp)) {
322                                 lp += braelist[i] - braslist[i];
323                                 continue;
324                         }
325                         return(0);
326
327                 case CBACK|CSTAR:
328                         if (braelist[i = *ep++] == 0)
329                                 return(-1);
330                         curlp = lp;
331                         ct = braelist[i] - braslist[i];
332                         while (backref(i, lp))
333                                 lp += ct;
334                         while (lp >= curlp) {
335                                 if (rv = advance(lp, ep))
336                                         return(rv);
337                                 lp -= ct;
338                         }
339                         continue;
340
341                 case CDOT|CSTAR:
342                         curlp = lp;
343                         while (*lp++)
344                                 ;
345                         goto star;
346
347                 case CCHR|CSTAR:
348                         curlp = lp;
349                         while (*lp++ == *ep)
350                                 ;
351                         ep++;
352                         goto star;
353
354                 case CCL|CSTAR:
355                 case NCCL|CSTAR:
356                         curlp = lp;
357                         while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
358                                 ;
359                         ep += *ep;
360                         goto star;
361
362                 star:
363                         do {
364                                 lp--;
365                                 if (rv = advance(lp, ep))
366                                         return(rv);
367                         } while (lp > curlp);
368                         return(0);
369
370                 default:
371                         return(-1);
372                 }
373 }
374
375 static
376 backref(i, lp)
377         register int    i;
378         register char   *lp;
379 {
380         register char   *bp;
381
382         bp = braslist[i];
383         while (*bp++ == *lp++)
384                 if (bp >= braelist[i])
385                         return(1);
386         return(0);
387 }
388
389 static int
390 cclass(set, c, af)
391         register char   *set, c;
392         int     af;
393 {
394         register int    n;
395
396         if (c == 0)
397                 return(0);
398         n = *set++;
399         while (--n)
400                 if (*set++ == c)
401                         return(af);
402         return(! af);
403 }