afsconfig-and-rcsid-all-around-20010705
[openafs.git] / src / bucoord / regex.c
1 /*
2  * Copyright 2000, International Business Machines Corporation and others.
3  * All Rights Reserved.
4  * 
5  * This software has been released under the terms of the IBM Public
6  * License.  For details, see the LICENSE file in the top-level source
7  * directory or online at http://www.openafs.org/dl/license10.html
8  */
9
10 /*
11  * routines to do regular expression matching
12  *
13  * Entry points:
14  *
15  *      re_comp(s)
16  *              char *s;
17  *       ... returns 0 if the string s was compiled successfully,
18  *                   a pointer to an error message otherwise.
19  *           If passed 0 or a null string returns without changing
20  *           the currently compiled re (see note 11 below).
21  *
22  *      re_exec(s)
23  *              char *s;
24  *       ... returns 1 if the string s matches the last compiled regular
25  *                     expression, 
26  *                   0 if the string s failed to match the last compiled
27  *                     regular expression, and
28  *                  -1 if the compiled regular expression was invalid 
29  *                     (indicating an internal error).
30  *
31  * The strings passed to both re_comp and re_exec may have trailing or
32  * embedded newline characters; they are terminated by nulls.
33  *
34  * The identity of the author of these routines is lost in antiquity;
35  * this is essentially the same as the re code in the original V6 ed.
36  *
37  * The regular expressions recognized are described below. This description
38  * is essentially the same as that for ed.
39  *
40  *      A regular expression specifies a set of strings of characters.
41  *      A member of this set of strings is said to be matched by
42  *      the regular expression.  In the following specification for
43  *      regular expressions the word `character' means any character but NUL.
44  *
45  *      1.  Any character except a special character matches itself.
46  *          Special characters are the regular expression delimiter plus
47  *          \ [ . and sometimes ^ * $.
48  *      2.  A . matches any character.
49  *      3.  A \ followed by any character except a digit or ( )
50  *          matches that character.
51  *      4.  A nonempty string s bracketed [s] (or [^s]) matches any
52  *          character in (or not in) s. In s, \ has no special meaning,
53  *          and ] may only appear as the first letter. A substring 
54  *          a-b, with a and b in ascending ASCII order, stands for
55  *          the inclusive range of ASCII characters.
56  *      5.  A regular expression of form 1-4 followed by * matches a
57  *          sequence of 0 or more matches of the regular expression.
58  *      6.  A regular expression, x, of form 1-8, bracketed \(x\)
59  *          matches what x matches.
60  *      7.  A \ followed by a digit n matches a copy of the string that the
61  *          bracketed regular expression beginning with the nth \( matched.
62  *      8.  A regular expression of form 1-8, x, followed by a regular
63  *          expression of form 1-7, y matches a match for x followed by
64  *          a match for y, with the x match being as long as possible
65  *          while still permitting a y match.
66  *      9.  A regular expression of form 1-8 preceded by ^ (or followed
67  *          by $), is constrained to matches that begin at the left
68  *          (or end at the right) end of a line.
69  *      10. A regular expression of form 1-9 picks out the longest among
70  *          the leftmost matches in a line.
71  *      11. An empty regular expression stands for a copy of the last
72  *          regular expression encountered.
73  */
74
75 /*
76  * constants for re's
77  */
78 #define CBRA    1
79 #define CCHR    2
80 #define CDOT    4
81 #define CCL     6
82 #define NCCL    8
83 #define CDOL    10
84 #define CEOF    11
85 #define CKET    12
86 #define CBACK   18
87
88 #define CSTAR   01
89
90 #define ESIZE   512
91 #define NBRA    9
92
93 static  char    expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
94 static  char    circf;
95
96 /*
97  * compile the regular expression argument into a dfa
98  */
99 char *
100 re_comp(sp)
101         register char   *sp;
102 {
103         register int    c;
104         register char   *ep = expbuf;
105         int     cclcnt, numbra = 0;
106         char    *lastep = 0;
107         char    bracket[NBRA];
108         char    *bracketp = &bracket[0];
109         static  char    *retoolong = "Regular expression too long";
110
111 #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
112
113         if (sp == 0 || *sp == '\0') {
114                 if (*ep == 0)
115                         return("No previous regular expression");
116                 return(0);
117         }
118         if (*sp == '^') {
119                 circf = 1;
120                 sp++;
121         }
122         else
123                 circf = 0;
124         for (;;) {
125                 if (ep >= &expbuf[ESIZE])
126                         comerr(retoolong);
127                 if ((c = *sp++) == '\0') {
128                         if (bracketp != bracket)
129                                 comerr("unmatched \\(");
130                         *ep++ = CEOF;
131                         *ep++ = 0;
132                         return(0);
133                 }
134                 if (c != '*')
135                         lastep = ep;
136                 switch (c) {
137
138                 case '.':
139                         *ep++ = CDOT;
140                         continue;
141
142                 case '*':
143                         if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
144                                 goto defchar;
145                         *lastep |= CSTAR;
146                         continue;
147
148                 case '$':
149                         if (*sp != '\0')
150                                 goto defchar;
151                         *ep++ = CDOL;
152                         continue;
153
154                 case '[':
155                         *ep++ = CCL;
156                         *ep++ = 0;
157                         cclcnt = 1;
158                         if ((c = *sp++) == '^') {
159                                 c = *sp++;
160                                 ep[-2] = NCCL;
161                         }
162                         do {
163                                 if (c == '\0')
164                                         comerr("missing ]");
165                                 if (c == '-' && ep [-1] != 0) {
166                                         if ((c = *sp++) == ']') {
167                                                 *ep++ = '-';
168                                                 cclcnt++;
169                                                 break;
170                                         }
171                                         while (ep[-1] < c) {
172                                                 *ep = ep[-1] + 1;
173                                                 ep++;
174                                                 cclcnt++;
175                                                 if (ep >= &expbuf[ESIZE])
176                                                         comerr(retoolong);
177                                         }
178                                 }
179                                 *ep++ = c;
180                                 cclcnt++;
181                                 if (ep >= &expbuf[ESIZE])
182                                         comerr(retoolong);
183                         } while ((c = *sp++) != ']');
184                         lastep[1] = cclcnt;
185                         continue;
186
187                 case '\\':
188                         if ((c = *sp++) == '(') {
189                                 if (numbra >= NBRA)
190                                         comerr("too many \\(\\) pairs");
191                                 *bracketp++ = numbra;
192                                 *ep++ = CBRA;
193                                 *ep++ = numbra++;
194                                 continue;
195                         }
196                         if (c == ')') {
197                                 if (bracketp <= bracket)
198                                         comerr("unmatched \\)");
199                                 *ep++ = CKET;
200                                 *ep++ = *--bracketp;
201                                 continue;
202                         }
203                         if (c >= '1' && c < ('1' + NBRA)) {
204                                 *ep++ = CBACK;
205                                 *ep++ = c - '1';
206                                 continue;
207                         }
208                         *ep++ = CCHR;
209                         *ep++ = c;
210                         continue;
211
212                 defchar:
213                 default:
214                         *ep++ = CCHR;
215                         *ep++ = c;
216                 }
217         }
218 }
219
220 /* 
221  * match the argument string against the compiled re
222  */
223 int
224 re_exec(p1)
225         register char   *p1;
226 {
227         register char   *p2 = expbuf;
228         register int    c;
229         int     rv;
230
231         for (c = 0; c < NBRA; c++) {
232                 braslist[c] = 0;
233                 braelist[c] = 0;
234         }
235         if (circf)
236                 return((advance(p1, p2)));
237         /*
238          * fast check for first character
239          */
240         if (*p2 == CCHR) {
241                 c = p2[1];
242                 do {
243                         if (*p1 != c)
244                                 continue;
245                         if (rv = advance(p1, p2))
246                                 return(rv);
247                 } while (*p1++);
248                 return(0);
249         }
250         /*
251          * regular algorithm
252          */
253         do
254                 if (rv = advance(p1, p2))
255                         return(rv);
256         while (*p1++);
257         return(0);
258 }
259
260 /* 
261  * try to match the next thing in the dfa
262  */
263 static  int
264 advance(lp, ep)
265         register char   *lp, *ep;
266 {
267         register char   *curlp;
268         int     ct, i;
269         int     rv;
270
271         for (;;)
272                 switch (*ep++) {
273
274                 case CCHR:
275                         if (*ep++ == *lp++)
276                                 continue;
277                         return(0);
278
279                 case CDOT:
280                         if (*lp++)
281                                 continue;
282                         return(0);
283
284                 case CDOL:
285                         if (*lp == '\0')
286                                 continue;
287                         return(0);
288
289                 case CEOF:
290                         return(1);
291
292                 case CCL:
293                         if (cclass(ep, *lp++, 1)) {
294                                 ep += *ep;
295                                 continue;
296                         }
297                         return(0);
298
299                 case NCCL:
300                         if (cclass(ep, *lp++, 0)) {
301                                 ep += *ep;
302                                 continue;
303                         }
304                         return(0);
305
306                 case CBRA:
307                         braslist[*ep++] = lp;
308                         continue;
309
310                 case CKET:
311                         braelist[*ep++] = lp;
312                         continue;
313
314                 case CBACK:
315                         if (braelist[i = *ep++] == 0)
316                                 return(-1);
317                         if (backref(i, lp)) {
318                                 lp += braelist[i] - braslist[i];
319                                 continue;
320                         }
321                         return(0);
322
323                 case CBACK|CSTAR:
324                         if (braelist[i = *ep++] == 0)
325                                 return(-1);
326                         curlp = lp;
327                         ct = braelist[i] - braslist[i];
328                         while (backref(i, lp))
329                                 lp += ct;
330                         while (lp >= curlp) {
331                                 if (rv = advance(lp, ep))
332                                         return(rv);
333                                 lp -= ct;
334                         }
335                         continue;
336
337                 case CDOT|CSTAR:
338                         curlp = lp;
339                         while (*lp++)
340                                 ;
341                         goto star;
342
343                 case CCHR|CSTAR:
344                         curlp = lp;
345                         while (*lp++ == *ep)
346                                 ;
347                         ep++;
348                         goto star;
349
350                 case CCL|CSTAR:
351                 case NCCL|CSTAR:
352                         curlp = lp;
353                         while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
354                                 ;
355                         ep += *ep;
356                         goto star;
357
358                 star:
359                         do {
360                                 lp--;
361                                 if (rv = advance(lp, ep))
362                                         return(rv);
363                         } while (lp > curlp);
364                         return(0);
365
366                 default:
367                         return(-1);
368                 }
369 }
370
371 static
372 backref(i, lp)
373         register int    i;
374         register char   *lp;
375 {
376         register char   *bp;
377
378         bp = braslist[i];
379         while (*bp++ == *lp++)
380                 if (bp >= braelist[i])
381                         return(1);
382         return(0);
383 }
384
385 static int
386 cclass(set, c, af)
387         register char   *set, c;
388         int     af;
389 {
390         register int    n;
391
392         if (c == 0)
393                 return(0);
394         n = *set++;
395         while (--n)
396                 if (*set++ == c)
397                         return(af);
398         return(! af);
399 }