compile-snprintf-for-solaris25-20010430
[openafs.git] / src / util / regex.c
1 /*
2  * Copyright 2000, International Business Machines Corporation and others.
3  * All Rights Reserved.
4  * 
5  * This software has been released under the terms of the IBM Public
6  * License.  For details, see the LICENSE file in the top-level source
7  * directory or online at http://www.openafs.org/dl/license10.html
8  */
9
10 /*
11  * All Rights Reserved
12  */
13 /*
14  * HISTORY
15  * Revision 1.1.7.2  1994/08/25  17:34:40
16  *      Added s12y calls
17  *      [1994/08/25  17:23:12]
18  *
19  * Revision 1.1.7.1  1994/06/09  13:50:35
20  *      fixed copyright in src/file
21  *      [1994/06/08  21:23:56]
22  * 
23  * Revision 1.1.5.3  1993/01/18  19:51:30
24  *      Embedded copyright notice
25  *      [1993/01/18  19:26:02
26  * 
27  * Revision 1.1.5.2  1992/11/24  15:37:53
28  *      Change include file install directory from .../afs to .../dcedfs.
29  *      [1992/11/22  16:05:26]
30  * 
31  * Revision 1.1.3.3  1992/01/24  01:48:02
32  *      Merging dfs6.3
33  *      [1992/01/24  00:02:28]
34  * 
35  * Revision 1.1.3.2  1992/01/23  19:03:28
36  *      Moving files onto the branch for dfs6.3 merge
37  *      [1992/01/23  18:19:02]
38  *      Revision 1.1.1.2  1992/01/22  19:29:25
39  *      Bring in dfs6.3 sources
40  * 
41  * Revision 1.1  1992/01/19  02:56:43
42  *      Initial revision
43  * 
44  * $EndLog$
45  */
46 /*
47  * regex.c -- regular expression patter matching functions
48  * 
49  */
50
51 #include <afs/param.h>
52 /*
53  * routines to do regular expression matching
54  *
55  * Entry points:
56  *
57  *      re_comp(s)
58  *              char *s;
59  *       ... returns 0 if the string s was compiled successfully,
60  *                   a pointer to an error message otherwise.
61  *           If passed 0 or a null string returns without changing
62  *           the currently compiled re (see note 11 below).
63  *
64  *      re_exec(s)
65  *              char *s;
66  *       ... returns 1 if the string s matches the last compiled regular
67  *                     expression, 
68  *                   0 if the string s failed to match the last compiled
69  *                     regular expression, and
70  *                  -1 if the compiled regular expression was invalid 
71  *                     (indicating an internal error).
72  *
73  * The strings passed to both re_comp and re_exec may have trailing or
74  * embedded newline characters; they are terminated by nulls.
75  *
76  * The identity of the author of these routines is lost in antiquity;
77  * this is essentially the same as the re code in the original V6 ed.
78  *
79  * The regular expressions recognized are described below. This description
80  * is essentially the same as that for ed.
81  *
82  *      A regular expression specifies a set of strings of characters.
83  *      A member of this set of strings is said to be matched by
84  *      the regular expression.  In the following specification for
85  *      regular expressions the word `character' means any character but NUL.
86  *
87  *      1.  Any character except a special character matches itself.
88  *          Special characters are the regular expression delimiter plus
89  *          \ [ . and sometimes ^ * $.
90  *      2.  A . matches any character.
91  *      3.  A \ followed by any character except a digit or ( )
92  *          matches that character.
93  *      4.  A nonempty string s bracketed [s] (or [^s]) matches any
94  *          character in (or not in) s. In s, \ has no special meaning,
95  *          and ] may only appear as the first letter. A substring 
96  *          a-b, with a and b in ascending ASCII order, stands for
97  *          the inclusive range of ASCII characters.
98  *      5.  A regular expression of form 1-4 followed by * matches a
99  *          sequence of 0 or more matches of the regular expression.
100  *      6.  A regular expression, x, of form 1-8, bracketed \(x\)
101  *          matches what x matches.
102  *      7.  A \ followed by a digit n matches a copy of the string that the
103  *          bracketed regular expression beginning with the nth \( matched.
104  *      8.  A regular expression of form 1-8, x, followed by a regular
105  *          expression of form 1-7, y matches a match for x followed by
106  *          a match for y, with the x match being as long as possible
107  *          while still permitting a y match.
108  *      9.  A regular expression of form 1-8 preceded by ^ (or followed
109  *          by $), is constrained to matches that begin at the left
110  *          (or end at the right) end of a line.
111  *      10. A regular expression of form 1-9 picks out the longest among
112  *          the leftmost matches in a line.
113  *      11. An empty regular expression stands for a copy of the last
114  *          regular expression encountered.
115  */
116
117 /*
118  * constants for re's
119  */
120 #define CBRA    1
121 #define CCHR    2
122 #define CDOT    4
123 #define CCL     6
124 #define NCCL    8
125 #define CDOL    10
126 #define CEOF    11
127 #define CKET    12
128 #define CBACK   18
129
130 #define CSTAR   01
131
132 #define ESIZE   512
133 #define NBRA    9
134
135 static  char    expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
136 static  char    circf;
137
138 /* forward defs
139 */
140
141 static  int advance();
142 static  int backref();
143 static  int cclass();
144
145
146 /*
147  * compile the regular expression argument into a dfa
148  */
149 char * re_comp(register char *sp)
150 {
151         register int    c;
152         register char   *ep = expbuf;
153         int     cclcnt, numbra = 0;
154         char    *lastep = 0;
155         char    bracket[NBRA];
156         char    *bracketp = &bracket[0];
157         static  char    *retoolong = "Regular expression too long";
158
159 #define comperr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
160
161         if (sp == 0 || *sp == '\0') {
162                 if (*ep == 0)
163                         return("No previous regular expression");
164                 return(0);
165         }
166         if (*sp == '^') {
167                 circf = 1;
168                 sp++;
169         }
170         else
171                 circf = 0;
172         for (;;) {
173                 if (ep >= &expbuf[ESIZE])
174                         comperr(retoolong);
175                 if ((c = *sp++) == '\0') {
176                         if (bracketp != bracket)
177                                 comperr("unmatched \\(");
178                         *ep++ = CEOF;
179                         *ep++ = 0;
180                         return(0);
181                 }
182                 if (c != '*')
183                         lastep = ep;
184                 switch (c) {
185
186                 case '.':
187                         *ep++ = CDOT;
188                         continue;
189
190                 case '*':
191                         if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
192                                 goto defchar;
193                         *lastep |= CSTAR;
194                         continue;
195
196                 case '$':
197                         if (*sp != '\0')
198                                 goto defchar;
199                         *ep++ = CDOL;
200                         continue;
201
202                 case '[':
203                         *ep++ = CCL;
204                         *ep++ = 0;
205                         cclcnt = 1;
206                         if ((c = *sp++) == '^') {
207                                 c = *sp++;
208                                 ep[-2] = NCCL;
209                         }
210                         do {
211                                 if (c == '\0')
212                                         comperr("missing ]");
213                                 if (c == '-' && ep [-1] != 0) {
214                                         if ((c = *sp++) == ']') {
215                                                 *ep++ = '-';
216                                                 cclcnt++;
217                                                 break;
218                                         }
219                                         while (ep[-1] < c) {
220                                                 *ep = ep[-1] + 1;
221                                                 ep++;
222                                                 cclcnt++;
223                                                 if (ep >= &expbuf[ESIZE])
224                                                         comperr(retoolong);
225                                         }
226                                 }
227                                 *ep++ = c;
228                                 cclcnt++;
229                                 if (ep >= &expbuf[ESIZE])
230                                         comperr(retoolong);
231                         } while ((c = *sp++) != ']');
232                         lastep[1] = cclcnt;
233                         continue;
234
235                 case '\\':
236                         if ((c = *sp++) == '(') {
237                                 if (numbra >= NBRA)
238                                         comperr("too many \\(\\) pairs");
239                                 *bracketp++ = numbra;
240                                 *ep++ = CBRA;
241                                 *ep++ = numbra++;
242                                 continue;
243                         }
244                         if (c == ')') {
245                                 if (bracketp <= bracket)
246                                         comperr("unmatched \\)");
247                                 *ep++ = CKET;
248                                 *ep++ = *--bracketp;
249                                 continue;
250                         }
251                         if (c >= '1' && c < ('1' + NBRA)) {
252                                 *ep++ = CBACK;
253                                 *ep++ = c - '1';
254                                 continue;
255                         }
256                         *ep++ = CCHR;
257                         *ep++ = c;
258                         continue;
259
260                 defchar:
261                 default:
262                         *ep++ = CCHR;
263                         *ep++ = c;
264                 }
265         }
266 }
267
268 /* 
269  * match the argument string against the compiled re
270  */
271 int re_exec(register char *p1)
272 {
273         register char   *p2 = expbuf;
274         register int    c;
275         int     rv;
276
277         for (c = 0; c < NBRA; c++) {
278                 braslist[c] = 0;
279                 braelist[c] = 0;
280         }
281         if (circf)
282                 return((advance(p1, p2)));
283         /*
284          * fast check for first character
285          */
286         if (*p2 == CCHR) {
287                 c = p2[1];
288                 do {
289                         if (*p1 != c)
290                                 continue;
291                         if (rv = advance(p1, p2))
292                                 return(rv);
293                 } while (*p1++);
294                 return(0);
295         }
296         /*
297          * regular algorithm
298          */
299         do
300                 if (rv = advance(p1, p2))
301                         return(rv);
302         while (*p1++);
303         return(0);
304 }
305
306 /* 
307  * try to match the next thing in the dfa
308  */
309 static  int
310 advance(lp, ep)
311         register char   *lp, *ep;
312 {
313         register char   *curlp;
314         int     ct, i;
315         int     rv;
316
317         for (;;)
318                 switch (*ep++) {
319
320                 case CCHR:
321                         if (*ep++ == *lp++)
322                                 continue;
323                         return(0);
324
325                 case CDOT:
326                         if (*lp++)
327                                 continue;
328                         return(0);
329
330                 case CDOL:
331                         if (*lp == '\0')
332                                 continue;
333                         return(0);
334
335                 case CEOF:
336                         return(1);
337
338                 case CCL:
339                         if (cclass(ep, *lp++, 1)) {
340                                 ep += *ep;
341                                 continue;
342                         }
343                         return(0);
344
345                 case NCCL:
346                         if (cclass(ep, *lp++, 0)) {
347                                 ep += *ep;
348                                 continue;
349                         }
350                         return(0);
351
352                 case CBRA:
353                         braslist[*ep++] = lp;
354                         continue;
355
356                 case CKET:
357                         braelist[*ep++] = lp;
358                         continue;
359
360                 case CBACK:
361                         if (braelist[i = *ep++] == 0)
362                                 return(-1);
363                         if (backref(i, lp)) {
364                                 lp += braelist[i] - braslist[i];
365                                 continue;
366                         }
367                         return(0);
368
369                 case CBACK|CSTAR:
370                         if (braelist[i = *ep++] == 0)
371                                 return(-1);
372                         curlp = lp;
373                         ct = braelist[i] - braslist[i];
374                         while (backref(i, lp))
375                                 lp += ct;
376                         while (lp >= curlp) {
377                                 if (rv = advance(lp, ep))
378                                         return(rv);
379                                 lp -= ct;
380                         }
381                         continue;
382
383                 case CDOT|CSTAR:
384                         curlp = lp;
385                         while (*lp++)
386                                 ;
387                         goto star;
388
389                 case CCHR|CSTAR:
390                         curlp = lp;
391                         while (*lp++ == *ep)
392                                 ;
393                         ep++;
394                         goto star;
395
396                 case CCL|CSTAR:
397                 case NCCL|CSTAR:
398                         curlp = lp;
399                         while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
400                                 ;
401                         ep += *ep;
402                         goto star;
403
404                 star:
405                         do {
406                                 lp--;
407                                 if (rv = advance(lp, ep))
408                                         return(rv);
409                         } while (lp > curlp);
410                         return(0);
411
412                 default:
413                         return(-1);
414                 }
415 }
416
417 static int
418 backref(i, lp)
419         register int    i;
420         register char   *lp;
421 {
422         register char   *bp;
423
424         bp = braslist[i];
425         while (*bp++ == *lp++)
426                 if (bp >= braelist[i])
427                         return(1);
428         return(0);
429 }
430
431 static int
432 cclass(set, c, af)
433         register char   *set, c;
434         int     af;
435 {
436         register int    n;
437
438         if (c == 0)
439                 return(0);
440         n = *set++;
441         while (--n)
442                 if (*set++ == c)
443                         return(af);
444         return(! af);
445 }