src/bucoord/regex.c

   1 /*
   2  * Copyright 2000, International Business Machines Corporation and others.
   3  * All Rights Reserved.
   4  *
   5  * This software has been released under the terms of the IBM Public
   6  * License.  For details, see the LICENSE file in the top-level source
   7  * directory or online at http://www.openafs.org/dl/license10.html
   8  */
   9
  10 /*
  11  * routines to do regular expression matching
  12  *
  13  * Entry points:
  14  *
  15  *      re_comp(s)
  16  *              char *s;
  17  *       ... returns 0 if the string s was compiled successfully,
  18  *                   a pointer to an error message otherwise.
  19  *           If passed 0 or a null string returns without changing
  20  *           the currently compiled re (see note 11 below).
  21  *
  22  *      re_exec(s)
  23  *              char *s;
  24  *       ... returns 1 if the string s matches the last compiled regular
  25  *                     expression,
  26  *                   0 if the string s failed to match the last compiled
  27  *                     regular expression, and
  28  *                  -1 if the compiled regular expression was invalid
  29  *                     (indicating an internal error).
  30  *
  31  * The strings passed to both re_comp and re_exec may have trailing or
  32  * embedded newline characters; they are terminated by nulls.
  33  *
  34  * The identity of the author of these routines is lost in antiquity;
  35  * this is essentially the same as the re code in the original V6 ed.
  36  *
  37  * The regular expressions recognized are described below. This description
  38  * is essentially the same as that for ed.
  39  *
  40  *      A regular expression specifies a set of strings of characters.
  41  *      A member of this set of strings is said to be matched by
  42  *      the regular expression.  In the following specification for
  43  *      regular expressions the word `character' means any character but NUL.
  44  *
  45  *      1.  Any character except a special character matches itself.
  46  *          Special characters are the regular expression delimiter plus
  47  *          \ [ . and sometimes ^ * $.
  48  *      2.  A . matches any character.
  49  *      3.  A \ followed by any character except a digit or ( )
  50  *          matches that character.
  51  *      4.  A nonempty string s bracketed [s] (or [^s]) matches any
  52  *          character in (or not in) s. In s, \ has no special meaning,
  53  *          and ] may only appear as the first letter. A substring
  54  *          a-b, with a and b in ascending ASCII order, stands for
  55  *          the inclusive range of ASCII characters.
  56  *      5.  A regular expression of form 1-4 followed by * matches a
  57  *          sequence of 0 or more matches of the regular expression.
  58  *      6.  A regular expression, x, of form 1-8, bracketed \(x\)
  59  *          matches what x matches.
  60  *      7.  A \ followed by a digit n matches a copy of the string that the
  61  *          bracketed regular expression beginning with the nth \( matched.
  62  *      8.  A regular expression of form 1-8, x, followed by a regular
  63  *          expression of form 1-7, y matches a match for x followed by
  64  *          a match for y, with the x match being as long as possible
  65  *          while still permitting a y match.
  66  *      9.  A regular expression of form 1-8 preceded by ^ (or followed
  67  *          by $), is constrained to matches that begin at the left
  68  *          (or end at the right) end of a line.
  69  *      10. A regular expression of form 1-9 picks out the longest among
  70  *          the leftmost matches in a line.
  71  *      11. An empty regular expression stands for a copy of the last
  72  *          regular expression encountered.
  73  */
  74
  75 /*
  76  * constants for re's
  77  */
  78 #define CBRA    1
  79 #define CCHR    2
  80 #define CDOT    4
  81 #define CCL     6
  82 #define NCCL    8
  83 #define CDOL    10
  84 #define CEOF    11
  85 #define CKET    12
  86 #define CBACK   18
  87
  88 #define CSTAR   01
  89
  90 #define ESIZE   512
  91 #define NBRA    9
  92
  93 static  char    expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
  94 static  char    circf;
  95
  96 /*
  97  * compile the regular expression argument into a dfa
  98  */
  99 char *
 100 re_comp(sp)
 101         register char   *sp;
 102 {
 103         register int    c;
 104         register char   *ep = expbuf;
 105         int     cclcnt, numbra = 0;
 106         char    *lastep = 0;
 107         char    bracket[NBRA];
 108         char    *bracketp = &bracket[0];
 109         static  char    *retoolong = "Regular expression too long";
 110
 111 #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
 112
 113         if (sp == 0 || *sp == '\0') {
 114                 if (*ep == 0)
 115                         return("No previous regular expression");
 116                 return(0);
 117         }
 118         if (*sp == '^') {
 119                 circf = 1;
 120                 sp++;
 121         }
 122         else
 123                 circf = 0;
 124         for (;;) {
 125                 if (ep >= &expbuf[ESIZE])
 126                         comerr(retoolong);
 127                 if ((c = *sp++) == '\0') {
 128                         if (bracketp != bracket)
 129                                 comerr("unmatched \\(");
 130                         *ep++ = CEOF;
 131                         *ep++ = 0;
 132                         return(0);
 133                 }
 134                 if (c != '*')
 135                         lastep = ep;
 136                 switch (c) {
 137
 138                 case '.':
 139                         *ep++ = CDOT;
 140                         continue;
 141
 142                 case '*':
 143                         if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
 144                                 goto defchar;
 145                         *lastep |= CSTAR;
 146                         continue;
 147
 148                 case '$':
 149                         if (*sp != '\0')
 150                                 goto defchar;
 151                         *ep++ = CDOL;
 152                         continue;
 153
 154                 case '[':
 155                         *ep++ = CCL;
 156                         *ep++ = 0;
 157                         cclcnt = 1;
 158                         if ((c = *sp++) == '^') {
 159                                 c = *sp++;
 160                                 ep[-2] = NCCL;
 161                         }
 162                         do {
 163                                 if (c == '\0')
 164                                         comerr("missing ]");
 165                                 if (c == '-' && ep [-1] != 0) {
 166                                         if ((c = *sp++) == ']') {
 167                                                 *ep++ = '-';
 168                                                 cclcnt++;
 169                                                 break;
 170                                         }
 171                                         while (ep[-1] < c) {
 172                                                 *ep = ep[-1] + 1;
 173                                                 ep++;
 174                                                 cclcnt++;
 175                                                 if (ep >= &expbuf[ESIZE])
 176                                                         comerr(retoolong);
 177                                         }
 178                                 }
 179                                 *ep++ = c;
 180                                 cclcnt++;
 181                                 if (ep >= &expbuf[ESIZE])
 182                                         comerr(retoolong);
 183                         } while ((c = *sp++) != ']');
 184                         lastep[1] = cclcnt;
 185                         continue;
 186
 187                 case '\\':
 188                         if ((c = *sp++) == '(') {
 189                                 if (numbra >= NBRA)
 190                                         comerr("too many \\(\\) pairs");
 191                                 *bracketp++ = numbra;
 192                                 *ep++ = CBRA;
 193                                 *ep++ = numbra++;
 194                                 continue;
 195                         }
 196                         if (c == ')') {
 197                                 if (bracketp <= bracket)
 198                                         comerr("unmatched \\)");
 199                                 *ep++ = CKET;
 200                                 *ep++ = *--bracketp;
 201                                 continue;
 202                         }
 203                         if (c >= '1' && c < ('1' + NBRA)) {
 204                                 *ep++ = CBACK;
 205                                 *ep++ = c - '1';
 206                                 continue;
 207                         }
 208                         *ep++ = CCHR;
 209                         *ep++ = c;
 210                         continue;
 211
 212                 defchar:
 213                 default:
 214                         *ep++ = CCHR;
 215                         *ep++ = c;
 216                 }
 217         }
 218 }
 219
 220 /*
 221  * match the argument string against the compiled re
 222  */
 223 int
 224 re_exec(p1)
 225         register char   *p1;
 226 {
 227         register char   *p2 = expbuf;
 228         register int    c;
 229         int     rv;
 230
 231         for (c = 0; c < NBRA; c++) {
 232                 braslist[c] = 0;
 233                 braelist[c] = 0;
 234         }
 235         if (circf)
 236                 return((advance(p1, p2)));
 237         /*
 238          * fast check for first character
 239          */
 240         if (*p2 == CCHR) {
 241                 c = p2[1];
 242                 do {
 243                         if (*p1 != c)
 244                                 continue;
 245                         if (rv = advance(p1, p2))
 246                                 return(rv);
 247                 } while (*p1++);
 248                 return(0);
 249         }
 250         /*
 251          * regular algorithm
 252          */
 253         do
 254                 if (rv = advance(p1, p2))
 255                         return(rv);
 256         while (*p1++);
 257         return(0);
 258 }
 259
 260 /*
 261  * try to match the next thing in the dfa
 262  */
 263 static  int
 264 advance(lp, ep)
 265         register char   *lp, *ep;
 266 {
 267         register char   *curlp;
 268         int     ct, i;
 269         int     rv;
 270
 271         for (;;)
 272                 switch (*ep++) {
 273
 274                 case CCHR:
 275                         if (*ep++ == *lp++)
 276                                 continue;
 277                         return(0);
 278
 279                 case CDOT:
 280                         if (*lp++)
 281                                 continue;
 282                         return(0);
 283
 284                 case CDOL:
 285                         if (*lp == '\0')
 286                                 continue;
 287                         return(0);
 288
 289                 case CEOF:
 290                         return(1);
 291
 292                 case CCL:
 293                         if (cclass(ep, *lp++, 1)) {
 294                                 ep += *ep;
 295                                 continue;
 296                         }
 297                         return(0);
 298
 299                 case NCCL:
 300                         if (cclass(ep, *lp++, 0)) {
 301                                 ep += *ep;
 302                                 continue;
 303                         }
 304                         return(0);
 305
 306                 case CBRA:
 307                         braslist[*ep++] = lp;
 308                         continue;
 309
 310                 case CKET:
 311                         braelist[*ep++] = lp;
 312                         continue;
 313
 314                 case CBACK:
 315                         if (braelist[i = *ep++] == 0)
 316                                 return(-1);
 317                         if (backref(i, lp)) {
 318                                 lp += braelist[i] - braslist[i];
 319                                 continue;
 320                         }
 321                         return(0);
 322
 323                 case CBACK|CSTAR:
 324                         if (braelist[i = *ep++] == 0)
 325                                 return(-1);
 326                         curlp = lp;
 327                         ct = braelist[i] - braslist[i];
 328                         while (backref(i, lp))
 329                                 lp += ct;
 330                         while (lp >= curlp) {
 331                                 if (rv = advance(lp, ep))
 332                                         return(rv);
 333                                 lp -= ct;
 334                         }
 335                         continue;
 336
 337                 case CDOT|CSTAR:
 338                         curlp = lp;
 339                         while (*lp++)
 340                                 ;
 341                         goto star;
 342
 343                 case CCHR|CSTAR:
 344                         curlp = lp;
 345                         while (*lp++ == *ep)
 346                                 ;
 347                         ep++;
 348                         goto star;
 349
 350                 case CCL|CSTAR:
 351                 case NCCL|CSTAR:
 352                         curlp = lp;
 353                         while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
 354                                 ;
 355                         ep += *ep;
 356                         goto star;
 357
 358                 star:
 359                         do {
 360                                 lp--;
 361                                 if (rv = advance(lp, ep))
 362                                         return(rv);
 363                         } while (lp > curlp);
 364                         return(0);
 365
 366                 default:
 367                         return(-1);
 368                 }
 369 }
 370
 371 static
 372 backref(i, lp)
 373         register int    i;
 374         register char   *lp;
 375 {
 376         register char   *bp;
 377
 378         bp = braslist[i];
 379         while (*bp++ == *lp++)
 380                 if (bp >= braelist[i])
 381                         return(1);
 382         return(0);
 383 }
 384
 385 static int
 386 cclass(set, c, af)
 387         register char   *set, c;
 388         int     af;
 389 {
 390         register int    n;
 391
 392         if (c == 0)
 393                 return(0);
 394         n = *set++;
 395         while (--n)
 396                 if (*set++ == c)
 397                         return(af);
 398         return(! af);
 399 }