/*
 * Copyright 2000, International Business Machines Corporation and others.
 * All Rights Reserved.
 * 
 * This software has been released under the terms of the IBM Public
 * License.  For details, see the LICENSE file in the top-level source
 * directory or online at http://www.openafs.org/dl/license10.html
 */

/*
 * regex.c -- regular expression patter matching functions
 * 
 */

#include <afsconfig.h>
#include <afs/param.h>

RCSID
    ("$Header$");

/*
 * routines to do regular expression matching
 *
 * Entry points:
 *
 *	re_comp(s)
 *		char *s;
 *	 ... returns 0 if the string s was compiled successfully,
 *		     a pointer to an error message otherwise.
 *	     If passed 0 or a null string returns without changing
 *           the currently compiled re (see note 11 below).
 *
 *	re_exec(s)
 *		char *s;
 *	 ... returns 1 if the string s matches the last compiled regular
 *		       expression, 
 *		     0 if the string s failed to match the last compiled
 *		       regular expression, and
 *		    -1 if the compiled regular expression was invalid 
 *		       (indicating an internal error).
 *
 * The strings passed to both re_comp and re_exec may have trailing or
 * embedded newline characters; they are terminated by nulls.
 *
 * The identity of the author of these routines is lost in antiquity;
 * this is essentially the same as the re code in the original V6 ed.
 *
 * The regular expressions recognized are described below. This description
 * is essentially the same as that for ed.
 *
 *	A regular expression specifies a set of strings of characters.
 *	A member of this set of strings is said to be matched by
 *	the regular expression.  In the following specification for
 *	regular expressions the word `character' means any character but NUL.
 *
 *	1.  Any character except a special character matches itself.
 *	    Special characters are the regular expression delimiter plus
 *	    \ [ . and sometimes ^ * $.
 *	2.  A . matches any character.
 *	3.  A \ followed by any character except a digit or ( )
 *	    matches that character.
 *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
 *	    character in (or not in) s. In s, \ has no special meaning,
 *	    and ] may only appear as the first letter. A substring 
 *	    a-b, with a and b in ascending ASCII order, stands for
 *	    the inclusive range of ASCII characters.
 *	5.  A regular expression of form 1-4 followed by * matches a
 *	    sequence of 0 or more matches of the regular expression.
 *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
 *	    matches what x matches.
 *	7.  A \ followed by a digit n matches a copy of the string that the
 *	    bracketed regular expression beginning with the nth \( matched.
 *	8.  A regular expression of form 1-8, x, followed by a regular
 *	    expression of form 1-7, y matches a match for x followed by
 *	    a match for y, with the x match being as long as possible
 *	    while still permitting a y match.
 *	9.  A regular expression of form 1-8 preceded by ^ (or followed
 *	    by $), is constrained to matches that begin at the left
 *	    (or end at the right) end of a line.
 *	10. A regular expression of form 1-9 picks out the longest among
 *	    the leftmost matches in a line.
 *	11. An empty regular expression stands for a copy of the last
 *	    regular expression encountered.
 */

/*
 * constants for re's
 */
#define	CBRA	1
#define	CCHR	2
#define	CDOT	4
#define	CCL	6
#define	NCCL	8
#define	CDOL	10
#define	CEOF	11
#define	CKET	12
#define	CBACK	18

#define	CSTAR	01

#define	ESIZE	512
#define	NBRA	9

static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
static char circf;

/* forward defs
*/

static int advance(register char *lp, register char *ep);
static int backref(register int i, register char *lp);
static int cclass(register char *set, register char c, int af);


/*
 * compile the regular expression argument into a dfa
 */
char *
re_comp(register char *sp)
{
    register int c;
    register char *ep = expbuf;
    int cclcnt, numbra = 0;
    char *lastep = 0;
    char bracket[NBRA];
    char *bracketp = &bracket[0];
    static char *retoolong = "Regular expression too long";

#define	comperr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }

    if (sp == 0 || *sp == '\0') {
	if (*ep == 0)
	    return ("No previous regular expression");
	return (0);
    }
    if (*sp == '^') {
	circf = 1;
	sp++;
    } else
	circf = 0;
    for (;;) {
	if (ep >= &expbuf[ESIZE - 10 /* fudge factor */])
	    comperr(retoolong);
	if ((c = *sp++) == '\0') {
	    if (bracketp != bracket)
		comperr("unmatched \\(");
	    *ep++ = CEOF;
	    *ep++ = 0;
	    return (0);
	}
	if (c != '*')
	    lastep = ep;
	switch (c) {

	case '.':
	    *ep++ = CDOT;
	    continue;

	case '*':
	    if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
		goto defchar;
	    *lastep |= CSTAR;
	    continue;

	case '$':
	    if (*sp != '\0')
		goto defchar;
	    *ep++ = CDOL;
	    continue;

	case '[':
	    *ep++ = CCL;
	    *ep++ = 0;
	    cclcnt = 1;
	    if ((c = *sp++) == '^') {
		c = *sp++;
		ep[-2] = NCCL;
	    }
	    do {
		if (c == '\0')
		    comperr("missing ]");
		if (c == '-' && ep[-1] != 0) {
		    if ((c = *sp++) == ']') {
			*ep++ = '-';
			cclcnt++;
			break;
		    }
		    while (ep[-1] < c) {
			*ep = ep[-1] + 1;
			ep++;
			cclcnt++;
			if (ep >= &expbuf[ESIZE - 10 /* fudge factor */])
			    comperr(retoolong);
		    }
		}
		*ep++ = c;
		cclcnt++;
		if (ep >= &expbuf[ESIZE - 10 /* fudge factor */])
		    comperr(retoolong);
	    } while ((c = *sp++) != ']');
	    lastep[1] = cclcnt;
	    continue;

	case '\\':
	    if ((c = *sp++) == '(') {
		if (numbra >= NBRA)
		    comperr("too many \\(\\) pairs");
		*bracketp++ = numbra;
		*ep++ = CBRA;
		*ep++ = numbra++;
		continue;
	    }
	    if (c == ')') {
		if (bracketp <= bracket)
		    comperr("unmatched \\)");
		*ep++ = CKET;
		*ep++ = *--bracketp;
		continue;
	    }
	    if (c >= '1' && c < ('1' + NBRA)) {
		*ep++ = CBACK;
		*ep++ = c - '1';
		continue;
	    }
	    *ep++ = CCHR;
	    *ep++ = c;
	    continue;

	  defchar:
	default:
	    *ep++ = CCHR;
	    *ep++ = c;
	}
    }
}

/* 
 * match the argument string against the compiled re
 */
int
re_exec(register char *p1)
{
    register char *p2 = expbuf;
    register int c;
    int rv;

    for (c = 0; c < NBRA; c++) {
	braslist[c] = 0;
	braelist[c] = 0;
    }
    if (circf)
	return ((advance(p1, p2)));
    /*
     * fast check for first character
     */
    if (*p2 == CCHR) {
	c = p2[1];
	do {
	    if (*p1 != c)
		continue;
	    if (rv = advance(p1, p2))
		return (rv);
	} while (*p1++);
	return (0);
    }
    /*
     * regular algorithm
     */
    do
	if (rv = advance(p1, p2))
	    return (rv);
    while (*p1++);
    return (0);
}

/* 
 * try to match the next thing in the dfa
 */
static int
advance(register char *lp, register char *ep)
{
    register char *curlp;
    int ct, i;
    int rv;

    for (;;)
	switch (*ep++) {

	case CCHR:
	    if (*ep++ == *lp++)
		continue;
	    return (0);

	case CDOT:
	    if (*lp++)
		continue;
	    return (0);

	case CDOL:
	    if (*lp == '\0')
		continue;
	    return (0);

	case CEOF:
	    return (1);

	case CCL:
	    if (cclass(ep, *lp++, 1)) {
		ep += *ep;
		continue;
	    }
	    return (0);

	case NCCL:
	    if (cclass(ep, *lp++, 0)) {
		ep += *ep;
		continue;
	    }
	    return (0);

	case CBRA:
	    braslist[*ep++] = lp;
	    continue;

	case CKET:
	    braelist[*ep++] = lp;
	    continue;

	case CBACK:
	    if (braelist[i = *ep++] == 0)
		return (-1);
	    if (backref(i, lp)) {
		lp += braelist[i] - braslist[i];
		continue;
	    }
	    return (0);

	case CBACK | CSTAR:
	    if (braelist[i = *ep++] == 0)
		return (-1);
	    curlp = lp;
	    ct = braelist[i] - braslist[i];
	    while (backref(i, lp))
		lp += ct;
	    while (lp >= curlp) {
		if (rv = advance(lp, ep))
		    return (rv);
		lp -= ct;
	    }
	    continue;

	case CDOT | CSTAR:
	    curlp = lp;
	    while (*lp++);
	    goto star;

	case CCHR | CSTAR:
	    curlp = lp;
	    while (*lp++ == *ep);
	    ep++;
	    goto star;

	case CCL | CSTAR:
	case NCCL | CSTAR:
	    curlp = lp;
	    while (cclass(ep, *lp++, ep[-1] == (CCL | CSTAR)));
	    ep += *ep;
	    goto star;

	  star:
	    do {
		lp--;
		if (rv = advance(lp, ep))
		    return (rv);
	    } while (lp > curlp);
	    return (0);

	default:
	    return (-1);
	}
}

static int
backref(register int i, register char *lp)
{
    register char *bp;

    bp = braslist[i];
    while (*bp++ == *lp++)
	if (bp >= braelist[i])
	    return (1);
    return (0);
}

static int
cclass(register char *set, register char c, int af)
{
    register int n;

    if (c == 0)
	return (0);
    n = *set++;
    while (--n)
	if (*set++ == c)
	    return (af);
    return (!af);
}