2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
11 #include <afs/param.h>
16 #include <WINNT/regexp.h>
20 * DEFINITIONS ________________________________________________________________
// Marker codes written into the compiled pattern (m_achCompiled) by Compile()
// and dispatched on by MatchSubset().
//
// markREPEAT is a flag rather than a stand-alone marker: Compile() ORs it into
// the marker of the previously compiled element, and MatchSubset() switches on
// the combined value (e.g. markCHARACTER|markREPEAT). The markers that are
// combined with it this way (0x02..0x0A) are all even, so bit 0 is free.
24 #define markREPEAT TEXT('\x01') // flag: previous element may repeat
25 #define markCHARACTER TEXT('\x02') // literal; the next TCHAR is the character
26 #define markANYCHAR TEXT('\x04') // wildcard element
27 #define markCHARSET TEXT('\x06') // set; next TCHAR is a length, then the members
28 #define markNONCHARSET TEXT('\x08') // negated set; same layout as markCHARSET
29 #define markREFERENCE TEXT('\x0A') // back-reference; next TCHAR is the group index
30 #define markLPAREN TEXT('\xFC') // group start; next TCHAR is the group number
31 #define markRPAREN TEXT('\xFD') // group end; next TCHAR is the group number
32 #define markENDLINE TEXT('\xFE') // emitted by Compile's "match end-of-line" branch
33 #define markENDPATTERN TEXT('\xFF') // last marker Compile() writes
37 * CLASS ROUTINES _____________________________________________________________
43 m_fMatchFromStart = FALSE;
44 m_achCompiled[0] = TEXT('\0');
47 REGEXP::REGEXP (LPCTSTR pszExpr)
49 m_fMatchFromStart = FALSE;
50 m_achCompiled[0] = TEXT('\0');
51 SetExpression (pszExpr);
54 REGEXP::~REGEXP (void)
56 ; // nothing really to do here
59 BOOL REGEXP::SetExpression (LPCTSTR pszExpr)
61 return Compile (pszExpr);
64 BOOL REGEXP::Matches (LPCTSTR pszExpr, LPCTSTR pszString)
66 REGEXP Expr (pszExpr);
67 return Expr.Matches (pszString);
70 BOOL REGEXP::fIsRegExp (void)
72 if (m_fMatchFromStart) // started with "^"?
73 return TRUE; // it's a regexp.
75 for (LPCTSTR pch = m_achCompiled; (*pch) && (*pch != markENDPATTERN); pch += 2)
77 if (*pch != markCHARACTER)
81 return FALSE; // just a string of characters
84 BOOL REGEXP::fIsRegExp (LPCTSTR pszExpr)
86 REGEXP Expr (pszExpr);
87 return Expr.fIsRegExp();
92 * REGEXP _____________________________________________________________________
// Compile pszExpr into the marker stream stored in m_achCompiled.
//
// m_achCompiled and m_fMatchFromStart are reset first. On the failure paths
// visible here the reason is reported through SetLastError():
//    ERROR_INVALID_PARAMETER        pszExpr is NULL or empty
//    ERROR_META_EXPANSION_TOO_LONG  compiled form outgrew m_achCompiled, or
//                                   nParens reached nCOMPILED_PARENS_MAX
//    ERROR_BAD_FORMAT               unbalanced \( \) or unterminated set
// NOTE(review): the return statements are not visible in this excerpt;
// presumably FALSE on those errors and TRUE otherwise - confirm.
96 BOOL REGEXP::Compile (LPCTSTR pszExpr)
// pParen walks aParens; it is stepped back when a closing group is compiled,
// and pParen == &aParens[0] is treated as "no group currently open".
98 BYTE aParens[ nCOMPILED_PARENS_MAX ];
99 PBYTE pParen = &aParens[0];
// Start of the most recently compiled element, so a later repeat can be
// recorded by OR-ing markREPEAT into that element's marker.
100 LPTSTR pchLastEx = NULL;
103 // Erase any previous compiled expression
105 LPTSTR pchCompiled = m_achCompiled;
106 *pchCompiled = TEXT('\0');
107 m_fMatchFromStart = FALSE;
// Reject a NULL or empty expression.
109 if (!pszExpr || !*pszExpr)
111 SetLastError (ERROR_INVALID_PARAMETER);
115 // See if the expression starts with a "^"
117 if ((m_fMatchFromStart = (*pszExpr == TEXT('^'))) == TRUE)
120 // Start stripping characters from the expression
122 for (BOOL rc = TRUE; rc; )
// Output-size guard, evaluated once per input character.
// NOTE(review): the test uses '>' and runs before the next element is written,
// so a write that lands exactly at the end of m_achCompiled is only noticed
// afterwards - confirm the buffer has slack for this.
126 if ((sizeof(TCHAR)*(pchCompiled - m_achCompiled)) > sizeof(m_achCompiled))
128 SetLastError (ERROR_META_EXPANSION_TOO_LONG);
133 if ((ch = *pszExpr++) == TEXT('\0'))
135 // We finally hit the end of this expression.
// A non-empty group stack at end of input means a group was never closed.
137 if (pParen != &aParens[0])
139 SetLastError (ERROR_BAD_FORMAT); // unmatched "\("
146 pchLastEx = pchCompiled;
152 *pchCompiled++ = markANYCHAR;
// Repeat handling: with nothing to repeat (no previous element, or the
// previous element is a group marker) a literal is emitted instead.
156 if ((pchLastEx == NULL) || (*pchLastEx == markLPAREN) || (*pchLastEx == markRPAREN))
158 *pchCompiled++ = markCHARACTER;
161 else // record that we can repeat the last expression
163 *pchLastEx |= markREPEAT;
// Only when nothing follows in the expression is markENDLINE emitted;
// otherwise the character is compiled as a literal.
168 if (*pszExpr != TEXT('\0'))
170 *pchCompiled++ = markCHARACTER;
173 else // record that we should match end-of-line
175 *pchCompiled++ = markENDLINE;
// Character set: a leading '^' selects the negated form.
180 if ((ch = *pszExpr++) == '^')
182 *pchCompiled++ = markNONCHARSET;
187 *pchCompiled++ = markCHARSET;
// The length slot starts at 1, i.e. it counts itself.
190 *pchCompiled++ = 1; // length; this is pchLastEx[1]
// Running off the end of the expression inside a set is an error.
193 if (ch == TEXT('\0'))
195 SetLastError (ERROR_BAD_FORMAT); // unmatched "["
// NOTE(review): this compares the character values *pchCompiled and
// pchLastEx[2], not their positions; if the intent is "'-' is not the first
// member of the set", a pointer comparison would be expected - confirm.
200 if ((ch == TEXT('-')) && (*pchCompiled != pchLastEx[2]))
// A '-' immediately followed by ']' is stored as a literal '-'.
202 if ((ch = *pszExpr++) == TEXT(']'))
204 *pchCompiled++ = TEXT('-');
// Otherwise expand the range: keep appending the successor of the last stored
// member until the range's upper bound (ch) is reached.
208 while ((BYTE)pchCompiled[-1] < (BYTE)ch)
210 *pchCompiled = pchCompiled[-1] + 1;
213 if ((sizeof(TCHAR)*(pchCompiled - m_achCompiled)) > sizeof(m_achCompiled))
215 SetLastError (ERROR_META_EXPANSION_TOO_LONG);
226 if ((sizeof(TCHAR)*(pchCompiled - m_achCompiled)) > sizeof(m_achCompiled))
228 SetLastError (ERROR_META_EXPANSION_TOO_LONG);
234 } while ((ch = *pszExpr++) != TEXT(']'));
// Two-character sequences; the case label for the introducing character is
// not visible in this excerpt (presumably a backslash - confirm).
238 if ((ch = *pszExpr++) == TEXT('('))
240 if (nParens >= nCOMPILED_PARENS_MAX)
242 SetLastError (ERROR_META_EXPANSION_TOO_LONG);
// Group start: marker followed by the group's number.
247 *pchCompiled++ = markLPAREN;
248 *pchCompiled++ = nParens++;
250 else if (ch == TEXT(')'))
// A closing group with no group open is malformed.
252 if (pParen == &aParens[0])
254 SetLastError (ERROR_BAD_FORMAT);
// Group end: marker followed by the number taken off the top of aParens.
258 *pchCompiled++ = markRPAREN;
259 *pchCompiled++ = *--pParen;
// Digits '1' .. '1'+nCOMPILED_PARENS_MAX-1 are back-references, stored as a
// zero-based group index.
261 else if ((ch >= TEXT('1')) && (ch < (TEXT('1') + nCOMPILED_PARENS_MAX)))
263 *pchCompiled++ = markREFERENCE;
264 *pchCompiled++ = ch - '1';
268 *pchCompiled++ = markCHARACTER;
274 *pchCompiled++ = markCHARACTER;
// Terminate the compiled form.
280 *pchCompiled++ = markENDPATTERN;
// Test pszString against the expression currently compiled in m_achCompiled.
//
// An expression anchored with "^" is tried at the start of the string only.
// Otherwise MatchSubset() is tried at successive positions; both scanning
// loops are do/while loops that test *pszString after the attempt, so the
// position of the terminating NUL is tried as well.
286 BOOL REGEXP::Matches (LPCTSTR pszString)
291 // Prepare a place to store information about \( and \) pairs
293 LPCTSTR aParenStarts[ nCOMPILED_PARENS_MAX ];
294 LPCTSTR aParenEnds[ nCOMPILED_PARENS_MAX ];
// NOTE(review): the arrays are cleared once here and then reused for every
// start position tried below, so values recorded during a failed attempt are
// still present on the next attempt - confirm this is intended.
296 for (size_t ii = 0; ii < nCOMPILED_PARENS_MAX; ii++)
298 aParenStarts[ii] = NULL;
299 aParenEnds[ii] = NULL;
302 // If the expression starts with "^", we can do a quick pattern-match...
304 if (m_fMatchFromStart)
306 return MatchSubset (pszString, m_achCompiled, aParenStarts, aParenEnds);
309 // Otherwise, we have to work a little harder. If the expression
310 // at least starts with a recognized character, we can scan for that
311 // as the start of a pattern...
313 LPTSTR pchCompiled = m_achCompiled;
314 if (*pchCompiled == markCHARACTER)
// The literal follows its marker, so pchCompiled[1] is the first character a
// match must begin with.
316 TCHAR chStart = pchCompiled[1];
// Positions that do not start with chStart are filtered out before the full
// MatchSubset() call is made.
318 if (*pszString != chStart)
320 if (MatchSubset (pszString, pchCompiled, aParenStarts, aParenEnds))
322 } while (*pszString++);
327 // If the expression starts with something weird, we'll have to test
328 // against every character in the string.
331 if (MatchSubset (pszString, pchCompiled, aParenStarts, aParenEnds))
333 } while (*pszString++);
// Match pszString, starting exactly at its first character, against the
// compiled pattern starting at pchCompiled.
//
// aParenStarts / aParenEnds are indexed by group number; this routine stores
// the current string position into them when it meets a group marker and
// reads them back for back-references. Repeating elements are handled by
// consuming as much as possible and then calling MatchSubset() recursively on
// the rest of the pattern while stepping the string position back.
// NOTE(review): most case labels for the non-repeating markers are not
// visible in this excerpt; the comments below describe only the statements
// that are.
339 BOOL REGEXP::MatchSubset (LPCTSTR pszString, LPCTSTR pchCompiled, LPCTSTR *aParenStarts, LPCTSTR *aParenEnds)
341 LPCTSTR pchStartOfEx;
// Dispatch on the next marker; pchCompiled is left pointing at its operand.
346 switch (*pchCompiled++)
// Compare the operand character with the current string character,
// consuming both.
349 if (*pchCompiled++ == *pszString++)
359 if (*pszString == TEXT('\0'))
// Set membership (inclusive form), then skip the set using its length slot.
367 if (fIsInCharSet (pchCompiled, *pszString++, TRUE))
369 pchCompiled += *pchCompiled;
// Set membership (exclusive form), then skip the set using its length slot.
375 if (fIsInCharSet (pchCompiled, *pszString++, FALSE))
377 pchCompiled += *pchCompiled;
// Record where the group numbered by the operand starts / ends in the string.
383 aParenStarts[*pchCompiled++] = pszString;
387 aParenEnds[*pchCompiled++] = pszString;
// Back-reference: requires a recorded end for the group; on a successful
// comparison the string advances by the captured length.
391 if (aParenEnds[ii = *pchCompiled++] == 0)
392 return FALSE; // reference to invalid \(\) pair
393 if (CompareParen (ii, pszString, aParenStarts, aParenEnds))
395 pszString += aParenEnds[ii] - aParenStarts[ii];
// Repeated back-reference: consume as many copies of the captured text as
// compare equal, then try the rest of the pattern, giving back one copy at a
// time until the starting position is passed.
// NOTE(review): if the captured length (cchPattern) is 0, neither loop below
// changes pszString - confirm that case cannot reach here.
400 case markREFERENCE|markREPEAT:
401 if (aParenEnds[ii = *pchCompiled++] == 0)
402 return FALSE; // reference to invalid \(\) pair
403 pchStartOfEx = pszString;
404 cchPattern = aParenEnds[ii] - aParenStarts[ii];
405 while (CompareParen (ii, pszString, aParenStarts, aParenEnds))
406 pszString += cchPattern;
407 while (pszString >= pchStartOfEx)
409 if (MatchSubset (pszString, pchCompiled, aParenStarts, aParenEnds))
411 pszString -= cchPattern;
// Repeated wildcard: remember where the run starts.
415 case markANYCHAR|markREPEAT:
416 pchStartOfEx = pszString;
// Repeated literal: consume the run of characters equal to the operand. The
// post-increment also steps past the first character that differs.
421 case markCHARACTER|markREPEAT:
422 pchStartOfEx = pszString;
423 while (*pszString++ == *pchCompiled)
// Repeated set: pchCompiled[-1] is the marker just switched on, which tells
// fIsInCharSet whether the set is the inclusive or the negated form.
428 case markCHARSET|markREPEAT:
429 case markNONCHARSET|markREPEAT:
430 pchStartOfEx = pszString;
431 while (fIsInCharSet (pchCompiled, *pszString++, (pchCompiled[-1] == (markCHARSET|markREPEAT))))
433 pchCompiled += *pchCompiled;
// Try the rest of the pattern at successive positions back towards the start
// of the run.
439 if (MatchSubset (pszString, pchCompiled, aParenStarts, aParenEnds))
441 } while (pszString > pchStartOfEx);
445 return FALSE; // damaged compiled string
// Compare the text at pszString with the text recorded for group ii, i.e. the
// characters from aParenStarts[ii] up to aParenEnds[ii].
// NOTE(review): the return statements are not visible in this excerpt; the
// callers in MatchSubset() treat a non-zero result as "the text matches".
450 BOOL REGEXP::CompareParen (int ii, LPCTSTR pszString, LPCTSTR *aParenStarts, LPCTSTR *aParenEnds)
452 LPCTSTR pchInParen = aParenStarts[ii];
// One character is compared before the end-of-group test is made, so the
// character at aParenStarts[ii] is read even when the recorded range is empty.
453 while (*pchInParen++ == *pszString++)
454 if (pchInParen >= aParenEnds[ii])
460 BOOL REGEXP::fIsInCharSet (LPCTSTR pszCharSet, TCHAR chTest, int fInclusive)
464 for (int n = (int)(*pszCharSet++); --n; )
466 if (*pszCharSet++ == chTest)