2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
11 #include <afs/param.h>
16 #include <WINNT/regexp.h>
20 * DEFINITIONS ________________________________________________________________
// Opcode markers stored in the compiled pattern (m_achCompiled).
//
// markREPEAT is not an opcode of its own: Compile ORs it into the opcode
// of the previously emitted element, and MatchSubset dispatches on the
// combined value (e.g. markCHARACTER|markREPEAT). The repeatable opcodes
// below are all even, so bit 0 is free to carry that flag.
24 #define markREPEAT TEXT('\x01')
// Literal character; the character to match follows the opcode.
25 #define markCHARACTER TEXT('\x02')
// Any-character wildcard (going by the name; the pattern character that
// produces it is not visible in this excerpt).
26 #define markANYCHAR TEXT('\x04')
// Character set; followed by a length element and then the set members.
27 #define markCHARSET TEXT('\x06')
// Negated character set (pattern "[^...]"); same layout as markCHARSET.
28 #define markNONCHARSET TEXT('\x08')
// Back-reference to an earlier group; followed by the zero-based group index.
29 #define markREFERENCE TEXT('\x0A')
// Start of a group; followed by the group number.
30 #define markLPAREN TEXT('\xFC')
// End of a group; followed by the group number taken from the paren stack.
31 #define markRPAREN TEXT('\xFD')
// End-of-line anchor; emitted when "$" is the last character of the pattern.
32 #define markENDLINE TEXT('\xFE')
// Appended by Compile after the last element of the pattern.
33 #define markENDPATTERN TEXT('\xFF')
37 * CLASS ROUTINES _____________________________________________________________
// Reset the object to the "no expression" state: not anchored to the start
// of the string, and an empty compiled pattern.
// NOTE(review): the signature line for this body is not visible in this
// excerpt; presumably this is the default constructor -- confirm.
43 m_fMatchFromStart = FALSE;
44 m_achCompiled[0] = TEXT('\0');
47 REGEXP::REGEXP (LPCTSTR pszExpr)
49 m_fMatchFromStart = FALSE;
50 m_achCompiled[0] = TEXT('\0');
51 SetExpression (pszExpr);
54 REGEXP::~REGEXP (void)
56 ; // nothing really to do here
59 BOOL REGEXP::SetExpression (LPCTSTR pszExpr)
61 return Compile (pszExpr);
64 BOOL REGEXP::Matches (LPCTSTR pszExpr, LPCTSTR pszString)
66 REGEXP Expr (pszExpr);
67 return Expr.Matches (pszString);
70 BOOL REGEXP::fIsRegExp (void)
72 if (m_fMatchFromStart) // started with "^"?
73 return TRUE; // it's a regexp.
75 for (LPCTSTR pch = m_achCompiled; (*pch) && (*pch != markENDPATTERN); pch += 2)
77 if (*pch != markCHARACTER)
81 return FALSE; // just a string of characters
84 BOOL REGEXP::fIsRegExp (LPCTSTR pszExpr)
86 REGEXP Expr (pszExpr);
87 return Expr.fIsRegExp();
92 * REGEXP _____________________________________________________________________
// Compile pszExpr into the opcode stream held in m_achCompiled.
//
// A leading "^" is recorded in m_fMatchFromStart. The rest of the
// expression is consumed one character at a time, emitting mark* opcodes
// (plus operands) at pchCompiled; markENDPATTERN is appended at the end.
// Every failure path visible here calls SetLastError(); rc is the loop's
// success flag (the return statements themselves are not visible in this
// excerpt -- presumably the function returns rc).
//
96 BOOL REGEXP::Compile (LPCTSTR pszExpr)
// aParens/pParen pair each "\)" with its "\(": an empty stack at "\)" or a
// non-empty stack at end-of-expression is reported as ERROR_BAD_FORMAT.
// (The push onto this stack is not visible in this excerpt.)
98 BYTE aParens[ nCOMPILED_PARENS_MAX ];
99 PBYTE pParen = &aParens[0];
// pchLastEx points at the opcode of the most recently emitted element; it
// is what a following "*" marks as repeatable.
100 LPTSTR pchLastEx = NULL;
103 // Erase any previous compiled expression
105 LPTSTR pchCompiled = m_achCompiled;
106 *pchCompiled = TEXT('\0');
107 m_fMatchFromStart = FALSE;
// A NULL or empty expression is rejected.
109 if (!pszExpr || !*pszExpr)
111 SetLastError (ERROR_INVALID_PARAMETER);
115 // See if the expression starts with a "^"
117 if ((m_fMatchFromStart = (*pszExpr == TEXT('^'))) == TRUE)
120 // Start stripping characters from the expression
123 for (rc = TRUE; rc; )
// Capacity check, done once per loop pass before that pass's stores.
// NOTE(review): the test is ">" rather than ">=", and one pass can store
// more than one element; looks like a store can land at or past the end
// of m_achCompiled before this check fires -- confirm against the array
// size declared in the header.
127 if ((sizeof(TCHAR)*(pchCompiled - m_achCompiled)) > sizeof(m_achCompiled))
129 SetLastError (ERROR_META_EXPANSION_TOO_LONG);
134 if ((ch = *pszExpr++) == TEXT('\0'))
136 // We finally hit the end of this expression.
138 if (pParen != &aParens[0])
140 SetLastError (ERROR_BAD_FORMAT); // unmatched "\("
147 pchLastEx = pchCompiled;
153 *pchCompiled++ = markANYCHAR;
// "*" handling: with nothing before it, or directly after a group
// boundary, it is emitted as a literal character; otherwise it flags the
// previous element as repeatable.
157 if ((pchLastEx == NULL) || (*pchLastEx == markLPAREN) || (*pchLastEx == markRPAREN))
159 *pchCompiled++ = markCHARACTER;
162 else // record that we can repeat the last expression
164 *pchLastEx |= markREPEAT;
// "$" handling: only a "$" at the very end of the expression is an anchor;
// anywhere else it is emitted as a literal character.
169 if (*pszExpr != TEXT('\0'))
171 *pchCompiled++ = markCHARACTER;
174 else // record that we should match end-of-line
176 *pchCompiled++ = markENDLINE;
// "[" handling: a "^" right after the bracket selects the negated set.
181 if ((ch = *pszExpr++) == '^')
183 *pchCompiled++ = markNONCHARSET;
188 *pchCompiled++ = markCHARSET;
191 *pchCompiled++ = 1; // length; this is pchLastEx[1]
// NOTE(review): this path is reached when the expression ends inside a
// "[" set, although the trailing comment on the SetLastError line below
// mentions "\(".
194 if (ch == TEXT('\0'))
196 SetLastError (ERROR_BAD_FORMAT); // unmatched "\("
// NOTE(review): this compares the element at the write position (not yet
// stored) with the first set member, not the two positions; presumably
// the intent is "'-' is not the first member of the set" -- confirm.
201 if ((ch == TEXT('-')) && (*pchCompiled != pchLastEx[2]))
// A "-" immediately before "]" is stored as a literal '-'.
203 if ((ch = *pszExpr++) == TEXT(']'))
205 *pchCompiled++ = TEXT('-');
// Range expansion: keep emitting previous-member + 1 while the previous
// member is still below the range's upper bound.
209 while ((BYTE)pchCompiled[-1] < (BYTE)ch)
211 *pchCompiled = pchCompiled[-1] + 1;
214 if ((sizeof(TCHAR)*(pchCompiled - m_achCompiled)) > sizeof(m_achCompiled))
216 SetLastError (ERROR_META_EXPANSION_TOO_LONG);
227 if ((sizeof(TCHAR)*(pchCompiled - m_achCompiled)) > sizeof(m_achCompiled))
229 SetLastError (ERROR_META_EXPANSION_TOO_LONG);
235 } while ((ch = *pszExpr++) != TEXT(']'));
// Escape handling: "\(" opens a group (limited to nCOMPILED_PARENS_MAX),
// "\)" closes the innermost open one, "\1".. refers back to a group, and
// anything else falls through to a literal character.
239 if ((ch = *pszExpr++) == TEXT('('))
241 if (nParens >= nCOMPILED_PARENS_MAX)
243 SetLastError (ERROR_META_EXPANSION_TOO_LONG);
248 *pchCompiled++ = markLPAREN;
249 *pchCompiled++ = nParens++;
251 else if (ch == TEXT(')'))
253 if (pParen == &aParens[0])
255 SetLastError (ERROR_BAD_FORMAT);
259 *pchCompiled++ = markRPAREN;
260 *pchCompiled++ = *--pParen;
262 else if ((ch >= TEXT('1')) && (ch < (TEXT('1') + nCOMPILED_PARENS_MAX)))
264 *pchCompiled++ = markREFERENCE;
// Operand is the zero-based group index ("\1" -> 0).
265 *pchCompiled++ = ch - '1';
269 *pchCompiled++ = markCHARACTER;
275 *pchCompiled++ = markCHARACTER;
281 *pchCompiled++ = markENDPATTERN;
// Test pszString against the expression compiled into this object.
//
// An expression anchored with "^" is tried once, at the start of the
// string. Otherwise the compiled pattern is tried at successive positions
// of pszString, up to and including the terminating NUL position.
// Group start/end positions are tracked in two local arrays that are
// handed down to MatchSubset.
//
287 BOOL REGEXP::Matches (LPCTSTR pszString)
292 // Prepare a place to store information about \( and \) pairs
294 LPCTSTR aParenStarts[ nCOMPILED_PARENS_MAX ];
295 LPCTSTR aParenEnds[ nCOMPILED_PARENS_MAX ];
// NULL means "this group has not been seen yet"; MatchSubset rejects a
// back-reference whose end pointer is still NULL.
297 for (size_t ii = 0; ii < nCOMPILED_PARENS_MAX; ii++)
299 aParenStarts[ii] = NULL;
300 aParenEnds[ii] = NULL;
303 // If the expression starts with "^", we can do a quick pattern-match...
305 if (m_fMatchFromStart)
307 return MatchSubset (pszString, m_achCompiled, aParenStarts, aParenEnds);
310 // Otherwise, we have to work a little harder. If the expression
311 // at least starts with a recognized character, we can scan for that
312 // as the start of a pattern...
314 LPTSTR pchCompiled = m_achCompiled;
315 if (*pchCompiled == markCHARACTER)
// The literal to look for is the operand that follows the opcode.
317 TCHAR chStart = pchCompiled[1];
// Positions whose character differs from chStart are filtered out before
// the full MatchSubset attempt.
319 if (*pszString != chStart)
321 if (MatchSubset (pszString, pchCompiled, aParenStarts, aParenEnds))
323 } while (*pszString++);
328 // If the expression starts with something weird, we'll have to test
329 // against every character in the string.
332 if (MatchSubset (pszString, pchCompiled, aParenStarts, aParenEnds))
334 } while (*pszString++);
// Try to match the compiled pattern starting at pchCompiled against the
// text starting at pszString.
//
// pszString     - current position in the text being tested
// pchCompiled   - current position in the opcode stream
// aParenStarts,
// aParenEnds    - per-group start/end positions, written when a group
//                 opens/closes and read by back-references
//
// The function dispatches on the opcode at pchCompiled. Repeated elements
// (opcode | markREPEAT) recurse into MatchSubset for the remainder of the
// pattern. An unrecognized opcode yields FALSE.
// NOTE(review): most case labels are not visible in this excerpt, so the
// comments below only describe statements whose role is clear from the
// statement itself.
//
340 BOOL REGEXP::MatchSubset (LPCTSTR pszString, LPCTSTR pchCompiled, LPCTSTR *aParenStarts, LPCTSTR *aParenEnds)
// Remembers where a repeated element started consuming text, so that the
// backtracking loops below know how far they may back off.
342 LPCTSTR pchStartOfEx;
347 switch (*pchCompiled++)
// Literal comparison: operand character against the current text character.
350 if (*pchCompiled++ == *pszString++)
360 if (*pszString == TEXT('\0'))
// Set membership tests (TRUE = inclusive set, FALSE = negated set); on
// success the set is skipped using the length stored as its first element.
368 if (fIsInCharSet (pchCompiled, *pszString++, TRUE))
370 pchCompiled += *pchCompiled;
376 if (fIsInCharSet (pchCompiled, *pszString++, FALSE))
378 pchCompiled += *pchCompiled;
// Group boundaries: record the current text position for the group whose
// number is the operand.
384 aParenStarts[*pchCompiled++] = pszString;
388 aParenEnds[*pchCompiled++] = pszString;
// Back-reference: the group must already have an end position; on a
// successful compare the text position moves past one copy of the group.
392 if (aParenEnds[ii = *pchCompiled++] == 0)
393 return FALSE; // reference to invalid \(\) pair
394 if (CompareParen (ii, pszString, aParenStarts, aParenEnds))
396 pszString += aParenEnds[ii] - aParenStarts[ii];
// Repeated back-reference: consume as many copies of the group text as
// compare equal, then back off one copy at a time, trying the rest of the
// pattern at each position.
// NOTE(review): if the referenced group is empty, cchPattern is 0 and
// neither loop below changes pszString; looks like this can spin forever
// -- confirm.
401 case markREFERENCE|markREPEAT:
402 if (aParenEnds[ii = *pchCompiled++] == 0)
403 return FALSE; // reference to invalid \(\) pair
404 pchStartOfEx = pszString;
405 cchPattern = aParenEnds[ii] - aParenStarts[ii];
406 while (CompareParen (ii, pszString, aParenStarts, aParenEnds))
407 pszString += cchPattern;
408 while (pszString >= pchStartOfEx)
410 if (MatchSubset (pszString, pchCompiled, aParenStarts, aParenEnds))
412 pszString -= cchPattern;
// Repeated single-character elements: each case notes where consumption
// began and then advances through the text.
416 case markANYCHAR|markREPEAT:
417 pchStartOfEx = pszString;
422 case markCHARACTER|markREPEAT:
423 pchStartOfEx = pszString;
424 while (*pszString++ == *pchCompiled)
429 case markCHARSET|markREPEAT:
430 case markNONCHARSET|markREPEAT:
431 pchStartOfEx = pszString;
// pchCompiled[-1] is the opcode just dispatched on; it selects inclusive
// versus negated membership for this shared case body.
432 while (fIsInCharSet (pchCompiled, *pszString++, (pchCompiled[-1] == (markCHARSET|markREPEAT))))
434 pchCompiled += *pchCompiled;
// Backtracking: try the remainder of the pattern, looping while the text
// position is still beyond where the repeated element started.
440 if (MatchSubset (pszString, pchCompiled, aParenStarts, aParenEnds))
442 } while (pszString > pchStartOfEx);
446 return FALSE; // damaged compiled string
451 BOOL REGEXP::CompareParen (int ii, LPCTSTR pszString, LPCTSTR *aParenStarts, LPCTSTR *aParenEnds)
453 LPCTSTR pchInParen = aParenStarts[ii];
454 while (*pchInParen++ == *pszString++)
455 if (pchInParen >= aParenEnds[ii])
461 BOOL REGEXP::fIsInCharSet (LPCTSTR pszCharSet, TCHAR chTest, int fInclusive)
465 for (int n = (int)(*pszCharSet++); --n; )
467 if (*pszCharSet++ == chTest)