2 * Copyright (c) 2008 Secure Endpoints Inc.
4 * Permission is hereby granted, free of charge, to any person
5 * obtaining a copy of this software and associated documentation
6 * files (the "Software"), to deal in the Software without
7 * restriction, including without limitation the rights to use, copy,
8 * modify, merge, publish, distribute, sublicense, and/or sell copies
9 * of the Software, and to permit persons to whom the Software is
10 * furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33 /* This is part of the Microsoft Internationalized Domain Name
35 #include <normalization.h>
/* Pointer to the NormalizeString entry point.  It starts out NULL and is
   assigned from GetProcAddress() in cm_InitNormalization(). */
38 (WINAPI *pNormalizeString)( __in NORM_FORM NormForm,
39 __in_ecount(cwSrcLength) LPCWSTR lpSrcString,
41 __out_ecount(cwDstLength) LPWSTR lpDstString,
42 __in int cwDstLength ) = NULL;
/* Pointer to the IsNormalizedString entry point.  It starts out NULL and is
   assigned from GetProcAddress() in cm_InitNormalization(). */
45 (WINAPI *pIsNormalizedString)( __in NORM_FORM NormForm,
46 __in_ecount(cwLength) LPCWSTR lpString,
47 __in int cwLength ) = NULL;
/* Name of the DLL handed to LoadLibrary() in cm_InitNormalization(). */
50 #define NLSDLLNAME "Normaliz.dll"
/* Element count of the fixed-size conversion buffers declared in this file,
   and the length cap passed to StringCchLengthW(). */
51 #define NLSMAXCCH 1024
/* Normalization form passed to the NormalizeString / IsNormalizedString
   calls below. */
54 #define AFS_NORM_FORM NormalizationC
56 long cm_InitNormalization(void)
/* Loads NLSDLLNAME and looks up NormalizeString and IsNormalizedString in
   it.  The value returned at the end is non-zero only when both lookups
   produced a non-NULL pointer. */
/* pNormalizeString is only assigned further down, so a non-NULL value here
   means an earlier call already resolved it. */
60 if (pNormalizeString != NULL)
63 h_Nls = LoadLibrary(NLSDLLNAME);
/* NOTE(review): LoadLibrary() reports failure by returning NULL, not
   INVALID_HANDLE_VALUE, so this test does not catch a failed load and a
   NULL module handle would reach GetProcAddress() below -- confirm and
   compare against NULL instead. */
64 if (h_Nls == INVALID_HANDLE_VALUE) {
68 pNormalizeString = GetProcAddress(h_Nls, "NormalizeString");
69 pIsNormalizedString = GetProcAddress(h_Nls, "IsNormalizedString");
/* Success requires both entry points. */
71 return (pNormalizeString && pIsNormalizedString);
74 /*! \brief Normalize a UTF-16 string.
76 If the supplied destination buffer is insufficient or NULL, then a
77 new buffer will be allocated to hold the normalized string.
79 \param[in] src : Source UTF-16 string. Length is specified in
82 \param[in] cch_src : The character count in cch_src is assumed to
83 be tight and include the terminating NULL character if there is
84 one. If the NULL is absent, the resulting string will not be
87 \param[out] ext_dest : The destination buffer. Can be NULL, in
88 which case *pcch_dest MUST be 0.
90 \param[in,out] pcch_dest : On entry *pcch_dest contains a count of
91 characters in the destination buffer. On exit, it will contain
92 a count of characters that were copied to the destination
95 Returns a pointer to the buffer containing the normalized string or
96 NULL if the call was unsuccessful. If the returned destination
97 buffer is different from the supplied buffer and non-NULL, it
98 should be freed using free().
101 NormalizeUtf16String(const wchar_t * src, int cch_src, wchar_t * ext_dest, int *pcch_dest)
/* Two paths follow: a plain copy when the input is already normalized or
   NormalizeString is unavailable, and otherwise a bounded retry loop around
   NormalizeString that resizes the destination buffer between attempts. */
103 if ((pIsNormalizedString && (*pIsNormalizedString)(AFS_NORM_FORM, src, cch_src)) ||
104 (!pNormalizeString)) {
/* The caller's buffer is missing or holds fewer than cch_src characters,
   so switch to a heap buffer of exactly cch_src characters. */
106 if (ext_dest == NULL || *pcch_dest < cch_src) {
/* NOTE(review): confirm the malloc() result is checked before the memcpy()
   further down writes through it. */
107 ext_dest = malloc(cch_src * sizeof(wchar_t));
108 *pcch_dest = cch_src;
111 /* No need to or unable to normalize. Just copy the string.
112 Note that the string is not necessarily NULL terminated. */
115 memcpy(ext_dest, src, cch_src * sizeof(wchar_t));
116 *pcch_dest = cch_src;
/* Normalization path: the first attempt uses the caller-supplied size. */
128 int cch_dest = *pcch_dest;
132 while (tries-- > 0) {
134 rv = (*pNormalizeString)(AFS_NORM_FORM, src, cch_src, dest, cch_dest);
/* A non-positive result only counts as an error when the last-error value
   is something other than ERROR_SUCCESS. */
136 if (rv <= 0 && (gle = GetLastError()) != ERROR_SUCCESS) {
137 if (gle == ERROR_INSUFFICIENT_BUFFER) {
139 /* The buffer wasn't big enough. We are going to
140 try allocating one. */
/* The negated return value plus NLSERRCCH becomes the next size to try. */
142 cch_dest = (-rv) + NLSERRCCH;
146 /* Something else is wrong */
150 } else if (rv < 0) { /* rv < 0 && gle == ERROR_SUCCESS */
152 /* Technically not one of the expected outcomes */
155 } else { /* rv > 0 || (rv == 0 && gle == ERROR_SUCCESS) */
157 /* Possibly succeeded */
159 if (rv == 0) { /* Succeeded and the return string is empty */
165 /* Nope. We only calculated the required size of the buffer */
/* rv is taken as the required size here, again padded by NLSERRCCH. */
167 cch_dest = rv + NLSERRCCH;
/* The guard singles out a non-NULL buffer that is not the caller's;
   presumably the guarded statement frees it before the new allocation. */
178 if (dest != ext_dest && dest)
180 dest = malloc(cch_dest * sizeof(wchar_t));
/* Same guard as above: never applies to the caller's buffer or to NULL. */
185 if (dest != ext_dest && dest)
193 /*! \brief Normalize a UTF-16 string into a UTF-8 string.
195 \param[in] src : Source string.
197 \param[in] cch_src : Count of characters in src. If the count includes the
198 NULL terminator, then the resulting string will be NULL
199 terminated. If it is -1, then src is assumed to be NULL
202 \param[out] adest : Destination buffer.
204 \param[in] cch_adest : Number of characters in the destination buffer.
206 Returns the number of characters stored into adest. This will
207 include the terminating NULL if cch_src included the terminating
208 NULL or was -1. If this is 0, then the operation was unsuccessful.
210 long cm_NormalizeUtf16StringToUtf8(const wchar_t * src, int cch_src,
211 char * adest, int cch_adest)
/* Normalizes src through NormalizeUtf16String(), offering a stack buffer as
   the first-choice destination, then encodes the normalized text as UTF-8
   into adest. */
/* StringCchLengthW() fails when no terminator is found within the first
   NLSMAXCCH characters of src. */
216 if (FAILED(StringCchLengthW(src, NLSMAXCCH, &cch)))
223 wchar_t nbuf[NLSMAXCCH];
224 wchar_t * normalized;
/* In: capacity of nbuf.  Out: character count reported by
   NormalizeUtf16String(). */
225 int cch_norm = NLSMAXCCH;
227 normalized = NormalizeUtf16String(src, cch_src, nbuf, &cch_norm);
/* cch_adest is reused to hold the byte count WideCharToMultiByte() returns.
   NOTE(review): confirm normalized is checked for NULL before this call. */
229 cch_adest = WideCharToMultiByte(CP_UTF8, 0, normalized, cch_norm,
230 adest, cch_adest, NULL, 0);
/* normalized differs from nbuf only when NormalizeUtf16String() handed back
   a buffer of its own. */
232 if (normalized != nbuf && normalized)
/* Flag bit that marks a table entry as "must be escaped".  It lies above
   the 8-bit range, so it never collides with a byte value. */
245 #define ESCVAL 0x1000
/* Tags the byte value c with the ESCVAL flag. */
246 #define Esc(c) (ESCVAL + (short)(c))
/* Non-zero when c carries the ESCVAL flag. */
247 #define IS_ESCAPED(c) (((c) & ESCVAL) == ESCVAL)
249 /*! \brief Character sanitization map for CP-1252
251 The following map indicates which characters should be escaped in
252 the CP-1252 character map. Characters that are documented as
253 illegal characters in a file name are marked as escaped. Escaped
254 characters are marked using the ::Esc macro defined above. The
255 following exceptions apply:
257 - Path delimiters '\\' and '/' are NOT escaped because the
258 sanitization map applies to paths. While those characters are
259 illegal in filenames, they are legal in paths.
261 - Wildcard characters '*' and '?' ARE escaped. The document
262 referred below does not specify these characters as invalid.
263 Since no other escape mechanism exists, names containing
264 wildcards are indistinguishable from actual wildcards used in SMB
267 - Reserved names are not and cannot be represented in this map.
270 CON, PRN, AUX, NUL, COM1, COM2, COM3, COM4, COM5, COM6, COM7,
271 COM8, COM9, LPT1, LPT2, LPT3, LPT4, LPT5, LPT6, LPT7, LPT8, LPT9,
274 - Characters 0x80, 0x81, 0x8d, 0x8e, 0x8f, 0x90, 0x9d and 0x9e
275 are also escaped because they are unused in CP-1252 and hence
276 cannot be converted to a Unicode string.
278 Reserved names with extensions are also invalid. (i.e. NUL.txt)
280 \note The only bit we are actually interested in from the following
281 table is the ESCVAL bit. However, the characters themselves are
282 included for ease of maintenance.
284 \see "Naming a File" topic in the Windows SDK.
286 static const short sanitized_escapes_1252[] = {
/* 256 entries, one per byte value, in ascending order.  An entry written
   as Esc(x) has the ESCVAL bit set; every other entry is the plain byte
   value. */
/* 0x00-0x1f: every entry is escaped. */
287 Esc(0x00),Esc(0x01),Esc(0x02),Esc(0x03),Esc(0x04),Esc(0x05),Esc(0x06),Esc(0x07),
288 Esc(0x08),Esc(0x09),Esc(0x0a),Esc(0x0b),Esc(0x0c),Esc(0x0d),Esc(0x0e),Esc(0x0f),
289 Esc(0x10),Esc(0x11),Esc(0x12),Esc(0x13),Esc(0x14),Esc(0x15),Esc(0x16),Esc(0x17),
290 Esc(0x18),Esc(0x19),Esc(0x1a),Esc(0x1b),Esc(0x1c),Esc(0x1d),Esc(0x1e),Esc(0x1f),
/* 0x20-0x7f: escaped entries are " * : < > ? | and 0x7f. */
291 ' ','!',Esc('"'),'#','$','%','&','\'','(',')',Esc('*'),'+',',','-','.','/',
292 '0','1','2','3','4','5','6','7','8','9',Esc(':'),';',Esc('<'),'=',Esc('>'),Esc('?'),
293 '@','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
294 'P','Q','R','S','T','U','V','W','X','Y','Z','[','\\',']','^','_',
295 '`','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o',
296 'p','q','r','s','t','u','v','w','x','y','z','{',Esc('|'),'}','~',Esc(0x7f),
/* 0x80-0x9f: escaped entries are 0x80, 0x81, 0x8d-0x90, 0x9d and 0x9e. */
297 Esc(0x80),Esc(0x81),0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,Esc(0x8d),Esc(0x8e),Esc(0x8f),
298 Esc(0x90),0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,Esc(0x9d),Esc(0x9e),0x9f,
/* 0xa0-0xff: no entry is escaped. */
299 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
300 0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
301 0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
302 0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf,
303 0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
304 0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff
307 static int sanitize_bytestring(const char * src, int cch_src,
308 char * odest, int cch_dest)
/* Walks src and writes a sanitized copy through the dest cursor, looking
   each byte up in sanitized_escapes_1252 and emitting hex digits for the
   ones flagged with ESCVAL.  dest is presumably initialized to odest, so
   the final return value is the number of bytes written. */
/* Stops at the end of the counted input, at a NUL byte, or when the
   destination has no room left. */
311 while (cch_src > 0 && *src && cch_dest > 0) {
/* NOTE(review): *src is a plain char.  Where char is signed, any byte >=
   0x80 produces a negative index and reads outside the 256-entry table;
   indexing with (unsigned char)*src would avoid that -- confirm. */
315 rc = sanitized_escapes_1252[*src];
316 if (IS_ESCAPED(rc)) {
317 static const char hex[] =
318 {'0','1','2','3','4','5','6','7',
319 '8','9','a','b','c','d','e','f'};
/* High nibble first, then low nibble, as lowercase hex digits.  The 0x0f
   masks keep both indices inside hex[] whatever the sign of *src. */
327 *dest++ = hex[(((int)*src) >> 4) & 0x0f];
328 *dest++ = hex[(((int)*src) & 0x0f)];
/* Both counts still positive means the loop ended on a NUL byte in src
   while room remained in the destination. */
340 if (cch_src > 0 && cch_dest > 0) {
/* Distance the write cursor has moved from the start of odest. */
344 return (int)(dest - odest);
351 /*! \brief Normalize a UTF-8 string.
353 \param[in] src String to normalize.
355 \param[in] cch_src : Count of characters in src. If this value is
356 -1, then src is assumed to be NULL terminated. The translated
357 string will be NULL terminated only if this is -1 or the count
358 includes the terminating NULL.
360 \param[out] adest : Destination string. Only considered valid if
361 \a cch_adest is non-zero.
363 \param[in] cch_adest : Number of characters in the destination
364 string. If this is zero, then the return value is the number
367 \return If \a cch_adest is non-zero, then the return value is the
368 number of bytes stored into adest. If \a cch_adest is zero,
369 then the return value is the number of bytes required. In both
370 cases, the return value is 0 if the call was unsuccessful.
372 long cm_NormalizeUtf8String(const char * src, int cch_src,
373 char * adest, int cch_adest)
/* Converts src to UTF-16, normalizes it with NormalizeUtf16String(), and
   converts the result back to UTF-8 into adest.  Input that has no Unicode
   translation as UTF-8 is run through sanitize_bytestring() and converted
   again as code page 1252. */
/* Fixed-size UTF-16 staging buffer shared by both conversion attempts. */
375 wchar_t wsrcbuf[NLSMAXCCH];
380 /* Get some edge cases out first, so we don't have to worry about
381 cch_src being 0 etc. */
384 } else if (*src == '\0') {
/* The +1 counts the terminator, so it is carried through the conversions. */
391 cch_src = strlen(src) + 1;
394 cch = MultiByteToWideChar(CP_UTF8, 0, src,
395 cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
398 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
/* Holds the escaped copy of src for the second conversion attempt. */
399 char sanitized[NLSMAXCCH];
402 /* If src doesn't have a unicode translation, then it
403 wasn't valid UTF-8. In this case, we assume that src
404 is CP-1252 and then try to convert again. But before
405 that, we use a translation table to "sanitize" the
408 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
409 sizeof(sanitized)/sizeof(char));
411 if (cch_sanitized == 0) {
/* Second attempt: same staging buffer, input taken as code page 1252. */
418 cch = MultiByteToWideChar(1252, 0, sanitized,
419 cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
421 /* Well, that didn't work either. Something is very wrong. */
/* No destination buffer is supplied, which leaves the allocation of the
   result to NormalizeUtf16String(). */
433 wnorm = NormalizeUtf16String(wsrcbuf, cch, NULL, &cch_norm);
/* Encode the normalized UTF-16 text back to UTF-8 in the caller's buffer. */
441 cch = WideCharToMultiByte(CP_UTF8, 0, wnorm,
442 cch_norm, adest, cch_adest * sizeof(char),
451 /*! \brief Case insensitive comparison with specific length
453 \param[in] str1 First string to compare. Assumed to be encoded in UTF-8.
455 \param[in] str2 Second string to compare. Assumed to be encoded in UTF-8.
457 \param[in] n Max byte count.
460 int cm_strnicmp_utf8(const char * str1, const char * str2, int n)
/* Converts n bytes of each argument from UTF-8 to UTF-16 in fixed
   NLSMAXCCH-element stack buffers and compares the results with
   CompareStringW() under LOCALE_INVARIANT with NORM_IGNORECASE. */
462 wchar_t wstr1[NLSMAXCCH];
465 wchar_t wstr2[NLSMAXCCH];
468 /* first check for NULL pointers */
474 } else if (str2 == NULL) {
/* NOTE(review): with a positive byte count MultiByteToWideChar() processes
   exactly that many bytes, so a string shorter than n would be read past
   its terminator, and n may also cut a multi-byte sequence in half --
   confirm how callers choose n. */
478 len1 = MultiByteToWideChar(CP_UTF8, 0, str1, n, wstr1, NLSMAXCCH);
486 len2 = MultiByteToWideChar(CP_UTF8, 0, str2, n, wstr2, NLSMAXCCH);
/* NOTE(review): CompareStringW() yields CSTR_LESS_THAN / CSTR_EQUAL /
   CSTR_GREATER_THAN (1 / 2 / 3), or 0 on failure, not a strcmp-style
   value; confirm rv is translated before it is returned. */
494 rv = CompareStringW(LOCALE_INVARIANT, NORM_IGNORECASE, wstr1, len1, wstr2, len2);
505 int cm_stricmp_utf8(const char * str1, const char * str2)
/* Converts both NUL-terminated UTF-8 arguments to UTF-16 in fixed
   NLSMAXCCH-element stack buffers and compares the results with
   CompareStringW() under LOCALE_INVARIANT with NORM_IGNORECASE. */
507 wchar_t wstr1[NLSMAXCCH];
510 wchar_t wstr2[NLSMAXCCH];
513 /* first check for NULL pointers */
519 } else if (str2 == NULL) {
/* A byte count of -1 tells MultiByteToWideChar() the input is
   NUL-terminated; the returned length then includes the terminator. */
523 len1 = MultiByteToWideChar(CP_UTF8, 0, str1, -1, wstr1, NLSMAXCCH);
531 len2 = MultiByteToWideChar(CP_UTF8, 0, str2, -1, wstr2, NLSMAXCCH);
/* NOTE(review): CompareStringW() yields CSTR_LESS_THAN / CSTR_EQUAL /
   CSTR_GREATER_THAN (1 / 2 / 3), or 0 on failure, not a strcmp-style
   value; confirm rv is translated before it is returned. */
539 rv = CompareStringW(LOCALE_INVARIANT, NORM_IGNORECASE, wstr1, len1, wstr2, len2);
550 wchar_t * strupr_utf16(wchar_t * wstr, size_t cbstr)
/* Upper-cases wstr by mapping it into a stack buffer with LCMapStringW()
   and copying the result back over the input.  cbstr is a size in bytes. */
552 wchar_t wstrd[NLSMAXCCH];
/* Byte size converted to a wchar_t element count. */
555 len = cbstr / sizeof(wchar_t);
556 len = LCMapStringW(LOCALE_INVARIANT, LCMAP_UPPERCASE, wstr, len, wstrd, NLSMAXCCH);
/* NOTE(review): the LCMapStringW() result is not examined before this
   copy.  If the mapping fails (it returns 0, e.g. when the text does not
   fit in NLSMAXCCH elements), wstrd is uninitialized here -- confirm and
   handle the failure case. */
557 StringCbCopyW(wstr, cbstr, wstrd);
562 char * strupr_utf8(char * str, size_t cbstr)
/* Upper-cases a NUL-terminated UTF-8 string in three steps: convert to
   UTF-16, map to upper case with LCMapStringW(), and convert back to
   UTF-8, writing at most cbstr bytes into str. */
564 wchar_t wstr[NLSMAXCCH];
565 wchar_t wstrd[NLSMAXCCH];
/* With a byte count of -1 the returned length includes the terminator. */
569 len = MultiByteToWideChar(CP_UTF8, 0, str, -1, wstr, NLSMAXCCH);
573 len = LCMapStringW(LOCALE_INVARIANT, LCMAP_UPPERCASE, wstr, len, wstrd, NLSMAXCCH);
/* The -1 length requires wstrd to be NUL-terminated.  The last argument is
   a pointer parameter; FALSE there acts as a null pointer. */
575 len = WideCharToMultiByte(CP_UTF8, 0, wstrd, -1, str, cbstr, NULL, FALSE);
580 char * char_next_utf8(const char * c)
/* CH reads the byte at c as an unsigned value, so the bit tests on it do
   not depend on the signedness of char. */
582 #define CH (*((const unsigned char *)c))
/* High bit clear: the byte at c is a single-byte (ASCII) character. */
584 if ((CH & 0x80) == 0)
606 char * char_prev_utf8(const char * c)
608 #define CH (*((const unsigned char *)c))
612 if ((CH & 0x80) == 0)
615 while ((CH & 0xc0) == 0x80)