2 * Copyright (c) 2008 Secure Endpoints Inc.
4 * Permission is hereby granted, free of charge, to any person
5 * obtaining a copy of this software and associated documentation
6 * files (the "Software"), to deal in the Software without
7 * restriction, including without limitation the rights to use, copy,
8 * modify, merge, publish, distribute, sublicense, and/or sell copies
9 * of the Software, and to permit persons to whom the Software is
10 * furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 #include <afsconfig.h>
26 #include <afs/param.h>
40 /* This is part of the Microsoft Internationalized Domain Name
42 #include <normalization.h>
44 /* TODO: All the normalization and conversion code should NUL
45 terminate destination strings. */
// Entry points that cm_InitNormalization resolves from NLSDLLNAME with
// GetProcAddress. Both start out NULL.
48 (WINAPI *pNormalizeString)( __in NORM_FORM NormForm,
49 __in_ecount(cwSrcLength) LPCWSTR lpSrcString,
51 __out_ecount(cwDstLength) LPWSTR lpDstString,
52 __in int cwDstLength ) = NULL;
55 (WINAPI *pIsNormalizedString)( __in NORM_FORM NormForm,
56 __in_ecount(cwLength) LPCWSTR lpString,
57 __in int cwLength ) = NULL;
// Library name handed to LoadLibrary.
60 #define NLSDLLNAME "Normaliz.dll"
// Element count of the fixed-size scratch buffers declared by the converters
// in this file.
61 #define NLSMAXCCH 1024
// Normalization form passed to both entry points.
64 #define AFS_NORM_FORM NormalizationC
// Locale passed to CompareStringW and LCMapStringW; cm_InitNormalization may
// replace it with en-US.
66 static LCID nls_lcid = LOCALE_INVARIANT;
// NOTE(review): none of the visible lines read or write nls_init; presumably
// an already-initialized flag - confirm against the complete file.
68 static int nls_init = 0;
// Inspects the running OS version through GetVersionEx.
// NOTE(review): the two function-local statics suggest the answer is computed
// once and cached, but the lines that set and return them are not visible
// here - confirm against the complete file.
71 is_windows_2000 (void)
73 static BOOL fChecked = FALSE;
74 static BOOL fIsWin2K = FALSE;
78 OSVERSIONINFO Version;
// The structure is zeroed and its size member filled in before the query.
80 memset (&Version, 0x00, sizeof(Version));
81 Version.dwOSVersionInfoSize = sizeof(Version);
83 if (GetVersionEx (&Version))
// NOTE(review): this condition holds for every NT-family release with major
// version 5 or later, not only for Windows 2000.
85 if (Version.dwPlatformId == VER_PLATFORM_WIN32_NT &&
86 Version.dwMajorVersion >= 5)
95 long cm_InitNormalization(void)
99 if (pNormalizeString != NULL)
102 h_Nls = LoadLibrary(NLSDLLNAME);
103 if (h_Nls == INVALID_HANDLE_VALUE) {
108 (int (WINAPI *)( NORM_FORM, LPCWSTR,
110 GetProcAddress(h_Nls, "NormalizeString");
112 pIsNormalizedString =
114 (WINAPI *)( NORM_FORM, LPCWSTR, int ))
115 GetProcAddress(h_Nls, "IsNormalizedString");
117 if (is_windows_2000())
118 nls_lcid = MAKELCID(MAKELANGID(LANG_ENGLISH, SUBLANG_ENGLISH_US), SORT_DEFAULT);
122 return (pNormalizeString && pIsNormalizedString);
125 /*! \brief Normalize a UTF-16 string.
127 If the supplied destination buffer is insufficient or NULL, then a
128 new buffer will be allocated to hold the normalized string.
130 \param[in] src : Source UTF-16 string. Length is specified in
133 \param[in] cch_src : The character count in cch_src is assumed to
134 be tight and include the terminating NULL character if there is
135 one. If the NULL is absent, the resulting string will not be
138 \param[out] ext_dest : The destination buffer. Can be NULL, in
139 which case *pcch_dest MUST be 0.
141 \param[in,out] pcch_dest : On entry *pcch_dest contains a count of
142 characters in the destination buffer. On exit, it will contain
143 a count of characters that were copied to the destination
146 Returns a pointer to the buffer containing the normalized string or
147 NULL if the call was unsuccessful. If the returned destination
148 buffer is different from the supplied buffer and non-NULL, it
149 should be freed using free().
// Normalizes src to AFS_NORM_FORM, writing into ext_dest when it is large
// enough and into a buffer from malloc otherwise.
152 NormalizeUtf16String(const wchar_t * src, int cch_src, wchar_t * ext_dest, int *pcch_dest)
155 cm_InitNormalization();
158 assert (pNormalizeString != NULL && pIsNormalizedString != NULL);
162 cch_src = (int)wcslen(src) + 1;
// Copy-only path: taken when the source already reports as normalized, or
// when no normalizer entry point is available.
164 if ((pIsNormalizedString && (*pIsNormalizedString)(AFS_NORM_FORM, src, cch_src)) ||
165 (!pNormalizeString)) {
// A replacement buffer is allocated when the caller supplied none or one
// smaller than the source.
167 if (ext_dest == NULL || *pcch_dest < cch_src) {
168 ext_dest = malloc(cch_src * sizeof(wchar_t));
169 *pcch_dest = cch_src;
172 /* No need to or unable to normalize. Just copy the string.
173 Note that the string is not NUL terminated if the source
174 string is not NUL terminated. */
// NOTE(review): no NULL check on the malloc result is visible before this
// copy - confirm one exists in the elided lines.
177 memcpy(ext_dest, src, cch_src * sizeof(wchar_t));
178 *pcch_dest = cch_src;
190 int cch_dest = *pcch_dest;
// Bounded retry loop. Each size-related outcome below recomputes cch_dest with
// NLSERRCCH added on top.
194 while (tries-- > 0) {
196 rv = (*pNormalizeString)(AFS_NORM_FORM, src, cch_src, dest, cch_dest);
198 if (rv <= 0 && (gle = GetLastError()) != ERROR_SUCCESS) {
199 if (gle == ERROR_INSUFFICIENT_BUFFER) {
201 /* The buffer wasn't big enough. We are going to
202 try allocating one. */
// The negated return value is used as the size estimate for the next attempt.
204 cch_dest = (-rv) + NLSERRCCH;
208 /* Something else is wrong */
212 } else if (rv < 0) { /* rv < 0 && gle == ERROR_SUCCESS */
214 /* Technically not one of the expected outcomes */
217 } else { /* rv > 0 || (rv == 0 && gle == ERROR_SUCCESS) */
219 /* Possibly succeeded */
221 if (rv == 0) { /* Succeeded and the return string is empty */
227 /* Nope. We only calculated the required size of the buffer */
229 cch_dest = rv + NLSERRCCH;
237 /* Can't NUL terminate */
238 cch_dest = max(rv,cch_dest) + NLSERRCCH;
// Both guards below exclude the caller buffer ext_dest; the guarded statements
// are not visible (presumably they release dest - confirm).
247 if (dest != ext_dest && dest)
249 dest = malloc(cch_dest * sizeof(wchar_t));
254 if (dest != ext_dest && dest)
262 /*! \brief Normalize a Unicode string into a newly allocated buffer
264 The input string will be normalized using NFC.
266 \param[in] s UTF-16 string to be normalized.
268 \param[in] cch_src The number of characters in the input string. If
269 this is -1, then the input string is assumed to be NUL
272 \param[out] pcch_dest Receives the number of characters copied to
273 the output buffer. Note that the character count is the number
274 of wchar_t characters copied, and not the count of Unicode code
275 points. This includes the terminating NUL if cch_src was -1 or
276 included the terminating NUL.
278 \return A newly allocated buffer holding the normalized string or
279 NULL if the call failed.
// Normalizes s through NormalizeUtf16String with no caller buffer (NULL is
// passed as the destination) and reports the resulting count via pcch_dest.
281 cm_normchar_t * cm_NormalizeStringAlloc(const cm_unichar_t * s, int cch_src, int *pcch_dest)
287 cm_InitNormalization();
// Degenerate input: NULL pointer, zero count, or empty string. The count
// reported is 0 when cch_src is 0 and 1 otherwise.
289 if (s == NULL || cch_src == 0 || *s == L'\0') {
291 *pcch_dest = ((cch_src != 0)? 1: 0);
295 r = NormalizeUtf16String(s, cch_src, NULL, &cch_dest);
298 *pcch_dest = cch_dest;
// Normalizes s into the caller supplied buffer dest via NormalizeUtf16String.
// NOTE(review): the lines that initialize tcch, inspect r and compute the
// return value are not visible here.
303 int cm_NormalizeString(const cm_unichar_t * s, int cch_src,
304 cm_normchar_t * dest, int cch_dest)
310 cm_InitNormalization();
312 r = NormalizeUtf16String(s, cch_src, dest, &tcch);
315 /* The supplied buffer was insufficient */
323 /*! \brief Convert a UTF-16 string to a UTF-8 string using a newly allocated buffer
325 \param[in] s UTF-16 source string
327 \param[in] cch_src Number of characters in \a s. This can be set to
328 -1 if \a s is NUL terminated.
330 \param[out] pcch_dest Receives a count of characters that were
331 copied to the target buffer.
333 \return A newly allocated buffer holding the UTF-8 string.
// Converts s to UTF-8 in a buffer obtained from malloc, using the usual
// two-call WideCharToMultiByte sequence (size query, then conversion).
336 cm_utf8char_t * cm_Utf16ToUtf8Alloc(const cm_unichar_t * s, int cch_src, int *pcch_dest)
339 cm_utf8char_t * dest;
342 cm_InitNormalization();
// Degenerate input: the count reported is 0 when cch_src is 0 and 1 otherwise.
344 if (s == NULL || cch_src == 0 || *s == L'\0') {
346 *pcch_dest = ((cch_src != 0)?1:0);
// First call passes no output buffer and only measures the required size.
350 cch_dest = WideCharToMultiByte(CP_UTF8, 0, s, cch_src, NULL, 0, NULL, FALSE);
354 *pcch_dest = cch_dest;
// One element more than the measured size is allocated.
358 dest = malloc((cch_dest + 1) * sizeof(cm_utf8char_t));
// NOTE(review): the last argument is a pointer parameter; FALSE is used here
// as a null pointer constant.
360 WideCharToMultiByte(CP_UTF8, 0, s, cch_src, dest, cch_dest, NULL, FALSE);
364 *pcch_dest = cch_dest;
// Thin wrapper: converts src to UTF-8 in the caller buffer and returns the
// WideCharToMultiByte result unchanged.
369 int cm_Utf16ToUtf8(const cm_unichar_t * src, int cch_src,
370 cm_utf8char_t * dest, int cch_dest)
373 cm_InitNormalization();
375 return WideCharToMultiByte(CP_UTF8, 0, src, cch_src, dest, cch_dest, NULL, FALSE);
// Copies a UTF-16 string without conversion. Two copy paths are visible; the
// condition that selects between them is not (presumably cch_src == -1 picks
// the first - confirm).
378 int cm_Utf16ToUtf16(const cm_unichar_t * src, int cch_src,
379 cm_unichar_t * dest, int cch_dest)
382 cm_InitNormalization();
// Path 1: bounded string copy; the return value counts the terminator.
385 StringCchCopyW(dest, cch_dest, src);
386 return (int)wcslen(dest) + 1;
// Path 2: raw copy of the smaller of the two counts; no terminator is added
// by the visible lines.
388 int cch_conv = min(cch_src, cch_dest);
389 memcpy(dest, src, cch_conv * sizeof(cm_unichar_t));
394 /*! \brief Normalize a UTF-16 string into a UTF-8 string.
396 \param[in] src : Source string.
398 \param[in] cch_src : Count of characters in src. If the count includes the
399 NULL terminator, then the resulting string will be NULL
400 terminated. If it is -1, then src is assumed to be NULL
403 \param[out] adest : Destination buffer.
405 \param[in] cch_adest : Number of characters in the destination buffer.
407 Returns the number of characters stored into adest. This will
408 include the terminating NULL if cch_src included the terminating
409 NULL or was -1. If this is 0, then the operation was unsuccessful.
// Normalizes src and converts the result to UTF-8 in adest.
411 long cm_NormalizeUtf16StringToUtf8(const wchar_t * src, int cch_src,
412 char * adest, int cch_adest)
415 cm_InitNormalization();
// The source length is measured with an upper bound of NLSMAXCCH characters;
// the measured count plus one for the terminator becomes cch_src.
420 if (FAILED(StringCchLengthW(src, NLSMAXCCH, &cch)))
423 cch_src = (int)cch+1;
// Normalization first targets the stack buffer nbuf; NormalizeUtf16String may
// return a different buffer instead.
427 wchar_t nbuf[NLSMAXCCH];
428 wchar_t * normalized;
429 int cch_norm = NLSMAXCCH;
431 normalized = NormalizeUtf16String(src, cch_src, nbuf, &cch_norm);
// NOTE(review): no NULL check on normalized is visible before this call.
433 cch_adest = WideCharToMultiByte(CP_UTF8, 0, normalized, cch_norm,
434 adest, cch_adest, NULL, 0);
// Only a buffer other than nbuf is a candidate for release; the guarded
// statement is not visible.
436 if (normalized != nbuf && normalized)
/* Marker bit set on table entries whose byte must be escaped. */
#define ESCVAL 0x1000
#define Esc(c) (ESCVAL + (short)(c))
#define IS_ESCAPED(c) (((c) & ESCVAL) == ESCVAL)

/*! \brief Character sanitization map for CP-1252

    The following map indicates which characters should be escaped in
    the CP-1252 character map.  Characters that are documented as
    illegal characters in a file name are marked as escaped.  Escaped
    characters are marked using the ::Esc macro defined above.  The
    following exceptions apply:

    - Path delimiters '\\' and '/' are NOT escaped because the
      sanitization map applies to paths.  While those characters are
      illegal in filenames, they are legal in paths.

    - Wildcard characters '*' and '?' ARE escaped.  The document
      referred below does not specify these characters as invalid.
      Since no other escape mechanism exists, names containing
      wildcards are indistinguishable from actual wildcards used in SMB
      requests.

    - Reserved names are not and cannot be represented in this map.
      These include:
      CON, PRN, AUX, NUL, COM1, COM2, COM3, COM4, COM5, COM6, COM7,
      COM8, COM9, LPT1, LPT2, LPT3, LPT4, LPT5, LPT6, LPT7, LPT8, LPT9.
      Reserved names with extensions are also invalid. (i.e. NUL.txt)

    - Characters 0x80, 0x81, 0x8d, 0x8e, 0x8f, 0x90, 0x9d and 0x9e are
      also escaped.  0x9f is NOT escaped by the table.
      NOTE(review): of these, only 0x81, 0x8d, 0x8f, 0x90 and 0x9d are
      unassigned in CP-1252.  The wider set is kept unchanged because
      altering it would change how existing names are mapped.

    \note The only bit we are actually interested in from the following
      table is the ESCVAL bit.  However, the characters themselves are
      included for ease of maintenance.

    \see "Naming a File" topic in the Windows SDK.
 */
static const short sanitized_escapes_1252[] = {
    Esc(0x00),Esc(0x01),Esc(0x02),Esc(0x03),Esc(0x04),Esc(0x05),Esc(0x06),Esc(0x07),
    Esc(0x08),Esc(0x09),Esc(0x0a),Esc(0x0b),Esc(0x0c),Esc(0x0d),Esc(0x0e),Esc(0x0f),
    Esc(0x10),Esc(0x11),Esc(0x12),Esc(0x13),Esc(0x14),Esc(0x15),Esc(0x16),Esc(0x17),
    Esc(0x18),Esc(0x19),Esc(0x1a),Esc(0x1b),Esc(0x1c),Esc(0x1d),Esc(0x1e),Esc(0x1f),
    ' ','!',Esc('"'),'#','$','%','&','\'','(',')',Esc('*'),'+',',','-','.','/',
    '0','1','2','3','4','5','6','7','8','9',Esc(':'),';',Esc('<'),'=',Esc('>'),Esc('?'),
    '@','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
    'P','Q','R','S','T','U','V','W','X','Y','Z','[','\\',']','^','_',
    '`','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o',
    'p','q','r','s','t','u','v','w','x','y','z','{',Esc('|'),'}','~',Esc(0x7f),
    Esc(0x80),Esc(0x81),0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,Esc(0x8d),Esc(0x8e),Esc(0x8f),
    Esc(0x90),0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,Esc(0x9d),Esc(0x9e),0x9f,
    0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
    0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
    0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
    0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf,
    0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
    0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff
};

/*! \brief Escape the bytes of a CP-1252 string that the map above flags.

    Copies \a src to \a odest, replacing every flagged byte with a '%'
    followed by two lowercase hex digits.  Copying stops at the first NUL
    byte, when \a cch_src bytes have been consumed, or when the
    destination is full, whichever comes first.

    \param[in] src       Source bytes.
    \param[in] cch_src   Number of bytes available in \a src.  If the count
        covers the terminating NUL, the output is NUL terminated as well
        (space permitting).
    \param[out] odest    Destination buffer.
    \param[in] cch_dest  Capacity of \a odest in bytes.

    \return The number of bytes stored in \a odest, including the
        terminator when one was written.  Returns 0, with \a odest set to
        the empty string, when an escape sequence does not fit.

    \note This helper only touches the byte buffers, so it does not call
        cm_InitNormalization(); every caller in this file has already
        done so.
 */
static int sanitize_bytestring(const char * src, int cch_src,
                               char * odest, int cch_dest)
{
    static const char hex[] =
        {'0','1','2','3','4','5','6','7',
         '8','9','a','b','c','d','e','f'};
    /* Read the input as unsigned bytes: plain char may be signed, and a
       negative value must never be used as a table index. */
    const unsigned char * in = (const unsigned char *) src;
    char * dest = odest;

    while (cch_src > 0 && *in != 0 && cch_dest > 0) {
        const unsigned char ch = *in;

        if (IS_ESCAPED(sanitized_escapes_1252[ch])) {
            /* An escape needs three output bytes. */
            if (cch_dest < 3) {
                *odest = '\0';
                return 0;
            }

            *dest++ = '%';
            *dest++ = hex[(ch >> 4) & 0x0f];
            *dest++ = hex[ch & 0x0f];
            cch_dest -= 3;
        } else {
            *dest++ = (char) ch;
            cch_dest--;
        }

        in++;
        cch_src--;
    }

    /* Source bytes remain only when the loop stopped on a NUL (or on a
       full destination); terminate if there is still room. */
    if (cch_src > 0 && cch_dest > 0) {
        *dest++ = '\0';
    }

    return (int)(dest - odest);
}
// Writes c at *pdest as a percent sign followed by four lowercase hex digits.
// StringCchPrintfExW receives pdest and pcch as its end-pointer and
// remaining-count outputs, so both are advanced past the written text.
// NOTE(review): any capacity check and the return statements are not visible.
555 static int sanitize_utf16char(wchar_t c, wchar_t ** pdest, size_t * pcch)
558 StringCchPrintfExW(*pdest, *pcch, pdest, pcch, 0, L"%%%04x", (int) c);
// Copies src to dest, passing every unpaired surrogate through
// sanitize_utf16char and copying everything else as is. The first loop only
// measures; callers in this file reach it by passing a NULL dest with a zero
// capacity.
565 static int sanitize_utf16string(const wchar_t * src, size_t cch_src,
566 wchar_t * dest, size_t cch_dest)
// Original capacity, kept so the final return can report how much was used.
568 int cch_dest_o = cch_dest;
571 /* only estimating */
572 for (cch_dest = 0; cch_src > 0;) {
// High surrogate: dangling when it is the last unit or the next unit is not a
// low surrogate.
573 if (*src >= 0xd800 && *src < 0xdc00) {
574 if (cch_src <= 1 || src[1] < 0xdc00 || src[1] > 0xdfff) {
575 /* dangling surrogate */
585 } else if (*src >= 0xdc00 && *src <= 0xdfff) {
586 /* dangling surrogate */
// Copy loop: same classification as the measuring loop above.
600 while (cch_src > 0 && cch_dest > 0) {
601 if (*src >= 0xd800 && *src < 0xdc00) {
602 if (cch_src <= 1 || src[1] < 0xdc00 || src[1] > 0xdfff) {
603 if (!sanitize_utf16char(*src++, &dest, &cch_dest))
607 /* found a surrogate pair */
// NOTE(review): cch_dest is a size_t and the loop guard only ensures it is
// non-zero; with a single slot left this subtraction would wrap unless an
// elided line checks for it - confirm.
610 cch_dest -= 2; cch_src -= 2;
612 } else if (*src >= 0xdc00 && *src <= 0xdfff) {
613 if (!sanitize_utf16char(*src++, &dest, &cch_dest))
618 cch_dest--; cch_src--;
// Reports the number of units written, or 0 when source units remain.
622 return (cch_src == 0) ? cch_dest_o - cch_dest : 0;
// Converts src (UTF-8, with a CP-1252 fallback) to UTF-16 in a stack buffer of
// NLSMAXCCH characters, then normalizes the result into dest.
629 long cm_NormalizeUtf8StringToUtf16(const char * src, int cch_src,
630 wchar_t * dest, int cch_dest)
632 wchar_t wsrcbuf[NLSMAXCCH];
638 cm_InitNormalization();
640 /* Get some edge cases out first, so we don't have to worry about
641 cch_src being 0 etc. */
644 } else if (*src == '\0') {
650 if (dest && cch_dest > 0) {
655 cch_src = (int)strlen(src) + 1;
// Strict UTF-8 decode: MB_ERR_INVALID_CHARS makes malformed input fail
// instead of being substituted.
658 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
659 cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
// A successful decode can still hold unpaired surrogates; those are escaped
// through a second stack buffer and copied back.
661 if (cch != 0 && !cm_is_valid_utf16(wsrcbuf, cch)) {
662 wchar_t wsanitized[NLSMAXCCH];
664 /* We successfully converted, but the resulting UTF-16 string
665 has dangling surrogates. We should try and escape those
667 cch = sanitize_utf16string(wsrcbuf, cch, wsanitized, NLSMAXCCH);
669 memcpy(wsrcbuf, wsanitized, cch * sizeof(wchar_t));
674 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
675 char sanitized[NLSMAXCCH];
678 /* If src doesn't have a unicode translation, then it
679 wasn't valid UTF-8. In this case, we assume that src
680 is CP-1252 and then try to convert again. But before
681 that, we use a translation table to "sanitize" the
684 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
685 sizeof(sanitized)/sizeof(char));
687 if (cch_sanitized == 0) {
694 cch = MultiByteToWideChar(1252, 0, sanitized,
695 cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
697 /* Well, that didn't work either. Something is very wrong. */
// NOTE(review): the initialization of cch_norm and the comparison of wnorm
// against dest are not visible here.
709 wnorm = NormalizeUtf16String(wsrcbuf, cch, dest, &cch_norm);
718 /* The buffer was insufficient */
719 if (dest != NULL && cch_dest > 1) {
// Same conversion pipeline as cm_NormalizeUtf8StringToUtf16, except that
// NormalizeUtf16String is called without a caller buffer and the resulting
// count is reported through pcch_dest.
730 cm_normchar_t *cm_NormalizeUtf8StringToUtf16Alloc(const cm_utf8char_t * src, int cch_src,
733 wchar_t wsrcbuf[NLSMAXCCH];
739 cm_InitNormalization();
741 /* Get some edge cases out first, so we don't have to worry about
742 cch_src being 0 etc. */
743 if (cch_src == 0 || src == NULL || *src == '\0') {
745 *pcch_dest = ((cch_src != 0)? 1 : 0);
750 cch_src = (int)strlen(src) + 1;
// Strict UTF-8 decode into the stack buffer.
753 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
754 cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
756 if (cch != 0 && !cm_is_valid_utf16(wsrcbuf, cch)) {
757 wchar_t wsanitized[NLSMAXCCH];
759 /* We successfully converted, but the resulting UTF-16 string
760 has dangling surrogates. We should try and escape those
762 cch = sanitize_utf16string(wsrcbuf, cch, wsanitized, NLSMAXCCH);
764 memcpy(wsrcbuf, wsanitized, cch * sizeof(wchar_t));
769 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
770 char sanitized[NLSMAXCCH];
773 /* If src doesn't have a unicode translation, then it
774 wasn't valid UTF-8. In this case, we assume that src
775 is CP-1252 and then try to convert again. But before
776 that, we use a translation table to "sanitize" the
779 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
780 sizeof(sanitized)/sizeof(char));
782 if (cch_sanitized == 0) {
789 cch = MultiByteToWideChar(1252, 0, sanitized,
790 cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
792 /* Well, that didn't work either. Something is very wrong. */
804 wnorm = NormalizeUtf16String(wsrcbuf, cch, NULL, &cch_norm);
813 *pcch_dest = cch_norm;
// Converts src (UTF-8, with a CP-1252 fallback) to UTF-16 directly in the
// caller buffer dest; no normalization call is visible in this function.
818 int cm_Utf8ToUtf16(const cm_utf8char_t * src, int cch_src,
819 cm_unichar_t * dest, int cch_dest)
823 if (cch_dest >= 1 && dest != NULL) {
828 cm_InitNormalization();
831 cch_src = (int)strlen(src) + 1;
834 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
835 cch_src * sizeof(char), dest, cch_dest);
837 if (cch != 0 && !cm_is_valid_utf16(dest, cch)) {
838 wchar_t wsanitized[NLSMAXCCH];
// NOTE(review): escaping lengthens the string (six characters are formatted
// per unpaired surrogate), yet the copy back into dest below is sized by the
// new cch with no visible comparison against cch_dest - confirm an elided
// line bounds it.
840 cch = sanitize_utf16string(dest, cch, wsanitized, NLSMAXCCH);
842 memcpy(dest, wsanitized, cch * sizeof(wchar_t));
847 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
848 char sanitized[NLSMAXCCH];
851 /* If src doesn't have a unicode translation, then it
852 wasn't valid UTF-8. In this case, we assume that src
853 is CP-1252 and then try to convert again. But before
854 that, we use a translation table to "sanitize" the
857 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
858 sizeof(sanitized)/sizeof(char));
860 if (cch_sanitized == 0) {
867 cch = MultiByteToWideChar(1252, 0, sanitized,
868 cch_sanitized * sizeof(char), dest, cch_dest);
870 /* Well, that didn't work either. Something is very wrong. */
// Converts src (UTF-8, with a CP-1252 fallback) to UTF-16 in a buffer from
// malloc. Each conversion is done as a size query followed by the real call.
887 cm_unichar_t * cm_Utf8ToUtf16Alloc(const cm_utf8char_t * src, int cch_src, int *pcch_dest)
889 cm_unichar_t * ustr = NULL;
893 cm_InitNormalization();
// Degenerate input: the count reported is 0 when cch_src is 0 and 1 otherwise.
895 if (cch_src == 0 || src == NULL || *src == '\0') {
897 *pcch_dest = ((cch_src != 0)? 1 : 0);
902 cch_src = (int)strlen(src) + 1;
// Size query for the strict UTF-8 decode.
905 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
906 cch_src * sizeof(char), NULL, 0);
909 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
910 char sanitized[NLSMAXCCH];
913 /* If src doesn't have a unicode translation, then it
914 wasn't valid UTF-8. In this case, we assume that src
915 is CP-1252 and then try to convert again. But before
916 that, we use a translation table to "sanitize" the
919 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
920 sizeof(sanitized)/sizeof(char));
922 if (cch_sanitized == 0) {
929 cch = MultiByteToWideChar(1252, 0, sanitized,
930 cch_sanitized * sizeof(char), NULL, 0);
932 /* Well, that didn't work either. Something is very wrong. */
// One element more than the measured size is allocated on both paths.
939 ustr = malloc((cch + 1) * sizeof(wchar_t));
941 cch = MultiByteToWideChar(1252, 0, sanitized,
942 cch_sanitized * sizeof(char), ustr, cch);
949 ustr = malloc((cch + 1) * sizeof(wchar_t));
951 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
952 cch_src * sizeof(char), ustr, cch);
// Unpaired surrogates are escaped into a second allocation: one measuring call
// with no buffer, then the real call.
955 if (!cm_is_valid_utf16(ustr, cch)) {
956 cm_unichar_t * us = NULL;
959 cch_s = sanitize_utf16string(ustr, cch, NULL, 0);
961 us = malloc(cch_s * sizeof(wchar_t));
962 cch_s = sanitize_utf16string(ustr, cch, us, cch_s);
986 /*! \brief Normalize a UTF-8 string.
988 \param[in] src String to normalize.
990 \param[in] cch_src : Count of characters in src. If this value is
991 -1, then src is assumed to be NULL terminated. The translated
992 string will be NULL terminated only if this is -1 or the count
993 includes the terminating NULL.
995 \param[out] adest : Destination string. Only considered valid if
996 \a cch_adest is non-zero.
998 \param[in] cch_adest : Number of characters in the destination
999 string. If this is zero, then the return value is the number
1002 \return If \a cch_adest is non-zero, then the return value is the
1003 number of bytes stored into adest. If \a cch_adest is zero,
1004 then the return value is the number of bytes required. In both
1005 cases, the return value is 0 if the call was unsuccessful.
// UTF-8 in, normalized UTF-8 out: decodes src to UTF-16 (with a CP-1252
// fallback), normalizes it, then encodes the result into adest.
1007 long cm_NormalizeUtf8String(const char * src, int cch_src,
1008 char * adest, int cch_adest)
1010 wchar_t wsrcbuf[NLSMAXCCH];
1016 cm_InitNormalization();
1018 /* Get some edge cases out first, so we don't have to worry about
1019 cch_src being 0 etc. */
1022 } else if (*src == '\0') {
// A count of -1 means src is NUL terminated; the terminator is counted.
1028 if (cch_src == -1) {
1029 cch_src = (int)strlen(src) + 1;
1032 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
1033 cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
1035 if (cch != 0 && !cm_is_valid_utf16(wsrcbuf, cch)) {
1036 wchar_t wsanitized[NLSMAXCCH];
1038 cch = sanitize_utf16string(wsrcbuf, cch, wsanitized, NLSMAXCCH);
1040 memcpy(wsrcbuf, wsanitized, cch * sizeof(wchar_t));
1045 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
1046 char sanitized[NLSMAXCCH];
1049 /* If src doesn't have a unicode translation, then it
1050 wasn't valid UTF-8. In this case, we assume that src
1051 is CP-1252 and then try to convert again. But before
1052 that, we use a translation table to "sanitize" the
1055 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
1056 sizeof(sanitized)/sizeof(char));
1058 if (cch_sanitized == 0) {
1059 #ifdef DEBUG_UNICODE
1065 cch = MultiByteToWideChar(1252, 0, sanitized,
1066 cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
1068 /* Well, that didn't work either. Something is very wrong. */
1069 #ifdef DEBUG_UNICODE
// No caller buffer is passed, so wnorm comes from NormalizeUtf16String itself.
1080 wnorm = NormalizeUtf16String(wsrcbuf, cch, NULL, &cch_norm);
1081 if (wnorm == NULL) {
1082 #ifdef DEBUG_UNICODE
1088 cch = WideCharToMultiByte(CP_UTF8, 0, wnorm,
1089 cch_norm, adest, cch_adest * sizeof(char),
1098 /*! \brief Case insensitive comparison with specific length
1100 \param[in] str1 First string to compare. Assumed to be encoded in UTF-8.
1102 \param[in] str2 Second string to compare. Assumed to be encoded in UTF-8.
1104 \param[in] n Max byte count.
// Decodes both UTF-8 inputs into stack buffers of NLSMAXCCH characters and
// compares them with CompareStringW under NORM_IGNORECASE.
// NOTE(review): the mapping from the CompareStringW result to the return
// value is not visible here.
1107 int cm_strnicmp_utf8(const char * str1, const char * str2, int n)
1109 wchar_t wstr1[NLSMAXCCH];
1112 wchar_t wstr2[NLSMAXCCH];
1116 cm_InitNormalization();
1121 /* first check for NULL pointers (assume NULL < "") */
1127 } else if (str2 == NULL) {
// NOTE(review): n is passed as the input byte count for both strings; confirm
// that an elided line clamps it for inputs shorter than n bytes.
1131 len1 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str1, n, wstr1, NLSMAXCCH);
1139 len2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str2, n, wstr2, NLSMAXCCH);
1147 rv = CompareStringW(nls_lcid, NORM_IGNORECASE, wstr1, len1, wstr2, len2);
// Case-insensitive comparison of two UTF-16 strings, each measured with an
// upper bound of len characters.
1158 int cm_strnicmp_utf16(const cm_unichar_t * str1, const cm_unichar_t * str2, int len)
1165 cm_InitNormalization();
1170 /* first check for NULL pointers */
1176 } else if (str2 == NULL) {
// Each length is measured with len as the bound. What happens when the
// measurement fails is not visible (presumably the length falls back to len -
// confirm).
1180 if (FAILED(StringCchLengthW(str1, len, &cch1)))
1183 if (FAILED(StringCchLengthW(str2, len, &cch2)))
1186 rv = CompareStringW(nls_lcid, NORM_IGNORECASE, str1, (int)cch1, str2, (int)cch2);
// Case-insensitive comparison of two NUL terminated UTF-16 strings; -1 is
// passed to CompareStringW as the length of both.
1197 int cm_stricmp_utf16(const cm_unichar_t * str1, const cm_unichar_t * str2)
1202 cm_InitNormalization();
1204 /* first check for NULL pointers */
1210 } else if (str2 == NULL) {
1214 rv = CompareStringW(nls_lcid, NORM_IGNORECASE, str1, -1, str2, -1);
// Lowercases str in place: the same pointer is passed to LCMapStringW as
// source and destination, with a length that counts the terminator.
1225 cm_unichar_t *cm_strlwr_utf16(cm_unichar_t * str)
1231 cm_InitNormalization();
1233 len = (int)wcslen(str) + 1;
1234 rv = LCMapStringW(nls_lcid, LCMAP_LOWERCASE, str, len, str, len);
// Uppercases str in place: the same pointer is passed to LCMapStringW as
// source and destination, with a length that counts the terminator.
1244 cm_unichar_t *cm_strupr_utf16(cm_unichar_t * str)
1250 cm_InitNormalization();
1252 len = (int)wcslen(str) + 1;
1253 rv = LCMapStringW(nls_lcid, LCMAP_UPPERCASE, str, len, str, len);
// Decodes both NUL terminated UTF-8 inputs into stack buffers of NLSMAXCCH
// characters and compares them with CompareStringW under NORM_IGNORECASE.
1264 int cm_stricmp_utf8(const char * str1, const char * str2)
1266 wchar_t wstr1[NLSMAXCCH];
1269 wchar_t wstr2[NLSMAXCCH];
1273 cm_InitNormalization();
1275 /* first check for NULL pointers */
1281 } else if (str2 == NULL) {
// A byte count of -1 makes the decoder stop at the terminator.
1285 len1 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str1, -1, wstr1, NLSMAXCCH);
1293 len2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str2, -1, wstr2, NLSMAXCCH);
1301 rv = CompareStringW(nls_lcid, NORM_IGNORECASE, wstr1, len1, wstr2, len2);
// Uppercases wstr through a stack scratch buffer of NLSMAXCCH characters and
// copies the result back. cbstr is a size in bytes.
1313 wchar_t * strupr_utf16(wchar_t * wstr, size_t cbstr)
1315 wchar_t wstrd[NLSMAXCCH];
1319 cm_InitNormalization();
// The byte size is converted to a character count for the mapping call.
1321 len = cbstr / sizeof(wchar_t);
1322 len = LCMapStringW(nls_lcid, LCMAP_UPPERCASE, wstr, len, wstrd, NLSMAXCCH);
1323 StringCbCopyW(wstr, cbstr, wstrd);
// Uppercases a NUL terminated UTF-8 string in three steps: decode to UTF-16,
// map to upper case, encode back into str (capacity cbstr bytes).
1329 char * strupr_utf8(char * str, size_t cbstr)
1331 wchar_t wstr[NLSMAXCCH];
1332 wchar_t wstrd[NLSMAXCCH];
1336 cm_InitNormalization();
1338 len = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, -1, wstr, NLSMAXCCH);
1342 len = LCMapStringW(nls_lcid, LCMAP_UPPERCASE, wstr, len, wstrd, NLSMAXCCH);
1344 len = WideCharToMultiByte(CP_UTF8, 0, wstrd, -1, str, (int)cbstr, NULL, FALSE);
/*! \brief Step to the start of the next UTF-8 encoded character.

    The length of the current sequence is taken from its first byte only:
    ASCII bytes and anything that is not a valid lead byte advance by one,
    lead bytes 0xc0-0xdf by two, 0xe0-0xef by three and 0xf0-0xff by four.
    The continuation bytes themselves are not examined.

    \param[in] c  Pointer to the first byte of a character.

    \return Pointer to the byte following the current sequence.
 */
char * char_next_utf8(const char * c)
{
    const unsigned char lead = *(const unsigned char *) c;
    int step;

    if (lead < 0x80) {
        step = 1;
    } else {
        switch (lead >> 4) {
        case 0xc:
        case 0xd:
            step = 2;
            break;

        case 0xe:
            step = 3;
            break;

        case 0xf:
            step = 4;
            break;

        default:
            /* stray continuation byte */
            step = 1;
            break;
        }
    }

    return (char *) c + step;
}
/*! \brief Step back to the start of the previous UTF-8 encoded character.

    Moves one byte back and then keeps moving back while the byte under
    the pointer is a continuation byte (10xxxxxx).

    \param[in] c  Pointer just past the character of interest.  At least
        one character must precede it in the same buffer; there is no
        lower bound check.

    \return Pointer to the first byte of the preceding character.
 */
char * char_prev_utf8(const char * c)
{
    const unsigned char * p = (const unsigned char *) c - 1;

    if (*p >= 0x80) {
        while ((*p & 0xc0) == 0x80)
            p--;
    }

    return (char *) p;
}
/*! \brief Step to the next character of a UTF-16 string.

    A high surrogate that is followed by a low surrogate is skipped as a
    pair.  Every other code unit, including a dangling high surrogate,
    advances by one, so the terminator of a NUL terminated string is never
    stepped over.  This matches the dangling-surrogate definition used by
    cm_is_valid_utf16().

    \param[in] c  Pointer to the current code unit of a NUL terminated
        string.  When *c is a high surrogate, c[1] is read.

    \return Pointer to the start of the next character.
 */
wchar_t * char_next_utf16(const wchar_t * c)
{
    const unsigned short lead = (unsigned short) c[0];

    if (lead >= 0xd800 && lead <= 0xdbff) {
        const unsigned short trail = (unsigned short) c[1];

        if (trail >= 0xdc00 && trail <= 0xdfff)
            return (wchar_t *) c + 2;
    }

    return (wchar_t *) c + 1;
}
/*! \brief Step back to the previous character of a UTF-16 string.

    Moves back one code unit, and one more if that unit is a low
    surrogate.  The unit before a low surrogate is not inspected.

    \param[in] c  Pointer just past the character of interest; the caller
        guarantees that enough code units precede it.

    \return Pointer to the start of the preceding character.
 */
wchar_t * char_prev_utf16(const wchar_t * c)
{
    const wchar_t * prev = c - 1;
    const unsigned short unit = (unsigned short) *prev;

    if (unit >= 0xdc00 && unit <= 0xdfff)
        prev--;

    return (wchar_t *) prev;
}
/*! \brief Return the start of the character that \a c points into.

    When \a c points at a low surrogate the result is one unit earlier;
    otherwise \a c is returned as is.

    \param[in] c  Pointer to a code unit; if it is a low surrogate, the
        caller guarantees that a unit precedes it.
 */
wchar_t * char_this_utf16(const wchar_t * c)
{
    const unsigned short unit = (unsigned short) *c;
    const int is_trail = (unit >= 0xdc00 && unit <= 0xdfff);

    return (wchar_t *) (is_trail ? c - 1 : c);
}
/*! \brief Check a UTF-16 string for unpaired surrogates.

    \param[in] c    String to check.
    \param[in] cch  Number of code units to examine.  A negative count
        means \a c is NUL terminated; the terminator is then included.

    \return 1 if every high surrogate is immediately followed by a low
        surrogate and no low surrogate stands alone, 0 otherwise.
 */
int cm_is_valid_utf16(const wchar_t * c, int cch)
{
    const wchar_t * p = c;
    int left = cch;

    if (left < 0)
        left = (int) wcslen(p) + 1;

    while (left > 0) {
        const wchar_t unit = *p++;

        left--;

        /* a low surrogate with no high surrogate in front of it */
        if (unit >= 0xdc00 && unit <= 0xdfff)
            return 0;

        if (unit >= 0xd800 && unit < 0xdc00) {
            /* the count is tested first so that nothing past the end
               of the buffer is read */
            if (left == 0 || *p < 0xdc00 || *p > 0xdfff)
                return 0;

            p++;
            left--;
        }
    }

    return 1;
}
/*! \brief Format the code units of a UTF-16 string as hex text.

    Each code unit is written as four lowercase hex digits; units are
    separated by commas.

    \param[in] c    String to dump.
    \param[in] len  Number of code units to dump, or -1 if \a c is NUL
        terminated.

    \return A newly allocated, NUL terminated string ("(empty)" when
        \a len is 0), or NULL if the allocation failed.  Release it with
        free().
 */
wchar_t * cm_GetRawCharsAlloc(const wchar_t * c, int len)
{
    wchar_t * ret;
    wchar_t * current;
    size_t cb;

    if (len == -1)
        len = (int) wcslen(c);

    if (len == 0)
        return wcsdup(L"(empty)");

    /* four digits plus a separator (or the terminator) per code unit */
    cb = len * 5 * sizeof(wchar_t);
    current = ret = malloc(cb);
    if (ret == NULL)
        return NULL;

    for (; len > 0; ++c, --len) {
        /* The strsafe calls advance current and shrink cb as they write.
           Both arguments had been corrupted to a stray currency sign
           followed by "t"; they are the address of current. */
        StringCbPrintfExW(current, cb, &current, &cb, 0,
                          L"%04x", (int) *c);
        if (len > 1)
            StringCbCatExW(current, cb, L",", &current, &cb, 0);
    }

    return ret;
}