2 * Copyright (c) 2008 Secure Endpoints Inc.
4 * Permission is hereby granted, free of charge, to any person
5 * obtaining a copy of this software and associated documentation
6 * files (the "Software"), to deal in the Software without
7 * restriction, including without limitation the rights to use, copy,
8 * modify, merge, publish, distribute, sublicense, and/or sell copies
9 * of the Software, and to permit persons to whom the Software is
10 * furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
38 /* This is part of the Microsoft Internationalized Domain Name
40 #include <normalization.h>
42 /* TODO: All the normalization and conversion code should NUL
43 terminate destination strings. */
// Pointer to the NormalizeString entry point. It starts out NULL; callers in
// this file compare it against NULL before calling through it.
// cm_InitNormalization looks the symbol up with GetProcAddress.
46 (WINAPI *pNormalizeString)( __in NORM_FORM NormForm,
47 __in_ecount(cwSrcLength) LPCWSTR lpSrcString,
49 __out_ecount(cwDstLength) LPWSTR lpDstString,
50 __in int cwDstLength ) = NULL;
// Pointer to the IsNormalizedString entry point; also NULL until
// cm_InitNormalization assigns it.
53 (WINAPI *pIsNormalizedString)( __in NORM_FORM NormForm,
54 __in_ecount(cwLength) LPCWSTR lpString,
55 __in int cwLength ) = NULL;
// Name of the DLL handed to LoadLibrary in cm_InitNormalization.
58 #define NLSDLLNAME "Normaliz.dll"
// Element count of the fixed-size scratch buffers declared in this file
// (wchar_t and char arrays alike).
59 #define NLSMAXCCH 1024
// Normalization form passed to both entry points above.
62 #define AFS_NORM_FORM NormalizationC
// Locale passed to CompareStringW and LCMapStringW. cm_InitNormalization
// replaces it with an English (US) LCID when is_windows_2000() is true.
64 static LCID nls_lcid = LOCALE_INVARIANT;
// NOTE(review): no visible line in this view reads or writes nls_init.
66 static int nls_init = 0;
// Queries the OS version with GetVersionEx and tests it.
// Only some lines of this function are visible; the return statements and
// the use of the two static flags are not shown.
69 is_windows_2000 (void)
// Presumably a cache for the result (checked once, then reused); the lines
// that read and set these flags are not visible. TODO confirm.
71 static BOOL fChecked = FALSE;
72 static BOOL fIsWin2K = FALSE;
76 OSVERSIONINFO Version;
78 memset (&Version, 0x00, sizeof(Version));
// GetVersionEx requires the structure size to be filled in before the call.
79 Version.dwOSVersionInfoSize = sizeof(Version);
81 if (GetVersionEx (&Version))
// NOTE(review): as visible, the test is "NT platform and major version >= 5",
// which matches every NT release from Windows 2000 onwards, not only
// Windows 2000. Confirm whether a line not shown here narrows it.
83 if (Version.dwPlatformId == VER_PLATFORM_WIN32_NT &&
84 Version.dwMajorVersion >= 5)
// Loads NLSDLLNAME and binds pNormalizeString and pIsNormalizedString.
// Returns non-zero only when both pointers are non-NULL.
93 long cm_InitNormalization(void)
// pNormalizeString doubles as the "already initialised" marker; the statement
// controlled by this test is not visible (presumably an early return).
97 if (pNormalizeString != NULL)
100 h_Nls = LoadLibrary(NLSDLLNAME);
// NOTE(review): LoadLibrary reports failure by returning NULL, not
// INVALID_HANDLE_VALUE, so this test does not detect a failed load as written.
101 if (h_Nls == INVALID_HANDLE_VALUE) {
// Cast of the GetProcAddress result to the function pointer type; the line
// holding the assignment target is not visible here.
106 (int (WINAPI *)( NORM_FORM, LPCWSTR,
108 GetProcAddress(h_Nls, "NormalizeString");
110 pIsNormalizedString =
112 (WINAPI *)( NORM_FORM, LPCWSTR, int ))
113 GetProcAddress(h_Nls, "IsNormalizedString");
// When is_windows_2000() is true, use an English (US) locale instead of the
// LOCALE_INVARIANT default of nls_lcid.
115 if (is_windows_2000())
116 nls_lcid = MAKELCID(MAKELANGID(LANG_ENGLISH, SUBLANG_ENGLISH_US), SORT_DEFAULT);
120 return (pNormalizeString && pIsNormalizedString);
123 /* \brief Normalize a UTF-16 string.
125 If the supplied destination buffer is insufficient or NULL, then a
126 new buffer will be allocated to hold the normalized string.
128 \param[in] src : Source UTF-16 string. Length is specified in
131 \param[in] cch_src : The character count in cch_src is assumed to
132 be tight and include the terminating NULL character if there is
133 one. If the NULL is absent, the resulting string will not be
136 \param[out] ext_dest : The destination buffer. Can be NULL, in
137 which case *pcch_dest MUST be 0.
139 \param[in,out] pcch_dest : On entry *pcch_dest contains a count of
140 characters in the destination buffer. On exit, it will contain
141 a count of characters that were copied to the destination
144 Returns a pointer to the buffer containing the normalized string or
145 NULL if the call was unsuccessful. If the returned destination
146 buffer is different from the supplied buffer and non-NULL, it
147 should be freed using free().
// Normalizes src to AFS_NORM_FORM, writing into ext_dest when it is large
// enough and into a malloc'd buffer otherwise.
// Many lines of this function (declarations, returns, closing braces) are not
// visible; the notes below cover only what is shown.
150 NormalizeUtf16String(const wchar_t * src, int cch_src, wchar_t * ext_dest, int *pcch_dest)
153 cm_InitNormalization();
156 assert (pNormalizeString != NULL && pIsNormalizedString != NULL);
// The condition guarding this length computation is not visible.
160 cch_src = (int)wcslen(src) + 1;
// Copy-only path: taken when the input is already normalized or when
// pNormalizeString is NULL. The NULL tests repeat what the assert checks,
// so they still apply in builds where assert is compiled out.
162 if ((pIsNormalizedString && (*pIsNormalizedString)(AFS_NORM_FORM, src, cch_src)) ||
163 (!pNormalizeString)) {
// Caller buffer missing or too small: replace it with a fresh allocation.
165 if (ext_dest == NULL || *pcch_dest < cch_src) {
166 ext_dest = malloc(cch_src * sizeof(wchar_t));
167 *pcch_dest = cch_src;
170 /* No need to or unable to normalize. Just copy the string.
171 Note that the string is not NUL terminated if the source
172 string is not NUL terminated. */
175 memcpy(ext_dest, src, cch_src * sizeof(wchar_t));
176 *pcch_dest = cch_src;
// Normalizing path: start with the caller's capacity and retry with larger
// buffers. The declarations of dest, tries, rv and gle are not visible.
188 int cch_dest = *pcch_dest;
192 while (tries-- > 0) {
194 rv = (*pNormalizeString)(AFS_NORM_FORM, src, cch_src, dest, cch_dest);
196 if (rv <= 0 && (gle = GetLastError()) != ERROR_SUCCESS) {
197 if (gle == ERROR_INSUFFICIENT_BUFFER) {
199 /* The buffer wasn't big enough. We are going to
200 try allocating one. */
// The negated return value is used as the size hint, padded by NLSERRCCH
// (a macro defined outside the visible lines).
202 cch_dest = (-rv) + NLSERRCCH;
206 /* Something else is wrong */
210 } else if (rv < 0) { /* rv < 0 && gle == ERROR_SUCCESS */
212 /* Technically not one of the expected outcomes */
215 } else { /* rv > 0 || (rv == 0 && gle == ERROR_SUCCESS) */
217 /* Possibly succeeded */
219 if (rv == 0) { /* Succeeded and the return string is empty */
225 /* Nope. We only calculated the required size of the buffer */
227 cch_dest = rv + NLSERRCCH;
235 /* Can't NUL terminate */
236 cch_dest = max(rv,cch_dest) + NLSERRCCH;
// A buffer is released only when it is not the caller's ext_dest; the free
// calls controlled by these two tests are not visible.
245 if (dest != ext_dest && dest)
247 dest = malloc(cch_dest * sizeof(wchar_t));
252 if (dest != ext_dest && dest)
260 /*! \brief Normalize a Unicode string into a newly allocated buffer
262 The input string will be normalized using NFC.
264 \param[in] s UTF-16 string to be normalized.
266 \param[in] cch_src The number of characters in the input string. If
267 this is -1, then the input string is assumed to be NUL
270 \param[out] pcch_dest Receives the number of characters copied to
271 the output buffer. Note that the character count is the number
272 of wchar_t characters copied, and not the count of Unicode code
273 points. This includes the terminating NUL if cch_src was -1 or
274 included the terminating NUL.
276 \return A newly allocated buffer holding the normalized string or
277 NULL if the call failed.
// Allocating wrapper around NormalizeUtf16String: passes no destination
// buffer, so the result is expected in a newly allocated one.
// Local declarations and return statements are not visible here.
279 cm_normchar_t * cm_NormalizeStringAlloc(const cm_unichar_t * s, int cch_src, int *pcch_dest)
285 cm_InitNormalization();
// NULL, zero-length and empty inputs are handled without normalizing.
287 if (s == NULL || cch_src == 0 || *s == L'\0') {
// Reported count is 1 unless cch_src is 0 (room for a lone terminator,
// presumably; the allocation itself is not visible).
289 *pcch_dest = ((cch_src != 0)? 1: 0);
// NOTE(review): NormalizeUtf16String documents that *pcch_dest must be 0 when
// the destination is NULL; the initialisation of cch_dest is not visible.
293 r = NormalizeUtf16String(s, cch_src, NULL, &cch_dest);
296 *pcch_dest = cch_dest;
// Normalizes s into the caller-supplied dest via NormalizeUtf16String.
// The initialisation of tcch and all return statements are not visible.
301 int cm_NormalizeString(const cm_unichar_t * s, int cch_src,
302 cm_normchar_t * dest, int cch_dest)
308 cm_InitNormalization();
// NormalizeUtf16String may return a buffer other than dest when dest is too
// small; the comment below marks the branch handling that case.
310 r = NormalizeUtf16String(s, cch_src, dest, &tcch);
313 /* The supplied buffer was insufficient */
321 /*! \brief Convert a UTF-16 string to a UTF-8 string using a newly allocated buffer
323 \param[in] s UTF-16 source string
325 \param[in] cch_src Number of characters in \a s. This can be set to
326 -1 if \a s is NUL terminated.
328 \param[out] pcch_dest Receives a count of characters that were
329 copied to the target buffer.
331 \return A newly allocated buffer holding the UTF-8 string.
// Converts UTF-16 to UTF-8 in a malloc'd buffer using two
// WideCharToMultiByte calls: one to size the output, one to fill it.
// Error checks and return statements are not visible here.
334 cm_utf8char_t * cm_Utf16ToUtf8Alloc(const cm_unichar_t * s, int cch_src, int *pcch_dest)
337 cm_utf8char_t * dest;
340 cm_InitNormalization();
// NULL, zero-length and empty inputs bypass the conversion.
342 if (s == NULL || cch_src == 0 || *s == L'\0') {
344 *pcch_dest = ((cch_src != 0)?1:0);
// Sizing call: a NULL buffer with size 0 makes the API return the number of
// bytes required.
348 cch_dest = WideCharToMultiByte(CP_UTF8, 0, s, cch_src, NULL, 0, NULL, FALSE);
352 *pcch_dest = cch_dest;
// One element more than the conversion needs is allocated (presumably room
// for a terminator; the line that would write it is not visible).
356 dest = malloc((cch_dest + 1) * sizeof(cm_utf8char_t));
358 WideCharToMultiByte(CP_UTF8, 0, s, cch_src, dest, cch_dest, NULL, FALSE);
362 *pcch_dest = cch_dest;
// Converts UTF-16 src to UTF-8 in the caller's buffer and returns the
// WideCharToMultiByte result unchanged (0 signals failure for that API).
367 int cm_Utf16ToUtf8(const cm_unichar_t * src, int cch_src,
368 cm_utf8char_t * dest, int cch_dest)
371 cm_InitNormalization();
373 return WideCharToMultiByte(CP_UTF8, 0, src, cch_src, dest, cch_dest, NULL, FALSE);
// Copies a UTF-16 string without conversion. Two copy paths are visible; the
// condition choosing between them is not (presumably cch_src == -1 selects
// the NUL-terminated path; TODO confirm).
376 int cm_Utf16ToUtf16(const cm_unichar_t * src, int cch_src,
377 cm_unichar_t * dest, int cch_dest)
380 cm_InitNormalization();
// NUL-terminated path: bounded copy, then report the copied length including
// the terminator. The StringCchCopyW result is not checked.
383 StringCchCopyW(dest, cch_dest, src);
384 return (int)wcslen(dest) + 1;
// Counted path: copy as many elements as fit; no terminator is appended by
// the visible lines.
386 int cch_conv = min(cch_src, cch_dest);
387 memcpy(dest, src, cch_conv * sizeof(cm_unichar_t));
392 /* \brief Normalize a UTF-16 string into a UTF-8 string.
394 \param[in] src : Source string.
396 \param[in] cch_src : Count of characters in src. If the count includes the
397 NULL terminator, then the resulting string will be NULL
398 terminated. If it is -1, then src is assumed to be NULL
401 \param[out] adest : Destination buffer.
403 \param[in] cch_adest : Number of characters in the destination buffer.
405 Returns the number of characters stored into adest. This will
406 include the terminating NULL if cch_src included the terminating
407 NULL or was -1. If this is 0, then the operation was unsuccessful.
// Normalizes a UTF-16 string and converts the result to UTF-8 in adest.
// Branch conditions and return statements are not visible here.
409 long cm_NormalizeUtf16StringToUtf8(const wchar_t * src, int cch_src,
410 char * adest, int cch_adest)
413 cm_InitNormalization();
// Length of a NUL-terminated input, capped at NLSMAXCCH; StringCchLengthW
// fails when no terminator is found within that many characters.
418 if (FAILED(StringCchLengthW(src, NLSMAXCCH, &cch)))
421 cch_src = (int)cch+1;
// Stack buffer offered to NormalizeUtf16String, which may instead return a
// heap buffer when this one is too small.
425 wchar_t nbuf[NLSMAXCCH];
426 wchar_t * normalized;
427 int cch_norm = NLSMAXCCH;
429 normalized = NormalizeUtf16String(src, cch_src, nbuf, &cch_norm);
// NOTE(review): no NULL check on normalized is visible before this call.
431 cch_adest = WideCharToMultiByte(CP_UTF8, 0, normalized, cch_norm,
432 adest, cch_adest, NULL, 0);
// Only a heap buffer (one that is not nbuf) needs releasing; the free call is
// not visible.
434 if (normalized != nbuf && normalized)
// Flag bit that marks a table entry as "must be escaped". It lies above the
// 0x00-0xff range of the byte values stored in the table.
447 #define ESCVAL 0x1000
// Builds a table entry for byte c with the escape flag added.
448 #define Esc(c) (ESCVAL + (short)(c))
// True when a table entry carries the escape flag.
449 #define IS_ESCAPED(c) (((c) & ESCVAL) == ESCVAL)
451 /* \brief Character sanitization map for CP-1252
453 The following map indicates which characters should be escaped in
454 the CP-1252 character map. Characters that are documented as
455 illegal characters in a file name are marked as escaped. Escaped
456 characters are marked using the ::Esc macro defined above. The
457 following exceptions apply:
459 - Path delimiters '\\' and '/' are NOT escaped because the
460 sanitization map applies to paths. While those characters are
461 illegal in filenames, they are legal in paths.
463 - Wildcard characters '*' and '?' ARE escaped. The document
464 referred to below does not specify these characters as invalid.
465 Since no other escape mechanism exists, names containing
466 wildcards are indistinguishable from actual wildcards used in SMB
469 - Reserved names are not and cannot be represented in this map.
472 CON, PRN, AUX, NUL, COM1, COM2, COM3, COM4, COM5, COM6, COM7,
473 COM8, COM9, LPT1, LPT2, LPT3, LPT4, LPT5, LPT6, LPT7, LPT8, LPT9,
476 - Characters 0x80, 0x81, 0x8d, 0x8e, 0x8f, 0x90, 0x9d, 0x9e, 0x9f
477 are also escaped because they are unused in CP-1252 and hence
478 cannot be converted to a Unicode string.
480 Reserved names with extensions are also invalid. (i.e. NUL.txt)
482 \note The only bit we are actually interested in from the following
483 table is the ESCVAL bit. However, the characters themselves are
484 included for ease of maintenance.
486 \see "Naming a File" topic in the Windows SDK.
// Escape map with one entry per byte value 0x00-0xff (16 rows of 16).
// Entries wrapped in Esc() carry the ESCVAL flag; sanitize_bytestring tests
// that flag with IS_ESCAPED.
488 static const short sanitized_escapes_1252[] = {
// 0x00-0x1f: every control character is escaped.
489 Esc(0x00),Esc(0x01),Esc(0x02),Esc(0x03),Esc(0x04),Esc(0x05),Esc(0x06),Esc(0x07),
490 Esc(0x08),Esc(0x09),Esc(0x0a),Esc(0x0b),Esc(0x0c),Esc(0x0d),Esc(0x0e),Esc(0x0f),
491 Esc(0x10),Esc(0x11),Esc(0x12),Esc(0x13),Esc(0x14),Esc(0x15),Esc(0x16),Esc(0x17),
492 Esc(0x18),Esc(0x19),Esc(0x1a),Esc(0x1b),Esc(0x1c),Esc(0x1d),Esc(0x1e),Esc(0x1f),
// 0x20-0x7f: the double quote, asterisk, colon, angle brackets, question
// mark, vertical bar and DEL (0x7f) are escaped; slash and backslash are not.
493 ' ','!',Esc('"'),'#','$','%','&','\'','(',')',Esc('*'),'+',',','-','.','/',
494 '0','1','2','3','4','5','6','7','8','9',Esc(':'),';',Esc('<'),'=',Esc('>'),Esc('?'),
495 '@','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
496 'P','Q','R','S','T','U','V','W','X','Y','Z','[','\\',']','^','_',
497 '`','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o',
498 'p','q','r','s','t','u','v','w','x','y','z','{',Esc('|'),'}','~',Esc(0x7f),
// 0x80-0x9f: escaped entries are 0x80, 0x81, 0x8d, 0x8e, 0x8f, 0x90, 0x9d
// and 0x9e.
499 Esc(0x80),Esc(0x81),0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,Esc(0x8d),Esc(0x8e),Esc(0x8f),
// NOTE(review): 0x9f is NOT escaped on this row, although the comment above
// this table lists 0x9f among the escaped characters. Decide which of the
// two is intended and make them agree.
500 Esc(0x90),0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,Esc(0x9d),Esc(0x9e),0x9f,
// 0xa0-0xff: nothing is escaped.
501 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
502 0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
503 0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
504 0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf,
505 0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
506 0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff
// Copies src to odest, replacing bytes flagged in sanitized_escapes_1252 with
// an escape that ends in two lowercase hex digits. Returns the number of
// bytes written, computed as dest - odest.
// The declaration of dest, the escape prefix and the unescaped-copy branch
// are not visible here.
509 static int sanitize_bytestring(const char * src, int cch_src,
510 char * odest, int cch_dest)
515 cm_InitNormalization();
// Stops at the first NUL byte, when the source count is used up, or when the
// destination has no room left.
517 while (cch_src > 0 && *src && cch_dest > 0) {
// NOTE(review): src is a plain char pointer. Where char is signed, bytes
// 0x80-0xff are negative and would index before the start of the table,
// unless a line not visible here converts to unsigned first. Confirm.
521 rc = sanitized_escapes_1252[*src];
522 if (IS_ESCAPED(rc)) {
523 static const char hex[] =
524 {'0','1','2','3','4','5','6','7',
525 '8','9','a','b','c','d','e','f'};
// High nibble, then low nibble; the 0x0f mask keeps the index within hex[]
// even for a negative *src.
533 *dest++ = hex[(((int)*src) >> 4) & 0x0f];
534 *dest++ = hex[(((int)*src) & 0x0f)];
// After the loop: source and destination both still have room. The body of
// this branch is not visible (presumably it writes the terminator).
546 if (cch_src > 0 && cch_dest > 0) {
550 return (int)(dest - odest);
// Writes c at *pdest as a percent sign followed by four lowercase hex digits.
// StringCchPrintfExW is given pdest and pcch as its "end pointer" and
// "remaining count" outputs, so both are advanced past the text written.
// The return statements are not visible; callers treat 0 as failure.
553 static int sanitize_utf16char(wchar_t c, wchar_t ** pdest, size_t * pcch)
556 StringCchPrintfExW(*pdest, *pcch, pdest, pcch, 0, L"%%%04x", (int) c);
// Copies src to dest, replacing unpaired surrogates with the escape produced
// by sanitize_utf16char. Two loops are visible: one that only estimates the
// output size and one that writes. The condition selecting the estimating
// loop is not visible; cm_Utf8ToUtf16Alloc calls this with dest NULL and
// cch_dest 0 to get an estimate.
563 static int sanitize_utf16string(const wchar_t * src, size_t cch_src,
564 wchar_t * dest, size_t cch_dest)
// Original capacity, kept so the final return can compute how much was used.
566 int cch_dest_o = cch_dest;
569 /* only estimating */
570 for (cch_dest = 0; cch_src > 0;) {
// 0xd800-0xdbff is a high surrogate; it is valid only when immediately
// followed by a low surrogate (0xdc00-0xdfff).
571 if (*src >= 0xd800 && *src < 0xdc00) {
572 if (cch_src <= 1 || src[1] < 0xdc00 || src[1] > 0xdfff) {
573 /* dangling surrogate */
// A low surrogate reached here has no preceding high surrogate.
583 } else if (*src >= 0xdc00 && *src <= 0xdfff) {
584 /* dangling surrogate */
// Writing loop: same classification as above, this time producing output.
598 while (cch_src > 0 && cch_dest > 0) {
599 if (*src >= 0xd800 && *src < 0xdc00) {
600 if (cch_src <= 1 || src[1] < 0xdc00 || src[1] > 0xdfff) {
601 if (!sanitize_utf16char(*src++, &dest, &cch_dest))
605 /* found a surrogate pair */
608 cch_dest -= 2; cch_src -= 2;
610 } else if (*src >= 0xdc00 && *src <= 0xdfff) {
611 if (!sanitize_utf16char(*src++, &dest, &cch_dest))
616 cch_dest--; cch_src--;
// Characters consumed from the destination when all input was processed,
// otherwise 0.
620 return (cch_src == 0) ? cch_dest_o - cch_dest : 0;
// Converts a UTF-8 string to UTF-16 and normalizes it into dest.
// Visible steps: strict UTF-8 conversion, escaping of unpaired surrogates, a
// fallback that re-reads the input as code page 1252, then normalization.
// Several branch conditions and all return statements are not visible.
627 long cm_NormalizeUtf8StringToUtf16(const char * src, int cch_src,
628 wchar_t * dest, int cch_dest)
630 wchar_t wsrcbuf[NLSMAXCCH];
636 cm_InitNormalization();
638 /* Get some edge cases out first, so we don't have to worry about
639 cch_src being 0 etc. */
642 } else if (*src == '\0') {
648 if (dest && cch_dest > 0) {
// Count includes the terminator; the guarding condition is not visible.
653 cch_src = (int)strlen(src) + 1;
// MB_ERR_INVALID_CHARS makes the call fail on malformed UTF-8 rather than
// substituting characters.
656 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
657 cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
// Conversion succeeded but the UTF-16 result failed validation.
659 if (cch != 0 && !cm_is_valid_utf16(wsrcbuf, cch)) {
660 wchar_t wsanitized[NLSMAXCCH];
662 /* We successfully converted, but the resulting UTF-16 string
663 has dangling surrogates. We should try and escape those
665 cch = sanitize_utf16string(wsrcbuf, cch, wsanitized, NLSMAXCCH);
667 memcpy(wsrcbuf, wsanitized, cch * sizeof(wchar_t));
// The condition enclosing this check is not visible (presumably cch == 0).
672 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
673 char sanitized[NLSMAXCCH];
676 /* If src doesn't have a unicode translation, then it
677 wasn't valid UTF-8. In this case, we assume that src
678 is CP-1252 and then try to convert again. But before
679 that, we use a translation table to "sanitize" the
682 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
683 sizeof(sanitized)/sizeof(char));
685 if (cch_sanitized == 0) {
692 cch = MultiByteToWideChar(1252, 0, sanitized,
693 cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
695 /* Well, that didn't work either. Something is very wrong. */
// NormalizeUtf16String may return a buffer other than dest when dest is too
// small; the initialisation of cch_norm is not visible.
707 wnorm = NormalizeUtf16String(wsrcbuf, cch, dest, &cch_norm);
716 /* The buffer was insufficient */
717 if (dest != NULL && cch_dest > 1) {
// Allocating variant of cm_NormalizeUtf8StringToUtf16: same visible
// conversion steps, but normalization is given no destination buffer.
// The parameter-list continuation, several branch conditions and all return
// statements are not visible.
728 cm_normchar_t *cm_NormalizeUtf8StringToUtf16Alloc(const cm_utf8char_t * src, int cch_src,
731 wchar_t wsrcbuf[NLSMAXCCH];
737 cm_InitNormalization();
739 /* Get some edge cases out first, so we don't have to worry about
740 cch_src being 0 etc. */
741 if (cch_src == 0 || src == NULL || *src == '\0') {
743 *pcch_dest = ((cch_src != 0)? 1 : 0);
// Count includes the terminator; the guarding condition is not visible.
748 cch_src = (int)strlen(src) + 1;
// MB_ERR_INVALID_CHARS makes the call fail on malformed UTF-8 rather than
// substituting characters.
751 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
752 cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
// Conversion succeeded but the UTF-16 result failed validation.
754 if (cch != 0 && !cm_is_valid_utf16(wsrcbuf, cch)) {
755 wchar_t wsanitized[NLSMAXCCH];
757 /* We successfully converted, but the resulting UTF-16 string
758 has dangling surrogates. We should try and escape those
760 cch = sanitize_utf16string(wsrcbuf, cch, wsanitized, NLSMAXCCH);
762 memcpy(wsrcbuf, wsanitized, cch * sizeof(wchar_t));
// The condition enclosing this check is not visible (presumably cch == 0).
767 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
768 char sanitized[NLSMAXCCH];
771 /* If src doesn't have a unicode translation, then it
772 wasn't valid UTF-8. In this case, we assume that src
773 is CP-1252 and then try to convert again. But before
774 that, we use a translation table to "sanitize" the
777 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
778 sizeof(sanitized)/sizeof(char));
780 if (cch_sanitized == 0) {
787 cch = MultiByteToWideChar(1252, 0, sanitized,
788 cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
790 /* Well, that didn't work either. Something is very wrong. */
// NULL destination: the normalized string comes back in a new buffer.
// NOTE(review): NormalizeUtf16String documents that *pcch_dest must be 0 in
// this case; the initialisation of cch_norm is not visible.
802 wnorm = NormalizeUtf16String(wsrcbuf, cch, NULL, &cch_norm);
811 *pcch_dest = cch_norm;
// Converts UTF-8 to UTF-16 straight into the caller's dest, without
// normalization. Unpaired surrogates are escaped, and input that is not valid
// UTF-8 is re-read as code page 1252.
// Several branch conditions and all return statements are not visible.
816 int cm_Utf8ToUtf16(const cm_utf8char_t * src, int cch_src,
817 cm_unichar_t * dest, int cch_dest)
821 if (cch_dest >= 1 && dest != NULL) {
826 cm_InitNormalization();
// Count includes the terminator; the guarding condition is not visible.
829 cch_src = (int)strlen(src) + 1;
832 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
833 cch_src * sizeof(char), dest, cch_dest);
// Conversion succeeded but the UTF-16 result failed validation.
835 if (cch != 0 && !cm_is_valid_utf16(dest, cch)) {
836 wchar_t wsanitized[NLSMAXCCH];
// NOTE(review): escaping lengthens the string and the sanitized text is
// bounded by NLSMAXCCH, not by cch_dest. Confirm that a line not visible
// here limits cch before the copy back into dest.
838 cch = sanitize_utf16string(dest, cch, wsanitized, NLSMAXCCH);
840 memcpy(dest, wsanitized, cch * sizeof(wchar_t));
// The condition enclosing this check is not visible (presumably cch == 0).
845 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
846 char sanitized[NLSMAXCCH];
849 /* If src doesn't have a unicode translation, then it
850 wasn't valid UTF-8. In this case, we assume that src
851 is CP-1252 and then try to convert again. But before
852 that, we use a translation table to "sanitize" the
855 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
856 sizeof(sanitized)/sizeof(char));
858 if (cch_sanitized == 0) {
865 cch = MultiByteToWideChar(1252, 0, sanitized,
866 cch_sanitized * sizeof(char), dest, cch_dest);
868 /* Well, that didn't work either. Something is very wrong. */
// Converts UTF-8 to UTF-16 in a malloc'd buffer. Each conversion is done
// twice: a sizing call with no output buffer, then a call that fills the
// allocation.
// Several branch conditions and all return statements are not visible.
885 cm_unichar_t * cm_Utf8ToUtf16Alloc(const cm_utf8char_t * src, int cch_src, int *pcch_dest)
887 cm_unichar_t * ustr = NULL;
891 cm_InitNormalization();
893 if (cch_src == 0 || src == NULL || *src == '\0') {
895 *pcch_dest = ((cch_src != 0)? 1 : 0);
// Count includes the terminator; the guarding condition is not visible.
900 cch_src = (int)strlen(src) + 1;
// Sizing call for the strict UTF-8 conversion.
903 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
904 cch_src * sizeof(char), NULL, 0);
// The condition enclosing this check is not visible (presumably cch == 0).
907 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
908 char sanitized[NLSMAXCCH];
911 /* If src doesn't have a unicode translation, then it
912 wasn't valid UTF-8. In this case, we assume that src
913 is CP-1252 and then try to convert again. But before
914 that, we use a translation table to "sanitize" the
917 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
918 sizeof(sanitized)/sizeof(char));
920 if (cch_sanitized == 0) {
// Sizing call for the code page 1252 fallback.
927 cch = MultiByteToWideChar(1252, 0, sanitized,
928 cch_sanitized * sizeof(char), NULL, 0);
930 /* Well, that didn't work either. Something is very wrong. */
// One element more than the conversion needs is allocated.
937 ustr = malloc((cch + 1) * sizeof(wchar_t));
939 cch = MultiByteToWideChar(1252, 0, sanitized,
940 cch_sanitized * sizeof(char), ustr, cch);
// Valid UTF-8 path: allocate, then convert into the allocation.
947 ustr = malloc((cch + 1) * sizeof(wchar_t));
949 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
950 cch_src * sizeof(char), ustr, cch);
// The converted text failed UTF-16 validation: build an escaped copy.
953 if (!cm_is_valid_utf16(ustr, cch)) {
954 cm_unichar_t * us = NULL;
// First call passes no buffer to obtain the required size, the second call
// fills the new allocation.
957 cch_s = sanitize_utf16string(ustr, cch, NULL, 0);
959 us = malloc(cch_s * sizeof(wchar_t));
960 cch_s = sanitize_utf16string(ustr, cch, us, cch_s);
984 /* \brief Normalize a UTF-8 string.
986 \param[in] src String to normalize.
988 \param[in] cch_src : Count of characters in src. If this value is
989 -1, then src is assumed to be NULL terminated. The translated
990 string will be NULL terminated only if this is -1 or the count
991 includes the terminating NULL.
993 \param[out] adest : Destination string. Only considered valid if
994 \a cch_adest is non-zero.
996 \param[in] cch_adest : Number of characters in the destination
997 string. If this is zero, then the return value is the number
1000 \return If \a cch_adest is non-zero, then the return value is the
1001 number of bytes stored into adest. If \a cch_adest is zero,
1002 then the return value is the number of bytes required. In both
1003 cases, the return value is 0 if the call was unsuccessful.
// Normalizes a UTF-8 string: converts to UTF-16 (with the surrogate and
// code page 1252 fallbacks), normalizes into a new buffer, then converts
// back to UTF-8 in adest.
// Several branch conditions, the bodies of the DEBUG_UNICODE sections and all
// return statements are not visible.
1005 long cm_NormalizeUtf8String(const char * src, int cch_src,
1006 char * adest, int cch_adest)
1008 wchar_t wsrcbuf[NLSMAXCCH];
1014 cm_InitNormalization();
1016 /* Get some edge cases out first, so we don't have to worry about
1017 cch_src being 0 etc. */
1020 } else if (*src == '\0') {
// -1 means NUL-terminated: measure the input, terminator included.
1026 if (cch_src == -1) {
1027 cch_src = (int)strlen(src) + 1;
// MB_ERR_INVALID_CHARS makes the call fail on malformed UTF-8 rather than
// substituting characters.
1030 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
1031 cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
// Conversion succeeded but the UTF-16 result failed validation.
1033 if (cch != 0 && !cm_is_valid_utf16(wsrcbuf, cch)) {
1034 wchar_t wsanitized[NLSMAXCCH];
1036 cch = sanitize_utf16string(wsrcbuf, cch, wsanitized, NLSMAXCCH);
1038 memcpy(wsrcbuf, wsanitized, cch * sizeof(wchar_t));
// The condition enclosing this check is not visible (presumably cch == 0).
1043 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
1044 char sanitized[NLSMAXCCH];
1047 /* If src doesn't have a unicode translation, then it
1048 wasn't valid UTF-8. In this case, we assume that src
1049 is CP-1252 and then try to convert again. But before
1050 that, we use a translation table to "sanitize" the
1053 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
1054 sizeof(sanitized)/sizeof(char));
1056 if (cch_sanitized == 0) {
1057 #ifdef DEBUG_UNICODE
1063 cch = MultiByteToWideChar(1252, 0, sanitized,
1064 cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
1066 /* Well, that didn't work either. Something is very wrong. */
1067 #ifdef DEBUG_UNICODE
// NULL destination: the normalized string comes back in a new buffer that
// has to be released afterwards (the free call is not visible).
1078 wnorm = NormalizeUtf16String(wsrcbuf, cch, NULL, &cch_norm);
1079 if (wnorm == NULL) {
1080 #ifdef DEBUG_UNICODE
// Per the header comment, cch_adest of zero turns this into a size query.
1086 cch = WideCharToMultiByte(CP_UTF8, 0, wnorm,
1087 cch_norm, adest, cch_adest * sizeof(char),
1096 /*! \brief Case insensitive comparison with specific length
1098 \param[in] str1 First string to compare. Assumed to be encoded in UTF-8.
1100 \param[in] str2 Second string to compare. Assumed to be encoded in UTF-8.
1102 \param[in] n Max byte count.
// Case-insensitive comparison of the first n bytes of two UTF-8 strings.
// Both are converted to UTF-16 on the stack and compared with CompareStringW.
// Error handling and the mapping of rv to the return value are not visible.
1105 int cm_strnicmp_utf8(const char * str1, const char * str2, int n)
1107 wchar_t wstr1[NLSMAXCCH];
1110 wchar_t wstr2[NLSMAXCCH];
1114 cm_InitNormalization();
1119 /* first check for NULL pointers (assume NULL < "") */
1125 } else if (str2 == NULL) {
// NOTE(review): with a positive byte count MultiByteToWideChar processes
// exactly that many bytes, so a string shorter than n is read past its
// terminator unless n is clamped in lines not visible here. Confirm.
1129 len1 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str1, n, wstr1, NLSMAXCCH);
1137 len2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str2, n, wstr2, NLSMAXCCH);
1145 rv = CompareStringW(nls_lcid, NORM_IGNORECASE, wstr1, len1, wstr2, len2);
// Case-insensitive comparison of two UTF-16 strings, limited to len
// characters each.
// Failure handling and the mapping of rv to the return value are not visible.
1156 int cm_strnicmp_utf16(const cm_unichar_t * str1, const cm_unichar_t * str2, int len)
1163 cm_InitNormalization();
1168 /* first check for NULL pointers */
1174 } else if (str2 == NULL) {
// StringCchLengthW fails when no terminator is found within len characters;
// what happens on failure is not visible (presumably the length falls back
// to len; TODO confirm).
1178 if (FAILED(StringCchLengthW(str1, len, &cch1)))
1181 if (FAILED(StringCchLengthW(str2, len, &cch2)))
1184 rv = CompareStringW(nls_lcid, NORM_IGNORECASE, str1, (int)cch1, str2, (int)cch2);
// Case-insensitive comparison of two NUL-terminated UTF-16 strings.
// The NULL-pointer results and the mapping of rv to the return value are not
// visible.
1195 int cm_stricmp_utf16(const cm_unichar_t * str1, const cm_unichar_t * str2)
1200 cm_InitNormalization();
1202 /* first check for NULL pointers */
1208 } else if (str2 == NULL) {
// A length of -1 tells CompareStringW that the string is NUL-terminated.
1212 rv = CompareStringW(nls_lcid, NORM_IGNORECASE, str1, -1, str2, -1);
// Lower-cases a NUL-terminated UTF-16 string in place using nls_lcid.
// The handling of rv and the return statement are not visible.
1223 cm_unichar_t *cm_strlwr_utf16(cm_unichar_t * str)
1229 cm_InitNormalization();
// Length includes the terminator, so it is mapped along with the text.
1231 len = (int)wcslen(str) + 1;
// Source and destination are the same buffer.
1232 rv = LCMapStringW(nls_lcid, LCMAP_LOWERCASE, str, len, str, len);
// Upper-cases a NUL-terminated UTF-16 string in place using nls_lcid.
// The handling of rv and the return statement are not visible.
1242 cm_unichar_t *cm_strupr_utf16(cm_unichar_t * str)
1248 cm_InitNormalization();
// Length includes the terminator, so it is mapped along with the text.
1250 len = (int)wcslen(str) + 1;
// Source and destination are the same buffer.
1251 rv = LCMapStringW(nls_lcid, LCMAP_UPPERCASE, str, len, str, len);
// Case-insensitive comparison of two NUL-terminated UTF-8 strings.
// Both are converted to UTF-16 on the stack and compared with CompareStringW.
// Error handling and the mapping of rv to the return value are not visible.
1262 int cm_stricmp_utf8(const char * str1, const char * str2)
// Fixed-size buffers: a conversion needing more than NLSMAXCCH characters
// makes MultiByteToWideChar fail.
1264 wchar_t wstr1[NLSMAXCCH];
1267 wchar_t wstr2[NLSMAXCCH];
1271 cm_InitNormalization();
1273 /* first check for NULL pointers */
1279 } else if (str2 == NULL) {
// A length of -1 converts through the terminator, so len1 and len2 include it.
1283 len1 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str1, -1, wstr1, NLSMAXCCH);
1291 len2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str2, -1, wstr2, NLSMAXCCH);
1299 rv = CompareStringW(nls_lcid, NORM_IGNORECASE, wstr1, len1, wstr2, len2);
// Upper-cases wstr through a stack buffer and copies the result back.
// cbstr is the size of wstr in bytes. The return statement is not visible.
1311 wchar_t * strupr_utf16(wchar_t * wstr, size_t cbstr)
1313 wchar_t wstrd[NLSMAXCCH];
1317 cm_InitNormalization();
// Byte size converted to a character count; the whole buffer is mapped, not
// just the text up to the terminator.
1319 len = cbstr / sizeof(wchar_t);
// NOTE(review): the LCMapStringW result is not checked in the visible lines,
// and it fails when len exceeds NLSMAXCCH, leaving wstrd unwritten.
1320 len = LCMapStringW(nls_lcid, LCMAP_UPPERCASE, wstr, len, wstrd, NLSMAXCCH);
1321 StringCbCopyW(wstr, cbstr, wstrd);
// Upper-cases a NUL-terminated UTF-8 string: converts to UTF-16, maps to
// upper case, then converts back into str (cbstr bytes of capacity).
// Error handling and the return statement are not visible.
1327 char * strupr_utf8(char * str, size_t cbstr)
1329 wchar_t wstr[NLSMAXCCH];
1330 wchar_t wstrd[NLSMAXCCH];
1334 cm_InitNormalization();
// A length of -1 converts through the terminator, so len includes it.
1336 len = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, -1, wstr, NLSMAXCCH);
1340 len = LCMapStringW(nls_lcid, LCMAP_UPPERCASE, wstr, len, wstrd, NLSMAXCCH);
// The -1 length relies on the terminator carried over by the two calls above.
1342 len = WideCharToMultiByte(CP_UTF8, 0, wstrd, -1, str, (int)cbstr, NULL, FALSE);
// Returns a pointer to the start of the UTF-8 character after the one at c.
// The case labels of the switch are not visible; only the step sizes are.
1347 char * char_next_utf8(const char * c)
// Reads the current byte as unsigned so the bit tests are sign-independent.
1349 #define CH (*((const unsigned char *)c))
// High bit clear: single-byte character.
1351 if ((CH & 0x80) == 0)
1352 return (char *) c+1;
// Otherwise the top four bits of the lead byte select a step of 2, 3 or 4
// bytes, with a 1-byte step as the remaining case.
1354 switch (CH & 0xf0) {
1357 return (char *) c+2;
1360 return (char *) c+3;
1363 return (char *) c+4;
1366 return (char *) c+1;
// Steps backwards from c to the start of the previous UTF-8 character.
// The pointer decrements and return statements are not visible.
1373 char * char_prev_utf8(const char * c)
// Same definition as in char_next_utf8. NOTE(review): confirm that an
// undef separates the two definitions in the lines not visible here.
1375 #define CH (*((const unsigned char *)c))
// High bit clear: single-byte character.
1379 if ((CH & 0x80) == 0)
// Bytes whose top two bits are 10 are continuation bytes; the loop passes
// over them.
1382 while ((CH & 0xc0) == 0x80)
// Returns a pointer to the UTF-16 character after the one at c: two code
// units ahead when c is a high surrogate (0xd800-0xdbff), one otherwise.
// The unit after a high surrogate is not checked to be a low surrogate.
1389 wchar_t * char_next_utf16(const wchar_t * c)
1391 unsigned short sc = (unsigned short) *c;
1393 if (sc >= 0xd800 && sc <= 0xdbff)
1394 return (wchar_t *) c+2;
1395 return (wchar_t *) c+1;
// Returns a pointer to the UTF-16 character before c. It steps back one code
// unit, and one more when that unit is a low surrogate (0xdc00-0xdfff).
// The unit before a low surrogate is not checked to be a high surrogate.
1398 wchar_t * char_prev_utf16(const wchar_t * c)
// c is decremented before it is read.
1400 unsigned short sc = (unsigned short) *(--c);
1402 if (sc >= 0xdc00 && sc <= 0xdfff)
1403 return (wchar_t *) --c;
1404 return (wchar_t *) c;
// Returns the start of the UTF-16 character containing c: one code unit back
// when c is a low surrogate (0xdc00-0xdfff), otherwise c itself.
1407 wchar_t * char_this_utf16(const wchar_t * c)
1409 unsigned short sc = (unsigned short) *c;
1411 if (sc >= 0xdc00 && sc <= 0xdfff)
1412 return (wchar_t *) --c;
1413 return (wchar_t *) c;
// Scans cch UTF-16 code units for unpaired surrogates.
// The return statements are not visible; callers in this file treat a zero
// result as "invalid, needs sanitizing".
1416 int cm_is_valid_utf16(const wchar_t * c, int cch)
// The condition guarding this line is not visible (presumably a negative cch
// means NUL-terminated; TODO confirm).
1419 cch = wcslen(c) + 1;
1421 for (; cch > 0; c++, cch--) {
// High surrogate: the next unit has to be a low surrogate.
1422 if (*c >= 0xd800 && *c < 0xdc00) {
// The step to the next unit is not visible. This test covers running out of
// input and a following unit outside 0xdc00-0xdfff.
1424 if (cch == 0 || *c < 0xdc00 || *c > 0xdfff)
// A low surrogate reached here was not preceded by a high surrogate.
1426 } else if (*c >= 0xdc00 && *c <= 0xdfff) {
1435 wchar_t * cm_GetRawCharsAlloc(const wchar_t * c, int len)
1445 return wcsdup(L"(empty)");
1447 cb = len * 5 * sizeof(wchar_t);
1448 current = ret = malloc(cb);
1452 for (; len > 0; ++c, --len) {
1453 StringCbPrintfExW(current, cb, ¤t, &cb, 0,
1456 StringCbCatExW(current, cb, L",", ¤t, &cb, 0);