2 * Copyright (c) 2008 Secure Endpoints Inc.
4 * Permission is hereby granted, free of charge, to any person
5 * obtaining a copy of this software and associated documentation
6 * files (the "Software"), to deal in the Software without
7 * restriction, including without limitation the rights to use, copy,
8 * modify, merge, publish, distribute, sublicense, and/or sell copies
9 * of the Software, and to permit persons to whom the Software is
10 * furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
/* This is part of the Microsoft Internationalized Domain Name (IDN)
   Mitigation APIs download. */
40 #include <normalization.h>
42 /* TODO: All the normalization and conversion code should NUL
43 terminate destination strings. */
/* Run-time resolved entry point for NormalizeString.  Starts out NULL and is
   assigned from GetProcAddress() in cm_InitNormalization().
   NOTE(review): the declaration's leading type line and one parameter line
   are absent from this listing. */
46 (WINAPI *pNormalizeString)( __in NORM_FORM NormForm,
47 __in_ecount(cwSrcLength) LPCWSTR lpSrcString,
49 __out_ecount(cwDstLength) LPWSTR lpDstString,
50 __in int cwDstLength ) = NULL;
/* Run-time resolved entry point for IsNormalizedString; NULL until
   cm_InitNormalization() assigns it. */
53 (WINAPI *pIsNormalizedString)( __in NORM_FORM NormForm,
54 __in_ecount(cwLength) LPCWSTR lpString,
55 __in int cwLength ) = NULL;
/* Library name handed to LoadLibrary() in cm_InitNormalization(). */
58 #define NLSDLLNAME "Normaliz.dll"
/* Element count of the fixed-size wchar_t / char scratch buffers declared on
   the stack by the conversion routines in this file. */
59 #define NLSMAXCCH 1024
/* Normalization form passed to every NormalizeString / IsNormalizedString
   call made in this file. */
62 #define AFS_NORM_FORM NormalizationC
/* Locale used for CompareStringW() and LCMapStringW() calls.  Replaced with
   an en-US LCID by cm_InitNormalization() when is_windows_2000() is true. */
64 static LCID nls_lcid = LOCALE_INVARIANT;
/* NOTE(review): presumably set once initialization has run; no use of this
   flag is visible in this listing -- confirm against the full file. */
66 static int nls_init = 0;
/* Queries the OS version through GetVersionEx() and tests for an NT-platform
   system whose major version is at least 5.
   The two function-local static flags suggest the answer is computed once
   and remembered; the statements that set and return them are not part of
   this listing -- TODO confirm.
   NOTE(review): "dwMajorVersion >= 5" is satisfied by every NT release from
   Windows 2000 onwards, not by Windows 2000 alone. */
69 is_windows_2000 (void)
71 static BOOL fChecked = FALSE;
72 static BOOL fIsWin2K = FALSE;
76 OSVERSIONINFO Version;
/* GetVersionEx() requires dwOSVersionInfoSize to be filled in by the caller. */
78 memset (&Version, 0x00, sizeof(Version));
79 Version.dwOSVersionInfoSize = sizeof(Version);
81 if (GetVersionEx (&Version))
83 if (Version.dwPlatformId == VER_PLATFORM_WIN32_NT &&
84 Version.dwMajorVersion >= 5)
/* Loads Normaliz.dll and resolves the NormalizeString and IsNormalizedString
   entry points into pNormalizeString / pIsNormalizedString.  When
   is_windows_2000() reports true, nls_lcid is switched to an en-US locale.

   Returns non-zero only if both entry points were resolved (final return
   statement).  The statements executed when pNormalizeString is already set
   or when the library fails to load are not part of this listing. */
93 long cm_InitNormalization(void)
97 if (pNormalizeString != NULL)
100 h_Nls = LoadLibrary(NLSDLLNAME);
/* NOTE(review): LoadLibrary() reports failure by returning NULL, so a
   comparison against INVALID_HANDLE_VALUE will not detect a failed load. */
101 if (h_Nls == INVALID_HANDLE_VALUE) {
106 (int (WINAPI *)( NORM_FORM, LPCWSTR,
108 GetProcAddress(h_Nls, "NormalizeString");
110 pIsNormalizedString =
112 (WINAPI *)( NORM_FORM, LPCWSTR, int ))
113 GetProcAddress(h_Nls, "IsNormalizedString");
115 if (is_windows_2000())
116 nls_lcid = MAKELCID(MAKELANGID(LANG_ENGLISH, SUBLANG_ENGLISH_US), SORT_DEFAULT);
120 return (pNormalizeString && pIsNormalizedString);
/* \brief Normalize a UTF-16 string.

  If the supplied destination buffer is insufficient or NULL, then a
  new buffer will be allocated to hold the normalized string.

  \param[in] src : Source UTF-16 string.  Length is specified in
      cch_src.

  \param[in] cch_src : The character count in cch_src is assumed to
      be tight and include the terminating NUL character if there is
      one.  If the NUL is absent, the resulting string will not be
      NUL terminated.

  \param[out] ext_dest : The destination buffer.  Can be NULL, in
      which case *pcch_dest MUST be 0.

  \param[in,out] pcch_dest : On entry *pcch_dest contains a count of
      characters in the destination buffer.  On exit, it will contain
      a count of characters that were copied to the destination
      buffer.

  Returns a pointer to the buffer containing the normalized string or
  NULL if the call was unsuccessful.  If the returned destination
  buffer is different from the supplied buffer and non-NULL, it
  should be freed using free().
*/
/* Normalizes src (cch_src UTF-16 units) to AFS_NORM_FORM.

   Visible flow:
     - cm_InitNormalization() is invoked and both entry points are asserted.
     - cch_src is recomputed as wcslen(src) + 1 (presumably when the caller
       passed -1; the guarding test is not in this listing -- TODO confirm).
     - If IsNormalizedString() says the text is already normalized, or
       NormalizeString is unavailable, the source is copied verbatim.  A new
       buffer is malloc'ed when ext_dest is NULL or *pcch_dest < cch_src.
     - Otherwise NormalizeString() is called in a bounded retry loop; when it
       reports ERROR_INSUFFICIENT_BUFFER, or only a size estimate, cch_dest
       is enlarged by NLSERRCCH and a new buffer is malloc'ed.

   NOTE(review): NLSERRCCH is not defined in the visible lines.  No check of
   either malloc() result is visible here; verify against the full file. */
150 NormalizeUtf16String(const wchar_t * src, int cch_src, wchar_t * ext_dest, int *pcch_dest)
153 cm_InitNormalization();
156 assert (pNormalizeString != NULL && pIsNormalizedString != NULL);
160 cch_src = wcslen(src) + 1;
162 if ((pIsNormalizedString && (*pIsNormalizedString)(AFS_NORM_FORM, src, cch_src)) ||
163 (!pNormalizeString)) {
165 if (ext_dest == NULL || *pcch_dest < cch_src) {
166 ext_dest = malloc(cch_src * sizeof(wchar_t));
167 *pcch_dest = cch_src;
170 /* No need to or unable to normalize. Just copy the string.
171 Note that the string is not NUL terminated if the source
172 string is not NUL terminated. */
175 memcpy(ext_dest, src, cch_src * sizeof(wchar_t));
176 *pcch_dest = cch_src;
188 int cch_dest = *pcch_dest;
192 while (tries-- > 0) {
194 rv = (*pNormalizeString)(AFS_NORM_FORM, src, cch_src, dest, cch_dest);
196 if (rv <= 0 && (gle = GetLastError()) != ERROR_SUCCESS) {
197 if (gle == ERROR_INSUFFICIENT_BUFFER) {
199 /* The buffer wasn't big enough. We are going to
200 try allocating one. */
/* A non-positive rv is negated here and used as the size estimate. */
202 cch_dest = (-rv) + NLSERRCCH;
206 /* Something else is wrong */
210 } else if (rv < 0) { /* rv < 0 && gle == ERROR_SUCCESS */
212 /* Technically not one of the expected outcomes */
215 } else { /* rv > 0 || (rv == 0 && gle == ERROR_SUCCESS) */
217 /* Possibly succeeded */
219 if (rv == 0) { /* Succeeded and the return string is empty */
225 /* Nope. We only calculated the required size of the buffer */
227 cch_dest = rv + NLSERRCCH;
235 /* Can't NUL terminate */
236 cch_dest = max(rv,cch_dest) + NLSERRCCH;
/* dest is compared with ext_dest so that the caller's buffer is told apart
   from one obtained here; the guarded statement is not in this listing. */
245 if (dest != ext_dest && dest)
247 dest = malloc(cch_dest * sizeof(wchar_t));
252 if (dest != ext_dest && dest)
/*! \brief Normalize a Unicode string into a newly allocated buffer

  The input string will be normalized using NFC.

  \param[in] s UTF-16 string to be normalized.

  \param[in] cch_src The number of characters in the input string.  If
      this is -1, then the input string is assumed to be NUL
      terminated.

  \param[out] pcch_dest Receives the number of characters copied to
      the output buffer.  Note that the character count is the number
      of wchar_t characters copied, and not the count of Unicode code
      points.  This includes the terminating NUL if cch_src was -1 or
      included the terminating NUL.

  \return A newly allocated buffer holding the normalized string or
      NULL if the call failed.
*/
/* Allocating wrapper around NormalizeUtf16String(): passes a NULL
   destination so the callee supplies the buffer, then reports the resulting
   character count through pcch_dest.

   For a NULL or empty input, *pcch_dest is set to 1 when cch_src is non-zero
   and 0 otherwise; what is returned in that case is not visible here.
   NOTE(review): the initial value of cch_dest is not part of this listing;
   NormalizeUtf16String() documents that it must be 0 when the destination
   is NULL -- confirm. */
279 cm_normchar_t * cm_NormalizeStringAlloc(const cm_unichar_t * s, int cch_src, int *pcch_dest)
285 cm_InitNormalization();
287 if (s == NULL || cch_src == 0 || *s == L'\0') {
289 *pcch_dest = ((cch_src != 0)? 1: 0);
293 r = NormalizeUtf16String(s, cch_src, NULL, &cch_dest);
296 *pcch_dest = cch_dest;
/* Normalizes s into the caller-supplied buffer dest by calling
   NormalizeUtf16String() with tcch as the in/out character count.
   The surviving comment indicates a branch for the case where the supplied
   buffer was too small; its statements and the return value are not part of
   this listing. */
301 int cm_NormalizeString(const cm_unichar_t * s, int cch_src,
302 cm_normchar_t * dest, int cch_dest)
308 cm_InitNormalization();
310 r = NormalizeUtf16String(s, cch_src, dest, &tcch);
313 /* The supplied buffer was insufficient */
/*! \brief Convert a UTF-16 string to a UTF-8 string using a newly allocated buffer

  \param[in] s UTF-16 source string

  \param[in] cch_src Number of characters in \a s.  This can be set to
      -1 if \a s is NUL terminated.

  \param[out] pcch_dest Receives a count of characters that were
      copied to the target buffer.

  \return A newly allocated buffer holding the UTF-8 string.
*/
/* Converts s to UTF-8 in a malloc'ed buffer using the usual two-pass
   WideCharToMultiByte() pattern: the first call (NULL buffer, size 0) yields
   the required byte count, the second performs the conversion.

   For a NULL or empty input, *pcch_dest is set to 1 when cch_src is non-zero
   and 0 otherwise.  One extra element is allocated beyond cch_dest.
   NOTE(review): no check of the malloc() result is visible in this
   listing. */
334 cm_utf8char_t * cm_Utf16ToUtf8Alloc(const cm_unichar_t * s, int cch_src, int *pcch_dest)
337 cm_utf8char_t * dest;
340 cm_InitNormalization();
342 if (s == NULL || cch_src == 0 || *s == L'\0') {
344 *pcch_dest = ((cch_src != 0)?1:0);
/* Sizing pass: a zero-length destination makes the call return the count. */
348 cch_dest = WideCharToMultiByte(CP_UTF8, 0, s, cch_src, NULL, 0, NULL, FALSE);
352 *pcch_dest = cch_dest;
356 dest = malloc((cch_dest + 1) * sizeof(cm_utf8char_t));
358 WideCharToMultiByte(CP_UTF8, 0, s, cch_src, dest, cch_dest, NULL, FALSE);
362 *pcch_dest = cch_dest;
/* Converts src (cch_src UTF-16 units) to UTF-8 in the caller's buffer and
   returns WideCharToMultiByte()'s result unchanged, which is 0 on failure. */
367 int cm_Utf16ToUtf8(const cm_unichar_t * src, int cch_src,
368 cm_utf8char_t * dest, int cch_dest)
371 cm_InitNormalization();
373 return WideCharToMultiByte(CP_UTF8, 0, src, cch_src, dest, cch_dest, NULL, FALSE);
/* Copies a UTF-16 string without transformation.  Two paths are visible:
     - StringCchCopyW() followed by returning wcslen(dest) + 1 (presumably
       taken when cch_src is -1; the test is not in this listing -- TODO
       confirm);
     - a memcpy() of min(cch_src, cch_dest) code units.
   The return value of the second path is not visible here. */
376 int cm_Utf16ToUtf16(const cm_unichar_t * src, int cch_src,
377 cm_unichar_t * dest, int cch_dest)
380 cm_InitNormalization();
383 StringCchCopyW(dest, cch_dest, src);
384 return wcslen(dest) + 1;
/* Clamp to the smaller of the two lengths so neither buffer is overrun. */
386 int cch_conv = min(cch_src, cch_dest);
387 memcpy(dest, src, cch_conv * sizeof(cm_unichar_t));
/* \brief Normalize a UTF-16 string into a UTF-8 string.

  \param[in] src : Source string.

  \param[in] cch_src : Count of characters in src.  If the count includes the
      NUL terminator, then the resulting string will be NUL
      terminated.  If it is -1, then src is assumed to be NUL
      terminated.

  \param[out] adest : Destination buffer.

  \param[in] cch_adest : Number of characters in the destination buffer.

  Returns the number of characters stored into adest.  This will
  include the terminating NUL if cch_src included the terminating
  NUL or was -1.  If this is 0, then the operation was unsuccessful.
*/
/* Normalizes src with NormalizeUtf16String(), using an NLSMAXCCH-element
   stack buffer as the preferred destination, then converts the normalized
   text to UTF-8 in adest with WideCharToMultiByte().

   StringCchLengthW() bounds the source length at NLSMAXCCH (presumably on
   the cch_src == -1 path -- TODO confirm).  normalized is compared with nbuf
   afterwards so a heap buffer handed back by NormalizeUtf16String() is told
   apart from the stack buffer; the guarded statement and the return value
   are not part of this listing. */
409 long cm_NormalizeUtf16StringToUtf8(const wchar_t * src, int cch_src,
410 char * adest, int cch_adest)
413 cm_InitNormalization();
418 if (FAILED(StringCchLengthW(src, NLSMAXCCH, &cch)))
425 wchar_t nbuf[NLSMAXCCH];
426 wchar_t * normalized;
427 int cch_norm = NLSMAXCCH;
429 normalized = NormalizeUtf16String(src, cch_src, nbuf, &cch_norm);
431 cch_adest = WideCharToMultiByte(CP_UTF8, 0, normalized, cch_norm,
432 adest, cch_adest, NULL, 0);
434 if (normalized != nbuf && normalized)
/* Flag bit marking a table entry as "must be escaped".  0x1000 lies above
   the 0x00-0xff byte values stored in the table, so the two never collide. */
447 #define ESCVAL 0x1000
/* Builds a table entry for byte value c with the escape flag set. */
448 #define Esc(c) (ESCVAL + (short)(c))
/* Non-zero when table entry c carries the escape flag. */
449 #define IS_ESCAPED(c) (((c) & ESCVAL) == ESCVAL)
/* \brief Character sanitization map for CP-1252

  The following map indicates which characters should be escaped in
  the CP-1252 character map.  Characters that are documented as
  illegal characters in a file name are marked as escaped.  Escaped
  characters are marked using the ::Esc macro defined above.  The
  following exceptions apply:

  - Path delimiters '\\' and '/' are NOT escaped because the
    sanitization map applies to paths.  While those characters are
    illegal in filenames, they are legal in paths.

  - Wildcard characters '*' and '?' ARE escaped.  The document
    referred below does not specify these characters as invalid.
    Since no other escape mechanism exists, names containing
    wildcards are indistinguishable from actual wildcards used in SMB
    requests.

  - Reserved names are not and cannot be represented in this map.
    The reserved names include:

    CON, PRN, AUX, NUL, COM1, COM2, COM3, COM4, COM5, COM6, COM7,
    COM8, COM9, LPT1, LPT2, LPT3, LPT4, LPT5, LPT6, LPT7, LPT8, LPT9

  - Characters 0x80, 0x81, 0x8d, 0x8e, 0x8f, 0x90, 0x9d, 0x9e, 0x9f
    are also escaped because they are unused in CP-1252 and hence
    cannot be converted to a Unicode string.

  Reserved names with extensions are also invalid. (i.e. NUL.txt)

  \note The only bit we are actually interested in from the following
  table is the ESCVAL bit.  However, the characters themselves are
  included for ease of maintenance.

  \see "Naming a File" topic in the Windows SDK.
*/
/* 256-entry lookup table indexed by byte value.  An entry written with Esc()
   has the ESCVAL flag set, meaning the byte must be escaped; a plain entry
   is passed through.  Escaped here: control bytes 0x00-0x1f, the characters
   " * : < > ? | and 0x7f, plus 0x80, 0x81, 0x8d, 0x8e, 0x8f, 0x90, 0x9d and
   0x9e.
   NOTE(review): the descriptive comment preceding this table also lists
   0x9f as escaped, but the entry for 0x9f below is not wrapped in Esc().
   One of the two is wrong -- confirm which before changing either. */
488 static const short sanitized_escapes_1252[] = {
489 Esc(0x00),Esc(0x01),Esc(0x02),Esc(0x03),Esc(0x04),Esc(0x05),Esc(0x06),Esc(0x07),
490 Esc(0x08),Esc(0x09),Esc(0x0a),Esc(0x0b),Esc(0x0c),Esc(0x0d),Esc(0x0e),Esc(0x0f),
491 Esc(0x10),Esc(0x11),Esc(0x12),Esc(0x13),Esc(0x14),Esc(0x15),Esc(0x16),Esc(0x17),
492 Esc(0x18),Esc(0x19),Esc(0x1a),Esc(0x1b),Esc(0x1c),Esc(0x1d),Esc(0x1e),Esc(0x1f),
493 ' ','!',Esc('"'),'#','$','%','&','\'','(',')',Esc('*'),'+',',','-','.','/',
494 '0','1','2','3','4','5','6','7','8','9',Esc(':'),';',Esc('<'),'=',Esc('>'),Esc('?'),
495 '@','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
496 'P','Q','R','S','T','U','V','W','X','Y','Z','[','\\',']','^','_',
497 '`','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o',
498 'p','q','r','s','t','u','v','w','x','y','z','{',Esc('|'),'}','~',Esc(0x7f),
499 Esc(0x80),Esc(0x81),0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,Esc(0x8d),Esc(0x8e),Esc(0x8f),
500 Esc(0x90),0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,Esc(0x9d),Esc(0x9e),0x9f,
501 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
502 0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
503 0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
504 0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf,
505 0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
506 0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff
/* Copies src into odest, consulting sanitized_escapes_1252 for each byte.
   A byte flagged as escaped is emitted with its two lower-case hexadecimal
   digits (high nibble first); the prefix character written before the
   digits is not part of this listing.  The loop stops at the first NUL byte,
   when cch_src bytes have been consumed, or when the destination is full.

   Returns the number of bytes written, computed as dest - odest.

   NOTE(review): the table is indexed with *src, a plain char.  Where char is
   signed, bytes 0x80-0xff yield a negative index -- confirm whether the
   build treats char as unsigned, otherwise cast to unsigned char. */
509 static int sanitize_bytestring(const char * src, int cch_src,
510 char * odest, int cch_dest)
515 cm_InitNormalization();
517 while (cch_src > 0 && *src && cch_dest > 0) {
521 rc = sanitized_escapes_1252[*src];
522 if (IS_ESCAPED(rc)) {
523 static const char hex[] =
524 {'0','1','2','3','4','5','6','7',
525 '8','9','a','b','c','d','e','f'};
/* The & 0x0f masks keep the hex[] index in range even if *src is negative. */
533 *dest++ = hex[(((int)*src) >> 4) & 0x0f];
534 *dest++ = hex[(((int)*src) & 0x0f)];
546 if (cch_src > 0 && cch_dest > 0) {
550 return (int)(dest - odest);
/* Converts a UTF-8 byte string to UTF-16 and normalizes it into dest.

   Visible flow:
     - src is first converted with MultiByteToWideChar(CP_UTF8,
       MB_ERR_INVALID_CHARS) into an NLSMAXCCH-element stack buffer;
       cch_src is recomputed as strlen(src) + 1 on one path (presumably
       when it is -1 -- TODO confirm).
     - If that fails with ERROR_NO_UNICODE_TRANSLATION, the bytes are
       treated as CP-1252: they are passed through sanitize_bytestring()
       and converted again with code page 1252.
     - The wide string is then handed to NormalizeUtf16String() with dest
       as the preferred destination.
   Return values and the handling of an undersized dest are on lines that
   are not part of this listing. */
557 long cm_NormalizeUtf8StringToUtf16(const char * src, int cch_src,
558 wchar_t * dest, int cch_dest)
560 wchar_t wsrcbuf[NLSMAXCCH];
566 cm_InitNormalization();
568 /* Get some edge cases out first, so we don't have to worry about
569 cch_src being 0 etc. */
572 } else if (*src == '\0') {
579 cch_src = strlen(src) + 1;
582 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
583 cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
586 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
587 char sanitized[NLSMAXCCH];
590 /* If src doesn't have a unicode translation, then it
591 wasn't valid UTF-8. In this case, we assume that src
592 is CP-1252 and then try to convert again. But before
593 that, we use a translation table to "sanitize" the
596 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
597 sizeof(sanitized)/sizeof(char));
599 if (cch_sanitized == 0) {
606 cch = MultiByteToWideChar(1252, 0, sanitized,
607 cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
609 /* Well, that didn't work either. Something is very wrong. */
621 wnorm = NormalizeUtf16String(wsrcbuf, cch, dest, &cch_norm);
630 /* The buffer was insufficient */
631 if (dest != NULL && cch_dest > 1) {
/* Allocating variant of the UTF-8 to normalized UTF-16 conversion.

   Visible flow:
     - For a NULL or empty input, *pcch_dest is set to 1 when cch_src is
       non-zero and 0 otherwise.
     - src is converted with MultiByteToWideChar(CP_UTF8,
       MB_ERR_INVALID_CHARS) into an NLSMAXCCH-element stack buffer; on
       ERROR_NO_UNICODE_TRANSLATION the bytes are sanitized with
       sanitize_bytestring() and converted again as CP-1252.
     - NormalizeUtf16String() is called with a NULL destination so that it
       supplies the buffer, and the count it reports is stored in
       *pcch_dest.
   Return statements are on lines that are not part of this listing. */
642 cm_normchar_t *cm_NormalizeUtf8StringToUtf16Alloc(const cm_utf8char_t * src, int cch_src,
645 wchar_t wsrcbuf[NLSMAXCCH];
651 cm_InitNormalization();
653 /* Get some edge cases out first, so we don't have to worry about
654 cch_src being 0 etc. */
655 if (cch_src == 0 || src == NULL || *src == '\0') {
657 *pcch_dest = ((cch_src != 0)? 1 : 0);
662 cch_src = strlen(src) + 1;
665 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
666 cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
669 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
670 char sanitized[NLSMAXCCH];
673 /* If src doesn't have a unicode translation, then it
674 wasn't valid UTF-8. In this case, we assume that src
675 is CP-1252 and then try to convert again. But before
676 that, we use a translation table to "sanitize" the
679 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
680 sizeof(sanitized)/sizeof(char));
682 if (cch_sanitized == 0) {
689 cch = MultiByteToWideChar(1252, 0, sanitized,
690 cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
692 /* Well, that didn't work either. Something is very wrong. */
704 wnorm = NormalizeUtf16String(wsrcbuf, cch, NULL, &cch_norm);
713 *pcch_dest = cch_norm;
/* Converts a UTF-8 byte string to UTF-16 directly into the caller's buffer,
   without normalization.

   The first attempt uses MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS).
   If it fails with ERROR_NO_UNICODE_TRANSLATION, the input is treated as
   CP-1252: it is passed through sanitize_bytestring() and converted again
   with code page 1252.  cch_src is recomputed as strlen(src) + 1 on one path
   (presumably when it is -1 -- TODO confirm).  Return statements are on
   lines that are not part of this listing. */
718 int cm_Utf8ToUtf16(const cm_utf8char_t * src, int cch_src,
719 cm_unichar_t * dest, int cch_dest)
724 cm_InitNormalization();
727 cch_src = strlen(src) + 1;
730 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
731 cch_src * sizeof(char), dest, cch_dest);
734 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
735 char sanitized[NLSMAXCCH];
738 /* If src doesn't have a unicode translation, then it
739 wasn't valid UTF-8. In this case, we assume that src
740 is CP-1252 and then try to convert again. But before
741 that, we use a translation table to "sanitize" the
744 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
745 sizeof(sanitized)/sizeof(char));
747 if (cch_sanitized == 0) {
754 cch = MultiByteToWideChar(1252, 0, sanitized,
755 cch_sanitized * sizeof(char), dest, cch_dest);
757 /* Well, that didn't work either. Something is very wrong. */
/* Converts a UTF-8 byte string to UTF-16 in a malloc'ed buffer, without
   normalization.

   Visible flow:
     - For a NULL or empty input, *pcch_dest is set to 1 when cch_src is
       non-zero and 0 otherwise.
     - A sizing call to MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS)
       with a NULL buffer yields the required length.
     - If it fails with ERROR_NO_UNICODE_TRANSLATION, the bytes are
       sanitized with sanitize_bytestring(), sized and converted as CP-1252
       into a buffer of cch + 1 wchar_t elements.
     - Otherwise a buffer of cch + 1 elements is allocated and the UTF-8
       conversion is repeated into it.
   NOTE(review): no check of either malloc() result is visible in this
   listing.  Return statements are also on lines not shown here. */
774 cm_unichar_t * cm_Utf8ToUtf16Alloc(const cm_utf8char_t * src, int cch_src, int *pcch_dest)
776 cm_unichar_t * ustr = NULL;
780 cm_InitNormalization();
782 if (cch_src == 0 || src == NULL || *src == '\0') {
784 *pcch_dest = ((cch_src != 0)? 1 : 0);
789 cch_src = strlen(src) + 1;
792 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
793 cch_src * sizeof(char), NULL, 0);
796 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
797 char sanitized[NLSMAXCCH];
800 /* If src doesn't have a unicode translation, then it
801 wasn't valid UTF-8. In this case, we assume that src
802 is CP-1252 and then try to convert again. But before
803 that, we use a translation table to "sanitize" the
806 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
807 sizeof(sanitized)/sizeof(char));
809 if (cch_sanitized == 0) {
816 cch = MultiByteToWideChar(1252, 0, sanitized,
817 cch_sanitized * sizeof(char), NULL, 0);
819 /* Well, that didn't work either. Something is very wrong. */
826 ustr = malloc((cch + 1) * sizeof(wchar_t));
828 cch = MultiByteToWideChar(1252, 0, sanitized,
829 cch_sanitized * sizeof(char), ustr, cch);
836 ustr = malloc((cch + 1) * sizeof(wchar_t));
838 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
839 cch_src * sizeof(char), ustr, cch);
/* \brief Normalize a UTF-8 string.

  \param[in] src String to normalize.

  \param[in] cch_src : Count of characters in src.  If this value is
      -1, then src is assumed to be NUL terminated.  The translated
      string will be NUL terminated only if this is -1 or the count
      includes the terminating NUL.

  \param[out] adest : Destination string.  Only considered valid if
      \a cch_adest is non-zero.

  \param[in] cch_adest : Number of characters in the destination
      string.  If this is zero, then the return value is the number
      of bytes required.

  \return If \a cch_adest is non-zero, then the return value is the
      number of bytes stored into adest.  If \a cch_adest is zero,
      then the return value is the number of bytes required.  In both
      cases, the return value is 0 if the call was unsuccessful.
*/
/* Normalizes a UTF-8 byte string and writes the result back out as UTF-8.

   Visible flow:
     - src is converted with MultiByteToWideChar(CP_UTF8,
       MB_ERR_INVALID_CHARS) into an NLSMAXCCH-element stack buffer; on
       ERROR_NO_UNICODE_TRANSLATION the bytes are sanitized with
       sanitize_bytestring() and converted again as CP-1252.
     - NormalizeUtf16String() is called with a NULL destination, so the
       normalized text comes back in a buffer it supplies.
     - WideCharToMultiByte(CP_UTF8) writes the normalized text into adest.
   The release of the normalized buffer and the return statements are on
   lines that are not part of this listing. */
872 long cm_NormalizeUtf8String(const char * src, int cch_src,
873 char * adest, int cch_adest)
875 wchar_t wsrcbuf[NLSMAXCCH];
881 cm_InitNormalization();
883 /* Get some edge cases out first, so we don't have to worry about
884 cch_src being 0 etc. */
887 } else if (*src == '\0') {
894 cch_src = strlen(src) + 1;
897 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
898 cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
901 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
902 char sanitized[NLSMAXCCH];
905 /* If src doesn't have a unicode translation, then it
906 wasn't valid UTF-8. In this case, we assume that src
907 is CP-1252 and then try to convert again. But before
908 that, we use a translation table to "sanitize" the
911 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
912 sizeof(sanitized)/sizeof(char));
914 if (cch_sanitized == 0) {
921 cch = MultiByteToWideChar(1252, 0, sanitized,
922 cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
924 /* Well, that didn't work either. Something is very wrong. */
936 wnorm = NormalizeUtf16String(wsrcbuf, cch, NULL, &cch_norm);
944 cch = WideCharToMultiByte(CP_UTF8, 0, wnorm,
945 cch_norm, adest, cch_adest * sizeof(char),
/*! \brief Case insensitive comparison with specific length

  \param[in] str1 First string to compare.  Assumed to be encoded in UTF-8.

  \param[in] str2 Second string to compare.  Assumed to be encoded in UTF-8.

  \param[in] n Max byte count.
*/
/* Case-insensitive comparison of two UTF-8 strings limited to n bytes.
   Both inputs are converted to UTF-16 in NLSMAXCCH-element stack buffers and
   compared with CompareStringW(nls_lcid, NORM_IGNORECASE).  NULL pointers
   are handled first; the values returned for them, and the mapping of the
   CompareStringW() result to the return value, are not part of this
   listing.
   NOTE(review): n is passed as the input byte count for both strings.  A
   positive count makes MultiByteToWideChar() process exactly that many
   bytes, so a string shorter than n may be read past its terminator --
   confirm how callers choose n. */
963 int cm_strnicmp_utf8(const char * str1, const char * str2, int n)
965 wchar_t wstr1[NLSMAXCCH];
968 wchar_t wstr2[NLSMAXCCH];
972 cm_InitNormalization();
974 /* first check for NULL pointers (assume NULL < "") */
980 } else if (str2 == NULL) {
984 len1 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str1, n, wstr1, NLSMAXCCH);
992 len2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str2, n, wstr2, NLSMAXCCH);
1000 rv = CompareStringW(nls_lcid, NORM_IGNORECASE, wstr1, len1, wstr2, len2);
/* Case-insensitive comparison of two UTF-16 strings limited to len code
   units.  StringCchLengthW() measures each string with len as the upper
   bound, and the measured lengths are passed to CompareStringW(nls_lcid,
   NORM_IGNORECASE).  The handling of NULL pointers, of a failed length
   query, and the mapping of the result to the return value are on lines
   that are not part of this listing. */
1011 int cm_strnicmp_utf16(const cm_unichar_t * str1, const cm_unichar_t * str2, int len)
1018 cm_InitNormalization();
1020 /* first check for NULL pointers */
1026 } else if (str2 == NULL) {
1030 if (FAILED(StringCchLengthW(str1, len, &cch1)))
1033 if (FAILED(StringCchLengthW(str2, len, &cch2)))
1036 rv = CompareStringW(nls_lcid, NORM_IGNORECASE, str1, cch1, str2, cch2);
/* Case-insensitive comparison of two NUL-terminated UTF-16 strings using
   CompareStringW(nls_lcid, NORM_IGNORECASE); the -1 lengths tell the API to
   find the terminators itself.  NULL-pointer results and the mapping of rv
   to the return value are on lines that are not part of this listing. */
1047 int cm_stricmp_utf16(const cm_unichar_t * str1, const cm_unichar_t * str2)
1052 cm_InitNormalization();
1054 /* first check for NULL pointers */
1060 } else if (str2 == NULL) {
1064 rv = CompareStringW(nls_lcid, NORM_IGNORECASE, str1, -1, str2, -1);
/* Lower-cases a NUL-terminated UTF-16 string in place: str is passed to
   LCMapStringW(LCMAP_LOWERCASE) as both source and destination, with a
   length that includes the terminator.  The handling of rv and the return
   statement are on lines that are not part of this listing. */
1075 cm_unichar_t *cm_strlwr_utf16(cm_unichar_t * str)
1081 cm_InitNormalization();
1083 len = wcslen(str) + 1;
1084 rv = LCMapStringW(nls_lcid, LCMAP_LOWERCASE, str, len, str, len);
/* Upper-cases a NUL-terminated UTF-16 string in place: str is passed to
   LCMapStringW(LCMAP_UPPERCASE) as both source and destination, with a
   length that includes the terminator.  The handling of rv and the return
   statement are on lines that are not part of this listing. */
1094 cm_unichar_t *cm_strupr_utf16(cm_unichar_t * str)
1100 cm_InitNormalization();
1102 len = wcslen(str) + 1;
1103 rv = LCMapStringW(nls_lcid, LCMAP_UPPERCASE, str, len, str, len);
/* Case-insensitive comparison of two NUL-terminated UTF-8 strings.  Each is
   converted to UTF-16 in an NLSMAXCCH-element stack buffer (-1 makes
   MultiByteToWideChar() convert through the terminator), then the pair is
   compared with CompareStringW(nls_lcid, NORM_IGNORECASE).  NULL-pointer
   results, conversion-failure handling and the mapping of rv to the return
   value are on lines that are not part of this listing. */
1114 int cm_stricmp_utf8(const char * str1, const char * str2)
1116 wchar_t wstr1[NLSMAXCCH];
1119 wchar_t wstr2[NLSMAXCCH];
1123 cm_InitNormalization();
1125 /* first check for NULL pointers */
1131 } else if (str2 == NULL) {
1135 len1 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str1, -1, wstr1, NLSMAXCCH);
1143 len2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str2, -1, wstr2, NLSMAXCCH);
1151 rv = CompareStringW(nls_lcid, NORM_IGNORECASE, wstr1, len1, wstr2, len2);
/* Upper-cases wstr via an NLSMAXCCH-element stack buffer: LCMapStringW()
   writes the upper-cased text to wstrd, which StringCbCopyW() then copies
   back over wstr, bounded by cbstr bytes.
   NOTE(review): the source length given to LCMapStringW() is the buffer
   capacity (cbstr / sizeof(wchar_t)), not the string length, so elements
   after the terminator are also processed -- confirm this is intended.
   The return statement is not part of this listing. */
1163 wchar_t * strupr_utf16(wchar_t * wstr, size_t cbstr)
1165 wchar_t wstrd[NLSMAXCCH];
1169 cm_InitNormalization();
1171 len = cbstr / sizeof(wchar_t);
1172 len = LCMapStringW(nls_lcid, LCMAP_UPPERCASE, wstr, len, wstrd, NLSMAXCCH);
1173 StringCbCopyW(wstr, cbstr, wstrd);
/* Upper-cases a NUL-terminated UTF-8 string in place in three steps: convert
   to UTF-16 (wstr), upper-case with LCMapStringW() into wstrd, then convert
   wstrd back to UTF-8 over str, bounded by cbstr bytes.  Both scratch
   buffers hold NLSMAXCCH elements.  Failure checks between the steps and the
   return statement are on lines that are not part of this listing. */
1179 char * strupr_utf8(char * str, size_t cbstr)
1181 wchar_t wstr[NLSMAXCCH];
1182 wchar_t wstrd[NLSMAXCCH];
1186 cm_InitNormalization();
1188 len = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, -1, wstr, NLSMAXCCH);
1192 len = LCMapStringW(nls_lcid, LCMAP_UPPERCASE, wstr, len, wstrd, NLSMAXCCH);
1194 len = WideCharToMultiByte(CP_UTF8, 0, wstrd, -1, str, cbstr, NULL, FALSE);
/* Returns a pointer to the byte following the UTF-8 sequence that starts at
   c.  A byte with the top bit clear is a one-byte character; otherwise the
   high nibble of the lead byte selects an advance of 2, 3, 4 or 1 bytes.
   NOTE(review): the case labels that pair each nibble value with its return
   are not part of this listing; presumably 0xc0/0xd0 -> 2, 0xe0 -> 3,
   0xf0 -> 4 and everything else -> 1, per the UTF-8 encoding -- confirm.
   The bytes being skipped are not inspected in the visible lines. */
1199 char * char_next_utf8(const char * c)
/* CH reads the current byte as unsigned so the bit tests are sign-safe. */
1201 #define CH (*((const unsigned char *)c))
1203 if ((CH & 0x80) == 0)
1204 return (char *) c+1;
1206 switch (CH & 0xf0) {
1209 return (char *) c+2;
1212 return (char *) c+3;
1215 return (char *) c+4;
1218 return (char *) c+1;
/* Locates the start of the UTF-8 character that precedes c.  The visible
   tests distinguish a single-byte character (top bit clear) from
   continuation bytes (top two bits 10), which the while loop steps over.
   NOTE(review): the statements that move c backwards and the return
   statements are not part of this listing.  No lower bound on the backward
   scan is visible, so the caller presumably guarantees c is not at the
   start of the buffer -- confirm. */
1225 char * char_prev_utf8(const char * c)
/* CH reads the current byte as unsigned so the bit tests are sign-safe. */
1227 #define CH (*((const unsigned char *)c))
1231 if ((CH & 0x80) == 0)
1234 while ((CH & 0xc0) == 0x80)
/*! \brief Advance to the next character in a UTF-16 string.

  \param[in] c Pointer to the first code unit of the current character.

  \return \a c advanced by two code units when \a *c is a high (lead)
      surrogate (0xD800-0xDBFF), otherwise advanced by one.

  \note The code unit that follows a high surrogate is not examined.
      For a lone high surrogate at the end of a string the result
      therefore points past the terminator; callers are expected to
      supply well-formed UTF-16.
*/
wchar_t * char_next_utf16(const wchar_t * c)
{
    /* Only the low 16 bits matter, even where wchar_t is wider. */
    unsigned short sc = (unsigned short) *c;

    if (sc >= 0xd800 && sc <= 0xdbff)
        return (wchar_t *) c + 2;   /* surrogate pair: two code units */

    return (wchar_t *) c + 1;
}
/*! \brief Step back to the previous character in a UTF-16 string.

  \param[in] c Pointer just past the character of interest.  At least
      one code unit must precede it.

  \return A pointer to the code unit before \a c, or to the one before
      that when the unit immediately before \a c is a low (trail)
      surrogate (0xDC00-0xDFFF).

  \note The unit preceding a low surrogate is not checked for being a
      high surrogate, and no buffer start is known here; callers are
      expected to supply well-formed UTF-16.
*/
wchar_t * char_prev_utf16(const wchar_t * c)
{
    /* Step onto the preceding code unit before classifying it. */
    unsigned short sc = (unsigned short) *(--c);

    if (sc >= 0xdc00 && sc <= 0xdfff)
        return (wchar_t *) --c;     /* second half of a pair: back up again */

    return (wchar_t *) c;
}
/*! \brief Find the start of the UTF-16 character containing a code unit.

  \param[in] c Pointer to any code unit of a character.

  \return \a c itself, or \a c - 1 when \a *c is a low (trail)
      surrogate (0xDC00-0xDFFF), i.e. the second half of a pair.

  \note The preceding unit is not checked for being a high surrogate;
      callers are expected to supply well-formed UTF-16.
*/
wchar_t * char_this_utf16(const wchar_t * c)
{
    /* Only the low 16 bits matter, even where wchar_t is wider. */
    unsigned short sc = (unsigned short) *c;

    if (sc >= 0xdc00 && sc <= 0xdfff)
        return (wchar_t *) --c;     /* on the trail unit: back up to the lead */

    return (wchar_t *) c;
}