2 * Copyright (c) 2008 Secure Endpoints Inc.
4 * Permission is hereby granted, free of charge, to any person
5 * obtaining a copy of this software and associated documentation
6 * files (the "Software"), to deal in the Software without
7 * restriction, including without limitation the rights to use, copy,
8 * modify, merge, publish, distribute, sublicense, and/or sell copies
9 * of the Software, and to permit persons to whom the Software is
10 * furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
37 /* This is part of the Microsoft Internationalized Domain Name
39 #include <normalization.h>
41 /* TODO: All the normalization and conversion code should NUL
42 terminate destination strings. */
/* Function pointer through which NormalizeString() is called.  It is
   initialised to NULL here; cm_InitNormalization() looks the symbol up with
   GetProcAddress() and presumably stores the result in this pointer (the
   assignment line and the head of this declaration are not visible in this
   view). */
45 (WINAPI *pNormalizeString)( __in NORM_FORM NormForm,
46 __in_ecount(cwSrcLength) LPCWSTR lpSrcString,
48 __out_ecount(cwDstLength) LPWSTR lpDstString,
49 __in int cwDstLength ) = NULL;
/* Function pointer through which IsNormalizedString() is called.  It is
   initialised to NULL here; cm_InitNormalization() looks the symbol up with
   GetProcAddress() and presumably stores the result in this pointer (the
   assignment line and the head of this declaration are not visible in this
   view). */
52 (WINAPI *pIsNormalizedString)( __in NORM_FORM NormForm,
53 __in_ecount(cwLength) LPCWSTR lpString,
54 __in int cwLength ) = NULL;
/* Name of the DLL handed to LoadLibrary() in cm_InitNormalization(). */
57 #define NLSDLLNAME "Normaliz.dll"
/* Element count of the fixed-size wchar_t and char scratch buffers declared
   by the conversion routines in this file. */
58 #define NLSMAXCCH 1024
/* Normalization form passed to NormalizeString() and IsNormalizedString(). */
61 #define AFS_NORM_FORM NormalizationC
/* Load the normalization DLL and resolve the NormalizeString and
   IsNormalizedString entry points.  The visible return expression is
   non-zero only when both function pointers are non-NULL. */
63 long cm_InitNormalization(void)
/* Presumably an "already initialised" shortcut; the statement guarded by
   this test is not visible in this view. */
67 if (pNormalizeString != NULL)
70 h_Nls = LoadLibrary(NLSDLLNAME);
/* NOTE(review): LoadLibrary() is documented to return NULL on failure, not
   INVALID_HANDLE_VALUE, so this comparison looks unable to detect a failed
   load -- confirm and compare against NULL instead. */
71 if (h_Nls == INVALID_HANDLE_VALUE) {
76 (int (WINAPI *)( NORM_FORM, LPCWSTR,
78 GetProcAddress(h_Nls, "NormalizeString");
82 (WINAPI *)( NORM_FORM, LPCWSTR, int ))
83 GetProcAddress(h_Nls, "IsNormalizedString");
85 return (pNormalizeString && pIsNormalizedString);
88 /* \brief Normalize a UTF-16 string.
90 If the supplied destination buffer is insufficient or NULL, then a
91 new buffer will be allocated to hold the normalized string.
93 \param[in] src : Source UTF-16 string. Length is specified in
96 \param[in] cch_src : The character count in cch_src is assumed to
97 be tight and include the terminating NULL character if there is
98 one. If the NULL is absent, the resulting string will not be
101 \param[out] ext_dest : The destination buffer. Can be NULL, in
102 which case *pcch_dest MUST be 0.
104 \param[in,out] pcch_dest : On entry *pcch_dest contains a count of
105 characters in the destination buffer. On exit, it will contain
106 a count of characters that were copied to the destination
109 Returns a pointer to the buffer containing the normalized string or
110 NULL if the call was unsuccessful. If the returned destination
111 buffer is different from the supplied buffer and non-NULL, it
112 should be freed using free().
/* Core UTF-16 normalization helper (the return-type line is not visible in
   this view).  Two paths are visible below:
   - when IsNormalizedString() reports src as already normalized, or no
     NormalizeString pointer is available, src is copied verbatim with
     memcpy() into ext_dest, or into a malloc()ed buffer when ext_dest is
     NULL or holds fewer than cch_src characters;
   - otherwise NormalizeString() is called inside a bounded retry loop, and
     the destination size is re-estimated from its return value plus
     NLSERRCCH before another buffer is malloc()ed. */
115 NormalizeUtf16String(const wchar_t * src, int cch_src, wchar_t * ext_dest, int *pcch_dest)
118 assert (pNormalizeString != NULL && pIsNormalizedString != NULL);
/* Presumably executed only when the caller did not supply a length; the
   guarding test is not visible in this view. */
122 cch_src = wcslen(src) + 1;
124 if ((pIsNormalizedString && (*pIsNormalizedString)(AFS_NORM_FORM, src, cch_src)) ||
125 (!pNormalizeString)) {
127 if (ext_dest == NULL || *pcch_dest < cch_src) {
/* NOTE(review): no NULL check on this allocation is visible before the
   memcpy() below -- confirm one exists in the lines missing from this view. */
128 ext_dest = malloc(cch_src * sizeof(wchar_t));
129 *pcch_dest = cch_src;
132 /* No need to or unable to normalize. Just copy the string.
133 Note that the string is not NUL terminated if the source
134 string is not NUL terminated. */
137 memcpy(ext_dest, src, cch_src * sizeof(wchar_t));
138 *pcch_dest = cch_src;
/* Normalization path: start from the caller-supplied capacity. */
150 int cch_dest = *pcch_dest;
154 while (tries-- > 0) {
156 rv = (*pNormalizeString)(AFS_NORM_FORM, src, cch_src, dest, cch_dest);
158 if (rv <= 0 && (gle = GetLastError()) != ERROR_SUCCESS) {
159 if (gle == ERROR_INSUFFICIENT_BUFFER) {
161 /* The buffer wasn't big enough. We are going to
162 try allocating one. */
164 cch_dest = (-rv) + NLSERRCCH;
168 /* Something else is wrong */
172 } else if (rv < 0) { /* rv < 0 && gle == ERROR_SUCCESS */
174 /* Technically not one of the expected outcomes */
177 } else { /* rv > 0 || (rv == 0 && gle == ERROR_SUCCESS) */
179 /* Possibly succeeded */
181 if (rv == 0) { /* Succeeded and the return string is empty */
187 /* Nope. We only calculated the required size of the buffer */
189 cch_dest = rv + NLSERRCCH;
197 /* Can't NUL terminate */
198 cch_dest = max(rv,cch_dest) + NLSERRCCH;
/* The test below singles out a scratch buffer that is not the caller's
   ext_dest; presumably it is released before the next allocation (the
   guarded statement is not visible in this view). */
207 if (dest != ext_dest && dest)
209 dest = malloc(cch_dest * sizeof(wchar_t));
214 if (dest != ext_dest && dest)
222 /*! \brief Normalize a Unicode string into a newly allocated buffer
224 The input string will be normalized using NFC.
226 \param[in] s UTF-16 string to be normalized.
228 \param[in] cch_src The number of characters in the input string. If
229 this is -1, then the input string is assumed to be NUL
232 \param[out] pcch_dest Receives the number of characters copied to
233 the output buffer. Note that the character count is the number
234 of wchar_t characters copied, and not the count of Unicode code
235 points. This includes the terminating NUL if cch_src was -1 or
236 included the terminating NUL.
238 \return A newly allocated buffer holding the normalized string or
239 NULL if the call failed.
/* Normalize s by calling NormalizeUtf16String() with a NULL destination, so
   that helper supplies the buffer, then report the resulting character
   count through pcch_dest. */
241 cm_normchar_t * cm_NormalizeStringAlloc(const cm_unichar_t * s, int cch_src, int *pcch_dest)
/* Empty-input shortcut: the reported count is 1 when cch_src is non-zero
   and 0 otherwise.  What is returned on this path is not visible in this
   view. */
246 if (s == NULL || cch_src == 0 || *s == L'\0') {
248 *pcch_dest = ((cch_src != 0)? 1: 0);
252 r = NormalizeUtf16String(s, cch_src, NULL, &cch_dest);
255 *pcch_dest = cch_dest;
/* Normalize s into the caller-supplied buffer dest by delegating to
   NormalizeUtf16String().  The surviving comment shows that an insufficient
   buffer is treated as a distinct case; how tcch is initialised and what is
   returned are not visible in this view. */
260 int cm_NormalizeString(const cm_unichar_t * s, int cch_src,
261 cm_normchar_t * dest, int cch_dest)
266 r = NormalizeUtf16String(s, cch_src, dest, &tcch);
269 /* The supplied buffer was insufficient */
277 /*! \brief Convert a UTF-16 string to a UTF-8 string using a newly allocated buffer
279 \param[in] s UTF-16 source string
281 \param[in] cch_src Number of characters in \a s. This can be set to
282 -1 if \a s is NUL terminated.
284 \param[out] pcch_dest Receives a count of characters that were
285 copied to the target buffer.
287 \return A newly allocated buffer holding the UTF-8 string.
/* Convert a UTF-16 string to UTF-8 in a malloc()ed buffer using two
   WideCharToMultiByte() calls: the first, with a NULL destination and zero
   size, obtains the required length; the second performs the conversion. */
290 cm_utf8char_t * cm_Utf16ToUtf8Alloc(const cm_unichar_t * s, int cch_src, int *pcch_dest)
293 cm_utf8char_t * dest;
/* Empty-input shortcut: the reported count is 1 when cch_src is non-zero
   and 0 otherwise. */
295 if (s == NULL || cch_src == 0 || *s == L'\0') {
297 *pcch_dest = ((cch_src != 0)?1:0);
/* Size query only: no output buffer is supplied. */
301 cch_dest = WideCharToMultiByte(CP_UTF8, 0, s, cch_src, NULL, 0, NULL, FALSE);
305 *pcch_dest = cch_dest;
/* One element more than the reported length is allocated.
   NOTE(review): no NULL check on this allocation is visible before the
   conversion call -- confirm one exists in the lines missing from this
   view. */
309 dest = malloc((cch_dest + 1) * sizeof(cm_utf8char_t));
311 WideCharToMultiByte(CP_UTF8, 0, s, cch_src, dest, cch_dest, NULL, FALSE);
315 *pcch_dest = cch_dest;
320 int cm_Utf16ToUtf8(const cm_unichar_t * src, int cch_src,
321 cm_utf8char_t * dest, int cch_dest)
323 return WideCharToMultiByte(CP_UTF8, 0, src, cch_src, dest, cch_dest, NULL, FALSE);
/* Copy a UTF-16 string into dest.  Two copy strategies are visible: a
   bounded string copy with StringCchCopyW() that returns the copied length
   plus one, and a counted memcpy() of min(cch_src, cch_dest) characters.
   The condition selecting between them is not visible in this view.
   NOTE(review): the StringCchCopyW() result is not checked before
   wcslen(dest) is evaluated -- confirm dest is always terminated on that
   path (for example when cch_dest is 0). */
326 int cm_Utf16ToUtf16(const cm_unichar_t * src, int cch_src,
327 cm_unichar_t * dest, int cch_dest)
330 StringCchCopyW(dest, cch_dest, src);
331 return wcslen(dest) + 1;
333 int cch_conv = min(cch_src, cch_dest);
334 memcpy(dest, src, cch_conv * sizeof(cm_unichar_t));
339 /* \brief Normalize a UTF-16 string into a UTF-8 string.
341 \param[in] src : Source string.
343 \param[in] cch_src : Count of characters in src. If the count includes the
344 NULL terminator, then the resulting string will be NULL
345 terminated. If it is -1, then src is assumed to be NULL
348 \param[out] adest : Destination buffer.
350 \param[in] cch_adest : Number of characters in the destination buffer.
352 Returns the number of characters stored into adest. This will
353 include the terminating NULL if cch_src included the terminating
354 NULL or was -1. If this is 0, then the operation was unsuccessful.
/* Normalize a UTF-16 string and convert the result to UTF-8 in adest.
   Normalization first targets the stack buffer nbuf (NLSMAXCCH characters);
   NormalizeUtf16String() may instead return a different buffer, which the
   final test below distinguishes from nbuf. */
356 long cm_NormalizeUtf16StringToUtf8(const wchar_t * src, int cch_src,
357 char * adest, int cch_adest)
/* Length probe limited to NLSMAXCCH characters; the handling of a failed
   probe is not visible in this view. */
362 if (FAILED(StringCchLengthW(src, NLSMAXCCH, &cch)))
369 wchar_t nbuf[NLSMAXCCH];
370 wchar_t * normalized;
371 int cch_norm = NLSMAXCCH;
373 normalized = NormalizeUtf16String(src, cch_src, nbuf, &cch_norm);
/* cch_adest is reused to hold the WideCharToMultiByte() result. */
375 cch_adest = WideCharToMultiByte(CP_UTF8, 0, normalized, cch_norm,
376 adest, cch_adest, NULL, 0);
/* Presumably frees a heap buffer handed back by NormalizeUtf16String(); the
   guarded statement is not visible in this view. */
378 if (normalized != nbuf && normalized)
/* Flag bit marking a table entry as "must be escaped". */
391 #define ESCVAL 0x1000
/* Build a table entry for character c with the escape flag added. */
392 #define Esc(c) (ESCVAL + (short)(c))
/* True when table entry c has the escape flag set. */
393 #define IS_ESCAPED(c) (((c) & ESCVAL) == ESCVAL)
395 /* \brief Character sanitization map for CP-1252
397 The following map indicates which characters should be escaped in
398 the CP-1252 character map. Characters that are documented as
399 illegal characters in a file name are marked as escaped. Escaped
400 characters are marked using the ::Esc macro defined above. The
401 following exceptions apply:
403 - Path delimiters '\\' and '/' are NOT escaped because the
404 sanitization map applies to paths. While those characters are
405 illegal in filenames, they are legal in paths.
407 - Wildcard characters '*' and '?' ARE escaped. The document
408 referred below does not specify these characters as invalid.
409 Since no other escape mechanism exists, names containing
410 wildcards are indistinguishable from actual wildcards used in SMB
413 - Reserved names are not and cannot be represented in this map.
416 CON, PRN, AUX, NUL, COM1, COM2, COM3, COM4, COM5, COM6, COM7,
417 COM8, COM9, LPT1, LPT2, LPT3, LPT4, LPT5, LPT6, LPT7, LPT8, LPT9,
420 - Characters 0x80, 0x81, 0x8d, 0x8e, 0x8f, 0x90, 0x9d, 0x9e, 0x9f
421 are also escaped because they are unused in CP-1252 and hence
422 cannot be converted to a Unicode string.
424 Reserved names with extensions are also invalid. (i.e. NUL.txt)
426 \note The only bit we are actually interested in from the following
427 table is the ESCVAL bit. However, the characters themselves are
428 included for ease of maintenance.
430 \see "Naming a File" topic in the Windows SDK.
/* 256-entry lookup table indexed by byte value.  Entries wrapped in Esc()
   carry the ESCVAL flag, which sanitize_bytestring() tests with
   IS_ESCAPED(); the character values themselves are kept only for
   readability.
   NOTE(review): the descriptive comment preceding this table lists 0x9f
   among the escaped bytes, but the 0x9f entry below is not wrapped in
   Esc() -- confirm which of the two is intended. */
432 static const short sanitized_escapes_1252[] = {
433 Esc(0x00),Esc(0x01),Esc(0x02),Esc(0x03),Esc(0x04),Esc(0x05),Esc(0x06),Esc(0x07),
434 Esc(0x08),Esc(0x09),Esc(0x0a),Esc(0x0b),Esc(0x0c),Esc(0x0d),Esc(0x0e),Esc(0x0f),
435 Esc(0x10),Esc(0x11),Esc(0x12),Esc(0x13),Esc(0x14),Esc(0x15),Esc(0x16),Esc(0x17),
436 Esc(0x18),Esc(0x19),Esc(0x1a),Esc(0x1b),Esc(0x1c),Esc(0x1d),Esc(0x1e),Esc(0x1f),
437 ' ','!',Esc('"'),'#','$','%','&','\'','(',')',Esc('*'),'+',',','-','.','/',
438 '0','1','2','3','4','5','6','7','8','9',Esc(':'),';',Esc('<'),'=',Esc('>'),Esc('?'),
439 '@','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
440 'P','Q','R','S','T','U','V','W','X','Y','Z','[','\\',']','^','_',
441 '`','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o',
442 'p','q','r','s','t','u','v','w','x','y','z','{',Esc('|'),'}','~',Esc(0x7f),
443 Esc(0x80),Esc(0x81),0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,Esc(0x8d),Esc(0x8e),Esc(0x8f),
444 Esc(0x90),0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,Esc(0x9d),Esc(0x9e),0x9f,
445 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
446 0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
447 0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
448 0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf,
449 0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
450 0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff
/* Copy src to odest, consulting sanitized_escapes_1252 for each byte.
   Bytes flagged as escaped are expanded using two lowercase hexadecimal
   digits (any prefix written before the digits is not visible in this
   view).  The return value is the number of chars written to odest. */
453 static int sanitize_bytestring(const char * src, int cch_src,
454 char * odest, int cch_dest)
/* Stops at the source count, at a NUL byte, or when the destination is
   exhausted, whichever comes first. */
457 while (cch_src > 0 && *src && cch_dest > 0) {
/* NOTE(review): *src has type char.  Where plain char is signed, bytes
   0x80 and above are negative and would index before the start of the
   table -- confirm, and cast to unsigned char if so. */
461 rc = sanitized_escapes_1252[*src];
462 if (IS_ESCAPED(rc)) {
463 static const char hex[] =
464 {'0','1','2','3','4','5','6','7',
465 '8','9','a','b','c','d','e','f'};
/* High nibble first, then low nibble; the 0x0f masks keep both indices
   inside hex[] even when *src is negative. */
473 *dest++ = hex[(((int)*src) >> 4) & 0x0f];
474 *dest++ = hex[(((int)*src) & 0x0f)];
/* Presumably appends a terminator when both counts have room left; the
   guarded statements are not visible in this view. */
486 if (cch_src > 0 && cch_dest > 0) {
490 return (int)(dest - odest);
/* Convert a UTF-8 string to UTF-16 and normalize it into dest.  The input
   is first decoded strictly as UTF-8 into the stack buffer wsrcbuf
   (NLSMAXCCH characters).  If that fails with
   ERROR_NO_UNICODE_TRANSLATION, the bytes are passed through
   sanitize_bytestring() and decoded again as code page 1252.  The UTF-16
   text is then handed to NormalizeUtf16String(). */
497 long cm_NormalizeUtf8StringToUtf16(const char * src, int cch_src,
498 wchar_t * dest, int cch_dest)
500 wchar_t wsrcbuf[NLSMAXCCH];
505 /* Get some edge cases out first, so we don't have to worry about
506 cch_src being 0 etc. */
509 } else if (*src == '\0') {
516 cch_src = strlen(src) + 1;
/* Strict decode: MB_ERR_INVALID_CHARS makes invalid UTF-8 fail. */
519 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
520 cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
523 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
524 char sanitized[NLSMAXCCH];
527 /* If src doesn't have a unicode translation, then it
528 wasn't valid UTF-8. In this case, we assume that src
529 is CP-1252 and then try to convert again. But before
530 that, we use a translation table to "sanitize" the
533 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
534 sizeof(sanitized)/sizeof(char));
536 if (cch_sanitized == 0) {
543 cch = MultiByteToWideChar(1252, 0, sanitized,
544 cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
546 /* Well, that didn't work either. Something is very wrong. */
558 wnorm = NormalizeUtf16String(wsrcbuf, cch, dest, &cch_norm);
567 /* The buffer was insufficient */
568 if (dest != NULL && cch_dest > 1) {
/* Allocating variant of the UTF-8 to normalized UTF-16 conversion.  The
   input is decoded into the stack buffer wsrcbuf, strictly as UTF-8 first
   and, on ERROR_NO_UNICODE_TRANSLATION, as sanitized code page 1252.
   NormalizeUtf16String() is then called with a NULL destination so that it
   supplies the buffer, and the resulting count is stored in *pcch_dest.
   The rest of the parameter list is not visible in this view. */
579 cm_normchar_t *cm_NormalizeUtf8StringToUtf16Alloc(const cm_utf8char_t * src, int cch_src,
582 wchar_t wsrcbuf[NLSMAXCCH];
587 /* Get some edge cases out first, so we don't have to worry about
588 cch_src being 0 etc. */
589 if (cch_src == 0 || src == NULL || *src == '\0') {
591 *pcch_dest = ((cch_src != 0)? 1 : 0);
596 cch_src = strlen(src) + 1;
/* Strict decode: MB_ERR_INVALID_CHARS makes invalid UTF-8 fail. */
599 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
600 cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
603 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
604 char sanitized[NLSMAXCCH];
607 /* If src doesn't have a unicode translation, then it
608 wasn't valid UTF-8. In this case, we assume that src
609 is CP-1252 and then try to convert again. But before
610 that, we use a translation table to "sanitize" the
613 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
614 sizeof(sanitized)/sizeof(char));
616 if (cch_sanitized == 0) {
623 cch = MultiByteToWideChar(1252, 0, sanitized,
624 cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
626 /* Well, that didn't work either. Something is very wrong. */
638 wnorm = NormalizeUtf16String(wsrcbuf, cch, NULL, &cch_norm);
647 *pcch_dest = cch_norm;
/* Convert a UTF-8 string to UTF-16 directly into the caller's buffer dest,
   without normalization.  The input is decoded strictly as UTF-8 first; on
   ERROR_NO_UNICODE_TRANSLATION it is passed through sanitize_bytestring()
   and decoded again as code page 1252. */
652 int cm_Utf8ToUtf16(const cm_utf8char_t * src, int cch_src,
653 cm_unichar_t * dest, int cch_dest)
658 cch_src = strlen(src) + 1;
/* Strict decode: MB_ERR_INVALID_CHARS makes invalid UTF-8 fail. */
661 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
662 cch_src * sizeof(char), dest, cch_dest);
665 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
666 char sanitized[NLSMAXCCH];
669 /* If src doesn't have a unicode translation, then it
670 wasn't valid UTF-8. In this case, we assume that src
671 is CP-1252 and then try to convert again. But before
672 that, we use a translation table to "sanitize" the
675 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
676 sizeof(sanitized)/sizeof(char));
678 if (cch_sanitized == 0) {
685 cch = MultiByteToWideChar(1252, 0, sanitized,
686 cch_sanitized * sizeof(char), dest, cch_dest);
688 /* Well, that didn't work either. Something is very wrong. */
/* Allocating UTF-8 to UTF-16 conversion without normalization.  A
   size-query call to MultiByteToWideChar() (NULL destination, zero size)
   determines the length, a buffer of cch + 1 wide characters is
   malloc()ed, and a second call performs the conversion.  The same
   query/convert pair exists for the code page 1252 fallback taken on
   ERROR_NO_UNICODE_TRANSLATION. */
705 cm_unichar_t * cm_Utf8ToUtf16Alloc(const cm_utf8char_t * src, int cch_src, int *pcch_dest)
707 cm_unichar_t * ustr = NULL;
710 if (cch_src == 0 || src == NULL || *src == '\0') {
712 *pcch_dest = ((cch_src != 0)? 1 : 0);
717 cch_src = strlen(src) + 1;
/* Size query for the strict UTF-8 decode. */
720 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
721 cch_src * sizeof(char), NULL, 0);
724 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
725 char sanitized[NLSMAXCCH];
728 /* If src doesn't have a unicode translation, then it
729 wasn't valid UTF-8. In this case, we assume that src
730 is CP-1252 and then try to convert again. But before
731 that, we use a translation table to "sanitize" the
734 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
735 sizeof(sanitized)/sizeof(char));
737 if (cch_sanitized == 0) {
744 cch = MultiByteToWideChar(1252, 0, sanitized,
745 cch_sanitized * sizeof(char), NULL, 0);
747 /* Well, that didn't work either. Something is very wrong. */
/* NOTE(review): no NULL check on either allocation below is visible before
   the buffer is written -- confirm one exists in the lines missing from
   this view. */
754 ustr = malloc((cch + 1) * sizeof(wchar_t));
756 cch = MultiByteToWideChar(1252, 0, sanitized,
757 cch_sanitized * sizeof(char), ustr, cch);
764 ustr = malloc((cch + 1) * sizeof(wchar_t));
766 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
767 cch_src * sizeof(char), ustr, cch);
779 /* \brief Normalize a UTF-8 string.
781 \param[in] src String to normalize.
783 \param[in] cch_src : Count of characters in src. If this value is
784 -1, then src is assumed to be NULL terminated. The translated
785 string will be NULL terminated only if this is -1 or the count
786 includes the terminating NULL.
788 \param[out] adest : Destination string. Only considered valid if
789 \a cch_adest is non-zero.
791 \param[in] cch_adest : Number of characters in the destination
792 string. If this is zero, then the return value is the number
795 \return If \a cch_adest is non-zero, then the return value is the
796 number of bytes stored into adest. If \a cch_adest is zero,
797 then the return value is the number of bytes required. In both
798 cases, the return value is 0 if the call was unsuccessful.
/* Normalize a UTF-8 string and write the result back out as UTF-8 in
   adest.  The input is decoded into the stack buffer wsrcbuf, strictly as
   UTF-8 first and, on ERROR_NO_UNICODE_TRANSLATION, as sanitized code page
   1252.  The UTF-16 text is normalized by NormalizeUtf16String() with a
   NULL destination and then converted with WideCharToMultiByte(). */
800 long cm_NormalizeUtf8String(const char * src, int cch_src,
801 char * adest, int cch_adest)
803 wchar_t wsrcbuf[NLSMAXCCH];
808 /* Get some edge cases out first, so we don't have to worry about
809 cch_src being 0 etc. */
812 } else if (*src == '\0') {
819 cch_src = strlen(src) + 1;
/* Strict decode: MB_ERR_INVALID_CHARS makes invalid UTF-8 fail. */
822 cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
823 cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
826 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
827 char sanitized[NLSMAXCCH];
830 /* If src doesn't have a unicode translation, then it
831 wasn't valid UTF-8. In this case, we assume that src
832 is CP-1252 and then try to convert again. But before
833 that, we use a translation table to "sanitize" the
836 cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
837 sizeof(sanitized)/sizeof(char));
839 if (cch_sanitized == 0) {
846 cch = MultiByteToWideChar(1252, 0, sanitized,
847 cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
849 /* Well, that didn't work either. Something is very wrong. */
861 wnorm = NormalizeUtf16String(wsrcbuf, cch, NULL, &cch_norm);
869 cch = WideCharToMultiByte(CP_UTF8, 0, wnorm,
870 cch_norm, adest, cch_adest * sizeof(char),
879 /*! \brief Case insensitive comparison with specific length
881 \param[in] str1 First string to compare. Assumed to be encoded in UTF-8.
883 \param[in] str2 Second string to compare. Assumed to be encoded in UTF-8.
885 \param[in] n Max byte count.
/* Case-insensitive comparison of two UTF-8 strings limited to n bytes.
   Both inputs are decoded into NLSMAXCCH-character stack buffers and then
   compared with CompareStringW() using LOCALE_INVARIANT and
   NORM_IGNORECASE.  How rv is mapped to the return value is not visible in
   this view. */
888 int cm_strnicmp_utf8(const char * str1, const char * str2, int n)
890 wchar_t wstr1[NLSMAXCCH];
893 wchar_t wstr2[NLSMAXCCH];
896 /* first check for NULL pointers (assume NULL < "") */
902 } else if (str2 == NULL) {
/* NOTE(review): n is passed as the byte count for both inputs.  Confirm
   that n is clamped to each string's length in the lines missing from this
   view, since an explicit count makes the decoder process exactly n bytes,
   and a count that splits a multi-byte sequence fails under
   MB_ERR_INVALID_CHARS. */
906 len1 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str1, n, wstr1, NLSMAXCCH);
914 len2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str2, n, wstr2, NLSMAXCCH);
922 rv = CompareStringW(LOCALE_INVARIANT, NORM_IGNORECASE, wstr1, len1, wstr2, len2);
/* Case-insensitive comparison of two UTF-16 strings limited to len
   characters.  Each string's length is measured with StringCchLengthW()
   bounded by len, and the two are compared with CompareStringW() using
   LOCALE_INVARIANT and NORM_IGNORECASE.
   NOTE(review): StringCchLengthW() fails when no terminator is found
   within len characters; the handling of that failure is not visible in
   this view -- confirm the length falls back to len in that case. */
933 int cm_strnicmp_utf16(const cm_unichar_t * str1, const cm_unichar_t * str2, int len)
939 /* first check for NULL pointers */
945 } else if (str2 == NULL) {
949 if (FAILED(StringCchLengthW(str1, len, &cch1)))
952 if (FAILED(StringCchLengthW(str2, len, &cch2)))
955 rv = CompareStringW(LOCALE_INVARIANT, NORM_IGNORECASE, str1, cch1, str2, cch2);
/* Case-insensitive comparison of two NUL-terminated UTF-16 strings using
   CompareStringW() with LOCALE_INVARIANT and NORM_IGNORECASE; the -1
   lengths tell CompareStringW() to find the terminators itself.  How rv is
   mapped to the return value is not visible in this view. */
966 int cm_stricmp_utf16(const cm_unichar_t * str1, const cm_unichar_t * str2)
970 /* first check for NULL pointers */
976 } else if (str2 == NULL) {
980 rv = CompareStringW(LOCALE_INVARIANT, NORM_IGNORECASE, str1, -1, str2, -1);
/* Lower-case a NUL-terminated UTF-16 string in place using LCMapStringW()
   with LOCALE_INVARIANT.  The mapped length includes the terminator, and
   the same buffer is passed as source and destination.  The handling of rv
   and the return statement are not visible in this view. */
991 cm_unichar_t *cm_strlwr_utf16(cm_unichar_t * str)
996 len = wcslen(str) + 1;
997 rv = LCMapStringW(LOCALE_INVARIANT, LCMAP_LOWERCASE, str, len, str, len);
/* Upper-case a NUL-terminated UTF-16 string in place using LCMapStringW()
   with LOCALE_INVARIANT.  The mapped length includes the terminator, and
   the same buffer is passed as source and destination.  The handling of rv
   and the return statement are not visible in this view. */
1007 cm_unichar_t *cm_strupr_utf16(cm_unichar_t * str)
1012 len = wcslen(str) + 1;
1013 rv = LCMapStringW(LOCALE_INVARIANT, LCMAP_UPPERCASE, str, len, str, len);
/* Case-insensitive comparison of two NUL-terminated UTF-8 strings.  Both
   inputs are decoded into NLSMAXCCH-character stack buffers (the -1 length
   makes the decoder stop at the terminator) and then compared with
   CompareStringW() using LOCALE_INVARIANT and NORM_IGNORECASE.  How decode
   failures and rv are mapped to the return value is not visible in this
   view. */
1024 int cm_stricmp_utf8(const char * str1, const char * str2)
1026 wchar_t wstr1[NLSMAXCCH];
1029 wchar_t wstr2[NLSMAXCCH];
1032 /* first check for NULL pointers */
1038 } else if (str2 == NULL) {
1042 len1 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str1, -1, wstr1, NLSMAXCCH);
1050 len2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str2, -1, wstr2, NLSMAXCCH);
1058 rv = CompareStringW(LOCALE_INVARIANT, NORM_IGNORECASE, wstr1, len1, wstr2, len2);
/* Upper-case a UTF-16 buffer of cbstr bytes.  The text is mapped into the
   stack buffer wstrd (NLSMAXCCH characters) with LCMapStringW() and copied
   back over wstr with StringCbCopyW().
   NOTE(review): the source length is derived from the buffer size in
   bytes, not from the string length, and the LCMapStringW() result is not
   checked in the visible lines -- confirm wstrd is always NUL-terminated
   before the copy back. */
1070 wchar_t * strupr_utf16(wchar_t * wstr, size_t cbstr)
1072 wchar_t wstrd[NLSMAXCCH];
1075 len = cbstr / sizeof(wchar_t);
1076 len = LCMapStringW(LOCALE_INVARIANT, LCMAP_UPPERCASE, wstr, len, wstrd, NLSMAXCCH);
1077 StringCbCopyW(wstr, cbstr, wstrd);
/* Upper-case a NUL-terminated UTF-8 string held in a buffer of cbstr
   bytes.  The text is decoded into wstr, mapped to upper case into wstrd
   with LCMapStringW(), and encoded back into str as UTF-8.  Error handling
   between the three calls is not visible in this view. */
1083 char * strupr_utf8(char * str, size_t cbstr)
1085 wchar_t wstr[NLSMAXCCH];
1086 wchar_t wstrd[NLSMAXCCH];
1089 len = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, -1, wstr, NLSMAXCCH);
1093 len = LCMapStringW(LOCALE_INVARIANT, LCMAP_UPPERCASE, wstr, len, wstrd, NLSMAXCCH);
1095 len = WideCharToMultiByte(CP_UTF8, 0, wstrd, -1, str, cbstr, NULL, FALSE);
/* Return a pointer to the character following the one at c in a UTF-8
   string.  A byte with the top bit clear advances by one; otherwise the
   high nibble of the lead byte selects an advance of 2, 3 or 4 bytes, with
   a fallback of 1.  The case labels of the switch are not visible in this
   view.
   NOTE(review): the multi-byte advances do not look at the bytes being
   skipped, so a truncated sequence at the end of a string would step past
   the terminator -- confirm callers only pass well-formed UTF-8. */
1100 char * char_next_utf8(const char * c)
/* CH reads the current byte as unsigned so the bit tests below are not
   affected by the signedness of char. */
1102 #define CH (*((const unsigned char *)c))
1104 if ((CH & 0x80) == 0)
1105 return (char *) c+1;
1107 switch (CH & 0xf0) {
1110 return (char *) c+2;
1113 return (char *) c+3;
1116 return (char *) c+4;
1119 return (char *) c+1;
/* Return a pointer to the character preceding c in a UTF-8 string.  The
   visible tests distinguish a single-byte character (top bit clear) from
   continuation bytes of the form 10xxxxxx, which the loop walks over.  The
   loop body and the return statements are not visible in this view. */
1126 char * char_prev_utf8(const char * c)
/* CH reads the current byte as unsigned so the bit tests below are not
   affected by the signedness of char. */
1128 #define CH (*((const unsigned char *)c))
1132 if ((CH & 0x80) == 0)
1135 while ((CH & 0xc0) == 0x80)
/*! \brief Advance to the next character in a UTF-16 string

    \param[in] c Pointer to the first code unit of the current character.
        The string is assumed to be NUL terminated, so a lead surrogate
        is always followed by at least one readable code unit.

    \return Pointer to the code unit following the current character:
        \a c + 2 when \a c points at a complete surrogate pair,
        \a c + 1 otherwise.
 */
wchar_t * char_next_utf16(const wchar_t * c)
{
    unsigned short lead = (unsigned short) c[0];

    if (lead >= 0xd800 && lead <= 0xdbff) {
        /* Skip two units only for a complete pair.  A lead surrogate
           followed by anything else (in particular the NUL terminator) is
           stepped over as a single unit so the terminator is never
           skipped. */
        unsigned short trail = (unsigned short) c[1];

        if (trail >= 0xdc00 && trail <= 0xdfff)
            return (wchar_t *) c + 2;
    }

    return (wchar_t *) c + 1;
}
/*! \brief Step back to the previous character in a UTF-16 string

    \param[in] c Pointer just past the character of interest.  At least one
        code unit must precede \a c, and two must precede it when the unit
        immediately before \a c is a trail surrogate.

    \return Pointer to the first code unit of the preceding character:
        two units back when the unit before \a c is a trail surrogate
        (0xdc00-0xdfff), one unit back otherwise.
 */
wchar_t * char_prev_utf16(const wchar_t * c)
{
    const wchar_t * prev = c - 1;
    unsigned short unit = (unsigned short) *prev;

    /* A trail surrogate is the second half of a pair; the character
       starts one unit earlier. */
    if (unit >= 0xdc00 && unit <= 0xdfff)
        return (wchar_t *) (prev - 1);

    return (wchar_t *) prev;
}
/*! \brief Find the start of the character containing a UTF-16 code unit

    \param[in] c Pointer to any code unit of a character.  When it points
        at a trail surrogate, one code unit must precede it.

    \return \a c - 1 when \a c points at a trail surrogate (0xdc00-0xdfff),
        \a c itself otherwise.
 */
wchar_t * char_this_utf16(const wchar_t * c)
{
    unsigned short unit = (unsigned short) *c;
    int on_trail = (unit >= 0xdc00 && unit <= 0xdfff);

    /* A trail surrogate is the second half of a pair, so the character
       begins at the preceding unit. */
    return (wchar_t *) (on_trail ? c - 1 : c);
}