src/WINNT/afsd/cm_nls.c

   1 /*
   2  * Copyright (c) 2008 Secure Endpoints Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person
   5  * obtaining a copy of this software and associated documentation
   6  * files (the "Software"), to deal in the Software without
   7  * restriction, including without limitation the rights to use, copy,
   8  * modify, merge, publish, distribute, sublicense, and/or sell copies
   9  * of the Software, and to permit persons to whom the Software is
  10  * furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include <windows.h>
  26 #include <stdlib.h>
  27 #include <wchar.h>
  28 #include <strsafe.h>
  29 #include <stdio.h>
  30 #include <errno.h>
  31
  32 #include "cm_nls.h"
  33
  34 #ifdef DEBUG_UNICODE
  35 #include <assert.h>
  36 #endif
  37
  38 /* This is part of the Microsoft Internationalized Domain Name
  39    Mitigation APIs. */
  40 #include <normalization.h>
  41
  42 /* TODO: All the normalization and conversion code should NUL
  43    terminate destination strings. */
  44
  45 int
  46 (WINAPI *pNormalizeString)( __in NORM_FORM NormForm,
  47                             __in_ecount(cwSrcLength) LPCWSTR lpSrcString,
  48                             __in int cwSrcLength,
  49                             __out_ecount(cwDstLength) LPWSTR lpDstString,
  50                             __in int cwDstLength ) = NULL;
  51
  52 BOOL
  53 (WINAPI *pIsNormalizedString)( __in NORM_FORM NormForm,
  54                                __in_ecount(cwLength) LPCWSTR lpString,
  55                                __in int cwLength ) = NULL;
  56
  57
  58 #define NLSDLLNAME "Normaliz.dll"
  59 #define NLSMAXCCH  1024
  60 #define NLSERRCCH  8
  61
  62 #define AFS_NORM_FORM NormalizationC
  63
  64 static LCID nls_lcid = LOCALE_INVARIANT;
  65
  66 static int nls_init = 0;
  67
  68 static BOOL
  69 is_windows_2000 (void)
  70 {
  71    static BOOL fChecked = FALSE;
  72    static BOOL fIsWin2K = FALSE;
  73
  74    if (!fChecked)
  75    {
  76        OSVERSIONINFO Version;
  77
  78        memset (&Version, 0x00, sizeof(Version));
  79        Version.dwOSVersionInfoSize = sizeof(Version);
  80
  81        if (GetVersionEx (&Version))
  82        {
  83            if (Version.dwPlatformId == VER_PLATFORM_WIN32_NT &&
  84                 Version.dwMajorVersion >= 5)
  85                fIsWin2K = TRUE;
  86        }
  87        fChecked = TRUE;
  88    }
  89
  90    return fIsWin2K;
  91 }
  92
  93 long cm_InitNormalization(void)
  94 {
  95     HMODULE h_Nls;
  96
  97     if (pNormalizeString != NULL)
  98         return 0;
  99
 100     h_Nls = LoadLibrary(NLSDLLNAME);
 101     if (h_Nls == INVALID_HANDLE_VALUE) {
 102         return 1;
 103     }
 104
 105     pNormalizeString =
 106         (int (WINAPI *)( NORM_FORM, LPCWSTR,
 107                          int, LPWSTR, int))
 108         GetProcAddress(h_Nls, "NormalizeString");
 109
 110     pIsNormalizedString =
 111         (BOOL
 112          (WINAPI *)( NORM_FORM, LPCWSTR, int ))
 113         GetProcAddress(h_Nls, "IsNormalizedString");
 114
 115     if (is_windows_2000())
 116         nls_lcid = MAKELCID(MAKELANGID(LANG_ENGLISH, SUBLANG_ENGLISH_US), SORT_DEFAULT);
 117
 118     nls_init = 1;
 119
 120     return (pNormalizeString && pIsNormalizedString);
 121 }
 122
 123 /* \brief Normalize a UTF-16 string.
 124
 125    If the supplied destination buffer is insufficient or NULL, then a
 126    new buffer will be allocated to hold the normalized string.
 127
 128    \param[in] src : Source UTF-16 string.  Length is specified in
 129        cch_src.
 130
 131    \param[in] cch_src : The character count in cch_src is assumed to
 132        be tight and include the terminating NULL character if there is
 133        one.  If the NULL is absent, the resulting string will not be
 134        NULL terminated.
 135
 136    \param[out] ext_dest : The destination buffer.  Can be NULL, in
 137        which case *pcch_dest MUST be 0.
 138
 139    \param[in,out] pcch_dest : On entry *pcch_dest contains a count of
 140        characters in the destination buffer.  On exit, it will contain
 141        a count of characters that were copied to the destination
 142        buffer.
 143
 144    Returns a pointer to the buffer containing the normalized string or
 145    NULL if the call was unsuccessful.  If the returned destination
 146    buffer is different from the supplied buffer and non-NULL, it
 147    should be freed using free().
 148 */
 149 static wchar_t *
 150 NormalizeUtf16String(const wchar_t * src, int cch_src, wchar_t * ext_dest, int *pcch_dest)
 151 {
 152     if (!nls_init)
 153         cm_InitNormalization();
 154
 155 #ifdef DEBUG_UNICODE
 156     assert (pNormalizeString != NULL && pIsNormalizedString != NULL);
 157 #endif
 158
 159     if (cch_src == -1)
 160         cch_src = wcslen(src) + 1;
 161
 162     if ((pIsNormalizedString && (*pIsNormalizedString)(AFS_NORM_FORM, src, cch_src)) ||
 163         (!pNormalizeString)) {
 164
 165         if (ext_dest == NULL || *pcch_dest < cch_src) {
 166             ext_dest = malloc(cch_src * sizeof(wchar_t));
 167             *pcch_dest = cch_src;
 168         }
 169
 170         /* No need to or unable to normalize.  Just copy the string.
 171            Note that the string is not NUL terminated if the source
 172            string is not NUL terminated. */
 173
 174         if (ext_dest) {
 175             memcpy(ext_dest, src, cch_src * sizeof(wchar_t));
 176             *pcch_dest = cch_src;
 177         } else {
 178             *pcch_dest = 0;
 179         }
 180         return ext_dest;
 181
 182     } else {
 183
 184         int rv;
 185         DWORD gle;
 186         int tries = 10;
 187         wchar_t * dest;
 188         int cch_dest = *pcch_dest;
 189
 190         dest = ext_dest;
 191
 192         while (tries-- > 0) {
 193
 194             rv = (*pNormalizeString)(AFS_NORM_FORM, src, cch_src, dest, cch_dest);
 195
 196             if (rv <= 0 && (gle = GetLastError()) != ERROR_SUCCESS) {
 197                 if (gle == ERROR_INSUFFICIENT_BUFFER) {
 198
 199                     /* The buffer wasn't big enough.  We are going to
 200                        try allocating one. */
 201
 202                     cch_dest = (-rv) + NLSERRCCH;
 203                     goto cont;
 204
 205                 } else {
 206                     /* Something else is wrong */
 207                     break;
 208                 }
 209
 210             } else if (rv < 0) { /* rv < 0 && gle == ERROR_SUCCESS */
 211
 212                 /* Technically not one of the expected outcomes */
 213                 break;
 214
 215             } else {            /* rv > 0 || (rv == 0 && gle == ERROR_SUCCESS) */
 216
 217                 /* Possibly succeeded */
 218
 219                 if (rv == 0) { /* Succeeded and the return string is empty */
 220                     *pcch_dest = 0;
 221                     return dest;
 222                 }
 223
 224                 if (cch_dest == 0) {
 225                     /* Nope.  We only calculated the required size of the buffer */
 226
 227                     cch_dest = rv + NLSERRCCH;
 228                     goto cont;
 229                 }
 230
 231                 *pcch_dest = rv;
 232                 if (cch_dest > rv)
 233                     dest[rv] = 0;
 234                 else {
 235                     /* Can't NUL terminate */
 236                     cch_dest = max(rv,cch_dest) + NLSERRCCH;
 237                     goto cont;
 238                 }
 239
 240                 /* Success! */
 241                 return dest;
 242             }
 243
 244         cont:
 245             if (dest != ext_dest && dest)
 246                 free(dest);
 247             dest = malloc(cch_dest * sizeof(wchar_t));
 248         }
 249
 250         /* Failed */
 251
 252         if (dest != ext_dest && dest)
 253             free(dest);
 254
 255         *pcch_dest = 0;
 256         return NULL;
 257     }
 258 }
 259
 260 /*! \brief Normalize a Unicode string into a newly allocated buffer
 261
 262   The input string will be normalized using NFC.
 263
 264   \param[in] s UTF-16 string to be normalized.
 265
 266   \param[in] cch_src The number of characters in the input string.  If
 267       this is -1, then the input string is assumed to be NUL
 268       terminated.
 269
 270   \param[out] pcch_dest Receives the number of characters copied to
 271       the output buffer.  Note that the character count is the number
 272       of wchar_t characters copied, and not the count of Unicode code
 273       points.  This includes the terminating NUL if cch_src was -1 or
 274       included the terminating NUL.
 275
 276   \return A newly allocated buffer holding the normalized string or
 277       NULL if the call failed.
 278  */
 279 cm_normchar_t * cm_NormalizeStringAlloc(const cm_unichar_t * s, int cch_src, int *pcch_dest)
 280 {
 281     int cch_dest = 0;
 282     cm_normchar_t * r;
 283
 284     if (!nls_init)
 285         cm_InitNormalization();
 286
 287     if (s == NULL || cch_src == 0 || *s == L'\0') {
 288         if (pcch_dest)
 289             *pcch_dest = ((cch_src != 0)? 1: 0);
 290         return wcsdup(L"");
 291     }
 292
 293     r = NormalizeUtf16String(s, cch_src, NULL, &cch_dest);
 294
 295     if (pcch_dest)
 296         *pcch_dest = cch_dest;
 297
 298     return r;
 299 }
 300
 301 int cm_NormalizeString(const cm_unichar_t * s, int cch_src,
 302                        cm_normchar_t * dest, int cch_dest)
 303 {
 304     int tcch = cch_dest;
 305     cm_normchar_t * r;
 306
 307     if (!nls_init)
 308         cm_InitNormalization();
 309
 310     r = NormalizeUtf16String(s, cch_src, dest, &tcch);
 311
 312     if (r != dest) {
 313         /* The supplied buffer was insufficient */
 314         free(r);
 315         return 0;
 316     } else {
 317         return tcch;
 318     }
 319 }
 320
 321 /*! \brief Convert a UTF-16 string to a UTF-8 string using a newly allocated buffer
 322
 323   \param[in] s UTF-16 source string
 324
 325   \param[in] cch_src Number of characters in \a s. This can be set to
 326       -1 if \a s is NUL terminated.
 327
 328   \param[out] pcch_dest Receives a count of characters that were
 329       copied to the target buffer.
 330
 331   \return A newly allocated buffer holding the UTF-8 string.
 332
 333  */
 334 cm_utf8char_t * cm_Utf16ToUtf8Alloc(const cm_unichar_t * s, int cch_src, int *pcch_dest)
 335 {
 336     int cch_dest;
 337     cm_utf8char_t * dest;
 338
 339     if (!nls_init)
 340         cm_InitNormalization();
 341
 342     if (s == NULL || cch_src == 0 || *s == L'\0') {
 343         if (pcch_dest)
 344             *pcch_dest = ((cch_src != 0)?1:0);
 345         return strdup("");
 346     }
 347
 348     cch_dest = WideCharToMultiByte(CP_UTF8, 0, s, cch_src, NULL, 0, NULL, FALSE);
 349
 350     if (cch_dest == 0) {
 351         if (pcch_dest)
 352             *pcch_dest = cch_dest;
 353         return NULL;
 354     }
 355
 356     dest = malloc((cch_dest + 1) * sizeof(cm_utf8char_t));
 357
 358     WideCharToMultiByte(CP_UTF8, 0, s, cch_src, dest, cch_dest, NULL, FALSE);
 359     dest[cch_dest] = 0;
 360
 361     if (pcch_dest)
 362         *pcch_dest = cch_dest;
 363
 364     return dest;
 365 }
 366
 367 int cm_Utf16ToUtf8(const cm_unichar_t * src, int cch_src,
 368                    cm_utf8char_t * dest, int cch_dest)
 369 {
 370     if (!nls_init)
 371         cm_InitNormalization();
 372
 373     return WideCharToMultiByte(CP_UTF8, 0, src, cch_src, dest, cch_dest, NULL, FALSE);
 374 }
 375
 376 int cm_Utf16ToUtf16(const cm_unichar_t * src, int cch_src,
 377                     cm_unichar_t * dest, int cch_dest)
 378 {
 379     if (!nls_init)
 380         cm_InitNormalization();
 381
 382     if (cch_src == -1) {
 383         StringCchCopyW(dest, cch_dest, src);
 384         return wcslen(dest) + 1;
 385     } else {
 386         int cch_conv = min(cch_src, cch_dest);
 387         memcpy(dest, src, cch_conv * sizeof(cm_unichar_t));
 388         return cch_conv;
 389     }
 390 }
 391
 392 /* \brief Normalize a UTF-16 string into a UTF-8 string.
 393
 394    \param[in] src : Source string.
 395
 396    \param[in] cch_src : Count of characters in src. If the count includes the
 397        NULL terminator, then the resulting string will be NULL
 398        terminated.  If it is -1, then src is assumed to be NULL
 399        terminated.
 400
 401    \param[out] adest : Destination buffer.
 402
 403    \param[in] cch_adest : Number of characters in the destination buffer.
 404
 405    Returns the number of characters stored into cch_adest. This will
 406    include the terminating NULL if cch_src included the terminating
 407    NULL or was -1.  If this is 0, then the operation was unsuccessful.
 408  */
 409 long cm_NormalizeUtf16StringToUtf8(const wchar_t * src, int cch_src,
 410                                    char * adest, int cch_adest)
 411 {
 412     if (!nls_init)
 413         cm_InitNormalization();
 414
 415     if (cch_src < 0) {
 416         size_t cch;
 417
 418         if (FAILED(StringCchLengthW(src, NLSMAXCCH, &cch)))
 419             return E2BIG;
 420
 421         cch_src = cch+1;
 422     }
 423
 424     {
 425         wchar_t nbuf[NLSMAXCCH];
 426         wchar_t * normalized;
 427         int cch_norm = NLSMAXCCH;
 428
 429         normalized = NormalizeUtf16String(src, cch_src, nbuf, &cch_norm);
 430         if (normalized) {
 431             cch_adest = WideCharToMultiByte(CP_UTF8, 0, normalized, cch_norm,
 432                                             adest, cch_adest, NULL, 0);
 433
 434             if (normalized != nbuf && normalized)
 435                 free(normalized);
 436
 437             return cch_adest;
 438
 439         } else {
 440
 441             return 0;
 442
 443         }
 444     }
 445 }
 446
 447 #define ESCVAL 0x1000
 448 #define Esc(c) (ESCVAL + (short)(c))
 449 #define IS_ESCAPED(c) (((c) & ESCVAL) == ESCVAL)
 450
 451 /* \brief Character sanitization map for CP-1252
 452
 453    The following map indicates which characters should be escaped in
 454    the CP-1252 character map.  Characters that are documented as
 455    illegal characters in a file name are marked as escaped.  Escaped
 456    characters are marked using the ::Esc macro defined above.  The
 457    following exceptions apply:
 458
 459    - Path delimeters '\\' and '/' are NOT escaped because the
 460      sanitization map applies to paths.  While those characters are
 461      illegal in filenames, they are legal in paths.
 462
 463    - Wildcard characters '*' and '?' ARE escaped.  The document
 464      referred below does not specify these characters as invalid.
 465      Since no other escape mechanism exists, names containing
 466      wildcards are indistinguishable from actual wildcards used in SMB
 467      requests.
 468
 469    - Reserved names are not and cannot be represented in this map.
 470      Reserved names are :
 471
 472      CON, PRN, AUX, NUL, COM1, COM2, COM3, COM4, COM5, COM6, COM7,
 473      COM8, COM9, LPT1, LPT2, LPT3, LPT4, LPT5, LPT6, LPT7, LPT8, LPT9,
 474      CLOCK$
 475
 476    - Characters 0x80, 0x81, 0x8d, 0x8e, 0x8f, 0x90, 0x9d, 0x9e, 0x9f
 477      are also escaped because they are unused in CP-1252 and hence
 478      cannot be convered to a Unicode string.
 479
 480      Reserved names with extensions are also invalid. (i.e. NUL.txt)
 481
 482    \note The only bit we are actually interested in from the following
 483      table is the ESCVAL bit.  However, the characters themselves are
 484      included for ease of maintenance.
 485
 486    \see "Naming a File" topic in the Windows SDK.
 487  */
 488 static const short sanitized_escapes_1252[] = {
 489     Esc(0x00),Esc(0x01),Esc(0x02),Esc(0x03),Esc(0x04),Esc(0x05),Esc(0x06),Esc(0x07),
 490     Esc(0x08),Esc(0x09),Esc(0x0a),Esc(0x0b),Esc(0x0c),Esc(0x0d),Esc(0x0e),Esc(0x0f),
 491     Esc(0x10),Esc(0x11),Esc(0x12),Esc(0x13),Esc(0x14),Esc(0x15),Esc(0x16),Esc(0x17),
 492     Esc(0x18),Esc(0x19),Esc(0x1a),Esc(0x1b),Esc(0x1c),Esc(0x1d),Esc(0x1e),Esc(0x1f),
 493     ' ','!',Esc('"'),'#','$','%','&','\'','(',')',Esc('*'),'+',',','-','.','/',
 494     '0','1','2','3','4','5','6','7','8','9',Esc(':'),';',Esc('<'),'=',Esc('>'),Esc('?'),
 495     '@','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
 496     'P','Q','R','S','T','U','V','W','X','Y','Z','[','\\',']','^','_',
 497     '`','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o',
 498     'p','q','r','s','t','u','v','w','x','y','z','{',Esc('|'),'}','~',Esc(0x7f),
 499     Esc(0x80),Esc(0x81),0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,Esc(0x8d),Esc(0x8e),Esc(0x8f),
 500     Esc(0x90),0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,Esc(0x9d),Esc(0x9e),0x9f,
 501     0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
 502     0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
 503     0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
 504     0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf,
 505     0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
 506     0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff
 507 };
 508
 509 static int sanitize_bytestring(const char * src, int cch_src,
 510                                char * odest, int cch_dest)
 511 {
 512     char * dest = odest;
 513
 514     if (!nls_init)
 515         cm_InitNormalization();
 516
 517     while (cch_src > 0 && *src && cch_dest > 0) {
 518
 519         unsigned short rc;
 520
 521         rc = sanitized_escapes_1252[*src];
 522         if (IS_ESCAPED(rc)) {
 523             static const char hex[] =
 524                 {'0','1','2','3','4','5','6','7',
 525                  '8','9','a','b','c','d','e','f'};
 526
 527             if (cch_dest < 3) {
 528                 *dest++ = '\0';
 529                 return 0;
 530             }
 531
 532             *dest++ = '%';
 533             *dest++ = hex[(((int)*src) >> 4) & 0x0f];
 534             *dest++ = hex[(((int)*src) & 0x0f)];
 535             cch_dest -= 3;
 536
 537         } else {
 538             *dest++ = *src;
 539             cch_dest--;
 540         }
 541
 542         cch_src--;
 543         src++;
 544     }
 545
 546     if (cch_src > 0 && cch_dest > 0) {
 547         *dest++ = '\0';
 548     }
 549
 550     return (int)(dest - odest);
 551 }
 552
 553 static int sanitize_utf16char(wchar_t c, wchar_t ** pdest, size_t * pcch)
 554 {
 555     if (*pcch >= 6) {
 556         StringCchPrintfExW(*pdest, *pcch, pdest, pcch, 0, L"%%%04x", (int) c);
 557         return 1;
 558     } else {
 559         return 0;
 560     }
 561 }
 562
 563 static int sanitize_utf16string(const wchar_t * src, size_t cch_src,
 564                                 wchar_t * dest, size_t cch_dest)
 565 {
 566     int cch_dest_o = cch_dest;
 567
 568     if (dest == NULL) {
 569         /* only estimating */
 570         for (cch_dest = 0; cch_src > 0;) {
 571             if (*src >= 0xd800 && *src < 0xdc00) {
 572                 if (cch_src <= 1 || src[1] < 0xdc00 || src[1] > 0xdfff) {
 573                     /* dangling surrogate */
 574                     src++;
 575                     cch_src --;
 576                     cch_dest += 5;
 577                 } else {
 578                     /* surrogate pair */
 579                     src += 2;
 580                     cch_src -= 2;
 581                     cch_dest += 2;
 582                 }
 583             } else if (*src >= 0xdc00 && *src <= 0xdfff) {
 584                 /* dangling surrogate */
 585                 src++;
 586                 cch_src --;
 587                 cch_dest += 5;
 588             } else {
 589                 /* normal char */
 590                 src++; cch_src --;
 591                 cch_dest++;
 592             }
 593         }
 594
 595         return cch_dest;
 596     }
 597
 598     while (cch_src > 0 && cch_dest > 0) {
 599         if (*src >= 0xd800 && *src < 0xdc00) {
 600             if (cch_src <= 1 || src[1] < 0xdc00 || src[1] > 0xdfff) {
 601                 if (!sanitize_utf16char(*src++, &dest, &cch_dest))
 602                     return 0;
 603                 cch_src--;
 604             } else {
 605                 /* found a surrogate pair */
 606                 *dest++ = *src++;
 607                 *dest++ = *src++;
 608                 cch_dest -= 2; cch_src -= 2;
 609             }
 610         } else if (*src >= 0xdc00 && *src <= 0xdfff) {
 611             if (!sanitize_utf16char(*src++, &dest, &cch_dest))
 612                 return 0;
 613             cch_src--;
 614         } else {
 615             *dest++ = *src++;
 616             cch_dest--; cch_src--;
 617         }
 618     }
 619
 620     return (cch_src == 0) ? cch_dest_o - cch_dest : 0;
 621 }
 622
 623 #undef Esc
 624 #undef IS_ESCAPED
 625 #undef ESCVAL
 626
 627 long cm_NormalizeUtf8StringToUtf16(const char * src, int cch_src,
 628                                    wchar_t * dest, int cch_dest)
 629 {
 630     wchar_t wsrcbuf[NLSMAXCCH];
 631     wchar_t *wnorm;
 632     int cch;
 633     int cch_norm;
 634
 635     if (!nls_init)
 636         cm_InitNormalization();
 637
 638     /* Get some edge cases out first, so we don't have to worry about
 639        cch_src being 0 etc. */
 640     if (cch_src == 0) {
 641         return 0;
 642     } else if (*src == '\0') {
 643         if (cch_dest >= 1)
 644             *dest = L'\0';
 645         return 1;
 646     }
 647
 648     if (dest && cch_dest > 0) {
 649         dest[0] = L'\0';
 650     }
 651
 652     if (cch_src == -1) {
 653         cch_src = strlen(src) + 1;
 654     }
 655
 656     cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 657                               cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
 658
 659     if (cch != 0 && !cm_is_valid_utf16(wsrcbuf, cch)) {
 660         wchar_t wsanitized[NLSMAXCCH];
 661
 662         /* We successfully converted, but the resulting UTF-16 string
 663            has dangling surrogates.  We should try and escape those
 664            next.  */
 665         cch = sanitize_utf16string(wsrcbuf, cch, wsanitized, NLSMAXCCH);
 666         if (cch != 0) {
 667             memcpy(wsrcbuf, wsanitized, cch * sizeof(wchar_t));
 668         }
 669     }
 670
 671     if (cch == 0) {
 672         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
 673             char sanitized[NLSMAXCCH];
 674             int cch_sanitized;
 675
 676             /* If src doesn't have a unicode translation, then it
 677                wasn't valid UTF-8.  In this case, we assume that src
 678                is CP-1252 and then try to convert again.  But before
 679                that, we use a translation table to "sanitize" the
 680                input. */
 681
 682             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
 683                                                 sizeof(sanitized)/sizeof(char));
 684
 685             if (cch_sanitized == 0) {
 686 #ifdef DEBUG_UNICODE
 687                 DebugBreak();
 688 #endif
 689                 return 0;
 690             }
 691
 692             cch = MultiByteToWideChar(1252, 0, sanitized,
 693                                       cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
 694             if (cch == 0) {
 695                 /* Well, that didn't work either.  Something is very wrong. */
 696 #ifdef DEBUG_UNICODE
 697                 DebugBreak();
 698 #endif
 699                 return 0;
 700             }
 701         } else {
 702             return 0;
 703         }
 704     }
 705
 706     cch_norm = cch_dest;
 707     wnorm = NormalizeUtf16String(wsrcbuf, cch, dest, &cch_norm);
 708     if (wnorm == NULL) {
 709 #ifdef DEBUG_UNICODE
 710         DebugBreak();
 711 #endif
 712         return 0;
 713     }
 714
 715     if (wnorm != dest) {
 716         /* The buffer was insufficient */
 717         if (dest != NULL && cch_dest > 1) {
 718             *dest = L'\0';
 719             cch_norm = 0;
 720         }
 721
 722         free(wnorm);
 723     }
 724
 725     return cch_norm;
 726 }
 727
 728 cm_normchar_t *cm_NormalizeUtf8StringToUtf16Alloc(const cm_utf8char_t * src, int cch_src,
 729                                                   int *pcch_dest)
 730 {
 731     wchar_t wsrcbuf[NLSMAXCCH];
 732     wchar_t *wnorm;
 733     int cch;
 734     int cch_norm;
 735
 736     if (!nls_init)
 737         cm_InitNormalization();
 738
 739     /* Get some edge cases out first, so we don't have to worry about
 740        cch_src being 0 etc. */
 741     if (cch_src == 0 || src == NULL || *src == '\0') {
 742         if (pcch_dest)
 743             *pcch_dest = ((cch_src != 0)? 1 : 0);
 744         return wcsdup(L"");
 745     }
 746
 747     if (cch_src == -1) {
 748         cch_src = strlen(src) + 1;
 749     }
 750
 751     cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 752                               cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
 753
 754     if (cch != 0 && !cm_is_valid_utf16(wsrcbuf, cch)) {
 755         wchar_t wsanitized[NLSMAXCCH];
 756
 757         /* We successfully converted, but the resulting UTF-16 string
 758            has dangling surrogates.  We should try and escape those
 759            next.  */
 760         cch = sanitize_utf16string(wsrcbuf, cch, wsanitized, NLSMAXCCH);
 761         if (cch != 0) {
 762             memcpy(wsrcbuf, wsanitized, cch * sizeof(wchar_t));
 763         }
 764     }
 765
 766     if (cch == 0) {
 767         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
 768             char sanitized[NLSMAXCCH];
 769             int cch_sanitized;
 770
 771             /* If src doesn't have a unicode translation, then it
 772                wasn't valid UTF-8.  In this case, we assume that src
 773                is CP-1252 and then try to convert again.  But before
 774                that, we use a translation table to "sanitize" the
 775                input. */
 776
 777             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
 778                                                 sizeof(sanitized)/sizeof(char));
 779
 780             if (cch_sanitized == 0) {
 781 #ifdef DEBUG_UNICODE
 782                 DebugBreak();
 783 #endif
 784                 return NULL;
 785             }
 786
 787             cch = MultiByteToWideChar(1252, 0, sanitized,
 788                                       cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
 789             if (cch == 0) {
 790                 /* Well, that didn't work either.  Something is very wrong. */
 791 #ifdef DEBUG_UNICODE
 792                 DebugBreak();
 793 #endif
 794                 return NULL;
 795             }
 796         } else {
 797             return NULL;
 798         }
 799     }
 800
 801     cch_norm = 0;
 802     wnorm = NormalizeUtf16String(wsrcbuf, cch, NULL, &cch_norm);
 803     if (wnorm == NULL) {
 804 #ifdef DEBUG_UNICODE
 805         DebugBreak();
 806 #endif
 807         return NULL;
 808     }
 809
 810     if (pcch_dest)
 811         *pcch_dest = cch_norm;
 812
 813     return wnorm;
 814 }
 815
 816 int cm_Utf8ToUtf16(const cm_utf8char_t * src, int cch_src,
 817                    cm_unichar_t * dest, int cch_dest)
 818 {
 819     int cch;
 820
 821     if (cch_dest >= 1 && dest != NULL) {
 822         dest[0] = L'\0';
 823     }
 824
 825     if (!nls_init)
 826         cm_InitNormalization();
 827
 828     if (cch_src == -1) {
 829         cch_src = strlen(src) + 1;
 830     }
 831
 832     cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 833                               cch_src * sizeof(char), dest, cch_dest);
 834
 835     if (cch != 0 && !cm_is_valid_utf16(dest, cch)) {
 836         wchar_t wsanitized[NLSMAXCCH];
 837
 838         cch = sanitize_utf16string(dest, cch, wsanitized, NLSMAXCCH);
 839         if (cch != 0) {
 840             memcpy(dest, wsanitized, cch * sizeof(wchar_t));
 841         }
 842     }
 843
 844     if (cch == 0) {
 845         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
 846             char sanitized[NLSMAXCCH];
 847             int cch_sanitized;
 848
 849             /* If src doesn't have a unicode translation, then it
 850                wasn't valid UTF-8.  In this case, we assume that src
 851                is CP-1252 and then try to convert again.  But before
 852                that, we use a translation table to "sanitize" the
 853                input. */
 854
 855             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
 856                                                 sizeof(sanitized)/sizeof(char));
 857
 858             if (cch_sanitized == 0) {
 859 #ifdef DEBUG_UNICODE
 860                 DebugBreak();
 861 #endif
 862                 return 0;
 863             }
 864
 865             cch = MultiByteToWideChar(1252, 0, sanitized,
 866                                       cch_sanitized * sizeof(char), dest, cch_dest);
 867             if (cch == 0) {
 868                 /* Well, that didn't work either.  Something is very wrong. */
 869 #ifdef DEBUG_UNICODE
 870                 DebugBreak();
 871 #endif
 872                 return 0;
 873             } else {
 874                 return cch;
 875             }
 876
 877         } else {
 878             return 0;
 879         }
 880     } else {
 881         return cch;
 882     }
 883 }
 884
 885 cm_unichar_t  * cm_Utf8ToUtf16Alloc(const cm_utf8char_t * src, int cch_src, int *pcch_dest)
 886 {
 887     cm_unichar_t * ustr = NULL;
 888     int cch;
 889
 890     if (!nls_init)
 891         cm_InitNormalization();
 892
 893     if (cch_src == 0 || src == NULL || *src == '\0') {
 894         if (pcch_dest)
 895             *pcch_dest = ((cch_src != 0)? 1 : 0);
 896         return wcsdup(L"");
 897     }
 898
 899     if (cch_src == -1) {
 900         cch_src = strlen(src) + 1;
 901     }
 902
 903     cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 904                               cch_src * sizeof(char), NULL, 0);
 905
 906     if (cch == 0) {
 907         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
 908             char sanitized[NLSMAXCCH];
 909             int cch_sanitized;
 910
 911             /* If src doesn't have a unicode translation, then it
 912                wasn't valid UTF-8.  In this case, we assume that src
 913                is CP-1252 and then try to convert again.  But before
 914                that, we use a translation table to "sanitize" the
 915                input. */
 916
 917             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
 918                                                 sizeof(sanitized)/sizeof(char));
 919
 920             if (cch_sanitized == 0) {
 921 #ifdef DEBUG_UNICODE
 922                 DebugBreak();
 923 #endif
 924                 return NULL;
 925             }
 926
 927             cch = MultiByteToWideChar(1252, 0, sanitized,
 928                                       cch_sanitized * sizeof(char), NULL, 0);
 929             if (cch == 0) {
 930                 /* Well, that didn't work either.  Something is very wrong. */
 931 #ifdef DEBUG_UNICODE
 932                 DebugBreak();
 933 #endif
 934                 return NULL;
 935             }
 936
 937             ustr = malloc((cch + 1) * sizeof(wchar_t));
 938
 939             cch = MultiByteToWideChar(1252, 0, sanitized,
 940                                       cch_sanitized * sizeof(char), ustr, cch);
 941             ustr[cch] = 0;
 942         } else {
 943             return NULL;
 944         }
 945     } else {
 946
 947         ustr = malloc((cch + 1) * sizeof(wchar_t));
 948
 949         cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 950                                   cch_src * sizeof(char), ustr, cch);
 951         ustr[cch] = 0;
 952
 953         if (!cm_is_valid_utf16(ustr, cch)) {
 954             cm_unichar_t * us = NULL;
 955             int cch_s;
 956
 957             cch_s = sanitize_utf16string(ustr, cch, NULL, 0);
 958             if (cch_s != 0) {
 959                 us = malloc(cch_s * sizeof(wchar_t));
 960                 cch_s = sanitize_utf16string(ustr, cch, us, cch_s);
 961             }
 962
 963             if (cch_s != 0) {
 964                 free(ustr);
 965                 ustr = us;
 966                 us = NULL;
 967             } else {
 968                 if (us)
 969                     free(us);
 970                 free(ustr);
 971                 ustr = NULL;
 972             }
 973         }
 974     }
 975
 976     if (pcch_dest)
 977         *pcch_dest = cch;
 978
 979     return ustr;
 980 }
 981
 982
 983
 984 /* \brief Normalize a UTF-8 string.
 985
 986    \param[in] src String to normalize.
 987
 988    \param[in] cch_src : Count of characters in src.  If this value is
 989        -1, then src is assumed to be NULL terminated.  The translated
 990        string will be NULL terminated only if this is -1 or the count
 991        includes the terminating NULL.
 992
 993    \param[out] adest : Destination string.  Only considered valid if
 994        \a cch_adest is non-zero.
 995
 996    \param[in] cch_adest : Number of characters in the destination
 997        string.  If this is zero, then the return value is the number
 998        of bytes required.
 999
1000    \return If \a cch_adest is non-zero, then the return value is the
1001        number of bytes stored into adest.  If \a cch_adest is zero,
1002        then the return value is the number of bytes required.  In both
1003        cases, the return value is 0 if the call was unsuccessful.
1004  */
1005 long cm_NormalizeUtf8String(const char * src, int cch_src,
1006                             char * adest, int cch_adest)
1007 {
1008     wchar_t wsrcbuf[NLSMAXCCH];
1009     wchar_t *wnorm;
1010     int cch;
1011     int cch_norm;
1012
1013     if (!nls_init)
1014         cm_InitNormalization();
1015
1016     /* Get some edge cases out first, so we don't have to worry about
1017        cch_src being 0 etc. */
1018     if (cch_src == 0) {
1019         return 0;
1020     } else if (*src == '\0') {
1021         if (cch_adest >= 1)
1022             *adest = '\0';
1023         return 1;
1024     }
1025
1026     if (cch_src == -1) {
1027         cch_src = strlen(src) + 1;
1028     }
1029
1030     cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
1031                               cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
1032
1033     if (cch != 0 && !cm_is_valid_utf16(wsrcbuf, cch)) {
1034         wchar_t wsanitized[NLSMAXCCH];
1035
1036         cch = sanitize_utf16string(wsrcbuf, cch, wsanitized, NLSMAXCCH);
1037         if (cch != 0) {
1038             memcpy(wsrcbuf, wsanitized, cch * sizeof(wchar_t));
1039         }
1040     }
1041
1042     if (cch == 0) {
1043         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
1044             char sanitized[NLSMAXCCH];
1045             int cch_sanitized;
1046
1047             /* If src doesn't have a unicode translation, then it
1048                wasn't valid UTF-8.  In this case, we assume that src
1049                is CP-1252 and then try to convert again.  But before
1050                that, we use a translation table to "sanitize" the
1051                input. */
1052
1053             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
1054                                                 sizeof(sanitized)/sizeof(char));
1055
1056             if (cch_sanitized == 0) {
1057 #ifdef DEBUG_UNICODE
1058                 DebugBreak();
1059 #endif
1060                 return 0;
1061             }
1062
1063             cch = MultiByteToWideChar(1252, 0, sanitized,
1064                                       cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
1065             if (cch == 0) {
1066                 /* Well, that didn't work either.  Something is very wrong. */
1067 #ifdef DEBUG_UNICODE
1068                 DebugBreak();
1069 #endif
1070                 return 0;
1071             }
1072         } else {
1073             return 0;
1074         }
1075     }
1076
1077     cch_norm = 0;
1078     wnorm = NormalizeUtf16String(wsrcbuf, cch, NULL, &cch_norm);
1079     if (wnorm == NULL) {
1080 #ifdef DEBUG_UNICODE
1081         DebugBreak();
1082 #endif
1083         return 0;
1084     }
1085
1086     cch = WideCharToMultiByte(CP_UTF8, 0, wnorm,
1087                               cch_norm, adest, cch_adest * sizeof(char),
1088                               NULL, FALSE);
1089
1090     if (wnorm)
1091         free(wnorm);
1092
1093     return cch;
1094 }
1095
1096 /*! \brief Case insensitive comparison with specific length
1097
1098   \param[in] str1 First string to compare.  Assumed to be encoded in UTF-8.
1099
1100   \param[in] str2 Second string to compare.  Assumed to be encoded in UTF-8.
1101
1102   \param[in] n Max byte count.
1103
1104  */
1105 int cm_strnicmp_utf8(const char * str1, const char * str2, int n)
1106 {
1107     wchar_t wstr1[NLSMAXCCH];
1108     int len1;
1109     int len2;
1110     wchar_t wstr2[NLSMAXCCH];
1111     int rv;
1112
1113     if (!nls_init)
1114         cm_InitNormalization();
1115
1116     /* first check for NULL pointers (assume NULL < "") */
1117     if (str1 == NULL) {
1118         if (str2 == NULL)
1119             return 0;
1120         else
1121             return -1;
1122     } else if (str2 == NULL) {
1123         return 1;
1124     }
1125
1126     len1 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str1, n, wstr1, NLSMAXCCH);
1127     if (len1 == 0) {
1128 #ifdef DEBUG
1129         DebugBreak();
1130 #endif
1131         wstr1[0] = L'\0';
1132     }
1133
1134     len2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str2, n, wstr2, NLSMAXCCH);
1135     if (len2 == 0) {
1136 #ifdef DEBUG
1137         DebugBreak();
1138 #endif
1139         wstr2[0] = L'\0';
1140     }
1141
1142     rv = CompareStringW(nls_lcid, NORM_IGNORECASE, wstr1, len1, wstr2, len2);
1143     if (rv > 0)
1144         return (rv - 2);
1145     else {
1146 #ifdef DEBUG
1147         DebugBreak();
1148 #endif
1149         return 0;
1150     }
1151 }
1152
1153 int cm_strnicmp_utf16(const cm_unichar_t * str1, const cm_unichar_t * str2, int len)
1154 {
1155     int rv;
1156     size_t cch1;
1157     size_t cch2;
1158
1159     if (!nls_init)
1160         cm_InitNormalization();
1161
1162     /* first check for NULL pointers */
1163     if (str1 == NULL) {
1164         if (str2 == NULL)
1165             return 0;
1166         else
1167             return -1;
1168     } else if (str2 == NULL) {
1169         return 1;
1170     }
1171
1172     if (FAILED(StringCchLengthW(str1, len, &cch1)))
1173         cch1 = len;
1174
1175     if (FAILED(StringCchLengthW(str2, len, &cch2)))
1176         cch2 = len;
1177
1178     rv = CompareStringW(nls_lcid, NORM_IGNORECASE, str1, cch1, str2, cch2);
1179     if (rv > 0)
1180         return (rv - 2);
1181     else {
1182 #ifdef DEBUG
1183         DebugBreak();
1184 #endif
1185         return 0;
1186     }
1187 }
1188
1189 int cm_stricmp_utf16(const cm_unichar_t * str1, const cm_unichar_t * str2)
1190 {
1191     int rv;
1192
1193     if (!nls_init)
1194         cm_InitNormalization();
1195
1196     /* first check for NULL pointers */
1197     if (str1 == NULL) {
1198         if (str2 == NULL)
1199             return 0;
1200         else
1201             return -1;
1202     } else if (str2 == NULL) {
1203         return 1;
1204     }
1205
1206     rv = CompareStringW(nls_lcid, NORM_IGNORECASE, str1, -1, str2, -1);
1207     if (rv > 0)
1208         return (rv - 2);
1209     else {
1210 #ifdef DEBUG
1211         DebugBreak();
1212 #endif
1213         return 0;
1214     }
1215 }
1216
1217 cm_unichar_t *cm_strlwr_utf16(cm_unichar_t * str)
1218 {
1219     int rv;
1220     int len;
1221
1222     if (!nls_init)
1223         cm_InitNormalization();
1224
1225     len = wcslen(str) + 1;
1226     rv = LCMapStringW(nls_lcid, LCMAP_LOWERCASE, str, len, str, len);
1227 #ifdef DEBUG
1228     if (rv == 0) {
1229         DebugBreak();
1230     }
1231 #endif
1232
1233     return str;
1234 }
1235
1236 cm_unichar_t *cm_strupr_utf16(cm_unichar_t * str)
1237 {
1238     int rv;
1239     int len;
1240
1241     if (!nls_init)
1242         cm_InitNormalization();
1243
1244     len = wcslen(str) + 1;
1245     rv = LCMapStringW(nls_lcid, LCMAP_UPPERCASE, str, len, str, len);
1246 #ifdef DEBUG
1247     if (rv == 0) {
1248         DebugBreak();
1249     }
1250 #endif
1251
1252     return str;
1253 }
1254
1255
1256 int cm_stricmp_utf8(const char * str1, const char * str2)
1257 {
1258     wchar_t wstr1[NLSMAXCCH];
1259     int len1;
1260     int len2;
1261     wchar_t wstr2[NLSMAXCCH];
1262     int rv;
1263
1264     if (!nls_init)
1265         cm_InitNormalization();
1266
1267     /* first check for NULL pointers */
1268     if (str1 == NULL) {
1269         if (str2 == NULL)
1270             return 0;
1271         else
1272             return -1;
1273     } else if (str2 == NULL) {
1274         return 1;
1275     }
1276
1277     len1 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str1, -1, wstr1, NLSMAXCCH);
1278     if (len1 == 0) {
1279 #ifdef DEBUG
1280         DebugBreak();
1281 #endif
1282         wstr1[0] = L'\0';
1283     }
1284
1285     len2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str2, -1, wstr2, NLSMAXCCH);
1286     if (len2 == 0) {
1287 #ifdef DEBUG
1288         DebugBreak();
1289 #endif
1290         wstr2[0] = L'\0';
1291     }
1292
1293     rv = CompareStringW(nls_lcid, NORM_IGNORECASE, wstr1, len1, wstr2, len2);
1294     if (rv > 0)
1295         return (rv - 2);
1296     else {
1297 #ifdef DEBUG
1298         DebugBreak();
1299 #endif
1300         return 0;
1301     }
1302 }
1303
#if 0
/* Disabled: this block is compiled out by the surrounding #if 0.

   Upper-cases the UTF-16 string wstr.  cbstr is the size of wstr in
   bytes; it is divided by sizeof(wchar_t) to obtain the character
   count handed to LCMapStringW().  The mapped text is written to a
   stack buffer of NLSMAXCCH characters and then copied back over
   wstr with StringCbCopyW().  Returns wstr.

   NOTE(review): the LCMapStringW() result is not checked, so wstrd
   may be copied back without having been filled in. */
wchar_t * strupr_utf16(wchar_t * wstr, size_t cbstr)
{
    wchar_t wstrd[NLSMAXCCH];
    int len;

    if (!nls_init)
        cm_InitNormalization();

    /* Byte count -> character count. */
    len = cbstr / sizeof(wchar_t);
    len = LCMapStringW(nls_lcid, LCMAP_UPPERCASE, wstr, len, wstrd, NLSMAXCCH);
    StringCbCopyW(wstr, cbstr, wstrd);

    return wstr;
}
#endif
1320
1321 char * strupr_utf8(char * str, size_t cbstr)
1322 {
1323     wchar_t wstr[NLSMAXCCH];
1324     wchar_t wstrd[NLSMAXCCH];
1325     int len;
1326
1327     if (!nls_init)
1328         cm_InitNormalization();
1329
1330     len = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, -1, wstr, NLSMAXCCH);
1331     if (len == 0)
1332         return str;
1333
1334     len = LCMapStringW(nls_lcid, LCMAP_UPPERCASE, wstr, len, wstrd, NLSMAXCCH);
1335
1336     len = WideCharToMultiByte(CP_UTF8, 0, wstrd, -1, str, cbstr, NULL, FALSE);
1337
1338     return str;
1339 }
1340
/*! \brief Advance to the next character in a UTF-8 string

  The lead byte determines how many continuation bytes (10xxxxxx) are
  expected: one for 0xc0-0xdf, two for 0xe0-0xef and three for
  0xf0-0xff.  Only bytes that really are continuation bytes are
  skipped, so a truncated or malformed sequence never moves the
  pointer past a NUL terminator.  A stray continuation byte and any
  single-byte character (including NUL) advance by one byte.

  \param[in] c Pointer to the first byte of the current character.
      When that byte is a multi-byte lead byte, the byte following it
      is read, so it must be readable (as it is in a NUL terminated
      string).

  \return Pointer to the first byte after the current character.
 */
char * char_next_utf8(const char * c)
{
    const unsigned char * p = (const unsigned char *) c;
    int trail;

    if ((*p & 0x80) == 0)
        return (char *) c + 1;

    switch (*p & 0xf0) {
    case 0xc0:
    case 0xd0:
        trail = 1;
        break;

    case 0xe0:
        trail = 2;
        break;

    case 0xf0:
        trail = 3;
        break;

    default:
        /* 0x80-0xbf: a continuation byte with no lead byte. */
        trail = 0;
        break;
    }

    p++;
    while (trail > 0 && (*p & 0xc0) == 0x80) {
        p++;
        trail--;
    }

    return (char *) p;
}
1365
1366
/*! \brief Step back to the previous character in a UTF-8 string

  Moves back one byte and then keeps moving back for as long as the
  byte under the pointer is a continuation byte (10xxxxxx).

  \param[in] c Pointer just past the character to find.  There is no
      lower bound on the backward scan, so the caller must ensure a
      non-continuation byte precedes \a c.

  \return Pointer to the first byte of the previous character.
 */
char * char_prev_utf8(const char * c)
{
    const unsigned char * p = (const unsigned char *) c - 1;

    /* ASCII and lead bytes both fail this test, so the scan stops on
       the first byte of the character. */
    while ((*p & 0xc0) == 0x80)
        p--;

    return (char *) p;
}
1382
/*! \brief Advance to the next character in a UTF-16 string

  A high surrogate (0xd800-0xdbff) that is followed by a low surrogate
  (0xdc00-0xdfff) is skipped as a pair.  Anything else, including an
  unpaired high surrogate, advances by a single code unit so that the
  pointer cannot step over a NUL terminator.

  \param[in] c Pointer to the current code unit.  When it is a high
      surrogate the following unit is read, so it must be readable
      (as it is in a NUL terminated string).

  \return Pointer to the code unit following the current character.
 */
wchar_t * char_next_utf16(const wchar_t * c)
{
    unsigned short lead = (unsigned short) c[0];

    if (lead >= 0xd800 && lead <= 0xdbff) {
        unsigned short trail = (unsigned short) c[1];

        if (trail >= 0xdc00 && trail <= 0xdfff)
            return (wchar_t *) c + 2;
    }

    return (wchar_t *) c + 1;
}
1391
/*! \brief Step back to the previous character in a UTF-16 string

  \param[in] c Pointer just past the character to find.

  \return Pointer to the code unit before \a c, or to the unit before
      that if the unit before \a c is a low surrogate (0xdc00-0xdfff).
 */
wchar_t * char_prev_utf16(const wchar_t * c)
{
    const wchar_t * prev = c - 1;
    unsigned short unit = (unsigned short) *prev;
    int is_low_surrogate = (unit >= 0xdc00 && unit <= 0xdfff);

    /* A low surrogate is the second half of a pair; step onto the
       first half. */
    return (wchar_t *) (is_low_surrogate ? prev - 1 : prev);
}
1400
/*! \brief Find the start of the character that contains a code unit

  \param[in] c Pointer to a code unit in a UTF-16 string.

  \return \a c itself, unless \a c points at a low surrogate
      (0xdc00-0xdfff), in which case the preceding code unit is
      returned.
 */
wchar_t * char_this_utf16(const wchar_t * c)
{
    unsigned short unit = (unsigned short) *c;
    int is_low_surrogate = (unit >= 0xdc00 && unit <= 0xdfff);

    return (wchar_t *) (is_low_surrogate ? c - 1 : c);
}
1409
/*! \brief Check that a UTF-16 string has only well-formed surrogate pairs

  \param[in] c String to check.

  \param[in] cch Number of code units to check.  If negative, the
      string is assumed to be NUL terminated and is checked up to and
      including its terminator.

  \return 1 if every high surrogate (0xd800-0xdbff) is immediately
      followed by a low surrogate (0xdc00-0xdfff) within the checked
      range and no low surrogate appears on its own; 0 otherwise.
 */
int cm_is_valid_utf16(const wchar_t * c, int cch)
{
    int i;

    if (cch < 0)
        cch = wcslen(c) + 1;

    i = 0;
    while (i < cch) {
        wchar_t unit = c[i++];

        if (unit >= 0xd800 && unit <= 0xdbff) {
            wchar_t trail;

            /* A high surrogate needs a partner inside the range. */
            if (i >= cch)
                return 0;

            trail = c[i++];
            if (trail < 0xdc00 || trail > 0xdfff)
                return 0;
        } else if (unit >= 0xdc00 && unit <= 0xdfff) {
            /* Low surrogate without a preceding high surrogate. */
            return 0;
        }
    }

    return 1;
}
1427
#ifdef DEBUG
/*! \brief Format the code units of a UTF-16 string as hexadecimal text

  Each code unit is printed with the "%04x" format and the units are
  separated by commas.  Only built when DEBUG is defined.

  \param[in] c Code units to format.

  \param[in] len Number of code units, or -1 if \a c is NUL
      terminated (the terminator is then not included).

  \return A heap allocated string that the caller must free(), or
      NULL if allocation failed.  For a zero length input the string
      "(empty)" is returned.
 */
wchar_t * cm_GetRawCharsAlloc(const wchar_t * c, int len)
{
    wchar_t * buf;
    wchar_t * cursor;
    size_t cb_left;

    if (len == -1)
        len = wcslen(c);

    if (len == 0)
        return wcsdup(L"(empty)");

    /* Five characters are budgeted per code unit; presumably four hex
       digits plus either the separating comma or the final NUL. */
    cb_left = len * 5 * sizeof(wchar_t);
    buf = malloc(cb_left);
    if (buf == NULL)
        return NULL;
    cursor = buf;

    while (len > 0) {
        /* The Ex variants move cursor to the end of the text written
           and reduce cb_left to the space that remains. */
        StringCbPrintfExW(cursor, cb_left, &cursor, &cb_left, 0,
                         L"%04x", (int) *c);
        if (len > 1)
            StringCbCatExW(cursor, cb_left, L",", &cursor, &cb_left, 0);

        ++c;
        --len;
    }

    return buf;
}
#endif