src/WINNT/afsd/cm_nls.c

   1 /*
   2  * Copyright (c) 2008 Secure Endpoints Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person
   5  * obtaining a copy of this software and associated documentation
   6  * files (the "Software"), to deal in the Software without
   7  * restriction, including without limitation the rights to use, copy,
   8  * modify, merge, publish, distribute, sublicense, and/or sell copies
   9  * of the Software, and to permit persons to whom the Software is
  10  * furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include <windows.h>
  26 #include <stdlib.h>
  27 #include <wchar.h>
  28 #include <strsafe.h>
  29 #include <stdio.h>
  30 #include <errno.h>
  31
  32 #include "cm_nls.h"
  33
  34 #ifdef DEBUG_UNICODE
  35 #include <assert.h>
  36 #endif
  37
  38 /* This is part of the Microsoft Internationalized Domain Name
  39    Mitigation APIs. */
  40 #include <normalization.h>
  41
  42 /* TODO: All the normalization and conversion code should NUL
  43    terminate destination strings. */
  44
/* Entry point for NormalizeString.  NULL until cm_InitNormalization()
   resolves it at run time from NLSDLLNAME via GetProcAddress(). */
int
(WINAPI *pNormalizeString)( __in NORM_FORM NormForm,
                            __in_ecount(cwSrcLength) LPCWSTR lpSrcString,
                            __in int cwSrcLength,
                            __out_ecount(cwDstLength) LPWSTR lpDstString,
                            __in int cwDstLength ) = NULL;

/* Entry point for IsNormalizedString.  Resolved together with
   pNormalizeString by cm_InitNormalization(). */
BOOL
(WINAPI *pIsNormalizedString)( __in NORM_FORM NormForm,
                               __in_ecount(cwLength) LPCWSTR lpString,
                               __in int cwLength ) = NULL;


/* Name of the DLL the two entry points above are loaded from. */
#define NLSDLLNAME "Normaliz.dll"
/* Size, in characters, of the fixed scratch buffers used in this file. */
#define NLSMAXCCH  1024
/* Slack, in characters, added to a buffer size estimate before
   retrying a normalization. */
#define NLSERRCCH  8

/* Normalization form passed to both entry points. */
#define AFS_NORM_FORM NormalizationC

/* Locale identifier; replaced with US English by
   cm_InitNormalization() when is_windows_2000() reports TRUE. */
static LCID nls_lcid = LOCALE_INVARIANT;

/* Set to 1 by cm_InitNormalization(); tested by the functions in this
   file to initialize lazily on first use. */
static int nls_init = 0;
  67
  68 static BOOL
  69 is_windows_2000 (void)
  70 {
  71    static BOOL fChecked = FALSE;
  72    static BOOL fIsWin2K = FALSE;
  73
  74    if (!fChecked)
  75    {
  76        OSVERSIONINFO Version;
  77
  78        memset (&Version, 0x00, sizeof(Version));
  79        Version.dwOSVersionInfoSize = sizeof(Version);
  80
  81        if (GetVersionEx (&Version))
  82        {
  83            if (Version.dwPlatformId == VER_PLATFORM_WIN32_NT &&
  84                 Version.dwMajorVersion >= 5)
  85                fIsWin2K = TRUE;
  86        }
  87        fChecked = TRUE;
  88    }
  89
  90    return fIsWin2K;
  91 }
  92
  93 long cm_InitNormalization(void)
  94 {
  95     HMODULE h_Nls;
  96
  97     if (pNormalizeString != NULL)
  98         return 0;
  99
 100     h_Nls = LoadLibrary(NLSDLLNAME);
 101     if (h_Nls == INVALID_HANDLE_VALUE) {
 102         return 1;
 103     }
 104
 105     pNormalizeString =
 106         (int (WINAPI *)( NORM_FORM, LPCWSTR,
 107                          int, LPWSTR, int))
 108         GetProcAddress(h_Nls, "NormalizeString");
 109
 110     pIsNormalizedString =
 111         (BOOL
 112          (WINAPI *)( NORM_FORM, LPCWSTR, int ))
 113         GetProcAddress(h_Nls, "IsNormalizedString");
 114
 115     if (is_windows_2000())
 116         nls_lcid = MAKELCID(MAKELANGID(LANG_ENGLISH, SUBLANG_ENGLISH_US), SORT_DEFAULT);
 117
 118     nls_init = 1;
 119
 120     return (pNormalizeString && pIsNormalizedString);
 121 }
 122
 123 /* \brief Normalize a UTF-16 string.
 124
 125    If the supplied destination buffer is insufficient or NULL, then a
 126    new buffer will be allocated to hold the normalized string.
 127
 128    \param[in] src : Source UTF-16 string.  Length is specified in
 129        cch_src.
 130
 131    \param[in] cch_src : The character count in cch_src is assumed to
 132        be tight and include the terminating NULL character if there is
 133        one.  If the NULL is absent, the resulting string will not be
 134        NULL terminated.
 135
 136    \param[out] ext_dest : The destination buffer.  Can be NULL, in
 137        which case *pcch_dest MUST be 0.
 138
 139    \param[in,out] pcch_dest : On entry *pcch_dest contains a count of
 140        characters in the destination buffer.  On exit, it will contain
 141        a count of characters that were copied to the destination
 142        buffer.
 143
 144    Returns a pointer to the buffer containing the normalized string or
 145    NULL if the call was unsuccessful.  If the returned destination
 146    buffer is different from the supplied buffer and non-NULL, it
 147    should be freed using free().
 148 */
 149 static wchar_t *
 150 NormalizeUtf16String(const wchar_t * src, int cch_src, wchar_t * ext_dest, int *pcch_dest)
 151 {
 152     if (!nls_init)
 153         cm_InitNormalization();
 154
 155 #ifdef DEBUG_UNICODE
 156     assert (pNormalizeString != NULL && pIsNormalizedString != NULL);
 157 #endif
 158
 159     if (cch_src == -1)
 160         cch_src = (int)wcslen(src) + 1;
 161
 162     if ((pIsNormalizedString && (*pIsNormalizedString)(AFS_NORM_FORM, src, cch_src)) ||
 163         (!pNormalizeString)) {
 164
 165         if (ext_dest == NULL || *pcch_dest < cch_src) {
 166             ext_dest = malloc(cch_src * sizeof(wchar_t));
 167             *pcch_dest = cch_src;
 168         }
 169
 170         /* No need to or unable to normalize.  Just copy the string.
 171            Note that the string is not NUL terminated if the source
 172            string is not NUL terminated. */
 173
 174         if (ext_dest) {
 175             memcpy(ext_dest, src, cch_src * sizeof(wchar_t));
 176             *pcch_dest = cch_src;
 177         } else {
 178             *pcch_dest = 0;
 179         }
 180         return ext_dest;
 181
 182     } else {
 183
 184         int rv;
 185         DWORD gle;
 186         int tries = 10;
 187         wchar_t * dest;
 188         int cch_dest = *pcch_dest;
 189
 190         dest = ext_dest;
 191
 192         while (tries-- > 0) {
 193
 194             rv = (*pNormalizeString)(AFS_NORM_FORM, src, cch_src, dest, cch_dest);
 195
 196             if (rv <= 0 && (gle = GetLastError()) != ERROR_SUCCESS) {
 197                 if (gle == ERROR_INSUFFICIENT_BUFFER) {
 198
 199                     /* The buffer wasn't big enough.  We are going to
 200                        try allocating one. */
 201
 202                     cch_dest = (-rv) + NLSERRCCH;
 203                     goto cont;
 204
 205                 } else {
 206                     /* Something else is wrong */
 207                     break;
 208                 }
 209
 210             } else if (rv < 0) { /* rv < 0 && gle == ERROR_SUCCESS */
 211
 212                 /* Technically not one of the expected outcomes */
 213                 break;
 214
 215             } else {            /* rv > 0 || (rv == 0 && gle == ERROR_SUCCESS) */
 216
 217                 /* Possibly succeeded */
 218
 219                 if (rv == 0) { /* Succeeded and the return string is empty */
 220                     *pcch_dest = 0;
 221                     return dest;
 222                 }
 223
 224                 if (cch_dest == 0) {
 225                     /* Nope.  We only calculated the required size of the buffer */
 226
 227                     cch_dest = rv + NLSERRCCH;
 228                     goto cont;
 229                 }
 230
 231                 *pcch_dest = rv;
 232                 if (cch_dest > rv)
 233                     dest[rv] = 0;
 234                 else {
 235                     /* Can't NUL terminate */
 236                     cch_dest = max(rv,cch_dest) + NLSERRCCH;
 237                     goto cont;
 238                 }
 239
 240                 /* Success! */
 241                 return dest;
 242             }
 243
 244         cont:
 245             if (dest != ext_dest && dest)
 246                 free(dest);
 247             dest = malloc(cch_dest * sizeof(wchar_t));
 248         }
 249
 250         /* Failed */
 251
 252         if (dest != ext_dest && dest)
 253             free(dest);
 254
 255         *pcch_dest = 0;
 256         return NULL;
 257     }
 258 }
 259
 260 /*! \brief Normalize a Unicode string into a newly allocated buffer
 261
 262   The input string will be normalized using NFC.
 263
 264   \param[in] s UTF-16 string to be normalized.
 265
 266   \param[in] cch_src The number of characters in the input string.  If
 267       this is -1, then the input string is assumed to be NUL
 268       terminated.
 269
 270   \param[out] pcch_dest Receives the number of characters copied to
 271       the output buffer.  Note that the character count is the number
 272       of wchar_t characters copied, and not the count of Unicode code
 273       points.  This includes the terminating NUL if cch_src was -1 or
 274       included the terminating NUL.
 275
 276   \return A newly allocated buffer holding the normalized string or
 277       NULL if the call failed.
 278  */
 279 cm_normchar_t * cm_NormalizeStringAlloc(const cm_unichar_t * s, int cch_src, int *pcch_dest)
 280 {
 281     int cch_dest = 0;
 282     cm_normchar_t * r;
 283
 284     if (!nls_init)
 285         cm_InitNormalization();
 286
 287     if (s == NULL || cch_src == 0 || *s == L'\0') {
 288         if (pcch_dest)
 289             *pcch_dest = ((cch_src != 0)? 1: 0);
 290         return wcsdup(L"");
 291     }
 292
 293     r = NormalizeUtf16String(s, cch_src, NULL, &cch_dest);
 294
 295     if (pcch_dest)
 296         *pcch_dest = cch_dest;
 297
 298     return r;
 299 }
 300
 301 int cm_NormalizeString(const cm_unichar_t * s, int cch_src,
 302                        cm_normchar_t * dest, int cch_dest)
 303 {
 304     int tcch = cch_dest;
 305     cm_normchar_t * r;
 306
 307     if (!nls_init)
 308         cm_InitNormalization();
 309
 310     r = NormalizeUtf16String(s, cch_src, dest, &tcch);
 311
 312     if (r != dest) {
 313         /* The supplied buffer was insufficient */
 314         free(r);
 315         return 0;
 316     } else {
 317         return tcch;
 318     }
 319 }
 320
 321 /*! \brief Convert a UTF-16 string to a UTF-8 string using a newly allocated buffer
 322
 323   \param[in] s UTF-16 source string
 324
 325   \param[in] cch_src Number of characters in \a s. This can be set to
 326       -1 if \a s is NUL terminated.
 327
 328   \param[out] pcch_dest Receives a count of characters that were
 329       copied to the target buffer.
 330
 331   \return A newly allocated buffer holding the UTF-8 string.
 332
 333  */
 334 cm_utf8char_t * cm_Utf16ToUtf8Alloc(const cm_unichar_t * s, int cch_src, int *pcch_dest)
 335 {
 336     int cch_dest;
 337     cm_utf8char_t * dest;
 338
 339     if (!nls_init)
 340         cm_InitNormalization();
 341
 342     if (s == NULL || cch_src == 0 || *s == L'\0') {
 343         if (pcch_dest)
 344             *pcch_dest = ((cch_src != 0)?1:0);
 345         return strdup("");
 346     }
 347
 348     cch_dest = WideCharToMultiByte(CP_UTF8, 0, s, cch_src, NULL, 0, NULL, FALSE);
 349
 350     if (cch_dest == 0) {
 351         if (pcch_dest)
 352             *pcch_dest = cch_dest;
 353         return NULL;
 354     }
 355
 356     dest = malloc((cch_dest + 1) * sizeof(cm_utf8char_t));
 357
 358     WideCharToMultiByte(CP_UTF8, 0, s, cch_src, dest, cch_dest, NULL, FALSE);
 359     dest[cch_dest] = 0;
 360
 361     if (pcch_dest)
 362         *pcch_dest = cch_dest;
 363
 364     return dest;
 365 }
 366
 367 int cm_Utf16ToUtf8(const cm_unichar_t * src, int cch_src,
 368                    cm_utf8char_t * dest, int cch_dest)
 369 {
 370     if (!nls_init)
 371         cm_InitNormalization();
 372
 373     return WideCharToMultiByte(CP_UTF8, 0, src, cch_src, dest, cch_dest, NULL, FALSE);
 374 }
 375
 376 int cm_Utf16ToUtf16(const cm_unichar_t * src, int cch_src,
 377                     cm_unichar_t * dest, int cch_dest)
 378 {
 379     if (!nls_init)
 380         cm_InitNormalization();
 381
 382     if (cch_src == -1) {
 383         StringCchCopyW(dest, cch_dest, src);
 384         return (int)wcslen(dest) + 1;
 385     } else {
 386         int cch_conv = min(cch_src, cch_dest);
 387         memcpy(dest, src, cch_conv * sizeof(cm_unichar_t));
 388         return cch_conv;
 389     }
 390 }
 391
 392 /* \brief Normalize a UTF-16 string into a UTF-8 string.
 393
 394    \param[in] src : Source string.
 395
 396    \param[in] cch_src : Count of characters in src. If the count includes the
 397        NULL terminator, then the resulting string will be NULL
 398        terminated.  If it is -1, then src is assumed to be NULL
 399        terminated.
 400
 401    \param[out] adest : Destination buffer.
 402
 403    \param[in] cch_adest : Number of characters in the destination buffer.
 404
 405    Returns the number of characters stored into cch_adest. This will
 406    include the terminating NULL if cch_src included the terminating
 407    NULL or was -1.  If this is 0, then the operation was unsuccessful.
 408  */
 409 long cm_NormalizeUtf16StringToUtf8(const wchar_t * src, int cch_src,
 410                                    char * adest, int cch_adest)
 411 {
 412     if (!nls_init)
 413         cm_InitNormalization();
 414
 415     if (cch_src < 0) {
 416         size_t cch;
 417
 418         if (FAILED(StringCchLengthW(src, NLSMAXCCH, &cch)))
 419             return E2BIG;
 420
 421         cch_src = (int)cch+1;
 422     }
 423
 424     {
 425         wchar_t nbuf[NLSMAXCCH];
 426         wchar_t * normalized;
 427         int cch_norm = NLSMAXCCH;
 428
 429         normalized = NormalizeUtf16String(src, cch_src, nbuf, &cch_norm);
 430         if (normalized) {
 431             cch_adest = WideCharToMultiByte(CP_UTF8, 0, normalized, cch_norm,
 432                                             adest, cch_adest, NULL, 0);
 433
 434             if (normalized != nbuf && normalized)
 435                 free(normalized);
 436
 437             return cch_adest;
 438
 439         } else {
 440
 441             return 0;
 442
 443         }
 444     }
 445 }
 446
/* Flag bit that marks a table entry as "must be escaped". */
#define ESCVAL 0x1000
/* Build a table entry for character c with the escape flag set. */
#define Esc(c) (ESCVAL + (short)(c))
/* Test whether a table entry carries the escape flag. */
#define IS_ESCAPED(c) (((c) & ESCVAL) == ESCVAL)

/* \brief Character sanitization map for CP-1252

   The following map indicates which characters should be escaped in
   the CP-1252 character map.  It has one entry per byte value
   (0x00 through 0xff).  Characters that are documented as illegal
   characters in a file name are marked as escaped.  Escaped
   characters are marked using the ::Esc macro defined above.  The
   following exceptions apply:

   - Path delimiters '\\' and '/' are NOT escaped because the
     sanitization map applies to paths.  While those characters are
     illegal in filenames, they are legal in paths.

   - Wildcard characters '*' and '?' ARE escaped.  The document
     referred below does not specify these characters as invalid.
     Since no other escape mechanism exists, names containing
     wildcards are indistinguishable from actual wildcards used in SMB
     requests.

   - Reserved names are not and cannot be represented in this map.
     Reserved names are :

     CON, PRN, AUX, NUL, COM1, COM2, COM3, COM4, COM5, COM6, COM7,
     COM8, COM9, LPT1, LPT2, LPT3, LPT4, LPT5, LPT6, LPT7, LPT8, LPT9,
     CLOCK$

     Reserved names with extensions are also invalid. (i.e. NUL.txt)

   - Characters 0x80, 0x81, 0x8d, 0x8e, 0x8f, 0x90, 0x9d and 0x9e are
     also escaped; the stated rationale is that they are unused in
     CP-1252 and hence cannot be converted to a Unicode string.

     NOTE(review): this comment previously listed 0x9f as well, but
     the table below leaves 0x9f unescaped.  Confirm which of the two
     is intended before changing either.

   \note The only bit we are actually interested in from the following
     table is the ESCVAL bit.  However, the characters themselves are
     included for ease of maintenance.

   \see "Naming a File" topic in the Windows SDK.
 */
static const short sanitized_escapes_1252[] = {
    Esc(0x00),Esc(0x01),Esc(0x02),Esc(0x03),Esc(0x04),Esc(0x05),Esc(0x06),Esc(0x07),
    Esc(0x08),Esc(0x09),Esc(0x0a),Esc(0x0b),Esc(0x0c),Esc(0x0d),Esc(0x0e),Esc(0x0f),
    Esc(0x10),Esc(0x11),Esc(0x12),Esc(0x13),Esc(0x14),Esc(0x15),Esc(0x16),Esc(0x17),
    Esc(0x18),Esc(0x19),Esc(0x1a),Esc(0x1b),Esc(0x1c),Esc(0x1d),Esc(0x1e),Esc(0x1f),
    ' ','!',Esc('"'),'#','$','%','&','\'','(',')',Esc('*'),'+',',','-','.','/',
    '0','1','2','3','4','5','6','7','8','9',Esc(':'),';',Esc('<'),'=',Esc('>'),Esc('?'),
    '@','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
    'P','Q','R','S','T','U','V','W','X','Y','Z','[','\\',']','^','_',
    '`','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o',
    'p','q','r','s','t','u','v','w','x','y','z','{',Esc('|'),'}','~',Esc(0x7f),
    Esc(0x80),Esc(0x81),0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,Esc(0x8d),Esc(0x8e),Esc(0x8f),
    Esc(0x90),0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,Esc(0x9d),Esc(0x9e),0x9f,
    0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
    0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
    0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
    0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf,
    0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
    0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff
};
 508
 509 static int sanitize_bytestring(const char * src, int cch_src,
 510                                char * odest, int cch_dest)
 511 {
 512     char * dest = odest;
 513
 514     if (!nls_init)
 515         cm_InitNormalization();
 516
 517     while (cch_src > 0 && *src && cch_dest > 0) {
 518
 519         unsigned short rc;
 520
 521         rc = sanitized_escapes_1252[*src];
 522         if (IS_ESCAPED(rc)) {
 523             static const char hex[] =
 524                 {'0','1','2','3','4','5','6','7',
 525                  '8','9','a','b','c','d','e','f'};
 526
 527             if (cch_dest < 3) {
 528                 *dest++ = '\0';
 529                 return 0;
 530             }
 531
 532             *dest++ = '%';
 533             *dest++ = hex[(((int)*src) >> 4) & 0x0f];
 534             *dest++ = hex[(((int)*src) & 0x0f)];
 535             cch_dest -= 3;
 536
 537         } else {
 538             *dest++ = *src;
 539             cch_dest--;
 540         }
 541
 542         cch_src--;
 543         src++;
 544     }
 545
 546     if (cch_src > 0 && cch_dest > 0) {
 547         *dest++ = '\0';
 548     }
 549
 550     return (int)(dest - odest);
 551 }
 552
 553 static int sanitize_utf16char(wchar_t c, wchar_t ** pdest, size_t * pcch)
 554 {
 555     if (*pcch >= 6) {
 556         StringCchPrintfExW(*pdest, *pcch, pdest, pcch, 0, L"%%%04x", (int) c);
 557         return 1;
 558     } else {
 559         return 0;
 560     }
 561 }
 562
 563 static int sanitize_utf16string(const wchar_t * src, size_t cch_src,
 564                                 wchar_t * dest, size_t cch_dest)
 565 {
 566     int cch_dest_o = cch_dest;
 567
 568     if (dest == NULL) {
 569         /* only estimating */
 570         for (cch_dest = 0; cch_src > 0;) {
 571             if (*src >= 0xd800 && *src < 0xdc00) {
 572                 if (cch_src <= 1 || src[1] < 0xdc00 || src[1] > 0xdfff) {
 573                     /* dangling surrogate */
 574                     src++;
 575                     cch_src --;
 576                     cch_dest += 5;
 577                 } else {
 578                     /* surrogate pair */
 579                     src += 2;
 580                     cch_src -= 2;
 581                     cch_dest += 2;
 582                 }
 583             } else if (*src >= 0xdc00 && *src <= 0xdfff) {
 584                 /* dangling surrogate */
 585                 src++;
 586                 cch_src --;
 587                 cch_dest += 5;
 588             } else {
 589                 /* normal char */
 590                 src++; cch_src --;
 591                 cch_dest++;
 592             }
 593         }
 594
 595         return cch_dest;
 596     }
 597
 598     while (cch_src > 0 && cch_dest > 0) {
 599         if (*src >= 0xd800 && *src < 0xdc00) {
 600             if (cch_src <= 1 || src[1] < 0xdc00 || src[1] > 0xdfff) {
 601                 if (!sanitize_utf16char(*src++, &dest, &cch_dest))
 602                     return 0;
 603                 cch_src--;
 604             } else {
 605                 /* found a surrogate pair */
 606                 *dest++ = *src++;
 607                 *dest++ = *src++;
 608                 cch_dest -= 2; cch_src -= 2;
 609             }
 610         } else if (*src >= 0xdc00 && *src <= 0xdfff) {
 611             if (!sanitize_utf16char(*src++, &dest, &cch_dest))
 612                 return 0;
 613             cch_src--;
 614         } else {
 615             *dest++ = *src++;
 616             cch_dest--; cch_src--;
 617         }
 618     }
 619
 620     return (cch_src == 0) ? cch_dest_o - cch_dest : 0;
 621 }
 622
 623 #undef Esc
 624 #undef IS_ESCAPED
 625 #undef ESCVAL
 626
 627 long cm_NormalizeUtf8StringToUtf16(const char * src, int cch_src,
 628                                    wchar_t * dest, int cch_dest)
 629 {
 630     wchar_t wsrcbuf[NLSMAXCCH];
 631     wchar_t *wnorm;
 632     int cch;
 633     int cch_norm;
 634
 635     if (!nls_init)
 636         cm_InitNormalization();
 637
 638     /* Get some edge cases out first, so we don't have to worry about
 639        cch_src being 0 etc. */
 640     if (cch_src == 0) {
 641         return 0;
 642     } else if (*src == '\0') {
 643         if (cch_dest >= 1)
 644             *dest = L'\0';
 645         return 1;
 646     }
 647
 648     if (dest && cch_dest > 0) {
 649         dest[0] = L'\0';
 650     }
 651
 652     if (cch_src == -1) {
 653         cch_src = (int)strlen(src) + 1;
 654     }
 655
 656     cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 657                               cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
 658
 659     if (cch != 0 && !cm_is_valid_utf16(wsrcbuf, cch)) {
 660         wchar_t wsanitized[NLSMAXCCH];
 661
 662         /* We successfully converted, but the resulting UTF-16 string
 663            has dangling surrogates.  We should try and escape those
 664            next.  */
 665         cch = sanitize_utf16string(wsrcbuf, cch, wsanitized, NLSMAXCCH);
 666         if (cch != 0) {
 667             memcpy(wsrcbuf, wsanitized, cch * sizeof(wchar_t));
 668         }
 669     }
 670
 671     if (cch == 0) {
 672         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
 673             char sanitized[NLSMAXCCH];
 674             int cch_sanitized;
 675
 676             /* If src doesn't have a unicode translation, then it
 677                wasn't valid UTF-8.  In this case, we assume that src
 678                is CP-1252 and then try to convert again.  But before
 679                that, we use a translation table to "sanitize" the
 680                input. */
 681
 682             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
 683                                                 sizeof(sanitized)/sizeof(char));
 684
 685             if (cch_sanitized == 0) {
 686 #ifdef DEBUG_UNICODE
 687                 DebugBreak();
 688 #endif
 689                 return 0;
 690             }
 691
 692             cch = MultiByteToWideChar(1252, 0, sanitized,
 693                                       cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
 694             if (cch == 0) {
 695                 /* Well, that didn't work either.  Something is very wrong. */
 696 #ifdef DEBUG_UNICODE
 697                 DebugBreak();
 698 #endif
 699                 return 0;
 700             }
 701         } else {
 702             return 0;
 703         }
 704     }
 705
 706     cch_norm = cch_dest;
 707     wnorm = NormalizeUtf16String(wsrcbuf, cch, dest, &cch_norm);
 708     if (wnorm == NULL) {
 709 #ifdef DEBUG_UNICODE
 710         DebugBreak();
 711 #endif
 712         return 0;
 713     }
 714
 715     if (wnorm != dest) {
 716         /* The buffer was insufficient */
 717         if (dest != NULL && cch_dest > 1) {
 718             *dest = L'\0';
 719             cch_norm = 0;
 720         }
 721
 722         free(wnorm);
 723     }
 724
 725     return cch_norm;
 726 }
 727
 728 cm_normchar_t *cm_NormalizeUtf8StringToUtf16Alloc(const cm_utf8char_t * src, int cch_src,
 729                                                   int *pcch_dest)
 730 {
 731     wchar_t wsrcbuf[NLSMAXCCH];
 732     wchar_t *wnorm;
 733     int cch;
 734     int cch_norm;
 735
 736     if (!nls_init)
 737         cm_InitNormalization();
 738
 739     /* Get some edge cases out first, so we don't have to worry about
 740        cch_src being 0 etc. */
 741     if (cch_src == 0 || src == NULL || *src == '\0') {
 742         if (pcch_dest)
 743             *pcch_dest = ((cch_src != 0)? 1 : 0);
 744         return wcsdup(L"");
 745     }
 746
 747     if (cch_src == -1) {
 748         cch_src = (int)strlen(src) + 1;
 749     }
 750
 751     cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 752                               cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
 753
 754     if (cch != 0 && !cm_is_valid_utf16(wsrcbuf, cch)) {
 755         wchar_t wsanitized[NLSMAXCCH];
 756
 757         /* We successfully converted, but the resulting UTF-16 string
 758            has dangling surrogates.  We should try and escape those
 759            next.  */
 760         cch = sanitize_utf16string(wsrcbuf, cch, wsanitized, NLSMAXCCH);
 761         if (cch != 0) {
 762             memcpy(wsrcbuf, wsanitized, cch * sizeof(wchar_t));
 763         }
 764     }
 765
 766     if (cch == 0) {
 767         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
 768             char sanitized[NLSMAXCCH];
 769             int cch_sanitized;
 770
 771             /* If src doesn't have a unicode translation, then it
 772                wasn't valid UTF-8.  In this case, we assume that src
 773                is CP-1252 and then try to convert again.  But before
 774                that, we use a translation table to "sanitize" the
 775                input. */
 776
 777             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
 778                                                 sizeof(sanitized)/sizeof(char));
 779
 780             if (cch_sanitized == 0) {
 781 #ifdef DEBUG_UNICODE
 782                 DebugBreak();
 783 #endif
 784                 return NULL;
 785             }
 786
 787             cch = MultiByteToWideChar(1252, 0, sanitized,
 788                                       cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
 789             if (cch == 0) {
 790                 /* Well, that didn't work either.  Something is very wrong. */
 791 #ifdef DEBUG_UNICODE
 792                 DebugBreak();
 793 #endif
 794                 return NULL;
 795             }
 796         } else {
 797             return NULL;
 798         }
 799     }
 800
 801     cch_norm = 0;
 802     wnorm = NormalizeUtf16String(wsrcbuf, cch, NULL, &cch_norm);
 803     if (wnorm == NULL) {
 804 #ifdef DEBUG_UNICODE
 805         DebugBreak();
 806 #endif
 807         return NULL;
 808     }
 809
 810     if (pcch_dest)
 811         *pcch_dest = cch_norm;
 812
 813     return wnorm;
 814 }
 815
 816 int cm_Utf8ToUtf16(const cm_utf8char_t * src, int cch_src,
 817                    cm_unichar_t * dest, int cch_dest)
 818 {
 819     int cch;
 820
 821     if (cch_dest >= 1 && dest != NULL) {
 822         dest[0] = L'\0';
 823     }
 824
 825     if (!nls_init)
 826         cm_InitNormalization();
 827
 828     if (cch_src == -1) {
 829         cch_src = (int)strlen(src) + 1;
 830     }
 831
 832     cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 833                               cch_src * sizeof(char), dest, cch_dest);
 834
 835     if (cch != 0 && !cm_is_valid_utf16(dest, cch)) {
 836         wchar_t wsanitized[NLSMAXCCH];
 837
 838         cch = sanitize_utf16string(dest, cch, wsanitized, NLSMAXCCH);
 839         if (cch != 0) {
 840             memcpy(dest, wsanitized, cch * sizeof(wchar_t));
 841         }
 842     }
 843
 844     if (cch == 0) {
 845         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
 846             char sanitized[NLSMAXCCH];
 847             int cch_sanitized;
 848
 849             /* If src doesn't have a unicode translation, then it
 850                wasn't valid UTF-8.  In this case, we assume that src
 851                is CP-1252 and then try to convert again.  But before
 852                that, we use a translation table to "sanitize" the
 853                input. */
 854
 855             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
 856                                                 sizeof(sanitized)/sizeof(char));
 857
 858             if (cch_sanitized == 0) {
 859 #ifdef DEBUG_UNICODE
 860                 DebugBreak();
 861 #endif
 862                 return 0;
 863             }
 864
 865             cch = MultiByteToWideChar(1252, 0, sanitized,
 866                                       cch_sanitized * sizeof(char), dest, cch_dest);
 867             if (cch == 0) {
 868                 /* Well, that didn't work either.  Something is very wrong. */
 869 #ifdef DEBUG_UNICODE
 870                 DebugBreak();
 871 #endif
 872                 return 0;
 873             } else {
 874                 return cch;
 875             }
 876
 877         } else {
 878             return 0;
 879         }
 880     } else {
 881         return cch;
 882     }
 883 }
 884
 885 cm_unichar_t  * cm_Utf8ToUtf16Alloc(const cm_utf8char_t * src, int cch_src, int *pcch_dest)
 886 {
 887     cm_unichar_t * ustr = NULL;
 888     int cch;
 889
 890     if (!nls_init)
 891         cm_InitNormalization();
 892
 893     if (cch_src == 0 || src == NULL || *src == '\0') {
 894         if (pcch_dest)
 895             *pcch_dest = ((cch_src != 0)? 1 : 0);
 896         return wcsdup(L"");
 897     }
 898
 899     if (cch_src == -1) {
 900         cch_src = (int)strlen(src) + 1;
 901     }
 902
 903     cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 904                               cch_src * sizeof(char), NULL, 0);
 905
 906     if (cch == 0) {
 907         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
 908             char sanitized[NLSMAXCCH];
 909             int cch_sanitized;
 910
 911             /* If src doesn't have a unicode translation, then it
 912                wasn't valid UTF-8.  In this case, we assume that src
 913                is CP-1252 and then try to convert again.  But before
 914                that, we use a translation table to "sanitize" the
 915                input. */
 916
 917             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
 918                                                 sizeof(sanitized)/sizeof(char));
 919
 920             if (cch_sanitized == 0) {
 921 #ifdef DEBUG_UNICODE
 922                 DebugBreak();
 923 #endif
 924                 return NULL;
 925             }
 926
 927             cch = MultiByteToWideChar(1252, 0, sanitized,
 928                                       cch_sanitized * sizeof(char), NULL, 0);
 929             if (cch == 0) {
 930                 /* Well, that didn't work either.  Something is very wrong. */
 931 #ifdef DEBUG_UNICODE
 932                 DebugBreak();
 933 #endif
 934                 return NULL;
 935             }
 936
 937             ustr = malloc((cch + 1) * sizeof(wchar_t));
 938
 939             cch = MultiByteToWideChar(1252, 0, sanitized,
 940                                       cch_sanitized * sizeof(char), ustr, cch);
 941             ustr[cch] = 0;
 942         } else {
 943             return NULL;
 944         }
 945     } else {
 946
 947         ustr = malloc((cch + 1) * sizeof(wchar_t));
 948
 949         cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 950                                   cch_src * sizeof(char), ustr, cch);
 951         ustr[cch] = 0;
 952
 953         if (!cm_is_valid_utf16(ustr, cch)) {
 954             cm_unichar_t * us = NULL;
 955             int cch_s;
 956
 957             cch_s = sanitize_utf16string(ustr, cch, NULL, 0);
 958             if (cch_s != 0) {
 959                 us = malloc(cch_s * sizeof(wchar_t));
 960                 cch_s = sanitize_utf16string(ustr, cch, us, cch_s);
 961             }
 962
 963             if (cch_s != 0) {
 964                 free(ustr);
 965                 ustr = us;
 966                 us = NULL;
 967             } else {
 968                 if (us)
 969                     free(us);
 970                 free(ustr);
 971                 ustr = NULL;
 972             }
 973         }
 974     }
 975
 976     if (pcch_dest)
 977         *pcch_dest = cch;
 978
 979     return ustr;
 980 }
 981
 982
 983
 984 /* \brief Normalize a UTF-8 string.
 985
 986    \param[in] src String to normalize.
 987
 988    \param[in] cch_src : Count of characters in src.  If this value is
 989        -1, then src is assumed to be NULL terminated.  The translated
 990        string will be NULL terminated only if this is -1 or the count
 991        includes the terminating NULL.
 992
 993    \param[out] adest : Destination string.  Only considered valid if
 994        \a cch_adest is non-zero.
 995
 996    \param[in] cch_adest : Number of characters in the destination
 997        string.  If this is zero, then the return value is the number
 998        of bytes required.
 999
1000    \return If \a cch_adest is non-zero, then the return value is the
1001        number of bytes stored into adest.  If \a cch_adest is zero,
1002        then the return value is the number of bytes required.  In both
1003        cases, the return value is 0 if the call was unsuccessful.
1004  */
1005 long cm_NormalizeUtf8String(const char * src, int cch_src,
1006                             char * adest, int cch_adest)
1007 {
1008     wchar_t wsrcbuf[NLSMAXCCH];
1009     wchar_t *wnorm;
1010     int cch;
1011     int cch_norm;
1012
1013     if (!nls_init)
1014         cm_InitNormalization();
1015
1016     /* Get some edge cases out first, so we don't have to worry about
1017        cch_src being 0 etc. */
1018     if (cch_src == 0) {
1019         return 0;
1020     } else if (*src == '\0') {
1021         if (cch_adest >= 1)
1022             *adest = '\0';
1023         return 1;
1024     }
1025
1026     if (cch_src == -1) {
1027         cch_src = (int)strlen(src) + 1;
1028     }
1029
1030     cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
1031                               cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
1032
1033     if (cch != 0 && !cm_is_valid_utf16(wsrcbuf, cch)) {
1034         wchar_t wsanitized[NLSMAXCCH];
1035
1036         cch = sanitize_utf16string(wsrcbuf, cch, wsanitized, NLSMAXCCH);
1037         if (cch != 0) {
1038             memcpy(wsrcbuf, wsanitized, cch * sizeof(wchar_t));
1039         }
1040     }
1041
1042     if (cch == 0) {
1043         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
1044             char sanitized[NLSMAXCCH];
1045             int cch_sanitized;
1046
1047             /* If src doesn't have a unicode translation, then it
1048                wasn't valid UTF-8.  In this case, we assume that src
1049                is CP-1252 and then try to convert again.  But before
1050                that, we use a translation table to "sanitize" the
1051                input. */
1052
1053             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
1054                                                 sizeof(sanitized)/sizeof(char));
1055
1056             if (cch_sanitized == 0) {
1057 #ifdef DEBUG_UNICODE
1058                 DebugBreak();
1059 #endif
1060                 return 0;
1061             }
1062
1063             cch = MultiByteToWideChar(1252, 0, sanitized,
1064                                       cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
1065             if (cch == 0) {
1066                 /* Well, that didn't work either.  Something is very wrong. */
1067 #ifdef DEBUG_UNICODE
1068                 DebugBreak();
1069 #endif
1070                 return 0;
1071             }
1072         } else {
1073             return 0;
1074         }
1075     }
1076
1077     cch_norm = 0;
1078     wnorm = NormalizeUtf16String(wsrcbuf, cch, NULL, &cch_norm);
1079     if (wnorm == NULL) {
1080 #ifdef DEBUG_UNICODE
1081         DebugBreak();
1082 #endif
1083         return 0;
1084     }
1085
1086     cch = WideCharToMultiByte(CP_UTF8, 0, wnorm,
1087                               cch_norm, adest, cch_adest * sizeof(char),
1088                               NULL, FALSE);
1089
1090     if (wnorm)
1091         free(wnorm);
1092
1093     return cch;
1094 }
1095
1096 /*! \brief Case insensitive comparison with specific length
1097
1098   \param[in] str1 First string to compare.  Assumed to be encoded in UTF-8.
1099
1100   \param[in] str2 Second string to compare.  Assumed to be encoded in UTF-8.
1101
1102   \param[in] n Max byte count.
1103
1104  */
1105 int cm_strnicmp_utf8(const char * str1, const char * str2, int n)
1106 {
1107     wchar_t wstr1[NLSMAXCCH];
1108     int len1;
1109     int len2;
1110     wchar_t wstr2[NLSMAXCCH];
1111     int rv;
1112
1113     if (!nls_init)
1114         cm_InitNormalization();
1115
1116     if (n == 0)
1117         return 0;
1118
1119     /* first check for NULL pointers (assume NULL < "") */
1120     if (str1 == NULL) {
1121         if (str2 == NULL)
1122             return 0;
1123         else
1124             return -1;
1125     } else if (str2 == NULL) {
1126         return 1;
1127     }
1128
1129     len1 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str1, n, wstr1, NLSMAXCCH);
1130     if (len1 == 0) {
1131 #ifdef DEBUG
1132         DebugBreak();
1133 #endif
1134         wstr1[0] = L'\0';
1135     }
1136
1137     len2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str2, n, wstr2, NLSMAXCCH);
1138     if (len2 == 0) {
1139 #ifdef DEBUG
1140         DebugBreak();
1141 #endif
1142         wstr2[0] = L'\0';
1143     }
1144
1145     rv = CompareStringW(nls_lcid, NORM_IGNORECASE, wstr1, len1, wstr2, len2);
1146     if (rv > 0)
1147         return (rv - 2);
1148     else {
1149 #ifdef DEBUG
1150         DebugBreak();
1151 #endif
1152         return 0;
1153     }
1154 }
1155
1156 int cm_strnicmp_utf16(const cm_unichar_t * str1, const cm_unichar_t * str2, int len)
1157 {
1158     int rv;
1159     size_t cch1;
1160     size_t cch2;
1161
1162     if (!nls_init)
1163         cm_InitNormalization();
1164
1165     if (len == 0)
1166         return 0;
1167
1168     /* first check for NULL pointers */
1169     if (str1 == NULL) {
1170         if (str2 == NULL)
1171             return 0;
1172         else
1173             return -1;
1174     } else if (str2 == NULL) {
1175         return 1;
1176     }
1177
1178     if (FAILED(StringCchLengthW(str1, len, &cch1)))
1179         cch1 = len;
1180
1181     if (FAILED(StringCchLengthW(str2, len, &cch2)))
1182         cch2 = len;
1183
1184     rv = CompareStringW(nls_lcid, NORM_IGNORECASE, str1, (int)cch1, str2, (int)cch2);
1185     if (rv > 0)
1186         return (rv - 2);
1187     else {
1188 #ifdef DEBUG
1189         DebugBreak();
1190 #endif
1191         return 0;
1192     }
1193 }
1194
1195 int cm_stricmp_utf16(const cm_unichar_t * str1, const cm_unichar_t * str2)
1196 {
1197     int rv;
1198
1199     if (!nls_init)
1200         cm_InitNormalization();
1201
1202     /* first check for NULL pointers */
1203     if (str1 == NULL) {
1204         if (str2 == NULL)
1205             return 0;
1206         else
1207             return -1;
1208     } else if (str2 == NULL) {
1209         return 1;
1210     }
1211
1212     rv = CompareStringW(nls_lcid, NORM_IGNORECASE, str1, -1, str2, -1);
1213     if (rv > 0)
1214         return (rv - 2);
1215     else {
1216 #ifdef DEBUG
1217         DebugBreak();
1218 #endif
1219         return 0;
1220     }
1221 }
1222
1223 cm_unichar_t *cm_strlwr_utf16(cm_unichar_t * str)
1224 {
1225     int rv;
1226     int len;
1227
1228     if (!nls_init)
1229         cm_InitNormalization();
1230
1231     len = (int)wcslen(str) + 1;
1232     rv = LCMapStringW(nls_lcid, LCMAP_LOWERCASE, str, len, str, len);
1233 #ifdef DEBUG
1234     if (rv == 0) {
1235         DebugBreak();
1236     }
1237 #endif
1238
1239     return str;
1240 }
1241
1242 cm_unichar_t *cm_strupr_utf16(cm_unichar_t * str)
1243 {
1244     int rv;
1245     int len;
1246
1247     if (!nls_init)
1248         cm_InitNormalization();
1249
1250     len = (int)wcslen(str) + 1;
1251     rv = LCMapStringW(nls_lcid, LCMAP_UPPERCASE, str, len, str, len);
1252 #ifdef DEBUG
1253     if (rv == 0) {
1254         DebugBreak();
1255     }
1256 #endif
1257
1258     return str;
1259 }
1260
1261
1262 int cm_stricmp_utf8(const char * str1, const char * str2)
1263 {
1264     wchar_t wstr1[NLSMAXCCH];
1265     int len1;
1266     int len2;
1267     wchar_t wstr2[NLSMAXCCH];
1268     int rv;
1269
1270     if (!nls_init)
1271         cm_InitNormalization();
1272
1273     /* first check for NULL pointers */
1274     if (str1 == NULL) {
1275         if (str2 == NULL)
1276             return 0;
1277         else
1278             return -1;
1279     } else if (str2 == NULL) {
1280         return 1;
1281     }
1282
1283     len1 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str1, -1, wstr1, NLSMAXCCH);
1284     if (len1 == 0) {
1285 #ifdef DEBUG
1286         DebugBreak();
1287 #endif
1288         wstr1[0] = L'\0';
1289     }
1290
1291     len2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str2, -1, wstr2, NLSMAXCCH);
1292     if (len2 == 0) {
1293 #ifdef DEBUG
1294         DebugBreak();
1295 #endif
1296         wstr2[0] = L'\0';
1297     }
1298
1299     rv = CompareStringW(nls_lcid, NORM_IGNORECASE, wstr1, len1, wstr2, len2);
1300     if (rv > 0)
1301         return (rv - 2);
1302     else {
1303 #ifdef DEBUG
1304         DebugBreak();
1305 #endif
1306         return 0;
1307     }
1308 }
1309
#if 0
/* Compiled out by the surrounding #if 0.

   Upper-cases the UTF-16 string wstr, whose buffer is cbstr bytes
   long, by mapping it into a temporary NLSMAXCCH-character buffer
   and copying the result back over wstr.  Returns wstr.

   NOTE(review): the result of LCMapStringW() is not checked before
   wstrd is copied back, and the mapped text is only NUL terminated
   if the first cbstr bytes of wstr contained a terminator -- confirm
   both before re-enabling this. */
wchar_t * strupr_utf16(wchar_t * wstr, size_t cbstr)
{
    wchar_t wstrd[NLSMAXCCH];
    int len;

    if (!nls_init)
        cm_InitNormalization();

    /* cbstr is a byte count; LCMapStringW() wants characters */
    len = cbstr / sizeof(wchar_t);
    len = LCMapStringW(nls_lcid, LCMAP_UPPERCASE, wstr, len, wstrd, NLSMAXCCH);
    StringCbCopyW(wstr, cbstr, wstrd);

    return wstr;
}
#endif
1326
1327 char * strupr_utf8(char * str, size_t cbstr)
1328 {
1329     wchar_t wstr[NLSMAXCCH];
1330     wchar_t wstrd[NLSMAXCCH];
1331     int len;
1332
1333     if (!nls_init)
1334         cm_InitNormalization();
1335
1336     len = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, -1, wstr, NLSMAXCCH);
1337     if (len == 0)
1338         return str;
1339
1340     len = LCMapStringW(nls_lcid, LCMAP_UPPERCASE, wstr, len, wstrd, NLSMAXCCH);
1341
1342     len = WideCharToMultiByte(CP_UTF8, 0, wstrd, -1, str, (int)cbstr, NULL, FALSE);
1343
1344     return str;
1345 }
1346
/*! \brief Advance to the next character in a UTF-8 string

  The lead byte at \a c determines how many continuation bytes may
  follow (1 for 0xc0-0xdf, 2 for 0xe0-0xef, 3 for 0xf0 and above).
  Only bytes that really are continuation bytes (10xxxxxx) are
  skipped, so a truncated sequence never carries the result past the
  terminating NUL.  For well-formed input the result is the start of
  the following character.

  \param[in] c Pointer to the first byte of a character.

  \return Pointer to the byte following the character at \a c.  A
      single-byte character, a NUL or a stray continuation byte
      advances by exactly one byte.
 */
char * char_next_utf8(const char * c)
{
    const unsigned char * p = (const unsigned char *) c;
    int trail;

    if ((*p & 0x80) == 0)
        return (char *) c + 1;

    switch (*p & 0xf0) {
    case 0xc0:
    case 0xd0:
        trail = 1;
        break;

    case 0xe0:
        trail = 2;
        break;

    case 0xf0:
        trail = 3;
        break;

    default:
        /* continuation byte where a lead byte was expected */
        trail = 0;
        break;
    }

    p++;
    while (trail > 0 && (*p & 0xc0) == 0x80) {
        p++;
        trail--;
    }

    return (char *) p;
}
1371
1372
/*! \brief Step back to the previous character in a UTF-8 string

  Starting with the byte just before \a c, walks backwards over
  continuation bytes (10xxxxxx) until a byte that is not one is
  found.

  \param[in] c Pointer just past the character to locate.  There is
      no lower bound on the search, so a non-continuation byte must
      exist before \a c.

  \return Pointer to the first non-continuation byte found.
 */
char * char_prev_utf8(const char * c)
{
    const unsigned char * p = (const unsigned char *) c - 1;

    while ((*p & 0xc0) == 0x80)
        p--;

    return (char *) p;
}
1388
/*! \brief Advance to the next character in a UTF-16 string

  \param[in] c Pointer to the first code unit of a character.

  \return \a c + 2 if \a c points at a complete surrogate pair (high
      surrogate 0xd800-0xdbff followed by low surrogate
      0xdc00-0xdfff), otherwise \a c + 1.  An unpaired high surrogate
      is stepped over as a single unit so that the result cannot move
      past a terminating NUL.
 */
wchar_t * char_next_utf16(const wchar_t * c)
{
    unsigned short lead = (unsigned short) *c;

    if (lead >= 0xd800 && lead <= 0xdbff) {
        unsigned short trail = (unsigned short) c[1];

        if (trail >= 0xdc00 && trail <= 0xdfff)
            return (wchar_t *) c + 2;
    }

    return (wchar_t *) c + 1;
}
1397
/*! \brief Step back to the previous character in a UTF-16 string

  \param[in] c Pointer just past the character to locate.

  \return \a c - 2 if the code unit before \a c is a low surrogate
      (0xdc00-0xdfff), otherwise \a c - 1.
 */
wchar_t * char_prev_utf16(const wchar_t * c)
{
    const wchar_t * prev = c - 1;
    unsigned short unit = (unsigned short) *prev;
    int is_low_surrogate = (unit >= 0xdc00 && unit <= 0xdfff);

    /* a low surrogate is the second half of a pair; back up once more */
    return (wchar_t *) (is_low_surrogate ? prev - 1 : prev);
}
1406
/*! \brief Find the first code unit of the character at a position

  \param[in] c Pointer to any code unit of a UTF-16 string.

  \return \a c - 1 if \a c points at a low surrogate (0xdc00-0xdfff),
      otherwise \a c itself.
 */
wchar_t * char_this_utf16(const wchar_t * c)
{
    unsigned short unit = (unsigned short) *c;
    int is_low_surrogate = (unit >= 0xdc00 && unit <= 0xdfff);

    /* a low surrogate is the second half of a pair; its start is one back */
    return (wchar_t *) (is_low_surrogate ? c - 1 : c);
}
1415
/*! \brief Check that a UTF-16 string contains no unpaired surrogates

  \param[in] c String to check.

  \param[in] cch Number of code units to examine.  If negative, \a c
      is taken to be NUL terminated and the terminator is included.

  \return 1 if every high surrogate (0xd800-0xdbff) is immediately
      followed by a low surrogate (0xdc00-0xdfff) inside the examined
      range and no low surrogate appears on its own; 0 otherwise.
 */
int cm_is_valid_utf16(const wchar_t * c, int cch)
{
    int i;

    if (cch < 0)
        cch = (int)(wcslen(c) + 1);

    i = 0;
    while (i < cch) {
        wchar_t unit = c[i];

        if (unit >= 0xd800 && unit < 0xdc00) {
            /* high surrogate: its partner must be inside the range */
            if (i + 1 >= cch)
                return 0;
            if (c[i + 1] < 0xdc00 || c[i + 1] > 0xdfff)
                return 0;
            i += 2;
        } else if (unit >= 0xdc00 && unit <= 0xdfff) {
            /* low surrogate without a preceding high surrogate */
            return 0;
        } else {
            i++;
        }
    }

    return 1;
}
1433
#ifdef DEBUG
/*! \brief Format the code units of a UTF-16 string as hex (DEBUG builds only)

  Each code unit is printed with the format "%04x" and the units are
  separated by commas.

  \param[in] c String to dump.

  \param[in] len Number of code units to dump.  If -1, \a c is taken
      to be NUL terminated and the terminator is not included.

  \return A buffer obtained from malloc() or wcsdup(), or NULL if the
      allocation failed.  A zero-length input yields the string
      "(empty)".
 */
wchar_t * cm_GetRawCharsAlloc(const wchar_t * c, int len)
{
    wchar_t * buf;
    wchar_t * cursor;
    size_t cb_left;
    int remaining;

    if (len == -1)
        len = wcslen(c);

    if (len == 0)
        return wcsdup(L"(empty)");

    /* Five elements per code unit: four hex digits followed by
       either the separator or, for the last unit, the terminator. */
    cb_left = len * 5 * sizeof(wchar_t);
    buf = malloc(cb_left);
    if (buf == NULL)
        return NULL;

    cursor = buf;
    remaining = len;
    while (remaining > 0) {
        /* both calls advance cursor and shrink cb_left as they go */
        StringCbPrintfExW(cursor, cb_left, &cursor, &cb_left, 0,
                         L"%04x", (int) *c);
        if (remaining > 1)
            StringCbCatExW(cursor, cb_left, L",", &cursor, &cb_left, 0);

        ++c;
        --remaining;
    }

    return buf;
}
#endif