src/WINNT/afsd/cm_nls.c

   1 /*
   2  * Copyright (c) 2008 Secure Endpoints Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person
   5  * obtaining a copy of this software and associated documentation
   6  * files (the "Software"), to deal in the Software without
   7  * restriction, including without limitation the rights to use, copy,
   8  * modify, merge, publish, distribute, sublicense, and/or sell copies
   9  * of the Software, and to permit persons to whom the Software is
  10  * furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include <windows.h>
  26 #include <stdlib.h>
  27 #include <wchar.h>
  28 #include <strsafe.h>
  29 #include <errno.h>
  30
  31 #include "cm_nls.h"
  32
  33 #ifdef DEBUG_UNICODE
  34 #include <assert.h>
  35 #endif
  36
  37 /* This is part of the Microsoft Internationalized Domain Name
  38    Mitigation APIs. */
  39 #include <normalization.h>
  40
  41 /* TODO: All the normalization and conversion code should NUL
  42    terminate destination strings. */
  43
/* Pointer to NormalizeString().  Starts out NULL and is filled in by
   cm_InitNormalization() with the export looked up in NLSDLLNAME. */
int
(WINAPI *pNormalizeString)( __in NORM_FORM NormForm,
                            __in_ecount(cwSrcLength) LPCWSTR lpSrcString,
                            __in int cwSrcLength,
                            __out_ecount(cwDstLength) LPWSTR lpDstString,
                            __in int cwDstLength ) = NULL;

/* Pointer to IsNormalizedString().  Starts out NULL and is filled in
   by cm_InitNormalization() together with pNormalizeString. */
BOOL
(WINAPI *pIsNormalizedString)( __in NORM_FORM NormForm,
                               __in_ecount(cwLength) LPCWSTR lpString,
                               __in int cwLength ) = NULL;


/* Name of the DLL the normalization entry points are loaded from. */
#define NLSDLLNAME "Normaliz.dll"
/* Size, in characters, of the fixed stack buffers used by the
   conversion routines in this file. */
#define NLSMAXCCH  1024
/* Slack, in characters, added to a size estimate when a normalization
   buffer is (re)allocated. */
#define NLSERRCCH  8

/* Normalization form passed to NormalizeString()/IsNormalizedString(). */
#define AFS_NORM_FORM NormalizationC
  62
  63 long cm_InitNormalization(void)
  64 {
  65     HMODULE h_Nls;
  66
  67     if (pNormalizeString != NULL)
  68         return 0;
  69
  70     h_Nls = LoadLibrary(NLSDLLNAME);
  71     if (h_Nls == INVALID_HANDLE_VALUE) {
  72         return 1;
  73     }
  74
  75     pNormalizeString =
  76         (int (WINAPI *)( NORM_FORM, LPCWSTR,
  77                          int, LPWSTR, int))
  78         GetProcAddress(h_Nls, "NormalizeString");
  79
  80     pIsNormalizedString =
  81         (BOOL
  82          (WINAPI *)( NORM_FORM, LPCWSTR, int ))
  83         GetProcAddress(h_Nls, "IsNormalizedString");
  84
  85     return (pNormalizeString && pIsNormalizedString);
  86 }
  87
  88 /* \brief Normalize a UTF-16 string.
  89
  90    If the supplied destination buffer is insufficient or NULL, then a
  91    new buffer will be allocated to hold the normalized string.
  92
  93    \param[in] src : Source UTF-16 string.  Length is specified in
  94        cch_src.
  95
  96    \param[in] cch_src : The character count in cch_src is assumed to
  97        be tight and include the terminating NULL character if there is
  98        one.  If the NULL is absent, the resulting string will not be
  99        NULL terminated.
 100
 101    \param[out] ext_dest : The destination buffer.  Can be NULL, in
 102        which case *pcch_dest MUST be 0.
 103
 104    \param[in,out] pcch_dest : On entry *pcch_dest contains a count of
 105        characters in the destination buffer.  On exit, it will contain
 106        a count of characters that were copied to the destination
 107        buffer.
 108
 109    Returns a pointer to the buffer containing the normalized string or
 110    NULL if the call was unsuccessful.  If the returned destination
 111    buffer is different from the supplied buffer and non-NULL, it
 112    should be freed using free().
 113 */
 114 static wchar_t *
 115 NormalizeUtf16String(const wchar_t * src, int cch_src, wchar_t * ext_dest, int *pcch_dest)
 116 {
 117 #ifdef DEBUG_UNICODE
 118     assert (pNormalizeString != NULL && pIsNormalizedString != NULL);
 119 #endif
 120
 121     if (cch_src == -1)
 122         cch_src = wcslen(src) + 1;
 123
 124     if ((pIsNormalizedString && (*pIsNormalizedString)(AFS_NORM_FORM, src, cch_src)) ||
 125         (!pNormalizeString)) {
 126
 127         if (ext_dest == NULL || *pcch_dest < cch_src) {
 128             ext_dest = malloc(cch_src * sizeof(wchar_t));
 129             *pcch_dest = cch_src;
 130         }
 131
 132         /* No need to or unable to normalize.  Just copy the string.
 133            Note that the string is not NUL terminated if the source
 134            string is not NUL terminated. */
 135
 136         if (ext_dest) {
 137             memcpy(ext_dest, src, cch_src * sizeof(wchar_t));
 138             *pcch_dest = cch_src;
 139         } else {
 140             *pcch_dest = 0;
 141         }
 142         return ext_dest;
 143
 144     } else {
 145
 146         int rv;
 147         DWORD gle;
 148         int tries = 10;
 149         wchar_t * dest;
 150         int cch_dest = *pcch_dest;
 151
 152         dest = ext_dest;
 153
 154         while (tries-- > 0) {
 155
 156             rv = (*pNormalizeString)(AFS_NORM_FORM, src, cch_src, dest, cch_dest);
 157
 158             if (rv <= 0 && (gle = GetLastError()) != ERROR_SUCCESS) {
 159                 if (gle == ERROR_INSUFFICIENT_BUFFER) {
 160
 161                     /* The buffer wasn't big enough.  We are going to
 162                        try allocating one. */
 163
 164                     cch_dest = (-rv) + NLSERRCCH;
 165                     goto cont;
 166
 167                 } else {
 168                     /* Something else is wrong */
 169                     break;
 170                 }
 171
 172             } else if (rv < 0) { /* rv < 0 && gle == ERROR_SUCCESS */
 173
 174                 /* Technically not one of the expected outcomes */
 175                 break;
 176
 177             } else {            /* rv > 0 || (rv == 0 && gle == ERROR_SUCCESS) */
 178
 179                 /* Possibly succeeded */
 180
 181                 if (rv == 0) { /* Succeeded and the return string is empty */
 182                     *pcch_dest = 0;
 183                     return dest;
 184                 }
 185
 186                 if (cch_dest == 0) {
 187                     /* Nope.  We only calculated the required size of the buffer */
 188
 189                     cch_dest = rv + NLSERRCCH;
 190                     goto cont;
 191                 }
 192
 193                 *pcch_dest = rv;
 194                 if (cch_dest > rv)
 195                     dest[rv] = 0;
 196                 else {
 197                     /* Can't NUL terminate */
 198                     cch_dest = max(rv,cch_dest) + NLSERRCCH;
 199                     goto cont;
 200                 }
 201
 202                 /* Success! */
 203                 return dest;
 204             }
 205
 206         cont:
 207             if (dest != ext_dest && dest)
 208                 free(dest);
 209             dest = malloc(cch_dest * sizeof(wchar_t));
 210         }
 211
 212         /* Failed */
 213
 214         if (dest != ext_dest && dest)
 215             free(dest);
 216
 217         *pcch_dest = 0;
 218         return NULL;
 219     }
 220 }
 221
 222 /*! \brief Normalize a Unicode string into a newly allocated buffer
 223
 224   The input string will be normalized using NFC.
 225
 226   \param[in] s UTF-16 string to be normalized.
 227
 228   \param[in] cch_src The number of characters in the input string.  If
 229       this is -1, then the input string is assumed to be NUL
 230       terminated.
 231
 232   \param[out] pcch_dest Receives the number of characters copied to
 233       the output buffer.  Note that the character count is the number
 234       of wchar_t characters copied, and not the count of Unicode code
 235       points.  This includes the terminating NUL if cch_src was -1 or
 236       included the terminating NUL.
 237
 238   \return A newly allocated buffer holding the normalized string or
 239       NULL if the call failed.
 240  */
 241 cm_normchar_t * cm_NormalizeStringAlloc(const cm_unichar_t * s, int cch_src, int *pcch_dest)
 242 {
 243     int cch_dest = 0;
 244     cm_normchar_t * r;
 245
 246     if (s == NULL || cch_src == 0 || *s == L'\0') {
 247         if (pcch_dest)
 248             *pcch_dest = ((cch_src != 0)? 1: 0);
 249         return wcsdup(L"");
 250     }
 251
 252     r = NormalizeUtf16String(s, cch_src, NULL, &cch_dest);
 253
 254     if (pcch_dest)
 255         *pcch_dest = cch_dest;
 256
 257     return r;
 258 }
 259
 260 int cm_NormalizeString(const cm_unichar_t * s, int cch_src,
 261                        cm_normchar_t * dest, int cch_dest)
 262 {
 263     int tcch = cch_dest;
 264     cm_normchar_t * r;
 265
 266     r = NormalizeUtf16String(s, cch_src, dest, &tcch);
 267
 268     if (r != dest) {
 269         /* The supplied buffer was insufficient */
 270         free(r);
 271         return 0;
 272     } else {
 273         return tcch;
 274     }
 275 }
 276
 277 /*! \brief Convert a UTF-16 string to a UTF-8 string using a newly allocated buffer
 278
 279   \param[in] s UTF-16 source string
 280
 281   \param[in] cch_src Number of characters in \a s. This can be set to
 282       -1 if \a s is NUL terminated.
 283
 284   \param[out] pcch_dest Receives a count of characters that were
 285       copied to the target buffer.
 286
 287   \return A newly allocated buffer holding the UTF-8 string.
 288
 289  */
 290 cm_utf8char_t * cm_Utf16ToUtf8Alloc(const cm_unichar_t * s, int cch_src, int *pcch_dest)
 291 {
 292     int cch_dest;
 293     cm_utf8char_t * dest;
 294
 295     if (s == NULL || cch_src == 0 || *s == L'\0') {
 296         if (pcch_dest)
 297             *pcch_dest = ((cch_src != 0)?1:0);
 298         return strdup("");
 299     }
 300
 301     cch_dest = WideCharToMultiByte(CP_UTF8, 0, s, cch_src, NULL, 0, NULL, FALSE);
 302
 303     if (cch_dest == 0) {
 304         if (pcch_dest)
 305             *pcch_dest = cch_dest;
 306         return NULL;
 307     }
 308
 309     dest = malloc((cch_dest + 1) * sizeof(cm_utf8char_t));
 310
 311     WideCharToMultiByte(CP_UTF8, 0, s, cch_src, dest, cch_dest, NULL, FALSE);
 312     dest[cch_dest] = 0;
 313
 314     if (pcch_dest)
 315         *pcch_dest = cch_dest;
 316
 317     return dest;
 318 }
 319
 320 int cm_Utf16ToUtf8(const cm_unichar_t * src, int cch_src,
 321                    cm_utf8char_t * dest, int cch_dest)
 322 {
 323     return WideCharToMultiByte(CP_UTF8, 0, src, cch_src, dest, cch_dest, NULL, FALSE);
 324 }
 325
 326 int cm_Utf16ToUtf16(const cm_unichar_t * src, int cch_src,
 327                     cm_unichar_t * dest, int cch_dest)
 328 {
 329     if (cch_src == -1) {
 330         StringCchCopyW(dest, cch_dest, src);
 331         return wcslen(dest) + 1;
 332     } else {
 333         int cch_conv = min(cch_src, cch_dest);
 334         memcpy(dest, src, cch_conv * sizeof(cm_unichar_t));
 335         return cch_conv;
 336     }
 337 }
 338
 339 /* \brief Normalize a UTF-16 string into a UTF-8 string.
 340
 341    \param[in] src : Source string.
 342
 343    \param[in] cch_src : Count of characters in src. If the count includes the
 344        NULL terminator, then the resulting string will be NULL
 345        terminated.  If it is -1, then src is assumed to be NULL
 346        terminated.
 347
 348    \param[out] adest : Destination buffer.
 349
 350    \param[in] cch_adest : Number of characters in the destination buffer.
 351
 352    Returns the number of characters stored into cch_adest. This will
 353    include the terminating NULL if cch_src included the terminating
 354    NULL or was -1.  If this is 0, then the operation was unsuccessful.
 355  */
 356 long cm_NormalizeUtf16StringToUtf8(const wchar_t * src, int cch_src,
 357                                    char * adest, int cch_adest)
 358 {
 359     if (cch_src < 0) {
 360         size_t cch;
 361
 362         if (FAILED(StringCchLengthW(src, NLSMAXCCH, &cch)))
 363             return E2BIG;
 364
 365         cch_src = cch+1;
 366     }
 367
 368     {
 369         wchar_t nbuf[NLSMAXCCH];
 370         wchar_t * normalized;
 371         int cch_norm = NLSMAXCCH;
 372
 373         normalized = NormalizeUtf16String(src, cch_src, nbuf, &cch_norm);
 374         if (normalized) {
 375             cch_adest = WideCharToMultiByte(CP_UTF8, 0, normalized, cch_norm,
 376                                             adest, cch_adest, NULL, 0);
 377
 378             if (normalized != nbuf && normalized)
 379                 free(normalized);
 380
 381             return cch_adest;
 382
 383         } else {
 384
 385             return 0;
 386
 387         }
 388     }
 389 }
 390
 391 #define ESCVAL 0x1000
 392 #define Esc(c) (ESCVAL + (short)(c))
 393 #define IS_ESCAPED(c) (((c) & ESCVAL) == ESCVAL)
 394
 395 /* \brief Character sanitization map for CP-1252
 396
 397    The following map indicates which characters should be escaped in
 398    the CP-1252 character map.  Characters that are documented as
 399    illegal characters in a file name are marked as escaped.  Escaped
 400    characters are marked using the ::Esc macro defined above.  The
 401    following exceptions apply:
 402
 403    - Path delimeters '\\' and '/' are NOT escaped because the
 404      sanitization map applies to paths.  While those characters are
 405      illegal in filenames, they are legal in paths.
 406
 407    - Wildcard characters '*' and '?' ARE escaped.  The document
 408      referred below does not specify these characters as invalid.
 409      Since no other escape mechanism exists, names containing
 410      wildcards are indistinguishable from actual wildcards used in SMB
 411      requests.
 412
 413    - Reserved names are not and cannot be represented in this map.
 414      Reserved names are :
 415
 416      CON, PRN, AUX, NUL, COM1, COM2, COM3, COM4, COM5, COM6, COM7,
 417      COM8, COM9, LPT1, LPT2, LPT3, LPT4, LPT5, LPT6, LPT7, LPT8, LPT9,
 418      CLOCK$
 419
 420    - Characters 0x80, 0x81, 0x8d, 0x8e, 0x8f, 0x90, 0x9d, 0x9e, 0x9f
 421      are also escaped because they are unused in CP-1252 and hence
 422      cannot be convered to a Unicode string.
 423
 424      Reserved names with extensions are also invalid. (i.e. NUL.txt)
 425
 426    \note The only bit we are actually interested in from the following
 427      table is the ESCVAL bit.  However, the characters themselves are
 428      included for ease of maintenance.
 429
 430    \see "Naming a File" topic in the Windows SDK.
 431  */
 432 static const short sanitized_escapes_1252[] = {
 433     Esc(0x00),Esc(0x01),Esc(0x02),Esc(0x03),Esc(0x04),Esc(0x05),Esc(0x06),Esc(0x07),
 434     Esc(0x08),Esc(0x09),Esc(0x0a),Esc(0x0b),Esc(0x0c),Esc(0x0d),Esc(0x0e),Esc(0x0f),
 435     Esc(0x10),Esc(0x11),Esc(0x12),Esc(0x13),Esc(0x14),Esc(0x15),Esc(0x16),Esc(0x17),
 436     Esc(0x18),Esc(0x19),Esc(0x1a),Esc(0x1b),Esc(0x1c),Esc(0x1d),Esc(0x1e),Esc(0x1f),
 437     ' ','!',Esc('"'),'#','$','%','&','\'','(',')',Esc('*'),'+',',','-','.','/',
 438     '0','1','2','3','4','5','6','7','8','9',Esc(':'),';',Esc('<'),'=',Esc('>'),Esc('?'),
 439     '@','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
 440     'P','Q','R','S','T','U','V','W','X','Y','Z','[','\\',']','^','_',
 441     '`','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o',
 442     'p','q','r','s','t','u','v','w','x','y','z','{',Esc('|'),'}','~',Esc(0x7f),
 443     Esc(0x80),Esc(0x81),0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,Esc(0x8d),Esc(0x8e),Esc(0x8f),
 444     Esc(0x90),0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,Esc(0x9d),Esc(0x9e),0x9f,
 445     0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
 446     0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
 447     0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
 448     0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf,
 449     0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
 450     0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff
 451 };
 452
 453 static int sanitize_bytestring(const char * src, int cch_src,
 454                                char * odest, int cch_dest)
 455 {
 456     char * dest = odest;
 457     while (cch_src > 0 && *src && cch_dest > 0) {
 458
 459         unsigned short rc;
 460
 461         rc = sanitized_escapes_1252[*src];
 462         if (IS_ESCAPED(rc)) {
 463             static const char hex[] =
 464                 {'0','1','2','3','4','5','6','7',
 465                  '8','9','a','b','c','d','e','f'};
 466
 467             if (cch_dest < 3) {
 468                 *dest++ = '\0';
 469                 return 0;
 470             }
 471
 472             *dest++ = '%';
 473             *dest++ = hex[(((int)*src) >> 4) & 0x0f];
 474             *dest++ = hex[(((int)*src) & 0x0f)];
 475             cch_dest -= 3;
 476
 477         } else {
 478             *dest++ = *src;
 479             cch_dest--;
 480         }
 481
 482         cch_src--;
 483         src++;
 484     }
 485
 486     if (cch_src > 0 && cch_dest > 0) {
 487         *dest++ = '\0';
 488     }
 489
 490     return (int)(dest - odest);
 491 }
 492
 493 #undef Esc
 494 #undef IS_ESCAPED
 495 #undef ESCVAL
 496
 497 long cm_NormalizeUtf8StringToUtf16(const char * src, int cch_src,
 498                                    wchar_t * dest, int cch_dest)
 499 {
 500     wchar_t wsrcbuf[NLSMAXCCH];
 501     wchar_t *wnorm;
 502     int cch;
 503     int cch_norm;
 504
 505     /* Get some edge cases out first, so we don't have to worry about
 506        cch_src being 0 etc. */
 507     if (cch_src == 0) {
 508         return 0;
 509     } else if (*src == '\0') {
 510         if (cch_dest >= 1)
 511             *dest = L'\0';
 512         return 1;
 513     }
 514
 515     if (cch_src == -1) {
 516         cch_src = strlen(src) + 1;
 517     }
 518
 519     cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 520                               cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
 521
 522     if (cch == 0) {
 523         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
 524             char sanitized[NLSMAXCCH];
 525             int cch_sanitized;
 526
 527             /* If src doesn't have a unicode translation, then it
 528                wasn't valid UTF-8.  In this case, we assume that src
 529                is CP-1252 and then try to convert again.  But before
 530                that, we use a translation table to "sanitize" the
 531                input. */
 532
 533             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
 534                                                 sizeof(sanitized)/sizeof(char));
 535
 536             if (cch_sanitized == 0) {
 537 #ifdef DEBUG_UNICODE
 538                 DebugBreak();
 539 #endif
 540                 return 0;
 541             }
 542
 543             cch = MultiByteToWideChar(1252, 0, sanitized,
 544                                       cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
 545             if (cch == 0) {
 546                 /* Well, that didn't work either.  Something is very wrong. */
 547 #ifdef DEBUG_UNICODE
 548                 DebugBreak();
 549 #endif
 550                 return 0;
 551             }
 552         } else {
 553             return 0;
 554         }
 555     }
 556
 557     cch_norm = cch_dest;
 558     wnorm = NormalizeUtf16String(wsrcbuf, cch, dest, &cch_norm);
 559     if (wnorm == NULL) {
 560 #ifdef DEBUG_UNICODE
 561         DebugBreak();
 562 #endif
 563         return 0;
 564     }
 565
 566     if (wnorm != dest) {
 567         /* The buffer was insufficient */
 568         if (dest != NULL && cch_dest > 1) {
 569             *dest = L'\0';
 570             cch_norm = 0;
 571         }
 572
 573         free(wnorm);
 574     }
 575
 576     return cch_norm;
 577 }
 578
 579 cm_normchar_t *cm_NormalizeUtf8StringToUtf16Alloc(const cm_utf8char_t * src, int cch_src,
 580                                                   int *pcch_dest)
 581 {
 582     wchar_t wsrcbuf[NLSMAXCCH];
 583     wchar_t *wnorm;
 584     int cch;
 585     int cch_norm;
 586
 587     /* Get some edge cases out first, so we don't have to worry about
 588        cch_src being 0 etc. */
 589     if (cch_src == 0 || src == NULL || *src == '\0') {
 590         if (pcch_dest)
 591             *pcch_dest = ((cch_src != 0)? 1 : 0);
 592         return wcsdup(L"");
 593     }
 594
 595     if (cch_src == -1) {
 596         cch_src = strlen(src) + 1;
 597     }
 598
 599     cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 600                               cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
 601
 602     if (cch == 0) {
 603         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
 604             char sanitized[NLSMAXCCH];
 605             int cch_sanitized;
 606
 607             /* If src doesn't have a unicode translation, then it
 608                wasn't valid UTF-8.  In this case, we assume that src
 609                is CP-1252 and then try to convert again.  But before
 610                that, we use a translation table to "sanitize" the
 611                input. */
 612
 613             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
 614                                                 sizeof(sanitized)/sizeof(char));
 615
 616             if (cch_sanitized == 0) {
 617 #ifdef DEBUG_UNICODE
 618                 DebugBreak();
 619 #endif
 620                 return NULL;
 621             }
 622
 623             cch = MultiByteToWideChar(1252, 0, sanitized,
 624                                       cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
 625             if (cch == 0) {
 626                 /* Well, that didn't work either.  Something is very wrong. */
 627 #ifdef DEBUG_UNICODE
 628                 DebugBreak();
 629 #endif
 630                 return NULL;
 631             }
 632         } else {
 633             return NULL;
 634         }
 635     }
 636
 637     cch_norm = 0;
 638     wnorm = NormalizeUtf16String(wsrcbuf, cch, NULL, &cch_norm);
 639     if (wnorm == NULL) {
 640 #ifdef DEBUG_UNICODE
 641         DebugBreak();
 642 #endif
 643         return NULL;
 644     }
 645
 646     if (pcch_dest)
 647         *pcch_dest = cch_norm;
 648
 649     return wnorm;
 650 }
 651
 652 int cm_Utf8ToUtf16(const cm_utf8char_t * src, int cch_src,
 653                    cm_unichar_t * dest, int cch_dest)
 654 {
 655     int cch;
 656
 657     if (cch_src == -1) {
 658         cch_src = strlen(src) + 1;
 659     }
 660
 661     cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 662                               cch_src * sizeof(char), dest, cch_dest);
 663
 664     if (cch == 0) {
 665         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
 666             char sanitized[NLSMAXCCH];
 667             int cch_sanitized;
 668
 669             /* If src doesn't have a unicode translation, then it
 670                wasn't valid UTF-8.  In this case, we assume that src
 671                is CP-1252 and then try to convert again.  But before
 672                that, we use a translation table to "sanitize" the
 673                input. */
 674
 675             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
 676                                                 sizeof(sanitized)/sizeof(char));
 677
 678             if (cch_sanitized == 0) {
 679 #ifdef DEBUG_UNICODE
 680                 DebugBreak();
 681 #endif
 682                 return 0;
 683             }
 684
 685             cch = MultiByteToWideChar(1252, 0, sanitized,
 686                                       cch_sanitized * sizeof(char), dest, cch_dest);
 687             if (cch == 0) {
 688                 /* Well, that didn't work either.  Something is very wrong. */
 689 #ifdef DEBUG_UNICODE
 690                 DebugBreak();
 691 #endif
 692                 return 0;
 693             } else {
 694                 return cch;
 695             }
 696
 697         } else {
 698             return 0;
 699         }
 700     } else {
 701         return cch;
 702     }
 703 }
 704
 705 cm_unichar_t  * cm_Utf8ToUtf16Alloc(const cm_utf8char_t * src, int cch_src, int *pcch_dest)
 706 {
 707     cm_unichar_t * ustr = NULL;
 708     int cch;
 709
 710     if (cch_src == 0 || src == NULL || *src == '\0') {
 711         if (pcch_dest)
 712             *pcch_dest = ((cch_src != 0)? 1 : 0);
 713         return wcsdup(L"");
 714     }
 715
 716     if (cch_src == -1) {
 717         cch_src = strlen(src) + 1;
 718     }
 719
 720     cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 721                               cch_src * sizeof(char), NULL, 0);
 722
 723     if (cch == 0) {
 724         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
 725             char sanitized[NLSMAXCCH];
 726             int cch_sanitized;
 727
 728             /* If src doesn't have a unicode translation, then it
 729                wasn't valid UTF-8.  In this case, we assume that src
 730                is CP-1252 and then try to convert again.  But before
 731                that, we use a translation table to "sanitize" the
 732                input. */
 733
 734             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
 735                                                 sizeof(sanitized)/sizeof(char));
 736
 737             if (cch_sanitized == 0) {
 738 #ifdef DEBUG_UNICODE
 739                 DebugBreak();
 740 #endif
 741                 return NULL;
 742             }
 743
 744             cch = MultiByteToWideChar(1252, 0, sanitized,
 745                                       cch_sanitized * sizeof(char), NULL, 0);
 746             if (cch == 0) {
 747                 /* Well, that didn't work either.  Something is very wrong. */
 748 #ifdef DEBUG_UNICODE
 749                 DebugBreak();
 750 #endif
 751                 return NULL;
 752             }
 753
 754             ustr = malloc((cch + 1) * sizeof(wchar_t));
 755
 756             cch = MultiByteToWideChar(1252, 0, sanitized,
 757                                       cch_sanitized * sizeof(char), ustr, cch);
 758             ustr[cch] = 0;
 759         } else {
 760             return NULL;
 761         }
 762     } else {
 763
 764         ustr = malloc((cch + 1) * sizeof(wchar_t));
 765
 766         cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 767                                   cch_src * sizeof(char), ustr, cch);
 768         ustr[cch] = 0;
 769     }
 770
 771     if (pcch_dest)
 772         *pcch_dest = cch;
 773
 774     return ustr;
 775 }
 776
 777
 778
 779 /* \brief Normalize a UTF-8 string.
 780
 781    \param[in] src String to normalize.
 782
 783    \param[in] cch_src : Count of characters in src.  If this value is
 784        -1, then src is assumed to be NULL terminated.  The translated
 785        string will be NULL terminated only if this is -1 or the count
 786        includes the terminating NULL.
 787
 788    \param[out] adest : Destination string.  Only considered valid if
 789        \a cch_adest is non-zero.
 790
 791    \param[in] cch_adest : Number of characters in the destination
 792        string.  If this is zero, then the return value is the number
 793        of bytes required.
 794
 795    \return If \a cch_adest is non-zero, then the return value is the
 796        number of bytes stored into adest.  If \a cch_adest is zero,
 797        then the return value is the number of bytes required.  In both
 798        cases, the return value is 0 if the call was unsuccessful.
 799  */
 800 long cm_NormalizeUtf8String(const char * src, int cch_src,
 801                             char * adest, int cch_adest)
 802 {
 803     wchar_t wsrcbuf[NLSMAXCCH];
 804     wchar_t *wnorm;
 805     int cch;
 806     int cch_norm;
 807
 808     /* Get some edge cases out first, so we don't have to worry about
 809        cch_src being 0 etc. */
 810     if (cch_src == 0) {
 811         return 0;
 812     } else if (*src == '\0') {
 813         if (cch_adest >= 1)
 814             *adest = '\0';
 815         return 1;
 816     }
 817
 818     if (cch_src == -1) {
 819         cch_src = strlen(src) + 1;
 820     }
 821
 822     cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 823                               cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
 824
 825     if (cch == 0) {
 826         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
 827             char sanitized[NLSMAXCCH];
 828             int cch_sanitized;
 829
 830             /* If src doesn't have a unicode translation, then it
 831                wasn't valid UTF-8.  In this case, we assume that src
 832                is CP-1252 and then try to convert again.  But before
 833                that, we use a translation table to "sanitize" the
 834                input. */
 835
 836             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
 837                                                 sizeof(sanitized)/sizeof(char));
 838
 839             if (cch_sanitized == 0) {
 840 #ifdef DEBUG_UNICODE
 841                 DebugBreak();
 842 #endif
 843                 return 0;
 844             }
 845
 846             cch = MultiByteToWideChar(1252, 0, sanitized,
 847                                       cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
 848             if (cch == 0) {
 849                 /* Well, that didn't work either.  Something is very wrong. */
 850 #ifdef DEBUG_UNICODE
 851                 DebugBreak();
 852 #endif
 853                 return 0;
 854             }
 855         } else {
 856             return 0;
 857         }
 858     }
 859
 860     cch_norm = 0;
 861     wnorm = NormalizeUtf16String(wsrcbuf, cch, NULL, &cch_norm);
 862     if (wnorm == NULL) {
 863 #ifdef DEBUG_UNICODE
 864         DebugBreak();
 865 #endif
 866         return 0;
 867     }
 868
 869     cch = WideCharToMultiByte(CP_UTF8, 0, wnorm,
 870                               cch_norm, adest, cch_adest * sizeof(char),
 871                               NULL, FALSE);
 872
 873     if (wnorm)
 874         free(wnorm);
 875
 876     return cch;
 877 }
 878
 879 /*! \brief Case insensitive comparison with specific length
 880
 881   \param[in] str1 First string to compare.  Assumed to be encoded in UTF-8.
 882
 883   \param[in] str2 Second string to compare.  Assumed to be encoded in UTF-8.
 884
 885   \param[in] n Max byte count.
 886
 887  */
 888 int cm_strnicmp_utf8(const char * str1, const char * str2, int n)
 889 {
 890     wchar_t wstr1[NLSMAXCCH];
 891     int len1;
 892     int len2;
 893     wchar_t wstr2[NLSMAXCCH];
 894     int rv;
 895
 896     /* first check for NULL pointers (assume NULL < "") */
 897     if (str1 == NULL) {
 898         if (str2 == NULL)
 899             return 0;
 900         else
 901             return -1;
 902     } else if (str2 == NULL) {
 903         return 1;
 904     }
 905
 906     len1 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str1, n, wstr1, NLSMAXCCH);
 907     if (len1 == 0) {
 908 #ifdef DEBUG
 909         DebugBreak();
 910 #endif
 911         wstr1[0] = L'\0';
 912     }
 913
 914     len2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str2, n, wstr2, NLSMAXCCH);
 915     if (len2 == 0) {
 916 #ifdef DEBUG
 917         DebugBreak();
 918 #endif
 919         wstr2[0] = L'\0';
 920     }
 921
 922     rv = CompareStringW(LOCALE_INVARIANT, NORM_IGNORECASE, wstr1, len1, wstr2, len2);
 923     if (rv > 0)
 924         return (rv - 2);
 925     else {
 926 #ifdef DEBUG
 927         DebugBreak();
 928 #endif
 929         return 0;
 930     }
 931 }
 932
 933 int cm_strnicmp_utf16(const cm_unichar_t * str1, const cm_unichar_t * str2, int len)
 934 {
 935     int rv;
 936     size_t cch1;
 937     size_t cch2;
 938
 939     /* first check for NULL pointers */
 940     if (str1 == NULL) {
 941         if (str2 == NULL)
 942             return 0;
 943         else
 944             return -1;
 945     } else if (str2 == NULL) {
 946         return 1;
 947     }
 948
 949     if (FAILED(StringCchLengthW(str1, len, &cch1)))
 950         cch1 = len;
 951
 952     if (FAILED(StringCchLengthW(str2, len, &cch2)))
 953         cch2 = len;
 954
 955     rv = CompareStringW(LOCALE_INVARIANT, NORM_IGNORECASE, str1, cch1, str2, cch2);
 956     if (rv > 0)
 957         return (rv - 2);
 958     else {
 959 #ifdef DEBUG
 960         DebugBreak();
 961 #endif
 962         return 0;
 963     }
 964 }
 965
 966 int cm_stricmp_utf16(const cm_unichar_t * str1, const cm_unichar_t * str2)
 967 {
 968     int rv;
 969
 970     /* first check for NULL pointers */
 971     if (str1 == NULL) {
 972         if (str2 == NULL)
 973             return 0;
 974         else
 975             return -1;
 976     } else if (str2 == NULL) {
 977         return 1;
 978     }
 979
 980     rv = CompareStringW(LOCALE_INVARIANT, NORM_IGNORECASE, str1, -1, str2, -1);
 981     if (rv > 0)
 982         return (rv - 2);
 983     else {
 984 #ifdef DEBUG
 985         DebugBreak();
 986 #endif
 987         return 0;
 988     }
 989 }
 990
 991 cm_unichar_t *cm_strlwr_utf16(cm_unichar_t * str)
 992 {
 993     int rv;
 994     int len;
 995
 996     len = wcslen(str) + 1;
 997     rv = LCMapStringW(LOCALE_INVARIANT, LCMAP_LOWERCASE, str, len, str, len);
 998 #ifdef DEBUG
 999     if (rv == 0) {
1000         DebugBreak();
1001     }
1002 #endif
1003
1004     return str;
1005 }
1006
1007 cm_unichar_t *cm_strupr_utf16(cm_unichar_t * str)
1008 {
1009     int rv;
1010     int len;
1011
1012     len = wcslen(str) + 1;
1013     rv = LCMapStringW(LOCALE_INVARIANT, LCMAP_UPPERCASE, str, len, str, len);
1014 #ifdef DEBUG
1015     if (rv == 0) {
1016         DebugBreak();
1017     }
1018 #endif
1019
1020     return str;
1021 }
1022
1023
1024 int cm_stricmp_utf8(const char * str1, const char * str2)
1025 {
1026     wchar_t wstr1[NLSMAXCCH];
1027     int len1;
1028     int len2;
1029     wchar_t wstr2[NLSMAXCCH];
1030     int rv;
1031
1032     /* first check for NULL pointers */
1033     if (str1 == NULL) {
1034         if (str2 == NULL)
1035             return 0;
1036         else
1037             return -1;
1038     } else if (str2 == NULL) {
1039         return 1;
1040     }
1041
1042     len1 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str1, -1, wstr1, NLSMAXCCH);
1043     if (len1 == 0) {
1044 #ifdef DEBUG
1045         DebugBreak();
1046 #endif
1047         wstr1[0] = L'\0';
1048     }
1049
1050     len2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str2, -1, wstr2, NLSMAXCCH);
1051     if (len2 == 0) {
1052 #ifdef DEBUG
1053         DebugBreak();
1054 #endif
1055         wstr2[0] = L'\0';
1056     }
1057
1058     rv = CompareStringW(LOCALE_INVARIANT, NORM_IGNORECASE, wstr1, len1, wstr2, len2);
1059     if (rv > 0)
1060         return (rv - 2);
1061     else {
1062 #ifdef DEBUG
1063         DebugBreak();
1064 #endif
1065         return 0;
1066     }
1067 }
1068
#if 0
/* Compiled out.  Upper-cases the UTF-16 buffer wstr of cbstr bytes
   through a temporary buffer of NLSMAXCCH characters and copies the
   result back with StringCbCopyW().  Returns wstr. */
wchar_t * strupr_utf16(wchar_t * wstr, size_t cbstr)
{
    wchar_t upper[NLSMAXCCH];
    int cch = cbstr / sizeof(wchar_t);

    cch = LCMapStringW(LOCALE_INVARIANT, LCMAP_UPPERCASE, wstr, cch, upper, NLSMAXCCH);
    StringCbCopyW(wstr, cbstr, upper);

    return wstr;
}
#endif
1082
1083 char * strupr_utf8(char * str, size_t cbstr)
1084 {
1085     wchar_t wstr[NLSMAXCCH];
1086     wchar_t wstrd[NLSMAXCCH];
1087     int len;
1088
1089     len = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, -1, wstr, NLSMAXCCH);
1090     if (len == 0)
1091         return str;
1092
1093     len = LCMapStringW(LOCALE_INVARIANT, LCMAP_UPPERCASE, wstr, len, wstrd, NLSMAXCCH);
1094
1095     len = WideCharToMultiByte(CP_UTF8, 0, wstrd, -1, str, cbstr, NULL, FALSE);
1096
1097     return str;
1098 }
1099
/*! \brief Advance to the next character of a UTF-8 string

  The number of trailing bytes is taken from the lead byte at \a c
  (0xC0-0xDF: one, 0xE0-0xEF: two, 0xF0-0xFF: three).  Only bytes
  that really are continuation bytes (10xxxxxx) are skipped, so a
  sequence that is cut short, for example by the terminating NUL,
  never moves the result beyond the byte that ended it.  ASCII
  bytes (including NUL) and stray continuation bytes advance by
  exactly one byte.

  \param[in] c Pointer to the first byte of a character in a NUL
      terminated UTF-8 string.

  \return Pointer to the byte following the character at \a c.
 */
char * char_next_utf8(const char * c)
{
    const unsigned char * p = (const unsigned char *) c;
    int trail;

    if ((*p & 0x80) == 0)
        return (char *) c + 1;

    switch (*p & 0xf0) {
    case 0xc0:
    case 0xd0:
        trail = 1;
        break;

    case 0xe0:
        trail = 2;
        break;

    case 0xf0:
        trail = 3;
        break;

    default:
        /* stray continuation byte: step over it alone */
        trail = 0;
        break;
    }

    p++;

    /* p[0] may be read here because the byte before it was not NUL */
    while (trail > 0 && (*p & 0xc0) == 0x80) {
        p++;
        trail--;
    }

    return (char *) p;
}
1124
1125
/*! \brief Step back to the previous character of a UTF-8 string

  Moves one byte back from \a c and then keeps moving back for as
  long as the byte under the pointer is a continuation byte
  (10xxxxxx).  The start of the string is not known to this
  function, so the scan is not bounded.

  \param[in] c Pointer just past the character to step back over.

  \return Pointer to the first non-continuation byte found before \a c.
 */
char * char_prev_utf8(const char * c)
{
    const unsigned char * p = (const unsigned char *) c - 1;

    /* ASCII and lead bytes both fail this test and end the scan */
    while ((*p & 0xc0) == 0x80)
        p--;

    return (char *) p;
}
1141
/*! \brief Advance to the next character of a UTF-16 string

  A high surrogate (0xD800-0xDBFF) that is followed by a low
  surrogate (0xDC00-0xDFFF) is skipped as a pair.  Every other
  code unit, including a high surrogate that is not followed by a
  low surrogate, advances by one unit so that the terminating NUL
  after an unpaired high surrogate is not skipped.

  \param[in] c Pointer to a code unit in a NUL terminated UTF-16 string.

  \return Pointer to the code unit following the character at \a c.
 */
wchar_t * char_next_utf16(const wchar_t * c)
{
    unsigned short lead = (unsigned short) c[0];

    if (lead >= 0xd800 && lead <= 0xdbff) {
        /* c[1] may be read here because c[0] is not the terminator */
        unsigned short trail = (unsigned short) c[1];

        if (trail >= 0xdc00 && trail <= 0xdfff)
            return (wchar_t *) c + 2;
    }

    return (wchar_t *) c + 1;
}
1150
/*! \brief Step back to the previous character of a UTF-16 string

  Moves one code unit back from \a c.  If that unit is a low
  surrogate (0xDC00-0xDFFF) the result is moved back one more unit;
  the unit found there is not examined.

  \param[in] c Pointer just past the character to step back over.

  \return Pointer to the start of the character preceding \a c.
 */
wchar_t * char_prev_utf16(const wchar_t * c)
{
    const wchar_t * prev = c - 1;
    unsigned short unit = (unsigned short) *prev;

    /* 110111xx xxxxxxxx is exactly the range 0xDC00-0xDFFF */
    if ((unit & 0xfc00) == 0xdc00)
        prev--;

    return (wchar_t *) prev;
}
1159
/*! \brief Find the start of the character containing a UTF-16 code unit

  If the code unit at \a c is a low surrogate (0xDC00-0xDFFF) the
  result points one unit before \a c; the unit found there is not
  examined.  Otherwise \a c itself is returned.

  \param[in] c Pointer to a code unit in a UTF-16 string.

  \return Pointer to the first code unit of the character at \a c.
 */
wchar_t * char_this_utf16(const wchar_t * c)
{
    unsigned short unit = (unsigned short) *c;

    /* 110111xx xxxxxxxx is exactly the range 0xDC00-0xDFFF */
    if ((unit & 0xfc00) == 0xdc00)
        return (wchar_t *) (c - 1);

    return (wchar_t *) c;
}
1168