src/WINNT/afsd/cm_nls.c

   1 /*
   2  * Copyright (c) 2008 Secure Endpoints Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person
   5  * obtaining a copy of this software and associated documentation
   6  * files (the "Software"), to deal in the Software without
   7  * restriction, including without limitation the rights to use, copy,
   8  * modify, merge, publish, distribute, sublicense, and/or sell copies
   9  * of the Software, and to permit persons to whom the Software is
  10  * furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include <windows.h>
  26 #include <stdlib.h>
  27 #include <wchar.h>
  28 #include <strsafe.h>
  29 #include <errno.h>
  30
  31 #include "cm_nls.h"
  32
  33 #ifdef DEBUG_UNICODE
  34 #include <assert.h>
  35 #endif
  36
  37 /* This is part of the Microsoft Internationalized Domain Name
  38    Mitigation APIs. */
  39 #include <normalization.h>
  40
  41 /* TODO: All the normalization and conversion code should NUL
  42    terminate destination strings. */
  43
  44 int
  45 (WINAPI *pNormalizeString)( __in NORM_FORM NormForm,
  46                             __in_ecount(cwSrcLength) LPCWSTR lpSrcString,
  47                             __in int cwSrcLength,
  48                             __out_ecount(cwDstLength) LPWSTR lpDstString,
  49                             __in int cwDstLength ) = NULL;
  50
  51 BOOL
  52 (WINAPI *pIsNormalizedString)( __in NORM_FORM NormForm,
  53                                __in_ecount(cwLength) LPCWSTR lpString,
  54                                __in int cwLength ) = NULL;
  55
  56
  57 #define NLSDLLNAME "Normaliz.dll"
  58 #define NLSMAXCCH  1024
  59 #define NLSERRCCH  8
  60
  61 #define AFS_NORM_FORM NormalizationC
  62
  63 long cm_InitNormalization(void)
  64 {
  65     HMODULE h_Nls;
  66
  67     if (pNormalizeString != NULL)
  68         return 0;
  69
  70     h_Nls = LoadLibrary(NLSDLLNAME);
  71     if (h_Nls == INVALID_HANDLE_VALUE) {
  72         return 1;
  73     }
  74
  75     pNormalizeString =
  76         (int (WINAPI *)( NORM_FORM, LPCWSTR,
  77                          int, LPWSTR, int))
  78         GetProcAddress(h_Nls, "NormalizeString");
  79
  80     pIsNormalizedString =
  81         (BOOL
  82          (WINAPI *)( NORM_FORM, LPCWSTR, int ))
  83         GetProcAddress(h_Nls, "IsNormalizedString");
  84
  85     return (pNormalizeString && pIsNormalizedString);
  86 }
  87
  88 /* \brief Normalize a UTF-16 string.
  89
  90    If the supplied destination buffer is insufficient or NULL, then a
  91    new buffer will be allocated to hold the normalized string.
  92
  93    \param[in] src : Source UTF-16 string.  Length is specified in
  94        cch_src.
  95
  96    \param[in] cch_src : The character count in cch_src is assumed to
  97        be tight and include the terminating NULL character if there is
  98        one.  If the NULL is absent, the resulting string will not be
  99        NULL terminated.
 100
 101    \param[out] ext_dest : The destination buffer.  Can be NULL, in
 102        which case *pcch_dest MUST be 0.
 103
 104    \param[in,out] pcch_dest : On entry *pcch_dest contains a count of
 105        characters in the destination buffer.  On exit, it will contain
 106        a count of characters that were copied to the destination
 107        buffer.
 108
 109    Returns a pointer to the buffer containing the normalized string or
 110    NULL if the call was unsuccessful.  If the returned destination
 111    buffer is different from the supplied buffer and non-NULL, it
 112    should be freed using free().
 113 */
 114 static wchar_t *
 115 NormalizeUtf16String(const wchar_t * src, int cch_src, wchar_t * ext_dest, int *pcch_dest)
 116 {
 117 #ifdef DEBUG_UNICODE
 118     assert (pNormalizeString != NULL && pIsNormalizedString != NULL);
 119 #endif
 120
 121     if (cch_src == -1)
 122         cch_src = wcslen(src) + 1;
 123
 124     if ((pIsNormalizedString && (*pIsNormalizedString)(AFS_NORM_FORM, src, cch_src)) ||
 125         (!pNormalizeString)) {
 126
 127         if (ext_dest == NULL || *pcch_dest < cch_src) {
 128             ext_dest = malloc(cch_src * sizeof(wchar_t));
 129             *pcch_dest = cch_src;
 130         }
 131
 132         /* No need to or unable to normalize.  Just copy the string.
 133            Note that the string is not NUL terminated if the source
 134            string is not NUL terminated. */
 135
 136         if (ext_dest) {
 137             memcpy(ext_dest, src, cch_src * sizeof(wchar_t));
 138             *pcch_dest = cch_src;
 139         } else {
 140             *pcch_dest = 0;
 141         }
 142         return ext_dest;
 143
 144     } else {
 145
 146         int rv;
 147         DWORD gle;
 148         int tries = 10;
 149         wchar_t * dest;
 150         int cch_dest = *pcch_dest;
 151
 152         dest = ext_dest;
 153
 154         while (tries-- > 0) {
 155
 156             rv = (*pNormalizeString)(AFS_NORM_FORM, src, cch_src, dest, cch_dest);
 157
 158             if (rv <= 0 && (gle = GetLastError()) != ERROR_SUCCESS) {
 159                 if (gle == ERROR_INSUFFICIENT_BUFFER) {
 160
 161                     /* The buffer wasn't big enough.  We are going to
 162                        try allocating one. */
 163
 164                     cch_dest = (-rv) + NLSERRCCH;
 165                     goto cont;
 166
 167                 } else {
 168                     /* Something else is wrong */
 169                     break;
 170                 }
 171
 172             } else if (rv < 0) { /* rv < 0 && gle == ERROR_SUCCESS */
 173
 174                 /* Technically not one of the expected outcomes */
 175                 break;
 176
 177             } else {            /* rv > 0 || (rv == 0 && gle == ERROR_SUCCESS) */
 178
 179                 /* Possibly succeeded */
 180
 181                 if (rv == 0) { /* Succeeded and the return string is empty */
 182                     *pcch_dest = 0;
 183                     return dest;
 184                 }
 185
 186                 if (cch_dest == 0) {
 187                     /* Nope.  We only calculated the required size of the buffer */
 188
 189                     cch_dest = rv + NLSERRCCH;
 190                     goto cont;
 191                 }
 192
 193                 *pcch_dest = rv;
 194                 if (cch_dest > rv)
 195                     dest[rv] = 0;
 196                 else {
 197                     /* Can't NUL terminate */
 198                     cch_dest = max(rv,cch_dest) + NLSERRCCH;
 199                     goto cont;
 200                 }
 201
 202                 /* Success! */
 203                 return dest;
 204             }
 205
 206         cont:
 207             if (dest != ext_dest && dest)
 208                 free(dest);
 209             dest = malloc(cch_dest * sizeof(wchar_t));
 210         }
 211
 212         /* Failed */
 213
 214         if (dest != ext_dest && dest)
 215             free(dest);
 216
 217         *pcch_dest = 0;
 218         return NULL;
 219     }
 220 }
 221
 222 /*! \brief Normalize a Unicode string into a newly allocated buffer
 223
 224   The input string will be normalized using NFC.
 225
 226   \param[in] s UTF-16 string to be normalized.
 227
 228   \param[in] cch_src The number of characters in the input string.  If
 229       this is -1, then the input string is assumed to be NUL
 230       terminated.
 231
 232   \param[out] pcch_dest Receives the number of characters copied to
 233       the output buffer.  Note that the character count is the number
 234       of wchar_t characters copied, and not the count of Unicode code
 235       points.  This includes the terminating NUL if cch_src was -1 or
 236       included the terminating NUL.
 237
 238   \return A newly allocated buffer holding the normalized string or
 239       NULL if the call failed.
 240  */
 241 cm_normchar_t * cm_NormalizeStringAlloc(const cm_unichar_t * s, int cch_src, int *pcch_dest)
 242 {
 243     int cch_dest = 0;
 244     cm_normchar_t * r;
 245
 246     r = NormalizeUtf16String(s, cch_src, NULL, &cch_dest);
 247
 248     if (pcch_dest)
 249         *pcch_dest = cch_dest;
 250
 251     return r;
 252 }
 253
 254 int cm_NormalizeString(const cm_unichar_t * s, int cch_src,
 255                        cm_normchar_t * dest, int cch_dest)
 256 {
 257     int tcch = cch_dest;
 258     cm_normchar_t * r;
 259
 260     r = NormalizeUtf16String(s, cch_src, dest, &tcch);
 261
 262     if (r != dest) {
 263         /* The supplied buffer was insufficient */
 264         free(r);
 265         return 0;
 266     } else {
 267         return tcch;
 268     }
 269 }
 270
 271 /*! \brief Convert a UTF-16 string to a UTF-8 string using a newly allocated buffer
 272
 273   \param[in] s UTF-16 source string
 274
 275   \param[in] cch_src Number of characters in \a s. This can be set to
 276       -1 if \a s is NUL terminated.
 277
 278   \param[out] pcch_dest Receives a count of characters that were
 279       copied to the target buffer.
 280
 281   \return A newly allocated buffer holding the UTF-8 string.
 282
 283  */
 284 cm_utf8char_t * cm_Utf16ToUtf8Alloc(const cm_unichar_t * s, int cch_src, int *pcch_dest)
 285 {
 286     int cch_dest;
 287     cm_utf8char_t * dest;
 288
 289     cch_dest = WideCharToMultiByte(CP_UTF8, 0, s, cch_src, NULL, 0, NULL, FALSE);
 290
 291     if (cch_dest == 0) {
 292         if (pcch_dest)
 293             *pcch_dest = cch_dest;
 294         return NULL;
 295     }
 296
 297     dest = malloc((cch_dest + 1) * sizeof(cm_utf8char_t));
 298
 299     WideCharToMultiByte(CP_UTF8, 0, s, cch_src, dest, cch_dest, NULL, FALSE);
 300     dest[cch_dest] = 0;
 301
 302     if (pcch_dest)
 303         *pcch_dest = cch_dest;
 304
 305     return dest;
 306 }
 307
 308 int cm_Utf16ToUtf8(const cm_unichar_t * src, int cch_src,
 309                    cm_utf8char_t * dest, int cch_dest)
 310 {
 311     return WideCharToMultiByte(CP_UTF8, 0, src, cch_src, dest, cch_dest, NULL, FALSE);
 312 }
 313
 314 int cm_Utf16ToUtf16(const cm_unichar_t * src, int cch_src,
 315                     cm_unichar_t * dest, int cch_dest)
 316 {
 317     if (cch_src == -1) {
 318         StringCchCopyW(dest, cch_dest, src);
 319         return wcslen(dest) + 1;
 320     } else {
 321         int cch_conv = min(cch_src, cch_dest);
 322         memcpy(dest, src, cch_conv * sizeof(cm_unichar_t));
 323         return cch_conv;
 324     }
 325 }
 326
 327 /* \brief Normalize a UTF-16 string into a UTF-8 string.
 328
 329    \param[in] src : Source string.
 330
 331    \param[in] cch_src : Count of characters in src. If the count includes the
 332        NULL terminator, then the resulting string will be NULL
 333        terminated.  If it is -1, then src is assumed to be NULL
 334        terminated.
 335
 336    \param[out] adest : Destination buffer.
 337
 338    \param[in] cch_adest : Number of characters in the destination buffer.
 339
 340    Returns the number of characters stored into cch_adest. This will
 341    include the terminating NULL if cch_src included the terminating
 342    NULL or was -1.  If this is 0, then the operation was unsuccessful.
 343  */
 344 long cm_NormalizeUtf16StringToUtf8(const wchar_t * src, int cch_src,
 345                                    char * adest, int cch_adest)
 346 {
 347     if (cch_src < 0) {
 348         size_t cch;
 349
 350         if (FAILED(StringCchLengthW(src, NLSMAXCCH, &cch)))
 351             return E2BIG;
 352
 353         cch_src = cch+1;
 354     }
 355
 356     {
 357         wchar_t nbuf[NLSMAXCCH];
 358         wchar_t * normalized;
 359         int cch_norm = NLSMAXCCH;
 360
 361         normalized = NormalizeUtf16String(src, cch_src, nbuf, &cch_norm);
 362         if (normalized) {
 363             cch_adest = WideCharToMultiByte(CP_UTF8, 0, normalized, cch_norm,
 364                                             adest, cch_adest, NULL, 0);
 365
 366             if (normalized != nbuf && normalized)
 367                 free(normalized);
 368
 369             return cch_adest;
 370
 371         } else {
 372
 373             return 0;
 374
 375         }
 376     }
 377 }
 378
 379 #define ESCVAL 0x1000
 380 #define Esc(c) (ESCVAL + (short)(c))
 381 #define IS_ESCAPED(c) (((c) & ESCVAL) == ESCVAL)
 382
 383 /* \brief Character sanitization map for CP-1252
 384
 385    The following map indicates which characters should be escaped in
 386    the CP-1252 character map.  Characters that are documented as
 387    illegal characters in a file name are marked as escaped.  Escaped
 388    characters are marked using the ::Esc macro defined above.  The
 389    following exceptions apply:
 390
 391    - Path delimeters '\\' and '/' are NOT escaped because the
 392      sanitization map applies to paths.  While those characters are
 393      illegal in filenames, they are legal in paths.
 394
 395    - Wildcard characters '*' and '?' ARE escaped.  The document
 396      referred below does not specify these characters as invalid.
 397      Since no other escape mechanism exists, names containing
 398      wildcards are indistinguishable from actual wildcards used in SMB
 399      requests.
 400
 401    - Reserved names are not and cannot be represented in this map.
 402      Reserved names are :
 403
 404      CON, PRN, AUX, NUL, COM1, COM2, COM3, COM4, COM5, COM6, COM7,
 405      COM8, COM9, LPT1, LPT2, LPT3, LPT4, LPT5, LPT6, LPT7, LPT8, LPT9,
 406      CLOCK$
 407
 408    - Characters 0x80, 0x81, 0x8d, 0x8e, 0x8f, 0x90, 0x9d, 0x9e, 0x9f
 409      are also escaped because they are unused in CP-1252 and hence
 410      cannot be convered to a Unicode string.
 411
 412      Reserved names with extensions are also invalid. (i.e. NUL.txt)
 413
 414    \note The only bit we are actually interested in from the following
 415      table is the ESCVAL bit.  However, the characters themselves are
 416      included for ease of maintenance.
 417
 418    \see "Naming a File" topic in the Windows SDK.
 419  */
 420 static const short sanitized_escapes_1252[] = {
 421     Esc(0x00),Esc(0x01),Esc(0x02),Esc(0x03),Esc(0x04),Esc(0x05),Esc(0x06),Esc(0x07),
 422     Esc(0x08),Esc(0x09),Esc(0x0a),Esc(0x0b),Esc(0x0c),Esc(0x0d),Esc(0x0e),Esc(0x0f),
 423     Esc(0x10),Esc(0x11),Esc(0x12),Esc(0x13),Esc(0x14),Esc(0x15),Esc(0x16),Esc(0x17),
 424     Esc(0x18),Esc(0x19),Esc(0x1a),Esc(0x1b),Esc(0x1c),Esc(0x1d),Esc(0x1e),Esc(0x1f),
 425     ' ','!',Esc('"'),'#','$','%','&','\'','(',')',Esc('*'),'+',',','-','.','/',
 426     '0','1','2','3','4','5','6','7','8','9',Esc(':'),';',Esc('<'),'=',Esc('>'),Esc('?'),
 427     '@','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
 428     'P','Q','R','S','T','U','V','W','X','Y','Z','[','\\',']','^','_',
 429     '`','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o',
 430     'p','q','r','s','t','u','v','w','x','y','z','{',Esc('|'),'}','~',Esc(0x7f),
 431     Esc(0x80),Esc(0x81),0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,Esc(0x8d),Esc(0x8e),Esc(0x8f),
 432     Esc(0x90),0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,Esc(0x9d),Esc(0x9e),0x9f,
 433     0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
 434     0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
 435     0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
 436     0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf,
 437     0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
 438     0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff
 439 };
 440
 441 static int sanitize_bytestring(const char * src, int cch_src,
 442                                char * odest, int cch_dest)
 443 {
 444     char * dest = odest;
 445     while (cch_src > 0 && *src && cch_dest > 0) {
 446
 447         unsigned short rc;
 448
 449         rc = sanitized_escapes_1252[*src];
 450         if (IS_ESCAPED(rc)) {
 451             static const char hex[] =
 452                 {'0','1','2','3','4','5','6','7',
 453                  '8','9','a','b','c','d','e','f'};
 454
 455             if (cch_dest < 3) {
 456                 *dest++ = '\0';
 457                 return 0;
 458             }
 459
 460             *dest++ = '%';
 461             *dest++ = hex[(((int)*src) >> 4) & 0x0f];
 462             *dest++ = hex[(((int)*src) & 0x0f)];
 463             cch_dest -= 3;
 464
 465         } else {
 466             *dest++ = *src;
 467             cch_dest--;
 468         }
 469
 470         cch_src--;
 471         src++;
 472     }
 473
 474     if (cch_src > 0 && cch_dest > 0) {
 475         *dest++ = '\0';
 476     }
 477
 478     return (int)(dest - odest);
 479 }
 480
 481 #undef Esc
 482 #undef IS_ESCAPED
 483 #undef ESCVAL
 484
 485 long cm_NormalizeUtf8StringToUtf16(const char * src, int cch_src,
 486                                    wchar_t * dest, int cch_dest)
 487 {
 488     wchar_t wsrcbuf[NLSMAXCCH];
 489     wchar_t *wnorm;
 490     int cch;
 491     int cch_norm;
 492
 493     /* Get some edge cases out first, so we don't have to worry about
 494        cch_src being 0 etc. */
 495     if (cch_src == 0) {
 496         return 0;
 497     } else if (*src == '\0') {
 498         if (cch_dest >= 1)
 499             *dest = L'\0';
 500         return 1;
 501     }
 502
 503     if (cch_src == -1) {
 504         cch_src = strlen(src) + 1;
 505     }
 506
 507     cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 508                               cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
 509
 510     if (cch == 0) {
 511         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
 512             char sanitized[NLSMAXCCH];
 513             int cch_sanitized;
 514
 515             /* If src doesn't have a unicode translation, then it
 516                wasn't valid UTF-8.  In this case, we assume that src
 517                is CP-1252 and then try to convert again.  But before
 518                that, we use a translation table to "sanitize" the
 519                input. */
 520
 521             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
 522                                                 sizeof(sanitized)/sizeof(char));
 523
 524             if (cch_sanitized == 0) {
 525 #ifdef DEBUG_UNICODE
 526                 DebugBreak();
 527 #endif
 528                 return 0;
 529             }
 530
 531             cch = MultiByteToWideChar(1252, 0, sanitized,
 532                                       cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
 533             if (cch == 0) {
 534                 /* Well, that didn't work either.  Something is very wrong. */
 535 #ifdef DEBUG_UNICODE
 536                 DebugBreak();
 537 #endif
 538                 return 0;
 539             }
 540         } else {
 541             return 0;
 542         }
 543     }
 544
 545     cch_norm = cch_dest;
 546     wnorm = NormalizeUtf16String(wsrcbuf, cch, dest, &cch_norm);
 547     if (wnorm == NULL) {
 548 #ifdef DEBUG_UNICODE
 549         DebugBreak();
 550 #endif
 551         return 0;
 552     }
 553
 554     if (wnorm != dest) {
 555         /* The buffer was insufficient */
 556         if (dest != NULL && cch_dest > 1) {
 557             *dest = L'\0';
 558             cch_norm = 0;
 559         }
 560
 561         free(wnorm);
 562     }
 563
 564     return cch_norm;
 565 }
 566
 567 cm_normchar_t *cm_NormalizeUtf8StringToUtf16Alloc(const cm_utf8char_t * src, int cch_src,
 568                                                   int *pcch_dest)
 569 {
 570     wchar_t wsrcbuf[NLSMAXCCH];
 571     wchar_t *wnorm;
 572     int cch;
 573     int cch_norm;
 574
 575     /* Get some edge cases out first, so we don't have to worry about
 576        cch_src being 0 etc. */
 577     if (cch_src == 0) {
 578         return NULL;
 579     } else if (*src == '\0') {
 580         return wcsdup(L"");
 581     }
 582
 583     if (cch_src == -1) {
 584         cch_src = strlen(src) + 1;
 585     }
 586
 587     cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 588                               cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
 589
 590     if (cch == 0) {
 591         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
 592             char sanitized[NLSMAXCCH];
 593             int cch_sanitized;
 594
 595             /* If src doesn't have a unicode translation, then it
 596                wasn't valid UTF-8.  In this case, we assume that src
 597                is CP-1252 and then try to convert again.  But before
 598                that, we use a translation table to "sanitize" the
 599                input. */
 600
 601             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
 602                                                 sizeof(sanitized)/sizeof(char));
 603
 604             if (cch_sanitized == 0) {
 605 #ifdef DEBUG_UNICODE
 606                 DebugBreak();
 607 #endif
 608                 return NULL;
 609             }
 610
 611             cch = MultiByteToWideChar(1252, 0, sanitized,
 612                                       cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
 613             if (cch == 0) {
 614                 /* Well, that didn't work either.  Something is very wrong. */
 615 #ifdef DEBUG_UNICODE
 616                 DebugBreak();
 617 #endif
 618                 return NULL;
 619             }
 620         } else {
 621             return NULL;
 622         }
 623     }
 624
 625     cch_norm = 0;
 626     wnorm = NormalizeUtf16String(wsrcbuf, cch, NULL, &cch_norm);
 627     if (wnorm == NULL) {
 628 #ifdef DEBUG_UNICODE
 629         DebugBreak();
 630 #endif
 631         return NULL;
 632     }
 633
 634     if (pcch_dest)
 635         *pcch_dest = cch_norm;
 636
 637     return wnorm;
 638 }
 639
 640 int cm_Utf8ToUtf16(const cm_utf8char_t * src, int cch_src,
 641                    cm_unichar_t * dest, int cch_dest)
 642 {
 643     int cch;
 644
 645     if (cch_src == -1) {
 646         cch_src = strlen(src) + 1;
 647     }
 648
 649     cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 650                               cch_src * sizeof(char), dest, cch_dest);
 651
 652     if (cch == 0) {
 653         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
 654             char sanitized[NLSMAXCCH];
 655             int cch_sanitized;
 656
 657             /* If src doesn't have a unicode translation, then it
 658                wasn't valid UTF-8.  In this case, we assume that src
 659                is CP-1252 and then try to convert again.  But before
 660                that, we use a translation table to "sanitize" the
 661                input. */
 662
 663             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
 664                                                 sizeof(sanitized)/sizeof(char));
 665
 666             if (cch_sanitized == 0) {
 667 #ifdef DEBUG_UNICODE
 668                 DebugBreak();
 669 #endif
 670                 return 0;
 671             }
 672
 673             cch = MultiByteToWideChar(1252, 0, sanitized,
 674                                       cch_sanitized * sizeof(char), dest, cch_dest);
 675             if (cch == 0) {
 676                 /* Well, that didn't work either.  Something is very wrong. */
 677 #ifdef DEBUG_UNICODE
 678                 DebugBreak();
 679 #endif
 680                 return 0;
 681             } else {
 682                 return cch;
 683             }
 684
 685         } else {
 686             return 0;
 687         }
 688     } else {
 689         return cch;
 690     }
 691 }
 692
 693 cm_unichar_t  * cm_Utf8ToUtf16Alloc(const cm_utf8char_t * src, int cch_src, int *pcch_dest)
 694 {
 695     cm_unichar_t * ustr = NULL;
 696     int cch;
 697
 698     if (cch_src == -1) {
 699         cch_src = strlen(src) + 1;
 700     }
 701
 702     cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 703                               cch_src * sizeof(char), NULL, 0);
 704
 705     if (cch == 0) {
 706         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
 707             char sanitized[NLSMAXCCH];
 708             int cch_sanitized;
 709
 710             /* If src doesn't have a unicode translation, then it
 711                wasn't valid UTF-8.  In this case, we assume that src
 712                is CP-1252 and then try to convert again.  But before
 713                that, we use a translation table to "sanitize" the
 714                input. */
 715
 716             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
 717                                                 sizeof(sanitized)/sizeof(char));
 718
 719             if (cch_sanitized == 0) {
 720 #ifdef DEBUG_UNICODE
 721                 DebugBreak();
 722 #endif
 723                 return NULL;
 724             }
 725
 726             cch = MultiByteToWideChar(1252, 0, sanitized,
 727                                       cch_sanitized * sizeof(char), NULL, 0);
 728             if (cch == 0) {
 729                 /* Well, that didn't work either.  Something is very wrong. */
 730 #ifdef DEBUG_UNICODE
 731                 DebugBreak();
 732 #endif
 733                 return NULL;
 734             }
 735
 736             ustr = malloc((cch + 1) * sizeof(wchar_t));
 737
 738             cch = MultiByteToWideChar(1252, 0, sanitized,
 739                                       cch_sanitized * sizeof(char), ustr, cch);
 740             ustr[cch] = 0;
 741         } else {
 742             return NULL;
 743         }
 744     } else {
 745
 746         ustr = malloc((cch + 1) * sizeof(wchar_t));
 747
 748         cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 749                                   cch_src * sizeof(char), ustr, cch);
 750         ustr[cch] = 0;
 751     }
 752
 753     if (pcch_dest)
 754         *pcch_dest = cch;
 755
 756     return ustr;
 757 }
 758
 759
 760
 761 /* \brief Normalize a UTF-8 string.
 762
 763    \param[in] src String to normalize.
 764
 765    \param[in] cch_src : Count of characters in src.  If this value is
 766        -1, then src is assumed to be NULL terminated.  The translated
 767        string will be NULL terminated only if this is -1 or the count
 768        includes the terminating NULL.
 769
 770    \param[out] adest : Destination string.  Only considered valid if
 771        \a cch_adest is non-zero.
 772
 773    \param[in] cch_adest : Number of characters in the destination
 774        string.  If this is zero, then the return value is the number
 775        of bytes required.
 776
 777    \return If \a cch_adest is non-zero, then the return value is the
 778        number of bytes stored into adest.  If \a cch_adest is zero,
 779        then the return value is the number of bytes required.  In both
 780        cases, the return value is 0 if the call was unsuccessful.
 781  */
 782 long cm_NormalizeUtf8String(const char * src, int cch_src,
 783                             char * adest, int cch_adest)
 784 {
 785     wchar_t wsrcbuf[NLSMAXCCH];
 786     wchar_t *wnorm;
 787     int cch;
 788     int cch_norm;
 789
 790     /* Get some edge cases out first, so we don't have to worry about
 791        cch_src being 0 etc. */
 792     if (cch_src == 0) {
 793         return 0;
 794     } else if (*src == '\0') {
 795         if (cch_adest >= 1)
 796             *adest = '\0';
 797         return 1;
 798     }
 799
 800     if (cch_src == -1) {
 801         cch_src = strlen(src) + 1;
 802     }
 803
 804     cch = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src,
 805                               cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
 806
 807     if (cch == 0) {
 808         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
 809             char sanitized[NLSMAXCCH];
 810             int cch_sanitized;
 811
 812             /* If src doesn't have a unicode translation, then it
 813                wasn't valid UTF-8.  In this case, we assume that src
 814                is CP-1252 and then try to convert again.  But before
 815                that, we use a translation table to "sanitize" the
 816                input. */
 817
 818             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
 819                                                 sizeof(sanitized)/sizeof(char));
 820
 821             if (cch_sanitized == 0) {
 822 #ifdef DEBUG_UNICODE
 823                 DebugBreak();
 824 #endif
 825                 return 0;
 826             }
 827
 828             cch = MultiByteToWideChar(1252, 0, sanitized,
 829                                       cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
 830             if (cch == 0) {
 831                 /* Well, that didn't work either.  Something is very wrong. */
 832 #ifdef DEBUG_UNICODE
 833                 DebugBreak();
 834 #endif
 835                 return 0;
 836             }
 837         } else {
 838             return 0;
 839         }
 840     }
 841
 842     cch_norm = 0;
 843     wnorm = NormalizeUtf16String(wsrcbuf, cch, NULL, &cch_norm);
 844     if (wnorm == NULL) {
 845 #ifdef DEBUG_UNICODE
 846         DebugBreak();
 847 #endif
 848         return 0;
 849     }
 850
 851     cch = WideCharToMultiByte(CP_UTF8, 0, wnorm,
 852                               cch_norm, adest, cch_adest * sizeof(char),
 853                               NULL, FALSE);
 854
 855     if (wnorm)
 856         free(wnorm);
 857
 858     return cch;
 859 }
 860
 861 /*! \brief Case insensitive comparison with specific length
 862
 863   \param[in] str1 First string to compare.  Assumed to be encoded in UTF-8.
 864
 865   \param[in] str2 Second string to compare.  Assumed to be encoded in UTF-8.
 866
 867   \param[in] n Max byte count.
 868
 869  */
 870 int cm_strnicmp_utf8(const char * str1, const char * str2, int n)
 871 {
 872     wchar_t wstr1[NLSMAXCCH];
 873     int len1;
 874     int len2;
 875     wchar_t wstr2[NLSMAXCCH];
 876     int rv;
 877
 878     /* first check for NULL pointers (assume NULL < "") */
 879     if (str1 == NULL) {
 880         if (str2 == NULL)
 881             return 0;
 882         else
 883             return -1;
 884     } else if (str2 == NULL) {
 885         return 1;
 886     }
 887
 888     len1 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str1, n, wstr1, NLSMAXCCH);
 889     if (len1 == 0) {
 890 #ifdef DEBUG
 891         DebugBreak();
 892 #endif
 893         wstr1[0] = L'\0';
 894     }
 895
 896     len2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str2, n, wstr2, NLSMAXCCH);
 897     if (len2 == 0) {
 898 #ifdef DEBUG
 899         DebugBreak();
 900 #endif
 901         wstr2[0] = L'\0';
 902     }
 903
 904     rv = CompareStringW(LOCALE_INVARIANT, NORM_IGNORECASE, wstr1, len1, wstr2, len2);
 905     if (rv > 0)
 906         return (rv - 2);
 907     else {
 908 #ifdef DEBUG
 909         DebugBreak();
 910 #endif
 911         return 0;
 912     }
 913 }
 914
 915 int cm_strnicmp_utf16(const cm_unichar_t * str1, const cm_unichar_t * str2, int len)
 916 {
 917     int rv;
 918     size_t cch1;
 919     size_t cch2;
 920
 921     /* first check for NULL pointers */
 922     if (str1 == NULL) {
 923         if (str2 == NULL)
 924             return 0;
 925         else
 926             return -1;
 927     } else if (str2 == NULL) {
 928         return 1;
 929     }
 930
 931     if (FAILED(StringCchLengthW(str1, len, &cch1)))
 932         cch1 = len;
 933
 934     if (FAILED(StringCchLengthW(str2, len, &cch2)))
 935         cch2 = len;
 936
 937     rv = CompareStringW(LOCALE_INVARIANT, NORM_IGNORECASE, str1, cch1, str2, cch2);
 938     if (rv > 0)
 939         return (rv - 2);
 940     else {
 941 #ifdef DEBUG
 942         DebugBreak();
 943 #endif
 944         return 0;
 945     }
 946 }
 947
 948 int cm_stricmp_utf16(const cm_unichar_t * str1, const cm_unichar_t * str2)
 949 {
 950     int rv;
 951
 952     /* first check for NULL pointers */
 953     if (str1 == NULL) {
 954         if (str2 == NULL)
 955             return 0;
 956         else
 957             return -1;
 958     } else if (str2 == NULL) {
 959         return 1;
 960     }
 961
 962     rv = CompareStringW(LOCALE_INVARIANT, NORM_IGNORECASE, str1, -1, str2, -1);
 963     if (rv > 0)
 964         return (rv - 2);
 965     else {
 966 #ifdef DEBUG
 967         DebugBreak();
 968 #endif
 969         return 0;
 970     }
 971 }
 972
 973 cm_unichar_t *cm_strlwr_utf16(cm_unichar_t * str)
 974 {
 975     int rv;
 976     int len;
 977
 978     len = wcslen(str) + 1;
 979     rv = LCMapStringW(LOCALE_INVARIANT, LCMAP_LOWERCASE, str, len, str, len);
 980 #ifdef DEBUG
 981     if (rv == 0) {
 982         DebugBreak();
 983     }
 984 #endif
 985
 986     return str;
 987 }
 988
 989 cm_unichar_t *cm_strupr_utf16(cm_unichar_t * str)
 990 {
 991     int rv;
 992     int len;
 993
 994     len = wcslen(str) + 1;
 995     rv = LCMapStringW(LOCALE_INVARIANT, LCMAP_UPPERCASE, str, len, str, len);
 996 #ifdef DEBUG
 997     if (rv == 0) {
 998         DebugBreak();
 999     }
1000 #endif
1001
1002     return str;
1003 }
1004
1005
1006 int cm_stricmp_utf8(const char * str1, const char * str2)
1007 {
1008     wchar_t wstr1[NLSMAXCCH];
1009     int len1;
1010     int len2;
1011     wchar_t wstr2[NLSMAXCCH];
1012     int rv;
1013
1014     /* first check for NULL pointers */
1015     if (str1 == NULL) {
1016         if (str2 == NULL)
1017             return 0;
1018         else
1019             return -1;
1020     } else if (str2 == NULL) {
1021         return 1;
1022     }
1023
1024     len1 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str1, -1, wstr1, NLSMAXCCH);
1025     if (len1 == 0) {
1026 #ifdef DEBUG
1027         DebugBreak();
1028 #endif
1029         wstr1[0] = L'\0';
1030     }
1031
1032     len2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str2, -1, wstr2, NLSMAXCCH);
1033     if (len2 == 0) {
1034 #ifdef DEBUG
1035         DebugBreak();
1036 #endif
1037         wstr2[0] = L'\0';
1038     }
1039
1040     rv = CompareStringW(LOCALE_INVARIANT, NORM_IGNORECASE, wstr1, len1, wstr2, len2);
1041     if (rv > 0)
1042         return (rv - 2);
1043     else {
1044 #ifdef DEBUG
1045         DebugBreak();
1046 #endif
1047         return 0;
1048     }
1049 }
1050
#if 0
/* Disabled code: upper-case a UTF-16 buffer of cbstr bytes via a
   temporary buffer.

   The buffer is mapped with LCMapStringW() (LOCALE_INVARIANT,
   LCMAP_UPPERCASE) into a local array and copied back.  If the
   mapping fails the temporary holds nothing usable, so wstr is
   returned untouched.  The copy back is limited to the number of
   characters LCMapStringW() produced.  Returns wstr. */
wchar_t * strupr_utf16(wchar_t * wstr, size_t cbstr)
{
    wchar_t wstrd[NLSMAXCCH];
    int len;

    len = cbstr / sizeof(wchar_t);
    len = LCMapStringW(LOCALE_INVARIANT, LCMAP_UPPERCASE, wstr, len, wstrd, NLSMAXCCH);
    if (len <= 0)
        return wstr;

    StringCbCopyNW(wstr, cbstr, wstrd, len * sizeof(wchar_t));

    return wstr;
}
#endif
1064
1065 char * strupr_utf8(char * str, size_t cbstr)
1066 {
1067     wchar_t wstr[NLSMAXCCH];
1068     wchar_t wstrd[NLSMAXCCH];
1069     int len;
1070
1071     len = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, -1, wstr, NLSMAXCCH);
1072     if (len == 0)
1073         return str;
1074
1075     len = LCMapStringW(LOCALE_INVARIANT, LCMAP_UPPERCASE, wstr, len, wstrd, NLSMAXCCH);
1076
1077     len = WideCharToMultiByte(CP_UTF8, 0, wstrd, -1, str, cbstr, NULL, FALSE);
1078
1079     return str;
1080 }
1081
/* Return a pointer to the byte following the UTF-8 character at c.

   The lead byte determines how many continuation bytes (10xxxxxx)
   are expected: none for an ASCII byte, one for 110xxxxx, two for
   1110xxxx and three for 1111xxxx.  Only bytes that really are
   continuation bytes are skipped, so a truncated or malformed
   sequence never moves the pointer past the terminating NUL of the
   string.  A stray continuation byte in lead position is stepped
   over as a single byte.  When c points at the terminating NUL the
   result is c+1. */
char * char_next_utf8(const char * c)
{
    const unsigned char * p = (const unsigned char *) c;
    int trail;

    if ((*p & 0x80) == 0)
        trail = 0;              /* ASCII */
    else if ((*p & 0xe0) == 0xc0)
        trail = 1;
    else if ((*p & 0xf0) == 0xe0)
        trail = 2;
    else if ((*p & 0xf0) == 0xf0)
        trail = 3;
    else
        trail = 0;              /* stray continuation byte */

    p++;
    while (trail > 0 && (*p & 0xc0) == 0x80) {
        p++;
        trail--;
    }

    return (char *) p;
}
1106
1107
/* Return a pointer to the start of the UTF-8 character that ends
   just before c.

   Starting at the byte before c, the pointer is moved backwards for
   as long as it is on a continuation byte (10xxxxxx).  There is no
   lower bound on the scan; the caller has to make sure a
   non-continuation byte precedes c. */
char * char_prev_utf8(const char * c)
{
    const unsigned char * p = (const unsigned char *) c - 1;

    while ((*p & 0xc0) == 0x80)
        --p;

    return (char *) p;
}
1123
/* Return a pointer to the UTF-16 code unit following the character
   at c.

   Two units are skipped only when c points at a high surrogate
   (0xd800-0xdbff) that is really followed by a low surrogate
   (0xdc00-0xdfff).  An unpaired high surrogate is stepped over as a
   single unit, so one sitting directly before the terminating NUL
   does not carry the pointer past the end of the string. */
wchar_t * char_next_utf16(const wchar_t * c)
{
    unsigned short lead = (unsigned short) c[0];

    if (lead >= 0xd800 && lead <= 0xdbff) {
        /* c[0] is not the terminator, so c[1] is still in the string */
        unsigned short trail = (unsigned short) c[1];

        if (trail >= 0xdc00 && trail <= 0xdfff)
            return (wchar_t *) c + 2;
    }

    return (wchar_t *) c + 1;
}
1132
/* Return a pointer to the start of the UTF-16 character that ends
   just before c.

   The unit before c is inspected; if it is a low surrogate
   (0xdc00-0xdfff) the result is moved back one more unit.  The unit
   in front of the low surrogate is not examined. */
wchar_t * char_prev_utf16(const wchar_t * c)
{
    const wchar_t * prev = c - 1;
    unsigned short unit = (unsigned short) *prev;

    if (unit < 0xdc00 || unit > 0xdfff)
        return (wchar_t *) prev;

    return (wchar_t *) (prev - 1);
}
1141
/* Return a pointer to the start of the UTF-16 character that c
   points into.

   If c is on a low surrogate (0xdc00-0xdfff) the result is the unit
   before it; otherwise c itself is returned.  The unit in front of
   the low surrogate is not examined. */
wchar_t * char_this_utf16(const wchar_t * c)
{
    unsigned short unit = (unsigned short) *c;
    int on_low_surrogate = (unit >= 0xdc00 && unit <= 0xdfff);

    return (wchar_t *) (on_low_surrogate ? c - 1 : c);
}
1150