src/WINNT/afsd/cm_nls.c

   1 /*
   2  * Copyright (c) 2008 Secure Endpoints Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person
   5  * obtaining a copy of this software and associated documentation
   6  * files (the "Software"), to deal in the Software without
   7  * restriction, including without limitation the rights to use, copy,
   8  * modify, merge, publish, distribute, sublicense, and/or sell copies
   9  * of the Software, and to permit persons to whom the Software is
  10  * furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include <windows.h>
  26 #include <stdlib.h>
  27 #include <wchar.h>
  28 #include <strsafe.h>
  29 #include <errno.h>
  30
  31 #define DEBUG_UNICODE
  32
  33 /* This is part of the Microsoft Internationalized Domain Name
  34    Mitigation APIs. */
  35 #include <normalization.h>
  36
  37 int
  38 (WINAPI *pNormalizeString)( __in NORM_FORM NormForm,
  39                             __in_ecount(cwSrcLength) LPCWSTR lpSrcString,
  40                             __in int cwSrcLength,
  41                             __out_ecount(cwDstLength) LPWSTR lpDstString,
  42                             __in int cwDstLength ) = NULL;
  43
  44 BOOL
  45 (WINAPI *pIsNormalizedString)( __in NORM_FORM NormForm,
  46                                __in_ecount(cwLength) LPCWSTR lpString,
  47                                __in int cwLength ) = NULL;
  48
  49
  50 #define NLSDLLNAME "Normaliz.dll"
  51 #define NLSMAXCCH  1024
  52 #define NLSERRCCH  8
  53
  54 #define AFS_NORM_FORM NormalizationC
  55
  56 long cm_InitNormalization(void)
  57 {
  58     HMODULE h_Nls;
  59
  60     if (pNormalizeString != NULL)
  61         return 0;
  62
  63     h_Nls = LoadLibrary(NLSDLLNAME);
  64     if (h_Nls == INVALID_HANDLE_VALUE) {
  65         return 1;
  66     }
  67
  68     pNormalizeString = GetProcAddress(h_Nls, "NormalizeString");
  69     pIsNormalizedString = GetProcAddress(h_Nls, "IsNormalizedString");
  70
  71     return (pNormalizeString && pIsNormalizedString);
  72 }
  73
  74 /* \brief Normalize a UTF-16 string.
  75
  76    If the supplied destination buffer is insufficient or NULL, then a
  77    new buffer will be allocated to hold the normalized string.
  78
  79    \param[in] src : Source UTF-16 string.  Length is specified in
  80        cch_src.
  81
  82    \param[in] cch_src : The character count in cch_src is assumed to
  83        be tight and include the terminating NULL character if there is
  84        one.  If the NULL is absent, the resulting string will not be
  85        NULL terminated.
  86
  87    \param[out] ext_dest : The destination buffer.  Can be NULL, in
  88        which case *pcch_dest MUST be 0.
  89
  90    \param[in,out] pcch_dest : On entry *pcch_dest contains a count of
  91        characters in the destination buffer.  On exit, it will contain
  92        a count of characters that were copied to the destination
  93        buffer.
  94
  95    Returns a pointer to the buffer containing the normalized string or
  96    NULL if the call was unsuccessful.  If the returned destination
  97    buffer is different from the supplied buffer and non-NULL, it
  98    should be freed using free().
  99 */
 100 static wchar_t *
 101 NormalizeUtf16String(const wchar_t * src, int cch_src, wchar_t * ext_dest, int *pcch_dest)
 102 {
 103     if ((pIsNormalizedString && (*pIsNormalizedString)(AFS_NORM_FORM, src, cch_src)) ||
 104         (!pNormalizeString)) {
 105
 106         if (ext_dest == NULL || *pcch_dest < cch_src) {
 107             ext_dest = malloc(cch_src * sizeof(wchar_t));
 108             *pcch_dest = cch_src;
 109         }
 110
 111         /* No need to or unable to normalize.  Just copy the string.
 112            Note that the string is not necessarily NULL terminated. */
 113
 114         if (ext_dest) {
 115             memcpy(ext_dest, src, cch_src * sizeof(wchar_t));
 116             *pcch_dest = cch_src;
 117         } else {
 118             *pcch_dest = 0;
 119         }
 120         return ext_dest;
 121
 122     } else {
 123
 124         int rv;
 125         DWORD gle;
 126         int tries = 10;
 127         wchar_t * dest;
 128         int cch_dest = *pcch_dest;
 129
 130         dest = ext_dest;
 131
 132         while (tries-- > 0) {
 133
 134             rv = (*pNormalizeString)(AFS_NORM_FORM, src, cch_src, dest, cch_dest);
 135
 136             if (rv <= 0 && (gle = GetLastError()) != ERROR_SUCCESS) {
 137                 if (gle == ERROR_INSUFFICIENT_BUFFER) {
 138
 139                     /* The buffer wasn't big enough.  We are going to
 140                        try allocating one. */
 141
 142                     cch_dest = (-rv) + NLSERRCCH;
 143                     goto cont;
 144
 145                 } else {
 146                     /* Something else is wrong */
 147                     break;
 148                 }
 149
 150             } else if (rv < 0) { /* rv < 0 && gle == ERROR_SUCCESS */
 151
 152                 /* Technically not one of the expected outcomes */
 153                 break;
 154
 155             } else {            /* rv > 0 || (rv == 0 && gle == ERROR_SUCCESS) */
 156
 157                 /* Possibly succeeded */
 158
 159                 if (rv == 0) { /* Succeeded and the return string is empty */
 160                     *pcch_dest = 0;
 161                     return dest;
 162                 }
 163
 164                 if (cch_dest == 0) {
 165                     /* Nope.  We only calculated the required size of the buffer */
 166
 167                     cch_dest = rv + NLSERRCCH;
 168                     goto cont;
 169                 }
 170
 171                 *pcch_dest = rv;
 172
 173                 /* Success! */
 174                 return dest;
 175             }
 176
 177         cont:
 178             if (dest != ext_dest && dest)
 179                 free(dest);
 180             dest = malloc(cch_dest * sizeof(wchar_t));
 181         }
 182
 183         /* Failed */
 184
 185         if (dest != ext_dest && dest)
 186             free(dest);
 187
 188         *pcch_dest = 0;
 189         return NULL;
 190     }
 191 }
 192
 193 /* \brief Normalize a UTF-16 string into a UTF-8 string.
 194
 195    \param[in] src : Source string.
 196
 197    \param[in] cch_src : Count of characters in src. If the count includes the
 198        NULL terminator, then the resulting string will be NULL
 199        terminated.  If it is -1, then src is assumed to be NULL
 200        terminated.
 201
 202    \param[out] adest : Destination buffer.
 203
 204    \param[in] cch_adest : Number of characters in the destination buffer.
 205
 206    Returns the number of characters stored into cch_adest. This will
 207    include the terminating NULL if cch_src included the terminating
 208    NULL or was -1.  If this is 0, then the operation was unsuccessful.
 209  */
 210 long cm_NormalizeUtf16StringToUtf8(const wchar_t * src, int cch_src,
 211                                    char * adest, int cch_adest)
 212 {
 213     if (cch_src < 0) {
 214         size_t cch;
 215
 216         if (FAILED(StringCchLengthW(src, NLSMAXCCH, &cch)))
 217             return E2BIG;
 218
 219         cch_src = cch+1;
 220     }
 221
 222     {
 223         wchar_t nbuf[NLSMAXCCH];
 224         wchar_t * normalized;
 225         int cch_norm = NLSMAXCCH;
 226
 227         normalized = NormalizeUtf16String(src, cch_src, nbuf, &cch_norm);
 228         if (normalized) {
 229             cch_adest = WideCharToMultiByte(CP_UTF8, 0, normalized, cch_norm,
 230                                             adest, cch_adest, NULL, 0);
 231
 232             if (normalized != nbuf && normalized)
 233                 free(normalized);
 234
 235             return cch_adest;
 236
 237         } else {
 238
 239             return 0;
 240
 241         }
 242     }
 243 }
 244
 245 #define ESCVAL 0x1000
 246 #define Esc(c) (ESCVAL + (short)(c))
 247 #define IS_ESCAPED(c) (((c) & ESCVAL) == ESCVAL)
 248
 249 /* \brief Character sanitization map for CP-1252
 250
 251    The following map indicates which characters should be escaped in
 252    the CP-1252 character map.  Characters that are documented as
 253    illegal characters in a file name are marked as escaped.  Escaped
 254    characters are marked using the ::Esc macro defined above.  The
 255    following exceptions apply:
 256
 257    - Path delimeters '\\' and '/' are NOT escaped because the
 258      sanitization map applies to paths.  While those characters are
 259      illegal in filenames, they are legal in paths.
 260
 261    - Wildcard characters '*' and '?' ARE escaped.  The document
 262      referred below does not specify these characters as invalid.
 263      Since no other escape mechanism exists, names containing
 264      wildcards are indistinguishable from actual wildcards used in SMB
 265      requests.
 266
 267    - Reserved names are not and cannot be represented in this map.
 268      Reserved names are :
 269
 270      CON, PRN, AUX, NUL, COM1, COM2, COM3, COM4, COM5, COM6, COM7,
 271      COM8, COM9, LPT1, LPT2, LPT3, LPT4, LPT5, LPT6, LPT7, LPT8, LPT9,
 272      CLOCK$
 273
 274    - Characters 0x80, 0x81, 0x8d, 0x8e, 0x8f, 0x90, 0x9d, 0x9e, 0x9f
 275      are also escaped because they are unused in CP-1252 and hence
 276      cannot be convered to a Unicode string.
 277
 278      Reserved names with extensions are also invalid. (i.e. NUL.txt)
 279
 280    \note The only bit we are actually interested in from the following
 281      table is the ESCVAL bit.  However, the characters themselves are
 282      included for ease of maintenance.
 283
 284    \see "Naming a File" topic in the Windows SDK.
 285  */
 286 static const short sanitized_escapes_1252[] = {
 287     Esc(0x00),Esc(0x01),Esc(0x02),Esc(0x03),Esc(0x04),Esc(0x05),Esc(0x06),Esc(0x07),
 288     Esc(0x08),Esc(0x09),Esc(0x0a),Esc(0x0b),Esc(0x0c),Esc(0x0d),Esc(0x0e),Esc(0x0f),
 289     Esc(0x10),Esc(0x11),Esc(0x12),Esc(0x13),Esc(0x14),Esc(0x15),Esc(0x16),Esc(0x17),
 290     Esc(0x18),Esc(0x19),Esc(0x1a),Esc(0x1b),Esc(0x1c),Esc(0x1d),Esc(0x1e),Esc(0x1f),
 291     ' ','!',Esc('"'),'#','$','%','&','\'','(',')',Esc('*'),'+',',','-','.','/',
 292     '0','1','2','3','4','5','6','7','8','9',Esc(':'),';',Esc('<'),'=',Esc('>'),Esc('?'),
 293     '@','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
 294     'P','Q','R','S','T','U','V','W','X','Y','Z','[','\\',']','^','_',
 295     '`','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o',
 296     'p','q','r','s','t','u','v','w','x','y','z','{',Esc('|'),'}','~',Esc(0x7f),
 297     Esc(0x80),Esc(0x81),0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,Esc(0x8d),Esc(0x8e),Esc(0x8f),
 298     Esc(0x90),0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,Esc(0x9d),Esc(0x9e),0x9f,
 299     0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
 300     0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
 301     0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
 302     0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf,
 303     0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
 304     0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff
 305 };
 306
 307 static int sanitize_bytestring(const char * src, int cch_src,
 308                                 char * odest, int cch_dest)
 309 {
 310     char * dest = odest;
 311     while (cch_src > 0 && *src && cch_dest > 0) {
 312
 313         unsigned short rc;
 314
 315         rc = sanitized_escapes_1252[*src];
 316         if (IS_ESCAPED(rc)) {
 317             static const char hex[] =
 318                 {'0','1','2','3','4','5','6','7',
 319                  '8','9','a','b','c','d','e','f'};
 320
 321             if (cch_dest < 3) {
 322                 *dest++ = '\0';
 323                 return 0;
 324             }
 325
 326             *dest++ = '%';
 327             *dest++ = hex[(((int)*src) >> 4) & 0x0f];
 328             *dest++ = hex[(((int)*src) & 0x0f)];
 329             cch_dest -= 3;
 330
 331         } else {
 332             *dest++ = *src;
 333             cch_dest--;
 334         }
 335
 336         cch_src--;
 337         src++;
 338     }
 339
 340     if (cch_src > 0 && cch_dest > 0) {
 341         *dest++ = '\0';
 342     }
 343
 344     return (int)(dest - odest);
 345 }
 346
 347 #undef Esc
 348 #undef IS_ESCAPED
 349 #undef ESCVAL
 350
 351 /* \brief Normalize a UTF-8 string.
 352
 353    \param[in] src String to normalize.
 354
 355    \param[in] cch_src : Count of characters in src.  If this value is
 356        -1, then src is assumed to be NULL terminated.  The translated
 357        string will be NULL terminated only if this is -1 or the count
 358        includes the terminating NULL.
 359
 360    \param[out] adest : Destination string.  Only considered valid if
 361        \a cch_adest is non-zero.
 362
 363    \param[in] cch_adest : Number of characters in the destination
 364        string.  If this is zero, then the return value is the number
 365        of bytes required.
 366
 367    \return If \a cch_adest is non-zero, then the return value is the
 368        number of bytes stored into adest.  If \a cch_adest is zero,
 369        then the return value is the number of bytes required.  In both
 370        cases, the return value is 0 if the call was unsuccessful.
 371  */
 372 long cm_NormalizeUtf8String(const char * src, int cch_src,
 373                             char * adest, int cch_adest)
 374 {
 375     wchar_t wsrcbuf[NLSMAXCCH];
 376     wchar_t *wnorm;
 377     int cch;
 378     int cch_norm;
 379
 380     /* Get some edge cases out first, so we don't have to worry about
 381        cch_src being 0 etc. */
 382     if (cch_src == 0) {
 383         return 0;
 384     } else if (*src == '\0') {
 385         if (cch_adest >= 1)
 386             *adest = '\0';
 387         return 1;
 388     }
 389
 390     if (cch_src == -1) {
 391         cch_src = strlen(src) + 1;
 392     }
 393
 394     cch = MultiByteToWideChar(CP_UTF8, 0, src,
 395                               cch_src * sizeof(char), wsrcbuf, NLSMAXCCH);
 396
 397     if (cch == 0) {
 398         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
 399             char sanitized[NLSMAXCCH];
 400             int cch_sanitized;
 401
 402             /* If src doesn't have a unicode translation, then it
 403                wasn't valid UTF-8.  In this case, we assume that src
 404                is CP-1252 and then try to convert again.  But before
 405                that, we use a translation table to "sanitize" the
 406                input. */
 407
 408             cch_sanitized = sanitize_bytestring(src, cch_src, sanitized,
 409                                                 sizeof(sanitized)/sizeof(char));
 410
 411             if (cch_sanitized == 0) {
 412 #ifdef DEBUG_UNICODE
 413                 DebugBreak();
 414 #endif
 415                 return 0;
 416             }
 417
 418             cch = MultiByteToWideChar(1252, 0, sanitized,
 419                                       cch_sanitized * sizeof(char), wsrcbuf, NLSMAXCCH);
 420             if (cch == 0) {
 421                 /* Well, that didn't work either.  Something is very wrong. */
 422 #ifdef DEBUG_UNICODE
 423                 DebugBreak();
 424 #endif
 425                 return 0;
 426             }
 427         } else {
 428             return 0;
 429         }
 430     }
 431
 432     cch_norm = 0;
 433     wnorm = NormalizeUtf16String(wsrcbuf, cch, NULL, &cch_norm);
 434     if (wnorm == NULL) {
 435 #ifdef DEBUG_UNICODE
 436         DebugBreak();
 437 #endif
 438         return 0;
 439     }
 440
 441     cch = WideCharToMultiByte(CP_UTF8, 0, wnorm,
 442                               cch_norm, adest, cch_adest * sizeof(char),
 443                               NULL, FALSE);
 444
 445     if (wnorm)
 446         free(wnorm);
 447
 448     return cch;
 449 }
 450
 451 /*! \brief Case insensitive comparison with specific length
 452
 453   \param[in] str1 First string to compare.  Assumed to be encoded in UTF-8.
 454
 455   \param[in] str2 Second string to compare.  Assumed to be encoded in UTF-8.
 456
 457   \param[in] n Max byte count.
 458
 459  */
 460 int cm_strnicmp_utf8(const char * str1, const char * str2, int n)
 461 {
 462     wchar_t wstr1[NLSMAXCCH];
 463     int len1;
 464     int len2;
 465     wchar_t wstr2[NLSMAXCCH];
 466     int rv;
 467
 468     /* first check for NULL pointers */
 469     if (str1 == NULL) {
 470         if (str2 == NULL)
 471             return 0;
 472         else
 473             return -1;
 474     } else if (str2 == NULL) {
 475         return 1;
 476     }
 477
 478     len1 = MultiByteToWideChar(CP_UTF8, 0, str1, n, wstr1, NLSMAXCCH);
 479     if (len1 == 0) {
 480 #ifdef DEBUG
 481         DebugBreak();
 482 #endif
 483         wstr1[0] = L'\0';
 484     }
 485
 486     len2 = MultiByteToWideChar(CP_UTF8, 0, str2, n, wstr2, NLSMAXCCH);
 487     if (len2 == 0) {
 488 #ifdef DEBUG
 489         DebugBreak();
 490 #endif
 491         wstr2[0] = L'\0';
 492     }
 493
 494     rv = CompareStringW(LOCALE_INVARIANT, NORM_IGNORECASE, wstr1, len1, wstr2, len2);
 495     if (rv > 0)
 496         return (rv - 2);
 497     else {
 498 #ifdef DEBUG
 499         DebugBreak();
 500 #endif
 501         return 0;
 502     }
 503 }
 504
 505 int cm_stricmp_utf8(const char * str1, const char * str2)
 506 {
 507     wchar_t wstr1[NLSMAXCCH];
 508     int len1;
 509     int len2;
 510     wchar_t wstr2[NLSMAXCCH];
 511     int rv;
 512
 513     /* first check for NULL pointers */
 514     if (str1 == NULL) {
 515         if (str2 == NULL)
 516             return 0;
 517         else
 518             return -1;
 519     } else if (str2 == NULL) {
 520         return 1;
 521     }
 522
 523     len1 = MultiByteToWideChar(CP_UTF8, 0, str1, -1, wstr1, NLSMAXCCH);
 524     if (len1 == 0) {
 525 #ifdef DEBUG
 526         DebugBreak();
 527 #endif
 528         wstr1[0] = L'\0';
 529     }
 530
 531     len2 = MultiByteToWideChar(CP_UTF8, 0, str2, -1, wstr2, NLSMAXCCH);
 532     if (len2 == 0) {
 533 #ifdef DEBUG
 534         DebugBreak();
 535 #endif
 536         wstr2[0] = L'\0';
 537     }
 538
 539     rv = CompareStringW(LOCALE_INVARIANT, NORM_IGNORECASE, wstr1, len1, wstr2, len2);
 540     if (rv > 0)
 541         return (rv - 2);
 542     else {
 543 #ifdef DEBUG
 544         DebugBreak();
 545 #endif
 546         return 0;
 547     }
 548 }
 549
 550 wchar_t * strupr_utf16(wchar_t * wstr, size_t cbstr)
 551 {
 552     wchar_t wstrd[NLSMAXCCH];
 553     int len;
 554
 555     len = cbstr / sizeof(wchar_t);
 556     len = LCMapStringW(LOCALE_INVARIANT, LCMAP_UPPERCASE, wstr, len, wstrd, NLSMAXCCH);
 557     StringCbCopyW(wstr, cbstr, wstrd);
 558
 559     return wstr;
 560 }
 561
 562 char * strupr_utf8(char * str, size_t cbstr)
 563 {
 564     wchar_t wstr[NLSMAXCCH];
 565     wchar_t wstrd[NLSMAXCCH];
 566     int len;
 567     int r;
 568
 569     len = MultiByteToWideChar(CP_UTF8, 0, str, -1, wstr, NLSMAXCCH);
 570     if (len == 0)
 571         return str;
 572
 573     len = LCMapStringW(LOCALE_INVARIANT, LCMAP_UPPERCASE, wstr, len, wstrd, NLSMAXCCH);
 574
 575     len = WideCharToMultiByte(CP_UTF8, 0, wstrd, -1, str, cbstr, NULL, FALSE);
 576
 577     return str;
 578 }
 579
 580 char * char_next_utf8(const char * c)
 581 {
 582 #define CH (*((const unsigned char *)c))
 583
 584     if ((CH & 0x80) == 0)
 585         return (char *) c+1;
 586     else {
 587         switch (CH & 0xf0) {
 588         case 0xc0:
 589         case 0xd0:
 590             return (char *) c+2;
 591
 592         case 0xe0:
 593             return (char *) c+3;
 594
 595         case 0xf0:
 596             return (char *) c+4;
 597
 598         default:
 599             return (char *) c+1;
 600         }
 601     }
 602 #undef CH
 603 }
 604
 605
 606 char * char_prev_utf8(const char * c)
 607 {
 608 #define CH (*((const unsigned char *)c))
 609
 610     c--;
 611
 612     if ((CH & 0x80) == 0)
 613         return (char *) c;
 614     else
 615         while ((CH & 0xc0) == 0x80)
 616             (char *) c--;
 617     return (char *) c;
 618
 619 #undef CH
 620 }