lib/ext2fs/nls_utf8.c

   1 /*
   2  * Copyright (c) 2014 SGI.
   3  * Copyright (c) 2018 Collabora Ltd.
   4  * All rights reserved.
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License as
   8  * published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it would be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  */
  16
  17 /*
  18  * This code is adapted from the Linux Kernel.  We have a
  19  * userspace version here such that the hashes will match that
  20  * implementation.
  21  */
  22
  23 #include "config.h"
  24 #include <stdint.h>
  25 #include <unistd.h>
  26 #include <string.h>
  27 #include <limits.h>
  28 #include <errno.h>
  29
  30 #include "ext2_fs.h"
  31 #include "ext2fs.h"
  32
  33 /* Encoding a unicode version number as a single unsigned int. */
  34 #define UNICODE_MAJ_SHIFT               (16)
  35 #define UNICODE_MIN_SHIFT               (8)
  36
  37 #define UNICODE_AGE(MAJ, MIN, REV)                      \
  38         (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |   \
  39          ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |   \
  40          ((unsigned int)(REV)))
  41
  42 /* Needed in struct utf8cursor below. */
  43 #define UTF8HANGULLEAF  (12)
  44
  45 /*
  46  * Cursor structure used by the normalizer.
  47  */
  48 struct utf8cursor {
  49         const struct utf8data   *data;
  50         const char      *s;
  51         const char      *p;
  52         const char      *ss;
  53         const char      *sp;
  54         unsigned int    len;
  55         unsigned int    slen;
  56         short int       ccc;
  57         short int       nccc;
  58         unsigned char   hangul[UTF8HANGULLEAF];
  59 };
  60
  61 /*
  62  * Initialize a utf8cursor to normalize a string.
  63  * Returns 0 on success.
  64  * Returns -1 on failure.
  65  */
  66 // extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
  67 //                    const char *s);
  68 // extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
  69 //                     const char *s, size_t len);
  70
  71 /*
  72  * Get the next byte in the normalization.
  73  * Returns a value > 0 && < 256 on success.
  74  * Returns 0 when the end of the normalization is reached.
  75  * Returns -1 if the string being normalized is not valid UTF-8.
  76  */
  77 // extern int utf8byte(struct utf8cursor *u8c);
  78
  79
  80 struct utf8data {
  81         unsigned int maxage;
  82         unsigned int offset;
  83 };
  84
  85 #define __INCLUDED_FROM_UTF8NORM_C__
  86 #include "utf8data.h"
  87 #undef __INCLUDED_FROM_UTF8NORM_C__
  88
  89 #define ARRAY_SIZE(array)                       \
  90         (sizeof(array) / sizeof(array[0]))
  91
  92 #if 0
  93 /* Highest unicode version supported by the data tables. */
  94 static int utf8version_is_supported(uint8_t maj, uint8_t min, uint8_t rev)
  95 {
  96         int i = ARRAY_SIZE(utf8agetab) - 1;
  97         unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev);
  98
  99         while (i >= 0 && utf8agetab[i] != 0) {
 100                 if (sb_utf8version == utf8agetab[i])
 101                         return 1;
 102                 i--;
 103         }
 104         return 0;
 105 }
 106 #endif
 107
 108 #if 0
 109 static int utf8version_latest(void)
 110 {
 111         return utf8vers;
 112 }
 113 #endif
 114
 115 /*
 116  * UTF-8 valid ranges.
 117  *
 118  * The UTF-8 encoding spreads the bits of a 32bit word over several
 119  * bytes. This table gives the ranges that can be held and how they'd
 120  * be represented.
 121  *
 122  * 0x00000000 0x0000007F: 0xxxxxxx
 123  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
 124  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 125  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 126  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 127  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 128  *
 129  * There is an additional requirement on UTF-8, in that only the
 130  * shortest representation of a 32bit value is to be used.  A decoder
 131  * must not decode sequences that do not satisfy this requirement.
 132  * Thus the allowed ranges have a lower bound.
 133  *
 134  * 0x00000000 0x0000007F: 0xxxxxxx
 135  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
 136  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 137  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 138  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 139  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 140  *
 141  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
 142  * 17 planes of 65536 values.  This limits the sequences actually seen
 143  * even more, to just the following.
 144  *
 145  *          0 -     0x7F: 0                   - 0x7F
 146  *       0x80 -    0x7FF: 0xC2 0x80           - 0xDF 0xBF
 147  *      0x800 -   0xFFFF: 0xE0 0xA0 0x80      - 0xEF 0xBF 0xBF
 148  *    0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
 149  *
 150  * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
 151  *
 152  * Note that the longest sequence seen with valid usage is 4 bytes,
 153  * the same a single UTF-32 character.  This makes the UTF-8
 154  * representation of Unicode strictly smaller than UTF-32.
 155  *
 156  * The shortest sequence requirement was introduced by:
 157  *    Corrigendum #1: UTF-8 Shortest Form
 158  * It can be found here:
 159  *    http://www.unicode.org/versions/corrigendum1.html
 160  *
 161  */
 162
 163 /*
 164  * Return the number of bytes used by the current UTF-8 sequence.
 165  * Assumes the input points to the first byte of a valid UTF-8
 166  * sequence.
 167  */
 168 static inline int utf8clen(const char *s)
 169 {
 170         unsigned char c = *s;
 171
 172         return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
 173 }
 174
 175 /*
 176  * Decode a 3-byte UTF-8 sequence.
 177  */
 178 static unsigned int
 179 utf8decode3(const char *str)
 180 {
 181         unsigned int            uc;
 182
 183         uc = *str++ & 0x0F;
 184         uc <<= 6;
 185         uc |= *str++ & 0x3F;
 186         uc <<= 6;
 187         uc |= *str++ & 0x3F;
 188
 189         return uc;
 190 }
 191
 192 /*
 193  * Encode a 3-byte UTF-8 sequence.
 194  */
 195 static int
 196 utf8encode3(char *str, unsigned int val)
 197 {
 198         str[2] = (val & 0x3F) | 0x80;
 199         val >>= 6;
 200         str[1] = (val & 0x3F) | 0x80;
 201         val >>= 6;
 202         str[0] = val | 0xE0;
 203
 204         return 3;
 205 }
 206
 207 /*
 208  * utf8trie_t
 209  *
 210  * A compact binary tree, used to decode UTF-8 characters.
 211  *
 212  * Internal nodes are one byte for the node itself, and up to three
 213  * bytes for an offset into the tree.  The first byte contains the
 214  * following information:
 215  *  NEXTBYTE  - flag        - advance to next byte if set
 216  *  BITNUM    - 3 bit field - the bit number to tested
 217  *  OFFLEN    - 2 bit field - number of bytes in the offset
 218  * if offlen == 0 (non-branching node)
 219  *  RIGHTPATH - 1 bit field - set if the following node is for the
 220  *                            right-hand path (tested bit is set)
 221  *  TRIENODE  - 1 bit field - set if the following node is an internal
 222  *                            node, otherwise it is a leaf node
 223  * if offlen != 0 (branching node)
 224  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
 225  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
 226  *
 227  * Due to the way utf8 works, there cannot be branching nodes with
 228  * NEXTBYTE set, and moreover those nodes always have a righthand
 229  * descendant.
 230  */
 231 typedef const unsigned char utf8trie_t;
 232 #define BITNUM          0x07
 233 #define NEXTBYTE        0x08
 234 #define OFFLEN          0x30
 235 #define OFFLEN_SHIFT    4
 236 #define RIGHTPATH       0x40
 237 #define TRIENODE        0x80
 238 #define RIGHTNODE       0x40
 239 #define LEFTNODE        0x80
 240
 241 /*
 242  * utf8leaf_t
 243  *
 244  * The leaves of the trie are embedded in the trie, and so the same
 245  * underlying datatype: unsigned char.
 246  *
 247  * leaf[0]: The unicode version, stored as a generation number that is
 248  *          an index into utf8agetab[].  With this we can filter code
 249  *          points based on the unicode version in which they were
 250  *          defined.  The CCC of a non-defined code point is 0.
 251  * leaf[1]: Canonical Combining Class. During normalization, we need
 252  *          to do a stable sort into ascending order of all characters
 253  *          with a non-zero CCC that occur between two characters with
 254  *          a CCC of 0, or at the begin or end of a string.
 255  *          The unicode standard guarantees that all CCC values are
 256  *          between 0 and 254 inclusive, which leaves 255 available as
 257  *          a special value.
 258  *          Code points with CCC 0 are known as stoppers.
 259  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
 260  *          start of a NUL-terminated string that is the decomposition
 261  *          of the character.
 262  *          The CCC of a decomposable character is the same as the CCC
 263  *          of the first character of its decomposition.
 264  *          Some characters decompose as the empty string: these are
 265  *          characters with the Default_Ignorable_Code_Point property.
 266  *          These do affect normalization, as they all have CCC 0.
 267  *
 268  * The decompositions in the trie have been fully expanded, with the
 269  * exception of Hangul syllables, which are decomposed algorithmically.
 270  *
 271  * Casefolding, if applicable, is also done using decompositions.
 272  *
 273  * The trie is constructed in such a way that leaves exist for all
 274  * UTF-8 sequences that match the criteria from the "UTF-8 valid
 275  * ranges" comment above, and only for those sequences.  Therefore a
 276  * lookup in the trie can be used to validate the UTF-8 input.
 277  */
 278 typedef const unsigned char utf8leaf_t;
 279
 280 #define LEAF_GEN(LEAF)  ((LEAF)[0])
 281 #define LEAF_CCC(LEAF)  ((LEAF)[1])
 282 #define LEAF_STR(LEAF)  ((const char *)((LEAF) + 2))
 283
 284 #define MINCCC          (0)
 285 #define MAXCCC          (254)
 286 #define STOPPER         (0)
 287 #define DECOMPOSE       (255)
 288
 289 /* Marker for hangul syllable decomposition. */
 290 #define HANGUL          ((char)(255))
 291 /* Size of the synthesized leaf used for Hangul syllable decomposition. */
 292 #define UTF8HANGULLEAF  (12)
 293
 294 /*
 295  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
 296  *
 297  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
 298  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
 299  *
 300  * SBase = 0xAC00
 301  * LBase = 0x1100
 302  * VBase = 0x1161
 303  * TBase = 0x11A7
 304  * LCount = 19
 305  * VCount = 21
 306  * TCount = 28
 307  * NCount = 588 (VCount * TCount)
 308  * SCount = 11172 (LCount * NCount)
 309  *
 310  * Decomposition:
 311  *   SIndex = s - SBase
 312  *
 313  * LV (Canonical/Full)
 314  *   LIndex = SIndex / NCount
 315  *   VIndex = (Sindex % NCount) / TCount
 316  *   LPart = LBase + LIndex
 317  *   VPart = VBase + VIndex
 318  *
 319  * LVT (Canonical)
 320  *   LVIndex = (SIndex / TCount) * TCount
 321  *   TIndex = (Sindex % TCount)
 322  *   LVPart = SBase + LVIndex
 323  *   TPart = TBase + TIndex
 324  *
 325  * LVT (Full)
 326  *   LIndex = SIndex / NCount
 327  *   VIndex = (Sindex % NCount) / TCount
 328  *   TIndex = (Sindex % TCount)
 329  *   LPart = LBase + LIndex
 330  *   VPart = VBase + VIndex
 331  *   if (TIndex == 0) {
 332  *          d = <LPart, VPart>
 333  *   } else {
 334  *          TPart = TBase + TIndex
 335  *          d = <LPart, TPart, VPart>
 336  *   }
 337  */
 338
 339 /* Constants */
 340 #define SB      (0xAC00)
 341 #define LB      (0x1100)
 342 #define VB      (0x1161)
 343 #define TB      (0x11A7)
 344 #define LC      (19)
 345 #define VC      (21)
 346 #define TC      (28)
 347 #define NC      (VC * TC)
 348 #define SC      (LC * NC)
 349
 350 /* Algorithmic decomposition of hangul syllable. */
 351 static utf8leaf_t *
 352 utf8hangul(const char *str, unsigned char *hangul)
 353 {
 354         unsigned int    si;
 355         unsigned int    li;
 356         unsigned int    vi;
 357         unsigned int    ti;
 358         unsigned char   *h;
 359
 360         /* Calculate the SI, LI, VI, and TI values. */
 361         si = utf8decode3(str) - SB;
 362         li = si / NC;
 363         vi = (si % NC) / TC;
 364         ti = si % TC;
 365
 366         /* Fill in base of leaf. */
 367         h = hangul;
 368         LEAF_GEN(h) = 2;
 369         LEAF_CCC(h) = DECOMPOSE;
 370         h += 2;
 371
 372         /* Add LPart, a 3-byte UTF-8 sequence. */
 373         h += utf8encode3((char *)h, li + LB);
 374
 375         /* Add VPart, a 3-byte UTF-8 sequence. */
 376         h += utf8encode3((char *)h, vi + VB);
 377
 378         /* Add TPart if required, also a 3-byte UTF-8 sequence. */
 379         if (ti)
 380                 h += utf8encode3((char *)h, ti + TB);
 381
 382         /* Terminate string. */
 383         h[0] = '\0';
 384
 385         return hangul;
 386 }
 387
 388 /*
 389  * Use trie to scan s, touching at most len bytes.
 390  * Returns the leaf if one exists, NULL otherwise.
 391  *
 392  * A non-NULL return guarantees that the UTF-8 sequence starting at s
 393  * is well-formed and corresponds to a known unicode code point.  The
 394  * shorthand for this will be "is valid UTF-8 unicode".
 395  */
 396 static utf8leaf_t *utf8nlookup(const struct utf8data *data,
 397                                unsigned char *hangul, const char *s, size_t len)
 398 {
 399         utf8trie_t      *trie;
 400         int             offlen;
 401         int             offset;
 402         int             mask;
 403         int             node;
 404
 405         if (!data)
 406                 return NULL;
 407         if (len == 0)
 408                 return NULL;
 409
 410         trie = utf8data + data->offset;
 411         node = 1;
 412         while (node) {
 413                 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
 414                 if (*trie & NEXTBYTE) {
 415                         if (--len == 0)
 416                                 return NULL;
 417                         s++;
 418                 }
 419                 mask = 1 << (*trie & BITNUM);
 420                 if (*s & mask) {
 421                         /* Right leg */
 422                         if (offlen) {
 423                                 /* Right node at offset of trie */
 424                                 node = (*trie & RIGHTNODE);
 425                                 offset = trie[offlen];
 426                                 while (--offlen) {
 427                                         offset <<= 8;
 428                                         offset |= trie[offlen];
 429                                 }
 430                                 trie += offset;
 431                         } else if (*trie & RIGHTPATH) {
 432                                 /* Right node after this node */
 433                                 node = (*trie & TRIENODE);
 434                                 trie++;
 435                         } else {
 436                                 /* No right node. */
 437                                 return NULL;
 438                         }
 439                 } else {
 440                         /* Left leg */
 441                         if (offlen) {
 442                                 /* Left node after this node. */
 443                                 node = (*trie & LEFTNODE);
 444                                 trie += offlen + 1;
 445                         } else if (*trie & RIGHTPATH) {
 446                                 /* No left node. */
 447                                 return NULL;
 448                         } else {
 449                                 /* Left node after this node */
 450                                 node = (*trie & TRIENODE);
 451                                 trie++;
 452                         }
 453                 }
 454         }
 455         /*
 456          * Hangul decomposition is done algorithmically. These are the
 457          * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
 458          * always 3 bytes long, so s has been advanced twice, and the
 459          * start of the sequence is at s-2.
 460          */
 461         if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
 462                 trie = utf8hangul(s - 2, hangul);
 463         return trie;
 464 }
 465
 466 /*
 467  * Use trie to scan s.
 468  * Returns the leaf if one exists, NULL otherwise.
 469  *
 470  * Forwards to utf8nlookup().
 471  */
 472 static utf8leaf_t *utf8lookup(const struct utf8data *data,
 473                               unsigned char *hangul, const char *s)
 474 {
 475         return utf8nlookup(data, hangul, s, (size_t)-1);
 476 }
 477
 478 #if 0
 479 /*
 480  * Maximum age of any character in s.
 481  * Return -1 if s is not valid UTF-8 unicode.
 482  * Return 0 if only non-assigned code points are used.
 483  */
 484 static int utf8agemax(const struct utf8data *data, const char *s)
 485 {
 486         utf8leaf_t      *leaf;
 487         int             age = 0;
 488         int             leaf_age;
 489         unsigned char   hangul[UTF8HANGULLEAF];
 490
 491         if (!data)
 492                 return -1;
 493
 494         while (*s) {
 495                 leaf = utf8lookup(data, hangul, s);
 496                 if (!leaf)
 497                         return -1;
 498
 499                 leaf_age = utf8agetab[LEAF_GEN(leaf)];
 500                 if (leaf_age <= data->maxage && leaf_age > age)
 501                         age = leaf_age;
 502                 s += utf8clen(s);
 503         }
 504         return age;
 505 }
 506 #endif
 507
 508 #if 0
 509 /*
 510  * Minimum age of any character in s.
 511  * Return -1 if s is not valid UTF-8 unicode.
 512  * Return 0 if non-assigned code points are used.
 513  */
 514 static int utf8agemin(const struct utf8data *data, const char *s)
 515 {
 516         utf8leaf_t      *leaf;
 517         int             age;
 518         int             leaf_age;
 519         unsigned char   hangul[UTF8HANGULLEAF];
 520
 521         if (!data)
 522                 return -1;
 523         age = data->maxage;
 524         while (*s) {
 525                 leaf = utf8lookup(data, hangul, s);
 526                 if (!leaf)
 527                         return -1;
 528                 leaf_age = utf8agetab[LEAF_GEN(leaf)];
 529                 if (leaf_age <= data->maxage && leaf_age < age)
 530                         age = leaf_age;
 531                 s += utf8clen(s);
 532         }
 533         return age;
 534 }
 535 #endif
 536
 537 #if 0
 538 /*
 539  * Maximum age of any character in s, touch at most len bytes.
 540  * Return -1 if s is not valid UTF-8 unicode.
 541  */
 542 static int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
 543 {
 544         utf8leaf_t      *leaf;
 545         int             age = 0;
 546         int             leaf_age;
 547         unsigned char   hangul[UTF8HANGULLEAF];
 548
 549         if (!data)
 550                 return -1;
 551
 552         while (len && *s) {
 553                 leaf = utf8nlookup(data, hangul, s, len);
 554                 if (!leaf)
 555                         return -1;
 556                 leaf_age = utf8agetab[LEAF_GEN(leaf)];
 557                 if (leaf_age <= data->maxage && leaf_age > age)
 558                         age = leaf_age;
 559                 len -= utf8clen(s);
 560                 s += utf8clen(s);
 561         }
 562         return age;
 563 }
 564 #endif
 565
 566 #if 0
 567 /*
 568  * Maximum age of any character in s, touch at most len bytes.
 569  * Return -1 if s is not valid UTF-8 unicode.
 570  */
 571 static int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
 572 {
 573         utf8leaf_t      *leaf;
 574         int             leaf_age;
 575         int             age;
 576         unsigned char   hangul[UTF8HANGULLEAF];
 577
 578         if (!data)
 579                 return -1;
 580         age = data->maxage;
 581         while (len && *s) {
 582                 leaf = utf8nlookup(data, hangul, s, len);
 583                 if (!leaf)
 584                         return -1;
 585                 leaf_age = utf8agetab[LEAF_GEN(leaf)];
 586                 if (leaf_age <= data->maxage && leaf_age < age)
 587                         age = leaf_age;
 588                 len -= utf8clen(s);
 589                 s += utf8clen(s);
 590         }
 591         return age;
 592 }
 593 #endif
 594
 595 #if 0
 596 /*
 597  * Length of the normalization of s.
 598  * Return -1 if s is not valid UTF-8 unicode.
 599  *
 600  * A string of Default_Ignorable_Code_Point has length 0.
 601  */
 602 static ssize_t utf8len(const struct utf8data *data, const char *s)
 603 {
 604         utf8leaf_t      *leaf;
 605         size_t          ret = 0;
 606         unsigned char   hangul[UTF8HANGULLEAF];
 607
 608         if (!data)
 609                 return -1;
 610         while (*s) {
 611                 leaf = utf8lookup(data, hangul, s);
 612                 if (!leaf)
 613                         return -1;
 614                 if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
 615                         ret += utf8clen(s);
 616                 else if (LEAF_CCC(leaf) == DECOMPOSE)
 617                         ret += strlen(LEAF_STR(leaf));
 618                 else
 619                         ret += utf8clen(s);
 620                 s += utf8clen(s);
 621         }
 622         return ret;
 623 }
 624 #endif
 625
 626 #if 0
 627 /*
 628  * Length of the normalization of s, touch at most len bytes.
 629  * Return -1 if s is not valid UTF-8 unicode.
 630  */
 631 static ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
 632 {
 633         utf8leaf_t      *leaf;
 634         size_t          ret = 0;
 635         unsigned char   hangul[UTF8HANGULLEAF];
 636
 637         if (!data)
 638                 return -1;
 639         while (len && *s) {
 640                 leaf = utf8nlookup(data, hangul, s, len);
 641                 if (!leaf)
 642                         return -1;
 643                 if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
 644                         ret += utf8clen(s);
 645                 else if (LEAF_CCC(leaf) == DECOMPOSE)
 646                         ret += strlen(LEAF_STR(leaf));
 647                 else
 648                         ret += utf8clen(s);
 649                 len -= utf8clen(s);
 650                 s += utf8clen(s);
 651         }
 652         return ret;
 653 }
 654 #endif
 655
 656 /*
 657  * Set up an utf8cursor for use by utf8byte().
 658  *
 659  *   u8c    : pointer to cursor.
 660  *   data   : const struct utf8data to use for normalization.
 661  *   s      : string.
 662  *   len    : length of s.
 663  *
 664  * Returns -1 on error, 0 on success.
 665  */
 666 static int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
 667                 const char *s, size_t len)
 668 {
 669         if (!data)
 670                 return -1;
 671         if (!s)
 672                 return -1;
 673         u8c->data = data;
 674         u8c->s = s;
 675         u8c->p = NULL;
 676         u8c->ss = NULL;
 677         u8c->sp = NULL;
 678         u8c->len = len;
 679         u8c->slen = 0;
 680         u8c->ccc = STOPPER;
 681         u8c->nccc = STOPPER;
 682         /* Check we didn't clobber the maximum length. */
 683         if (u8c->len != len)
 684                 return -1;
 685         /* The first byte of s may not be an utf8 continuation. */
 686         if (len > 0 && (*s & 0xC0) == 0x80)
 687                 return -1;
 688         return 0;
 689 }
 690
 691 #if 0
 692 /*
 693  * Set up an utf8cursor for use by utf8byte().
 694  *
 695  *   u8c    : pointer to cursor.
 696  *   data   : const struct utf8data to use for normalization.
 697  *   s      : NUL-terminated string.
 698  *
 699  * Returns -1 on error, 0 on success.
 700  */
 701 static int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
 702                const char *s)
 703 {
 704         return utf8ncursor(u8c, data, s, (unsigned int)-1);
 705 }
 706 #endif
 707
 708 /*
 709  * Get one byte from the normalized form of the string described by u8c.
 710  *
 711  * Returns the byte cast to an unsigned char on succes, and -1 on failure.
 712  *
 713  * The cursor keeps track of the location in the string in u8c->s.
 714  * When a character is decomposed, the current location is stored in
 715  * u8c->p, and u8c->s is set to the start of the decomposition. Note
 716  * that bytes from a decomposition do not count against u8c->len.
 717  *
 718  * Characters are emitted if they match the current CCC in u8c->ccc.
 719  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
 720  * and the function returns 0 in that case.
 721  *
 722  * Sorting by CCC is done by repeatedly scanning the string.  The
 723  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
 724  * the start of the scan.  The first pass finds the lowest CCC to be
 725  * emitted and stores it in u8c->nccc, the second pass emits the
 726  * characters with this CCC and finds the next lowest CCC. This limits
 727  * the number of passes to 1 + the number of different CCCs in the
 728  * sequence being scanned.
 729  *
 730  * Therefore:
 731  *  u8c->p  != NULL -> a decomposition is being scanned.
 732  *  u8c->ss != NULL -> this is a repeating scan.
 733  *  u8c->ccc == -1   -> this is the first scan of a repeating scan.
 734  */
 735 static int utf8byte(struct utf8cursor *u8c)
 736 {
 737         utf8leaf_t *leaf;
 738         int ccc;
 739
 740         for (;;) {
 741                 /* Check for the end of a decomposed character. */
 742                 if (u8c->p && *u8c->s == '\0') {
 743                         u8c->s = u8c->p;
 744                         u8c->p = NULL;
 745                 }
 746
 747                 /* Check for end-of-string. */
 748                 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
 749                         /* There is no next byte. */
 750                         if (u8c->ccc == STOPPER)
 751                                 return 0;
 752                         /* End-of-string during a scan counts as a stopper. */
 753                         ccc = STOPPER;
 754                         goto ccc_mismatch;
 755                 } else if ((*u8c->s & 0xC0) == 0x80) {
 756                         /* This is a continuation of the current character. */
 757                         if (!u8c->p)
 758                                 u8c->len--;
 759                         return (unsigned char)*u8c->s++;
 760                 }
 761
 762                 /* Look up the data for the current character. */
 763                 if (u8c->p) {
 764                         leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
 765                 } else {
 766                         leaf = utf8nlookup(u8c->data, u8c->hangul,
 767                                            u8c->s, u8c->len);
 768                 }
 769
 770                 /* No leaf found implies that the input is a binary blob. */
 771                 if (!leaf)
 772                         return -1;
 773
 774                 ccc = LEAF_CCC(leaf);
 775                 /* Characters that are too new have CCC 0. */
 776                 if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
 777                         ccc = STOPPER;
 778                 } else if (ccc == DECOMPOSE) {
 779                         u8c->len -= utf8clen(u8c->s);
 780                         u8c->p = u8c->s + utf8clen(u8c->s);
 781                         u8c->s = LEAF_STR(leaf);
 782                         /* Empty decomposition implies CCC 0. */
 783                         if (*u8c->s == '\0') {
 784                                 if (u8c->ccc == STOPPER)
 785                                         continue;
 786                                 ccc = STOPPER;
 787                                 goto ccc_mismatch;
 788                         }
 789
 790                         leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
 791                         ccc = LEAF_CCC(leaf);
 792                 }
 793
 794                 /*
 795                  * If this is not a stopper, then see if it updates
 796                  * the next canonical class to be emitted.
 797                  */
 798                 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
 799                         u8c->nccc = ccc;
 800
 801                 /*
 802                  * Return the current byte if this is the current
 803                  * combining class.
 804                  */
 805                 if (ccc == u8c->ccc) {
 806                         if (!u8c->p)
 807                                 u8c->len--;
 808                         return (unsigned char)*u8c->s++;
 809                 }
 810
 811                 /* Current combining class mismatch. */
 812 ccc_mismatch:
 813                 if (u8c->nccc == STOPPER) {
 814                         /*
 815                          * Scan forward for the first canonical class
 816                          * to be emitted.  Save the position from
 817                          * which to restart.
 818                          */
 819                         u8c->ccc = MINCCC - 1;
 820                         u8c->nccc = ccc;
 821                         u8c->sp = u8c->p;
 822                         u8c->ss = u8c->s;
 823                         u8c->slen = u8c->len;
 824                         if (!u8c->p)
 825                                 u8c->len -= utf8clen(u8c->s);
 826                         u8c->s += utf8clen(u8c->s);
 827                 } else if (ccc != STOPPER) {
 828                         /* Not a stopper, and not the ccc we're emitting. */
 829                         if (!u8c->p)
 830                                 u8c->len -= utf8clen(u8c->s);
 831                         u8c->s += utf8clen(u8c->s);
 832                 } else if (u8c->nccc != MAXCCC + 1) {
 833                         /* At a stopper, restart for next ccc. */
 834                         u8c->ccc = u8c->nccc;
 835                         u8c->nccc = MAXCCC + 1;
 836                         u8c->s = u8c->ss;
 837                         u8c->p = u8c->sp;
 838                         u8c->len = u8c->slen;
 839                 } else {
 840                         /* All done, proceed from here. */
 841                         u8c->ccc = STOPPER;
 842                         u8c->nccc = STOPPER;
 843                         u8c->sp = NULL;
 844                         u8c->ss = NULL;
 845                         u8c->slen = 0;
 846                 }
 847         }
 848 }
 849
 850 #if 0
 851 /*
 852  * Look for the correct const struct utf8data for a unicode version.
 853  * Returns NULL if the version requested is too new.
 854  *
 855  * Two normalization forms are supported: nfdi and nfdicf.
 856  *
 857  * nfdi:
 858  *  - Apply unicode normalization form NFD.
 859  *  - Remove any Default_Ignorable_Code_Point.
 860  *
 861  * nfdicf:
 862  *  - Apply unicode normalization form NFD.
 863  *  - Remove any Default_Ignorable_Code_Point.
 864  *  - Apply a full casefold (C + F).
 865  */
 866 static const struct utf8data *utf8nfdi(unsigned int maxage)
 867 {
 868         int i = ARRAY_SIZE(utf8nfdidata) - 1;
 869
 870         while (maxage < utf8nfdidata[i].maxage)
 871                 i--;
 872         if (maxage > utf8nfdidata[i].maxage)
 873                 return NULL;
 874         return &utf8nfdidata[i];
 875 }
 876 #endif
 877
 878 static const struct utf8data *utf8nfdicf(unsigned int maxage)
 879 {
 880         int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
 881
 882         while (maxage < utf8nfdicfdata[i].maxage)
 883                 i--;
 884         if (maxage > utf8nfdicfdata[i].maxage)
 885                 return NULL;
 886         return &utf8nfdicfdata[i];
 887 }
 888
 889 static int utf8_casefold(const struct ext2fs_nls_table *table,
 890                           const unsigned char *str, size_t len,
 891                           unsigned char *dest, size_t dlen)
 892 {
 893         const struct utf8data *data = utf8nfdicf(table->version);
 894         struct utf8cursor cur;
 895         size_t nlen = 0;
 896
 897         if (utf8ncursor(&cur, data, (const char *) str, len) < 0)
 898                 goto invalid_seq;
 899
 900         for (nlen = 0; nlen < dlen; nlen++) {
 901                 int c = utf8byte(&cur);
 902
 903                 dest[nlen] = c;
 904                 if (!c)
 905                         return nlen;
 906                 if (c == -1)
 907                         break;
 908         }
 909
 910         return -ENAMETOOLONG;
 911
 912 invalid_seq:
 913         if (dlen < len)
 914                 return -ENAMETOOLONG;
 915
 916         /* Signal invalid sequence */
 917         return -EINVAL;
 918 }
 919
 920 static const struct ext2fs_nls_ops utf8_ops = {
 921         .casefold = utf8_casefold,
 922 };
 923
 924 static const struct ext2fs_nls_table nls_utf8 = {
 925         .ops = &utf8_ops,
 926         .version = UNICODE_AGE(12, 1, 0),
 927 };
 928
 929 const struct ext2fs_nls_table *ext2fs_load_nls_table(int encoding)
 930 {
 931         if (encoding == EXT4_ENC_UTF8_12_1)
 932                 return &nls_utf8;
 933
 934         return NULL;
 935 }