lib/ext2fs/utf8n.h

   1 /*
   2  * Copyright (c) 2014 SGI.
   3  * All rights reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  */
  15
  16 /* This code is copied from the linux kernel.  We have a userspace
  17  * version here so that hashes will match that implementation.
  18  */
  19
  20 #ifndef UTF8NORM_H
  21 #define UTF8NORM_H
  22
  23 #include <stdint.h>
  24 #include <unistd.h>
  25 #include <string.h>
  26
  27 /* Encode a unicode version as one unsigned int: MAJ << 16 | MIN << 8 | REV. */
  28 #define UNICODE_MAJ_SHIFT               (16)
  29 #define UNICODE_MIN_SHIFT               (8)
  30
  31 #define UNICODE_AGE(MAJ, MIN, REV)                      \
  32         (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |   \
  33          ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |   \
  34          ((unsigned int)(REV)))
  35
  36 /* Version support check, and highest unicode version in the data tables. */
  37 extern int utf8version_is_supported(uint8_t maj, uint8_t min, uint8_t rev);
  38 extern int utf8version_latest(void);
  39
  40 /*
  41  * Find the const struct utf8data for unicode version maxage (presumably
  42  * a UNICODE_AGE() value -- confirm).  Returns NULL if it is too new.
  43  *
  44  * Two normalization forms are supported: nfdi and nfdicf.
  45  *
  46  * nfdi:
  47  *  - Apply unicode normalization form NFD.
  48  *  - Remove any Default_Ignorable_Code_Point.
  49  *
  50  * nfdicf:
  51  *  - Apply unicode normalization form NFD.
  52  *  - Remove any Default_Ignorable_Code_Point.
  53  *  - Apply a full casefold (C + F).
  54  */
  55 extern const struct utf8data *utf8nfdi(unsigned int maxage);
  56 extern const struct utf8data *utf8nfdicf(unsigned int maxage);
  57
  58 /*
  59  * Determine the maximum age of any unicode character in the string.
  60  * Returns 0 if only unassigned code points are present, or -1 if the
  61  * input is not valid UTF-8.  (len presumably caps the bytes read -- confirm.)
  62  */
  63 extern int utf8agemax(const struct utf8data *data, const char *s);
  64 extern int utf8nagemax(const struct utf8data *data, const char *s, size_t len);
  65
  66 /*
  67  * Determine the minimum age of any unicode character in the string.
  68  * Returns 0 if any unassigned code points are present, or -1 if the
  69  * input is not valid UTF-8.  (len presumably caps the bytes read -- confirm.)
  70  */
  71 extern int utf8agemin(const struct utf8data *data, const char *s);
  72 extern int utf8nagemin(const struct utf8data *data, const char *s, size_t len);
  73
  74 /*
  75  * Determine the length of the normalized form of the string,
  76  * excluding any terminating NUL byte.
  77  * Returns 0 if only ignorable code points are present.
  78  * Returns -1 if the input is not valid UTF-8.
  79  */
  80 extern ssize_t utf8len(const struct utf8data *data, const char *s);
  81 extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len);
  82
  83 /* Size in bytes of the hangul[] buffer in struct utf8cursor below. */
  84 #define UTF8HANGULLEAF  (12)
  85
  86 /*
  87  * Cursor structure used by the normalizer.
  88  */
  89 struct utf8cursor {
  90         const struct utf8data   *data;  /* presumably the data passed to utf8cursor() -- confirm */
  91         const char      *s;             /* presumably current position in the input string -- confirm */
  92         const char      *p;             /* presumably position in a pending decomposition -- confirm */
  93         const char      *ss;            /* presumably a saved copy of s -- confirm */
  94         const char      *sp;            /* presumably a saved copy of p -- confirm */
  95         unsigned int    len;            /* NOTE(review): narrower than utf8ncursor()'s size_t len -- check truncation */
  96         unsigned int    slen;           /* presumably a saved copy of len -- confirm */
  97         short int       ccc;            /* presumably current canonical combining class -- confirm */
  98         short int       nccc;           /* presumably next canonical combining class -- confirm */
  99         unsigned char   hangul[UTF8HANGULLEAF]; /* UTF8HANGULLEAF-byte buffer; presumably Hangul scratch -- confirm */
 100 };
 101
 102 /*
 103  * Initialize the utf8cursor u8c to normalize the string s using data.
 104  * Returns 0 on success, -1 on failure.
 105  * NOTE(review): len presumably caps the bytes read from s -- confirm.
 106  */
 107 extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
 108                       const char *s);
 109 extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
 110                        const char *s, size_t len);
 111
 112 /*
 113  * Get the next byte of the normalized output from cursor u8c.
 114  * Returns a value > 0 && < 256 on success.
 115  * Returns 0 when the end of the normalization is reached.
 116  * Returns -1 if the string being normalized is not valid UTF-8.
 117  */
 118 extern int utf8byte(struct utf8cursor *u8c);
 119
 120 #endif /* UTF8NORM_H */