/*
 * Copyright (c) 2014 SGI.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */
/* This code is copied from the Linux kernel. We have a userspace
 * version here so that hashes will match that implementation.
 */
/*
 * A unicode version (major.minor.revision) is packed into one unsigned int:
 * the major number is shifted up by UNICODE_MAJ_SHIFT bits, the minor number
 * by UNICODE_MIN_SHIFT bits, and the revision is OR-ed in unshifted.
 */
#define UNICODE_MAJ_SHIFT	(16)
#define UNICODE_MIN_SHIFT	(8)

/* Build the packed version number for unicode MAJOR.MINOR.REVISION. */
#define UNICODE_AGE(MAJOR, MINOR, REVISION)			\
	((unsigned int)(REVISION) |				\
	 ((unsigned int)(MINOR) << UNICODE_MIN_SHIFT) |		\
	 ((unsigned int)(MAJOR) << UNICODE_MAJ_SHIFT))
/*
 * Queries about the unicode versions covered by the data tables.
 *
 * utf8version_is_supported() takes a version as separate major, minor and
 * revision numbers.
 * utf8version_latest() reports the highest unicode version supported by the
 * data tables.
 *
 * NOTE(review): presumably utf8version_is_supported() returns non-zero for a
 * supported version and utf8version_latest() returns a value packed as by
 * UNICODE_AGE() -- confirm against the implementation.
 */
int utf8version_is_supported(uint8_t maj, uint8_t min, uint8_t rev);
int utf8version_latest(void);
/*
 * Look up the const struct utf8data that matches a unicode version.
 * Both functions return NULL if the version requested is too new.
 *
 * maxage is the unicode version being requested.
 * NOTE(review): presumably encoded with UNICODE_AGE() -- confirm.
 *
 * Two normalization forms are supported: nfdi and nfdicf.
 *
 * nfdi, returned by utf8nfdi():
 *  - Apply unicode normalization form NFD.
 *  - Remove any Default_Ignorable_Code_Point.
 *
 * nfdicf, returned by utf8nfdicf():
 *  - Apply unicode normalization form NFD.
 *  - Remove any Default_Ignorable_Code_Point.
 *  - Apply a full casefold (C + F).
 */
const struct utf8data *utf8nfdi(unsigned int maxage);
const struct utf8data *utf8nfdicf(unsigned int maxage);
/*
 * Determine the maximum age of any unicode character in the string.
 *
 * utf8agemax() takes the string s on its own; utf8nagemax() additionally
 * takes a length len.
 * NOTE(review): presumably s must be NUL-terminated for utf8agemax() and len
 * bounds the number of bytes examined by utf8nagemax() -- confirm against
 * the implementation.
 *
 * Returns 0 if only unassigned code points are present.
 * Returns -1 if the input is not valid UTF-8.
 */
int utf8agemax(const struct utf8data *data, const char *s);
int utf8nagemax(const struct utf8data *data, const char *s, size_t len);
/*
 * Determine the minimum age of any unicode character in the string.
 *
 * utf8agemin() takes the string s on its own; utf8nagemin() additionally
 * takes a length len.
 * NOTE(review): presumably s must be NUL-terminated for utf8agemin() and len
 * bounds the number of bytes examined by utf8nagemin() -- confirm against
 * the implementation.
 *
 * Returns 0 if any unassigned code points are present.
 * Returns -1 if the input is not valid UTF-8.
 */
int utf8agemin(const struct utf8data *data, const char *s);
int utf8nagemin(const struct utf8data *data, const char *s, size_t len);
/*
 * Determine the length of the normalized form of the string,
 * excluding any terminating NUL byte.
 *
 * utf8len() takes the string s on its own; utf8nlen() additionally takes
 * a length len.
 * NOTE(review): presumably s must be NUL-terminated for utf8len() and len
 * bounds the number of input bytes examined by utf8nlen() -- confirm against
 * the implementation.
 *
 * Returns 0 if only ignorable code points are present.
 * Returns -1 if the input is not valid UTF-8.
 */
ssize_t utf8len(const struct utf8data *data, const char *s);
ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len);
/*
 * Number of bytes in the hangul[] member of struct utf8cursor.
 * It has to be defined ahead of the structure that uses it.
 */
#define UTF8HANGULLEAF	(12)
87 * Cursor structure used by the normalizer.
90 const struct utf8data *data;
99 unsigned char hangul[UTF8HANGULLEAF];
/*
 * Initialize a utf8cursor to normalize a string.
 *
 * u8c is the cursor to set up and s is the string to normalize;
 * utf8ncursor() additionally takes a length len.
 *
 * Returns 0 on success.
 * Returns -1 on failure.
 *
 * NOTE(review): the trailing "const char *s" parameter of utf8cursor() was
 * missing from this declaration and has been restored by analogy with
 * utf8ncursor() and the other s / s+len function pairs in this header --
 * confirm against the implementation.
 */
int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
	       const char *s);
int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
		const char *s, size_t len);
/*
 * Fetch the next byte of the normalization from the cursor u8c.
 *
 * Returns a value > 0 && < 256 on success.
 * Returns 0 when the end of the normalization is reached.
 * Returns -1 if the string being normalized is not valid UTF-8.
 */
int utf8byte(struct utf8cursor *u8c);
120 #endif /* UTF8NORM_H */