/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458). * * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F), * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also * the section 3C man pages. * Interface stability: Committed. */ #include #ifdef _KERNEL #include #include #include #include #include #include #include #else #include #include #endif /* _KERNEL */ #include #include #include /* The maximum possible number of bytes in a UTF-8 character. */ #define U8_MB_CUR_MAX (4) /* * The maximum number of bytes needed for a UTF-8 character to cover * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2. */ #define U8_MAX_BYTES_UCS2 (3) /* The maximum possible number of bytes in a Stream-Safe Text. */ #define U8_STREAM_SAFE_TEXT_MAX (128) /* * The maximum number of characters in a combining/conjoining sequence and * the actual upperbound limit of a combining/conjoining sequence. */ #define U8_MAX_CHARS_A_SEQ (32) #define U8_UPPER_LIMIT_IN_A_SEQ (31) /* The combining class value for Starter. */ #define U8_COMBINING_CLASS_STARTER (0) /* * Some Hangul related macros at below. * * The first and the last of Hangul syllables, Hangul Jamo Leading consonants, * Vowels, and optional Trailing consonants in Unicode scalar values. * * Please be noted that the U8_HANGUL_JAMO_T_FIRST is 0x11A7 at below not * the actual U+11A8. This is due to that the trailing consonant is optional * and thus we are doing a pre-calculation of subtracting one. * * Each of 19 modern leading consonants has total 588 possible syllables since * Hangul has 21 modern vowels and 27 modern trailing consonants plus 1 for * no trailing consonant case, i.e., 21 x 28 = 588. * * We also have bunch of Hangul related macros at below. Please bear in mind * that the U8_HANGUL_JAMO_1ST_BYTE can be used to check whether it is * a Hangul Jamo or not but the value does not guarantee that it is a Hangul * Jamo; it just guarantee that it will be most likely. */ #define U8_HANGUL_SYL_FIRST (0xAC00U) #define U8_HANGUL_SYL_LAST (0xD7A3U) #define U8_HANGUL_JAMO_L_FIRST (0x1100U) #define U8_HANGUL_JAMO_L_LAST (0x1112U) #define U8_HANGUL_JAMO_V_FIRST (0x1161U) #define U8_HANGUL_JAMO_V_LAST (0x1175U) #define U8_HANGUL_JAMO_T_FIRST (0x11A7U) #define U8_HANGUL_JAMO_T_LAST (0x11C2U) #define U8_HANGUL_V_COUNT (21) #define U8_HANGUL_VT_COUNT (588) #define U8_HANGUL_T_COUNT (28) #define U8_HANGUL_JAMO_1ST_BYTE (0xE1U) #define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \ (s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \ (s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \ (s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU)); #define U8_HANGUL_JAMO_L(u) \ ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST) #define U8_HANGUL_JAMO_V(u) \ ((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST) #define U8_HANGUL_JAMO_T(u) \ ((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) #define U8_HANGUL_JAMO(u) \ ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) #define U8_HANGUL_SYLLABLE(u) \ ((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST) #define U8_HANGUL_COMPOSABLE_L_V(s, u) \ ((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u))) #define U8_HANGUL_COMPOSABLE_LV_T(s, u) \ ((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u))) /* The types of decomposition mappings. */ #define U8_DECOMP_BOTH (0xF5U) #define U8_DECOMP_CANONICAL (0xF6U) /* The indicator for 16-bit table. */ #define U8_16BIT_TABLE_INDICATOR (0x8000U) /* The following are some convenience macros. */ #define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \ (u) = ((((uint32_t)(b1) & 0x0F) << 12) | \ (((uint32_t)(b2) & 0x3F) << 6) | \ ((uint32_t)(b3) & 0x3F)); #define U8_SIMPLE_SWAP(a, b, t) \ (t) = (a); \ (a) = (b); \ (b) = (t); #define U8_ASCII_TOUPPER(c) \ (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c)) #define U8_ASCII_TOLOWER(c) \ (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c)) #define U8_ISASCII(c) (((uchar_t)(c)) < 0x80U) /* * The following macro assumes that the two characters that are to be * swapped are adjacent to each other and 'a' comes before 'b'. * * If the assumptions are not met, then, the macro will fail. */ #define U8_SWAP_COMB_MARKS(a, b) \ for (k = 0; k < disp[(a)]; k++) \ u8t[k] = u8s[start[(a)] + k]; \ for (k = 0; k < disp[(b)]; k++) \ u8s[start[(a)] + k] = u8s[start[(b)] + k]; \ start[(b)] = start[(a)] + disp[(b)]; \ for (k = 0; k < disp[(a)]; k++) \ u8s[start[(b)] + k] = u8t[k]; \ U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \ U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); /* The possible states during normalization. */ typedef enum { U8_STATE_START = 0, U8_STATE_HANGUL_L = 1, U8_STATE_HANGUL_LV = 2, U8_STATE_HANGUL_LVT = 3, U8_STATE_HANGUL_V = 4, U8_STATE_HANGUL_T = 5, U8_STATE_COMBINING_MARK = 6 } u8_normalization_states_t; /* * The three vectors at below are used to check bytes of a given UTF-8 * character are valid and not containing any malformed byte values. * * We used to have a quite relaxed UTF-8 binary representation but then there * was some security related issues and so the Unicode Consortium defined * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it * one more time at the Unicode 3.2. The following three tables are based on * that. */ #define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF) #define I_ U8_ILLEGAL_CHAR #define O_ U8_OUT_OF_RANGE_CHAR const int8_t u8_number_of_bytes[0x100] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, /* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, /* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, /* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, }; #undef I_ #undef O_ const uint8_t u8_valid_min_2nd_byte[0x100] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* C0 C1 C2 C3 C4 C5 C6 C7 */ 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* C8 C9 CA CB CC CD CE CF */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* D0 D1 D2 D3 D4 D5 D6 D7 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* D8 D9 DA DB DC DD DE DF */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* E0 E1 E2 E3 E4 E5 E6 E7 */ 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* E8 E9 EA EB EC ED EE EF */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* F0 F1 F2 F3 F4 F5 F6 F7 */ 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; const uint8_t u8_valid_max_2nd_byte[0x100] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* C0 C1 C2 C3 C4 C5 C6 C7 */ 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, /* C8 C9 CA CB CC CD CE CF */ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, /* D0 D1 D2 D3 D4 D5 D6 D7 */ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, /* D8 D9 DA DB DC DD DE DF */ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, /* E0 E1 E2 E3 E4 E5 E6 E7 */ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, /* E8 E9 EA EB EC ED EE EF */ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, /* F0 F1 F2 F3 F4 F5 F6 F7 */ 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; /* * The u8_validate() validates on the given UTF-8 character string and * calculate the byte length. It is quite similar to mblen(3C) except that * this will validate against the list of characters if required and * specific to UTF-8 and Unicode. */ int u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum) { uchar_t *ib; uchar_t *ibtail; uchar_t **p; uchar_t *s1; uchar_t *s2; uchar_t f; int sz; size_t i; int ret_val; boolean_t second; boolean_t no_need_to_validate_entire; boolean_t check_additional; boolean_t validate_ucs2_range_only; if (! u8str) return (0); ib = (uchar_t *)u8str; ibtail = ib + n; ret_val = 0; no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE); check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL; validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE; while (ib < ibtail) { /* * The first byte of a UTF-8 character tells how many * bytes will follow for the character. If the first byte * is an illegal byte value or out of range value, we just * return -1 with an appropriate error number. */ sz = u8_number_of_bytes[*ib]; if (sz == U8_ILLEGAL_CHAR) { *errnum = EILSEQ; return (-1); } if (sz == U8_OUT_OF_RANGE_CHAR || (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) { *errnum = ERANGE; return (-1); } /* * If we don't have enough bytes to check on, that's also * an error. As you can see, we give illegal byte sequence * checking higher priority then EINVAL cases. */ if ((ibtail - ib) < sz) { *errnum = EINVAL; return (-1); } if (sz == 1) { ib++; ret_val++; } else { /* * Check on the multi-byte UTF-8 character. For more * details on this, see comment added for the used * data structures at the beginning of the file. */ f = *ib++; ret_val++; second = B_TRUE; for (i = 1; i < sz; i++) { if (second) { if (*ib < u8_valid_min_2nd_byte[f] || *ib > u8_valid_max_2nd_byte[f]) { *errnum = EILSEQ; return (-1); } second = B_FALSE; } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) { *errnum = EILSEQ; return (-1); } ib++; ret_val++; } } if (check_additional) { for (p = (uchar_t **)list, i = 0; p[i]; i++) { s1 = ib - sz; s2 = p[i]; while (s1 < ib) { if (*s1 != *s2 || *s2 == '\0') break; s1++; s2++; } if (s1 >= ib && *s2 == '\0') { *errnum = EBADF; return (-1); } } } if (no_need_to_validate_entire) break; } return (ret_val); } /* * The do_case_conv() looks at the mapping tables and returns found * bytes if any. If not found, the input bytes are returned. The function * always terminate the return bytes with a null character assuming that * there are plenty of room to do so. * * The case conversions are simple case conversions mapping a character to * another character as specified in the Unicode data. The byte size of * the mapped character could be different from that of the input character. * * The return value is the byte length of the returned character excluding * the terminating null byte. */ static size_t do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper) { size_t i; uint16_t b1 = 0; uint16_t b2 = 0; uint16_t b3 = 0; uint16_t b3_tbl; uint16_t b3_base; uint16_t b4 = 0; size_t start_id; size_t end_id; /* * At this point, the only possible values for sz are 2, 3, and 4. * The u8s should point to a vector that is well beyond the size of * 5 bytes. */ if (sz == 2) { b3 = u8s[0] = s[0]; b4 = u8s[1] = s[1]; } else if (sz == 3) { b2 = u8s[0] = s[0]; b3 = u8s[1] = s[1]; b4 = u8s[2] = s[2]; } else if (sz == 4) { b1 = u8s[0] = s[0]; b2 = u8s[1] = s[1]; b3 = u8s[2] = s[2]; b4 = u8s[3] = s[3]; } else { /* This is not possible but just in case as a fallback. */ if (is_it_toupper) *u8s = U8_ASCII_TOUPPER(*s); else *u8s = U8_ASCII_TOLOWER(*s); u8s[1] = '\0'; return (1); } u8s[sz] = '\0'; /* * Let's find out if we have a corresponding character. */ b1 = u8_common_b1_tbl[uv][b1]; if (b1 == U8_TBL_ELEMENT_NOT_DEF) return ((size_t)sz); b2 = u8_case_common_b2_tbl[uv][b1][b2]; if (b2 == U8_TBL_ELEMENT_NOT_DEF) return ((size_t)sz); if (is_it_toupper) { b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id; if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) return ((size_t)sz); start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4]; end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1]; /* Either there is no match or an error at the table. */ if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) return ((size_t)sz); b3_base = u8_toupper_b3_tbl[uv][b2][b3].base; for (i = 0; start_id < end_id; start_id++) u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id]; } else { b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id; if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) return ((size_t)sz); start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4]; end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1]; if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) return ((size_t)sz); b3_base = u8_tolower_b3_tbl[uv][b2][b3].base; for (i = 0; start_id < end_id; start_id++) u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id]; } /* * If i is still zero, that means there is no corresponding character. */ if (i == 0) return ((size_t)sz); u8s[i] = '\0'; return (i); } /* * The do_case_compare() function compares the two input strings, s1 and s2, * one character at a time doing case conversions if applicable and return * the comparison result as like strcmp(). * * Since, in empirical sense, most of text data are 7-bit ASCII characters, * we treat the 7-bit ASCII characters as a special case trying to yield * faster processing time. */ static int do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2, boolean_t is_it_toupper, int *errnum) { int f; int sz1; int sz2; size_t j; size_t i1; size_t i2; uchar_t u8s1[U8_MB_CUR_MAX + 1]; uchar_t u8s2[U8_MB_CUR_MAX + 1]; i1 = i2 = 0; while (i1 < n1 && i2 < n2) { /* * Find out what would be the byte length for this UTF-8 * character at string s1 and also find out if this is * an illegal start byte or not and if so, issue a proper * error number and yet treat this byte as a character. */ sz1 = u8_number_of_bytes[*s1]; if (sz1 < 0) { *errnum = EILSEQ; sz1 = 1; } /* * For 7-bit ASCII characters mainly, we do a quick case * conversion right at here. * * If we don't have enough bytes for this character, issue * an EINVAL error and use what are available. * * If we have enough bytes, find out if there is * a corresponding uppercase character and if so, copy over * the bytes for a comparison later. If there is no * corresponding uppercase character, then, use what we have * for the comparison. */ if (sz1 == 1) { if (is_it_toupper) u8s1[0] = U8_ASCII_TOUPPER(*s1); else u8s1[0] = U8_ASCII_TOLOWER(*s1); s1++; u8s1[1] = '\0'; } else if ((i1 + sz1) > n1) { *errnum = EINVAL; for (j = 0; (i1 + j) < n1; ) u8s1[j++] = *s1++; u8s1[j] = '\0'; } else { (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper); s1 += sz1; } /* Do the same for the string s2. */ sz2 = u8_number_of_bytes[*s2]; if (sz2 < 0) { *errnum = EILSEQ; sz2 = 1; } if (sz2 == 1) { if (is_it_toupper) u8s2[0] = U8_ASCII_TOUPPER(*s2); else u8s2[0] = U8_ASCII_TOLOWER(*s2); s2++; u8s2[1] = '\0'; } else if ((i2 + sz2) > n2) { *errnum = EINVAL; for (j = 0; (i2 + j) < n2; ) u8s2[j++] = *s2++; u8s2[j] = '\0'; } else { (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper); s2 += sz2; } /* Now compare the two characters. */ if (sz1 == 1 && sz2 == 1) { if (*u8s1 > *u8s2) return (1); if (*u8s1 < *u8s2) return (-1); } else { f = strcmp((const char *)u8s1, (const char *)u8s2); if (f != 0) return (f); } /* * They were the same. Let's move on to the next * characters then. */ i1 += sz1; i2 += sz2; } /* * We compared until the end of either or both strings. * * If we reached to or went over the ends for the both, that means * they are the same. * * If we reached only one of the two ends, that means the other string * has something which then the fact can be used to determine * the return value. */ if (i1 >= n1) { if (i2 >= n2) return (0); return (-1); } return (1); } /* * The combining_class() function checks on the given bytes and find out * the corresponding Unicode combining class value. The return value 0 means * it is a Starter. Any illegal UTF-8 character will also be treated as * a Starter. */ static uchar_t combining_class(size_t uv, uchar_t *s, size_t sz) { uint16_t b1 = 0; uint16_t b2 = 0; uint16_t b3 = 0; uint16_t b4 = 0; if (sz == 1 || sz > 4) return (0); if (sz == 2) { b3 = s[0]; b4 = s[1]; } else if (sz == 3) { b2 = s[0]; b3 = s[1]; b4 = s[2]; } else if (sz == 4) { b1 = s[0]; b2 = s[1]; b3 = s[2]; b4 = s[3]; } b1 = u8_common_b1_tbl[uv][b1]; if (b1 == U8_TBL_ELEMENT_NOT_DEF) return (0); b2 = u8_combining_class_b2_tbl[uv][b1][b2]; if (b2 == U8_TBL_ELEMENT_NOT_DEF) return (0); b3 = u8_combining_class_b3_tbl[uv][b2][b3]; if (b3 == U8_TBL_ELEMENT_NOT_DEF) return (0); return (u8_combining_class_b4_tbl[uv][b3][b4]); } /* * The do_decomp() function finds out a matching decomposition if any * and return. If there is no match, the input bytes are copied and returned. * The function also checks if there is a Hangul, decomposes it if necessary * and returns. * * To save time, a single byte 7-bit ASCII character should be handled by * the caller. * * The function returns the number of bytes returned sans always terminating * the null byte. It will also return a state that will tell if there was * a Hangul character decomposed which then will be used by the caller. */ static size_t do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t canonical_decomposition, u8_normalization_states_t *state) { uint16_t b1 = 0; uint16_t b2 = 0; uint16_t b3 = 0; uint16_t b3_tbl; uint16_t b3_base; uint16_t b4 = 0; size_t start_id; size_t end_id; size_t i; uint32_t u1; if (sz == 2) { b3 = u8s[0] = s[0]; b4 = u8s[1] = s[1]; u8s[2] = '\0'; } else if (sz == 3) { /* Convert it to a Unicode scalar value. */ U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]); /* * If this is a Hangul syllable, we decompose it into * a leading consonant, a vowel, and an optional trailing * consonant and then return. */ if (U8_HANGUL_SYLLABLE(u1)) { u1 -= U8_HANGUL_SYL_FIRST; b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT; b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT) / U8_HANGUL_T_COUNT; b3 = u1 % U8_HANGUL_T_COUNT; U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1); U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2); if (b3) { b3 += U8_HANGUL_JAMO_T_FIRST; U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3); u8s[9] = '\0'; *state = U8_STATE_HANGUL_LVT; return (9); } u8s[6] = '\0'; *state = U8_STATE_HANGUL_LV; return (6); } b2 = u8s[0] = s[0]; b3 = u8s[1] = s[1]; b4 = u8s[2] = s[2]; u8s[3] = '\0'; /* * If this is a Hangul Jamo, we know there is nothing * further that we can decompose. */ if (U8_HANGUL_JAMO_L(u1)) { *state = U8_STATE_HANGUL_L; return (3); } if (U8_HANGUL_JAMO_V(u1)) { if (*state == U8_STATE_HANGUL_L) *state = U8_STATE_HANGUL_LV; else *state = U8_STATE_HANGUL_V; return (3); } if (U8_HANGUL_JAMO_T(u1)) { if (*state == U8_STATE_HANGUL_LV) *state = U8_STATE_HANGUL_LVT; else *state = U8_STATE_HANGUL_T; return (3); } } else if (sz == 4) { b1 = u8s[0] = s[0]; b2 = u8s[1] = s[1]; b3 = u8s[2] = s[2]; b4 = u8s[3] = s[3]; u8s[4] = '\0'; } else { /* * This is a fallback and should not happen if the function * was called properly. */ u8s[0] = s[0]; u8s[1] = '\0'; *state = U8_STATE_START; return (1); } /* * At this point, this rountine does not know what it would get. * The caller should sort it out if the state isn't a Hangul one. */ *state = U8_STATE_START; /* Try to find matching decomposition mapping byte sequence. */ b1 = u8_common_b1_tbl[uv][b1]; if (b1 == U8_TBL_ELEMENT_NOT_DEF) return ((size_t)sz); b2 = u8_decomp_b2_tbl[uv][b1][b2]; if (b2 == U8_TBL_ELEMENT_NOT_DEF) return ((size_t)sz); b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id; if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) return ((size_t)sz); /* * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR * which is 0x8000, this means we couldn't fit the mappings into * the cardinality of a unsigned byte. */ if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { b3_tbl -= U8_16BIT_TABLE_INDICATOR; start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4]; end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; } else { start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4]; end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1]; } /* This also means there wasn't any matching decomposition. */ if (start_id >= end_id) return ((size_t)sz); /* * The final table for decomposition mappings has three types of * byte sequences depending on whether a mapping is for compatibility * decomposition, canonical decomposition, or both like the following: * * (1) Compatibility decomposition mappings: * * +---+---+-...-+---+ * | B0| B1| ... | Bm| * +---+---+-...-+---+ * * The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH). * * (2) Canonical decomposition mappings: * * +---+---+---+-...-+---+ * | T | b0| b1| ... | bn| * +---+---+---+-...-+---+ * * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL). * * (3) Both mappings: * * +---+---+---+---+-...-+---+---+---+-...-+---+ * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm| * +---+---+---+---+-...-+---+---+---+-...-+---+ * * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement * byte, b0 to bn are canonical mapping bytes and B0 to Bm are * compatibility mapping bytes. * * Note that compatibility decomposition means doing recursive * decompositions using both compatibility decomposition mappings and * canonical decomposition mappings. On the other hand, canonical * decomposition means doing recursive decompositions using only * canonical decomposition mappings. Since the table we have has gone * through the recursions already, we do not need to do so during * runtime, i.e., the table has been completely flattened out * already. */ b3_base = u8_decomp_b3_tbl[uv][b2][b3].base; /* Get the type, T, of the byte sequence. */ b1 = u8_decomp_final_tbl[uv][b3_base + start_id]; /* * If necessary, adjust start_id, end_id, or both. Note that if * this is compatibility decomposition mapping, there is no * adjustment. */ if (canonical_decomposition) { /* Is the mapping only for compatibility decomposition? */ if (b1 < U8_DECOMP_BOTH) return ((size_t)sz); start_id++; if (b1 == U8_DECOMP_BOTH) { end_id = start_id + u8_decomp_final_tbl[uv][b3_base + start_id]; start_id++; } } else { /* * Unless this is a compatibility decomposition mapping, * we adjust the start_id. */ if (b1 == U8_DECOMP_BOTH) { start_id++; start_id += u8_decomp_final_tbl[uv][b3_base + start_id]; } else if (b1 == U8_DECOMP_CANONICAL) { start_id++; } } for (i = 0; start_id < end_id; start_id++) u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id]; u8s[i] = '\0'; return (i); } /* * The find_composition_start() function uses the character bytes given and * find out the matching composition mappings if any and return the address * to the composition mappings as explained in the do_composition(). */ static uchar_t * find_composition_start(size_t uv, uchar_t *s, size_t sz) { uint16_t b1 = 0; uint16_t b2 = 0; uint16_t b3 = 0; uint16_t b3_tbl; uint16_t b3_base; uint16_t b4 = 0; size_t start_id; size_t end_id; if (sz == 1) { b4 = s[0]; } else if (sz == 2) { b3 = s[0]; b4 = s[1]; } else if (sz == 3) { b2 = s[0]; b3 = s[1]; b4 = s[2]; } else if (sz == 4) { b1 = s[0]; b2 = s[1]; b3 = s[2]; b4 = s[3]; } else { /* * This is a fallback and should not happen if the function * was called properly. */ return (NULL); } b1 = u8_composition_b1_tbl[uv][b1]; if (b1 == U8_TBL_ELEMENT_NOT_DEF) return (NULL); b2 = u8_composition_b2_tbl[uv][b1][b2]; if (b2 == U8_TBL_ELEMENT_NOT_DEF) return (NULL); b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id; if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) return (NULL); if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { b3_tbl -= U8_16BIT_TABLE_INDICATOR; start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4]; end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; } else { start_id = u8_composition_b4_tbl[uv][b3_tbl][b4]; end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1]; } if (start_id >= end_id) return (NULL); b3_base = u8_composition_b3_tbl[uv][b2][b3].base; return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id])); } /* * The blocked() function checks on the combining class values of previous * characters in this sequence and return whether it is blocked or not. */ static boolean_t blocked(uchar_t *comb_class, size_t last) { uchar_t my_comb_class; size_t i; my_comb_class = comb_class[last]; for (i = 1; i < last; i++) if (comb_class[i] >= my_comb_class || comb_class[i] == U8_COMBINING_CLASS_STARTER) return (B_TRUE); return (B_FALSE); } /* * The do_composition() reads the character string pointed by 's' and * do necessary canonical composition and then copy over the result back to * the 's'. * * The input argument 's' cannot contain more than 32 characters. */ static size_t do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start, uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast) { uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1]; uchar_t tc[U8_MB_CUR_MAX]; uint8_t saved_marks[U8_MAX_CHARS_A_SEQ]; size_t saved_marks_count; uchar_t *p; uchar_t *saved_p; uchar_t *q; size_t i; size_t saved_i; size_t j; size_t k; size_t l; size_t C; size_t saved_l; size_t size; uint32_t u1; uint32_t u2; boolean_t match_not_found = B_TRUE; /* * This should never happen unless the callers are doing some strange * and unexpected things. * * The "last" is the index pointing to the last character not last + 1. */ if (last >= U8_MAX_CHARS_A_SEQ) last = U8_UPPER_LIMIT_IN_A_SEQ; for (i = l = 0; i <= last; i++) { /* * The last or any non-Starters at the beginning, we don't * have any chance to do composition and so we just copy them * to the temporary buffer. */ if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) { SAVE_THE_CHAR: p = s + start[i]; size = disp[i]; for (k = 0; k < size; k++) t[l++] = *p++; continue; } /* * If this could be a start of Hangul Jamos, then, we try to * conjoin them. */ if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) { U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]], s[start[i] + 1], s[start[i] + 2]); U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3], s[start[i] + 4], s[start[i] + 5]); if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) { u1 -= U8_HANGUL_JAMO_L_FIRST; u2 -= U8_HANGUL_JAMO_V_FIRST; u1 = U8_HANGUL_SYL_FIRST + (u1 * U8_HANGUL_V_COUNT + u2) * U8_HANGUL_T_COUNT; i += 2; if (i <= last) { U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i]], s[start[i] + 1], s[start[i] + 2]); if (U8_HANGUL_JAMO_T(u2)) { u1 += u2 - U8_HANGUL_JAMO_T_FIRST; i++; } } U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1); i--; l += 3; continue; } } /* * Let's then find out if this Starter has composition * mapping. */ p = find_composition_start(uv, s + start[i], disp[i]); if (p == NULL) goto SAVE_THE_CHAR; /* * We have a Starter with composition mapping and the next * character is a non-Starter. Let's try to find out if * we can do composition. */ saved_p = p; saved_i = i; saved_l = l; saved_marks_count = 0; TRY_THE_NEXT_MARK: q = s + start[++i]; size = disp[i]; /* * The next for() loop compares the non-Starter pointed by * 'q' with the possible (joinable) characters pointed by 'p'. * * The composition final table entry pointed by the 'p' * looks like the following: * * +---+---+---+-...-+---+---+---+---+-...-+---+---+ * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F | * +---+---+---+-...-+---+---+---+---+-...-+---+---+ * * where C is the count byte indicating the number of * mapping pairs where each pair would be look like * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second * character of a canonical decomposition and the B0-Bm are * the bytes of a matching composite character. The F is * a filler byte after each character as the separator. */ match_not_found = B_TRUE; for (C = *p++; C > 0; C--) { for (k = 0; k < size; p++, k++) if (*p != q[k]) break; /* Have we found it? */ if (k >= size && *p == U8_TBL_ELEMENT_FILLER) { match_not_found = B_FALSE; l = saved_l; while (*++p != U8_TBL_ELEMENT_FILLER) t[l++] = *p; break; } /* We didn't find; skip to the next pair. */ if (*p != U8_TBL_ELEMENT_FILLER) while (*++p != U8_TBL_ELEMENT_FILLER) ; while (*++p != U8_TBL_ELEMENT_FILLER) ; p++; } /* * If there was no match, we will need to save the combining * mark for later appending. After that, if the next one * is a non-Starter and not blocked, then, we try once * again to do composition with the next non-Starter. * * If there was no match and this was a Starter, then, * this is a new start. * * If there was a match and a composition done and we have * more to check on, then, we retrieve a new composition final * table entry for the composite and then try to do the * composition again. */ if (match_not_found) { if (comb_class[i] == U8_COMBINING_CLASS_STARTER) { i--; goto SAVE_THE_CHAR; } saved_marks[saved_marks_count++] = i; } if (saved_l == l) { while (i < last) { if (blocked(comb_class, i + 1)) saved_marks[saved_marks_count++] = ++i; else break; } if (i < last) { p = saved_p; goto TRY_THE_NEXT_MARK; } } else if (i < last) { p = find_composition_start(uv, t + saved_l, l - saved_l); if (p != NULL) { saved_p = p; goto TRY_THE_NEXT_MARK; } } /* * There is no more composition possible. * * If there was no composition what so ever then we copy * over the original Starter and then append any non-Starters * remaining at the target string sequentially after that. */ if (saved_l == l) { p = s + start[saved_i]; size = disp[saved_i]; for (j = 0; j < size; j++) t[l++] = *p++; } for (k = 0; k < saved_marks_count; k++) { p = s + start[saved_marks[k]]; size = disp[saved_marks[k]]; for (j = 0; j < size; j++) t[l++] = *p++; } } /* * If the last character is a Starter and if we have a character * (possibly another Starter) that can be turned into a composite, * we do so and we do so until there is no more of composition * possible. */ if (comb_class[last] == U8_COMBINING_CLASS_STARTER) { p = *os; saved_l = l - disp[last]; while (p < oslast) { size = u8_number_of_bytes[*p]; if (size <= 1 || (p + size) > oslast) break; saved_p = p; for (i = 0; i < size; i++) tc[i] = *p++; q = find_composition_start(uv, t + saved_l, l - saved_l); if (q == NULL) { p = saved_p; break; } match_not_found = B_TRUE; for (C = *q++; C > 0; C--) { for (k = 0; k < size; q++, k++) if (*q != tc[k]) break; if (k >= size && *q == U8_TBL_ELEMENT_FILLER) { match_not_found = B_FALSE; l = saved_l; while (*++q != U8_TBL_ELEMENT_FILLER) { /* * This is practically * impossible but we don't * want to take any chances. */ if (l >= U8_STREAM_SAFE_TEXT_MAX) { p = saved_p; goto SAFE_RETURN; } t[l++] = *q; } break; } if (*q != U8_TBL_ELEMENT_FILLER) while (*++q != U8_TBL_ELEMENT_FILLER) ; while (*++q != U8_TBL_ELEMENT_FILLER) ; q++; } if (match_not_found) { p = saved_p; break; } } SAFE_RETURN: *os = p; } /* * Now we copy over the temporary string to the target string. * Since composition always reduces the number of characters or * the number of characters stay, we don't need to worry about * the buffer overflow here. */ for (i = 0; i < l; i++) s[i] = t[i]; s[l] = '\0'; return (l); } /* * The collect_a_seq() function checks on the given string s, collect * a sequence of characters at u8s, and return the sequence. While it collects * a sequence, it also applies case conversion, canonical or compatibility * decomposition, canonical decomposition, or some or all of them and * in that order. * * The collected sequence cannot be bigger than 32 characters since if * it is having more than 31 characters, the sequence will be terminated * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into * a Stream-Safe Text. The collected sequence is always terminated with * a null byte and the return value is the byte length of the sequence * including 0. The return value does not include the terminating * null byte. */ static size_t collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast, boolean_t is_it_toupper, boolean_t is_it_tolower, boolean_t canonical_decomposition, boolean_t compatibility_decomposition, boolean_t canonical_composition, int *errnum, u8_normalization_states_t *state) { uchar_t *s; int sz; int saved_sz; size_t i; size_t j; size_t k; size_t l; uchar_t comb_class[U8_MAX_CHARS_A_SEQ]; uchar_t disp[U8_MAX_CHARS_A_SEQ]; uchar_t start[U8_MAX_CHARS_A_SEQ]; uchar_t u8t[U8_MB_CUR_MAX]; uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1]; uchar_t tc; size_t last; size_t saved_last; uint32_t u1; /* * Save the source string pointer which we will return a changed * pointer if we do processing. */ s = *source; /* * The following is a fallback for just in case callers are not * checking the string boundaries before the calling. */ if (s >= slast) { u8s[0] = '\0'; return (0); } /* * As the first thing, let's collect a character and do case * conversion if necessary. */ sz = u8_number_of_bytes[*s]; if (sz < 0) { *errnum = EILSEQ; u8s[0] = *s++; u8s[1] = '\0'; *source = s; return (1); } if (sz == 1) { if (is_it_toupper) u8s[0] = U8_ASCII_TOUPPER(*s); else if (is_it_tolower) u8s[0] = U8_ASCII_TOLOWER(*s); else u8s[0] = *s; s++; u8s[1] = '\0'; } else if ((s + sz) > slast) { *errnum = EINVAL; for (i = 0; s < slast; ) u8s[i++] = *s++; u8s[i] = '\0'; *source = s; return (i); } else { if (is_it_toupper || is_it_tolower) { i = do_case_conv(uv, u8s, s, sz, is_it_toupper); s += sz; sz = i; } else { for (i = 0; i < sz; ) u8s[i++] = *s++; u8s[i] = '\0'; } } /* * And then canonical/compatibility decomposition followed by * an optional canonical composition. Please be noted that * canonical composition is done only when a decomposition is * done. */ if (canonical_decomposition || compatibility_decomposition) { if (sz == 1) { *state = U8_STATE_START; saved_sz = 1; comb_class[0] = 0; start[0] = 0; disp[0] = 1; last = 1; } else { saved_sz = do_decomp(uv, u8s, u8s, sz, canonical_decomposition, state); last = 0; for (i = 0; i < saved_sz; ) { sz = u8_number_of_bytes[u8s[i]]; comb_class[last] = combining_class(uv, u8s + i, sz); start[last] = i; disp[last] = sz; last++; i += sz; } /* * Decomposition yields various Hangul related * states but not on combining marks. We need to * find out at here by checking on the last * character. */ if (*state == U8_STATE_START) { if (comb_class[last - 1]) *state = U8_STATE_COMBINING_MARK; } } saved_last = last; while (s < slast) { sz = u8_number_of_bytes[*s]; /* * If this is an illegal character, an incomplete * character, or an 7-bit ASCII Starter character, * then we have collected a sequence; break and let * the next call deal with the two cases. * * Note that this is okay only if you are using this * function with a fixed length string, not on * a buffer with multiple calls of one chunk at a time. */ if (sz <= 1) { break; } else if ((s + sz) > slast) { break; } else { /* * If the previous character was a Hangul Jamo * and this character is a Hangul Jamo that * can be conjoined, we collect the Jamo. */ if (*s == U8_HANGUL_JAMO_1ST_BYTE) { U8_PUT_3BYTES_INTO_UTF32(u1, *s, *(s + 1), *(s + 2)); if (U8_HANGUL_COMPOSABLE_L_V(*state, u1)) { i = 0; *state = U8_STATE_HANGUL_LV; goto COLLECT_A_HANGUL; } if (U8_HANGUL_COMPOSABLE_LV_T(*state, u1)) { i = 0; *state = U8_STATE_HANGUL_LVT; goto COLLECT_A_HANGUL; } } /* * Regardless of whatever it was, if this is * a Starter, we don't collect the character * since that's a new start and we will deal * with it at the next time. */ i = combining_class(uv, s, sz); if (i == U8_COMBINING_CLASS_STARTER) break; /* * We know the current character is a combining * mark. If the previous character wasn't * a Starter (not Hangul) or a combining mark, * then, we don't collect this combining mark. */ if (*state != U8_STATE_START && *state != U8_STATE_COMBINING_MARK) break; *state = U8_STATE_COMBINING_MARK; COLLECT_A_HANGUL: /* * If we collected a Starter and combining * marks up to 30, i.e., total 31 characters, * then, we terminate this degenerately long * combining sequence with a U+034F COMBINING * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in * UTF-8 and turn this into a Stream-Safe * Text. This will be extremely rare but * possible. * * The following will also guarantee that * we are not writing more than 32 characters * plus a NULL at u8s[]. */ if (last >= U8_UPPER_LIMIT_IN_A_SEQ) { TURN_STREAM_SAFE: *state = U8_STATE_START; comb_class[last] = 0; start[last] = saved_sz; disp[last] = 2; last++; u8s[saved_sz++] = 0xCD; u8s[saved_sz++] = 0x8F; break; } /* * Some combining marks also do decompose into * another combining mark or marks. */ if (*state == U8_STATE_COMBINING_MARK) { k = last; l = sz; i = do_decomp(uv, uts, s, sz, canonical_decomposition, state); for (j = 0; j < i; ) { sz = u8_number_of_bytes[uts[j]]; comb_class[last] = combining_class(uv, uts + j, sz); start[last] = saved_sz + j; disp[last] = sz; last++; if (last >= U8_UPPER_LIMIT_IN_A_SEQ) { last = k; goto TURN_STREAM_SAFE; } j += sz; } *state = U8_STATE_COMBINING_MARK; sz = i; s += l; for (i = 0; i < sz; i++) u8s[saved_sz++] = uts[i]; } else { comb_class[last] = i; start[last] = saved_sz; disp[last] = sz; last++; for (i = 0; i < sz; i++) u8s[saved_sz++] = *s++; } /* * If this is U+0345 COMBINING GREEK * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a., * iota subscript, and need to be converted to * uppercase letter, convert it to U+0399 GREEK * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8), * i.e., convert to capital adscript form as * specified in the Unicode standard. * * This is the only special case of (ambiguous) * case conversion at combining marks and * probably the standard will never have * anything similar like this in future. */ if (is_it_toupper && sz >= 2 && u8s[saved_sz - 2] == 0xCD && u8s[saved_sz - 1] == 0x85) { u8s[saved_sz - 2] = 0xCE; u8s[saved_sz - 1] = 0x99; } } } /* * Let's try to ensure a canonical ordering for the collected * combining marks. We do this only if we have collected * at least one more non-Starter. (The decomposition mapping * data tables have fully (and recursively) expanded and * canonically ordered decompositions.) * * The U8_SWAP_COMB_MARKS() convenience macro has some * assumptions and we are meeting the assumptions. */ last--; if (last >= saved_last) { for (i = 0; i < last; i++) for (j = last; j > i; j--) if (comb_class[j] && comb_class[j - 1] > comb_class[j]) { U8_SWAP_COMB_MARKS(j - 1, j); } } *source = s; if (! canonical_composition) { u8s[saved_sz] = '\0'; return (saved_sz); } /* * Now do the canonical composition. Note that we do this * only after a canonical or compatibility decomposition to * finish up NFC or NFKC. */ sz = do_composition(uv, u8s, comb_class, start, disp, last, &s, slast); } *source = s; return ((size_t)sz); } /* * The do_norm_compare() function does string comparion based on Unicode * simple case mappings and Unicode Normalization definitions. * * It does so by collecting a sequence of character at a time and comparing * the collected sequences from the strings. * * The meanings on the return values are the same as the usual strcmp(). */ static int do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2, int flag, int *errnum) { int result; size_t sz1; size_t sz2; uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1]; uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1]; uchar_t *s1last; uchar_t *s2last; boolean_t is_it_toupper; boolean_t is_it_tolower; boolean_t canonical_decomposition; boolean_t compatibility_decomposition; boolean_t canonical_composition; u8_normalization_states_t state; s1last = s1 + n1; s2last = s2 + n2; is_it_toupper = flag & U8_TEXTPREP_TOUPPER; is_it_tolower = flag & U8_TEXTPREP_TOLOWER; canonical_decomposition = flag & U8_CANON_DECOMP; compatibility_decomposition = flag & U8_COMPAT_DECOMP; canonical_composition = flag & U8_CANON_COMP; while (s1 < s1last && s2 < s2last) { /* * If the current character is a 7-bit ASCII and the last * character, or, if the current character and the next * character are both some 7-bit ASCII characters then * we treat the current character as a sequence. * * In any other cases, we need to call collect_a_seq(). */ if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last || ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) { if (is_it_toupper) u8s1[0] = U8_ASCII_TOUPPER(*s1); else if (is_it_tolower) u8s1[0] = U8_ASCII_TOLOWER(*s1); else u8s1[0] = *s1; u8s1[1] = '\0'; sz1 = 1; s1++; } else { state = U8_STATE_START; sz1 = collect_a_seq(uv, u8s1, &s1, s1last, is_it_toupper, is_it_tolower, canonical_decomposition, compatibility_decomposition, canonical_composition, errnum, &state); } if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last || ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) { if (is_it_toupper) u8s2[0] = U8_ASCII_TOUPPER(*s2); else if (is_it_tolower) u8s2[0] = U8_ASCII_TOLOWER(*s2); else u8s2[0] = *s2; u8s2[1] = '\0'; sz2 = 1; s2++; } else { state = U8_STATE_START; sz2 = collect_a_seq(uv, u8s2, &s2, s2last, is_it_toupper, is_it_tolower, canonical_decomposition, compatibility_decomposition, canonical_composition, errnum, &state); } /* * Now compare the two characters. If they are the same, * we move on to the next character sequences. */ if (sz1 == 1 && sz2 == 1) { if (*u8s1 > *u8s2) return (1); if (*u8s1 < *u8s2) return (-1); } else { result = strcmp((const char *)u8s1, (const char *)u8s2); if (result != 0) return (result); } } /* * We compared until the end of either or both strings. * * If we reached to or went over the ends for the both, that means * they are the same. * * If we reached only one end, that means the other string has * something which then can be used to determine the return value. */ if (s1 >= s1last) { if (s2 >= s2last) return (0); return (-1); } return (1); } /* * The u8_strcmp() function compares two UTF-8 strings quite similar to * the strcmp(). For the comparison, however, Unicode Normalization specific * equivalency and Unicode simple case conversion mappings based equivalency * can be requested and checked against. */ int u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv, int *errnum) { int f; size_t n1; size_t n2; *errnum = 0; /* * Check on the requested Unicode version, case conversion, and * normalization flag values. */ if (uv > U8_UNICODE_LATEST) { *errnum = ERANGE; uv = U8_UNICODE_LATEST; } if (flag == 0) { flag = U8_STRCMP_CS; } else { f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER | U8_STRCMP_CI_LOWER); if (f == 0) { flag |= U8_STRCMP_CS; } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER && f != U8_STRCMP_CI_LOWER) { *errnum = EBADF; flag = U8_STRCMP_CS; } f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC && f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) { *errnum = EBADF; flag = U8_STRCMP_CS; } } if (flag == U8_STRCMP_CS) { return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n)); } n1 = strlen(s1); n2 = strlen(s2); if (n != 0) { if (n < n1) n1 = n; if (n < n2) n2 = n; } /* * Simple case conversion can be done much faster and so we do * them separately here. */ if (flag == U8_STRCMP_CI_UPPER) { return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2, B_TRUE, errnum)); } else if (flag == U8_STRCMP_CI_LOWER) { return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2, B_FALSE, errnum)); } return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2, flag, errnum)); } size_t u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen, int flag, size_t unicode_version, int *errnum) { int f; int sz; uchar_t *ib; uchar_t *ibtail; uchar_t *ob; uchar_t *obtail; boolean_t do_not_ignore_null; boolean_t do_not_ignore_invalid; boolean_t is_it_toupper; boolean_t is_it_tolower; boolean_t canonical_decomposition; boolean_t compatibility_decomposition; boolean_t canonical_composition; size_t ret_val; size_t i; size_t j; uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1]; u8_normalization_states_t state; if (unicode_version > U8_UNICODE_LATEST) { *errnum = ERANGE; return ((size_t)-1); } f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER); if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) { *errnum = EBADF; return ((size_t)-1); } f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC && f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) { *errnum = EBADF; return ((size_t)-1); } if (inarray == NULL || *inlen == 0) return (0); if (outarray == NULL) { *errnum = E2BIG; return ((size_t)-1); } ib = (uchar_t *)inarray; ob = (uchar_t *)outarray; ibtail = ib + *inlen; obtail = ob + *outlen; do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL); do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID); is_it_toupper = flag & U8_TEXTPREP_TOUPPER; is_it_tolower = flag & U8_TEXTPREP_TOLOWER; ret_val = 0; /* * If we don't have a normalization flag set, we do the simple case * conversion based text preparation separately below. Text * preparation involving Normalization will be done in the false task * block, again, separately since it will take much more time and * resource than doing simple case conversions. */ if (f == 0) { while (ib < ibtail) { if (*ib == '\0' && do_not_ignore_null) break; sz = u8_number_of_bytes[*ib]; if (sz < 0) { if (do_not_ignore_invalid) { *errnum = EILSEQ; ret_val = (size_t)-1; break; } sz = 1; ret_val++; } if (sz == 1) { if (ob >= obtail) { *errnum = E2BIG; ret_val = (size_t)-1; break; } if (is_it_toupper) *ob = U8_ASCII_TOUPPER(*ib); else if (is_it_tolower) *ob = U8_ASCII_TOLOWER(*ib); else *ob = *ib; ib++; ob++; } else if ((ib + sz) > ibtail) { if (do_not_ignore_invalid) { *errnum = EINVAL; ret_val = (size_t)-1; break; } if ((obtail - ob) < (ibtail - ib)) { *errnum = E2BIG; ret_val = (size_t)-1; break; } /* * We treat the remaining incomplete character * bytes as a character. */ ret_val++; while (ib < ibtail) *ob++ = *ib++; } else { if (is_it_toupper || is_it_tolower) { i = do_case_conv(unicode_version, u8s, ib, sz, is_it_toupper); if ((obtail - ob) < i) { *errnum = E2BIG; ret_val = (size_t)-1; break; } ib += sz; for (sz = 0; sz < i; sz++) *ob++ = u8s[sz]; } else { if ((obtail - ob) < sz) { *errnum = E2BIG; ret_val = (size_t)-1; break; } for (i = 0; i < sz; i++) *ob++ = *ib++; } } } } else { canonical_decomposition = flag & U8_CANON_DECOMP; compatibility_decomposition = flag & U8_COMPAT_DECOMP; canonical_composition = flag & U8_CANON_COMP; while (ib < ibtail) { if (*ib == '\0' && do_not_ignore_null) break; /* * If the current character is a 7-bit ASCII * character and it is the last character, or, * if the current character is a 7-bit ASCII * character and the next character is also a 7-bit * ASCII character, then, we copy over this * character without going through collect_a_seq(). * * In any other cases, we need to look further with * the collect_a_seq() function. */ if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail || ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) { if (ob >= obtail) { *errnum = E2BIG; ret_val = (size_t)-1; break; } if (is_it_toupper) *ob = U8_ASCII_TOUPPER(*ib); else if (is_it_tolower) *ob = U8_ASCII_TOLOWER(*ib); else *ob = *ib; ib++; ob++; } else { *errnum = 0; state = U8_STATE_START; j = collect_a_seq(unicode_version, u8s, &ib, ibtail, is_it_toupper, is_it_tolower, canonical_decomposition, compatibility_decomposition, canonical_composition, errnum, &state); if (*errnum && do_not_ignore_invalid) { ret_val = (size_t)-1; break; } if ((obtail - ob) < j) { *errnum = E2BIG; ret_val = (size_t)-1; break; } for (i = 0; i < j; i++) *ob++ = u8s[i]; } } } *inlen = ibtail - ib; *outlen = obtail - ob; return (ret_val); } #if defined(_KERNEL) && defined(HAVE_SPL) static int unicode_init(void) { return 0; } static int unicode_fini(void) { return 0; } spl_module_init(unicode_init); spl_module_exit(unicode_fini); MODULE_DESCRIPTION("Unicode implementation"); MODULE_AUTHOR(ZFS_META_AUTHOR); MODULE_LICENSE(ZFS_META_LICENSE); MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); EXPORT_SYMBOL(u8_validate); EXPORT_SYMBOL(u8_strcmp); EXPORT_SYMBOL(u8_textprep_str); #endif