diff options
author | Brian Behlendorf <[email protected]> | 2008-12-11 15:38:59 -0800 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2008-12-11 15:38:59 -0800 |
commit | 6b2c60acca39ef3468797095d3a4162e6ce69786 (patch) | |
tree | 598cab78d45f233695d0e7c73e2e6ec44b8efd71 /lib/libspl | |
parent | a4076c7544bdbdc0ac0fe20f4ef86c2aa06862fb (diff) |
Moving lib/libspl to linux-libspl branch
Diffstat (limited to 'lib/libspl')
-rw-r--r-- | lib/libspl/include/sys/list.h | 67 | ||||
-rw-r--r-- | lib/libspl/include/sys/list_impl.h | 53 | ||||
-rw-r--r-- | lib/libspl/list.c | 245 | ||||
-rw-r--r-- | lib/libspl/mkdirp.c | 212 | ||||
-rw-r--r-- | lib/libspl/strlcat.c | 59 | ||||
-rw-r--r-- | lib/libspl/strlcpy.c | 55 | ||||
-rw-r--r-- | lib/libspl/strnlen.c | 47 | ||||
-rw-r--r-- | lib/libspl/u8_textprep.c | 2132 |
8 files changed, 0 insertions, 2870 deletions
diff --git a/lib/libspl/include/sys/list.h b/lib/libspl/include/sys/list.h deleted file mode 100644 index 8339b6226..000000000 --- a/lib/libspl/include/sys/list.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_LIST_H -#define _SYS_LIST_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/list_impl.h> - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct list_node list_node_t; -typedef struct list list_t; - -void list_create(list_t *, size_t, size_t); -void list_destroy(list_t *); - -void list_insert_after(list_t *, void *, void *); -void list_insert_before(list_t *, void *, void *); -void list_insert_head(list_t *, void *); -void list_insert_tail(list_t *, void *); -void list_remove(list_t *, void *); -void *list_remove_head(list_t *); -void *list_remove_tail(list_t *); -void list_move_tail(list_t *, list_t *); - -void *list_head(list_t *); -void *list_tail(list_t *); -void *list_next(list_t *, void *); -void *list_prev(list_t *, void *); -int list_is_empty(list_t *); - -void list_link_init(list_node_t *); -void list_link_replace(list_node_t *, list_node_t *); - -int list_link_active(list_node_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_LIST_H */ diff --git a/lib/libspl/include/sys/list_impl.h b/lib/libspl/include/sys/list_impl.h deleted file mode 100644 index 9c42f8832..000000000 --- a/lib/libspl/include/sys/list_impl.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2003 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_LIST_IMPL_H -#define _SYS_LIST_IMPL_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/types.h> - -#ifdef __cplusplus -extern "C" { -#endif - -struct list_node { - struct list_node *list_next; - struct list_node *list_prev; -}; - -struct list { - size_t list_size; - size_t list_offset; - struct list_node list_head; -}; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_LIST_IMPL_H */ diff --git a/lib/libspl/list.c b/lib/libspl/list.c deleted file mode 100644 index e8db13a5c..000000000 --- a/lib/libspl/list.c +++ /dev/null @@ -1,245 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * Generic doubly-linked list implementation - */ - -#include <sys/list.h> -#include <sys/list_impl.h> -#include <sys/types.h> -#include <sys/sysmacros.h> -#include <sys/debug.h> - -#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset)) -#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset)) -#define list_empty(a) ((a)->list_head.list_next == &(a)->list_head) - -#define list_insert_after_node(list, node, object) { \ - list_node_t *lnew = list_d2l(list, object); \ - lnew->list_prev = (node); \ - lnew->list_next = (node)->list_next; \ - (node)->list_next->list_prev = lnew; \ - (node)->list_next = lnew; \ -} - -#define list_insert_before_node(list, node, object) { \ - list_node_t *lnew = list_d2l(list, object); \ - lnew->list_next = (node); \ - lnew->list_prev = (node)->list_prev; \ - (node)->list_prev->list_next = lnew; \ - (node)->list_prev = lnew; \ -} - -#define list_remove_node(node) \ - (node)->list_prev->list_next = (node)->list_next; \ - (node)->list_next->list_prev = (node)->list_prev; \ - (node)->list_next = (node)->list_prev = NULL - -void -list_create(list_t *list, size_t size, size_t offset) -{ - ASSERT(list); - ASSERT(size > 0); - ASSERT(size >= offset + sizeof (list_node_t)); - - list->list_size = size; - list->list_offset = offset; - list->list_head.list_next = list->list_head.list_prev = - &list->list_head; -} - -void -list_destroy(list_t *list) -{ - list_node_t *node = &list->list_head; - - ASSERT(list); - ASSERT(list->list_head.list_next == node); - ASSERT(list->list_head.list_prev == node); - - node->list_next = node->list_prev = NULL; -} - -void -list_insert_after(list_t *list, void *object, void *nobject) -{ - if (object == NULL) { - list_insert_head(list, nobject); - } else { - list_node_t *lold = list_d2l(list, object); - list_insert_after_node(list, lold, nobject); - } -} - -void -list_insert_before(list_t *list, void *object, void *nobject) -{ - if (object == NULL) { - list_insert_tail(list, nobject); - } else { - list_node_t *lold = list_d2l(list, object); - list_insert_before_node(list, lold, nobject); - } -} - -void -list_insert_head(list_t *list, void *object) -{ - list_node_t *lold = &list->list_head; - list_insert_after_node(list, lold, object); -} - -void -list_insert_tail(list_t *list, void *object) -{ - list_node_t *lold = &list->list_head; - list_insert_before_node(list, lold, object); -} - -void -list_remove(list_t *list, void *object) -{ - list_node_t *lold = list_d2l(list, object); - ASSERT(!list_empty(list)); - ASSERT(lold->list_next != NULL); - list_remove_node(lold); -} - -void * -list_remove_head(list_t *list) -{ - list_node_t *head = list->list_head.list_next; - if (head == &list->list_head) - return (NULL); - list_remove_node(head); - return (list_object(list, head)); -} - -void * -list_remove_tail(list_t *list) -{ - list_node_t *tail = list->list_head.list_prev; - if (tail == &list->list_head) - return (NULL); - list_remove_node(tail); - return (list_object(list, tail)); -} - -void * -list_head(list_t *list) -{ - if (list_empty(list)) - return (NULL); - return (list_object(list, list->list_head.list_next)); -} - -void * -list_tail(list_t *list) -{ - if (list_empty(list)) - return (NULL); - return (list_object(list, list->list_head.list_prev)); -} - -void * -list_next(list_t *list, void *object) -{ - list_node_t *node = list_d2l(list, object); - - if (node->list_next != &list->list_head) - return (list_object(list, node->list_next)); - - return (NULL); -} - -void * -list_prev(list_t *list, void *object) -{ - list_node_t *node = list_d2l(list, object); - - if (node->list_prev != &list->list_head) - return (list_object(list, node->list_prev)); - - return (NULL); -} - -/* - * Insert src list after dst list. Empty src list thereafter. - */ -void -list_move_tail(list_t *dst, list_t *src) -{ - list_node_t *dstnode = &dst->list_head; - list_node_t *srcnode = &src->list_head; - - ASSERT(dst->list_size == src->list_size); - ASSERT(dst->list_offset == src->list_offset); - - if (list_empty(src)) - return; - - dstnode->list_prev->list_next = srcnode->list_next; - srcnode->list_next->list_prev = dstnode->list_prev; - dstnode->list_prev = srcnode->list_prev; - srcnode->list_prev->list_next = dstnode; - - /* empty src list */ - srcnode->list_next = srcnode->list_prev = srcnode; -} - -void -list_link_replace(list_node_t *lold, list_node_t *lnew) -{ - ASSERT(list_link_active(lold)); - ASSERT(!list_link_active(lnew)); - - lnew->list_next = lold->list_next; - lnew->list_prev = lold->list_prev; - lold->list_prev->list_next = lnew; - lold->list_next->list_prev = lnew; - lold->list_next = lold->list_prev = NULL; -} - -void -list_link_init(list_node_t *link) -{ - link->list_next = NULL; - link->list_prev = NULL; -} - -int -list_link_active(list_node_t *link) -{ - return (link->list_next != NULL); -} - -int -list_is_empty(list_t *list) -{ - return (list_empty(list)); -} diff --git a/lib/libspl/mkdirp.c b/lib/libspl/mkdirp.c deleted file mode 100644 index 9c81f2a0b..000000000 --- a/lib/libspl/mkdirp.c +++ /dev/null @@ -1,212 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* Copyright (c) 1988 AT&T */ -/* All Rights Reserved */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * Creates directory and it's parents if the parents do not - * exist yet. - * - * Returns -1 if fails for reasons other than non-existing - * parents. - * Does NOT simplify pathnames with . or .. in them. - */ - -#include <sys/types.h> -#include <libgen.h> -#include <stdlib.h> -#include <unistd.h> -#include <errno.h> -#include <string.h> -#include <sys/stat.h> - -static char *simplify(const char *str); - -int -mkdirp(const char *d, mode_t mode) -{ - char *endptr, *ptr, *slash, *str; - - str = simplify(d); - - /* If space couldn't be allocated for the simplified names, return. */ - - if (str == NULL) - return (-1); - - /* Try to make the directory */ - - if (mkdir(str, mode) == 0) { - free(str); - return (0); - } - if (errno != ENOENT) { - free(str); - return (-1); - } - endptr = strrchr(str, '\0'); - slash = strrchr(str, '/'); - - /* Search upward for the non-existing parent */ - - while (slash != NULL) { - - ptr = slash; - *ptr = '\0'; - - /* If reached an existing parent, break */ - - if (access(str, F_OK) == 0) - break; - - /* If non-existing parent */ - - else { - slash = strrchr(str, '/'); - - /* If under / or current directory, make it. */ - - if (slash == NULL || slash == str) { - if (mkdir(str, mode) != 0 && errno != EEXIST) { - free(str); - return (-1); - } - break; - } - } - } - - /* Create directories starting from upmost non-existing parent */ - - while ((ptr = strchr(str, '\0')) != endptr) { - *ptr = '/'; - if (mkdir(str, mode) != 0 && errno != EEXIST) { - /* - * If the mkdir fails because str already - * exists (EEXIST), then str has the form - * "existing-dir/..", and this is really - * ok. (Remember, this loop is creating the - * portion of the path that didn't exist) - */ - free(str); - return (-1); - } - } - free(str); - return (0); -} - -/* - * simplify - given a pathname, simplify that path by removing - * duplicate contiguous slashes. - * - * A simplified copy of the argument is returned to the - * caller, or NULL is returned on error. - * - * The caller should handle error reporting based upon the - * returned vlaue, and should free the returned value, - * when appropriate. - */ - -static char * -simplify(const char *str) -{ - int i; - size_t mbPathlen; /* length of multi-byte path */ - size_t wcPathlen; /* length of wide-character path */ - wchar_t *wptr; /* scratch pointer */ - wchar_t *wcPath; /* wide-character version of the path */ - char *mbPath; /* The copy fo the path to be returned */ - - /* - * bail out if there is nothing there. - */ - - if (!str) - return (NULL); - - /* - * Get a copy of the argument. - */ - - if ((mbPath = strdup(str)) == NULL) { - return (NULL); - } - - /* - * convert the multi-byte version of the path to a - * wide-character rendering, for doing our figuring. - */ - - mbPathlen = strlen(mbPath); - - if ((wcPath = calloc(sizeof (wchar_t), mbPathlen+1)) == NULL) { - free(mbPath); - return (NULL); - } - - if ((wcPathlen = mbstowcs(wcPath, mbPath, mbPathlen)) == (size_t)-1) { - free(mbPath); - free(wcPath); - return (NULL); - } - - /* - * remove duplicate slashes first ("//../" -> "/") - */ - - for (wptr = wcPath, i = 0; i < wcPathlen; i++) { - *wptr++ = wcPath[i]; - - if (wcPath[i] == '/') { - i++; - - while (wcPath[i] == '/') { - i++; - } - - i--; - } - } - - *wptr = '\0'; - - /* - * now convert back to the multi-byte format. - */ - - if (wcstombs(mbPath, wcPath, mbPathlen) == (size_t)-1) { - free(mbPath); - free(wcPath); - return (NULL); - } - - free(wcPath); - return (mbPath); -} diff --git a/lib/libspl/strlcat.c b/lib/libspl/strlcat.c deleted file mode 100644 index 07d1403dd..000000000 --- a/lib/libspl/strlcat.c +++ /dev/null @@ -1,59 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include "lint.h" -#include <string.h> -#include <sys/types.h> - -/* - * Appends src to the dstsize buffer at dst. The append will never - * overflow the destination buffer and the buffer will always be null - * terminated. Never reference beyond &dst[dstsize-1] when computing - * the length of the pre-existing string. - */ - -size_t -strlcat(char *dst, const char *src, size_t dstsize) -{ - char *df = dst; - size_t left = dstsize; - size_t l1; - size_t l2 = strlen(src); - size_t copied; - - while (left-- != 0 && *df != '\0') - df++; - l1 = df - dst; - if (dstsize == l1) - return (l1 + l2); - - copied = l1 + l2 >= dstsize ? dstsize - l1 - 1 : l2; - (void) memcpy(dst + l1, src, copied); - dst[l1+copied] = '\0'; - return (l1 + l2); -} diff --git a/lib/libspl/strlcpy.c b/lib/libspl/strlcpy.c deleted file mode 100644 index 7a8009b89..000000000 --- a/lib/libspl/strlcpy.c +++ /dev/null @@ -1,55 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include "lint.h" -#include <string.h> -#include <sys/types.h> - -/* - * Copies src to the dstsize buffer at dst. The copy will never - * overflow the destination buffer and the buffer will always be null - * terminated. - */ - -size_t -strlcpy(char *dst, const char *src, size_t len) -{ - size_t slen = strlen(src); - size_t copied; - - if (len == 0) - return (slen); - - if (slen >= len) - copied = len - 1; - else - copied = slen; - (void) memcpy(dst, src, copied); - dst[copied] = '\0'; - return (slen); -} diff --git a/lib/libspl/strnlen.c b/lib/libspl/strnlen.c deleted file mode 100644 index 605245b6b..000000000 --- a/lib/libspl/strnlen.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2008 Sun Microsystems, Inc. - * All rights reserved. Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include "lint.h" -#include <string.h> -#include <sys/types.h> - -/* - * Returns the number of non-NULL bytes in string argument, - * but not more than maxlen. Does not look past str + maxlen. - */ -size_t -strnlen(const char *str, size_t maxlen) -{ - const char *ptr; - - ptr = memchr(str, 0, maxlen); - if (ptr == NULL) - return (maxlen); - - return (ptr - str); -} diff --git a/lib/libspl/u8_textprep.c b/lib/libspl/u8_textprep.c deleted file mode 100644 index 8faf1a97e..000000000 --- a/lib/libspl/u8_textprep.c +++ /dev/null @@ -1,2132 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - - -/* - * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458). - * - * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F), - * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also - * the section 3C man pages. - * Interface stability: Committed. - */ - -#include <sys/types.h> -#ifdef _KERNEL -#include <sys/param.h> -#include <sys/sysmacros.h> -#include <sys/systm.h> -#include <sys/debug.h> -#include <sys/kmem.h> -#include <sys/ddi.h> -#include <sys/sunddi.h> -#else -#include <sys/u8_textprep.h> -#include <strings.h> -#endif /* _KERNEL */ -#include <sys/byteorder.h> -#include <sys/errno.h> -#include <sys/u8_textprep_data.h> - - -/* The maximum possible number of bytes in a UTF-8 character. */ -#define U8_MB_CUR_MAX (4) - -/* - * The maximum number of bytes needed for a UTF-8 character to cover - * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2. - */ -#define U8_MAX_BYTES_UCS2 (3) - -/* The maximum possible number of bytes in a Stream-Safe Text. */ -#define U8_STREAM_SAFE_TEXT_MAX (128) - -/* - * The maximum number of characters in a combining/conjoining sequence and - * the actual upperbound limit of a combining/conjoining sequence. - */ -#define U8_MAX_CHARS_A_SEQ (32) -#define U8_UPPER_LIMIT_IN_A_SEQ (31) - -/* The combining class value for Starter. */ -#define U8_COMBINING_CLASS_STARTER (0) - -/* - * Some Hangul related macros at below. - * - * The first and the last of Hangul syllables, Hangul Jamo Leading consonants, - * Vowels, and optional Trailing consonants in Unicode scalar values. - * - * Please be noted that the U8_HANGUL_JAMO_T_FIRST is 0x11A7 at below not - * the actual U+11A8. This is due to that the trailing consonant is optional - * and thus we are doing a pre-calculation of subtracting one. - * - * Each of 19 modern leading consonants has total 588 possible syllables since - * Hangul has 21 modern vowels and 27 modern trailing consonants plus 1 for - * no trailing consonant case, i.e., 21 x 28 = 588. - * - * We also have bunch of Hangul related macros at below. Please bear in mind - * that the U8_HANGUL_JAMO_1ST_BYTE can be used to check whether it is - * a Hangul Jamo or not but the value does not guarantee that it is a Hangul - * Jamo; it just guarantee that it will be most likely. - */ -#define U8_HANGUL_SYL_FIRST (0xAC00U) -#define U8_HANGUL_SYL_LAST (0xD7A3U) - -#define U8_HANGUL_JAMO_L_FIRST (0x1100U) -#define U8_HANGUL_JAMO_L_LAST (0x1112U) -#define U8_HANGUL_JAMO_V_FIRST (0x1161U) -#define U8_HANGUL_JAMO_V_LAST (0x1175U) -#define U8_HANGUL_JAMO_T_FIRST (0x11A7U) -#define U8_HANGUL_JAMO_T_LAST (0x11C2U) - -#define U8_HANGUL_V_COUNT (21) -#define U8_HANGUL_VT_COUNT (588) -#define U8_HANGUL_T_COUNT (28) - -#define U8_HANGUL_JAMO_1ST_BYTE (0xE1U) - -#define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \ - (s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \ - (s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \ - (s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU)); - -#define U8_HANGUL_JAMO_L(u) \ - ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST) - -#define U8_HANGUL_JAMO_V(u) \ - ((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST) - -#define U8_HANGUL_JAMO_T(u) \ - ((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) - -#define U8_HANGUL_JAMO(u) \ - ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) - -#define U8_HANGUL_SYLLABLE(u) \ - ((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST) - -#define U8_HANGUL_COMPOSABLE_L_V(s, u) \ - ((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u))) - -#define U8_HANGUL_COMPOSABLE_LV_T(s, u) \ - ((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u))) - -/* The types of decomposition mappings. */ -#define U8_DECOMP_BOTH (0xF5U) -#define U8_DECOMP_CANONICAL (0xF6U) - -/* The indicator for 16-bit table. */ -#define U8_16BIT_TABLE_INDICATOR (0x8000U) - -/* The following are some convenience macros. */ -#define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \ - (u) = ((uint32_t)(b1) & 0x0F) << 12 | ((uint32_t)(b2) & 0x3F) << 6 | \ - (uint32_t)(b3) & 0x3F; - -#define U8_SIMPLE_SWAP(a, b, t) \ - (t) = (a); \ - (a) = (b); \ - (b) = (t); - -#define U8_ASCII_TOUPPER(c) \ - (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c)) - -#define U8_ASCII_TOLOWER(c) \ - (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c)) - -#define U8_ISASCII(c) (((uchar_t)(c)) < 0x80U) -/* - * The following macro assumes that the two characters that are to be - * swapped are adjacent to each other and 'a' comes before 'b'. - * - * If the assumptions are not met, then, the macro will fail. - */ -#define U8_SWAP_COMB_MARKS(a, b) \ - for (k = 0; k < disp[(a)]; k++) \ - u8t[k] = u8s[start[(a)] + k]; \ - for (k = 0; k < disp[(b)]; k++) \ - u8s[start[(a)] + k] = u8s[start[(b)] + k]; \ - start[(b)] = start[(a)] + disp[(b)]; \ - for (k = 0; k < disp[(a)]; k++) \ - u8s[start[(b)] + k] = u8t[k]; \ - U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \ - U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); - -/* The possible states during normalization. */ -typedef enum { - U8_STATE_START = 0, - U8_STATE_HANGUL_L = 1, - U8_STATE_HANGUL_LV = 2, - U8_STATE_HANGUL_LVT = 3, - U8_STATE_HANGUL_V = 4, - U8_STATE_HANGUL_T = 5, - U8_STATE_COMBINING_MARK = 6 -} u8_normalization_states_t; - -/* - * The three vectors at below are used to check bytes of a given UTF-8 - * character are valid and not containing any malformed byte values. - * - * We used to have a quite relaxed UTF-8 binary representation but then there - * was some security related issues and so the Unicode Consortium defined - * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it - * one more time at the Unicode 3.2. The following three tables are based on - * that. - */ - -#define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF) - -#define I_ U8_ILLEGAL_CHAR -#define O_ U8_OUT_OF_RANGE_CHAR - -const int8_t u8_number_of_bytes[0x100] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - -/* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */ - I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, - -/* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */ - I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, - -/* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */ - I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, - -/* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */ - I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, - -/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ - I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - -/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - -/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - -/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ - 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, -}; - -#undef I_ -#undef O_ - -const uint8_t u8_valid_min_2nd_byte[0x100] = { - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, -/* C0 C1 C2 C3 C4 C5 C6 C7 */ - 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, -/* C8 C9 CA CB CC CD CE CF */ - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, -/* D0 D1 D2 D3 D4 D5 D6 D7 */ - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, -/* D8 D9 DA DB DC DD DE DF */ - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, -/* E0 E1 E2 E3 E4 E5 E6 E7 */ - 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, -/* E8 E9 EA EB EC ED EE EF */ - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, -/* F0 F1 F2 F3 F4 F5 F6 F7 */ - 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, -}; - -const uint8_t u8_valid_max_2nd_byte[0x100] = { - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, -/* C0 C1 C2 C3 C4 C5 C6 C7 */ - 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, -/* C8 C9 CA CB CC CD CE CF */ - 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, -/* D0 D1 D2 D3 D4 D5 D6 D7 */ - 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, -/* D8 D9 DA DB DC DD DE DF */ - 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, -/* E0 E1 E2 E3 E4 E5 E6 E7 */ - 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, -/* E8 E9 EA EB EC ED EE EF */ - 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, -/* F0 F1 F2 F3 F4 F5 F6 F7 */ - 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, -}; - - -/* - * The u8_validate() validates on the given UTF-8 character string and - * calculate the byte length. It is quite similar to mblen(3C) except that - * this will validate against the list of characters if required and - * specific to UTF-8 and Unicode. - */ -int -u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum) -{ - uchar_t *ib; - uchar_t *ibtail; - uchar_t **p; - uchar_t *s1; - uchar_t *s2; - uchar_t f; - int sz; - size_t i; - int ret_val; - boolean_t second; - boolean_t no_need_to_validate_entire; - boolean_t check_additional; - boolean_t validate_ucs2_range_only; - - if (! u8str) - return (0); - - ib = (uchar_t *)u8str; - ibtail = ib + n; - - ret_val = 0; - - no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE); - check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL; - validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE; - - while (ib < ibtail) { - /* - * The first byte of a UTF-8 character tells how many - * bytes will follow for the character. If the first byte - * is an illegal byte value or out of range value, we just - * return -1 with an appropriate error number. - */ - sz = u8_number_of_bytes[*ib]; - if (sz == U8_ILLEGAL_CHAR) { - *errnum = EILSEQ; - return (-1); - } - - if (sz == U8_OUT_OF_RANGE_CHAR || - (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) { - *errnum = ERANGE; - return (-1); - } - - /* - * If we don't have enough bytes to check on, that's also - * an error. As you can see, we give illegal byte sequence - * checking higher priority then EINVAL cases. - */ - if ((ibtail - ib) < sz) { - *errnum = EINVAL; - return (-1); - } - - if (sz == 1) { - ib++; - ret_val++; - } else { - /* - * Check on the multi-byte UTF-8 character. For more - * details on this, see comment added for the used - * data structures at the beginning of the file. - */ - f = *ib++; - ret_val++; - second = B_TRUE; - for (i = 1; i < sz; i++) { - if (second) { - if (*ib < u8_valid_min_2nd_byte[f] || - *ib > u8_valid_max_2nd_byte[f]) { - *errnum = EILSEQ; - return (-1); - } - second = B_FALSE; - } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) { - *errnum = EILSEQ; - return (-1); - } - ib++; - ret_val++; - } - } - - if (check_additional) { - for (p = (uchar_t **)list, i = 0; p[i]; i++) { - s1 = ib - sz; - s2 = p[i]; - while (s1 < ib) { - if (*s1 != *s2 || *s2 == '\0') - break; - s1++; - s2++; - } - - if (s1 >= ib && *s2 == '\0') { - *errnum = EBADF; - return (-1); - } - } - } - - if (no_need_to_validate_entire) - break; - } - - return (ret_val); -} - -/* - * The do_case_conv() looks at the mapping tables and returns found - * bytes if any. If not found, the input bytes are returned. The function - * always terminate the return bytes with a null character assuming that - * there are plenty of room to do so. - * - * The case conversions are simple case conversions mapping a character to - * another character as specified in the Unicode data. The byte size of - * the mapped character could be different from that of the input character. - * - * The return value is the byte length of the returned character excluding - * the terminating null byte. - */ -static size_t -do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper) -{ - size_t i; - uint16_t b1 = 0; - uint16_t b2 = 0; - uint16_t b3 = 0; - uint16_t b3_tbl; - uint16_t b3_base; - uint16_t b4 = 0; - size_t start_id; - size_t end_id; - - /* - * At this point, the only possible values for sz are 2, 3, and 4. - * The u8s should point to a vector that is well beyond the size of - * 5 bytes. - */ - if (sz == 2) { - b3 = u8s[0] = s[0]; - b4 = u8s[1] = s[1]; - } else if (sz == 3) { - b2 = u8s[0] = s[0]; - b3 = u8s[1] = s[1]; - b4 = u8s[2] = s[2]; - } else if (sz == 4) { - b1 = u8s[0] = s[0]; - b2 = u8s[1] = s[1]; - b3 = u8s[2] = s[2]; - b4 = u8s[3] = s[3]; - } else { - /* This is not possible but just in case as a fallback. */ - if (is_it_toupper) - *u8s = U8_ASCII_TOUPPER(*s); - else - *u8s = U8_ASCII_TOLOWER(*s); - u8s[1] = '\0'; - - return (1); - } - u8s[sz] = '\0'; - - /* - * Let's find out if we have a corresponding character. - */ - b1 = u8_common_b1_tbl[uv][b1]; - if (b1 == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - b2 = u8_case_common_b2_tbl[uv][b1][b2]; - if (b2 == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - if (is_it_toupper) { - b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id; - if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4]; - end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1]; - - /* Either there is no match or an error at the table. */ - if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) - return ((size_t)sz); - - b3_base = u8_toupper_b3_tbl[uv][b2][b3].base; - - for (i = 0; start_id < end_id; start_id++) - u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id]; - } else { - b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id; - if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4]; - end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1]; - - if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) - return ((size_t)sz); - - b3_base = u8_tolower_b3_tbl[uv][b2][b3].base; - - for (i = 0; start_id < end_id; start_id++) - u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id]; - } - - /* - * If i is still zero, that means there is no corresponding character. - */ - if (i == 0) - return ((size_t)sz); - - u8s[i] = '\0'; - - return (i); -} - -/* - * The do_case_compare() function compares the two input strings, s1 and s2, - * one character at a time doing case conversions if applicable and return - * the comparison result as like strcmp(). - * - * Since, in empirical sense, most of text data are 7-bit ASCII characters, - * we treat the 7-bit ASCII characters as a special case trying to yield - * faster processing time. - */ -static int -do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, - size_t n2, boolean_t is_it_toupper, int *errnum) -{ - int f; - int sz1; - int sz2; - size_t j; - size_t i1; - size_t i2; - uchar_t u8s1[U8_MB_CUR_MAX + 1]; - uchar_t u8s2[U8_MB_CUR_MAX + 1]; - - i1 = i2 = 0; - while (i1 < n1 && i2 < n2) { - /* - * Find out what would be the byte length for this UTF-8 - * character at string s1 and also find out if this is - * an illegal start byte or not and if so, issue a proper - * error number and yet treat this byte as a character. - */ - sz1 = u8_number_of_bytes[*s1]; - if (sz1 < 0) { - *errnum = EILSEQ; - sz1 = 1; - } - - /* - * For 7-bit ASCII characters mainly, we do a quick case - * conversion right at here. - * - * If we don't have enough bytes for this character, issue - * an EINVAL error and use what are available. - * - * If we have enough bytes, find out if there is - * a corresponding uppercase character and if so, copy over - * the bytes for a comparison later. If there is no - * corresponding uppercase character, then, use what we have - * for the comparison. - */ - if (sz1 == 1) { - if (is_it_toupper) - u8s1[0] = U8_ASCII_TOUPPER(*s1); - else - u8s1[0] = U8_ASCII_TOLOWER(*s1); - s1++; - u8s1[1] = '\0'; - } else if ((i1 + sz1) > n1) { - *errnum = EINVAL; - for (j = 0; (i1 + j) < n1; ) - u8s1[j++] = *s1++; - u8s1[j] = '\0'; - } else { - (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper); - s1 += sz1; - } - - /* Do the same for the string s2. */ - sz2 = u8_number_of_bytes[*s2]; - if (sz2 < 0) { - *errnum = EILSEQ; - sz2 = 1; - } - - if (sz2 == 1) { - if (is_it_toupper) - u8s2[0] = U8_ASCII_TOUPPER(*s2); - else - u8s2[0] = U8_ASCII_TOLOWER(*s2); - s2++; - u8s2[1] = '\0'; - } else if ((i2 + sz2) > n2) { - *errnum = EINVAL; - for (j = 0; (i2 + j) < n2; ) - u8s2[j++] = *s2++; - u8s2[j] = '\0'; - } else { - (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper); - s2 += sz2; - } - - /* Now compare the two characters. */ - if (sz1 == 1 && sz2 == 1) { - if (*u8s1 > *u8s2) - return (1); - if (*u8s1 < *u8s2) - return (-1); - } else { - f = strcmp((const char *)u8s1, (const char *)u8s2); - if (f != 0) - return (f); - } - - /* - * They were the same. Let's move on to the next - * characters then. - */ - i1 += sz1; - i2 += sz2; - } - - /* - * We compared until the end of either or both strings. - * - * If we reached to or went over the ends for the both, that means - * they are the same. - * - * If we reached only one of the two ends, that means the other string - * has something which then the fact can be used to determine - * the return value. - */ - if (i1 >= n1) { - if (i2 >= n2) - return (0); - return (-1); - } - return (1); -} - -/* - * The combining_class() function checks on the given bytes and find out - * the corresponding Unicode combining class value. The return value 0 means - * it is a Starter. Any illegal UTF-8 character will also be treated as - * a Starter. - */ -static uchar_t -combining_class(size_t uv, uchar_t *s, size_t sz) -{ - uint16_t b1 = 0; - uint16_t b2 = 0; - uint16_t b3 = 0; - uint16_t b4 = 0; - - if (sz == 1 || sz > 4) - return (0); - - if (sz == 2) { - b3 = s[0]; - b4 = s[1]; - } else if (sz == 3) { - b2 = s[0]; - b3 = s[1]; - b4 = s[2]; - } else if (sz == 4) { - b1 = s[0]; - b2 = s[1]; - b3 = s[2]; - b4 = s[3]; - } - - b1 = u8_common_b1_tbl[uv][b1]; - if (b1 == U8_TBL_ELEMENT_NOT_DEF) - return (0); - - b2 = u8_combining_class_b2_tbl[uv][b1][b2]; - if (b2 == U8_TBL_ELEMENT_NOT_DEF) - return (0); - - b3 = u8_combining_class_b3_tbl[uv][b2][b3]; - if (b3 == U8_TBL_ELEMENT_NOT_DEF) - return (0); - - return (u8_combining_class_b4_tbl[uv][b3][b4]); -} - -/* - * The do_decomp() function finds out a matching decomposition if any - * and return. If there is no match, the input bytes are copied and returned. - * The function also checks if there is a Hangul, decomposes it if necessary - * and returns. - * - * To save time, a single byte 7-bit ASCII character should be handled by - * the caller. - * - * The function returns the number of bytes returned sans always terminating - * the null byte. It will also return a state that will tell if there was - * a Hangul character decomposed which then will be used by the caller. - */ -static size_t -do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz, - boolean_t canonical_decomposition, u8_normalization_states_t *state) -{ - uint16_t b1 = 0; - uint16_t b2 = 0; - uint16_t b3 = 0; - uint16_t b3_tbl; - uint16_t b3_base; - uint16_t b4 = 0; - size_t start_id; - size_t end_id; - size_t i; - uint32_t u1; - - if (sz == 2) { - b3 = u8s[0] = s[0]; - b4 = u8s[1] = s[1]; - u8s[2] = '\0'; - } else if (sz == 3) { - /* Convert it to a Unicode scalar value. */ - U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]); - - /* - * If this is a Hangul syllable, we decompose it into - * a leading consonant, a vowel, and an optional trailing - * consonant and then return. - */ - if (U8_HANGUL_SYLLABLE(u1)) { - u1 -= U8_HANGUL_SYL_FIRST; - - b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT; - b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT) - / U8_HANGUL_T_COUNT; - b3 = u1 % U8_HANGUL_T_COUNT; - - U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1); - U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2); - if (b3) { - b3 += U8_HANGUL_JAMO_T_FIRST; - U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3); - - u8s[9] = '\0'; - *state = U8_STATE_HANGUL_LVT; - return (9); - } - - u8s[6] = '\0'; - *state = U8_STATE_HANGUL_LV; - return (6); - } - - b2 = u8s[0] = s[0]; - b3 = u8s[1] = s[1]; - b4 = u8s[2] = s[2]; - u8s[3] = '\0'; - - /* - * If this is a Hangul Jamo, we know there is nothing - * further that we can decompose. - */ - if (U8_HANGUL_JAMO_L(u1)) { - *state = U8_STATE_HANGUL_L; - return (3); - } - - if (U8_HANGUL_JAMO_V(u1)) { - if (*state == U8_STATE_HANGUL_L) - *state = U8_STATE_HANGUL_LV; - else - *state = U8_STATE_HANGUL_V; - return (3); - } - - if (U8_HANGUL_JAMO_T(u1)) { - if (*state == U8_STATE_HANGUL_LV) - *state = U8_STATE_HANGUL_LVT; - else - *state = U8_STATE_HANGUL_T; - return (3); - } - } else if (sz == 4) { - b1 = u8s[0] = s[0]; - b2 = u8s[1] = s[1]; - b3 = u8s[2] = s[2]; - b4 = u8s[3] = s[3]; - u8s[4] = '\0'; - } else { - /* - * This is a fallback and should not happen if the function - * was called properly. - */ - u8s[0] = s[0]; - u8s[1] = '\0'; - *state = U8_STATE_START; - return (1); - } - - /* - * At this point, this rountine does not know what it would get. - * The caller should sort it out if the state isn't a Hangul one. - */ - *state = U8_STATE_START; - - /* Try to find matching decomposition mapping byte sequence. */ - b1 = u8_common_b1_tbl[uv][b1]; - if (b1 == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - b2 = u8_decomp_b2_tbl[uv][b1][b2]; - if (b2 == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id; - if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - /* - * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR - * which is 0x8000, this means we couldn't fit the mappings into - * the cardinality of a unsigned byte. - */ - if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { - b3_tbl -= U8_16BIT_TABLE_INDICATOR; - start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4]; - end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; - } else { - start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4]; - end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1]; - } - - /* This also means there wasn't any matching decomposition. */ - if (start_id >= end_id) - return ((size_t)sz); - - /* - * The final table for decomposition mappings has three types of - * byte sequences depending on whether a mapping is for compatibility - * decomposition, canonical decomposition, or both like the following: - * - * (1) Compatibility decomposition mappings: - * - * +---+---+-...-+---+ - * | B0| B1| ... | Bm| - * +---+---+-...-+---+ - * - * The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH). - * - * (2) Canonical decomposition mappings: - * - * +---+---+---+-...-+---+ - * | T | b0| b1| ... | bn| - * +---+---+---+-...-+---+ - * - * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL). - * - * (3) Both mappings: - * - * +---+---+---+---+-...-+---+---+---+-...-+---+ - * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm| - * +---+---+---+---+-...-+---+---+---+-...-+---+ - * - * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement - * byte, b0 to bn are canonical mapping bytes and B0 to Bm are - * compatibility mapping bytes. - * - * Note that compatibility decomposition means doing recursive - * decompositions using both compatibility decomposition mappings and - * canonical decomposition mappings. On the other hand, canonical - * decomposition means doing recursive decompositions using only - * canonical decomposition mappings. Since the table we have has gone - * through the recursions already, we do not need to do so during - * runtime, i.e., the table has been completely flattened out - * already. - */ - - b3_base = u8_decomp_b3_tbl[uv][b2][b3].base; - - /* Get the type, T, of the byte sequence. */ - b1 = u8_decomp_final_tbl[uv][b3_base + start_id]; - - /* - * If necessary, adjust start_id, end_id, or both. Note that if - * this is compatibility decomposition mapping, there is no - * adjustment. - */ - if (canonical_decomposition) { - /* Is the mapping only for compatibility decomposition? */ - if (b1 < U8_DECOMP_BOTH) - return ((size_t)sz); - - start_id++; - - if (b1 == U8_DECOMP_BOTH) { - end_id = start_id + - u8_decomp_final_tbl[uv][b3_base + start_id]; - start_id++; - } - } else { - /* - * Unless this is a compatibility decomposition mapping, - * we adjust the start_id. - */ - if (b1 == U8_DECOMP_BOTH) { - start_id++; - start_id += u8_decomp_final_tbl[uv][b3_base + start_id]; - } else if (b1 == U8_DECOMP_CANONICAL) { - start_id++; - } - } - - for (i = 0; start_id < end_id; start_id++) - u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id]; - u8s[i] = '\0'; - - return (i); -} - -/* - * The find_composition_start() function uses the character bytes given and - * find out the matching composition mappings if any and return the address - * to the composition mappings as explained in the do_composition(). - */ -static uchar_t * -find_composition_start(size_t uv, uchar_t *s, size_t sz) -{ - uint16_t b1 = 0; - uint16_t b2 = 0; - uint16_t b3 = 0; - uint16_t b3_tbl; - uint16_t b3_base; - uint16_t b4 = 0; - size_t start_id; - size_t end_id; - - if (sz == 1) { - b4 = s[0]; - } else if (sz == 2) { - b3 = s[0]; - b4 = s[1]; - } else if (sz == 3) { - b2 = s[0]; - b3 = s[1]; - b4 = s[2]; - } else if (sz == 4) { - b1 = s[0]; - b2 = s[1]; - b3 = s[2]; - b4 = s[3]; - } else { - /* - * This is a fallback and should not happen if the function - * was called properly. - */ - return (NULL); - } - - b1 = u8_composition_b1_tbl[uv][b1]; - if (b1 == U8_TBL_ELEMENT_NOT_DEF) - return (NULL); - - b2 = u8_composition_b2_tbl[uv][b1][b2]; - if (b2 == U8_TBL_ELEMENT_NOT_DEF) - return (NULL); - - b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id; - if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) - return (NULL); - - if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { - b3_tbl -= U8_16BIT_TABLE_INDICATOR; - start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4]; - end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; - } else { - start_id = u8_composition_b4_tbl[uv][b3_tbl][b4]; - end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1]; - } - - if (start_id >= end_id) - return (NULL); - - b3_base = u8_composition_b3_tbl[uv][b2][b3].base; - - return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id])); -} - -/* - * The blocked() function checks on the combining class values of previous - * characters in this sequence and return whether it is blocked or not. - */ -static boolean_t -blocked(uchar_t *comb_class, size_t last) -{ - uchar_t my_comb_class; - size_t i; - - my_comb_class = comb_class[last]; - for (i = 1; i < last; i++) - if (comb_class[i] >= my_comb_class || - comb_class[i] == U8_COMBINING_CLASS_STARTER) - return (B_TRUE); - - return (B_FALSE); -} - -/* - * The do_composition() reads the character string pointed by 's' and - * do necessary canonical composition and then copy over the result back to - * the 's'. - * - * The input argument 's' cannot contain more than 32 characters. - */ -static size_t -do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start, - uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast) -{ - uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1]; - uchar_t tc[U8_MB_CUR_MAX]; - uint8_t saved_marks[U8_MAX_CHARS_A_SEQ]; - size_t saved_marks_count; - uchar_t *p; - uchar_t *saved_p; - uchar_t *q; - size_t i; - size_t saved_i; - size_t j; - size_t k; - size_t l; - size_t C; - size_t saved_l; - size_t size; - uint32_t u1; - uint32_t u2; - boolean_t match_not_found = B_TRUE; - - /* - * This should never happen unless the callers are doing some strange - * and unexpected things. - * - * The "last" is the index pointing to the last character not last + 1. - */ - if (last >= U8_MAX_CHARS_A_SEQ) - last = U8_UPPER_LIMIT_IN_A_SEQ; - - for (i = l = 0; i <= last; i++) { - /* - * The last or any non-Starters at the beginning, we don't - * have any chance to do composition and so we just copy them - * to the temporary buffer. - */ - if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) { -SAVE_THE_CHAR: - p = s + start[i]; - size = disp[i]; - for (k = 0; k < size; k++) - t[l++] = *p++; - continue; - } - - /* - * If this could be a start of Hangul Jamos, then, we try to - * conjoin them. - */ - if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) { - U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]], - s[start[i] + 1], s[start[i] + 2]); - U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3], - s[start[i] + 4], s[start[i] + 5]); - - if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) { - u1 -= U8_HANGUL_JAMO_L_FIRST; - u2 -= U8_HANGUL_JAMO_V_FIRST; - u1 = U8_HANGUL_SYL_FIRST + - (u1 * U8_HANGUL_V_COUNT + u2) * - U8_HANGUL_T_COUNT; - - i += 2; - if (i <= last) { - U8_PUT_3BYTES_INTO_UTF32(u2, - s[start[i]], s[start[i] + 1], - s[start[i] + 2]); - - if (U8_HANGUL_JAMO_T(u2)) { - u1 += u2 - - U8_HANGUL_JAMO_T_FIRST; - i++; - } - } - - U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1); - i--; - l += 3; - continue; - } - } - - /* - * Let's then find out if this Starter has composition - * mapping. - */ - p = find_composition_start(uv, s + start[i], disp[i]); - if (p == NULL) - goto SAVE_THE_CHAR; - - /* - * We have a Starter with composition mapping and the next - * character is a non-Starter. Let's try to find out if - * we can do composition. - */ - - saved_p = p; - saved_i = i; - saved_l = l; - saved_marks_count = 0; - -TRY_THE_NEXT_MARK: - q = s + start[++i]; - size = disp[i]; - - /* - * The next for() loop compares the non-Starter pointed by - * 'q' with the possible (joinable) characters pointed by 'p'. - * - * The composition final table entry pointed by the 'p' - * looks like the following: - * - * +---+---+---+-...-+---+---+---+---+-...-+---+---+ - * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F | - * +---+---+---+-...-+---+---+---+---+-...-+---+---+ - * - * where C is the count byte indicating the number of - * mapping pairs where each pair would be look like - * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second - * character of a canonical decomposition and the B0-Bm are - * the bytes of a matching composite character. The F is - * a filler byte after each character as the separator. - */ - - match_not_found = B_TRUE; - - for (C = *p++; C > 0; C--) { - for (k = 0; k < size; p++, k++) - if (*p != q[k]) - break; - - /* Have we found it? */ - if (k >= size && *p == U8_TBL_ELEMENT_FILLER) { - match_not_found = B_FALSE; - - l = saved_l; - - while (*++p != U8_TBL_ELEMENT_FILLER) - t[l++] = *p; - - break; - } - - /* We didn't find; skip to the next pair. */ - if (*p != U8_TBL_ELEMENT_FILLER) - while (*++p != U8_TBL_ELEMENT_FILLER) - ; - while (*++p != U8_TBL_ELEMENT_FILLER) - ; - p++; - } - - /* - * If there was no match, we will need to save the combining - * mark for later appending. After that, if the next one - * is a non-Starter and not blocked, then, we try once - * again to do composition with the next non-Starter. - * - * If there was no match and this was a Starter, then, - * this is a new start. - * - * If there was a match and a composition done and we have - * more to check on, then, we retrieve a new composition final - * table entry for the composite and then try to do the - * composition again. - */ - - if (match_not_found) { - if (comb_class[i] == U8_COMBINING_CLASS_STARTER) { - i--; - goto SAVE_THE_CHAR; - } - - saved_marks[saved_marks_count++] = i; - } - - if (saved_l == l) { - while (i < last) { - if (blocked(comb_class, i + 1)) - saved_marks[saved_marks_count++] = ++i; - else - break; - } - if (i < last) { - p = saved_p; - goto TRY_THE_NEXT_MARK; - } - } else if (i < last) { - p = find_composition_start(uv, t + saved_l, - l - saved_l); - if (p != NULL) { - saved_p = p; - goto TRY_THE_NEXT_MARK; - } - } - - /* - * There is no more composition possible. - * - * If there was no composition what so ever then we copy - * over the original Starter and then append any non-Starters - * remaining at the target string sequentially after that. - */ - - if (saved_l == l) { - p = s + start[saved_i]; - size = disp[saved_i]; - for (j = 0; j < size; j++) - t[l++] = *p++; - } - - for (k = 0; k < saved_marks_count; k++) { - p = s + start[saved_marks[k]]; - size = disp[saved_marks[k]]; - for (j = 0; j < size; j++) - t[l++] = *p++; - } - } - - /* - * If the last character is a Starter and if we have a character - * (possibly another Starter) that can be turned into a composite, - * we do so and we do so until there is no more of composition - * possible. - */ - if (comb_class[last] == U8_COMBINING_CLASS_STARTER) { - p = *os; - saved_l = l - disp[last]; - - while (p < oslast) { - size = u8_number_of_bytes[*p]; - if (size <= 1 || (p + size) > oslast) - break; - - saved_p = p; - - for (i = 0; i < size; i++) - tc[i] = *p++; - - q = find_composition_start(uv, t + saved_l, - l - saved_l); - if (q == NULL) { - p = saved_p; - break; - } - - match_not_found = B_TRUE; - - for (C = *q++; C > 0; C--) { - for (k = 0; k < size; q++, k++) - if (*q != tc[k]) - break; - - if (k >= size && *q == U8_TBL_ELEMENT_FILLER) { - match_not_found = B_FALSE; - - l = saved_l; - - while (*++q != U8_TBL_ELEMENT_FILLER) { - /* - * This is practically - * impossible but we don't - * want to take any chances. - */ - if (l >= - U8_STREAM_SAFE_TEXT_MAX) { - p = saved_p; - goto SAFE_RETURN; - } - t[l++] = *q; - } - - break; - } - - if (*q != U8_TBL_ELEMENT_FILLER) - while (*++q != U8_TBL_ELEMENT_FILLER) - ; - while (*++q != U8_TBL_ELEMENT_FILLER) - ; - q++; - } - - if (match_not_found) { - p = saved_p; - break; - } - } -SAFE_RETURN: - *os = p; - } - - /* - * Now we copy over the temporary string to the target string. - * Since composition always reduces the number of characters or - * the number of characters stay, we don't need to worry about - * the buffer overflow here. - */ - for (i = 0; i < l; i++) - s[i] = t[i]; - s[l] = '\0'; - - return (l); -} - -/* - * The collect_a_seq() function checks on the given string s, collect - * a sequence of characters at u8s, and return the sequence. While it collects - * a sequence, it also applies case conversion, canonical or compatibility - * decomposition, canonical decomposition, or some or all of them and - * in that order. - * - * The collected sequence cannot be bigger than 32 characters since if - * it is having more than 31 characters, the sequence will be terminated - * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into - * a Stream-Safe Text. The collected sequence is always terminated with - * a null byte and the return value is the byte length of the sequence - * including 0. The return value does not include the terminating - * null byte. - */ -static size_t -collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast, - boolean_t is_it_toupper, - boolean_t is_it_tolower, - boolean_t canonical_decomposition, - boolean_t compatibility_decomposition, - boolean_t canonical_composition, - int *errnum, u8_normalization_states_t *state) -{ - uchar_t *s; - int sz; - int saved_sz; - size_t i; - size_t j; - size_t k; - size_t l; - uchar_t comb_class[U8_MAX_CHARS_A_SEQ]; - uchar_t disp[U8_MAX_CHARS_A_SEQ]; - uchar_t start[U8_MAX_CHARS_A_SEQ]; - uchar_t u8t[U8_MB_CUR_MAX]; - uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1]; - uchar_t tc; - size_t last; - size_t saved_last; - uint32_t u1; - - /* - * Save the source string pointer which we will return a changed - * pointer if we do processing. - */ - s = *source; - - /* - * The following is a fallback for just in case callers are not - * checking the string boundaries before the calling. - */ - if (s >= slast) { - u8s[0] = '\0'; - - return (0); - } - - /* - * As the first thing, let's collect a character and do case - * conversion if necessary. - */ - - sz = u8_number_of_bytes[*s]; - - if (sz < 0) { - *errnum = EILSEQ; - - u8s[0] = *s++; - u8s[1] = '\0'; - - *source = s; - - return (1); - } - - if (sz == 1) { - if (is_it_toupper) - u8s[0] = U8_ASCII_TOUPPER(*s); - else if (is_it_tolower) - u8s[0] = U8_ASCII_TOLOWER(*s); - else - u8s[0] = *s; - s++; - u8s[1] = '\0'; - } else if ((s + sz) > slast) { - *errnum = EINVAL; - - for (i = 0; s < slast; ) - u8s[i++] = *s++; - u8s[i] = '\0'; - - *source = s; - - return (i); - } else { - if (is_it_toupper || is_it_tolower) { - i = do_case_conv(uv, u8s, s, sz, is_it_toupper); - s += sz; - sz = i; - } else { - for (i = 0; i < sz; ) - u8s[i++] = *s++; - u8s[i] = '\0'; - } - } - - /* - * And then canonical/compatibility decomposition followed by - * an optional canonical composition. Please be noted that - * canonical composition is done only when a decomposition is - * done. - */ - if (canonical_decomposition || compatibility_decomposition) { - if (sz == 1) { - *state = U8_STATE_START; - - saved_sz = 1; - - comb_class[0] = 0; - start[0] = 0; - disp[0] = 1; - - last = 1; - } else { - saved_sz = do_decomp(uv, u8s, u8s, sz, - canonical_decomposition, state); - - last = 0; - - for (i = 0; i < saved_sz; ) { - sz = u8_number_of_bytes[u8s[i]]; - - comb_class[last] = combining_class(uv, - u8s + i, sz); - start[last] = i; - disp[last] = sz; - - last++; - i += sz; - } - - /* - * Decomposition yields various Hangul related - * states but not on combining marks. We need to - * find out at here by checking on the last - * character. - */ - if (*state == U8_STATE_START) { - if (comb_class[last - 1]) - *state = U8_STATE_COMBINING_MARK; - } - } - - saved_last = last; - - while (s < slast) { - sz = u8_number_of_bytes[*s]; - - /* - * If this is an illegal character, an incomplete - * character, or an 7-bit ASCII Starter character, - * then we have collected a sequence; break and let - * the next call deal with the two cases. - * - * Note that this is okay only if you are using this - * function with a fixed length string, not on - * a buffer with multiple calls of one chunk at a time. - */ - if (sz <= 1) { - break; - } else if ((s + sz) > slast) { - break; - } else { - /* - * If the previous character was a Hangul Jamo - * and this character is a Hangul Jamo that - * can be conjoined, we collect the Jamo. - */ - if (*s == U8_HANGUL_JAMO_1ST_BYTE) { - U8_PUT_3BYTES_INTO_UTF32(u1, - *s, *(s + 1), *(s + 2)); - - if (U8_HANGUL_COMPOSABLE_L_V(*state, - u1)) { - i = 0; - *state = U8_STATE_HANGUL_LV; - goto COLLECT_A_HANGUL; - } - - if (U8_HANGUL_COMPOSABLE_LV_T(*state, - u1)) { - i = 0; - *state = U8_STATE_HANGUL_LVT; - goto COLLECT_A_HANGUL; - } - } - - /* - * Regardless of whatever it was, if this is - * a Starter, we don't collect the character - * since that's a new start and we will deal - * with it at the next time. - */ - i = combining_class(uv, s, sz); - if (i == U8_COMBINING_CLASS_STARTER) - break; - - /* - * We know the current character is a combining - * mark. If the previous character wasn't - * a Starter (not Hangul) or a combining mark, - * then, we don't collect this combining mark. - */ - if (*state != U8_STATE_START && - *state != U8_STATE_COMBINING_MARK) - break; - - *state = U8_STATE_COMBINING_MARK; -COLLECT_A_HANGUL: - /* - * If we collected a Starter and combining - * marks up to 30, i.e., total 31 characters, - * then, we terminate this degenerately long - * combining sequence with a U+034F COMBINING - * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in - * UTF-8 and turn this into a Stream-Safe - * Text. This will be extremely rare but - * possible. - * - * The following will also guarantee that - * we are not writing more than 32 characters - * plus a NULL at u8s[]. - */ - if (last >= U8_UPPER_LIMIT_IN_A_SEQ) { -TURN_STREAM_SAFE: - *state = U8_STATE_START; - comb_class[last] = 0; - start[last] = saved_sz; - disp[last] = 2; - last++; - - u8s[saved_sz++] = 0xCD; - u8s[saved_sz++] = 0x8F; - - break; - } - - /* - * Some combining marks also do decompose into - * another combining mark or marks. - */ - if (*state == U8_STATE_COMBINING_MARK) { - k = last; - l = sz; - i = do_decomp(uv, uts, s, sz, - canonical_decomposition, state); - for (j = 0; j < i; ) { - sz = u8_number_of_bytes[uts[j]]; - - comb_class[last] = - combining_class(uv, - uts + j, sz); - start[last] = saved_sz + j; - disp[last] = sz; - - last++; - if (last >= - U8_UPPER_LIMIT_IN_A_SEQ) { - last = k; - goto TURN_STREAM_SAFE; - } - j += sz; - } - - *state = U8_STATE_COMBINING_MARK; - sz = i; - s += l; - - for (i = 0; i < sz; i++) - u8s[saved_sz++] = uts[i]; - } else { - comb_class[last] = i; - start[last] = saved_sz; - disp[last] = sz; - last++; - - for (i = 0; i < sz; i++) - u8s[saved_sz++] = *s++; - } - - /* - * If this is U+0345 COMBINING GREEK - * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a., - * iota subscript, and need to be converted to - * uppercase letter, convert it to U+0399 GREEK - * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8), - * i.e., convert to capital adscript form as - * specified in the Unicode standard. - * - * This is the only special case of (ambiguous) - * case conversion at combining marks and - * probably the standard will never have - * anything similar like this in future. - */ - if (is_it_toupper && sz >= 2 && - u8s[saved_sz - 2] == 0xCD && - u8s[saved_sz - 1] == 0x85) { - u8s[saved_sz - 2] = 0xCE; - u8s[saved_sz - 1] = 0x99; - } - } - } - - /* - * Let's try to ensure a canonical ordering for the collected - * combining marks. We do this only if we have collected - * at least one more non-Starter. (The decomposition mapping - * data tables have fully (and recursively) expanded and - * canonically ordered decompositions.) - * - * The U8_SWAP_COMB_MARKS() convenience macro has some - * assumptions and we are meeting the assumptions. - */ - last--; - if (last >= saved_last) { - for (i = 0; i < last; i++) - for (j = last; j > i; j--) - if (comb_class[j] && - comb_class[j - 1] > comb_class[j]) { - U8_SWAP_COMB_MARKS(j - 1, j); - } - } - - *source = s; - - if (! canonical_composition) { - u8s[saved_sz] = '\0'; - return (saved_sz); - } - - /* - * Now do the canonical composition. Note that we do this - * only after a canonical or compatibility decomposition to - * finish up NFC or NFKC. - */ - sz = do_composition(uv, u8s, comb_class, start, disp, last, - &s, slast); - } - - *source = s; - - return ((size_t)sz); -} - -/* - * The do_norm_compare() function does string comparion based on Unicode - * simple case mappings and Unicode Normalization definitions. - * - * It does so by collecting a sequence of character at a time and comparing - * the collected sequences from the strings. - * - * The meanings on the return values are the same as the usual strcmp(). - */ -static int -do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2, - int flag, int *errnum) -{ - int result; - size_t sz1; - size_t sz2; - uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1]; - uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1]; - uchar_t *s1last; - uchar_t *s2last; - boolean_t is_it_toupper; - boolean_t is_it_tolower; - boolean_t canonical_decomposition; - boolean_t compatibility_decomposition; - boolean_t canonical_composition; - u8_normalization_states_t state; - - s1last = s1 + n1; - s2last = s2 + n2; - - is_it_toupper = flag & U8_TEXTPREP_TOUPPER; - is_it_tolower = flag & U8_TEXTPREP_TOLOWER; - canonical_decomposition = flag & U8_CANON_DECOMP; - compatibility_decomposition = flag & U8_COMPAT_DECOMP; - canonical_composition = flag & U8_CANON_COMP; - - while (s1 < s1last && s2 < s2last) { - /* - * If the current character is a 7-bit ASCII and the last - * character, or, if the current character and the next - * character are both some 7-bit ASCII characters then - * we treat the current character as a sequence. - * - * In any other cases, we need to call collect_a_seq(). - */ - - if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last || - ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) { - if (is_it_toupper) - u8s1[0] = U8_ASCII_TOUPPER(*s1); - else if (is_it_tolower) - u8s1[0] = U8_ASCII_TOLOWER(*s1); - else - u8s1[0] = *s1; - u8s1[1] = '\0'; - sz1 = 1; - s1++; - } else { - state = U8_STATE_START; - sz1 = collect_a_seq(uv, u8s1, &s1, s1last, - is_it_toupper, is_it_tolower, - canonical_decomposition, - compatibility_decomposition, - canonical_composition, errnum, &state); - } - - if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last || - ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) { - if (is_it_toupper) - u8s2[0] = U8_ASCII_TOUPPER(*s2); - else if (is_it_tolower) - u8s2[0] = U8_ASCII_TOLOWER(*s2); - else - u8s2[0] = *s2; - u8s2[1] = '\0'; - sz2 = 1; - s2++; - } else { - state = U8_STATE_START; - sz2 = collect_a_seq(uv, u8s2, &s2, s2last, - is_it_toupper, is_it_tolower, - canonical_decomposition, - compatibility_decomposition, - canonical_composition, errnum, &state); - } - - /* - * Now compare the two characters. If they are the same, - * we move on to the next character sequences. - */ - if (sz1 == 1 && sz2 == 1) { - if (*u8s1 > *u8s2) - return (1); - if (*u8s1 < *u8s2) - return (-1); - } else { - result = strcmp((const char *)u8s1, (const char *)u8s2); - if (result != 0) - return (result); - } - } - - /* - * We compared until the end of either or both strings. - * - * If we reached to or went over the ends for the both, that means - * they are the same. - * - * If we reached only one end, that means the other string has - * something which then can be used to determine the return value. - */ - if (s1 >= s1last) { - if (s2 >= s2last) - return (0); - return (-1); - } - return (1); -} - -/* - * The u8_strcmp() function compares two UTF-8 strings quite similar to - * the strcmp(). For the comparison, however, Unicode Normalization specific - * equivalency and Unicode simple case conversion mappings based equivalency - * can be requested and checked against. - */ -int -u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv, - int *errnum) -{ - int f; - size_t n1; - size_t n2; - - *errnum = 0; - - /* - * Check on the requested Unicode version, case conversion, and - * normalization flag values. - */ - - if (uv > U8_UNICODE_LATEST) { - *errnum = ERANGE; - uv = U8_UNICODE_LATEST; - } - - if (flag == 0) { - flag = U8_STRCMP_CS; - } else { - f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER | - U8_STRCMP_CI_LOWER); - if (f == 0) { - flag |= U8_STRCMP_CS; - } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER && - f != U8_STRCMP_CI_LOWER) { - *errnum = EBADF; - flag = U8_STRCMP_CS; - } - - f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); - if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC && - f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) { - *errnum = EBADF; - flag = U8_STRCMP_CS; - } - } - - if (flag == U8_STRCMP_CS) { - return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n)); - } - - n1 = strlen(s1); - n2 = strlen(s2); - if (n != 0) { - if (n < n1) - n1 = n; - if (n < n2) - n2 = n; - } - - /* - * Simple case conversion can be done much faster and so we do - * them separately here. - */ - if (flag == U8_STRCMP_CI_UPPER) { - return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, - n1, n2, B_TRUE, errnum)); - } else if (flag == U8_STRCMP_CI_LOWER) { - return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, - n1, n2, B_FALSE, errnum)); - } - - return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2, - flag, errnum)); -} - -size_t -u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen, - int flag, size_t unicode_version, int *errnum) -{ - int f; - int sz; - uchar_t *ib; - uchar_t *ibtail; - uchar_t *ob; - uchar_t *obtail; - boolean_t do_not_ignore_null; - boolean_t do_not_ignore_invalid; - boolean_t is_it_toupper; - boolean_t is_it_tolower; - boolean_t canonical_decomposition; - boolean_t compatibility_decomposition; - boolean_t canonical_composition; - size_t ret_val; - size_t i; - size_t j; - uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1]; - u8_normalization_states_t state; - - if (unicode_version > U8_UNICODE_LATEST) { - *errnum = ERANGE; - return ((size_t)-1); - } - - f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER); - if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) { - *errnum = EBADF; - return ((size_t)-1); - } - - f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); - if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC && - f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) { - *errnum = EBADF; - return ((size_t)-1); - } - - if (inarray == NULL || *inlen == 0) - return (0); - - if (outarray == NULL) { - *errnum = E2BIG; - return ((size_t)-1); - } - - ib = (uchar_t *)inarray; - ob = (uchar_t *)outarray; - ibtail = ib + *inlen; - obtail = ob + *outlen; - - do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL); - do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID); - is_it_toupper = flag & U8_TEXTPREP_TOUPPER; - is_it_tolower = flag & U8_TEXTPREP_TOLOWER; - - ret_val = 0; - - /* - * If we don't have a normalization flag set, we do the simple case - * conversion based text preparation separately below. Text - * preparation involving Normalization will be done in the false task - * block, again, separately since it will take much more time and - * resource than doing simple case conversions. - */ - if (f == 0) { - while (ib < ibtail) { - if (*ib == '\0' && do_not_ignore_null) - break; - - sz = u8_number_of_bytes[*ib]; - - if (sz < 0) { - if (do_not_ignore_invalid) { - *errnum = EILSEQ; - ret_val = (size_t)-1; - break; - } - - sz = 1; - ret_val++; - } - - if (sz == 1) { - if (ob >= obtail) { - *errnum = E2BIG; - ret_val = (size_t)-1; - break; - } - - if (is_it_toupper) - *ob = U8_ASCII_TOUPPER(*ib); - else if (is_it_tolower) - *ob = U8_ASCII_TOLOWER(*ib); - else - *ob = *ib; - ib++; - ob++; - } else if ((ib + sz) > ibtail) { - if (do_not_ignore_invalid) { - *errnum = EINVAL; - ret_val = (size_t)-1; - break; - } - - if ((obtail - ob) < (ibtail - ib)) { - *errnum = E2BIG; - ret_val = (size_t)-1; - break; - } - - /* - * We treat the remaining incomplete character - * bytes as a character. - */ - ret_val++; - - while (ib < ibtail) - *ob++ = *ib++; - } else { - if (is_it_toupper || is_it_tolower) { - i = do_case_conv(unicode_version, u8s, - ib, sz, is_it_toupper); - - if ((obtail - ob) < i) { - *errnum = E2BIG; - ret_val = (size_t)-1; - break; - } - - ib += sz; - - for (sz = 0; sz < i; sz++) - *ob++ = u8s[sz]; - } else { - if ((obtail - ob) < sz) { - *errnum = E2BIG; - ret_val = (size_t)-1; - break; - } - - for (i = 0; i < sz; i++) - *ob++ = *ib++; - } - } - } - } else { - canonical_decomposition = flag & U8_CANON_DECOMP; - compatibility_decomposition = flag & U8_COMPAT_DECOMP; - canonical_composition = flag & U8_CANON_COMP; - - while (ib < ibtail) { - if (*ib == '\0' && do_not_ignore_null) - break; - - /* - * If the current character is a 7-bit ASCII - * character and it is the last character, or, - * if the current character is a 7-bit ASCII - * character and the next character is also a 7-bit - * ASCII character, then, we copy over this - * character without going through collect_a_seq(). - * - * In any other cases, we need to look further with - * the collect_a_seq() function. - */ - if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail || - ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) { - if (ob >= obtail) { - *errnum = E2BIG; - ret_val = (size_t)-1; - break; - } - - if (is_it_toupper) - *ob = U8_ASCII_TOUPPER(*ib); - else if (is_it_tolower) - *ob = U8_ASCII_TOLOWER(*ib); - else - *ob = *ib; - ib++; - ob++; - } else { - *errnum = 0; - state = U8_STATE_START; - - j = collect_a_seq(unicode_version, u8s, - &ib, ibtail, - is_it_toupper, - is_it_tolower, - canonical_decomposition, - compatibility_decomposition, - canonical_composition, - errnum, &state); - - if (*errnum && do_not_ignore_invalid) { - ret_val = (size_t)-1; - break; - } - - if ((obtail - ob) < j) { - *errnum = E2BIG; - ret_val = (size_t)-1; - break; - } - - for (i = 0; i < j; i++) - *ob++ = u8s[i]; - } - } - } - - *inlen = ibtail - ib; - *outlen = obtail - ob; - - return (ret_val); -} |