summary refs log tree commit diff stats
path: root/lib
diff options
context:
space:
mode:
author Brian Behlendorf <[email protected]> 2008-12-11 15:38:59 -0800
committer Brian Behlendorf <[email protected]> 2008-12-11 15:38:59 -0800
commit 6b2c60acca39ef3468797095d3a4162e6ce69786 (patch)
tree 598cab78d45f233695d0e7c73e2e6ec44b8efd71 /lib
parent a4076c7544bdbdc0ac0fe20f4ef86c2aa06862fb (diff)
Moving lib/libspl to linux-libspl branch
Diffstat (limited to 'lib')
-rw-r--r-- lib/libspl/include/sys/list.h 67
-rw-r--r-- lib/libspl/include/sys/list_impl.h 53
-rw-r--r-- lib/libspl/list.c 245
-rw-r--r-- lib/libspl/mkdirp.c 212
-rw-r--r-- lib/libspl/strlcat.c 59
-rw-r--r-- lib/libspl/strlcpy.c 55
-rw-r--r-- lib/libspl/strnlen.c 47
-rw-r--r-- lib/libspl/u8_textprep.c 2132
8 files changed, 0 insertions, 2870 deletions
diff --git a/lib/libspl/include/sys/list.h b/lib/libspl/include/sys/list.h
deleted file mode 100644
index 8339b6226..000000000
--- a/lib/libspl/include/sys/list.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_LIST_H
-#define _SYS_LIST_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/list_impl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct list_node list_node_t;
-typedef struct list list_t;
-
-void list_create(list_t *, size_t, size_t);
-void list_destroy(list_t *);
-
-void list_insert_after(list_t *, void *, void *);
-void list_insert_before(list_t *, void *, void *);
-void list_insert_head(list_t *, void *);
-void list_insert_tail(list_t *, void *);
-void list_remove(list_t *, void *);
-void *list_remove_head(list_t *);
-void *list_remove_tail(list_t *);
-void list_move_tail(list_t *, list_t *);
-
-void *list_head(list_t *);
-void *list_tail(list_t *);
-void *list_next(list_t *, void *);
-void *list_prev(list_t *, void *);
-int list_is_empty(list_t *);
-
-void list_link_init(list_node_t *);
-void list_link_replace(list_node_t *, list_node_t *);
-
-int list_link_active(list_node_t *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_LIST_H */
diff --git a/lib/libspl/include/sys/list_impl.h b/lib/libspl/include/sys/list_impl.h
deleted file mode 100644
index 9c42f8832..000000000
--- a/lib/libspl/include/sys/list_impl.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_LIST_IMPL_H
-#define _SYS_LIST_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct list_node {
- struct list_node *list_next;
- struct list_node *list_prev;
-};
-
-struct list {
- size_t list_size;
- size_t list_offset;
- struct list_node list_head;
-};
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_LIST_IMPL_H */
diff --git a/lib/libspl/list.c b/lib/libspl/list.c
deleted file mode 100644
index e8db13a5c..000000000
--- a/lib/libspl/list.c
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * Generic doubly-linked list implementation
- */
-
-#include <sys/list.h>
-#include <sys/list_impl.h>
-#include <sys/types.h>
-#include <sys/sysmacros.h>
-#include <sys/debug.h>
-
-#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset))
-#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset))
-#define list_empty(a) ((a)->list_head.list_next == &(a)->list_head)
-
-#define list_insert_after_node(list, node, object) { \
- list_node_t *lnew = list_d2l(list, object); \
- lnew->list_prev = (node); \
- lnew->list_next = (node)->list_next; \
- (node)->list_next->list_prev = lnew; \
- (node)->list_next = lnew; \
-}
-
-#define list_insert_before_node(list, node, object) { \
- list_node_t *lnew = list_d2l(list, object); \
- lnew->list_next = (node); \
- lnew->list_prev = (node)->list_prev; \
- (node)->list_prev->list_next = lnew; \
- (node)->list_prev = lnew; \
-}
-
-#define list_remove_node(node) \
- (node)->list_prev->list_next = (node)->list_next; \
- (node)->list_next->list_prev = (node)->list_prev; \
- (node)->list_next = (node)->list_prev = NULL
-
-void
-list_create(list_t *list, size_t size, size_t offset)
-{
- ASSERT(list);
- ASSERT(size > 0);
- ASSERT(size >= offset + sizeof (list_node_t));
-
- list->list_size = size;
- list->list_offset = offset;
- list->list_head.list_next = list->list_head.list_prev =
- &list->list_head;
-}
-
-void
-list_destroy(list_t *list)
-{
- list_node_t *node = &list->list_head;
-
- ASSERT(list);
- ASSERT(list->list_head.list_next == node);
- ASSERT(list->list_head.list_prev == node);
-
- node->list_next = node->list_prev = NULL;
-}
-
-void
-list_insert_after(list_t *list, void *object, void *nobject)
-{
- if (object == NULL) {
- list_insert_head(list, nobject);
- } else {
- list_node_t *lold = list_d2l(list, object);
- list_insert_after_node(list, lold, nobject);
- }
-}
-
-void
-list_insert_before(list_t *list, void *object, void *nobject)
-{
- if (object == NULL) {
- list_insert_tail(list, nobject);
- } else {
- list_node_t *lold = list_d2l(list, object);
- list_insert_before_node(list, lold, nobject);
- }
-}
-
-void
-list_insert_head(list_t *list, void *object)
-{
- list_node_t *lold = &list->list_head;
- list_insert_after_node(list, lold, object);
-}
-
-void
-list_insert_tail(list_t *list, void *object)
-{
- list_node_t *lold = &list->list_head;
- list_insert_before_node(list, lold, object);
-}
-
-void
-list_remove(list_t *list, void *object)
-{
- list_node_t *lold = list_d2l(list, object);
- ASSERT(!list_empty(list));
- ASSERT(lold->list_next != NULL);
- list_remove_node(lold);
-}
-
-void *
-list_remove_head(list_t *list)
-{
- list_node_t *head = list->list_head.list_next;
- if (head == &list->list_head)
- return (NULL);
- list_remove_node(head);
- return (list_object(list, head));
-}
-
-void *
-list_remove_tail(list_t *list)
-{
- list_node_t *tail = list->list_head.list_prev;
- if (tail == &list->list_head)
- return (NULL);
- list_remove_node(tail);
- return (list_object(list, tail));
-}
-
-void *
-list_head(list_t *list)
-{
- if (list_empty(list))
- return (NULL);
- return (list_object(list, list->list_head.list_next));
-}
-
-void *
-list_tail(list_t *list)
-{
- if (list_empty(list))
- return (NULL);
- return (list_object(list, list->list_head.list_prev));
-}
-
-void *
-list_next(list_t *list, void *object)
-{
- list_node_t *node = list_d2l(list, object);
-
- if (node->list_next != &list->list_head)
- return (list_object(list, node->list_next));
-
- return (NULL);
-}
-
-void *
-list_prev(list_t *list, void *object)
-{
- list_node_t *node = list_d2l(list, object);
-
- if (node->list_prev != &list->list_head)
- return (list_object(list, node->list_prev));
-
- return (NULL);
-}
-
-/*
- * Insert src list after dst list. Empty src list thereafter.
- */
-void
-list_move_tail(list_t *dst, list_t *src)
-{
- list_node_t *dstnode = &dst->list_head;
- list_node_t *srcnode = &src->list_head;
-
- ASSERT(dst->list_size == src->list_size);
- ASSERT(dst->list_offset == src->list_offset);
-
- if (list_empty(src))
- return;
-
- dstnode->list_prev->list_next = srcnode->list_next;
- srcnode->list_next->list_prev = dstnode->list_prev;
- dstnode->list_prev = srcnode->list_prev;
- srcnode->list_prev->list_next = dstnode;
-
- /* empty src list */
- srcnode->list_next = srcnode->list_prev = srcnode;
-}
-
-void
-list_link_replace(list_node_t *lold, list_node_t *lnew)
-{
- ASSERT(list_link_active(lold));
- ASSERT(!list_link_active(lnew));
-
- lnew->list_next = lold->list_next;
- lnew->list_prev = lold->list_prev;
- lold->list_prev->list_next = lnew;
- lold->list_next->list_prev = lnew;
- lold->list_next = lold->list_prev = NULL;
-}
-
-void
-list_link_init(list_node_t *link)
-{
- link->list_next = NULL;
- link->list_prev = NULL;
-}
-
-int
-list_link_active(list_node_t *link)
-{
- return (link->list_next != NULL);
-}
-
-int
-list_is_empty(list_t *list)
-{
- return (list_empty(list));
-}
diff --git a/lib/libspl/mkdirp.c b/lib/libspl/mkdirp.c
deleted file mode 100644
index 9c81f2a0b..000000000
--- a/lib/libspl/mkdirp.c
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* Copyright (c) 1988 AT&T */
-/* All Rights Reserved */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * Creates directory and its parents if the parents do not
- * exist yet.
- *
- * Returns -1 if it fails for reasons other than non-existing
- * parents.
- * Does NOT simplify pathnames with . or .. in them.
- */
-
-#include <sys/types.h>
-#include <libgen.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include <sys/stat.h>
-
-static char *simplify(const char *str);
-
-int
-mkdirp(const char *d, mode_t mode)
-{
- char *endptr, *ptr, *slash, *str;
-
- str = simplify(d);
-
- /* If space couldn't be allocated for the simplified names, return. */
-
- if (str == NULL)
- return (-1);
-
- /* Try to make the directory */
-
- if (mkdir(str, mode) == 0) {
- free(str);
- return (0);
- }
- if (errno != ENOENT) {
- free(str);
- return (-1);
- }
- endptr = strrchr(str, '\0');
- slash = strrchr(str, '/');
-
- /* Search upward for the non-existing parent */
-
- while (slash != NULL) {
-
- ptr = slash;
- *ptr = '\0';
-
- /* If reached an existing parent, break */
-
- if (access(str, F_OK) == 0)
- break;
-
- /* If non-existing parent */
-
- else {
- slash = strrchr(str, '/');
-
- /* If under / or current directory, make it. */
-
- if (slash == NULL || slash == str) {
- if (mkdir(str, mode) != 0 && errno != EEXIST) {
- free(str);
- return (-1);
- }
- break;
- }
- }
- }
-
- /* Create directories starting from upmost non-existing parent */
-
- while ((ptr = strchr(str, '\0')) != endptr) {
- *ptr = '/';
- if (mkdir(str, mode) != 0 && errno != EEXIST) {
- /*
- * If the mkdir fails because str already
- * exists (EEXIST), then str has the form
- * "existing-dir/..", and this is really
- * ok. (Remember, this loop is creating the
- * portion of the path that didn't exist)
- */
- free(str);
- return (-1);
- }
- }
- free(str);
- return (0);
-}
-
-/*
- * simplify - given a pathname, simplify that path by removing
- * duplicate contiguous slashes.
- *
- * A simplified copy of the argument is returned to the
- * caller, or NULL is returned on error.
- *
- * The caller should handle error reporting based upon the
- * returned value, and should free the returned value,
- * when appropriate.
- */
-
-static char *
-simplify(const char *str)
-{
- int i;
- size_t mbPathlen; /* length of multi-byte path */
- size_t wcPathlen; /* length of wide-character path */
- wchar_t *wptr; /* scratch pointer */
- wchar_t *wcPath; /* wide-character version of the path */
- char *mbPath; /* The copy of the path to be returned */
-
- /*
- * bail out if there is nothing there.
- */
-
- if (!str)
- return (NULL);
-
- /*
- * Get a copy of the argument.
- */
-
- if ((mbPath = strdup(str)) == NULL) {
- return (NULL);
- }
-
- /*
- * convert the multi-byte version of the path to a
- * wide-character rendering, for doing our figuring.
- */
-
- mbPathlen = strlen(mbPath);
-
- if ((wcPath = calloc(sizeof (wchar_t), mbPathlen+1)) == NULL) {
- free(mbPath);
- return (NULL);
- }
-
- if ((wcPathlen = mbstowcs(wcPath, mbPath, mbPathlen)) == (size_t)-1) {
- free(mbPath);
- free(wcPath);
- return (NULL);
- }
-
- /*
- * remove duplicate slashes first ("//../" -> "/../")
- */
-
- for (wptr = wcPath, i = 0; i < wcPathlen; i++) {
- *wptr++ = wcPath[i];
-
- if (wcPath[i] == '/') {
- i++;
-
- while (wcPath[i] == '/') {
- i++;
- }
-
- i--;
- }
- }
-
- *wptr = '\0';
-
- /*
- * now convert back to the multi-byte format.
- */
-
- if (wcstombs(mbPath, wcPath, mbPathlen) == (size_t)-1) {
- free(mbPath);
- free(wcPath);
- return (NULL);
- }
-
- free(wcPath);
- return (mbPath);
-}
diff --git a/lib/libspl/strlcat.c b/lib/libspl/strlcat.c
deleted file mode 100644
index 07d1403dd..000000000
--- a/lib/libspl/strlcat.c
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include "lint.h"
-#include <string.h>
-#include <sys/types.h>
-
-/*
- * Appends src to the dstsize buffer at dst. The append will never
- * overflow the destination buffer and the buffer will always be null
- * terminated. Never reference beyond &dst[dstsize-1] when computing
- * the length of the pre-existing string.
- */
-
-size_t
-strlcat(char *dst, const char *src, size_t dstsize)
-{
- char *df = dst;
- size_t left = dstsize;
- size_t l1;
- size_t l2 = strlen(src);
- size_t copied;
-
- while (left-- != 0 && *df != '\0')
- df++;
- l1 = df - dst;
- if (dstsize == l1)
- return (l1 + l2);
-
- copied = l1 + l2 >= dstsize ? dstsize - l1 - 1 : l2;
- (void) memcpy(dst + l1, src, copied);
- dst[l1+copied] = '\0';
- return (l1 + l2);
-}
diff --git a/lib/libspl/strlcpy.c b/lib/libspl/strlcpy.c
deleted file mode 100644
index 7a8009b89..000000000
--- a/lib/libspl/strlcpy.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include "lint.h"
-#include <string.h>
-#include <sys/types.h>
-
-/*
- * Copies src to the dstsize buffer at dst. The copy will never
- * overflow the destination buffer and the buffer will always be null
- * terminated.
- */
-
-size_t
-strlcpy(char *dst, const char *src, size_t len)
-{
- size_t slen = strlen(src);
- size_t copied;
-
- if (len == 0)
- return (slen);
-
- if (slen >= len)
- copied = len - 1;
- else
- copied = slen;
- (void) memcpy(dst, src, copied);
- dst[copied] = '\0';
- return (slen);
-}
diff --git a/lib/libspl/strnlen.c b/lib/libspl/strnlen.c
deleted file mode 100644
index 605245b6b..000000000
--- a/lib/libspl/strnlen.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2008 Sun Microsystems, Inc.
- * All rights reserved. Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include "lint.h"
-#include <string.h>
-#include <sys/types.h>
-
-/*
- * Returns the number of non-NUL bytes in the string argument,
- * but not more than maxlen. Does not look past str + maxlen.
- */
-size_t
-strnlen(const char *str, size_t maxlen)
-{
- const char *ptr;
-
- ptr = memchr(str, 0, maxlen);
- if (ptr == NULL)
- return (maxlen);
-
- return (ptr - str);
-}
diff --git a/lib/libspl/u8_textprep.c b/lib/libspl/u8_textprep.c
deleted file mode 100644
index 8faf1a97e..000000000
--- a/lib/libspl/u8_textprep.c
+++ /dev/null
@@ -1,2132 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-
-/*
- * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
- *
- * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
- * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
- * the section 3C man pages.
- * Interface stability: Committed.
- */
-
-#include <sys/types.h>
-#ifdef _KERNEL
-#include <sys/param.h>
-#include <sys/sysmacros.h>
-#include <sys/systm.h>
-#include <sys/debug.h>
-#include <sys/kmem.h>
-#include <sys/ddi.h>
-#include <sys/sunddi.h>
-#else
-#include <sys/u8_textprep.h>
-#include <strings.h>
-#endif /* _KERNEL */
-#include <sys/byteorder.h>
-#include <sys/errno.h>
-#include <sys/u8_textprep_data.h>
-
-
-/* The maximum possible number of bytes in a UTF-8 character. */
-#define U8_MB_CUR_MAX (4)
-
-/*
- * The maximum number of bytes needed for a UTF-8 character to cover
- * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2.
- */
-#define U8_MAX_BYTES_UCS2 (3)
-
-/* The maximum possible number of bytes in a Stream-Safe Text. */
-#define U8_STREAM_SAFE_TEXT_MAX (128)
-
-/*
- * The maximum number of characters in a combining/conjoining sequence and
- * the actual upperbound limit of a combining/conjoining sequence.
- */
-#define U8_MAX_CHARS_A_SEQ (32)
-#define U8_UPPER_LIMIT_IN_A_SEQ (31)
-
-/* The combining class value for Starter. */
-#define U8_COMBINING_CLASS_STARTER (0)
-
-/*
- * Some Hangul-related macros are defined below.
- *
- * The first and the last of Hangul syllables, Hangul Jamo Leading consonants,
- * Vowels, and optional Trailing consonants in Unicode scalar values.
- *
- * Please note that U8_HANGUL_JAMO_T_FIRST is defined below as 0x11A7, not
- * the actual U+11A8. This is because the trailing consonant is optional,
- * so one has been subtracted in advance.
- *
- * Each of 19 modern leading consonants has total 588 possible syllables since
- * Hangul has 21 modern vowels and 27 modern trailing consonants plus 1 for
- * no trailing consonant case, i.e., 21 x 28 = 588.
- *
- * We also have a bunch of Hangul-related macros below. Please bear in mind
- * that U8_HANGUL_JAMO_1ST_BYTE can be used to check whether a character may
- * be a Hangul Jamo, but a match does not guarantee that it is a Hangul
- * Jamo; it only indicates that it most likely is.
- */
-#define U8_HANGUL_SYL_FIRST (0xAC00U)
-#define U8_HANGUL_SYL_LAST (0xD7A3U)
-
-#define U8_HANGUL_JAMO_L_FIRST (0x1100U)
-#define U8_HANGUL_JAMO_L_LAST (0x1112U)
-#define U8_HANGUL_JAMO_V_FIRST (0x1161U)
-#define U8_HANGUL_JAMO_V_LAST (0x1175U)
-#define U8_HANGUL_JAMO_T_FIRST (0x11A7U)
-#define U8_HANGUL_JAMO_T_LAST (0x11C2U)
-
-#define U8_HANGUL_V_COUNT (21)
-#define U8_HANGUL_VT_COUNT (588)
-#define U8_HANGUL_T_COUNT (28)
-
-#define U8_HANGUL_JAMO_1ST_BYTE (0xE1U)
-
-#define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
- (s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \
- (s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \
- (s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU));
-
-#define U8_HANGUL_JAMO_L(u) \
- ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)
-
-#define U8_HANGUL_JAMO_V(u) \
- ((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)
-
-#define U8_HANGUL_JAMO_T(u) \
- ((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
-
-#define U8_HANGUL_JAMO(u) \
- ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
-
-#define U8_HANGUL_SYLLABLE(u) \
- ((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)
-
-#define U8_HANGUL_COMPOSABLE_L_V(s, u) \
- ((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))
-
-#define U8_HANGUL_COMPOSABLE_LV_T(s, u) \
- ((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
-
-/* The types of decomposition mappings. */
-#define U8_DECOMP_BOTH (0xF5U)
-#define U8_DECOMP_CANONICAL (0xF6U)
-
-/* The indicator for 16-bit table. */
-#define U8_16BIT_TABLE_INDICATOR (0x8000U)
-
-/* The following are some convenience macros. */
-#define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
- (u) = ((uint32_t)(b1) & 0x0F) << 12 | ((uint32_t)(b2) & 0x3F) << 6 | \
- (uint32_t)(b3) & 0x3F;
-
-#define U8_SIMPLE_SWAP(a, b, t) \
- (t) = (a); \
- (a) = (b); \
- (b) = (t);
-
-#define U8_ASCII_TOUPPER(c) \
- (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))
-
-#define U8_ASCII_TOLOWER(c) \
- (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))
-
-#define U8_ISASCII(c) (((uchar_t)(c)) < 0x80U)
-/*
- * The following macro assumes that the two characters that are to be
- * swapped are adjacent to each other and 'a' comes before 'b'.
- *
- * If the assumptions are not met, then, the macro will fail.
- */
-#define U8_SWAP_COMB_MARKS(a, b) \
- for (k = 0; k < disp[(a)]; k++) \
- u8t[k] = u8s[start[(a)] + k]; \
- for (k = 0; k < disp[(b)]; k++) \
- u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
- start[(b)] = start[(a)] + disp[(b)]; \
- for (k = 0; k < disp[(a)]; k++) \
- u8s[start[(b)] + k] = u8t[k]; \
- U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
- U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc);
-
-/* The possible states during normalization. */
-typedef enum {
- U8_STATE_START = 0,
- U8_STATE_HANGUL_L = 1,
- U8_STATE_HANGUL_LV = 2,
- U8_STATE_HANGUL_LVT = 3,
- U8_STATE_HANGUL_V = 4,
- U8_STATE_HANGUL_T = 5,
- U8_STATE_COMBINING_MARK = 6
-} u8_normalization_states_t;
-
-/*
- * The three vectors below are used to check that the bytes of a given UTF-8
- * character are valid and do not contain any malformed byte values.
- *
- * We used to have a quite relaxed UTF-8 binary representation, but then there
- * were some security-related issues, so the Unicode Consortium defined
- * and announced the UTF-8 Corrigendum in Unicode 3.1 and then refined it
- * once more in Unicode 3.2. The following three tables are based on
- * that.
- */
-
-#define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF)
-
-#define I_ U8_ILLEGAL_CHAR
-#define O_ U8_OUT_OF_RANGE_CHAR
-
-const int8_t u8_number_of_bytes[0x100] = {
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-
-/* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */
- I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
-
-/* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */
- I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
-
-/* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */
- I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
-
-/* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */
- I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
-
-/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
- I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-
-/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-
-/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-
-/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
- 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
-};
-
-#undef I_
-#undef O_
-
-const uint8_t u8_valid_min_2nd_byte[0x100] = {
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
-/* C0 C1 C2 C3 C4 C5 C6 C7 */
- 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-/* C8 C9 CA CB CC CD CE CF */
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-/* D0 D1 D2 D3 D4 D5 D6 D7 */
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-/* D8 D9 DA DB DC DD DE DF */
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-/* E0 E1 E2 E3 E4 E5 E6 E7 */
- 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-/* E8 E9 EA EB EC ED EE EF */
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-/* F0 F1 F2 F3 F4 F5 F6 F7 */
- 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
-};
-
-const uint8_t u8_valid_max_2nd_byte[0x100] = {
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
-/* C0 C1 C2 C3 C4 C5 C6 C7 */
- 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
-/* C8 C9 CA CB CC CD CE CF */
- 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
-/* D0 D1 D2 D3 D4 D5 D6 D7 */
- 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
-/* D8 D9 DA DB DC DD DE DF */
- 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
-/* E0 E1 E2 E3 E4 E5 E6 E7 */
- 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
-/* E8 E9 EA EB EC ED EE EF */
- 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
-/* F0 F1 F2 F3 F4 F5 F6 F7 */
- 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
-};
-
-
-/*
- * u8_validate() validates the given UTF-8 character string and
- * calculates its byte length. It is quite similar to mblen(3C) except that
- * it can also check the characters against a caller-supplied list if
- * required, and it is specific to UTF-8 and Unicode.
- */
-int
-u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum)
-{
- uchar_t *ib;
- uchar_t *ibtail;
- uchar_t **p;
- uchar_t *s1;
- uchar_t *s2;
- uchar_t f;
- int sz;
- size_t i;
- int ret_val;
- boolean_t second;
- boolean_t no_need_to_validate_entire;
- boolean_t check_additional;
- boolean_t validate_ucs2_range_only;
-
- if (! u8str)
- return (0);
-
- ib = (uchar_t *)u8str;
- ibtail = ib + n;
-
- ret_val = 0;
-
- no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
- check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
- validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
-
- while (ib < ibtail) {
- /*
- * The first byte of a UTF-8 character tells how many
- * bytes will follow for the character. If the first byte
- * is an illegal byte value or out of range value, we just
- * return -1 with an appropriate error number.
- */
- sz = u8_number_of_bytes[*ib];
- if (sz == U8_ILLEGAL_CHAR) {
- *errnum = EILSEQ;
- return (-1);
- }
-
- if (sz == U8_OUT_OF_RANGE_CHAR ||
- (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
- *errnum = ERANGE;
- return (-1);
- }
-
- /*
- * If we don't have enough bytes to check on, that's also
- * an error. As you can see, we give illegal byte sequence
- * checking higher priority then EINVAL cases.
- */
- if ((ibtail - ib) < sz) {
- *errnum = EINVAL;
- return (-1);
- }
-
- if (sz == 1) {
- ib++;
- ret_val++;
- } else {
- /*
- * Check on the multi-byte UTF-8 character. For more
- * details on this, see comment added for the used
- * data structures at the beginning of the file.
- */
- f = *ib++;
- ret_val++;
- second = B_TRUE;
- for (i = 1; i < sz; i++) {
- if (second) {
- if (*ib < u8_valid_min_2nd_byte[f] ||
- *ib > u8_valid_max_2nd_byte[f]) {
- *errnum = EILSEQ;
- return (-1);
- }
- second = B_FALSE;
- } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
- *errnum = EILSEQ;
- return (-1);
- }
- ib++;
- ret_val++;
- }
- }
-
- if (check_additional) {
- for (p = (uchar_t **)list, i = 0; p[i]; i++) {
- s1 = ib - sz;
- s2 = p[i];
- while (s1 < ib) {
- if (*s1 != *s2 || *s2 == '\0')
- break;
- s1++;
- s2++;
- }
-
- if (s1 >= ib && *s2 == '\0') {
- *errnum = EBADF;
- return (-1);
- }
- }
- }
-
- if (no_need_to_validate_entire)
- break;
- }
-
- return (ret_val);
-}
-
-/*
- * The do_case_conv() looks at the mapping tables and returns found
- * bytes if any. If not found, the input bytes are returned. The function
- * always terminate the return bytes with a null character assuming that
- * there are plenty of room to do so.
- *
- * The case conversions are simple case conversions mapping a character to
- * another character as specified in the Unicode data. The byte size of
- * the mapped character could be different from that of the input character.
- *
- * The return value is the byte length of the returned character excluding
- * the terminating null byte.
- *
- * Arguments:
- * uv - the first index into every lookup table used here (presumably
- * a Unicode version selector -- confirm against the callers).
- * u8s - the output buffer; it receives a copy of the input first and
- * is overwritten only when a mapping is found.
- * s - the input character bytes.
- * sz - the byte length of the input character.
- * is_it_toupper - selects the toupper tables when true and the tolower
- * tables otherwise.
- *
- * When no mapping exists the function returns sz and u8s holds the
- * unchanged input bytes.
- */
-static size_t
-do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
-{
- size_t i;
- uint16_t b1 = 0;
- uint16_t b2 = 0;
- uint16_t b3 = 0;
- uint16_t b3_tbl;
- uint16_t b3_base;
- uint16_t b4 = 0;
- size_t start_id;
- size_t end_id;
-
- /*
- * At this point, the only possible values for sz are 2, 3, and 4.
- * The u8s should point to a vector that is well beyond the size of
- * 5 bytes.
- *
- * The bytes are right-aligned into b1..b4 so that b4 is always the
- * last byte of the character; the unused leading ones stay zero.
- */
- if (sz == 2) {
- b3 = u8s[0] = s[0];
- b4 = u8s[1] = s[1];
- } else if (sz == 3) {
- b2 = u8s[0] = s[0];
- b3 = u8s[1] = s[1];
- b4 = u8s[2] = s[2];
- } else if (sz == 4) {
- b1 = u8s[0] = s[0];
- b2 = u8s[1] = s[1];
- b3 = u8s[2] = s[2];
- b4 = u8s[3] = s[3];
- } else {
- /* This is not possible but just in case as a fallback. */
- if (is_it_toupper)
- *u8s = U8_ASCII_TOUPPER(*s);
- else
- *u8s = U8_ASCII_TOLOWER(*s);
- u8s[1] = '\0';
-
- return (1);
- }
- u8s[sz] = '\0';
-
- /*
- * Let's find out if we have a corresponding character.
- *
- * Each table lookup yields the index for the next table; hitting
- * U8_TBL_ELEMENT_NOT_DEF at any level means there is no mapping.
- */
- b1 = u8_common_b1_tbl[uv][b1];
- if (b1 == U8_TBL_ELEMENT_NOT_DEF)
- return ((size_t)sz);
-
- b2 = u8_case_common_b2_tbl[uv][b1][b2];
- if (b2 == U8_TBL_ELEMENT_NOT_DEF)
- return ((size_t)sz);
-
- if (is_it_toupper) {
- b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
- if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
- return ((size_t)sz);
-
- start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
- end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
-
- /* Either there is no match or an error at the table. */
- if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
- return ((size_t)sz);
-
- b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
-
- for (i = 0; start_id < end_id; start_id++)
- u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
- } else {
- b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
- if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
- return ((size_t)sz);
-
- start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
- end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
-
- if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
- return ((size_t)sz);
-
- b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
-
- for (i = 0; start_id < end_id; start_id++)
- u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
- }
-
- /*
- * If i is still zero, that means there is no corresponding character.
- */
- if (i == 0)
- return ((size_t)sz);
-
- u8s[i] = '\0';
-
- return (i);
-}
-
-/*
- * The do_case_compare() function compares the two input strings, s1 and s2,
- * one character at a time doing case conversions if applicable and return
- * the comparison result as like strcmp().
- *
- * Since, in empirical sense, most of text data are 7-bit ASCII characters,
- * we treat the 7-bit ASCII characters as a special case trying to yield
- * faster processing time.
- *
- * Arguments:
- * uv - passed through to do_case_conv() as its table selector.
- * s1, s2 - the two byte strings to compare.
- * n1, n2 - the byte lengths of s1 and s2.
- * is_it_toupper - compare after converting to uppercase when true and
- * to lowercase otherwise.
- * errnum - set to EILSEQ when an illegal start byte is seen and to
- * EINVAL when a character is cut short by the end of its string;
- * the comparison continues in both cases and the value is never
- * cleared here.
- *
- * Return value: 0 when the strings compare equal, otherwise a negative or
- * positive value. For two single-byte characters the result is exactly
- * -1 or 1; otherwise the strcmp() result is returned as is.
- */
-static int
-do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
- size_t n2, boolean_t is_it_toupper, int *errnum)
-{
- int f;
- int sz1;
- int sz2;
- size_t j;
- size_t i1;
- size_t i2;
- uchar_t u8s1[U8_MB_CUR_MAX + 1];
- uchar_t u8s2[U8_MB_CUR_MAX + 1];
-
- i1 = i2 = 0;
- while (i1 < n1 && i2 < n2) {
- /*
- * Find out what would be the byte length for this UTF-8
- * character at string s1 and also find out if this is
- * an illegal start byte or not and if so, issue a proper
- * error number and yet treat this byte as a character.
- */
- sz1 = u8_number_of_bytes[*s1];
- if (sz1 < 0) {
- *errnum = EILSEQ;
- sz1 = 1;
- }
-
- /*
- * For 7-bit ASCII characters mainly, we do a quick case
- * conversion right at here.
- *
- * If we don't have enough bytes for this character, issue
- * an EINVAL error and use what are available.
- *
- * If we have enough bytes, find out if there is
- * a corresponding uppercase character and if so, copy over
- * the bytes for a comparison later. If there is no
- * corresponding uppercase character, then, use what we have
- * for the comparison.
- */
- if (sz1 == 1) {
- if (is_it_toupper)
- u8s1[0] = U8_ASCII_TOUPPER(*s1);
- else
- u8s1[0] = U8_ASCII_TOLOWER(*s1);
- s1++;
- u8s1[1] = '\0';
- } else if ((i1 + sz1) > n1) {
- *errnum = EINVAL;
- for (j = 0; (i1 + j) < n1; )
- u8s1[j++] = *s1++;
- u8s1[j] = '\0';
- } else {
- (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
- s1 += sz1;
- }
-
- /* Do the same for the string s2. */
- sz2 = u8_number_of_bytes[*s2];
- if (sz2 < 0) {
- *errnum = EILSEQ;
- sz2 = 1;
- }
-
- if (sz2 == 1) {
- if (is_it_toupper)
- u8s2[0] = U8_ASCII_TOUPPER(*s2);
- else
- u8s2[0] = U8_ASCII_TOLOWER(*s2);
- s2++;
- u8s2[1] = '\0';
- } else if ((i2 + sz2) > n2) {
- *errnum = EINVAL;
- for (j = 0; (i2 + j) < n2; )
- u8s2[j++] = *s2++;
- u8s2[j] = '\0';
- } else {
- (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
- s2 += sz2;
- }
-
- /* Now compare the two characters. */
- if (sz1 == 1 && sz2 == 1) {
- if (*u8s1 > *u8s2)
- return (1);
- if (*u8s1 < *u8s2)
- return (-1);
- } else {
- f = strcmp((const char *)u8s1, (const char *)u8s2);
- if (f != 0)
- return (f);
- }
-
- /*
- * They were the same. Let's move on to the next
- * characters then.
- *
- * The indices advance by the nominal character size even
- * when a character was cut short, so they can go past
- * n1 or n2; the checks below use >= for that reason.
- */
- i1 += sz1;
- i2 += sz2;
- }
-
- /*
- * We compared until the end of either or both strings.
- *
- * If we reached to or went over the ends for the both, that means
- * they are the same.
- *
- * If we reached only one of the two ends, that means the other string
- * has something which then the fact can be used to determine
- * the return value.
- */
- if (i1 >= n1) {
- if (i2 >= n2)
- return (0);
- return (-1);
- }
- return (1);
-}
-
-/*
- * The combining_class() function checks on the given bytes and find out
- * the corresponding Unicode combining class value. The return value 0 means
- * it is a Starter. Any illegal UTF-8 character will also be treated as
- * a Starter.
- *
- * Arguments:
- * uv - the first index into every lookup table used here.
- * s - the bytes of one character.
- * sz - the byte length of that character; 1 and anything above 4
- * return 0 without a table lookup.
- *
- * The bytes are right-aligned into b1..b4 and each table level yields the
- * index into the next one; U8_TBL_ELEMENT_NOT_DEF at any level returns 0.
- *
- * NOTE(review): sz == 0 is not rejected; the lookup then proceeds with
- * all four indices left at zero.
- */
-static uchar_t
-combining_class(size_t uv, uchar_t *s, size_t sz)
-{
- uint16_t b1 = 0;
- uint16_t b2 = 0;
- uint16_t b3 = 0;
- uint16_t b4 = 0;
-
- if (sz == 1 || sz > 4)
- return (0);
-
- if (sz == 2) {
- b3 = s[0];
- b4 = s[1];
- } else if (sz == 3) {
- b2 = s[0];
- b3 = s[1];
- b4 = s[2];
- } else if (sz == 4) {
- b1 = s[0];
- b2 = s[1];
- b3 = s[2];
- b4 = s[3];
- }
-
- b1 = u8_common_b1_tbl[uv][b1];
- if (b1 == U8_TBL_ELEMENT_NOT_DEF)
- return (0);
-
- b2 = u8_combining_class_b2_tbl[uv][b1][b2];
- if (b2 == U8_TBL_ELEMENT_NOT_DEF)
- return (0);
-
- b3 = u8_combining_class_b3_tbl[uv][b2][b3];
- if (b3 == U8_TBL_ELEMENT_NOT_DEF)
- return (0);
-
- return (u8_combining_class_b4_tbl[uv][b3][b4]);
-}
-
-/*
- * The do_decomp() function finds out a matching decomposition if any
- * and return. If there is no match, the input bytes are copied and returned.
- * The function also checks if there is a Hangul, decomposes it if necessary
- * and returns.
- *
- * To save time, a single byte 7-bit ASCII character should be handled by
- * the caller.
- *
- * The function returns the number of bytes returned sans always terminating
- * the null byte. It will also return a state that will tell if there was
- * a Hangul character decomposed which then will be used by the caller.
- *
- * Arguments:
- * uv - the first index into every lookup table used here.
- * u8s - the output buffer; always null-terminated on return.
- * s - the input character bytes. collect_a_seq() passes the same
- * pointer for u8s and s in one of its calls.
- * sz - the byte length of the input character.
- * canonical_decomposition - use only canonical mappings when true;
- * otherwise the compatibility mappings are used.
- * state - in/out; the incoming value is read only for the Hangul Jamo V
- * and T cases, and it is always written before returning.
- */
-static size_t
-do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
- boolean_t canonical_decomposition, u8_normalization_states_t *state)
-{
- uint16_t b1 = 0;
- uint16_t b2 = 0;
- uint16_t b3 = 0;
- uint16_t b3_tbl;
- uint16_t b3_base;
- uint16_t b4 = 0;
- size_t start_id;
- size_t end_id;
- size_t i;
- uint32_t u1;
-
- if (sz == 2) {
- b3 = u8s[0] = s[0];
- b4 = u8s[1] = s[1];
- u8s[2] = '\0';
- } else if (sz == 3) {
- /* Convert it to a Unicode scalar value. */
- U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
-
- /*
- * If this is a Hangul syllable, we decompose it into
- * a leading consonant, a vowel, and an optional trailing
- * consonant and then return.
- *
- * The decomposition is arithmetic on the syllable index;
- * b1, b2 and b3 temporarily hold the L, V and T Jamo values
- * rather than table indices here.
- */
- if (U8_HANGUL_SYLLABLE(u1)) {
- u1 -= U8_HANGUL_SYL_FIRST;
-
- b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
- b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
- / U8_HANGUL_T_COUNT;
- b3 = u1 % U8_HANGUL_T_COUNT;
-
- U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
- U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
- if (b3) {
- b3 += U8_HANGUL_JAMO_T_FIRST;
- U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
-
- u8s[9] = '\0';
- *state = U8_STATE_HANGUL_LVT;
- return (9);
- }
-
- u8s[6] = '\0';
- *state = U8_STATE_HANGUL_LV;
- return (6);
- }
-
- b2 = u8s[0] = s[0];
- b3 = u8s[1] = s[1];
- b4 = u8s[2] = s[2];
- u8s[3] = '\0';
-
- /*
- * If this is a Hangul Jamo, we know there is nothing
- * further that we can decompose.
- */
- if (U8_HANGUL_JAMO_L(u1)) {
- *state = U8_STATE_HANGUL_L;
- return (3);
- }
-
- if (U8_HANGUL_JAMO_V(u1)) {
- if (*state == U8_STATE_HANGUL_L)
- *state = U8_STATE_HANGUL_LV;
- else
- *state = U8_STATE_HANGUL_V;
- return (3);
- }
-
- if (U8_HANGUL_JAMO_T(u1)) {
- if (*state == U8_STATE_HANGUL_LV)
- *state = U8_STATE_HANGUL_LVT;
- else
- *state = U8_STATE_HANGUL_T;
- return (3);
- }
- } else if (sz == 4) {
- b1 = u8s[0] = s[0];
- b2 = u8s[1] = s[1];
- b3 = u8s[2] = s[2];
- b4 = u8s[3] = s[3];
- u8s[4] = '\0';
- } else {
- /*
- * This is a fallback and should not happen if the function
- * was called properly.
- */
- u8s[0] = s[0];
- u8s[1] = '\0';
- *state = U8_STATE_START;
- return (1);
- }
-
- /*
- * At this point, this routine does not know what it would get.
- * The caller should sort it out if the state isn't a Hangul one.
- */
- *state = U8_STATE_START;
-
- /* Try to find matching decomposition mapping byte sequence. */
- b1 = u8_common_b1_tbl[uv][b1];
- if (b1 == U8_TBL_ELEMENT_NOT_DEF)
- return ((size_t)sz);
-
- b2 = u8_decomp_b2_tbl[uv][b1][b2];
- if (b2 == U8_TBL_ELEMENT_NOT_DEF)
- return ((size_t)sz);
-
- b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
- if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
- return ((size_t)sz);
-
- /*
- * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
- * which is 0x8000, this means we couldn't fit the mappings into
- * the cardinality of a unsigned byte.
- */
- if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
- b3_tbl -= U8_16BIT_TABLE_INDICATOR;
- start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
- end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
- } else {
- start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
- end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
- }
-
- /* This also means there wasn't any matching decomposition. */
- if (start_id >= end_id)
- return ((size_t)sz);
-
- /*
- * The final table for decomposition mappings has three types of
- * byte sequences depending on whether a mapping is for compatibility
- * decomposition, canonical decomposition, or both like the following:
- *
- * (1) Compatibility decomposition mappings:
- *
- * +---+---+-...-+---+
- * | B0| B1| ... | Bm|
- * +---+---+-...-+---+
- *
- * The first byte, B0, is always less than 0xF5 (U8_DECOMP_BOTH).
- *
- * (2) Canonical decomposition mappings:
- *
- * +---+---+---+-...-+---+
- * | T | b0| b1| ... | bn|
- * +---+---+---+-...-+---+
- *
- * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
- *
- * (3) Both mappings:
- *
- * +---+---+---+---+-...-+---+---+---+-...-+---+
- * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
- * +---+---+---+---+-...-+---+---+---+-...-+---+
- *
- * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
- * byte, b0 to bn are canonical mapping bytes and B0 to Bm are
- * compatibility mapping bytes.
- *
- * Note that compatibility decomposition means doing recursive
- * decompositions using both compatibility decomposition mappings and
- * canonical decomposition mappings. On the other hand, canonical
- * decomposition means doing recursive decompositions using only
- * canonical decomposition mappings. Since the table we have has gone
- * through the recursions already, we do not need to do so during
- * runtime, i.e., the table has been completely flattened out
- * already.
- */
-
- b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
-
- /* Get the type, T, of the byte sequence. */
- b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
-
- /*
- * If necessary, adjust start_id, end_id, or both. Note that if
- * this is compatibility decomposition mapping, there is no
- * adjustment.
- *
- * For a type (3) entry, D is added to the index of the D byte
- * itself: in the canonical case that sum becomes end_id, and in
- * the compatibility case the same sum becomes start_id.
- */
- if (canonical_decomposition) {
- /* Is the mapping only for compatibility decomposition? */
- if (b1 < U8_DECOMP_BOTH)
- return ((size_t)sz);
-
- start_id++;
-
- if (b1 == U8_DECOMP_BOTH) {
- end_id = start_id +
- u8_decomp_final_tbl[uv][b3_base + start_id];
- start_id++;
- }
- } else {
- /*
- * Unless this is a compatibility decomposition mapping,
- * we adjust the start_id.
- */
- if (b1 == U8_DECOMP_BOTH) {
- start_id++;
- start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
- } else if (b1 == U8_DECOMP_CANONICAL) {
- start_id++;
- }
- }
-
- for (i = 0; start_id < end_id; start_id++)
- u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
- u8s[i] = '\0';
-
- return (i);
-}
-
-/*
- * The find_composition_start() function uses the character bytes given and
- * find out the matching composition mappings if any and return the address
- * to the composition mappings as explained in the do_composition().
- *
- * Arguments:
- * uv - the first index into every lookup table used here.
- * s - the bytes of one character.
- * sz - the byte length of that character, 1 to 4; unlike the other
- * lookups in this file a single byte character is looked up too.
- *
- * Return value: a pointer into u8_composition_final_tbl at the start of
- * the entry for this character, or NULL when sz is out of range or no
- * entry exists.
- */
-static uchar_t *
-find_composition_start(size_t uv, uchar_t *s, size_t sz)
-{
- uint16_t b1 = 0;
- uint16_t b2 = 0;
- uint16_t b3 = 0;
- uint16_t b3_tbl;
- uint16_t b3_base;
- uint16_t b4 = 0;
- size_t start_id;
- size_t end_id;
-
- if (sz == 1) {
- b4 = s[0];
- } else if (sz == 2) {
- b3 = s[0];
- b4 = s[1];
- } else if (sz == 3) {
- b2 = s[0];
- b3 = s[1];
- b4 = s[2];
- } else if (sz == 4) {
- b1 = s[0];
- b2 = s[1];
- b3 = s[2];
- b4 = s[3];
- } else {
- /*
- * This is a fallback and should not happen if the function
- * was called properly.
- */
- return (NULL);
- }
-
- b1 = u8_composition_b1_tbl[uv][b1];
- if (b1 == U8_TBL_ELEMENT_NOT_DEF)
- return (NULL);
-
- b2 = u8_composition_b2_tbl[uv][b1][b2];
- if (b2 == U8_TBL_ELEMENT_NOT_DEF)
- return (NULL);
-
- b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
- if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
- return (NULL);
-
- if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { /* same 16-bit table switch as in do_decomp() */
- b3_tbl -= U8_16BIT_TABLE_INDICATOR;
- start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
- end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
- } else {
- start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
- end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
- }
-
- if (start_id >= end_id) /* an empty range means no entry */
- return (NULL);
-
- b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
-
- return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
-}
-
-/*
- * The blocked() function checks on the combining class values of previous
- * characters in this sequence and return whether it is blocked or not.
- *
- * Arguments:
- * comb_class - the combining class of each character of the sequence.
- * last - the index of the character being tested.
- *
- * Return value: B_TRUE when any entry at index 1 through last - 1 is
- * a Starter or has a combining class greater than or equal to that of
- * comb_class[last]; B_FALSE otherwise.
- *
- * NOTE(review): the scan always begins at index 1, so comb_class[0] is
- * never examined.
- */
-static boolean_t
-blocked(uchar_t *comb_class, size_t last)
-{
- uchar_t my_comb_class;
- size_t i;
-
- my_comb_class = comb_class[last];
- for (i = 1; i < last; i++)
- if (comb_class[i] >= my_comb_class ||
- comb_class[i] == U8_COMBINING_CLASS_STARTER)
- return (B_TRUE);
-
- return (B_FALSE);
-}
-
-/*
- * The do_composition() reads the character string pointed by 's' and
- * do necessary canonical composition and then copy over the result back to
- * the 's'.
- *
- * The input argument 's' cannot contain more than 32 characters.
- *
- * Arguments:
- * uv - the first index into every lookup table used here.
- * s - the sequence bytes; overwritten with the composed result and
- * null-terminated.
- * comb_class - the combining class of each character in s.
- * start - the byte offset of each character within s.
- * disp - the byte length of each character in s.
- * last - the index of the last character (not last + 1).
- * os - in/out pointer to the source bytes that follow the sequence;
- * it is advanced past any following characters that get composed
- * into the last character. It is read and written only when the
- * last character is a Starter.
- * oslast - the end of the source bytes that *os points into.
- *
- * Return value: the byte length of the composed string in s excluding
- * the terminating null byte.
- */
-static size_t
-do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
- uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
-{
- uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
- uchar_t tc[U8_MB_CUR_MAX];
- uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
- size_t saved_marks_count;
- uchar_t *p;
- uchar_t *saved_p;
- uchar_t *q;
- size_t i;
- size_t saved_i;
- size_t j;
- size_t k;
- size_t l;
- size_t C;
- size_t saved_l;
- size_t size;
- uint32_t u1;
- uint32_t u2;
- boolean_t match_not_found = B_TRUE;
-
- /*
- * This should never happen unless the callers are doing some strange
- * and unexpected things.
- *
- * The "last" is the index pointing to the last character not last + 1.
- */
- if (last >= U8_MAX_CHARS_A_SEQ)
- last = U8_UPPER_LIMIT_IN_A_SEQ;
-
- for (i = l = 0; i <= last; i++) {
- /*
- * The last or any non-Starters at the beginning, we don't
- * have any chance to do composition and so we just copy them
- * to the temporary buffer.
- */
- if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
-SAVE_THE_CHAR:
- p = s + start[i];
- size = disp[i];
- for (k = 0; k < size; k++)
- t[l++] = *p++;
- continue;
- }
-
- /*
- * If this could be a start of Hangul Jamos, then, we try to
- * conjoin them.
- *
- * The second Jamo is read from the three bytes right after
- * the first one, i.e., directly from s rather than through
- * start[i + 1].
- */
- if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
- U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
- s[start[i] + 1], s[start[i] + 2]);
- U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
- s[start[i] + 4], s[start[i] + 5]);
-
- if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
- u1 -= U8_HANGUL_JAMO_L_FIRST;
- u2 -= U8_HANGUL_JAMO_V_FIRST;
- u1 = U8_HANGUL_SYL_FIRST +
- (u1 * U8_HANGUL_V_COUNT + u2) *
- U8_HANGUL_T_COUNT;
-
- i += 2;
- if (i <= last) {
- U8_PUT_3BYTES_INTO_UTF32(u2,
- s[start[i]], s[start[i] + 1],
- s[start[i] + 2]);
-
- if (U8_HANGUL_JAMO_T(u2)) {
- u1 += u2 -
- U8_HANGUL_JAMO_T_FIRST;
- i++;
- }
- }
-
- U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
- i--; /* the for loop's i++ moves to the next unconsumed character */
- l += 3;
- continue;
- }
- }
-
- /*
- * Let's then find out if this Starter has composition
- * mapping.
- */
- p = find_composition_start(uv, s + start[i], disp[i]);
- if (p == NULL)
- goto SAVE_THE_CHAR;
-
- /*
- * We have a Starter with composition mapping and the next
- * character is a non-Starter. Let's try to find out if
- * we can do composition.
- */
-
- saved_p = p;
- saved_i = i;
- saved_l = l;
- saved_marks_count = 0;
-
-TRY_THE_NEXT_MARK:
- q = s + start[++i];
- size = disp[i];
-
- /*
- * The next for() loop compares the non-Starter pointed by
- * 'q' with the possible (joinable) characters pointed by 'p'.
- *
- * The composition final table entry pointed by the 'p'
- * looks like the following:
- *
- * +---+---+---+-...-+---+---+---+---+-...-+---+---+
- * | C | b0| b1| ... | bn| F | B0| B1| ... | Bm| F |
- * +---+---+---+-...-+---+---+---+---+-...-+---+---+
- *
- * where C is the count byte indicating the number of
- * mapping pairs where each pair would be look like
- * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
- * character of a canonical decomposition and the B0-Bm are
- * the bytes of a matching composite character. The F is
- * a filler byte after each character as the separator.
- */
-
- match_not_found = B_TRUE;
-
- for (C = *p++; C > 0; C--) {
- for (k = 0; k < size; p++, k++)
- if (*p != q[k])
- break;
-
- /* Have we found it? */
- if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
- match_not_found = B_FALSE;
-
- l = saved_l;
-
- while (*++p != U8_TBL_ELEMENT_FILLER)
- t[l++] = *p;
-
- break;
- }
-
- /* We didn't find; skip to the next pair. */
- if (*p != U8_TBL_ELEMENT_FILLER)
- while (*++p != U8_TBL_ELEMENT_FILLER)
- ;
- while (*++p != U8_TBL_ELEMENT_FILLER)
- ;
- p++;
- }
-
- /*
- * If there was no match, we will need to save the combining
- * mark for later appending. After that, if the next one
- * is a non-Starter and not blocked, then, we try once
- * again to do composition with the next non-Starter.
- *
- * If there was no match and this was a Starter, then,
- * this is a new start.
- *
- * If there was a match and a composition done and we have
- * more to check on, then, we retrieve a new composition final
- * table entry for the composite and then try to do the
- * composition again.
- */
-
- if (match_not_found) {
- if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
- i--;
- goto SAVE_THE_CHAR;
- }
-
- saved_marks[saved_marks_count++] = i;
- }
-
- if (saved_l == l) {
- while (i < last) {
- if (blocked(comb_class, i + 1))
- saved_marks[saved_marks_count++] = ++i;
- else
- break;
- }
- if (i < last) {
- p = saved_p;
- goto TRY_THE_NEXT_MARK;
- }
- } else if (i < last) {
- p = find_composition_start(uv, t + saved_l,
- l - saved_l);
- if (p != NULL) {
- saved_p = p;
- goto TRY_THE_NEXT_MARK;
- }
- }
-
- /*
- * There is no more composition possible.
- *
- * If there was no composition whatsoever then we copy
- * over the original Starter and then append any non-Starters
- * remaining at the target string sequentially after that.
- */
-
- if (saved_l == l) {
- p = s + start[saved_i];
- size = disp[saved_i];
- for (j = 0; j < size; j++)
- t[l++] = *p++;
- }
-
- for (k = 0; k < saved_marks_count; k++) {
- p = s + start[saved_marks[k]];
- size = disp[saved_marks[k]];
- for (j = 0; j < size; j++)
- t[l++] = *p++;
- }
- }
-
- /*
- * If the last character is a Starter and if we have a character
- * (possibly another Starter) that can be turned into a composite,
- * we do so and we do so until there is no more of composition
- * possible.
- *
- * The candidates are taken from the source bytes at *os, and *os
- * is left pointing at the first character that was not consumed.
- */
- if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
- p = *os;
- saved_l = l - disp[last];
-
- while (p < oslast) {
- size = u8_number_of_bytes[*p];
- if (size <= 1 || (p + size) > oslast)
- break;
-
- saved_p = p;
-
- for (i = 0; i < size; i++)
- tc[i] = *p++;
-
- q = find_composition_start(uv, t + saved_l,
- l - saved_l);
- if (q == NULL) {
- p = saved_p;
- break;
- }
-
- match_not_found = B_TRUE;
-
- for (C = *q++; C > 0; C--) {
- for (k = 0; k < size; q++, k++)
- if (*q != tc[k])
- break;
-
- if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
- match_not_found = B_FALSE;
-
- l = saved_l;
-
- while (*++q != U8_TBL_ELEMENT_FILLER) {
- /*
- * This is practically
- * impossible but we don't
- * want to take any chances.
- */
- if (l >=
- U8_STREAM_SAFE_TEXT_MAX) {
- p = saved_p;
- goto SAFE_RETURN;
- }
- t[l++] = *q;
- }
-
- break;
- }
-
- if (*q != U8_TBL_ELEMENT_FILLER)
- while (*++q != U8_TBL_ELEMENT_FILLER)
- ;
- while (*++q != U8_TBL_ELEMENT_FILLER)
- ;
- q++;
- }
-
- if (match_not_found) {
- p = saved_p;
- break;
- }
- }
-SAFE_RETURN:
- *os = p;
- }
-
- /*
- * Now we copy over the temporary string to the target string.
- * Since composition always reduces the number of characters or
- * the number of characters stay, we don't need to worry about
- * the buffer overflow here.
- */
- for (i = 0; i < l; i++)
- s[i] = t[i];
- s[l] = '\0';
-
- return (l);
-}
-
-/*
- * The collect_a_seq() function checks on the given string s, collect
- * a sequence of characters at u8s, and return the sequence. While it collects
- * a sequence, it also applies case conversion, canonical or compatibility
- * decomposition, canonical composition, or some or all of them and
- * in that order.
- *
- * The collected sequence cannot be bigger than 32 characters since if
- * it is having more than 31 characters, the sequence will be terminated
- * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
- * a Stream-Safe Text. The collected sequence is always terminated with
- * a null byte and the return value is the byte length of the sequence
- * including 0. The return value does not include the terminating
- * null byte.
- *
- * Arguments:
- * uv - the first index into the lookup tables, passed on to the helpers.
- * u8s - the output buffer for the collected sequence.
- * source - in/out pointer to the current position in the input; it is
- * advanced past the bytes that were consumed.
- * slast - the end of the input.
- * is_it_toupper, is_it_tolower - the requested case conversion, if any;
- * toupper wins when both are set.
- * canonical_decomposition, compatibility_decomposition - the requested
- * decomposition, if any.
- * canonical_composition - compose after decomposing; it has no effect
- * when neither decomposition is requested.
- * errnum - set to EILSEQ on an illegal first byte and to EINVAL on
- * an incomplete first character.
- * state - in/out normalization state that is also handed to do_decomp().
- *
- * When neither decomposition is requested only a single character is
- * collected per call.
- */
-static size_t
-collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
- boolean_t is_it_toupper,
- boolean_t is_it_tolower,
- boolean_t canonical_decomposition,
- boolean_t compatibility_decomposition,
- boolean_t canonical_composition,
- int *errnum, u8_normalization_states_t *state)
-{
- uchar_t *s;
- int sz;
- int saved_sz;
- size_t i;
- size_t j;
- size_t k;
- size_t l;
- uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
- uchar_t disp[U8_MAX_CHARS_A_SEQ];
- uchar_t start[U8_MAX_CHARS_A_SEQ];
- uchar_t u8t[U8_MB_CUR_MAX];
- uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
- uchar_t tc;
- size_t last;
- size_t saved_last;
- uint32_t u1;
-
- /*
- * Save the source string pointer which we will return a changed
- * pointer if we do processing.
- */
- s = *source;
-
- /*
- * The following is a fallback for just in case callers are not
- * checking the string boundaries before the calling.
- */
- if (s >= slast) {
- u8s[0] = '\0';
-
- return (0);
- }
-
- /*
- * As the first thing, let's collect a character and do case
- * conversion if necessary.
- */
-
- sz = u8_number_of_bytes[*s];
-
- if (sz < 0) {
- *errnum = EILSEQ;
-
- u8s[0] = *s++;
- u8s[1] = '\0';
-
- *source = s;
-
- return (1);
- }
-
- if (sz == 1) {
- if (is_it_toupper)
- u8s[0] = U8_ASCII_TOUPPER(*s);
- else if (is_it_tolower)
- u8s[0] = U8_ASCII_TOLOWER(*s);
- else
- u8s[0] = *s;
- s++;
- u8s[1] = '\0';
- } else if ((s + sz) > slast) {
- *errnum = EINVAL;
-
- for (i = 0; s < slast; )
- u8s[i++] = *s++;
- u8s[i] = '\0';
-
- *source = s;
-
- return (i);
- } else {
- if (is_it_toupper || is_it_tolower) {
- i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
- s += sz;
- sz = i;
- } else {
- for (i = 0; i < sz; )
- u8s[i++] = *s++;
- u8s[i] = '\0';
- }
- }
-
- /*
- * And then canonical/compatibility decomposition followed by
- * an optional canonical composition. Please be noted that
- * canonical composition is done only when a decomposition is
- * done.
- *
- * From here on, saved_sz is the number of bytes stored at u8s and
- * last is the number of characters recorded in comb_class[],
- * start[] and disp[]; last becomes the index of the last character
- * only after the decrement further below.
- */
- if (canonical_decomposition || compatibility_decomposition) {
- if (sz == 1) {
- *state = U8_STATE_START;
-
- saved_sz = 1;
-
- comb_class[0] = 0;
- start[0] = 0;
- disp[0] = 1;
-
- last = 1;
- } else {
- saved_sz = do_decomp(uv, u8s, u8s, sz,
- canonical_decomposition, state);
-
- last = 0;
-
- for (i = 0; i < saved_sz; ) {
- sz = u8_number_of_bytes[u8s[i]];
-
- comb_class[last] = combining_class(uv,
- u8s + i, sz);
- start[last] = i;
- disp[last] = sz;
-
- last++;
- i += sz;
- }
-
- /*
- * Decomposition yields various Hangul related
- * states but not on combining marks. We need to
- * find out at here by checking on the last
- * character.
- */
- if (*state == U8_STATE_START) {
- if (comb_class[last - 1])
- *state = U8_STATE_COMBINING_MARK;
- }
- }
-
- saved_last = last;
-
- while (s < slast) {
- sz = u8_number_of_bytes[*s];
-
- /*
- * If this is an illegal character, an incomplete
- * character, or an 7-bit ASCII Starter character,
- * then we have collected a sequence; break and let
- * the next call deal with the two cases.
- *
- * Note that this is okay only if you are using this
- * function with a fixed length string, not on
- * a buffer with multiple calls of one chunk at a time.
- */
- if (sz <= 1) {
- break;
- } else if ((s + sz) > slast) {
- break;
- } else {
- /*
- * If the previous character was a Hangul Jamo
- * and this character is a Hangul Jamo that
- * can be conjoined, we collect the Jamo.
- */
- if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
- U8_PUT_3BYTES_INTO_UTF32(u1,
- *s, *(s + 1), *(s + 2));
-
- if (U8_HANGUL_COMPOSABLE_L_V(*state,
- u1)) {
- i = 0;
- *state = U8_STATE_HANGUL_LV;
- goto COLLECT_A_HANGUL;
- }
-
- if (U8_HANGUL_COMPOSABLE_LV_T(*state,
- u1)) {
- i = 0;
- *state = U8_STATE_HANGUL_LVT;
- goto COLLECT_A_HANGUL;
- }
- }
-
- /*
- * Regardless of whatever it was, if this is
- * a Starter, we don't collect the character
- * since that's a new start and we will deal
- * with it at the next time.
- */
- i = combining_class(uv, s, sz);
- if (i == U8_COMBINING_CLASS_STARTER)
- break;
-
- /*
- * We know the current character is a combining
- * mark. If the previous character wasn't
- * a Starter (not Hangul) or a combining mark,
- * then, we don't collect this combining mark.
- */
- if (*state != U8_STATE_START &&
- *state != U8_STATE_COMBINING_MARK)
- break;
-
- *state = U8_STATE_COMBINING_MARK;
-COLLECT_A_HANGUL:
- /*
- * If we collected a Starter and combining
- * marks up to 30, i.e., total 31 characters,
- * then, we terminate this degenerately long
- * combining sequence with a U+034F COMBINING
- * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
- * UTF-8 and turn this into a Stream-Safe
- * Text. This will be extremely rare but
- * possible.
- *
- * The following will also guarantee that
- * we are not writing more than 32 characters
- * plus a NULL at u8s[].
- */
- if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
-TURN_STREAM_SAFE:
- *state = U8_STATE_START;
- comb_class[last] = 0;
- start[last] = saved_sz;
- disp[last] = 2;
- last++;
-
- u8s[saved_sz++] = 0xCD;
- u8s[saved_sz++] = 0x8F;
-
- break;
- }
-
- /*
- * Some combining marks also do decompose into
- * another combining mark or marks.
- *
- * k remembers the character count and l the
- * source byte length of this mark, so that the
- * count can be rolled back if the limit is hit
- * while recording the decomposed marks.
- */
- if (*state == U8_STATE_COMBINING_MARK) {
- k = last;
- l = sz;
- i = do_decomp(uv, uts, s, sz,
- canonical_decomposition, state);
- for (j = 0; j < i; ) {
- sz = u8_number_of_bytes[uts[j]];
-
- comb_class[last] =
- combining_class(uv,
- uts + j, sz);
- start[last] = saved_sz + j;
- disp[last] = sz;
-
- last++;
- if (last >=
- U8_UPPER_LIMIT_IN_A_SEQ) {
- last = k;
- goto TURN_STREAM_SAFE;
- }
- j += sz;
- }
-
- *state = U8_STATE_COMBINING_MARK;
- sz = i;
- s += l;
-
- for (i = 0; i < sz; i++)
- u8s[saved_sz++] = uts[i];
- } else {
- comb_class[last] = i;
- start[last] = saved_sz;
- disp[last] = sz;
- last++;
-
- for (i = 0; i < sz; i++)
- u8s[saved_sz++] = *s++;
- }
-
- /*
- * If this is U+0345 COMBINING GREEK
- * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
- * iota subscript, and need to be converted to
- * uppercase letter, convert it to U+0399 GREEK
- * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
- * i.e., convert to capital adscript form as
- * specified in the Unicode standard.
- *
- * This is the only special case of (ambiguous)
- * case conversion at combining marks and
- * probably the standard will never have
- * anything similar like this in future.
- */
- if (is_it_toupper && sz >= 2 &&
- u8s[saved_sz - 2] == 0xCD &&
- u8s[saved_sz - 1] == 0x85) {
- u8s[saved_sz - 2] = 0xCE;
- u8s[saved_sz - 1] = 0x99;
- }
- }
- }
-
- /*
- * Let's try to ensure a canonical ordering for the collected
- * combining marks. We do this only if we have collected
- * at least one more non-Starter. (The decomposition mapping
- * data tables have fully (and recursively) expanded and
- * canonically ordered decompositions.)
- *
- * The U8_SWAP_COMB_MARKS() convenience macro has some
- * assumptions and we are meeting the assumptions.
- */
- last--;
- if (last >= saved_last) {
- for (i = 0; i < last; i++)
- for (j = last; j > i; j--)
- if (comb_class[j] &&
- comb_class[j - 1] > comb_class[j]) {
- U8_SWAP_COMB_MARKS(j - 1, j);
- }
- }
-
- *source = s;
-
- if (! canonical_composition) {
- u8s[saved_sz] = '\0';
- return (saved_sz);
- }
-
- /*
- * Now do the canonical composition. Note that we do this
- * only after a canonical or compatibility decomposition to
- * finish up NFC or NFKC.
- *
- * do_composition() may consume further source characters, in
- * which case it advances s for the final store to *source.
- */
- sz = do_composition(uv, u8s, comb_class, start, disp, last,
- &s, slast);
- }
-
- *source = s;
-
- return ((size_t)sz);
-}
-
-/*
- * The do_norm_compare() function does string comparison based on Unicode
- * simple case mappings and Unicode Normalization definitions, as selected
- * by the bits set in flag.
- *
- * Each string is walked one character sequence at a time; the prepared
- * sequence taken from s1 is compared against the one taken from s2 and
- * the first pair that differs decides the result.  n1 and n2 are the byte
- * lengths of s1 and s2.  uv and errnum are handed to collect_a_seq() as
- * they are.
- *
- * The meanings on the return values are the same as the usual strcmp().
- */
-static int
-do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
-    int flag, int *errnum)
-{
-	uchar_t seq1[U8_STREAM_SAFE_TEXT_MAX + 1];
-	uchar_t seq2[U8_STREAM_SAFE_TEXT_MAX + 1];
-	uchar_t *end1;
-	uchar_t *end2;
-	size_t len1;
-	size_t len2;
-	int diff;
-	boolean_t to_upper;
-	boolean_t to_lower;
-	boolean_t canon_decomp;
-	boolean_t compat_decomp;
-	boolean_t canon_comp;
-	u8_normalization_states_t state;
-
-	end1 = s1 + n1;
-	end2 = s2 + n2;
-
-	to_upper = flag & U8_TEXTPREP_TOUPPER;
-	to_lower = flag & U8_TEXTPREP_TOLOWER;
-	canon_decomp = flag & U8_CANON_DECOMP;
-	compat_decomp = flag & U8_COMPAT_DECOMP;
-	canon_comp = flag & U8_CANON_COMP;
-
-	while (s1 < end1 && s2 < end2) {
-		/*
-		 * A 7-bit ASCII byte that is either the last byte of the
-		 * string or is followed by another 7-bit ASCII byte is a
-		 * complete sequence on its own and is handled right here.
-		 * Everything else has to go through collect_a_seq().
-		 */
-		if (U8_ISASCII(*s1) &&
-		    ((s1 + 1) >= end1 || U8_ISASCII(*(s1 + 1)))) {
-			if (to_upper)
-				seq1[0] = U8_ASCII_TOUPPER(*s1);
-			else if (to_lower)
-				seq1[0] = U8_ASCII_TOLOWER(*s1);
-			else
-				seq1[0] = *s1;
-			seq1[1] = '\0';
-			len1 = 1;
-			s1++;
-		} else {
-			state = U8_STATE_START;
-			len1 = collect_a_seq(uv, seq1, &s1, end1,
-			    to_upper, to_lower, canon_decomp,
-			    compat_decomp, canon_comp, errnum, &state);
-		}
-
-		/* The very same steps for the second string. */
-		if (U8_ISASCII(*s2) &&
-		    ((s2 + 1) >= end2 || U8_ISASCII(*(s2 + 1)))) {
-			if (to_upper)
-				seq2[0] = U8_ASCII_TOUPPER(*s2);
-			else if (to_lower)
-				seq2[0] = U8_ASCII_TOLOWER(*s2);
-			else
-				seq2[0] = *s2;
-			seq2[1] = '\0';
-			len2 = 1;
-			s2++;
-		} else {
-			state = U8_STATE_START;
-			len2 = collect_a_seq(uv, seq2, &s2, end2,
-			    to_upper, to_lower, canon_decomp,
-			    compat_decomp, canon_comp, errnum, &state);
-		}
-
-		/*
-		 * Two single byte sequences are compared directly; for
-		 * anything longer we compare the NUL terminated buffers.
-		 * Equal sequences mean we go around for the next pair.
-		 */
-		if (len1 == 1 && len2 == 1) {
-			if (seq1[0] != seq2[0])
-				return (seq1[0] > seq2[0] ? 1 : -1);
-			continue;
-		}
-
-		diff = strcmp((const char *)seq1, (const char *)seq2);
-		if (diff != 0)
-			return (diff);
-	}
-
-	/*
-	 * At least one of the strings has been used up without finding
-	 * a difference.
-	 *
-	 * If s1 still has something left, it is the greater one.
-	 * Otherwise, the strings are the same if s2 has been used up too,
-	 * and s2 is the greater one if it has not.
-	 */
-	if (s1 < end1)
-		return (1);
-
-	return (s2 >= end2 ? 0 : -1);
-}
-
-/*
- * The u8_strcmp() function compares two UTF-8 strings quite similar to
- * the strcmp().  For the comparison, however, Unicode Normalization specific
- * equivalency and Unicode simple case conversion mappings based equivalency
- * can be requested and checked against.
- *
- * If n is not zero, at most n bytes of each string are looked at.
- *
- * *errnum is cleared first and then set to ERANGE if uv is bigger than
- * U8_UNICODE_LATEST (in which case U8_UNICODE_LATEST is used instead) or
- * to EBADF if flag has conflicting case or normalization bits (in which
- * case a plain case sensitive comparison is done instead).
- */
-int
-u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
-    int *errnum)
-{
-	int case_bits;
-	int norm_bits;
-	size_t len1;
-	size_t len2;
-	boolean_t to_upper;
-
-	*errnum = 0;
-
-	/*
-	 * Validate the requested Unicode version, case conversion, and
-	 * normalization flag values.
-	 */
-
-	if (uv > U8_UNICODE_LATEST) {
-		*errnum = ERANGE;
-		uv = U8_UNICODE_LATEST;
-	}
-
-	if (flag == 0) {
-		flag = U8_STRCMP_CS;
-	} else {
-		/* No more than one of the three case options may be set. */
-		case_bits = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
-		    U8_STRCMP_CI_LOWER);
-		if (case_bits == 0) {
-			flag |= U8_STRCMP_CS;
-		} else if (!(case_bits == U8_STRCMP_CS ||
-		    case_bits == U8_STRCMP_CI_UPPER ||
-		    case_bits == U8_STRCMP_CI_LOWER)) {
-			*errnum = EBADF;
-			flag = U8_STRCMP_CS;
-		}
-
-		/* Normalization bits, if any, must name a known form. */
-		norm_bits = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP |
-		    U8_CANON_COMP);
-		if (norm_bits != 0 && !(norm_bits == U8_STRCMP_NFD ||
-		    norm_bits == U8_STRCMP_NFC ||
-		    norm_bits == U8_STRCMP_NFKD ||
-		    norm_bits == U8_STRCMP_NFKC)) {
-			*errnum = EBADF;
-			flag = U8_STRCMP_CS;
-		}
-	}
-
-	/* A plain case sensitive comparison needs nothing from us. */
-	if (flag == U8_STRCMP_CS) {
-		if (n == 0)
-			return (strcmp(s1, s2));
-		return (strncmp(s1, s2, n));
-	}
-
-	len1 = strlen(s1);
-	len2 = strlen(s2);
-	if (n != 0) {
-		if (len1 > n)
-			len1 = n;
-		if (len2 > n)
-			len2 = n;
-	}
-
-	/*
-	 * Simple case conversion without any normalization can be done
-	 * much faster and so it has its own comparison function.
-	 */
-	if (flag == U8_STRCMP_CI_UPPER || flag == U8_STRCMP_CI_LOWER) {
-		to_upper = (flag == U8_STRCMP_CI_UPPER) ? B_TRUE : B_FALSE;
-
-		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
-		    len1, len2, to_upper, errnum));
-	}
-
-	return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, len1, len2,
-	    flag, errnum));
-}
-
-/*
- * The u8_textprep_str() function prepares the UTF-8 text in inarray and
- * stores the result into outarray.  The preparation is simple case
- * conversion (U8_TEXTPREP_TOUPPER or U8_TEXTPREP_TOLOWER), Unicode
- * Normalization (one of U8_TEXTPREP_NFD, U8_TEXTPREP_NFC, U8_TEXTPREP_NFKD,
- * and U8_TEXTPREP_NFKC), or both, as requested by flag.
- *
- * On entry, *inlen is the number of bytes available in inarray and *outlen
- * is the number of bytes of room in outarray.  Once the preparation has
- * started, both are updated on return, also at a failure, so that *inlen is
- * the number of input bytes not yet prepared and *outlen is the room still
- * left in outarray.  No terminating NUL byte is appended to outarray.
- *
- * Unless U8_TEXTPREP_IGNORE_NULL is set, the preparation stops at the first
- * NUL byte of the input.  Unless U8_TEXTPREP_IGNORE_INVALID is set, an
- * illegal or incomplete character stops the preparation with a failure.
- *
- * The return value is (size_t)-1 at a failure with *errnum set to:
- *
- *	ERANGE	unicode_version is bigger than U8_UNICODE_LATEST.
- *	EBADF	flag asks for both case conversions or for an unknown
- *		combination of normalization bits.
- *	E2BIG	outarray is NULL or has not enough room left.
- *	EILSEQ	an illegal byte has been found in the input.
- *	EINVAL	the input ends with an incomplete character.
- *
- * (In the normalization case, whatever collect_a_seq() has put into *errnum
- * is what gets reported for a bad input.)  For ERANGE, EBADF, and a NULL
- * outarray, *inlen and *outlen are left alone.
- *
- * Otherwise the return value is what has been counted during the simple
- * case conversion pass: the number of illegal bytes and incomplete
- * characters that have been copied over as they are.  The normalization
- * pass does not count anything.  Note that *errnum is not cleared on entry.
- */
-size_t
-u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
-    int flag, size_t unicode_version, int *errnum)
-{
-	int f;
-	int sz;
-	uchar_t *ib;
-	uchar_t *ibtail;
-	uchar_t *ob;
-	uchar_t *obtail;
-	uchar_t *seq_start;
-	boolean_t do_not_ignore_null;
-	boolean_t do_not_ignore_invalid;
-	boolean_t is_it_toupper;
-	boolean_t is_it_tolower;
-	boolean_t canonical_decomposition;
-	boolean_t compatibility_decomposition;
-	boolean_t canonical_composition;
-	size_t ret_val;
-	size_t i;
-	size_t j;
-	uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
-	u8_normalization_states_t state;
-
-	if (unicode_version > U8_UNICODE_LATEST) {
-		*errnum = ERANGE;
-		return ((size_t)-1);
-	}
-
-	/* Asking for both uppercase and lowercase makes no sense. */
-	f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
-	if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
-		*errnum = EBADF;
-		return ((size_t)-1);
-	}
-
-	/*
-	 * From here on, f has the normalization bits of the flag; it is
-	 * checked again below to pick one of the two preparation loops.
-	 */
-	f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
-	if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
-	    f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
-		*errnum = EBADF;
-		return ((size_t)-1);
-	}
-
-	/* Nothing to prepare is not an error. */
-	if (inarray == NULL || *inlen == 0)
-		return (0);
-
-	if (outarray == NULL) {
-		*errnum = E2BIG;
-		return ((size_t)-1);
-	}
-
-	ib = (uchar_t *)inarray;
-	ob = (uchar_t *)outarray;
-	ibtail = ib + *inlen;
-	obtail = ob + *outlen;
-
-	do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
-	do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
-	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
-	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
-
-	ret_val = 0;
-
-	/*
-	 * If we don't have a normalization flag set, we do the simple case
-	 * conversion based text preparation separately below. Text
-	 * preparation involving Normalization will be done in the false task
-	 * block, again, separately since it will take much more time and
-	 * resource than doing simple case conversions.
-	 */
-	if (f == 0) {
-		while (ib < ibtail) {
-			if (*ib == '\0' && do_not_ignore_null)
-				break;
-
-			sz = u8_number_of_bytes[*ib];
-
-			/* A negative size means an illegal byte. */
-			if (sz < 0) {
-				if (do_not_ignore_invalid) {
-					*errnum = EILSEQ;
-					ret_val = (size_t)-1;
-					break;
-				}
-
-				/*
-				 * We count the illegal byte and then let
-				 * it go through as a single byte character.
-				 */
-				sz = 1;
-				ret_val++;
-			}
-
-			if (sz == 1) {
-				if (ob >= obtail) {
-					*errnum = E2BIG;
-					ret_val = (size_t)-1;
-					break;
-				}
-
-				if (is_it_toupper)
-					*ob = U8_ASCII_TOUPPER(*ib);
-				else if (is_it_tolower)
-					*ob = U8_ASCII_TOLOWER(*ib);
-				else
-					*ob = *ib;
-				ib++;
-				ob++;
-			} else if ((ib + sz) > ibtail) {
-				/* The input ends in the middle of a character. */
-				if (do_not_ignore_invalid) {
-					*errnum = EINVAL;
-					ret_val = (size_t)-1;
-					break;
-				}
-
-				if ((obtail - ob) < (ibtail - ib)) {
-					*errnum = E2BIG;
-					ret_val = (size_t)-1;
-					break;
-				}
-
-				/*
-				 * We treat the remaining incomplete character
-				 * bytes as a character.
-				 */
-				ret_val++;
-
-				while (ib < ibtail)
-					*ob++ = *ib++;
-			} else {
-				if (is_it_toupper || is_it_tolower) {
-					i = do_case_conv(unicode_version, u8s,
-					    ib, sz, is_it_toupper);
-
-					/*
-					 * The input is not consumed until
-					 * we know that the result fits.
-					 */
-					if ((obtail - ob) < i) {
-						*errnum = E2BIG;
-						ret_val = (size_t)-1;
-						break;
-					}
-
-					ib += sz;
-
-					for (sz = 0; sz < i; sz++)
-						*ob++ = u8s[sz];
-				} else {
-					if ((obtail - ob) < sz) {
-						*errnum = E2BIG;
-						ret_val = (size_t)-1;
-						break;
-					}
-
-					for (i = 0; i < sz; i++)
-						*ob++ = *ib++;
-				}
-			}
-		}
-	} else {
-		canonical_decomposition = flag & U8_CANON_DECOMP;
-		compatibility_decomposition = flag & U8_COMPAT_DECOMP;
-		canonical_composition = flag & U8_CANON_COMP;
-
-		while (ib < ibtail) {
-			if (*ib == '\0' && do_not_ignore_null)
-				break;
-
-			/*
-			 * If the current character is a 7-bit ASCII
-			 * character and it is the last character, or,
-			 * if the current character is a 7-bit ASCII
-			 * character and the next character is also a 7-bit
-			 * ASCII character, then, we copy over this
-			 * character without going through collect_a_seq().
-			 *
-			 * In any other cases, we need to look further with
-			 * the collect_a_seq() function.
-			 */
-			if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
-			    ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
-				if (ob >= obtail) {
-					*errnum = E2BIG;
-					ret_val = (size_t)-1;
-					break;
-				}
-
-				if (is_it_toupper)
-					*ob = U8_ASCII_TOUPPER(*ib);
-				else if (is_it_tolower)
-					*ob = U8_ASCII_TOLOWER(*ib);
-				else
-					*ob = *ib;
-				ib++;
-				ob++;
-			} else {
-				*errnum = 0;
-				state = U8_STATE_START;
-
-				/*
-				 * The collect_a_seq() moves ib forward as
-				 * it collects.  We remember where the
-				 * sequence starts so that we can hand it
-				 * back if it turns out not to fit.
-				 */
-				seq_start = ib;
-
-				j = collect_a_seq(unicode_version, u8s,
-				    &ib, ibtail,
-				    is_it_toupper,
-				    is_it_tolower,
-				    canonical_decomposition,
-				    compatibility_decomposition,
-				    canonical_composition,
-				    errnum, &state);
-
-				if (*errnum && do_not_ignore_invalid) {
-					ret_val = (size_t)-1;
-					break;
-				}
-
-				if ((obtail - ob) < j) {
-					/*
-					 * Nothing of this sequence has been
-					 * written out and thus it must still
-					 * be counted as not prepared in the
-					 * *inlen, just like the simple case
-					 * conversion above does at E2BIG.
-					 */
-					ib = seq_start;
-					*errnum = E2BIG;
-					ret_val = (size_t)-1;
-					break;
-				}
-
-				for (i = 0; i < j; i++)
-					*ob++ = u8s[i];
-			}
-		}
-	}
-
-	/* Tell the caller how much of each array is left. */
-	*inlen = ibtail - ib;
-	*outlen = obtail - ob;
-
-	return (ret_val);
-}